diff --git a/.gitignore b/.gitignore
index 900e5a53cbcf3bbb5e00389cca004c49f8600a66..bdcb067fc26d2a18ed88034ab616c08095794e17 100644
--- a/.gitignore
+++ b/.gitignore
@@ -4,12 +4,11 @@ node_modules
 /.bazelrc
 /.tf_configure.bazelrc
 /bazel-*
-/third_party/py/numpy/numpy_include
-/tools/bazel.rc
+/bazel_pip
+/third_party/eigen3/mkl_include
+/third_party/mkl/*
 /tools/python_bin_path.sh
 /tools/git/gen
-/util/python/python_include
-/util/python/python_lib
 /pip_test
 /_python_build
 *.pyc
diff --git a/.mention-bot b/.mention-bot
deleted file mode 100644
index 9e4858977f5da2992ccc4053dfbbda3f5f86ee90..0000000000000000000000000000000000000000
--- a/.mention-bot
+++ /dev/null
@@ -1,11 +0,0 @@
-{
-  "maxReviewers": 2,
-  "numFilesToCheck": 10,
-  "userBlacklist": ["tensorflower-gardener"],
-  "requiredOrgs": ["tensorflow"],
-  "skipAlreadyAssignedPR": true,
-  "skipAlreadyMentionedPR": true,
-  "skipTitle": "Branch",
-  "delayed": true,
-  "delayedUntil": "10m"
-}
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index 5ae5c0fbbcd5b8da7e3f3f98e01f455e0c82e588..c78b6b1a150c98fa379a87f935e77b5803837f11 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -27,3 +27,140 @@ contributions, often because we probably won't get to them right now. If you
 decide to start on an issue, leave a comment so that other people know that
 you're working on it. If you want to help out, but not alone, use the issue
 comment thread to coordinate.
+
+### Contribution guidelines and standards
+
+Before sending your pull request for
+[review](https://github.com/tensorflow/tensorflow/pulls),
+make sure your changes are consistent with the guidelines and follow the
+TensorFlow coding style.
+
+#### General guidelines and philosophy for contribution
+
+* Include unit tests when you contribute new features, as they help to
+  a) prove that your code works correctly, b) guard against future breaking
+  changes to lower the maintenance cost.
+* Bug fixes also generally require unit tests, because the presence of bugs
+  usually indicates insufficient test coverage.
+* Keep API compatibility in mind when you change code in core TensorFlow,
+  e.g., code in [tensorflow/core](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/core) and  [tensorflow/python](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/python).
+  TensorFlow has reached version 1 and hence cannot make
+  non-backward-compatible API changes without a major release. Reviewers of your
+  pull request will comment on any API compatibility issues.
+* When you contribute a new feature to TensorFlow, the maintenance burden is (by
+  default) transferred to the TensorFlow team. This means that benefit of
+  contribution must be compared against the cost of maintaining the feature.
+* Full new features (e.g., a new op implementing a cutting-edge algorithm)
+  typically will live in
+  [tensorflow/contrib](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/contrib)
+  to get some airtime before decision is made regarding whether they are to be
+  migrated to the core.
+
+#### License
+
+Include a license at the top of new files.
+
+* [C/C++ license example](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/core/framework/op.cc#L1)
+* [Python license example](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/python/ops/nn.py#L1)
+* [Java license example](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/java/src/main/java/org/tensorflow/Graph.java#L1)
+* [Go license example](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/go/operation.go#L1)
+* [Bash license example](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/tools/ci_build/ci_sanity.sh#L2)
+* [HTML license example](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/tensorboard/dist/index.html#L2)
+* [JavaScript/TypeScript license example](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/tensorboard/components/tf_backend/backend.ts#L1)
+
+Bazel BUILD files also need to include a license section, e.g.,
+[BUILD example](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/core/BUILD#L61).
+
+#### C++ coding style
+
+Changes to TensorFlow C++ code should conform to
+[Google C++ Style Guide](https://google.github.io/styleguide/cppguide.html).
+
+Use `clang-tidy` to check your C/C++ changes. To install clang-tidy on ubuntu:16.04, do:
+
+```bash
+apt-get install -y clang-tidy
+```
+
+You can check a C/C++ file by doing:
+
+
+```bash
+clang-format <my_cc_file> --style=google > /tmp/my_cc_file.cc
+diff <my_cc_file> /tmp/my_cc_file.cc
+```
+
+#### Python coding style
+
+Changes to TensorFlow Python code should conform to
+[Google Python Style Guide](https://google.github.io/styleguide/pyguide.html)
+
+Use `pylint` to check your Python changes. To install `pylint` and
+retrieve TensorFlow's custom style definition:
+
+```bash
+pip install pylint
+wget -O /tmp/pylintrc https://raw.githubusercontent.com/tensorflow/tensorflow/master/tensorflow/tools/ci_build/pylintrc
+```
+
+To check a file with `pylint`:
+
+```bash
+pylint --rcfile=/tmp/pylintrc myfile.py
+```
+
+#### Coding style for other languages
+
+* [Google Java Style Guide](https://google.github.io/styleguide/javaguide.html)
+* [Google JavaScript Style Guide](https://google.github.io/styleguide/jsguide.html)
+* [Google Shell Style Guide](https://google.github.io/styleguide/shell.xml)
+
+#### Running sanity check
+
+If you have Docker installed on your system, you can perform a sanity check on
+your changes by running the command:
+
+```bash
+tensorflow/tools/ci_build/ci_build.sh CPU tensorflow/tools/ci_build/ci_sanity.sh
+```
+
+This will catch most license, Python coding style and BUILD file issues that
+may exist in your changes.
+
+#### Running unit tests
+
+There are two ways to run TensorFlow unit tests.
+
+1. Using tools and libraries installed directly on your system.
+
+   Refer to the
+   [CPU-only developer Dockerfile](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/tools/docker/Dockerfile.devel) and
+   [GPU developer Dockerfile](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/tools/docker/Dockerfile.devel-gpu)
+   for the required packages. Alternatively, use the said
+   [Docker images](https://hub.docker.com/r/tensorflow/tensorflow/tags/), e.g.,
+   `tensorflow/tensorflow:nightly-devel` and `tensorflow/tensorflow:nightly-devel-gpu`
+   for development to avoid installing the packages directly on your system.
+
+   Once you have the packages installed, you can run a specific unit test in
+   bazel by doing as follows:
+
+   If the tests are to be run on GPU, add CUDA paths to LD_LIBRARY_PATH and add
+   the `cuda` option flag
+
+   ```bash
+   export LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:/usr/local/cuda/lib64:/usr/local/cuda/extras/CUPTI/lib64:$LD_LIBRARY_PATH"
+
+   export flags="--config=opt --config=cuda -k"
+   ```
+
+   For example, to run all tests under tensorflow/python, do:
+
+   ```bash
+   bazel test ${flags} //tensorflow/python/...
+   ```
+
+2. Using Docker and TensorFlow's CI scripts.
+
+   See
+   [TensorFlow Builds](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/tools/ci_build) for details.
+
diff --git a/README.md b/README.md
index ff1124b99d4ace40b520f91a34c5944f21bcf092..2878dab2601351dabbfbcadfbe6a4ae94864ce56 100644
--- a/README.md
+++ b/README.md
@@ -26,7 +26,7 @@ guidelines](CONTRIBUTING.md).**
 
 **We use [GitHub issues](https://github.com/tensorflow/tensorflow/issues) for
 tracking requests and bugs, but please see
-[Community](tensorflow/docs_src/about/index.md#community) for general questions
+[Community](https://www.tensorflow.org/community/) for general questions
 and discussion.**
 
 ## Installation
diff --git a/RELEASE.md b/RELEASE.md
index f078d336abb040edd81d7a5ded69f62d409119a4..d4e3bac01c6e250d81fb835a1058fe7316e4e0c2 100644
--- a/RELEASE.md
+++ b/RELEASE.md
@@ -18,6 +18,10 @@
   If at all unsure, first test your code with TF 1.1; ensure it raises no
   errors, and then upgrade to TF 1.2.
 
+## Bug Fixes and Other Changes
+* In python, `Operation.get_attr` on type attributes returns the Python DType
+  version of the type to match expected get_attr documentation rather than the
+  protobuf enum.
 
 # Release 1.1.0
 
diff --git a/WORKSPACE b/WORKSPACE
index cab8389a55ccfeddb9dc077c9b999edbe775f25d..b2d6fb542b0343b52cb7308102eef9478daba242 100644
--- a/WORKSPACE
+++ b/WORKSPACE
@@ -2,11 +2,11 @@ workspace(name = "org_tensorflow")
 
 http_archive(
     name = "io_bazel_rules_closure",
-    sha256 = "60fc6977908f999b23ca65698c2bb70213403824a84f7904310b6000d78be9ce",
-    strip_prefix = "rules_closure-5ca1dab6df9ad02050f7ba4e816407f88690cf7d",
+    sha256 = "4be8a887f6f38f883236e77bb25c2da10d506f2bf1a8e5d785c0f35574c74ca4",
+    strip_prefix = "rules_closure-aac19edc557aec9b603cd7ffe359401264ceff0d",
     urls = [
-        "http://bazel-mirror.storage.googleapis.com/github.com/bazelbuild/rules_closure/archive/5ca1dab6df9ad02050f7ba4e816407f88690cf7d.tar.gz",  # 2017-02-03
-        "https://github.com/bazelbuild/rules_closure/archive/5ca1dab6df9ad02050f7ba4e816407f88690cf7d.tar.gz",
+        "http://mirror.bazel.build/github.com/bazelbuild/rules_closure/archive/aac19edc557aec9b603cd7ffe359401264ceff0d.tar.gz",  # 2017-05-10
+        "https://github.com/bazelbuild/rules_closure/archive/aac19edc557aec9b603cd7ffe359401264ceff0d.tar.gz",
     ],
 )
 
diff --git a/configure b/configure
index fad3fdbebd944e2bb54b719a7f43a8be840fe0ea..308369efd32a2c12e0da5d818b1a704b755bff7f 100755
--- a/configure
+++ b/configure
@@ -35,12 +35,9 @@ function is_windows() {
   fi
 }
 
-function sed_hyphen_i() {
-  if is_macos; then
-    sed -i '' "$@"
-  else
-    sed -i "$@"
-  fi
+function sed_in_place() {
+  sed -e $1 $2 > "$2.bak"
+  mv "$2.bak" $2
 }
 
 function write_to_bazelrc() {
@@ -51,11 +48,126 @@ function write_action_env_to_bazelrc() {
   write_to_bazelrc "build --action_env $1=\"$2\""
 }
 
+function python_path {
+  "$PYTHON_BIN_PATH" - <<END
+from __future__ import print_function
+import site
+import os
+
+try:
+  input = raw_input
+except NameError:
+  pass
+
+python_paths = []
+if os.getenv('PYTHONPATH') is not None:
+  python_paths = os.getenv('PYTHONPATH').split(':')
+try:
+  library_paths = site.getsitepackages()
+except AttributeError:
+ from distutils.sysconfig import get_python_lib
+ library_paths = [get_python_lib()]
+all_paths = set(python_paths + library_paths)
+
+paths = []
+for path in all_paths:
+  if os.path.isdir(path):
+    paths.append(path)
+
+print(",".join(paths))
+END
+}
+
+function setup_python {
+  ## Set up python-related environment settings:
+  while true; do
+    fromuser=""
+    if [ -z "$PYTHON_BIN_PATH" ]; then
+      default_python_bin_path=$(which python || which python3 || true)
+      read -p "Please specify the location of python. [Default is $default_python_bin_path]: " PYTHON_BIN_PATH
+      fromuser="1"
+      if [ -z "$PYTHON_BIN_PATH" ]; then
+        PYTHON_BIN_PATH=$default_python_bin_path
+      fi
+    fi
+    if [ -e "$PYTHON_BIN_PATH" ]; then
+      break
+    fi
+    echo "Invalid python path. ${PYTHON_BIN_PATH} cannot be found" 1>&2
+    if [ -z "$fromuser" ]; then
+      exit 1
+    fi
+    PYTHON_BIN_PATH=""
+    # Retry
+  done
+
+  if [ -z "$PYTHON_LIB_PATH" ]; then
+    # Split python_path into an array of paths, this allows path containing spaces
+    IFS=','
+    python_lib_path=($(python_path))
+    unset IFS
+
+    if [ 1 = "$USE_DEFAULT_PYTHON_LIB_PATH" ]; then
+      PYTHON_LIB_PATH=${python_lib_path[0]}
+      echo "Using python library path: $PYTHON_LIB_PATH"
+
+    else
+      echo "Found possible Python library paths:"
+      for x in "${python_lib_path[@]}"; do
+        echo "  $x"
+      done
+      set -- "${python_lib_path[@]}"
+      echo "Please input the desired Python library path to use.  Default is ["$1"]"
+      read b || true
+      if [ "$b" == "" ]; then
+        PYTHON_LIB_PATH=${python_lib_path[0]}
+        echo "Using python library path: $PYTHON_LIB_PATH"
+      else
+        PYTHON_LIB_PATH="$b"
+      fi
+    fi
+  fi
+
+  if [ ! -x "$PYTHON_BIN_PATH" ]  || [ -d "$PYTHON_BIN_PATH" ]; then
+    echo "PYTHON_BIN_PATH is not executable.  Is it the python binary?"
+    exit 1
+  fi
+
+  local python_major_version=$("${PYTHON_BIN_PATH}" -c 'from __future__ import print_function; import sys; print(sys.version_info[0]);')
+  if [ "$python_major_version" == "" ]; then
+    echo -e "\n\nERROR: Problem getting python version.  Is $PYTHON_BIN_PATH the correct python binary?"
+    exit 1
+  fi
+
+  # Convert python path to Windows style before writing into bazel.rc
+  if is_windows; then
+    PYTHON_BIN_PATH="$(cygpath -m "$PYTHON_BIN_PATH")"
+  fi
+
+  # Set-up env variables used by python_configure.bzl
+  write_action_env_to_bazelrc "PYTHON_BIN_PATH" "$PYTHON_BIN_PATH"
+  write_action_env_to_bazelrc "PYTHON_LIB_PATH" "$PYTHON_LIB_PATH"
+  write_to_bazelrc "build --define PYTHON_BIN_PATH=\"$PYTHON_BIN_PATH\""
+  write_to_bazelrc "build --define PYTHON_LIB_PATH=\"$PYTHON_LIB_PATH\""
+  write_to_bazelrc "build --force_python=py$python_major_version"
+  write_to_bazelrc "build --host_force_python=py$python_major_version"
+  write_to_bazelrc "build --python${python_major_version}_path=\"$PYTHON_BIN_PATH\""
+  write_to_bazelrc "test --force_python=py$python_major_version"
+  write_to_bazelrc "test --host_force_python=py$python_major_version"
+  write_to_bazelrc "test --define PYTHON_BIN_PATH=\"$PYTHON_BIN_PATH\""
+  write_to_bazelrc "test --define PYTHON_LIB_PATH=\"$PYTHON_LIB_PATH\""
+  write_to_bazelrc "run --define PYTHON_BIN_PATH=\"$PYTHON_BIN_PATH\""
+  write_to_bazelrc "run --define PYTHON_LIB_PATH=\"$PYTHON_LIB_PATH\""
+
+  # Write tools/python_bin_path.sh
+  echo "export PYTHON_BIN_PATH=\"$PYTHON_BIN_PATH\"" > tools/python_bin_path.sh
+}
+
 # This file contains customized config settings.
 rm -f .tf_configure.bazelrc
 touch .tf_configure.bazelrc
 touch .bazelrc
-sed_hyphen_i "/tf_configure/d" .bazelrc
+sed_in_place "/tf_configure/d" .bazelrc
 echo "import %workspace%/.tf_configure.bazelrc" >> .bazelrc
 
 # Delete any leftover BUILD files from the Makefile build, which would interfere
@@ -65,61 +177,63 @@ if [ -d "${MAKEFILE_DOWNLOAD_DIR}" ]; then
   find ${MAKEFILE_DOWNLOAD_DIR} -type f -name '*BUILD' -delete
 fi
 
-## Set up python-related environment settings
-while true; do
+setup_python
+
+## Set up MKL related environment settings
+while [ "$TF_NEED_MKL" == "" ]; do
   fromuser=""
-  if [ -z "$PYTHON_BIN_PATH" ]; then
-    default_python_bin_path=$(which python || which python3  || true)
-    read -p "Please specify the location of python. [Default is $default_python_bin_path]: " PYTHON_BIN_PATH
-    fromuser="1"
-    if [ -z "$PYTHON_BIN_PATH" ]; then
-      PYTHON_BIN_PATH=$default_python_bin_path
-    fi
-  fi
-  if [ -e "$PYTHON_BIN_PATH" ]; then
-    break
-  fi
-  echo "Invalid python path. ${PYTHON_BIN_PATH} cannot be found" 1>&2
-  if [ -z "$fromuser" ]; then
-    exit 1
-  fi
-  PYTHON_BIN_PATH=""
-  # Retry
+  read -p "Do you wish to build TensorFlow with MKL support? [y/N] " INPUT
+  fromuser="1"
+  case $INPUT in
+    [Yy]* ) echo "MKL support will be enabled for TensorFlow"; TF_NEED_MKL=1;;
+    [Nn]* ) echo "No MKL support will be enabled for TensorFlow"; TF_NEED_MKL=0;;
+    "" ) echo "No MKL support will be enabled for TensorFlow"; TF_NEED_MKL=0;;
+    * ) echo "Invalid selection: " $INPUT;;
+  esac
 done
-export PYTHON_BIN_PATH
-write_action_env_to_bazelrc "PYTHON_BIN_PATH" "$PYTHON_BIN_PATH"
-# TODO(ngiraldo): allow the user to optionally set PYTHON_INCLUDE_PATH and NUMPY_INCLUDE_PATH
 
-## Set up MKL related environment settings
-if false; then # Disable building with MKL for now
-  while [ "$TF_NEED_MKL" == "" ]; do
+OSNAME=`uname -s`
+
+if [ "$TF_NEED_MKL" == "1" ]; then # TF_NEED_MKL
+  while [ "$TF_DOWNLOAD_MKL" == "" ]; do
     fromuser=""
-    read -p "Do you wish to build TensorFlow with MKL support (experimental)? [y/N] " INPUT
+    read -p "Do you wish to download MKL LIB from the web? [Y/n] " INPUT
     fromuser="1"
     case $INPUT in
-      [Yy]* ) echo "MKL support (experimental) (will be enabled for TensorFlow"; TF_NEED_MKL=1;;
-      [Nn]* ) echo "No MKL support will be enabled for TensorFlow"; TF_NEED_MKL=0;;
-      "" ) echo "No MKL support will be enabled for TensorFlow"; TF_NEED_MKL=0;;
-      * ) echo "Invalid selection: " $INPUT;;
+      [Yy]* ) TF_DOWNLOAD_MKL=1;;
+      [Nn]* ) TF_DOWNLOAD_MKL=0;;
+      "" )    TF_DOWNLOAD_MKL=1;;
+      * )     echo "Invalid selection: " $INPUT; exit 1;;
     esac
   done
 
-  OSNAME=`uname -s`
-
-  if [ "$TF_NEED_MKL" == "1" ]; then # TF_NEED_MKL
+  if [[ "$TF_DOWNLOAD_MKL" == "1" ]]; then
     DST=`dirname $0`
-    ARCHIVE_BASENAME=mklml_lnx_2017.0.2.20170209.tgz
-    GITHUB_RELEASE_TAG=v0.5
+    ARCHIVE_BASENAME=mklml_lnx_2018.0.20170425.tgz
+    GITHUB_RELEASE_TAG=v0.7
     MKLURL="https://github.com/01org/mkl-dnn/releases/download/$GITHUB_RELEASE_TAG/$ARCHIVE_BASENAME"
-    if ! [ -e "$DST/third_party/mkl/$ARCHIVE_BASENAME" ]; then
-      wget --no-check-certificate -P $DST/third_party/mkl/ $MKLURL
+    if ! [ -e "${DST}/third_party/mkl/${ARCHIVE_BASENAME}" ]; then
+      curl -fSsL -o "${DST}/third_party/mkl/${ARCHIVE_BASENAME}" "${MKLURL}"
     fi
     tar -xzf $DST/third_party/mkl/$ARCHIVE_BASENAME -C $DST/third_party/mkl/
     extracted_dir_name="${ARCHIVE_BASENAME%.*}"
     MKL_INSTALL_PATH=$DST/third_party/mkl/$extracted_dir_name
     MKL_INSTALL_PATH=`${PYTHON_BIN_PATH} -c "import os; print(os.path.realpath(os.path.expanduser('${MKL_INSTALL_PATH}')))"`
 
-    if [ "$OSNAME" == "Linux" ]; then
+  else
+    default_mkl_path=/opt/intel/mklml
+    fromuser=""
+    read -p "Please specify the location where MKL is installed. [Default is $default_mkl_path]: " MKL_INSTALL_PATH
+    fromuser="1"
+    if [ -z "$MKL_INSTALL_PATH" ]; then
+      MKL_INSTALL_PATH=$default_mkl_path
+    fi
+    # Result returned from "read" will be used unexpanded. That make "~" unuseable.
+    # Going through one more level of expansion to handle that.
+    MKL_INSTALL_PATH=`${PYTHON_BIN_PATH} -c "import os; print(os.path.realpath(os.path.expanduser('${MKL_INSTALL_PATH}')))"`
+  fi
+
+  if [ "$OSNAME" == "Linux" ]; then
       # Full MKL configuration
       MKL_RT_LIB_PATH="lib/intel64/libmkl_rt.so" #${TF_MKL_EXT}#TODO version?
       MKL_RT_OMP_LIB_PATH="../compiler/lib/intel64/libiomp5.so" #TODO VERSION?
@@ -127,24 +241,29 @@ if false; then # Disable building with MKL for now
       # MKL-ML configuration
       MKL_ML_LIB_PATH="lib/libmklml_intel.so" #${TF_MKL_EXT}#TODO version?
       MKL_ML_OMP_LIB_PATH="lib/libiomp5.so" #TODO VERSION?
-    elif [ "$OSNAME" == "Darwin" ]; then
+  elif [ "$OSNAME" == "Darwin" ]; then
       echo "Darwin is unsupported yet";
       exit 1
-    fi
+  fi
 
-    if [ -e "$MKL_INSTALL_PATH/${MKL_ML_LIB_PATH}" ]; then
+  if [ -e "$MKL_INSTALL_PATH/${MKL_ML_LIB_PATH}" ]; then
       ln -sf $MKL_INSTALL_PATH/${MKL_ML_LIB_PATH} third_party/mkl/
       ln -sf $MKL_INSTALL_PATH/${MKL_ML_OMP_LIB_PATH} third_party/mkl/
       ln -sf $MKL_INSTALL_PATH/include third_party/mkl/
       ln -sf $MKL_INSTALL_PATH/include third_party/eigen3/mkl_include
-    else
-      echo "ERROR: $MKL_INSTALL_PATH/${MKL_ML_LIB_PATH} does not exist";
-      exit 1
-    fi
-
-    if [ -z "$fromuser" ]; then
+      loc=$(locate -e libdl.so.2 | sed -n 1p)
+      ln -sf $loc third_party/mkl/libdl.so.2
+  elif [ -e "$MKL_INSTALL_PATH/${MKL_RT_LIB_PATH}" ]; then
+      ln -sf $MKL_INSTALL_PATH/${MKL_RT_LIB_PATH} third_party/mkl/
+      ln -sf $MKL_INSTALL_PATH/${MKL_RT_OMP_LIB_PATH} third_party/mkl/
+      ln -sf $MKL_INSTALL_PATH/include third_party/mkl/
+      ln -sf $MKL_INSTALL_PATH/include third_party/eigen3/mkl_include
+      loc=$(locate -e libdl.so.2 | sed -n 1p)
+      ln -sf $loc third_party/mkl/libdl.so.2
+  else
+      echo "ERROR: $MKL_INSTALL_PATH/${MKL_ML_LIB_PATH} nor $MKL_INSTALL_PATH/${MKL_RT_LIB_PATH} exists";
       exit 1
-    fi
+  fi
 
 cat > third_party/mkl/mkl.config <<EOF
 # MKL_INSTALL_PATH refers to the location of MKL root folder. The MKL header and library
@@ -152,9 +271,8 @@ cat > third_party/mkl/mkl.config <<EOF
 MKL_INSTALL_PATH=$MKL_INSTALL_PATH
 EOF
 
-  fi # TF_NEED_MKL
-  ################## MKL
-fi # Disable building with MKL for now
+fi # TF_NEED_MKL
+## End MKL setup
 
 ## Set up architecture-dependent optimization flags.
 if [ -z "$CC_OPT_FLAGS" ]; then
@@ -263,13 +381,9 @@ if [[ "$TF_NEED_VERBS" == "1" ]]; then
   write_to_bazelrc 'build --define with_verbs_support=true'
 fi
 
-# Invoke python_config and set up symlinks to python includes
-./util/python/python_config.sh "$PYTHON_BIN_PATH"
-
 # Append CC optimization flags to bazel.rc
-echo >> tools/bazel.rc
 for opt in $CC_OPT_FLAGS; do
-  echo "build:opt --cxxopt=$opt --copt=$opt" >> tools/bazel.rc
+  write_to_bazelrc "build:opt --cxxopt=$opt --copt=$opt"
 done
 
 # Run the gen_git_source to create links where bazel can track dependencies for
@@ -321,31 +435,6 @@ done
 export TF_CUDA_CLANG
 write_action_env_to_bazelrc "TF_CUDA_CLANG" "$TF_CUDA_CLANG"
 
-# Set up which gcc nvcc should use as the host compiler
-# No need to set this on Windows
-while [[ "$TF_CUDA_CLANG" != "1" ]] && ! is_windows && true; do
-  fromuser=""
-  if [ -z "$GCC_HOST_COMPILER_PATH" ]; then
-    default_gcc_host_compiler_path=$(which gcc || true)
-    read -p "Please specify which gcc should be used by nvcc as the host compiler. [Default is $default_gcc_host_compiler_path]: " GCC_HOST_COMPILER_PATH
-    fromuser="1"
-    if [ -z "$GCC_HOST_COMPILER_PATH" ]; then
-      GCC_HOST_COMPILER_PATH="$default_gcc_host_compiler_path"
-    fi
-  fi
-  if [ -e "$GCC_HOST_COMPILER_PATH" ]; then
-    export GCC_HOST_COMPILER_PATH
-    write_action_env_to_bazelrc "GCC_HOST_COMPILER_PATH" "$GCC_HOST_COMPILER_PATH"
-    break
-  fi
-  echo "Invalid gcc path. ${GCC_HOST_COMPILER_PATH} cannot be found" 1>&2
-  if [ -z "$fromuser" ]; then
-    exit 1
-  fi
-  GCC_HOST_COMPILER_PATH=""
-  # Retry
-done
-
 # Set up which clang we should use as the cuda / host compiler.
 while [[ "$TF_CUDA_CLANG" == "1" ]] && true; do
   fromuser=""
@@ -386,6 +475,11 @@ while true; do
       else
         default_cuda_path="$(cygpath -m "$CUDA_PATH")"
       fi
+    elif is_linux; then
+      # If the default doesn't exist, try an alternative default.
+      if [ ! -d $default_cuda_path ] && [ -d /opt/cuda ]; then
+        default_cuda_path=/opt/cuda
+      fi
     fi
     read -p "Please specify the location where CUDA $TF_CUDA_VERSION toolkit is installed. Refer to README.md for more details. [Default is $default_cuda_path]: " CUDA_TOOLKIT_PATH
     fromuser="1"
@@ -425,6 +519,35 @@ while true; do
   CUDA_TOOLKIT_PATH=""
 done
 
+# Set up which gcc nvcc should use as the host compiler
+# No need to set this on Windows
+while [[ "$TF_CUDA_CLANG" != "1" ]] && ! is_windows && true; do
+  fromuser=""
+  if [ -z "$GCC_HOST_COMPILER_PATH" ]; then
+    default_gcc_host_compiler_path=$(which gcc || true)
+    cuda_bin_symlink="$CUDA_TOOLKIT_PATH/bin/gcc"
+    if [ -L "$cuda_bin_symlink" ]; then
+      default_gcc_host_compiler_path=$(readlink $cuda_bin_symlink)
+    fi
+    read -p "Please specify which gcc should be used by nvcc as the host compiler. [Default is $default_gcc_host_compiler_path]: " GCC_HOST_COMPILER_PATH
+    fromuser="1"
+    if [ -z "$GCC_HOST_COMPILER_PATH" ]; then
+      GCC_HOST_COMPILER_PATH="$default_gcc_host_compiler_path"
+    fi
+  fi
+  if [ -e "$GCC_HOST_COMPILER_PATH" ]; then
+    export GCC_HOST_COMPILER_PATH
+    write_action_env_to_bazelrc "GCC_HOST_COMPILER_PATH" "$GCC_HOST_COMPILER_PATH"
+    break
+  fi
+  echo "Invalid gcc path. ${GCC_HOST_COMPILER_PATH} cannot be found" 1>&2
+  if [ -z "$fromuser" ]; then
+    exit 1
+  fi
+  GCC_HOST_COMPILER_PATH=""
+  # Retry
+done
+
 # Find out where the cuDNN library is installed
 while true; do
   # Configure the cuDNN version to use.
diff --git a/tensorflow/BUILD b/tensorflow/BUILD
index cca5a80314387ae0e6a33d3f318d03c49f5d8b0e..e3b88291057e6bec39a5c23e00d96152960c2bbf 100644
--- a/tensorflow/BUILD
+++ b/tensorflow/BUILD
@@ -202,7 +202,6 @@ filegroup(
         "//tensorflow/compiler/xla/client:all_files",
         "//tensorflow/compiler/xla/client/lib:all_files",
         "//tensorflow/compiler/xla/legacy_flags:all_files",
-        "//tensorflow/compiler/xla/port:all_files",
         "//tensorflow/compiler/xla/service:all_files",
         "//tensorflow/compiler/xla/service/cpu:all_files",
         "//tensorflow/compiler/xla/service/gpu:all_files",
@@ -213,6 +212,7 @@ filegroup(
         "//tensorflow/contrib:all_files",
         "//tensorflow/contrib/android:all_files",
         "//tensorflow/contrib/batching:all_files",
+        "//tensorflow/contrib/batching/kernels:all_files",
         "//tensorflow/contrib/batching/test_util:all_files",
         "//tensorflow/contrib/batching/util:all_files",
         "//tensorflow/contrib/bayesflow:all_files",
@@ -262,6 +262,7 @@ filegroup(
         "//tensorflow/contrib/seq2seq:all_files",
         "//tensorflow/contrib/session_bundle:all_files",
         "//tensorflow/contrib/session_bundle/example:all_files",
+        "//tensorflow/contrib/signal:all_files",
         "//tensorflow/contrib/slim:all_files",
         "//tensorflow/contrib/slim/python/slim/data:all_files",
         "//tensorflow/contrib/slim/python/slim/nets:all_files",
@@ -289,6 +290,7 @@ filegroup(
         "//tensorflow/core/grappler/costs:all_files",
         "//tensorflow/core/grappler/inputs:all_files",
         "//tensorflow/core/grappler/optimizers:all_files",
+        "//tensorflow/core/grappler/utils:all_files",
         "//tensorflow/core/kernels:all_files",
         "//tensorflow/core/kernels/hexagon:all_files",
         "//tensorflow/core/ops/compat:all_files",
@@ -298,6 +300,7 @@ filegroup(
         "//tensorflow/core/util/ctc:all_files",
         "//tensorflow/core/util/tensor_bundle:all_files",
         "//tensorflow/examples/android:all_files",
+        "//tensorflow/examples/benchmark:all_files",
         "//tensorflow/examples/how_tos/reading_data:all_files",
         "//tensorflow/examples/image_retraining:all_files",
         "//tensorflow/examples/label_image:all_files",
@@ -314,7 +317,10 @@ filegroup(
         "//tensorflow/python:all_files",
         "//tensorflow/python/debug:all_files",
         "//tensorflow/python/estimator:all_files",
+        "//tensorflow/python/feature_column:all_files",
         "//tensorflow/python/kernel_tests:all_files",
+        "//tensorflow/python/kernel_tests/distributions:all_files",
+        "//tensorflow/python/ops/distributions:all_files",
         "//tensorflow/python/saved_model:all_files",
         "//tensorflow/python/tools:all_files",
         "//tensorflow/tensorboard:all_files",
@@ -322,6 +328,100 @@ filegroup(
         "//tensorflow/tensorboard/backend:all_files",
         "//tensorflow/tensorboard/backend/event_processing:all_files",
         "//tensorflow/tensorboard/components:all_files",
+        "//tensorflow/tensorboard/components/tf_audio_dashboard:all_files",
+        "//tensorflow/tensorboard/components/tf_audio_dashboard/demo:all_files",
+        "//tensorflow/tensorboard/components/tf_audio_dashboard/demo/data:all_files",
+        "//tensorflow/tensorboard/components/tf_audio_dashboard_d3v4:all_files",
+        "//tensorflow/tensorboard/components/tf_backend:all_files",
+        "//tensorflow/tensorboard/components/tf_backend_d3v4:all_files",
+        "//tensorflow/tensorboard/components/tf_backend_d3v4/test:all_files",
+        "//tensorflow/tensorboard/components/tf_color_scale:all_files",
+        "//tensorflow/tensorboard/components/tf_color_scale/demo:all_files",
+        "//tensorflow/tensorboard/components/tf_color_scale_d3v4:all_files",
+        "//tensorflow/tensorboard/components/tf_color_scale_d3v4/test:all_files",
+        "//tensorflow/tensorboard/components/tf_dashboard_common:all_files",
+        "//tensorflow/tensorboard/components/tf_dashboard_common/demo:all_files",
+        "//tensorflow/tensorboard/components/tf_dashboard_common_d3v4:all_files",
+        "//tensorflow/tensorboard/components/tf_dashboard_common_d3v4/test:all_files",
+        "//tensorflow/tensorboard/components/tf_distribution_dashboard:all_files",
+        "//tensorflow/tensorboard/components/tf_distribution_dashboard/demo:all_files",
+        "//tensorflow/tensorboard/components/tf_distribution_dashboard/demo/data:all_files",
+        "//tensorflow/tensorboard/components/tf_distribution_dashboard_d3v4:all_files",
+        "//tensorflow/tensorboard/components/tf_globals:all_files",
+        "//tensorflow/tensorboard/components/tf_globals_d3v4:all_files",
+        "//tensorflow/tensorboard/components/tf_graph:all_files",
+        "//tensorflow/tensorboard/components/tf_graph/demo:all_files",
+        "//tensorflow/tensorboard/components/tf_graph_app:all_files",
+        "//tensorflow/tensorboard/components/tf_graph_app/demo:all_files",
+        "//tensorflow/tensorboard/components/tf_graph_app_d3v4:all_files",
+        "//tensorflow/tensorboard/components/tf_graph_app_d3v4/demo:all_files",
+        "//tensorflow/tensorboard/components/tf_graph_board:all_files",
+        "//tensorflow/tensorboard/components/tf_graph_board/demo:all_files",
+        "//tensorflow/tensorboard/components/tf_graph_board_d3v4:all_files",
+        "//tensorflow/tensorboard/components/tf_graph_board_d3v4/demo:all_files",
+        "//tensorflow/tensorboard/components/tf_graph_common:all_files",
+        "//tensorflow/tensorboard/components/tf_graph_common_d3v4:all_files",
+        "//tensorflow/tensorboard/components/tf_graph_controls:all_files",
+        "//tensorflow/tensorboard/components/tf_graph_controls/demo:all_files",
+        "//tensorflow/tensorboard/components/tf_graph_controls_d3v4:all_files",
+        "//tensorflow/tensorboard/components/tf_graph_controls_d3v4/demo:all_files",
+        "//tensorflow/tensorboard/components/tf_graph_d3v4:all_files",
+        "//tensorflow/tensorboard/components/tf_graph_d3v4/demo:all_files",
+        "//tensorflow/tensorboard/components/tf_graph_dashboard:all_files",
+        "//tensorflow/tensorboard/components/tf_graph_dashboard/demo:all_files",
+        "//tensorflow/tensorboard/components/tf_graph_dashboard_d3v4:all_files",
+        "//tensorflow/tensorboard/components/tf_graph_dashboard_d3v4/demo:all_files",
+        "//tensorflow/tensorboard/components/tf_graph_info:all_files",
+        "//tensorflow/tensorboard/components/tf_graph_info/demo:all_files",
+        "//tensorflow/tensorboard/components/tf_graph_info_d3v4:all_files",
+        "//tensorflow/tensorboard/components/tf_graph_info_d3v4/demo:all_files",
+        "//tensorflow/tensorboard/components/tf_graph_loader:all_files",
+        "//tensorflow/tensorboard/components/tf_graph_loader/demo:all_files",
+        "//tensorflow/tensorboard/components/tf_graph_loader_d3v4:all_files",
+        "//tensorflow/tensorboard/components/tf_graph_loader_d3v4/demo:all_files",
+        "//tensorflow/tensorboard/components/tf_histogram_dashboard:all_files",
+        "//tensorflow/tensorboard/components/tf_histogram_dashboard/demo:all_files",
+        "//tensorflow/tensorboard/components/tf_histogram_dashboard/demo/data:all_files",
+        "//tensorflow/tensorboard/components/tf_histogram_dashboard_d3v4:all_files",
+        "//tensorflow/tensorboard/components/tf_image_dashboard:all_files",
+        "//tensorflow/tensorboard/components/tf_image_dashboard/demo:all_files",
+        "//tensorflow/tensorboard/components/tf_image_dashboard/demo/data:all_files",
+        "//tensorflow/tensorboard/components/tf_image_dashboard_d3v4:all_files",
+        "//tensorflow/tensorboard/components/tf_imports:all_files",
+        "//tensorflow/tensorboard/components/tf_imports_d3v4:all_files",
+        "//tensorflow/tensorboard/components/tf_option_selector_d3v4:all_files",
+        "//tensorflow/tensorboard/components/tf_scalar_dashboard:all_files",
+        "//tensorflow/tensorboard/components/tf_scalar_dashboard/demo:all_files",
+        "//tensorflow/tensorboard/components/tf_scalar_dashboard/demo/data:all_files",
+        "//tensorflow/tensorboard/components/tf_scalar_dashboard_d3v4:all_files",
+        "//tensorflow/tensorboard/components/tf_scalar_dashboard_d3v4/demo:all_files",
+        "//tensorflow/tensorboard/components/tf_storage:all_files",
+        "//tensorflow/tensorboard/components/tf_storage_d3v4:all_files",
+        "//tensorflow/tensorboard/components/tf_storage_d3v4/test:all_files",
+        "//tensorflow/tensorboard/components/tf_tensorboard_d3v4:all_files",
+        "//tensorflow/tensorboard/components/tf_text_dashboard:all_files",
+        "//tensorflow/tensorboard/components/tf_text_dashboard/demo:all_files",
+        "//tensorflow/tensorboard/components/tf_text_dashboard/demo/data:all_files",
+        "//tensorflow/tensorboard/components/tf_text_dashboard_d3v4:all_files",
+        "//tensorflow/tensorboard/components/vz_data_summary:all_files",
+        "//tensorflow/tensorboard/components/vz_distribution_chart:all_files",
+        "//tensorflow/tensorboard/components/vz_distribution_chart/demo:all_files",
+        "//tensorflow/tensorboard/components/vz_distribution_chart_d3v4:all_files",
+        "//tensorflow/tensorboard/components/vz_histogram_timeseries:all_files",
+        "//tensorflow/tensorboard/components/vz_histogram_timeseries/demo:all_files",
+        "//tensorflow/tensorboard/components/vz_histogram_timeseries_d3v4:all_files",
+        "//tensorflow/tensorboard/components/vz_line_chart:all_files",
+        "//tensorflow/tensorboard/components/vz_line_chart/demo:all_files",
+        "//tensorflow/tensorboard/components/vz_line_chart_d3v4:all_files",
+        "//tensorflow/tensorboard/components/vz_projector:all_files",
+        "//tensorflow/tensorboard/components/vz_projector_d3v4:all_files",
+        "//tensorflow/tensorboard/components/vz_projector_d3v4/test:all_files",
+        "//tensorflow/tensorboard/components/vz_sorting:all_files",
+        "//tensorflow/tensorboard/components/vz_sorting/test:all_files",
+        "//tensorflow/tensorboard/components/vz_sorting_d3v4:all_files",
+        "//tensorflow/tensorboard/components/vz_sorting_d3v4/test:all_files",
+        "//tensorflow/tensorboard/demo:all_files",
+        "//tensorflow/tensorboard/java/org/tensorflow/tensorboard/vulcanize:all_files",
         "//tensorflow/tensorboard/lib:all_files",
         "//tensorflow/tensorboard/plugins:all_files",
         "//tensorflow/tensorboard/plugins/projector:all_files",
diff --git a/tensorflow/c/BUILD b/tensorflow/c/BUILD
index 4ad69ae3fbdfbb8e3ab3c868fea4976c59dd9e71..3ab4e8efcdb5b05cf8922edd302e7cbf3a3597f1 100644
--- a/tensorflow/c/BUILD
+++ b/tensorflow/c/BUILD
@@ -58,6 +58,7 @@ tf_cuda_library(
             "//tensorflow/cc/saved_model:loader",
             "//tensorflow/cc:gradients",
             "//tensorflow/cc:ops",
+            "//tensorflow/cc:grad_ops",
             "//tensorflow/cc:scope_internal",
             "//tensorflow/core:core_cpu",
             "//tensorflow/core:framework",
diff --git a/tensorflow/c/c_api.cc b/tensorflow/c/c_api.cc
index 0f66a47b4ad66ecfbbdfbed15dddb378c62308b9..f4775783f9f88c941445b62603c92cae00d34715 100644
--- a/tensorflow/c/c_api.cc
+++ b/tensorflow/c/c_api.cc
@@ -738,8 +738,7 @@ tensorflow::string OutputName(const TF_Output& output) {
 const tensorflow::AttrValue* GetAttrValue(TF_Operation* oper,
                                           const char* attr_name,
                                           TF_Status* status) {
-  const tensorflow::AttrValue* attr =
-      tensorflow::AttrSlice(oper->node.def()).Find(attr_name);
+  const tensorflow::AttrValue* attr = oper->node.attrs().Find(attr_name);
   if (attr == nullptr) {
     status->status =
         InvalidArgument("Operation has no attr named '", attr_name, "'.");
@@ -1101,14 +1100,14 @@ static TF_Operation* TF_FinishOperationLocked(TF_OperationDescription* desc,
 
     if (status->status.ok()) {
       // Run shape inference function for newly added node.
-      //
-      // TODO(b/28152992): Enable returning the result of this
-      // code-path once we have converted all python shape functions
-      // to call their C++ versions.
-      desc->graph->refiner.AddNode(ret).IgnoreError();
-
+      status->status = desc->graph->refiner.AddNode(ret);
+    }
+    if (status->status.ok()) {
       // Add the node to the name-to-node mapping.
       desc->graph->name_map[ret->name()] = ret;
+    } else if (ret != nullptr) {
+      desc->graph->graph.RemoveNode(ret);
+      ret = nullptr;
     }
   }
 
@@ -1135,7 +1134,7 @@ const char* TF_OperationOpType(TF_Operation* oper) {
 }
 
 const char* TF_OperationDevice(TF_Operation* oper) {
-  return oper->node.def().device().c_str();
+  return oper->node.requested_device().c_str();
 }
 
 int TF_OperationNumOutputs(TF_Operation* oper) {
@@ -1150,8 +1149,8 @@ TF_DataType TF_OperationOutputType(TF_Output oper_out) {
 int TF_OperationOutputListLength(TF_Operation* oper, const char* arg_name,
                                  TF_Status* status) {
   NameRangeMap name_ranges;
-  status->status = NameRangesForNode(oper->node.def(), oper->node.op_def(),
-                                     nullptr, &name_ranges);
+  status->status =
+      NameRangesForNode(oper->node, oper->node.op_def(), nullptr, &name_ranges);
   if (!status->status.ok()) return -1;
   auto iter = name_ranges.find(arg_name);
   if (iter == name_ranges.end()) {
@@ -1172,8 +1171,8 @@ TF_DataType TF_OperationInputType(TF_Input oper_in) {
 int TF_OperationInputListLength(TF_Operation* oper, const char* arg_name,
                                 TF_Status* status) {
   NameRangeMap name_ranges;
-  status->status = NameRangesForNode(oper->node.def(), oper->node.op_def(),
-                                     &name_ranges, nullptr);
+  status->status =
+      NameRangesForNode(oper->node, oper->node.op_def(), &name_ranges, nullptr);
   if (!status->status.ok()) return -1;
   auto iter = name_ranges.find(arg_name);
   if (iter == name_ranges.end()) {
@@ -1411,26 +1410,27 @@ void TF_OperationGetAttrStringList(TF_Operation* oper, const char* attr_name,
   }
 }
 
-#define DEFINE_GETATTR(func, c_type, cpp_type, list_field)                     \
-  void func(TF_Operation* oper, const char* attr_name, c_type* value,          \
-            TF_Status* status) {                                               \
-    cpp_type v;                                                                \
-    status->status = tensorflow::GetNodeAttr(oper->node.def(), attr_name, &v); \
-    *value = static_cast<c_type>(v);                                           \
-  }                                                                            \
-  void func##List(TF_Operation* oper, const char* attr_name, c_type* values,   \
-                  int max_values, TF_Status* status) {                         \
-    const auto* attr = GetAttrValue(oper, attr_name, status);                  \
-    if (!status->status.ok()) return;                                          \
-    if (attr->value_case() != tensorflow::AttrValue::kList) {                  \
-      status->status =                                                         \
-          InvalidArgument("Value for '", attr_name, "' is not a list.");       \
-      return;                                                                  \
-    }                                                                          \
-    const auto len = std::min(max_values, attr->list().list_field##_size());   \
-    for (int i = 0; i < len; ++i) {                                            \
-      values[i] = static_cast<c_type>(attr->list().list_field(i));             \
-    }                                                                          \
+#define DEFINE_GETATTR(func, c_type, cpp_type, list_field)                   \
+  void func(TF_Operation* oper, const char* attr_name, c_type* value,        \
+            TF_Status* status) {                                             \
+    cpp_type v;                                                              \
+    status->status =                                                         \
+        tensorflow::GetNodeAttr(oper->node.attrs(), attr_name, &v);          \
+    *value = static_cast<c_type>(v);                                         \
+  }                                                                          \
+  void func##List(TF_Operation* oper, const char* attr_name, c_type* values, \
+                  int max_values, TF_Status* status) {                       \
+    const auto* attr = GetAttrValue(oper, attr_name, status);                \
+    if (!status->status.ok()) return;                                        \
+    if (attr->value_case() != tensorflow::AttrValue::kList) {                \
+      status->status =                                                       \
+          InvalidArgument("Value for '", attr_name, "' is not a list.");     \
+      return;                                                                \
+    }                                                                        \
+    const auto len = std::min(max_values, attr->list().list_field##_size()); \
+    for (int i = 0; i < len; ++i) {                                          \
+      values[i] = static_cast<c_type>(attr->list().list_field(i));           \
+    }                                                                        \
   }
 DEFINE_GETATTR(TF_OperationGetAttrInt, int64_t, tensorflow::int64, i);
 DEFINE_GETATTR(TF_OperationGetAttrFloat, float, float, f);
@@ -1441,7 +1441,8 @@ DEFINE_GETATTR(TF_OperationGetAttrType, TF_DataType, DataType, type);
 void TF_OperationGetAttrShape(TF_Operation* oper, const char* attr_name,
                               int64_t* value, int num_dims, TF_Status* status) {
   PartialTensorShape shape;
-  status->status = tensorflow::GetNodeAttr(oper->node.def(), attr_name, &shape);
+  status->status =
+      tensorflow::GetNodeAttr(oper->node.attrs(), attr_name, &shape);
   if (!status->status.ok()) return;
   auto len = std::min(shape.dims(), num_dims);
   for (int i = 0; i < len; ++i) {
@@ -1455,7 +1456,7 @@ void TF_OperationGetAttrShapeList(TF_Operation* oper, const char* attr_name,
                                   int storage_size, TF_Status* status) {
   std::vector<PartialTensorShape> shapes;
   status->status =
-      tensorflow::GetNodeAttr(oper->node.def(), attr_name, &shapes);
+      tensorflow::GetNodeAttr(oper->node.attrs(), attr_name, &shapes);
   if (!status->status.ok()) return;
   auto len = std::min(static_cast<int>(shapes.size()), max_values);
   int64_t* p = storage;
@@ -1522,7 +1523,7 @@ void TF_OperationGetAttrTensor(TF_Operation* oper, const char* attr_name,
                                TF_Tensor** value, TF_Status* status) {
   *value = nullptr;
   Tensor t;
-  status->status = tensorflow::GetNodeAttr(oper->node.def(), attr_name, &t);
+  status->status = tensorflow::GetNodeAttr(oper->node.attrs(), attr_name, &t);
   if (!status->status.ok()) return;
   *value = new TF_Tensor{static_cast<TF_DataType>(t.dtype()), t.shape(),
                          tensorflow::TensorCApi::Buffer(t)};
@@ -1533,7 +1534,7 @@ void TF_OperationGetAttrTensorList(TF_Operation* oper, const char* attr_name,
                                    TF_Tensor** values, int max_values,
                                    TF_Status* status) {
   std::vector<Tensor> ts;
-  status->status = tensorflow::GetNodeAttr(oper->node.def(), attr_name, &ts);
+  status->status = tensorflow::GetNodeAttr(oper->node.attrs(), attr_name, &ts);
   if (!status->status.ok()) return;
   const auto len = std::min(max_values, static_cast<int>(ts.size()));
   for (int i = 0; i < len; ++i) {
diff --git a/tensorflow/c/c_api.h b/tensorflow/c/c_api.h
index e2aeef0d88f87f0e1567db81576c8639fe82b01b..ec9b01b388d1138644e28e3206e32726347b3d5e 100644
--- a/tensorflow/c/c_api.h
+++ b/tensorflow/c/c_api.h
@@ -95,7 +95,7 @@ TF_CAPI_EXPORT extern const char* TF_Version();
 // --------------------------------------------------------------------------
 // TF_DataType holds the type for a scalar value.  E.g., one slot in a tensor.
 // The enum values here are identical to corresponding values in types.proto.
-typedef enum {
+typedef enum TF_DataType {
   TF_FLOAT = 1,
   TF_DOUBLE = 2,
   TF_INT32 = 3,  // Int32 tensors are always in 'host' memory.
@@ -127,7 +127,7 @@ TF_CAPI_EXPORT extern size_t TF_DataTypeSize(TF_DataType dt);
 // --------------------------------------------------------------------------
 // TF_Code holds an error code.  The enum values here are identical to
 // corresponding values in error_codes.proto.
-typedef enum {
+typedef enum TF_Code {
   TF_OK = 0,
   TF_CANCELLED = 1,
   TF_UNKNOWN = 2,
@@ -629,7 +629,7 @@ TF_CAPI_EXPORT extern int TF_OperationGetControlOutputs(
     int max_control_outputs);
 
 // TF_AttrType describes the type of the value of an attribute on an operation.
-typedef enum {
+typedef enum TF_AttrType {
   TF_ATTR_STRING = 0,
   TF_ATTR_INT = 1,
   TF_ATTR_FLOAT = 2,
diff --git a/tensorflow/c/c_api_test.cc b/tensorflow/c/c_api_test.cc
index 0ddc59db20e6d8cf08f37155431285b69c625302..cdb7406c86e8b10d24c303615d13089272bcab5d 100644
--- a/tensorflow/c/c_api_test.cc
+++ b/tensorflow/c/c_api_test.cc
@@ -23,6 +23,7 @@ limitations under the License.
 #include "tensorflow/cc/saved_model/tag_constants.h"
 #include "tensorflow/core/example/example.pb.h"
 #include "tensorflow/core/example/feature.pb.h"
+#include "tensorflow/core/framework/common_shape_fns.h"
 #include "tensorflow/core/framework/graph.pb_text.h"
 #include "tensorflow/core/framework/node_def.pb_text.h"
 #include "tensorflow/core/framework/node_def_util.h"
@@ -278,6 +279,19 @@ static void Int32Deallocator(void* data, size_t, void* arg) {
   delete[] static_cast<int32*>(data);
 }
 
+// Create a tensor with values of type TF_INT8 provided by `values`.
+static TF_Tensor* Int8Tensor(const int64_t* dims, int num_dims,
+                             const char* values) {
+  int64_t num_values = 1;
+  for (int i = 0; i < num_dims; ++i) {
+    num_values *= dims[i];
+  }
+  TF_Tensor* t =
+      TF_AllocateTensor(TF_INT8, dims, num_dims, sizeof(char) * num_values);
+  memcpy(TF_TensorData(t), values, sizeof(char) * num_values);
+  return t;
+}
+
 static TF_Tensor* Int32Tensor(int32 v) {
   const int num_bytes = sizeof(int32);
   int32* values = new int32[1];
@@ -293,16 +307,21 @@ TF_Operation* Placeholder(TF_Graph* graph, TF_Status* s,
   return TF_FinishOperation(desc, s);
 }
 
-TF_Operation* ScalarConst(int32 v, TF_Graph* graph, TF_Status* s,
-                          const char* name = "scalar") {
-  unique_tensor_ptr tensor(Int32Tensor(v), TF_DeleteTensor);
+TF_Operation* Const(TF_Tensor* t, TF_Graph* graph, TF_Status* s,
+                    const char* name = "const") {
   TF_OperationDescription* desc = TF_NewOperation(graph, "Const", name);
-  TF_SetAttrTensor(desc, "value", tensor.get(), s);
+  TF_SetAttrTensor(desc, "value", t, s);
   if (TF_GetCode(s) != TF_OK) return nullptr;
-  TF_SetAttrType(desc, "dtype", TF_INT32);
+  TF_SetAttrType(desc, "dtype", TF_TensorType(t));
   return TF_FinishOperation(desc, s);
 }
 
+TF_Operation* ScalarConst(int32 v, TF_Graph* graph, TF_Status* s,
+                          const char* name = "scalar") {
+  unique_tensor_ptr tensor(Int32Tensor(v), TF_DeleteTensor);
+  return Const(tensor.get(), graph, s, name);
+}
+
 TF_Operation* Add(TF_Operation* l, TF_Operation* r, TF_Graph* graph,
                   TF_Status* s, const char* name = "add") {
   TF_OperationDescription* desc = TF_NewOperation(graph, "AddN", name);
@@ -1093,6 +1112,35 @@ TEST(CAPI, SessionPRun) {
   TF_DeleteStatus(s);
 }
 
+TEST(CAPI, ShapeInferenceError) {
+  // TF_FinishOperation should fail if the shape of the added operation cannot
+  // be inferred.
+  TF_Status* status = TF_NewStatus();
+  TF_Graph* graph = TF_NewGraph();
+
+  // Create this failure by trying to add two nodes with incompatible shapes
+  // (A tensor with shape [2] and a tensor with shape [3] cannot be added).
+  const char data[] = {1, 2, 3};
+  const int64_t vec2_dims[] = {2};
+  unique_tensor_ptr vec2_tensor(
+      Int8Tensor(vec2_dims, TF_ARRAYSIZE(vec2_dims), data), TF_DeleteTensor);
+  TF_Operation* vec2 = Const(vec2_tensor.get(), graph, status, "vec2");
+  ASSERT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
+
+  const int64_t vec3_dims[] = {3};
+  unique_tensor_ptr vec3_tensor(
+      Int8Tensor(vec3_dims, TF_ARRAYSIZE(vec3_dims), data), TF_DeleteTensor);
+  TF_Operation* vec3 = Const(vec3_tensor.get(), graph, status, "vec3");
+  ASSERT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
+
+  TF_Operation* add = Add(vec2, vec3, graph, status);
+  ASSERT_NE(TF_OK, TF_GetCode(status));
+  ASSERT_TRUE(add == nullptr);
+
+  TF_DeleteGraph(graph);
+  TF_DeleteStatus(status);
+}
+
 TEST(CAPI, ColocateWith) {
   TF_Status* s = TF_NewStatus();
   TF_Graph* graph = TF_NewGraph();
@@ -1535,7 +1583,8 @@ Test op with no grad registered.
 
 x: input
 y: output
-)doc");
+)doc")
+    .SetShapeFn(tensorflow::shape_inference::UnknownShape);
 
 class CApiGradientsTest : public ::testing::Test {
  protected:
@@ -1801,18 +1850,6 @@ TEST_F(CApiGradientsTest, OpWithNoGradientRegistered_NoGradInputs) {
   TestGradientsError(false);
 }
 
-// Create a tensor with values of type TF_INT8 provided by `values`.
-TF_Tensor* Int8Tensor(const int64_t* dims, int num_dims, const char* values) {
-  int64_t num_values = 1;
-  for (int i = 0; i < num_dims; ++i) {
-    num_values *= dims[i];
-  }
-  TF_Tensor* t =
-      TF_AllocateTensor(TF_INT8, dims, num_dims, sizeof(char) * num_values);
-  memcpy(TF_TensorData(t), values, sizeof(char) * num_values);
-  return t;
-}
-
 void StringVectorToArrays(const std::vector<string>& v,
                           std::unique_ptr<const void* []>* ptrs,
                           std::unique_ptr<size_t[]>* lens) {
@@ -1828,9 +1865,13 @@ void StringVectorToArrays(const std::vector<string>& v,
 // Registers two ops, each with a single attribute called 'v'.
 // The attribute in one op will have a type 'type', the other
 // will have list(type).
-#define ATTR_TEST_REGISTER_OP(type)                            \
-  REGISTER_OP("CApiAttributesTestOp" #type).Attr("v: " #type); \
-  REGISTER_OP("CApiAttributesTestOpList" #type).Attr("v: list(" #type ")")
+#define ATTR_TEST_REGISTER_OP(type)                           \
+  REGISTER_OP("CApiAttributesTestOp" #type)                   \
+      .Attr("v: " #type)                                      \
+      .SetShapeFn(tensorflow::shape_inference::UnknownShape); \
+  REGISTER_OP("CApiAttributesTestOpList" #type)               \
+      .Attr("v: list(" #type ")")                             \
+      .SetShapeFn(tensorflow::shape_inference::UnknownShape)
 ATTR_TEST_REGISTER_OP(string);
 ATTR_TEST_REGISTER_OP(int);
 ATTR_TEST_REGISTER_OP(float);
diff --git a/tensorflow/c/generate-pc.sh b/tensorflow/c/generate-pc.sh
new file mode 100755
index 0000000000000000000000000000000000000000..ea2eed011c62e535047e5f40d1f5b34fbb6ad2be
--- /dev/null
+++ b/tensorflow/c/generate-pc.sh
@@ -0,0 +1,49 @@
+#!/usr/bin/env bash
+
+TF_PREFIX='/usr/local'
+
+usage() {
+    echo "Usage: $0 OPTIONS"
+    echo -e "-p, --prefix\tset installation prefix (default: /usr/local)"
+    echo -e "-v, --version\tset TensorFlow version"
+    echo -e "-h, --help\tdisplay this message"
+}
+
+# read the options
+ARGS=`getopt -o p:v:h --long prefix:,version:,help -n $0 -- "$@"`
+eval set -- "$ARGS"
+
+# extract options and their arguments into variables.
+while true ; do
+    case "$1" in
+        -h|--help) usage ; exit ;;
+        -p|--prefix)
+            case "$2" in
+                "") shift 2 ;;
+                *) TF_PREFIX=$2 ; shift 2 ;;
+            esac ;;
+        -v|--version)
+            case "$2" in
+                "") shift 2 ;;
+                *) TF_VERSION=$2 ; shift 2 ;;
+            esac ;;
+        --) shift ; echo "Try '$0 --help' for more information."; exit 1 ;;
+        *) echo "Internal error! Try '$0 --help' for more information." ; exit 1 ;;
+    esac
+done
+
+echo "Generating pkgconfig file for TensorFlow $TF_VERSION in $TF_PREFIX"
+
+cat << EOF > tensorflow.pc
+prefix=${TF_PREFIX}
+exec_prefix=\${prefix}
+libdir=\${exec_prefix}/lib
+includedir=\${prefix}/include
+
+Name: TensorFlow
+Version: ${TF_VERSION}
+Description: Library for computation using data flow graphs for scalable machine learning
+Requires:
+Libs: -L\${libdir} -ltensorflow
+Cflags: -I\${includedir}
+EOF
diff --git a/tensorflow/cc/BUILD b/tensorflow/cc/BUILD
index 42fa139282a524f761dbebb2b55cf1ae043526e5..8d4260a0b9ca38593a912398e8460d826fb31ccf 100644
--- a/tensorflow/cc/BUILD
+++ b/tensorflow/cc/BUILD
@@ -91,6 +91,7 @@ cc_library(
     deps = [
         ":array_grad",
         ":math_grad",
+        ":nn_grad",
     ],
 )
 
@@ -388,6 +389,16 @@ tf_gen_op_wrappers_cc(
     visibility = ["//tensorflow:internal"],
 )
 
+tf_gen_op_wrappers_cc(
+    name = "functional_ops",
+    include_internal_ops = 1,
+    op_lib_names = [
+        "functional_ops",
+    ],
+    pkg = "//tensorflow/core",
+    visibility = ["//tensorflow:internal"],
+)
+
 tf_gen_op_wrappers_cc(
     name = "resource_variable_ops",
     include_internal_ops = 1,
diff --git a/tensorflow/cc/client/client_session.cc b/tensorflow/cc/client/client_session.cc
index 2732f3f5010d7522a1cf8631183e9b4df7ac86d8..2879445441d0a80c1320a30976412b416feaecc9 100644
--- a/tensorflow/cc/client/client_session.cc
+++ b/tensorflow/cc/client/client_session.cc
@@ -16,6 +16,7 @@ limitations under the License.
 #include "tensorflow/cc/client/client_session.h"
 
 #include <unordered_map>
+#include <utility>
 #include <vector>
 
 #include "tensorflow/core/platform/env.h"
@@ -31,7 +32,7 @@ class ClientSession::Impl {
   friend class ClientSession;
 
   Impl(Session* session, std::shared_ptr<Graph> graph)
-      : session_(session), graph_(graph) {}
+      : session_(session), graph_(std::move(graph)) {}
 
   static SessionOptions MakeDefaultSessionOptions(const string& target);
   Status MaybeExtendGraph() const;
diff --git a/tensorflow/cc/framework/cc_op_gen.cc b/tensorflow/cc/framework/cc_op_gen.cc
index b7e9948e9d4f3ed3e655802fce4d1febcf68c07f..71aa986f918de68822d457422f6c7a73d6253819 100644
--- a/tensorflow/cc/framework/cc_op_gen.cc
+++ b/tensorflow/cc/framework/cc_op_gen.cc
@@ -198,7 +198,7 @@ string PrintTensorProto(const TensorProto& proto) {
                          ").AsTensorProto()");
 }
 
-string PrintAttrValue(string op, const AttrValue& attr_value) {
+string PrintAttrValue(const string& op, const AttrValue& attr_value) {
   switch (attr_value.value_case()) {
     case AttrValue::kS:
       return PrintString(attr_value.s());
@@ -740,11 +740,10 @@ void OpInfo::GetOutput(string* out) const {
     return;
   }
   strings::StrAppend(out, "  ::tensorflow::NameRangeMap _outputs_range;\n");
-  strings::StrAppend(
-      out,
-      "  ::tensorflow::Status _status_ = "
-      "::tensorflow::NameRangesForNode(ret->def(), ret->op_def(), "
-      "nullptr, &_outputs_range);\n");
+  strings::StrAppend(out,
+                     "  ::tensorflow::Status _status_ = "
+                     "::tensorflow::NameRangesForNode(*ret, ret->op_def(), "
+                     "nullptr, &_outputs_range);\n");
   strings::StrAppend(out, "  if (!_status_.ok()) {\n", "    ", scope_str,
                      ".UpdateStatus(_status_);\n", "    return;\n");
   strings::StrAppend(out, "  }\n\n");
diff --git a/tensorflow/cc/framework/cc_ops_test.cc b/tensorflow/cc/framework/cc_ops_test.cc
index 6dc0d84c16d5b534341575b384997cc398c80bec..5da23036eaadbef270ba839357dc4613bf3bf490 100644
--- a/tensorflow/cc/framework/cc_ops_test.cc
+++ b/tensorflow/cc/framework/cc_ops_test.cc
@@ -32,10 +32,11 @@ Output Linear(const Scope& scope, Input x, Input w, Input b) {
   return BiasAdd(cop_scopes.last, m, b);
 }
 
-void GetColocationConstraints(Output tensor, std::vector<string>* constraints) {
+void GetColocationConstraints(const Output& tensor,
+                              std::vector<string>* constraints) {
   constraints->clear();
-  TF_EXPECT_OK(
-      GetNodeAttr(tensor.op().node()->def(), kColocationAttrName, constraints));
+  TF_EXPECT_OK(GetNodeAttr(tensor.op().node()->attrs(), kColocationAttrName,
+                           constraints));
 }
 
 }  // namespace
@@ -158,11 +159,11 @@ TEST(CCOpTest, KernelLabel) {
   Scope root = Scope::NewRootScope();
   auto add = Add(root.WithKernelLabel("AddWithKernelLabel"), 1.0f, 2.0f);
   TF_EXPECT_OK(root.status());
-  const auto& attrs = add.z.op().node()->def().attr();
-  ASSERT_TRUE(attrs.find("_kernel") != attrs.end());
-  auto kernel_attr = attrs.find("_kernel")->second;
-  TF_EXPECT_OK(AttrValueHasType(kernel_attr, "string"));
-  EXPECT_EQ(kernel_attr.s(), "AddWithKernelLabel");
+  AttrSlice attrs = add.z.op().node()->attrs();
+  const auto* kernel_attr = attrs.Find("_kernel");
+  ASSERT_TRUE(kernel_attr);
+  TF_EXPECT_OK(AttrValueHasType(*kernel_attr, "string"));
+  EXPECT_EQ(kernel_attr->s(), "AddWithKernelLabel");
 }
 
 TEST(CCOpTest, ColocateWith) {
@@ -189,8 +190,7 @@ TEST(CCOpTest, ColocateWith) {
 
   Scope with_colocate = root.ColocateWith(c3).ColocateWith(c4);
   auto c6 = Const(with_colocate.WithOpName("c6").ClearColocation(), 7);
-  const auto& attrs = c6.op().node()->def().attr();
-  EXPECT_TRUE(attrs.find("_class") == attrs.end());
+  EXPECT_FALSE(c6.op().node()->attrs().Find("_class"));
 }
 
 TEST(CCOpTest, TemplatedConst) {
diff --git a/tensorflow/cc/framework/gradients_test.cc b/tensorflow/cc/framework/gradients_test.cc
index 7783bdce3a7ef72ed157d620bf43517af79e1aaf..6a249825812b4d39b55f7170a35436b6ae88c020 100644
--- a/tensorflow/cc/framework/gradients_test.cc
+++ b/tensorflow/cc/framework/gradients_test.cc
@@ -260,7 +260,7 @@ TEST_F(GradientsTest, StackUnstack_StopBackprop) {
 }
 
 TEST_F(GradientsTest, DependentGradOutputs) {
-  // Tests that dependant gradients (in this case the gradients w.r.t to the
+  // Tests that dependent gradients (in this case the gradients w.r.t to the
   // output and one input of MatMul) are computed properly.
 
   // Create two chained MatMul ops.
diff --git a/tensorflow/cc/framework/scope.cc b/tensorflow/cc/framework/scope.cc
index 8b7fc1406f06e80590a98c65dd79be858b21cc0d..32c0822de69da7989ceaa4028539db928b6fcea3 100644
--- a/tensorflow/cc/framework/scope.cc
+++ b/tensorflow/cc/framework/scope.cc
@@ -271,9 +271,9 @@ Scope::Impl::Impl(const Scope& other, Tags::Colocate,
 std::unordered_set<string> Scope::Impl::GetColocationConstraints(
     const Operation& colocate_with_op) const {
   std::unordered_set<string> current_constraints(colocation_constraints_);
-  const NodeDef& node_def = colocate_with_op.node()->def();
+  const AttrSlice attrs = colocate_with_op.node()->attrs();
   std::vector<string> node_constraints;
-  if (GetNodeAttr(node_def, kColocationAttrName, &node_constraints).ok()) {
+  if (GetNodeAttr(attrs, kColocationAttrName, &node_constraints).ok()) {
     for (const string& entry : node_constraints) {
       StringPiece s(entry);
       if (s.Consume(kColocationGroupPrefix)) {
diff --git a/tensorflow/cc/gradients/array_grad.cc b/tensorflow/cc/gradients/array_grad.cc
index 26abd2438e652f29a1d25caf689ab0606a12b00a..37f07e71a0dff9144f193679bbcfcf581c1538cf 100644
--- a/tensorflow/cc/gradients/array_grad.cc
+++ b/tensorflow/cc/gradients/array_grad.cc
@@ -43,9 +43,9 @@ Status PackGrad(const Scope& scope, const Operation& op,
                 const std::vector<Output>& grad_inputs,
                 std::vector<Output>* grad_outputs) {
   int N;
-  TF_RETURN_IF_ERROR(GetNodeAttr(op.node()->def(), "N", &N));
+  TF_RETURN_IF_ERROR(GetNodeAttr(op.node()->attrs(), "N", &N));
   int axis;
-  TF_RETURN_IF_ERROR(GetNodeAttr(op.node()->def(), "axis", &axis));
+  TF_RETURN_IF_ERROR(GetNodeAttr(op.node()->attrs(), "axis", &axis));
 
   grad_outputs->reserve(N);
   auto grad_op = Unstack(scope, grad_inputs[0], N, Unstack::Axis(axis));
@@ -60,7 +60,7 @@ Status UnpackGrad(const Scope& scope, const Operation& op,
                   const std::vector<Output>& grad_inputs,
                   std::vector<Output>* grad_outputs) {
   int axis;
-  TF_RETURN_IF_ERROR(GetNodeAttr(op.node()->def(), "axis", &axis));
+  TF_RETURN_IF_ERROR(GetNodeAttr(op.node()->attrs(), "axis", &axis));
   grad_outputs->push_back(Stack(scope, grad_inputs, Stack::Axis(axis)));
   return scope.status();
 }
@@ -162,7 +162,7 @@ Status CheckNumericsGrad(const Scope& scope, const Operation& op,
                          const std::vector<Output>& grad_inputs,
                          std::vector<Output>* grad_outputs) {
   string message;
-  TF_RETURN_IF_ERROR(GetNodeAttr(op.node()->def(), "message", &message));
+  TF_RETURN_IF_ERROR(GetNodeAttr(op.node()->attrs(), "message", &message));
   string err_msg = strings::StrCat(
       "Not a number (NaN) or infinity (Inf) values detected in gradient. ",
       message);
@@ -215,9 +215,9 @@ Status ReverseSequenceGrad(const Scope& scope, const Operation& op,
                            std::vector<Output>* grad_outputs) {
   auto seq_lengths = op.input(1);
   int batch_dim;
-  TF_RETURN_IF_ERROR(GetNodeAttr(op.node()->def(), "batch_dim", &batch_dim));
+  TF_RETURN_IF_ERROR(GetNodeAttr(op.node()->attrs(), "batch_dim", &batch_dim));
   int seq_dim;
-  TF_RETURN_IF_ERROR(GetNodeAttr(op.node()->def(), "seq_dim", &seq_dim));
+  TF_RETURN_IF_ERROR(GetNodeAttr(op.node()->attrs(), "seq_dim", &seq_dim));
   grad_outputs->push_back(
       ReverseSequence(scope, grad_inputs[0], seq_lengths, seq_dim,
                       ReverseSequence::BatchDim(batch_dim)));
@@ -267,7 +267,8 @@ Status SpaceToBatchGrad(const Scope& scope, const Operation& op,
                         const std::vector<Output>& grad_inputs,
                         std::vector<Output>* grad_outputs) {
   int block_size;
-  TF_RETURN_IF_ERROR(GetNodeAttr(op.node()->def(), "block_size", &block_size));
+  TF_RETURN_IF_ERROR(
+      GetNodeAttr(op.node()->attrs(), "block_size", &block_size));
   grad_outputs->push_back(
       BatchToSpace(scope, grad_inputs[0], op.input(1), block_size));
   grad_outputs->push_back(NoGradient());
@@ -290,7 +291,8 @@ Status BatchToSpaceGrad(const Scope& scope, const Operation& op,
                         const std::vector<Output>& grad_inputs,
                         std::vector<Output>* grad_outputs) {
   int block_size;
-  TF_RETURN_IF_ERROR(GetNodeAttr(op.node()->def(), "block_size", &block_size));
+  TF_RETURN_IF_ERROR(
+      GetNodeAttr(op.node()->attrs(), "block_size", &block_size));
   grad_outputs->push_back(
       SpaceToBatch(scope, grad_inputs[0], op.input(1), block_size));
   grad_outputs->push_back(NoGradient());
@@ -313,7 +315,8 @@ Status SpaceToDepthGrad(const Scope& scope, const Operation& op,
                         const std::vector<Output>& grad_inputs,
                         std::vector<Output>* grad_outputs) {
   int block_size;
-  TF_RETURN_IF_ERROR(GetNodeAttr(op.node()->def(), "block_size", &block_size));
+  TF_RETURN_IF_ERROR(
+      GetNodeAttr(op.node()->attrs(), "block_size", &block_size));
   grad_outputs->push_back(DepthToSpace(scope, grad_inputs[0], block_size));
   return scope.status();
 }
@@ -323,7 +326,8 @@ Status DepthToSpaceGrad(const Scope& scope, const Operation& op,
                         const std::vector<Output>& grad_inputs,
                         std::vector<Output>* grad_outputs) {
   int block_size;
-  TF_RETURN_IF_ERROR(GetNodeAttr(op.node()->def(), "block_size", &block_size));
+  TF_RETURN_IF_ERROR(
+      GetNodeAttr(op.node()->attrs(), "block_size", &block_size));
   grad_outputs->push_back(SpaceToDepth(scope, grad_inputs[0], block_size));
   return scope.status();
 }
@@ -333,7 +337,7 @@ Status MirrorPadGrad(const Scope& scope, const Operation& op,
                      const std::vector<Output>& grad_inputs,
                      std::vector<Output>* grad_outputs) {
   string mode;
-  TF_RETURN_IF_ERROR(GetNodeAttr(op.node()->def(), "mode", &mode));
+  TF_RETURN_IF_ERROR(GetNodeAttr(op.node()->attrs(), "mode", &mode));
   grad_outputs->push_back(tensorflow::ops::internal::MirrorPadGrad(
       scope, grad_inputs[0], op.input(1), mode));
   grad_outputs->push_back(NoGradient());
@@ -346,7 +350,7 @@ Status MirrorPadGradGrad(const Scope& scope, const Operation& op,
                          const std::vector<Output>& grad_inputs,
                          std::vector<Output>* grad_outputs) {
   string mode;
-  TF_RETURN_IF_ERROR(GetNodeAttr(op.node()->def(), "mode", &mode));
+  TF_RETURN_IF_ERROR(GetNodeAttr(op.node()->attrs(), "mode", &mode));
   grad_outputs->push_back(MirrorPad(scope, grad_inputs[0], op.input(1), mode));
   grad_outputs->push_back(NoGradient());
   return scope.status();
diff --git a/tensorflow/cc/gradients/math_grad.cc b/tensorflow/cc/gradients/math_grad.cc
index aff0653139538820a705371ee9446a3d38ca69b5..8c1a01f518f9ad3a4571c2f36c01d4eae712e813 100644
--- a/tensorflow/cc/gradients/math_grad.cc
+++ b/tensorflow/cc/gradients/math_grad.cc
@@ -21,6 +21,17 @@ namespace tensorflow {
 namespace ops {
 namespace {
 
+// Conjugate helper function returns the conjugate of an Output if it
+// is complex valued.
+Output ConjugateHelper(const Scope& scope, const Output& out) {
+  DataType dtype = out.type();
+  if (dtype == DT_COMPLEX64 || dtype == DT_COMPLEX128) {
+    return Conj(scope, out);
+  } else {
+    return out;
+  }
+}
+
 // TODO(andydavis) Add control dependencies to gradient functions (as needed).
 
 Status AbsGrad(const Scope& scope, const Operation& op,
@@ -44,9 +55,11 @@ REGISTER_GRADIENT_OP("Neg", NegGrad);
 Status InvGrad(const Scope& scope, const Operation& op,
                const std::vector<Output>& grad_inputs,
                std::vector<Output>* grad_outputs) {
-  // dx = dy * (-1 * (y * y))
+  // dy/dx = -1/x^2 = -y^2
+  auto dydx = Neg(scope, Square(scope, op.output(0)));
+  // grad(x) = grad(y) * conj(dy/dx)
   grad_outputs->push_back(
-      Mul(scope, grad_inputs[0], Neg(scope, Square(scope, op.output(0)))));
+      Mul(scope, grad_inputs[0], ConjugateHelper(scope, dydx)));
   return scope.status();
 }
 REGISTER_GRADIENT_OP("Inv", InvGrad);
@@ -55,10 +68,12 @@ REGISTER_GRADIENT_OP("Reciprocal", InvGrad);
 Status SquareGrad(const Scope& scope, const Operation& op,
                   const std::vector<Output>& grad_inputs,
                   std::vector<Output>* grad_outputs) {
-  // dx = dy * (2 * x)
+  // dy/dx = (2 * x)
   auto two = Cast(scope, Const(scope, 2), op.input(0).type());
+  auto dydx = Mul(scope, two, op.input(0));
+  // grad(x) = grad(y) * conj(dy/dx)
   grad_outputs->push_back(
-      Mul(scope, grad_inputs[0], Mul(scope, two, op.input(0))));
+      Mul(scope, grad_inputs[0], ConjugateHelper(scope, dydx)));
   return scope.status();
 }
 REGISTER_GRADIENT_OP("Square", SquareGrad);
@@ -68,11 +83,12 @@ Status SqrtGrad(const Scope& scope, const Operation& op,
                 std::vector<Output>* grad_outputs) {
   // y = sqrt(x)
   // dy/dx =  0.5 * (1 / sqrt(x)) = 0.5 * (1 / y)
-  // dx = dy * (0.5 * (1 / y))
   auto y_inv = Reciprocal(scope, op.output(0));
   auto half = Cast(scope, Const(scope, 0.5), op.input(0).type());
-  auto dx = Mul(scope, grad_inputs[0], Mul(scope, half, y_inv));
-  grad_outputs->push_back(dx);
+  auto dydx = Mul(scope, half, y_inv);
+  // grad(x) = grad(y) * conj(dy/dx)
+  grad_outputs->push_back(
+      Mul(scope, grad_inputs[0], ConjugateHelper(scope, dydx)));
   return scope.status();
 }
 REGISTER_GRADIENT_OP("Sqrt", SqrtGrad);
@@ -82,14 +98,14 @@ Status RsqrtGrad(const Scope& scope, const Operation& op,
                  std::vector<Output>* grad_outputs) {
   // y = 1/x^1/2 = x^-1/2
   // dy/dx = -1/2 * x^-3/2 = -1/2 * x^-1/2 * x^-1 = -1/2 * y * x^-1
-  // dx = dy * (-1/2 * y * x^-1)
   auto x_inv = Reciprocal(scope, op.input(0));
   auto y = op.output(0);
   auto neghalf = Cast(scope, Const(scope, -0.5), op.input(0).type());
   auto a = Mul(scope, neghalf, x_inv);
-  auto b = Mul(scope, a, y);
-  auto dx = Mul(scope, grad_inputs[0], b);
-  grad_outputs->push_back(dx);
+  auto dydx = Mul(scope, a, y);
+  // grad(x) = grad(y) * conj(dy/dx)
+  grad_outputs->push_back(
+      Mul(scope, grad_inputs[0], ConjugateHelper(scope, dydx)));
   return scope.status();
 }
 REGISTER_GRADIENT_OP("Rsqrt", RsqrtGrad);
@@ -97,10 +113,11 @@ REGISTER_GRADIENT_OP("Rsqrt", RsqrtGrad);
 Status ExpGrad(const Scope& scope, const Operation& op,
                const std::vector<Output>& grad_inputs,
                std::vector<Output>* grad_outputs) {
-  // y = exp(x)
-  // dy/dx = exp(x)
-  // dx = dy * y
-  grad_outputs->push_back(Mul(scope, grad_inputs[0], op.output(0)));
+  // dy/dx = exp(x) = y
+  // grad(x) = grad(y) * conj(dy/dx)
+  //         = grad(y) * conj(y)
+  grad_outputs->push_back(
+      Mul(scope, grad_inputs[0], ConjugateHelper(scope, op.output(0))));
   return scope.status();
 }
 REGISTER_GRADIENT_OP("Exp", ExpGrad);
@@ -108,10 +125,12 @@ REGISTER_GRADIENT_OP("Exp", ExpGrad);
 Status Expm1Grad(const Scope& scope, const Operation& op,
                  const std::vector<Output>& grad_inputs,
                  std::vector<Output>* grad_outputs) {
-  // f(x) = expm1(x)
-  // df/dx = exp(x)
-  // dx = dy * exp(x)
-  grad_outputs->push_back(Mul(scope, grad_inputs[0], Exp(scope, op.input(0))));
+  // y = expm1(x)
+  // dy/dx = exp(x)
+  auto dydx = Exp(scope, op.input(0));
+  // grad(x) = grad(y) * conj(dy/dx)
+  grad_outputs->push_back(
+      Mul(scope, grad_inputs[0], ConjugateHelper(scope, dydx)));
   return scope.status();
 }
 REGISTER_GRADIENT_OP("Expm1", Expm1Grad);
@@ -119,11 +138,12 @@ REGISTER_GRADIENT_OP("Expm1", Expm1Grad);
 Status LogGrad(const Scope& scope, const Operation& op,
                const std::vector<Output>& grad_inputs,
                std::vector<Output>* grad_outputs) {
-  // f(x) = log(x) = y
-  // df/dx = 1 / x
-  // dx = dy * (1 / x)
+  // y = log(x)
+  // dy/dx = 1 / x
+  auto dydx = Reciprocal(scope, op.input(0));
+  // grad(x) = grad(y) * conj(dy/dx)
   grad_outputs->push_back(
-      Mul(scope, grad_inputs[0], Reciprocal(scope, op.input(0))));
+      Mul(scope, grad_inputs[0], ConjugateHelper(scope, dydx)));
   return scope.status();
 }
 REGISTER_GRADIENT_OP("Log", LogGrad);
@@ -131,12 +151,13 @@ REGISTER_GRADIENT_OP("Log", LogGrad);
 Status Log1pGrad(const Scope& scope, const Operation& op,
                  const std::vector<Output>& grad_inputs,
                  std::vector<Output>* grad_outputs) {
-  // f(x) = log1p(x) = y
-  // df/dx = 1 / (1 + x)
-  // dx = dy * (1 / (1 + x))
+  // y = log1p(x)
+  // dy/dx = 1 / (1 + x)
   auto one = Cast(scope, Const(scope, 1.0), op.input(0).type());
+  auto dydx = Reciprocal(scope, Add(scope, one, op.input(0)));
+  // grad(x) = grad(y) * conj(dy/dx)
   grad_outputs->push_back(
-      Div(scope, grad_inputs[0], Add(scope, one, op.input(0))));
+      Mul(scope, grad_inputs[0], ConjugateHelper(scope, dydx)));
   return scope.status();
 }
 REGISTER_GRADIENT_OP("Log1p", Log1pGrad);
@@ -146,11 +167,12 @@ Status TanhGrad(const Scope& scope, const Operation& op,
                 std::vector<Output>* grad_outputs) {
   // y = tanh(x)
   // dy/dx = 1 - (tanh(x))^2 = 1 - y^2
-  // dx = dy * (1 - y^2)
   auto y2 = Square(scope, op.output(0));
   auto one = Cast(scope, Const(scope, 1.0), op.input(0).type());
-  auto dx = Mul(scope, grad_inputs[0], Sub(scope, one, y2));
-  grad_outputs->push_back(dx);
+  auto dydx = Sub(scope, one, y2);
+  // grad(x) = grad(y) * conj(dy/dx)
+  grad_outputs->push_back(
+      Mul(scope, grad_inputs[0], ConjugateHelper(scope, dydx)));
   return scope.status();
 }
 REGISTER_GRADIENT_OP("Tanh", TanhGrad);
@@ -160,11 +182,13 @@ Status SigmoidGrad(const Scope& scope, const Operation& op,
                    std::vector<Output>* grad_outputs) {
   // y = 1 / (1 + exp(-x))
   // dy/dx = y * (1 - y)
-  // dx = dy * y * (1 - y)
   auto y = op.output(0);
   auto one = Cast(scope, Const(scope, 1.0), op.input(0).type());
-  auto dx = Mul(scope, grad_inputs[0], Mul(scope, y, Sub(scope, one, y)));
-  grad_outputs->push_back(dx);
+  auto dydx = Mul(scope, y, Sub(scope, one, y));
+  // dx = dy * y * (1 - y)
+  // grad(x) = grad(y) * conj(dy/dx)
+  grad_outputs->push_back(
+      Mul(scope, grad_inputs[0], ConjugateHelper(scope, dydx)));
   return scope.status();
 }
 REGISTER_GRADIENT_OP("Sigmoid", SigmoidGrad);
@@ -185,9 +209,10 @@ Status SinGrad(const Scope& scope, const Operation& op,
                std::vector<Output>* grad_outputs) {
   // y = sin(x)
   // dy/dx = cos(x)
-  // dx = dy * cos(x)
-  auto dx = Mul(scope, grad_inputs[0], Cos(scope, op.input(0)));
-  grad_outputs->push_back(dx);
+  auto dydx = Cos(scope, op.input(0));
+  // grad(x) = grad(y) * conj(dy/dx)
+  grad_outputs->push_back(
+      Mul(scope, grad_inputs[0], ConjugateHelper(scope, dydx)));
   return scope.status();
 }
 REGISTER_GRADIENT_OP("Sin", SinGrad);
@@ -197,9 +222,10 @@ Status CosGrad(const Scope& scope, const Operation& op,
                std::vector<Output>* grad_outputs) {
   // y = cos(x)
   // dy/dx = -sin(x)
-  // dx = dy * -sin(x)
-  auto dx = Mul(scope, grad_inputs[0], Neg(scope, Sin(scope, op.input(0))));
-  grad_outputs->push_back(dx);
+  auto dydx = Neg(scope, Sin(scope, op.input(0)));
+  // grad(x) = grad(y) * conj(dy/dx)
+  grad_outputs->push_back(
+      Mul(scope, grad_inputs[0], ConjugateHelper(scope, dydx)));
   return scope.status();
 }
 REGISTER_GRADIENT_OP("Cos", CosGrad);
@@ -208,12 +234,12 @@ Status AsinGrad(const Scope& scope, const Operation& op,
                 const std::vector<Output>& grad_inputs,
                 std::vector<Output>* grad_outputs) {
   // y = asin(x)
-  // dy/dx = 1 / (1 - x * x)^1/2
-  // dx = dy * (1 / (1 - x * x)^1/2)
+  // dy/dx = 1 / sqrt(1 - x^2)
   auto x2 = Square(scope, op.input(0));
   auto one = Cast(scope, Const(scope, 1.0), op.input(0).type());
   auto dydx = Reciprocal(scope, Sqrt(scope, Sub(scope, one, x2)));
-  auto dx = Mul(scope, grad_inputs[0], dydx);
+  // grad(x) = grad(y) * conj(dy/dx)
+  auto dx = Mul(scope, grad_inputs[0], ConjugateHelper(scope, dydx));
   grad_outputs->push_back(dx);
   return scope.status();
 }
@@ -239,9 +265,9 @@ Status TanGrad(const Scope& scope, const Operation& op,
                std::vector<Output>* grad_outputs) {
   // y = tan(x)
   // dy/dx = sec(x)^2 = 1 / cos(x)^2
-  // dx = dy * (1 / cos(x)^2)
   auto dydx = Square(scope, Reciprocal(scope, Cos(scope, op.input(0))));
-  auto dx = Mul(scope, grad_inputs[0], dydx);
+  // grad(x) = grad(y) * conj(dy/dx)
+  auto dx = Mul(scope, grad_inputs[0], ConjugateHelper(scope, dydx));
   grad_outputs->push_back(dx);
   return scope.status();
 }
@@ -324,7 +350,7 @@ Status MatMulGradCommon(const Scope& scope, const Operation& op,
                         const string& attr_adj_x, const string& attr_adj_y,
                         std::vector<Output>* grad_outputs) {
   DataType dtype;
-  TF_RETURN_IF_ERROR(GetNodeAttr(op.output(0).node()->def(), "T", &dtype));
+  TF_RETURN_IF_ERROR(GetNodeAttr(op.output(0).node()->attrs(), "T", &dtype));
   if (dtype == DT_COMPLEX64 || dtype == DT_COMPLEX128) {
     return errors::Unimplemented(
         "MatMul gradient for complex data type is not supported yet.");
@@ -332,8 +358,10 @@ Status MatMulGradCommon(const Scope& scope, const Operation& op,
 
   bool ta;
   bool tb;
-  TF_RETURN_IF_ERROR(GetNodeAttr(op.output(0).node()->def(), attr_adj_x, &ta));
-  TF_RETURN_IF_ERROR(GetNodeAttr(op.output(0).node()->def(), attr_adj_y, &tb));
+  TF_RETURN_IF_ERROR(
+      GetNodeAttr(op.output(0).node()->attrs(), attr_adj_x, &ta));
+  TF_RETURN_IF_ERROR(
+      GetNodeAttr(op.output(0).node()->attrs(), attr_adj_y, &tb));
 
   if (!ta && !tb) {
     return MatMulGradHelper(scope, is_batch, grad_inputs[0], false, op.input(1),
diff --git a/tensorflow/cc/gradients/math_grad_test.cc b/tensorflow/cc/gradients/math_grad_test.cc
index d7278929d4651f17d25670934b15e6da33d6a960..de6baa176936bcda7d0899c3795e1fbd37627058 100644
--- a/tensorflow/cc/gradients/math_grad_test.cc
+++ b/tensorflow/cc/gradients/math_grad_test.cc
@@ -56,23 +56,25 @@ class CWiseUnaryGradTest : public ::testing::Test {
     ATAN
   };
 
-  void TestCWiseGrad(UnaryOpType op_type, std::function<float(int)> x_fn,
-                     std::function<float(float)> dy_fn,
-                     std::function<float(float, float)> dx_fn) {
-    Tensor x(DT_FLOAT, {2, 3, 2});
-    auto x_flat = x.flat<float>();
+  template <typename T>
+  void TestCWiseGrad(UnaryOpType op_type, const std::function<T(int)>& x_fn,
+                     const std::function<T(const T&)>& dy_fn,
+                     const std::function<T(const T&, const T&)>& dx_fn) {
+    DataType dtype = DataTypeToEnum<T>::v();
+    Tensor x(dtype, {2, 3, 2});
+    auto x_flat = x.flat<T>();
     for (int i = 0; i < x_flat.size(); ++i) {
       x_flat(i) = x_fn(i);
     }
 
-    Tensor dy(DT_FLOAT, {2, 3, 2});
-    auto dy_flat = dy.flat<float>();
+    Tensor dy(dtype, {2, 3, 2});
+    auto dy_flat = dy.flat<T>();
     for (int i = 0; i < dy_flat.size(); ++i) {
       dy_flat(i) = dy_fn(x_flat(i));
     }
 
-    Tensor dx(DT_FLOAT, {2, 3, 2});
-    auto dx_flat = dx.flat<float>();
+    Tensor dx(dtype, {2, 3, 2});
+    auto dx_flat = dx.flat<T>();
     for (int i = 0; i < dx_flat.size(); ++i) {
       dx_flat(i) = dx_fn(x_flat(i), dy_flat(i));
     }
@@ -146,7 +148,19 @@ class CWiseUnaryGradTest : public ::testing::Test {
     test::ExpectClose(output, dx);
   }
 
-  float RV(std::vector<float> v) { return v[random::New64() % v.size()]; }
+  float RV(const std::vector<float>& v) {
+    return v[random::New64() % v.size()];
+  }
+
+  complex64 CRV(const std::vector<complex64>& v) {
+    return v[random::New64() % v.size()];
+  }
+
+  complex64 conjugate(const complex64& val) {
+    return complex64(val.real(), -val.imag());
+  }
+
+  const complex64 one_{1.0, 0};
 
   Scope scope_;
 };
@@ -155,14 +169,14 @@ TEST_F(CWiseUnaryGradTest, Abs) {
   auto x_fn = [this](const int i) { return RV({-1, 0, 1}); };
   auto dy_fn = [this](const float x) { return x + RV({-2, 2, -3, 3, -4, 4}); };
   auto dx_fn = [this](const float x, const float dy) { return x * dy; };
-  TestCWiseGrad(ABS, x_fn, dy_fn, dx_fn);
+  TestCWiseGrad<float>(ABS, x_fn, dy_fn, dx_fn);
 }
 
 TEST_F(CWiseUnaryGradTest, Neg) {
   auto x_fn = [this](const int i) { return RV({-1, 0, 1}); };
   auto dy_fn = [this](const float x) { return x + RV({-2, 2, -3, 3, -4, 4}); };
   auto dx_fn = [this](const float x, const float dy) { return -dy; };
-  TestCWiseGrad(NEG, x_fn, dy_fn, dx_fn);
+  TestCWiseGrad<float>(NEG, x_fn, dy_fn, dx_fn);
 }
 
 TEST_F(CWiseUnaryGradTest, Reciprocal) {
@@ -171,14 +185,36 @@ TEST_F(CWiseUnaryGradTest, Reciprocal) {
   auto dx_fn = [this](const float x, const float dy) {
     return -(1 / (x * x)) * dy;
   };
-  TestCWiseGrad(INV, x_fn, dy_fn, dx_fn);
+  TestCWiseGrad<float>(INV, x_fn, dy_fn, dx_fn);
+}
+
+TEST_F(CWiseUnaryGradTest, Reciprocal_Complex) {
+  auto x_fn = [this](const int i) { return CRV({{-1, 0}, {1, 0}, {2, -1}}); };
+  auto dy_fn = [this](const complex64 x) {
+    return x + CRV({{-2, 2}, {-3, 3}, {1, -4}});
+  };
+  auto dx_fn = [this](const complex64 x, const complex64 dy) {
+    return -conjugate(one_ / (x * x)) * dy;
+  };
+  TestCWiseGrad<complex64>(INV, x_fn, dy_fn, dx_fn);
 }
 
 TEST_F(CWiseUnaryGradTest, Square) {
   auto x_fn = [this](const int i) { return RV({0, -1, 1, -2, 2, -3, 3}); };
   auto dy_fn = [this](const float x) { return RV({0, -7, 7, -8, 8, -9, 9}); };
   auto dx_fn = [this](const float x, const float dy) { return 2 * x * dy; };
-  TestCWiseGrad(SQUARE, x_fn, dy_fn, dx_fn);
+  TestCWiseGrad<float>(SQUARE, x_fn, dy_fn, dx_fn);
+}
+
+TEST_F(CWiseUnaryGradTest, Square_Complex) {
+  auto x_fn = [this](const int i) { return CRV({{-1, 0}, {1, 0}, {2, -1}}); };
+  auto dy_fn = [this](const complex64& x) {
+    return x + CRV({{-2, 2}, {-3, 3}, {1, -4}});
+  };
+  auto dx_fn = [this](const complex64& x, const complex64& dy) {
+    return conjugate(complex64(2, 0) * x) * dy;
+  };
+  TestCWiseGrad<complex64>(SQUARE, x_fn, dy_fn, dx_fn);
 }
 
 TEST_F(CWiseUnaryGradTest, Sqrt) {
@@ -187,7 +223,18 @@ TEST_F(CWiseUnaryGradTest, Sqrt) {
   auto dx_fn = [this](const float x, const float dy) {
     return dy * 0.5 * (1.0 / std::sqrt(x));
   };
-  TestCWiseGrad(SQRT, x_fn, dy_fn, dx_fn);
+  TestCWiseGrad<float>(SQRT, x_fn, dy_fn, dx_fn);
+}
+
+TEST_F(CWiseUnaryGradTest, Sqrt_Complex) {
+  auto x_fn = [this](const int i) { return CRV({{-1, 0}, {1, 0}, {2, -1}}); };
+  auto dy_fn = [this](const complex64& x) {
+    return x + CRV({{-2, 2}, {-3, 3}, {1, -4}});
+  };
+  auto dx_fn = [this](const complex64& x, const complex64& dy) {
+    return conjugate(complex64(0.5, 0) / std::sqrt(x)) * dy;
+  };
+  TestCWiseGrad<complex64>(SQRT, x_fn, dy_fn, dx_fn);
 }
 
 TEST_F(CWiseUnaryGradTest, Rsqrt) {
@@ -196,7 +243,18 @@ TEST_F(CWiseUnaryGradTest, Rsqrt) {
   auto dx_fn = [this](const float x, const float dy) {
     return dy * -0.5 * (1 / std::sqrt(x)) * (1 / x);
   };
-  TestCWiseGrad(RSQRT, x_fn, dy_fn, dx_fn);
+  TestCWiseGrad<float>(RSQRT, x_fn, dy_fn, dx_fn);
+}
+
+TEST_F(CWiseUnaryGradTest, Rsqrt_Complex) {
+  auto x_fn = [this](const int i) { return CRV({{-1, 0}, {1, 0}, {2, -1}}); };
+  auto dy_fn = [this](const complex64& x) {
+    return x + CRV({{-2, 2}, {-3, 3}, {1, -4}});
+  };
+  auto dx_fn = [this](const complex64& x, const complex64& dy) {
+    return conjugate(complex64(-0.5, 0) / std::sqrt(x) / x) * dy;
+  };
+  TestCWiseGrad<complex64>(RSQRT, x_fn, dy_fn, dx_fn);
 }
 
 TEST_F(CWiseUnaryGradTest, Exp) {
@@ -205,7 +263,18 @@ TEST_F(CWiseUnaryGradTest, Exp) {
   auto dx_fn = [this](const float x, const float dy) {
     return dy * std::exp(x);
   };
-  TestCWiseGrad(EXP, x_fn, dy_fn, dx_fn);
+  TestCWiseGrad<float>(EXP, x_fn, dy_fn, dx_fn);
+}
+
+TEST_F(CWiseUnaryGradTest, Exp_Complex) {
+  auto x_fn = [this](const int i) { return CRV({{-1, 0}, {1, 0}, {2, -1}}); };
+  auto dy_fn = [this](const complex64& x) {
+    return x + CRV({{-2, 2}, {-3, 3}, {1, -4}});
+  };
+  auto dx_fn = [this](const complex64& x, const complex64& dy) {
+    return dy * conjugate(std::exp(x));
+  };
+  TestCWiseGrad<complex64>(EXP, x_fn, dy_fn, dx_fn);
 }
 
 TEST_F(CWiseUnaryGradTest, Expm1) {
@@ -214,14 +283,36 @@ TEST_F(CWiseUnaryGradTest, Expm1) {
   auto dx_fn = [this](const float x, const float dy) {
     return dy * std::exp(x);
   };
-  TestCWiseGrad(EXPM1, x_fn, dy_fn, dx_fn);
+  TestCWiseGrad<float>(EXPM1, x_fn, dy_fn, dx_fn);
+}
+
+TEST_F(CWiseUnaryGradTest, Expm1_Complex) {
+  auto x_fn = [this](const int i) { return CRV({{-1, 0}, {1, 0}, {2, -1}}); };
+  auto dy_fn = [this](const complex64& x) {
+    return x + CRV({{-2, 2}, {-3, 3}, {1, -4}});
+  };
+  auto dx_fn = [this](const complex64& x, const complex64& dy) {
+    return dy * conjugate(std::exp(x));
+  };
+  TestCWiseGrad<complex64>(EXPM1, x_fn, dy_fn, dx_fn);
 }
 
 TEST_F(CWiseUnaryGradTest, Log) {
   auto x_fn = [this](const int i) { return RV({-1, 1, -2, 2, -3, 3, -4, 4}); };
   auto dy_fn = [this](const float x) { return x + RV({-2, 2, -3, 3, -4, 4}); };
   auto dx_fn = [this](const float x, const float dy) { return dy * (1.0 / x); };
-  TestCWiseGrad(LOG, x_fn, dy_fn, dx_fn);
+  TestCWiseGrad<float>(LOG, x_fn, dy_fn, dx_fn);
+}
+
+TEST_F(CWiseUnaryGradTest, Log_Complex) {
+  auto x_fn = [this](const int i) { return CRV({{-1, 0}, {1, 0}, {2, -1}}); };
+  auto dy_fn = [this](const complex64& x) {
+    return x + CRV({{-2, 2}, {-3, 3}, {1, -4}});
+  };
+  auto dx_fn = [this](const complex64& x, const complex64& dy) {
+    return dy * conjugate(one_ / x);
+  };
+  TestCWiseGrad<complex64>(LOG, x_fn, dy_fn, dx_fn);
 }
 
 TEST_F(CWiseUnaryGradTest, Log1p) {
@@ -230,7 +321,20 @@ TEST_F(CWiseUnaryGradTest, Log1p) {
   auto dx_fn = [this](const float x, const float dy) {
     return dy * (1.0 / (1.0 + x));
   };
-  TestCWiseGrad(LOG1P, x_fn, dy_fn, dx_fn);
+  TestCWiseGrad<float>(LOG1P, x_fn, dy_fn, dx_fn);
+}
+
+TEST_F(CWiseUnaryGradTest, Log1p_Complex) {
+  auto x_fn = [this](const int i) {
+    return CRV({{0, 0}, {1e-6, 0}, {2, -1}, {1, 2}, {3, 4}});
+  };
+  auto dy_fn = [this](const complex64& x) {
+    return x + CRV({{-2, 2}, {-3, 3}, {1, -4}});
+  };
+  auto dx_fn = [this](const complex64& x, const complex64& dy) {
+    return dy / (one_ + conjugate(x));
+  };
+  TestCWiseGrad<complex64>(LOG1P, x_fn, dy_fn, dx_fn);
 }
 
 TEST_F(CWiseUnaryGradTest, Tanh) {
@@ -240,7 +344,21 @@ TEST_F(CWiseUnaryGradTest, Tanh) {
     const float y = std::tanh(x);
     return dy * (1.0 - y * y);
   };
-  TestCWiseGrad(TANH, x_fn, dy_fn, dx_fn);
+  TestCWiseGrad<float>(TANH, x_fn, dy_fn, dx_fn);
+}
+
+TEST_F(CWiseUnaryGradTest, Tanh_Complex) {
+  auto x_fn = [this](const int i) {
+    return CRV({{1, 0}, {0, 1}, {2, -1}, {1, 2}, {3, 4}});
+  };
+  auto dy_fn = [this](const complex64& x) {
+    return x + CRV({{-2, 2}, {-3, 3}, {1, -4}});
+  };
+  auto dx_fn = [this](const complex64& x, const complex64& dy) {
+    const complex64 y = std::tanh(x);
+    return dy * conjugate((one_ - y * y));
+  };
+  TestCWiseGrad<complex64>(TANH, x_fn, dy_fn, dx_fn);
 }
 
 TEST_F(CWiseUnaryGradTest, Sigmoid) {
@@ -250,14 +368,28 @@ TEST_F(CWiseUnaryGradTest, Sigmoid) {
     const float y = 1.0 / (1.0 + std::exp(-x));
     return dy * y * (1.0 - y);
   };
-  TestCWiseGrad(SIGMOID, x_fn, dy_fn, dx_fn);
+  TestCWiseGrad<float>(SIGMOID, x_fn, dy_fn, dx_fn);
+}
+
+TEST_F(CWiseUnaryGradTest, Sigmoid_Complex) {
+  auto x_fn = [this](const int i) {
+    return CRV({{1, 0}, {0, 0}, {2, -1}, {1, 2}, {3, 4}});
+  };
+  auto dy_fn = [this](const complex64& x) {
+    return x + CRV({{-2, 2}, {-3, 3}, {1, -4}});
+  };
+  auto dx_fn = [this](const complex64& x, const complex64& dy) {
+    const complex64 y = one_ / (one_ + std::exp(-x));
+    return dy * conjugate(y * (one_ - y));
+  };
+  TestCWiseGrad<complex64>(SIGMOID, x_fn, dy_fn, dx_fn);
 }
 
 TEST_F(CWiseUnaryGradTest, Sign) {
   auto x_fn = [this](const int i) { return RV({0, -1, 1, -2, 2, -3, 3}); };
   auto dy_fn = [this](const float x) { return x + RV({-2, 2, -3, 3, -4, 4}); };
   auto dx_fn = [this](const float x, const float dy) { return 0.0; };
-  TestCWiseGrad(SIGN, x_fn, dy_fn, dx_fn);
+  TestCWiseGrad<float>(SIGN, x_fn, dy_fn, dx_fn);
 }
 
 TEST_F(CWiseUnaryGradTest, Sin) {
@@ -266,7 +398,20 @@ TEST_F(CWiseUnaryGradTest, Sin) {
   auto dx_fn = [this](const float x, const float dy) {
     return dy * std::cos(x);
   };
-  TestCWiseGrad(SIN, x_fn, dy_fn, dx_fn);
+  TestCWiseGrad<float>(SIN, x_fn, dy_fn, dx_fn);
+}
+
+TEST_F(CWiseUnaryGradTest, Sin_Complex) {
+  auto x_fn = [this](const int i) {
+    return CRV({{1, 0}, {0, 1}, {2, -1}, {1, 2}, {3, 4}});
+  };
+  auto dy_fn = [this](const complex64& x) {
+    return x + CRV({{-2, 2}, {-3, 3}, {1, -4}});
+  };
+  auto dx_fn = [this](const complex64& x, const complex64& dy) {
+    return dy * conjugate(std::cos(x));
+  };
+  TestCWiseGrad<complex64>(SIN, x_fn, dy_fn, dx_fn);
 }
 
 TEST_F(CWiseUnaryGradTest, Cos) {
@@ -275,7 +420,20 @@ TEST_F(CWiseUnaryGradTest, Cos) {
   auto dx_fn = [this](const float x, const float dy) {
     return dy * -1.0 * std::sin(x);
   };
-  TestCWiseGrad(COS, x_fn, dy_fn, dx_fn);
+  TestCWiseGrad<float>(COS, x_fn, dy_fn, dx_fn);
+}
+
+TEST_F(CWiseUnaryGradTest, Cos_Complex) {
+  auto x_fn = [this](const int i) {
+    return CRV({{1, 0}, {0, 1}, {2, -1}, {1, 2}, {3, 4}});
+  };
+  auto dy_fn = [this](const complex64& x) {
+    return x + CRV({{-2, 2}, {-3, 3}, {1, -4}});
+  };
+  auto dx_fn = [this](const complex64& x, const complex64& dy) {
+    return dy * conjugate(-std::sin(x));
+  };
+  TestCWiseGrad<complex64>(COS, x_fn, dy_fn, dx_fn);
 }
 
 TEST_F(CWiseUnaryGradTest, Asin) {
@@ -284,7 +442,24 @@ TEST_F(CWiseUnaryGradTest, Asin) {
   auto dx_fn = [this](const float x, const float dy) {
     return dy * (1.0 / std::sqrt(1.0 - x * x));
   };
-  TestCWiseGrad(ASIN, x_fn, dy_fn, dx_fn);
+  TestCWiseGrad<float>(ASIN, x_fn, dy_fn, dx_fn);
+}
+
+TEST_F(CWiseUnaryGradTest, Asin_Complex) {
+  auto x_fn = [this](const int i) {
+    return CRV({{1, 0}, {0, 1}, {2, -1}, {1, 2}, {3, 4}});
+  };
+  auto dy_fn = [this](const complex64& x) {
+    return x + CRV({{-2, 2}, {-3, 3}, {1, -4}});
+  };
+  auto dx_fn = [this](const complex64& x, const complex64& dy) {
+    return dy / conjugate(std::sqrt(one_ - x * x));
+  };
+  // TODO(kbsriram)
+  // Enable test when the asin kernel supports complex numbers
+  if (false) {
+    TestCWiseGrad<complex64>(ASIN, x_fn, dy_fn, dx_fn);
+  }
 }
 
 TEST_F(CWiseUnaryGradTest, Acos) {
@@ -293,7 +468,24 @@ TEST_F(CWiseUnaryGradTest, Acos) {
   auto dx_fn = [this](const float x, const float dy) {
     return dy * (-1.0 / std::sqrt(1.0 - x * x));
   };
-  TestCWiseGrad(ACOS, x_fn, dy_fn, dx_fn);
+  TestCWiseGrad<float>(ACOS, x_fn, dy_fn, dx_fn);
+}
+
+TEST_F(CWiseUnaryGradTest, Acos_Complex) {
+  auto x_fn = [this](const int i) {
+    return CRV({{1, 0}, {0, 1}, {2, -1}, {1, 2}, {3, 4}});
+  };
+  auto dy_fn = [this](const complex64& x) {
+    return x + CRV({{-2, 2}, {-3, 3}, {1, -4}});
+  };
+  auto dx_fn = [this](const complex64& x, const complex64& dy) {
+    return dy / -conjugate(std::sqrt(one_ - x * x));
+  };
+  // TODO(kbsriram)
+  // Add test when the acos kernel supports complex numbers
+  if (false) {
+    TestCWiseGrad<complex64>(ACOS, x_fn, dy_fn, dx_fn);
+  }
 }
 
 TEST_F(CWiseUnaryGradTest, Tan) {
@@ -303,7 +495,25 @@ TEST_F(CWiseUnaryGradTest, Tan) {
     const float cosx = std::cos(x);
     return dy * (1 / (cosx * cosx));
   };
-  TestCWiseGrad(TAN, x_fn, dy_fn, dx_fn);
+  TestCWiseGrad<float>(TAN, x_fn, dy_fn, dx_fn);
+}
+
+TEST_F(CWiseUnaryGradTest, Tan_Complex) {
+  auto x_fn = [this](const int i) {
+    return CRV({{1, 0}, {0, 1}, {2, -1}, {1, 2}, {3, 4}});
+  };
+  auto dy_fn = [this](const complex64& x) {
+    return x + CRV({{-2, 2}, {-3, 3}, {1, -4}});
+  };
+  auto dx_fn = [this](const complex64& x, const complex64& dy) {
+    const complex64 cosx = std::cos(x);
+    return dy / conjugate(cosx * cosx);
+  };
+  // TODO(kbsriram)
+  // Enable when tan kernel supports complex inputs
+  if (false) {
+    TestCWiseGrad<complex64>(TAN, x_fn, dy_fn, dx_fn);
+  }
 }
 
 TEST_F(CWiseUnaryGradTest, Atan) {
@@ -312,7 +522,24 @@ TEST_F(CWiseUnaryGradTest, Atan) {
   auto dx_fn = [this](const float x, const float dy) {
     return dy * (1 / (1 + x * x));
   };
-  TestCWiseGrad(ATAN, x_fn, dy_fn, dx_fn);
+  TestCWiseGrad<float>(ATAN, x_fn, dy_fn, dx_fn);
+}
+
+TEST_F(CWiseUnaryGradTest, Atan_Complex) {
+  auto x_fn = [this](const int i) {
+    return CRV({{1, 0}, {0, 1}, {2, -1}, {1, 2}, {3, 4}});
+  };
+  auto dy_fn = [this](const complex64& x) {
+    return x + CRV({{-2, 2}, {-3, 3}, {1, -4}});
+  };
+  auto dx_fn = [this](const complex64& x, const complex64& dy) {
+    return dy / (one_ + x * x);
+  };
+  // TODO(kbsriram)
+  // Add test when the atan kernel supports complex numbers
+  if (false) {
+    TestCWiseGrad<complex64>(ATAN, x_fn, dy_fn, dx_fn);
+  }
 }
 
 class CWiseUnaryComplexGradTest : public ::testing::Test {
diff --git a/tensorflow/cc/ops/const_op_test.cc b/tensorflow/cc/ops/const_op_test.cc
index 5a4770f879ff9a1422a63a88bd2b67ba201a0567..3184edeb3307cafcbfbc41c6477fd092ab613b46 100644
--- a/tensorflow/cc/ops/const_op_test.cc
+++ b/tensorflow/cc/ops/const_op_test.cc
@@ -28,9 +28,9 @@ void ExpectNodeEqual(const Node* n, gtl::ArraySlice<T> values,
                      TensorShape shape) {
   EXPECT_TRUE(n->IsConstant());
   Tensor tensor;
-  TF_EXPECT_OK(GetNodeAttr(n->def(), "value", &tensor));
+  TF_EXPECT_OK(GetNodeAttr(n->attrs(), "value", &tensor));
   DataType dtype;
-  TF_EXPECT_OK(GetNodeAttr(n->def(), "dtype", &dtype));
+  TF_EXPECT_OK(GetNodeAttr(n->attrs(), "dtype", &dtype));
   EXPECT_EQ(tensor.dtype(), dtype);
   test::ExpectTensorEqual<T>(tensor, test::AsTensor(values, shape));
 }
@@ -39,9 +39,9 @@ void ExpectTypeAndShape(const Node* n, DataType expected_dtype,
                         TensorShape expected_shape) {
   EXPECT_TRUE(n->IsConstant());
   Tensor tensor;
-  TF_EXPECT_OK(GetNodeAttr(n->def(), "value", &tensor));
+  TF_EXPECT_OK(GetNodeAttr(n->attrs(), "value", &tensor));
   DataType dtype;
-  TF_EXPECT_OK(GetNodeAttr(n->def(), "dtype", &dtype));
+  TF_EXPECT_OK(GetNodeAttr(n->attrs(), "dtype", &dtype));
   EXPECT_EQ(dtype, expected_dtype);
   EXPECT_EQ(expected_shape, TensorShape(tensor.shape()));
 }
diff --git a/tensorflow/cc/ops/op_gen_overrides.pbtxt b/tensorflow/cc/ops/op_gen_overrides.pbtxt
index cd94ddf4a1b67d3b98da7769db95bbda294e76db..1dffb10c03379571907e921c1add98d1f11625c3 100644
--- a/tensorflow/cc/ops/op_gen_overrides.pbtxt
+++ b/tensorflow/cc/ops/op_gen_overrides.pbtxt
@@ -22,7 +22,7 @@ op { name: "Where" input_rename: { from: "input" to: "condition" } }
 op { name: "ThreadUnsafeUnigramCandidateSampler", skip: true }
 
 # control_flow_ops
-# TODO(josh11b): Hide Switch and Merge once we write and migrate users to
+# TODO(joshl): Hide Switch and Merge once we write and migrate users to
 # a Cond() API.
 #op { name: "Switch" hide: true }
 #op { name: "Merge" hide: true }
diff --git a/tensorflow/cc/saved_model/loader.cc b/tensorflow/cc/saved_model/loader.cc
index b144bfc33e46c3db192cfb1e3ef8a0633e9fa519..908aa01a3470b67233c61d150ea955c1c13a8cd3 100644
--- a/tensorflow/cc/saved_model/loader.cc
+++ b/tensorflow/cc/saved_model/loader.cc
@@ -36,7 +36,7 @@ auto* load_attempt_count = monitoring::Counter<2>::New(
     "status");
 auto* load_latency = monitoring::Counter<1>::New(
     "/tensorflow/cc/saved_model/load_latency",
-    "Latency in microseconds for SavedModels that were succesfully loaded.",
+    "Latency in microseconds for SavedModels that were successfully loaded.",
     "model_path");
 constexpr char kLoadAttemptFail[] = "fail";
 constexpr char kLoadAttemptSuccess[] = "success";
diff --git a/tensorflow/compiler/aot/BUILD b/tensorflow/compiler/aot/BUILD
index c52a56b6428fb8a8415ed53477ba3e81c57b0ded..c12005a4cab903c15a4f95efa0fdc3b8b2563942 100644
--- a/tensorflow/compiler/aot/BUILD
+++ b/tensorflow/compiler/aot/BUILD
@@ -73,7 +73,7 @@ cc_library(
         "//tensorflow/compiler/xla:util",
         "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/compiler/xla/client:client_library",
-        "//tensorflow/compiler/xla/client:local_client",
+        "//tensorflow/compiler/xla/client:compile_only_client",
         "//tensorflow/compiler/xla/service:compiler",
         "//tensorflow/compiler/xla/service/cpu:cpu_compiler",
         "//tensorflow/core:core_cpu",
diff --git a/tensorflow/compiler/aot/codegen.cc b/tensorflow/compiler/aot/codegen.cc
index 042a72745a78c4a11b22c85e3a094d78c4ab2ed5..bbdb342a623f5d4435e437fbb94e282b685751c9 100644
--- a/tensorflow/compiler/aot/codegen.cc
+++ b/tensorflow/compiler/aot/codegen.cc
@@ -152,8 +152,7 @@ Status AddRewritesForShape(int i, const xla::Shape& shape,
 string RewriteWithName(const string& name, string code,
                        const std::vector<std::pair<string, string>>& rewrites) {
   str_util::ReplaceAllPairs(&code, rewrites);
-  str_util::ReplaceAll(&code, "{{NAME}}", name);
-  return code;
+  return str_util::StringReplace(code, "{{NAME}}", name, /*replace_all=*/true);
 }
 
 // Generate methods for args (inputs).
@@ -366,7 +365,7 @@ Status GenerateHeader(const HeaderOpts& opts, const Config& config,
 #include "tensorflow/core/platform/macros.h"
 #include "tensorflow/core/platform/types.h"
 
-namespace Eigen { class ThreadPoolDevice; }
+namespace Eigen { struct ThreadPoolDevice; }
 
 // (Implementation detail) Entry point to the function in the object file.
 extern "C" void {{ENTRY}}(
diff --git a/tensorflow/compiler/aot/codegen_test_h.golden b/tensorflow/compiler/aot/codegen_test_h.golden
index 46d7c03006a1344df17fc99c8b837f31ee86feb9..01963c6df4682ec8c23a93201d7fbbab63558060 100644
--- a/tensorflow/compiler/aot/codegen_test_h.golden
+++ b/tensorflow/compiler/aot/codegen_test_h.golden
@@ -15,7 +15,7 @@
 #include "tensorflow/core/platform/macros.h"
 #include "tensorflow/core/platform/types.h"
 
-namespace Eigen { class ThreadPoolDevice; }
+namespace Eigen { struct ThreadPoolDevice; }
 
 // (Implementation detail) Entry point to the function in the object file.
 extern "C" void entry_point(
diff --git a/tensorflow/compiler/aot/compile.cc b/tensorflow/compiler/aot/compile.cc
index 4b5534c164887ed0f3656808d8d328bb7b4f5975..0c7b97b01f43ea255ed4b7773ab5268396e7c306 100644
--- a/tensorflow/compiler/aot/compile.cc
+++ b/tensorflow/compiler/aot/compile.cc
@@ -27,7 +27,7 @@ limitations under the License.
 #include "tensorflow/compiler/tf2xla/xla_compiler.h"
 #include "tensorflow/compiler/tf2xla/xla_op_registry.h"
 #include "tensorflow/compiler/xla/client/client_library.h"
-#include "tensorflow/compiler/xla/client/local_client.h"
+#include "tensorflow/compiler/xla/client/compile_only_client.h"
 #include "tensorflow/compiler/xla/service/compiler.h"
 #include "tensorflow/compiler/xla/service/cpu/cpu_compiler.h"
 #include "tensorflow/compiler/xla/shape_util.h"
@@ -203,14 +203,14 @@ Status RewriteAndPruneGraph(Graph* graph, const Config& config,
   for (const Node* n : graph->nodes()) {
     if (n->type_string() == kArgOp) {
       string feed_id;
-      TF_RETURN_IF_ERROR(GetNodeAttr(n->def(), kFeedIdAttr, &feed_id));
+      TF_RETURN_IF_ERROR(GetNodeAttr(n->attrs(), kFeedIdAttr, &feed_id));
       if (missing_feeds.erase(feed_id) == 0) {
         return errors::Aborted(kArgOp,
                                " node found with unknown feed id: ", feed_id);
       }
     } else if (n->type_string() == kRetvalOp) {
       string fetch_id;
-      TF_RETURN_IF_ERROR(GetNodeAttr(n->def(), kFetchIdAttr, &fetch_id));
+      TF_RETURN_IF_ERROR(GetNodeAttr(n->attrs(), kFetchIdAttr, &fetch_id));
       if (missing_fetches.erase(fetch_id) == 0) {
         return errors::Aborted(kRetvalOp,
                                " node found with unknown fetch id: ", fetch_id);
@@ -234,7 +234,7 @@ Status CollectArgNodes(const Graph& graph, std::vector<Node*>* arg_nodes) {
   for (Node* n : graph.nodes()) {
     if (n->type_string() == kArgOp) {
       int index;
-      TF_RETURN_IF_ERROR(GetNodeAttr(n->def(), "index", &index));
+      TF_RETURN_IF_ERROR(GetNodeAttr(n->attrs(), "index", &index));
       auto insert_result = indexed_arg_nodes.insert({index, n});
       if (!insert_result.second) {
         const Node* dup = insert_result.first->second;
@@ -264,9 +264,9 @@ Status CreateXlaArgs(const Graph& graph,
   for (const Node* node : arg_nodes) {
     XlaCompiler::Argument arg;
     arg.kind = XlaCompiler::Argument::kParameter;
-    TF_RETURN_IF_ERROR(GetNodeAttr(node->def(), "T", &arg.type));
-    TF_RETURN_IF_ERROR(GetNodeAttr(node->def(), kShapeAttr, &arg.shape));
-    TF_RETURN_IF_ERROR(GetNodeAttr(node->def(), kDebugNameAttr, &arg.name));
+    TF_RETURN_IF_ERROR(GetNodeAttr(node->attrs(), "T", &arg.type));
+    TF_RETURN_IF_ERROR(GetNodeAttr(node->attrs(), kShapeAttr, &arg.shape));
+    TF_RETURN_IF_ERROR(GetNodeAttr(node->attrs(), kDebugNameAttr, &arg.name));
     xla_args->push_back(arg);
   }
   return Status::OK();
@@ -274,7 +274,8 @@ Status CreateXlaArgs(const Graph& graph,
 
 // Converts the TensorFlow graph into an XLA computation, by executing the
 // graph symbolically, with each op building up the XLA HLO.
-Status ConvertGraphToXla(xla::LocalClient* client, std::unique_ptr<Graph> graph,
+Status ConvertGraphToXla(xla::CompileOnlyClient* client,
+                         std::unique_ptr<Graph> graph,
                          xla::Computation* computation, bool* has_context_arg) {
   // Create a device and context to convert the graph into an XLA computation.
   XlaOpRegistry::RegisterCompilationKernels();
@@ -288,18 +289,19 @@ Status ConvertGraphToXla(xla::LocalClient* client, std::unique_ptr<Graph> graph,
   // Compile the graph into an XLA computation.
   XlaCompiler::Options compiler_options;
   compiler_options.client = client;
-  compiler_options.device_type = DeviceType(DEVICE_CPU_XLA_JIT);
+  DeviceType device_type(DEVICE_CPU_XLA_JIT);
+  compiler_options.device_type = &device_type;
+  compiler_options.flib_def = &graph->flib_def();
+  compiler_options.graph_def_version = graph->versions().producer();
   compiler_options.allow_cpu_custom_calls = true;
   XlaCompiler compiler(compiler_options);
 
-  std::unique_ptr<FunctionLibraryRuntime> flib_run(NewFunctionLibraryRuntime(
-      compiler.device_mgr(), Env::Default(), compiler.device(),
-      graph->versions().producer(), &graph->flib_def(), OptimizerOptions()));
   XlaCompiler::CompilationResult result;
-  TF_RETURN_IF_ERROR(compiler.CompileGraph("tfcompile", std::move(graph),
-                                           flib_run.get(), xla_args, &result));
+  TF_RETURN_IF_ERROR(compiler.CompileGraph(XlaCompiler::CompileOptions(),
+                                           "tfcompile", std::move(graph),
+                                           xla_args, &result));
   *has_context_arg = result.requires_runtime_context;
-  *computation = std::move(result.computation);
+  *computation = std::move(*result.computation);
 
   int num_const_results = 0;
   for (int i = 0; i < result.outputs.size(); ++i) {
@@ -333,7 +335,8 @@ Status ConvertGraphToXla(xla::LocalClient* client, std::unique_ptr<Graph> graph,
 }
 
 // Compiles the XLA computation into executable code.
-Status CompileXla(xla::LocalClient* client, const xla::Computation& computation,
+Status CompileXla(xla::CompileOnlyClient* client,
+                  const xla::Computation& computation,
                   const xla::cpu::CpuAotCompilationOptions& aot_opts,
                   CompileResult* compile_result) {
   // Retrieves arg and result layouts from the computation.
@@ -350,7 +353,7 @@ Status CompileXla(xla::LocalClient* client, const xla::Computation& computation,
   for (int i = 0; i < pshape->parameters_size(); ++i) {
     arg_layouts.push_back(pshape->mutable_parameters(i));
   }
-  xla::LocalClient::AheadOfTimeComputationInstance instance;
+  xla::CompileOnlyClient::AotComputationInstance instance;
   instance.computation = &computation;
   instance.argument_layouts = std::move(arg_layouts);
   instance.result_layout = &pshape->result();
@@ -365,7 +368,7 @@ Status CompileXla(xla::LocalClient* client, const xla::Computation& computation,
           std::move(aot_or.ValueOrDie().back()));
   compile_result->entry_point = aot_opts.entry_point_name();
   compile_result->pointer_size =
-      xla::LocalClient::PointerSizeForTriple(aot_opts.triple());
+      xla::CompileOnlyClient::PointerSizeForTriple(aot_opts.triple());
   return Status::OK();
 }
 
@@ -394,8 +397,9 @@ Status CompileGraph(std::unique_ptr<Graph> graph, const MainFlags& flags,
   namespace gpu = perftools::gputools;
   gpu::Platform* cpu_platform =
       gpu::MultiPlatformManager::PlatformWithName("Host").ValueOrDie();
-  xla::LocalClient* client =
-      xla::ClientLibrary::GetOrCreateLocalClient(cpu_platform).ValueOrDie();
+  xla::CompileOnlyClient* client =
+      xla::ClientLibrary::GetOrCreateCompileOnlyClient(cpu_platform)
+          .ValueOrDie();
   xla::Computation computation;
   TF_RETURN_IF_ERROR(ConvertGraphToXla(client, std::move(graph), &computation,
                                        &compile_result->has_context_arg));
diff --git a/tensorflow/compiler/aot/runtime.cc b/tensorflow/compiler/aot/runtime.cc
index 208de5498dbee6773683ac1aa2b33400a8a21f35..5772776666129ed55a479c8917e69df3f3ce2fc0 100644
--- a/tensorflow/compiler/aot/runtime.cc
+++ b/tensorflow/compiler/aot/runtime.cc
@@ -31,6 +31,8 @@ namespace {
 inline void* aligned_malloc(size_t size, int minimum_alignment) {
 #if defined(__ANDROID__) || defined(OS_ANDROID) || defined(OS_CYGWIN)
   return memalign(minimum_alignment, size);
+#elif defined(COMPILER_MSVC)
+  return _aligned_malloc(size, minimum_alignment);
 #else  // !__ANDROID__ && !OS_ANDROID && !OS_CYGWIN
   void* ptr = nullptr;
   // posix_memalign requires that the requested alignment be at least
@@ -45,7 +47,13 @@ inline void* aligned_malloc(size_t size, int minimum_alignment) {
 #endif
 }
 
-inline void aligned_free(void* aligned_memory) { free(aligned_memory); }
+inline void aligned_free(void* aligned_memory) {
+#if defined(COMPILER_MSVC)
+  _aligned_free(aligned_memory);
+#else
+  free(aligned_memory);
+#endif
+}
 
 size_t align_to(size_t n, size_t align) {
   return (((n - 1) / align) + 1) * align;
diff --git a/tensorflow/compiler/aot/tfcompile_util_test.cc b/tensorflow/compiler/aot/tfcompile_util_test.cc
index 108ab1eab7bf3b087e8049c5b24d652d871789c8..c321d3ff4c779fbd2e9c67dfc1eb24c734a9103f 100644
--- a/tensorflow/compiler/aot/tfcompile_util_test.cc
+++ b/tensorflow/compiler/aot/tfcompile_util_test.cc
@@ -24,7 +24,7 @@ namespace tensorflow {
 namespace tfcompile {
 namespace {
 
-void ExpectErrorContains(Status status, StringPiece str) {
+void ExpectErrorContains(const Status& status, StringPiece str) {
   EXPECT_NE(Status::OK(), status);
   EXPECT_TRUE(StringPiece(status.error_message()).contains(str))
       << "expected error: " << status.error_message() << " to contain: " << str;
diff --git a/tensorflow/compiler/jit/BUILD b/tensorflow/compiler/jit/BUILD
index e56f173d518232791d0f490a48bd40e8f14d6cfe..9b4e872ebe561c0d919b1982339896c12bc079f9 100644
--- a/tensorflow/compiler/jit/BUILD
+++ b/tensorflow/compiler/jit/BUILD
@@ -19,6 +19,7 @@ package(
 )
 
 load("//tensorflow:tensorflow.bzl", "tf_kernel_library")
+load("@local_config_cuda//cuda:build_defs.bzl", "if_cuda")
 
 # Target that bundles up the XLA CPU and GPU JIT devices.
 cc_library(
@@ -48,12 +49,12 @@ cc_library(
 cc_library(
     name = "xla_gpu_jit",
     visibility = [":friends"],
-    deps = [
+    deps = if_cuda([
         ":jit_compilation_passes",
         "//tensorflow/compiler/jit/kernels:xla_local_launch_op",
         "//tensorflow/compiler/tf2xla/kernels:xla_ops",
         "//tensorflow/compiler/xla/service:gpu_plugin",
-    ],
+    ]),
     alwayslink = 1,
 )
 
diff --git a/tensorflow/compiler/jit/build_xla_launch_ops_pass.cc b/tensorflow/compiler/jit/build_xla_launch_ops_pass.cc
index abb68f73d7e3870f733c350be0dc99ab21a6b083..48eed7fce07f0855934600890e157b2752d38838 100644
--- a/tensorflow/compiler/jit/build_xla_launch_ops_pass.cc
+++ b/tensorflow/compiler/jit/build_xla_launch_ops_pass.cc
@@ -66,9 +66,9 @@ static Status ReplaceNodeWithXlaLaunch(Graph* graph, Node* node) {
 
   int num_constant_args, num_resource_args;
   TF_RETURN_IF_ERROR(
-      GetNodeAttr(node->def(), kXlaNumConstantArgsAttr, &num_constant_args));
+      GetNodeAttr(node->attrs(), kXlaNumConstantArgsAttr, &num_constant_args));
   TF_RETURN_IF_ERROR(
-      GetNodeAttr(node->def(), kXlaNumResourceArgsAttr, &num_resource_args));
+      GetNodeAttr(node->attrs(), kXlaNumResourceArgsAttr, &num_resource_args));
 
   if (num_constant_args < 0 || num_resource_args < 0 ||
       num_constant_args + num_resource_args > node->num_inputs()) {
@@ -88,7 +88,7 @@ static Status ReplaceNodeWithXlaLaunch(Graph* graph, Node* node) {
   Node* launch_node;
   TF_RETURN_IF_ERROR(BuildLaunchNode(
       graph->NewName(node->name()), node->type_string(), node->def().attr(),
-      node->def().device(), const_dtypes, num_resource_args, arg_dtypes,
+      node->requested_device(), const_dtypes, num_resource_args, arg_dtypes,
       node->output_types(), graph, &launch_node));
   launch_node->set_assigned_device_name(node->assigned_device_name());
 
@@ -173,7 +173,8 @@ Status CreateXlaLaunchOp(FunctionLibraryRuntime* flr, const NodeDef& ndef,
   FunctionLibraryRuntime::Handle handle;
   // If ndef is not instantiable, e.g., the function does not exist,
   // simply bail out.
-  TF_RETURN_IF_ERROR(flr->Instantiate(ndef.op(), ndef.attr(), &handle));
+  TF_RETURN_IF_ERROR(
+      flr->Instantiate(ndef.op(), AttrSlice(&ndef.attr()), &handle));
   const FunctionBody* fbody = flr->GetFunctionBody(handle);
   CHECK(fbody);  // Can't be nullptr since we just instantiated it.
   std::vector<bool> const_args(fbody->arg_types.size());
diff --git a/tensorflow/compiler/jit/encapsulate_subgraphs_pass.cc b/tensorflow/compiler/jit/encapsulate_subgraphs_pass.cc
index 1d2793d3c55f4436a07e4f632887561202d0498e..88ec45f8d86643aa4f7c643ac5bee333fb2ec559 100644
--- a/tensorflow/compiler/jit/encapsulate_subgraphs_pass.cc
+++ b/tensorflow/compiler/jit/encapsulate_subgraphs_pass.cc
@@ -88,9 +88,12 @@ class Encapsulator {
 
   // Build a FunctionDef for each subgraph, and add it 'library'. The values of
   // the 'group_attribute' annotations become the function names.
+  // If 'reuse_existing_functions' is set, use an existing function with the
+  // same name, if any.
   // If 'rewrite_subgraph_fn' is set, it is applied to each subgraph before
   // function conversion.
   Status BuildFunctionDefs(const RewriteSubgraphFn& rewrite_subgraph_fn,
+                           bool reuse_existing_functions,
                            FunctionLibraryDefinition* library);
 
   // Write a copy of the input graph to 'graph_out', where the subgraphs are
@@ -162,7 +165,7 @@ static const char* const kRetValOp = "_Retval";
 // none.
 string Encapsulator::GetFunctionNameAttr(Node const* node) const {
   string attr;
-  if (!GetNodeAttr(node->def(), group_attribute_, &attr).ok()) {
+  if (!GetNodeAttr(node->attrs(), group_attribute_, &attr).ok()) {
     attr.clear();
   }
   return attr;
@@ -192,7 +195,7 @@ Status Encapsulator::SplitIntoSubgraphs() {
 
     // Check the device matches any existing device.
     string device = node->assigned_device_name().empty()
-                        ? node->def().device()
+                        ? node->requested_device()
                         : node->assigned_device_name();
 
     if (subgraph.device.empty()) {
@@ -236,9 +239,16 @@ Status Encapsulator::SplitIntoSubgraphs() {
         // Create a new _Retval node
         DataType dtype = edge->src()->output_type(edge->src_output());
 
+        if (IsRefType(dtype)) {
+          return errors::InvalidArgument(
+              "Ref Tensors (e.g., Variables) are not supported: tensor ",
+              edge->src()->name(), ":", edge->src_output());
+        }
+
         NodeDef ret_def;
         ret_def.set_op(kRetValOp);
-        ret_def.set_name(src_subgraph.graph->NewName("output"));
+        ret_def.set_name(strings::StrCat(edge->src()->name(), "_",
+                                         edge->src_output(), "_retval"));
         AddNodeAttr("T", dtype, &ret_def);
         AddNodeAttr("index", ret_index, &ret_def);
         Node* ret = src_subgraph.graph->AddNode(ret_def, &s);
@@ -263,8 +273,16 @@ Status Encapsulator::SplitIntoSubgraphs() {
         // This is the first time we have seen this tensor. Create an _Arg node.
         DataType dtype = edge->dst()->input_type(edge->dst_input());
 
+        if (IsRefType(dtype)) {
+          return errors::InvalidArgument(
+              "Ref Tensors (e.g., Variables) are not supported: tensor ",
+              edge->src()->name(), ":", edge->src_output());
+        }
+
         NodeDef arg_def;
-        NodeDefBuilder builder(dst_subgraph.graph->NewName("input"), kArgOp);
+        NodeDefBuilder builder(strings::StrCat(edge->src()->name(), "_",
+                                               edge->src_output(), "_arg"),
+                               kArgOp);
         builder.Attr("T", dtype);
         builder.Attr("index", arg_index);
         s = builder.Finalize(&arg_def);
@@ -291,11 +309,11 @@ Status Encapsulator::SplitIntoSubgraphs() {
 }
 
 Status Encapsulator::BuildFunctionDefs(
-    const RewriteSubgraphFn& rewrite_subgraph_fn,
+    const RewriteSubgraphFn& rewrite_subgraph_fn, bool reuse_existing_functions,
     FunctionLibraryDefinition* library) {
   // For each subgraph, build a FunctionDef.
   for (auto& subgraph_entry : subgraphs_) {
-    const string& name = subgraph_entry.first;
+    string name = subgraph_entry.first;
     Subgraph& subgraph = subgraph_entry.second;
 
     subgraph.call_node_def.set_op(name);
@@ -332,6 +350,8 @@ Status Encapsulator::BuildFunctionDefs(
       for (auto& result : subgraph.results) {
         result.second = output_permutation[result.second];
       }
+
+      name = subgraph.call_node_def.op();
     }
 
     FunctionDef fdef;
@@ -346,7 +366,9 @@ Status Encapsulator::BuildFunctionDefs(
           strings::StrCat("encapsulate_fdef_", name), fdef);
     }
 
-    TF_RETURN_IF_ERROR(library->AddFunctionDef(fdef));
+    if (!reuse_existing_functions || library->Find(name) == nullptr) {
+      TF_RETURN_IF_ERROR(library->AddFunctionDef(fdef));
+    }
   }
   return Status::OK();
 }
@@ -545,14 +567,16 @@ Status Encapsulator::BuildOutputGraph(bool parallel_checking,
 Status EncapsulateSubgraphsInFunctions(
     string group_attribute, const Graph& graph_in,
     const RewriteSubgraphFn& rewrite_subgraph_fn, bool parallel_checking,
-    std::unique_ptr<Graph>* graph_out, FunctionLibraryDefinition* library) {
+    bool reuse_existing_functions, std::unique_ptr<Graph>* graph_out,
+    FunctionLibraryDefinition* library) {
   Status s;
 
   Encapsulator encapsulator(std::move(group_attribute), &graph_in);
   s = encapsulator.SplitIntoSubgraphs();
   if (!s.ok()) return s;
 
-  s = encapsulator.BuildFunctionDefs(rewrite_subgraph_fn, library);
+  s = encapsulator.BuildFunctionDefs(rewrite_subgraph_fn,
+                                     reuse_existing_functions, library);
   if (!s.ok()) return s;
 
   std::unique_ptr<Graph> out(new Graph(library));
@@ -569,7 +593,7 @@ static Status GetArgTypes(const Graph& graph, DataTypeVector* types) {
   for (Node* n : graph.nodes()) {
     if (n->type_string() == kArgOp) {
       int index;
-      TF_RETURN_IF_ERROR(GetNodeAttr(n->def(), "index", &index));
+      TF_RETURN_IF_ERROR(GetNodeAttr(n->attrs(), "index", &index));
       if (index < 0 || index >= types->size()) {
         return errors::InvalidArgument("Invalid argument number");
       }
@@ -586,7 +610,7 @@ static Status RenumberArguments(Graph* graph,
   for (Node* n : graph->nodes()) {
     if (n->type_string() == kArgOp) {
       int index;
-      TF_RETURN_IF_ERROR(GetNodeAttr(n->def(), "index", &index));
+      TF_RETURN_IF_ERROR(GetNodeAttr(n->attrs(), "index", &index));
       if (index < 0 || index >= permutation.size()) {
         return errors::InvalidArgument("Invalid argument number");
       }
@@ -674,7 +698,8 @@ Status EncapsulateSubgraphsPass::Run(
 
   TF_RETURN_IF_ERROR(EncapsulateSubgraphsInFunctions(
       kXlaClusterAttr, **options.graph, rewrite_subgraph,
-      flags->tf_xla_parallel_checking, &graph_out, library));
+      flags->tf_xla_parallel_checking, /*reuse_existing_functions=*/false,
+      &graph_out, library));
 
   if (VLOG_IS_ON(1)) {
     dump_graph::DumpGraphToFile("after_encapsulate_subgraphs", *graph_out,
@@ -688,7 +713,7 @@ Status EncapsulateSubgraphsPass::Run(
 bool IsXlaCompiledKernel(const Node& node) {
   bool is_compiled = false;
   bool has_compilation_attr =
-      GetNodeAttr(node.def(), kXlaCompiledKernelAttr, &is_compiled).ok() &&
+      GetNodeAttr(node.attrs(), kXlaCompiledKernelAttr, &is_compiled).ok() &&
       is_compiled;
   return has_compilation_attr ? is_compiled : false;
 }
diff --git a/tensorflow/compiler/jit/encapsulate_subgraphs_pass.h b/tensorflow/compiler/jit/encapsulate_subgraphs_pass.h
index 3ca7dfbf6a0ec29d9517139ffb952298d503cabc..b0987f76c91ed48df52fab303ea6052ebd8fd336 100644
--- a/tensorflow/compiler/jit/encapsulate_subgraphs_pass.h
+++ b/tensorflow/compiler/jit/encapsulate_subgraphs_pass.h
@@ -34,6 +34,8 @@ namespace tensorflow {
 // 'input_permutation' and 'output_permutation' are initialized to the identity
 // permutation. 'nodedef' is the NodeDef for the call to the function under
 // construction, provided to allow additional attributes to be set.
+// The rewrite may also change the NodeDef's operator name, and that
+// name will be used as the name of the generated function.
 typedef std::function<Status(
     std::unique_ptr<Graph>* graph, std::vector<int>* input_permutation,
     std::vector<int>* output_permutation, NodeDef* node_def)>
@@ -53,6 +55,9 @@ typedef std::function<Status(
 // output graph, together with a "ParallelCheck" operator, that verifies that
 // the original and encapsulated subgraphs produce similar results.
 //
+// If 'reuse_existing_functions' is set, use an existing function with the
+// same name, if any.
+//
 // TODO(phawkins): currently, some information in control edges
 // is not preserved. Suppose you have A and B in the main
 // graph, C and D in a subgraph. B and C have control deps from A, D has control
@@ -61,7 +66,8 @@ typedef std::function<Status(
 Status EncapsulateSubgraphsInFunctions(
     string group_attribute, const Graph& graph_in,
     const RewriteSubgraphFn& rewrite_subgraph_fn, bool parallel_checking,
-    std::unique_ptr<Graph>* graph_out, FunctionLibraryDefinition* library);
+    bool reuse_existing_functions, std::unique_ptr<Graph>* graph_out,
+    FunctionLibraryDefinition* library);
 
 // The attribute that marks function calls produced by the encapsulate
 // subgraphs pass and that should in turn be compiled via _XlaLaunch operators.
diff --git a/tensorflow/compiler/jit/encapsulate_subgraphs_pass_test.cc b/tensorflow/compiler/jit/encapsulate_subgraphs_pass_test.cc
index faab7bd3d25d2491cf74faeb3b06acf4c2d6a054..a8869c8e2a7c164f97917cdae312289efb8b2663 100644
--- a/tensorflow/compiler/jit/encapsulate_subgraphs_pass_test.cc
+++ b/tensorflow/compiler/jit/encapsulate_subgraphs_pass_test.cc
@@ -76,7 +76,7 @@ bool EqualFunctionDefLibrary(const FunctionDefLibrary& expected,
 #define TF_EXPECT_FUNCTIONDEFLIBRARY_EQ(expected, actual)         \
   do {                                                            \
     string diff;                                                  \
-    EXPECT_TRUE(EqualFunctionDefLibrary(actual, expected, &diff)) \
+    EXPECT_TRUE(EqualFunctionDefLibrary(expected, actual, &diff)) \
         << diff << "\nActual: " << actual.DebugString();          \
   } while (false)
 
@@ -109,7 +109,7 @@ Node* Binary(ops::NodeOut a, ops::NodeOut b,
   return ops::BinaryOp("BinaryTest", a, b, opts);
 }
 
-Node* AddNLike(std::vector<ops::NodeOut> inputs,
+Node* AddNLike(const std::vector<ops::NodeOut>& inputs,
                const GraphDefBuilder::Options& opts) {
   if (opts.HaveError()) return nullptr;
   NodeBuilder node_builder(opts.GetNameForOp("AddN"), "AddNLikeTest",
@@ -144,8 +144,9 @@ Status Encapsulate(GraphDef* graphdef, FunctionDefLibrary* library) {
 
   std::unique_ptr<Graph> graph_out;
   s = EncapsulateSubgraphsInFunctions("_encapsulate", *graph,
-                                      /* rewrite_subgraph_fn= */ {},
-                                      /* parallel_checking= */ false,
+                                      /*rewrite_subgraph_fn=*/{},
+                                      /*parallel_checking=*/false,
+                                      /*reuse_existing_functions=*/false,
                                       &graph_out, lib_def.get());
   if (!s.ok()) return s;
 
@@ -205,12 +206,12 @@ TEST(EncapsulateSubgraphsTest, OneFunction) {
 
   *library_expected.add_function() = test::function::XTimesTwo();
   *library_expected.add_function() = FunctionDefHelper::Create(
-      "F1", {"input__0:float", "input__1:float"}, {"output__2:float"}, {},
+      "F1", {"a_0_arg:float", "b_0_arg:float"}, {"c_0_retval:float"}, {},
       {
-          {{"C"}, "UnaryTest", {"input__0"}},
-          {{"c"}, "BinaryTest", {"input__1", "C:o:0"}, {}, {"C"}},
+          {{"C"}, "UnaryTest", {"a_0_arg"}},
+          {{"c"}, "BinaryTest", {"b_0_arg", "C:o:0"}, {}, {"C"}},
       },
-      {{"output__2", "c:o:0"}});
+      {{"c_0_retval", "c:o:0"}});
 
   {
     std::unique_ptr<FunctionLibraryDefinition> lib_def(
@@ -261,17 +262,17 @@ TEST(EncapsulateSubgraphsTest, TwoFunctions) {
 
   *library_expected.add_function() = test::function::XTimesTwo();
   *library_expected.add_function() = FunctionDefHelper::Create(
-      "F1", {"input__0:float"}, {"output__1:float"}, {},
+      "F1", {"a_0_arg:float"}, {"c_0_retval:float"}, {},
       {
-          {{"C"}, "UnaryTest", {"input__0"}},
+          {{"C"}, "UnaryTest", {"a_0_arg"}},
       },
-      {{"output__1", "C:o:0"}});
+      {{"c_0_retval", "C:o:0"}});
   *library_expected.add_function() = FunctionDefHelper::Create(
-      "F2", {"input__0:float", "input__1:float"}, {"output__2:float"}, {},
+      "F2", {"b_0_arg:float", "c_0_arg:float"}, {"d_0_retval:float"}, {},
       {
-          {{"D"}, "BinaryTest", {"input__0", "input__1"}},
+          {{"D"}, "BinaryTest", {"b_0_arg", "c_0_arg"}},
       },
-      {{"output__2", "D:o:0"}});
+      {{"d_0_retval", "D:o:0"}});
 
   {
     std::unique_ptr<FunctionLibraryDefinition> lib_def(
@@ -340,7 +341,8 @@ TEST(EncapsulateSubgraphsTest, InputDeduplication) {
   std::unique_ptr<Graph> graph;
   TF_ASSERT_OK(EncapsulateSubgraphsInFunctions(
       "_cluster", graph_before_encapsulation, /*rewrite_subgraph_fn=*/{},
-      /*parallel_checking=*/false, &graph, &library));
+      /*parallel_checking=*/false, /*reuse_existing_functions=*/false, &graph,
+      &library));
 
   std::vector<string> expected_nodes = {"cluster1", "cluster2", "mul", "x"};
   EXPECT_EQ(expected_nodes, GraphNodes(*graph));
@@ -371,7 +373,8 @@ TEST(EncapsulateSubgraphsTest, ParallelChecking) {
   std::unique_ptr<Graph> graph;
   TF_ASSERT_OK(EncapsulateSubgraphsInFunctions(
       "_cluster", graph_before_encapsulation, /*rewrite_subgraph_fn=*/{},
-      /*parallel_checking=*/true, &graph, &library));
+      /*parallel_checking=*/true, /*reuse_existing_functions=*/false, &graph,
+      &library));
 
   std::vector<string> expected_nodes = {
       "add1", "add2", "cluster1", "cluster1_parallel_check/_0",
diff --git a/tensorflow/compiler/jit/graph_to_functiondef.cc b/tensorflow/compiler/jit/graph_to_functiondef.cc
index 88e292a2c1ad8213bc49589a104b38622dee8327..83c23385008d56859b81abee7d292276036a45ee 100644
--- a/tensorflow/compiler/jit/graph_to_functiondef.cc
+++ b/tensorflow/compiler/jit/graph_to_functiondef.cc
@@ -126,8 +126,8 @@ Status GraphToFunctionDef(const Graph& graph, const string& name,
     if (node->type_string() == kArgOp) {
       int index;
       DataType type;
-      TF_RETURN_IF_ERROR(GetNodeAttr(node->def(), "T", &type));
-      TF_RETURN_IF_ERROR(GetNodeAttr(node->def(), "index", &index));
+      TF_RETURN_IF_ERROR(GetNodeAttr(node->attrs(), "T", &type));
+      TF_RETURN_IF_ERROR(GetNodeAttr(node->attrs(), "index", &index));
       while (fdef->signature().input_arg_size() <= index) {
         fdef->mutable_signature()->add_input_arg();
       }
@@ -143,8 +143,8 @@ Status GraphToFunctionDef(const Graph& graph, const string& name,
     if (node->type_string() == kRetValOp) {
       int index;
       DataType type;
-      TF_RETURN_IF_ERROR(GetNodeAttr(node->def(), "T", &type));
-      TF_RETURN_IF_ERROR(GetNodeAttr(node->def(), "index", &index));
+      TF_RETURN_IF_ERROR(GetNodeAttr(node->attrs(), "T", &type));
+      TF_RETURN_IF_ERROR(GetNodeAttr(node->attrs(), "index", &index));
       while (fdef->signature().output_arg_size() <= index) {
         fdef->mutable_signature()->add_output_arg();
       }
@@ -161,7 +161,7 @@ Status GraphToFunctionDef(const Graph& graph, const string& name,
     }
 
     NodeDef* node_def = fdef->add_node_def();
-    node_def->CopyFrom(node->def());
+    *node_def = node->def();
     node_def->set_name(node_names.Uniquify(node->name()));
 
     // Reset input names based on graph rather than the NodeDef.
@@ -203,8 +203,8 @@ Status GraphToFunctionDef(const Graph& graph, const string& name,
 
     // Populate tensor_renaming.
     NameRangeMap output_ranges;
-    TF_RETURN_IF_ERROR(NameRangesForNode(node->def(), node->op_def(), nullptr,
-                                         &output_ranges));
+    TF_RETURN_IF_ERROR(
+        NameRangesForNode(*node, node->op_def(), nullptr, &output_ranges));
     for (const auto& output : output_ranges) {
       for (int i = output.second.first; i < output.second.second; ++i) {
         const string tensor_name = strings::StrCat(
diff --git a/tensorflow/compiler/jit/kernels/xla_device_launch_op.cc b/tensorflow/compiler/jit/kernels/xla_device_launch_op.cc
index c741ccfb31efa8794ae745e2e52e3c91b20cfcfc..29c5ff724299ec84d31268c4227259ec02d10742 100644
--- a/tensorflow/compiler/jit/kernels/xla_device_launch_op.cc
+++ b/tensorflow/compiler/jit/kernels/xla_device_launch_op.cc
@@ -34,7 +34,7 @@ namespace tensorflow {
 
 namespace {
 
-Status BuildCompilationCache(ResourceMgr* rm, XlaCompilationCache** compiler) {
+Status BuildCompilationCache(ResourceMgr* rm, XlaCompilationCache** cache) {
   XlaDevice::Metadata* metadata;
   Status s = rm->Lookup<XlaDevice::Metadata>(rm->default_container(),
                                              "xla_metadata", &metadata);
@@ -42,12 +42,8 @@ Status BuildCompilationCache(ResourceMgr* rm, XlaCompilationCache** compiler) {
     return s;
   }
   core::ScopedUnref metadata_ref(metadata);
-  XlaCompiler::Options options;
-  options.device_type = metadata->jit_device_type();
-  options.client = metadata->client();
-  options.allow_cpu_custom_calls = false;
-  options.local_executable_has_hybrid_result = false;
-  *compiler = new XlaCompilationCache(options);
+  *cache =
+      new XlaCompilationCache(metadata->client(), metadata->jit_device_type());
   return Status::OK();
 }
 
@@ -59,7 +55,7 @@ XlaDeviceLaunchOp::XlaDeviceLaunchOp(OpKernelConstruction* ctx)
   OP_REQUIRES_OK(ctx, ctx->GetAttr("function", &func));
   function_ = *func;
   VLOG(1) << "XlaDeviceLaunch created function="
-          << Canonicalize(function_.name(), function_.attr());
+          << Canonicalize(function_.name(), AttrSlice(&function_.attr()));
   DataTypeVector constant_types;
   OP_REQUIRES_OK(ctx, ctx->GetAttr("Tconstants", &constant_types));
   num_constant_args_ = constant_types.size();
@@ -85,29 +81,37 @@ std::vector<OptionalTensor> SnapshotResourceVariables(OpKernelContext* ctx,
 
 void XlaDeviceLaunchOp::Compute(OpKernelContext* ctx) {
   VLOG(1) << "XlaDeviceLaunch::Compute "
-          << Canonicalize(function_.name(), function_.attr());
+          << Canonicalize(function_.name(), AttrSlice(&function_.attr()));
   // We store information about the JIT-compiled XLA computation
   // in the ResourceMgr.
   ResourceMgr* rm = ctx->resource_manager();
   OP_REQUIRES(ctx, rm, errors::Internal("No resource manager."));
 
-  XlaCompilationCache* compiler;
+  XlaCompilationCache* cache;
   OP_REQUIRES_OK(ctx, rm->LookupOrCreate<XlaCompilationCache>(
-                          rm->default_container(), "xla_compiler", &compiler,
-                          [rm](XlaCompilationCache** compiler) {
-                            return BuildCompilationCache(rm, compiler);
+                          rm->default_container(), "xla_compiler", &cache,
+                          [rm](XlaCompilationCache** cache) {
+                            return BuildCompilationCache(rm, cache);
                           }));
   // Holds the reference to the JIT during evaluation. (We could probably
   // free it sooner because the ResourceMgr will retain a reference, but
   // this is more obviously correct.)
-  core::ScopedUnref compiler_ref(compiler);
+  core::ScopedUnref cache_ref(cache);
 
   std::vector<OptionalTensor> variables =
       SnapshotResourceVariables(ctx, num_resource_args_);
 
+  XlaCompiler::Options options;
+  options.client = cache->client();
+  options.device_type = &cache->device_type();
+  options.flib_def = ctx->function_library()->GetFunctionLibraryDefinition();
+  options.graph_def_version = ctx->function_library()->graph_def_version();
+  options.allow_cpu_custom_calls = false;
+  options.local_executable_has_hybrid_result = false;
+
   const XlaCompiler::CompilationResult* kernel;
-  OP_REQUIRES_OK(ctx, compiler->Compile(function_, num_constant_args_,
-                                        variables, ctx, &kernel, nullptr));
+  OP_REQUIRES_OK(ctx, cache->Compile(options, function_, num_constant_args_,
+                                     variables, ctx, &kernel, nullptr));
 
   VLOG(1) << "XLA compilation complete...";
 
@@ -117,7 +121,7 @@ void XlaDeviceLaunchOp::Compute(OpKernelContext* ctx) {
   // Runs the computation, if any. There might not be a computation if all
   // outputs were compile-time constants.
   std::vector<std::unique_ptr<xla::GlobalData>> outputs;
-  if (!kernel->computation.IsNull()) {
+  if (!kernel->computation->IsNull()) {
     auto opaque_shape = xla::ShapeUtil::MakeOpaqueShape();
 
     // Builds the inputs to the computation.
@@ -148,8 +152,8 @@ void XlaDeviceLaunchOp::Compute(OpKernelContext* ctx) {
     Env* env = Env::Default();
     auto start_time = env->NowMicros();
     VLOG(1) << "Executing XLA Computation...";
-    auto result = compiler->client()->Execute(kernel->computation, arg_ptrs,
-                                              &execution_options, &profile);
+    auto result = cache->client()->Execute(*kernel->computation, arg_ptrs,
+                                           &execution_options, &profile);
     auto elapsed = env->NowMicros() - start_time;
     OP_REQUIRES(ctx, result.ok(), result.status());
 
@@ -158,7 +162,7 @@ void XlaDeviceLaunchOp::Compute(OpKernelContext* ctx) {
 
     if (xla::ShapeUtil::IsTuple(kernel->xla_output_shape)) {
       auto outputs_or_error =
-          compiler->client()->DeconstructTuple(*result.ValueOrDie());
+          cache->client()->DeconstructTuple(*result.ValueOrDie());
       OP_REQUIRES(ctx, outputs_or_error.ok(), outputs_or_error.status());
       outputs = outputs_or_error.ConsumeValueOrDie();
     } else {
diff --git a/tensorflow/compiler/jit/kernels/xla_local_launch_op.cc b/tensorflow/compiler/jit/kernels/xla_local_launch_op.cc
index 8b43c7c1564a340b70e8cfa271a3ef50379b46bc..40acc0d81d08230b373823e333cd5e3e407b9c4f 100644
--- a/tensorflow/compiler/jit/kernels/xla_local_launch_op.cc
+++ b/tensorflow/compiler/jit/kernels/xla_local_launch_op.cc
@@ -148,24 +148,28 @@ XlaLocalLaunchOp::XlaLocalLaunchOp(OpKernelConstruction* ctx)
   OP_REQUIRES(ctx, num_resource_args == 0,
               errors::Unimplemented(
                   "XlaLocalLaunchOp does not support resource variables"));
-}
-
-Status XlaLocalLaunchOp::BuildCompilationCache(XlaCompilationCache** compiler) {
-  gpu::Platform::Id platform_id;
   if (device_type_ == DeviceType(DEVICE_CPU)) {
-    platform_id = gpu::host::kHostPlatformId;
+    platform_id_ = gpu::host::kHostPlatformId;
   } else if (device_type_ == DeviceType(DEVICE_GPU)) {
-    platform_id = gpu::cuda::kCudaPlatformId;
+    platform_id_ = gpu::cuda::kCudaPlatformId;
   } else {
-    return errors::InvalidArgument("Unknown device type for local _XlaLaunch");
+    ctx->SetStatus(
+        errors::InvalidArgument("Unknown device type for local _XlaLaunch"));
+    return;
   }
+}
 
-  auto platform = gpu::MultiPlatformManager::PlatformWithId(platform_id);
+Status XlaLocalLaunchOp::BuildCompilationCache(OpKernelContext* ctx,
+                                               XlaCompilationCache** cache) {
+  auto platform = gpu::MultiPlatformManager::PlatformWithId(platform_id_);
   if (!platform.ok()) {
     return StreamExecutorUtil::ConvertStatus(platform.status());
   }
-  auto client =
-      xla::ClientLibrary::GetOrCreateLocalClient(platform.ValueOrDie());
+  xla::LocalClientOptions client_options;
+  client_options.set_platform(platform.ValueOrDie());
+  client_options.set_intra_op_parallelism_threads(
+      ctx->device()->tensorflow_cpu_worker_threads()->num_threads);
+  auto client = xla::ClientLibrary::GetOrCreateLocalClient(client_options);
   if (!client.ok()) {
     return client.status();
   }
@@ -175,18 +179,14 @@ Status XlaLocalLaunchOp::BuildCompilationCache(XlaCompilationCache** compiler) {
     return errors::InvalidArgument("No JIT device registered for ",
                                    device_type_.type());
   }
-  XlaCompiler::Options options;
-  options.device_type = DeviceType(registration->compilation_device_name);
-  options.client = client.ValueOrDie();
-  options.allow_cpu_custom_calls = (platform_id == gpu::host::kHostPlatformId);
-  options.local_executable_has_hybrid_result = true;
-  *compiler = new XlaCompilationCache(options);
+  *cache = new XlaCompilationCache(
+      client.ValueOrDie(), DeviceType(registration->compilation_device_name));
   return Status::OK();
 }
 
 void XlaLocalLaunchOp::Compute(OpKernelContext* ctx) {
   VLOG(1) << "XlaLocalLaunchOp::Compute "
-          << Canonicalize(function_.name(), function_.attr());
+          << Canonicalize(function_.name(), AttrSlice(&function_.attr()));
   // We store information about the JIT-compiled XLA computation
   // in the ResourceMgr.
   ResourceMgr* rm = ctx->resource_manager();
@@ -195,23 +195,31 @@ void XlaLocalLaunchOp::Compute(OpKernelContext* ctx) {
   gpu::Stream* stream =
       ctx->op_device_context() ? ctx->op_device_context()->stream() : nullptr;
 
-  XlaCompilationCache* compiler;
+  XlaCompilationCache* cache;
   OP_REQUIRES_OK(ctx, rm->LookupOrCreate<XlaCompilationCache>(
-                          rm->default_container(), "xla_compiler", &compiler,
-                          [this](XlaCompilationCache** compiler) {
-                            return BuildCompilationCache(compiler);
+                          rm->default_container(), "xla_cache", &cache,
+                          [this, ctx](XlaCompilationCache** cache) {
+                            return BuildCompilationCache(ctx, cache);
                           }));
   // Hold the reference to the JIT during evaluation. (We could probably
   // free it sooner because the ResourceMgr will retain a reference, but
   // this is more obviously correct.)
-  core::ScopedUnref compiler_ref(compiler);
+  core::ScopedUnref cache_ref(cache);
+
+  xla::LocalClient* client = static_cast<xla::LocalClient*>(cache->client());
 
-  xla::LocalClient* client = static_cast<xla::LocalClient*>(compiler->client());
+  XlaCompiler::Options options;
+  options.client = client;
+  options.device_type = &cache->device_type();
+  options.flib_def = ctx->function_library()->GetFunctionLibraryDefinition();
+  options.graph_def_version = ctx->function_library()->graph_def_version();
+  options.allow_cpu_custom_calls = (platform_id_ == gpu::host::kHostPlatformId);
+  options.local_executable_has_hybrid_result = true;
 
   const XlaCompiler::CompilationResult* kernel;
   xla::LocalExecutable* executable;
-  OP_REQUIRES_OK(ctx, compiler->Compile(function_, num_constant_args_, {}, ctx,
-                                        &kernel, &executable));
+  OP_REQUIRES_OK(ctx, cache->Compile(options, function_, num_constant_args_, {},
+                                     ctx, &kernel, &executable));
 
   VLOG(1) << "Executing XLA Computation...";
 
@@ -221,7 +229,7 @@ void XlaLocalLaunchOp::Compute(OpKernelContext* ctx) {
 
   std::unique_ptr<xla::ShapedBuffer> output;
   bool output_is_tuple;
-  if (!kernel->computation.IsNull()) {
+  if (!kernel->computation->IsNull()) {
     // Build xla::ShapedBuffers that point directly to the Tensor buffers.
     std::vector<std::unique_ptr<xla::ShapedBuffer>> arg_buffers;
     arg_buffers.reserve(kernel->xla_input_shapes.size() + 1);
@@ -260,8 +268,6 @@ void XlaLocalLaunchOp::Compute(OpKernelContext* ctx) {
     xla::ExecutableRunOptions run_options;
     run_options.set_stream(stream);
     run_options.set_allocator(&xla_allocator);
-    run_options.set_inter_op_thread_pool(
-        ctx->device()->tensorflow_cpu_worker_threads()->workers);
     run_options.set_intra_op_thread_pool(&ctx->eigen_cpu_device());
     Env* env = Env::Default();
     auto start_time = env->NowMicros();
diff --git a/tensorflow/compiler/jit/kernels/xla_local_launch_op.h b/tensorflow/compiler/jit/kernels/xla_local_launch_op.h
index 8023206762951a4dafba900dd291f2ee9bdbbdf3..5e4d3336a91001fac1d222709f64300e777247c7 100644
--- a/tensorflow/compiler/jit/kernels/xla_local_launch_op.h
+++ b/tensorflow/compiler/jit/kernels/xla_local_launch_op.h
@@ -22,6 +22,7 @@ limitations under the License.
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/platform/macros.h"
+#include "tensorflow/core/util/stream_executor_util.h"
 
 namespace tensorflow {
 
@@ -43,11 +44,15 @@ class XlaLocalLaunchOp : public OpKernel {
 
  private:
   // Builds a XlaCompilationCache class suitable for the current device.
-  Status BuildCompilationCache(XlaCompilationCache** compiler);
+  Status BuildCompilationCache(OpKernelContext* ctx,
+                               XlaCompilationCache** compiler);
 
   DeviceType device_type_;
   NameAttrList function_;
   int num_constant_args_;
+
+  perftools::gputools::Platform::Id platform_id_;
+
   TF_DISALLOW_COPY_AND_ASSIGN(XlaLocalLaunchOp);
 };
 
diff --git a/tensorflow/compiler/jit/mark_for_compilation_pass.cc b/tensorflow/compiler/jit/mark_for_compilation_pass.cc
index b27c07d0d987aafef1943fd795293bd066ad36f6..73c4e80551485189d1e43fd93eed39083bd6b6b7 100644
--- a/tensorflow/compiler/jit/mark_for_compilation_pass.cc
+++ b/tensorflow/compiler/jit/mark_for_compilation_pass.cc
@@ -52,20 +52,22 @@ bool HasXLAKernel(const Node& node, const DeviceType& jit_device_type) {
 // Make sure we don't recurse infinitely on recursive functions.
 const int kMaxRecursionDepth = 10;
 
-bool IsCompilableCall(const NodeDef& call_def, DeviceType jit_device_type,
-                      int depth, FunctionLibraryRuntime* lib_runtime);
+bool IsCompilableCall(const NodeDef& call_def,
+                      const DeviceType& jit_device_type, int depth,
+                      FunctionLibraryRuntime* lib_runtime);
 
-// Tests whether 'while_def' is a completely compilable loop.
+// Tests whether 'while_node' is a completely compilable loop.
 // Every operator in the condition and body functions must be compilable for a
 // while loop to be compilable.
-bool IsCompilableWhile(const NodeDef& while_def, DeviceType jit_device_type,
-                       int depth, FunctionLibraryRuntime* lib_runtime) {
-  VLOG(2) << "Loop marking: " << while_def.op();
+bool IsCompilableWhile(const Node& while_node,
+                       const DeviceType& jit_device_type, int depth,
+                       FunctionLibraryRuntime* lib_runtime) {
+  VLOG(2) << "Loop marking: " << while_node.type_string();
 
   const NameAttrList* name_attr;
   NodeDef call;
   Status status;
-  status = GetNodeAttr(while_def, "cond", &name_attr);
+  status = GetNodeAttr(while_node.attrs(), "cond", &name_attr);
   if (!status.ok()) {
     VLOG(2) << "Missing 'cond' attribute on While node.";
     return false;
@@ -78,7 +80,7 @@ bool IsCompilableWhile(const NodeDef& while_def, DeviceType jit_device_type,
     VLOG(2) << "Can't compile loop condition: " << cond_func;
     return false;
   }
-  status = GetNodeAttr(while_def, "body", &name_attr);
+  status = GetNodeAttr(while_node.attrs(), "body", &name_attr);
   if (!status.ok()) {
     VLOG(2) << "Missing 'body' attribute on While node.";
     return false;
@@ -98,8 +100,9 @@ bool IsCompilableWhile(const NodeDef& while_def, DeviceType jit_device_type,
 // Tests whether 'call_def' is a call to a completely compilable function.
 // Every operator in the function must be compilable for a function to be
 // compilable.
-bool IsCompilableCall(const NodeDef& call_def, DeviceType jit_device_type,
-                      int depth, FunctionLibraryRuntime* lib_runtime) {
+bool IsCompilableCall(const NodeDef& call_def,
+                      const DeviceType& jit_device_type, int depth,
+                      FunctionLibraryRuntime* lib_runtime) {
   VLOG(2) << "Function marking: " << call_def.op();
 
   if (depth > kMaxRecursionDepth) {
@@ -109,7 +112,7 @@ bool IsCompilableCall(const NodeDef& call_def, DeviceType jit_device_type,
 
   FunctionLibraryRuntime::Handle handle;
   Status status =
-      lib_runtime->Instantiate(call_def.op(), call_def.attr(), &handle);
+      lib_runtime->Instantiate(call_def.op(), AttrSlice(call_def), &handle);
   if (!status.ok()) {
     VLOG(2) << "Could not instantiate " << call_def.op() << ": " << status;
     return false;
@@ -131,11 +134,11 @@ bool IsCompilableCall(const NodeDef& call_def, DeviceType jit_device_type,
 
   for (Node* node : fbody->graph->nodes()) {
     if (node->IsSource() || node->IsSink()) continue;
-    if (node->def().op() == "_Arg" || node->def().op() == "_Retval") continue;
-    if (node->def().op() == "While") {
+    if (node->type_string() == "_Arg" || node->type_string() == "_Retval")
+      continue;
+    if (node->type_string() == "While") {
       // Handle functional While loop (not in open source build).
-      return IsCompilableWhile(node->def(), jit_device_type, depth + 1,
-                               lib_runtime);
+      return IsCompilableWhile(*node, jit_device_type, depth + 1, lib_runtime);
     }
     if (!HasXLAKernel(*node, jit_device_type) &&
         !IsCompilableCall(node->def(), jit_device_type, depth + 1,
@@ -189,17 +192,16 @@ Status FindCompilationCandidates(
     if (!HasXLAKernel(*node, jit_device_type) &&
         !IsCompilableCall(node->def(), jit_device_type, 0, lib_runtime.get())) {
       VLOG(2) << "Compilation rejected node: unsupported op " << node->name()
-              << ": " << node->def().op();
+              << ": " << node->type_string();
       continue;
     }
     if (!registration->compile_resource_ops && HasResourceArgument(*node)) {
       VLOG(2) << "Compilation rejected node: resource argument " << node->name()
-              << ": " << node->def().op();
+              << ": " << node->type_string();
       continue;
     }
-    if (node->def().op() == "While" &&
-        !IsCompilableWhile(node->def(), jit_device_type, 0,
-                           lib_runtime.get())) {
+    if (node->type_string() == "While" &&
+        !IsCompilableWhile(*node, jit_device_type, 0, lib_runtime.get())) {
       continue;
     }
     candidates->insert(node);
@@ -316,10 +318,10 @@ Status MarkForCompilationPass::Run(
 
     // If there is a _XlaCompile annotation, use its value.
     bool compile = false;
-    Status status = GetNodeAttr(node->def(), kXlaCompileAttr, &compile);
+    Status status = GetNodeAttr(node->attrs(), kXlaCompileAttr, &compile);
     if (status.ok()) return compile;
 
-    status = fld->GetAttr(node->def(), kXlaCompileAttr, &compile);
+    status = fld->GetAttr(*node, kXlaCompileAttr, &compile);
     if (status.ok()) return compile;
 
     // Otherwise use the value of global_jit_level.
@@ -482,8 +484,8 @@ Status MarkForCompilationPass::RunImpl(
       // all nodes marked with _XlaCompile=true to also have a
       // _XlaScope property set (and raise an error otherwise); but
       // for now we don't do this.
-      if (GetNodeAttr(node_from->def(), kXlaScopeAttr, &from_scope).ok() &&
-          GetNodeAttr(node_to->def(), kXlaScopeAttr, &to_scope).ok() &&
+      if (GetNodeAttr(node_from->attrs(), kXlaScopeAttr, &from_scope).ok() &&
+          GetNodeAttr(node_to->attrs(), kXlaScopeAttr, &to_scope).ok() &&
           from_scope != to_scope) {
         continue;
       }
@@ -538,10 +540,9 @@ Status MarkForCompilationPass::RunImpl(
     // Compile if the user marked this node _XlaCompile=true
     bool compile_attr = false;
     bool marked_for_compilation = false;
-    if (GetNodeAttr(n->def(), kXlaCompileAttr, &compile_attr).ok()) {
+    if (GetNodeAttr(n->attrs(), kXlaCompileAttr, &compile_attr).ok()) {
       marked_for_compilation = compile_attr;
-    } else if (options.flib_def
-                   ->GetAttr(n->def(), kXlaCompileAttr, &compile_attr)
+    } else if (options.flib_def->GetAttr(*n, kXlaCompileAttr, &compile_attr)
                    .ok()) {
       marked_for_compilation = compile_attr;
     }
diff --git a/tensorflow/compiler/jit/mark_for_compilation_pass_test.cc b/tensorflow/compiler/jit/mark_for_compilation_pass_test.cc
index 91e4a2b41c7026b6ca028ed6a7e61588d57e9e50..9f30e12e0e30fef6b4bcd0ea3c091842b008c29a 100644
--- a/tensorflow/compiler/jit/mark_for_compilation_pass_test.cc
+++ b/tensorflow/compiler/jit/mark_for_compilation_pass_test.cc
@@ -57,7 +57,7 @@ std::unordered_map<string, string> GetClusters(const Graph& graph) {
   std::unordered_map<string, string> ids;
   for (Node* node : graph.nodes()) {
     string cluster;
-    if (GetNodeAttr(node->def(), kXlaClusterAttr, &cluster).ok()) {
+    if (GetNodeAttr(node->attrs(), kXlaClusterAttr, &cluster).ok()) {
       CHECK(!cluster.empty());
       ids[node->name()] = cluster;
     }
diff --git a/tensorflow/compiler/jit/xla_compilation_cache.cc b/tensorflow/compiler/jit/xla_compilation_cache.cc
index 41abea02eb2d17423744dfb719ee9a3f6b8f1198..63ca77f9a912acce2078f3da43d64f2e10049380 100644
--- a/tensorflow/compiler/jit/xla_compilation_cache.cc
+++ b/tensorflow/compiler/jit/xla_compilation_cache.cc
@@ -37,9 +37,9 @@ limitations under the License.
 
 namespace tensorflow {
 
-XlaCompilationCache::XlaCompilationCache(const XlaCompiler::Options& options)
-    : compiler_(options) {}
-
+XlaCompilationCache::XlaCompilationCache(xla::Client* client,
+                                         DeviceType device_type)
+    : client_(client), device_type_(std::move(device_type)) {}
 XlaCompilationCache::~XlaCompilationCache() = default;
 
 string XlaCompilationCache::DebugString() {
@@ -95,7 +95,7 @@ Status XlaCompilationCache::BuildSignature(
     const NameAttrList& function, int num_constant_args,
     const std::vector<OptionalTensor>& variable_args, OpKernelContext* ctx,
     Signature* signature) {
-  signature->name = Canonicalize(function.name(), function.attr());
+  signature->name = Canonicalize(function.name(), AttrSlice(&function.attr()));
   signature->arg_values.resize(num_constant_args);
 
   signature->arg_types.reserve(ctx->num_inputs() - num_constant_args);
@@ -205,8 +205,9 @@ Status BuildArguments(int num_constant_args,
 }  // namespace
 
 Status XlaCompilationCache::Compile(
-    const NameAttrList& function, int num_constant_args,
-    const std::vector<OptionalTensor>& variable_args, OpKernelContext* ctx,
+    const XlaCompiler::Options& options, const NameAttrList& function,
+    int num_constant_args, const std::vector<OptionalTensor>& variable_args,
+    OpKernelContext* ctx,
     const XlaCompiler::CompilationResult** compilation_result,
     xla::LocalExecutable** executable) {
   VLOG(1) << "XlaCompilationCache::Compile " << DebugString();
@@ -263,21 +264,18 @@ Status XlaCompilationCache::Compile(
     TF_RETURN_IF_ERROR(
         BuildArguments(num_constant_args, variable_args, ctx, &args));
 
-    std::unique_ptr<FunctionLibraryRuntime> flr(NewFunctionLibraryRuntime(
-        compiler_.device_mgr(), ctx->env(), compiler_.device(),
-        TF_GRAPH_DEF_VERSION,
-        ctx->function_library()->GetFunctionLibraryDefinition(),
-        OptimizerOptions(), nullptr /* custom_kernel_creator */));
-
+    XlaCompiler compiler(options);
     entry->compiled = true;
-    entry->compilation_status = compiler_.CompileFunction(
-        flr.get(), function, args, &entry->compilation_result);
+    entry->compilation_status =
+        compiler.CompileFunction(XlaCompiler::CompileOptions(), function, args,
+                                 &entry->compilation_result);
   }
   *compilation_result = &entry->compilation_result;
   if (entry->compilation_status.ok() && executable) {
     if (entry->executable == nullptr &&
-        !entry->compilation_result.computation.IsNull()) {
-      entry->compilation_status = compiler_.BuildExecutable(
+        !entry->compilation_result.computation->IsNull()) {
+      XlaCompiler compiler(options);
+      entry->compilation_status = compiler.BuildExecutable(
           entry->compilation_result, &entry->executable);
     }
     *executable = entry->executable.get();
diff --git a/tensorflow/compiler/jit/xla_compilation_cache.h b/tensorflow/compiler/jit/xla_compilation_cache.h
index ff67e48d1a9a9f16881c2e141b23ce8c479aef50..4ffcb68a3220b2354a3542e4c2a4d3e000969e0b 100644
--- a/tensorflow/compiler/jit/xla_compilation_cache.h
+++ b/tensorflow/compiler/jit/xla_compilation_cache.h
@@ -46,7 +46,7 @@ struct OptionalTensor {
 // bound.
 class XlaCompilationCache : public ResourceBase {
  public:
-  explicit XlaCompilationCache(const XlaCompiler::Options& options);
+  XlaCompilationCache(xla::Client* client, DeviceType device_type);
   ~XlaCompilationCache() override;
 
   // Compiles a function into a XlaCompiler::CompilationResult that can be used
@@ -61,19 +61,21 @@ class XlaCompilationCache : public ResourceBase {
   // xla::LocalExecutable and sets `executable to point to it. The resulting
   // executable pointer may be null if the computation has no non-constant
   // outputs.
-  Status Compile(const NameAttrList& function, int num_constant_args,
+  Status Compile(const XlaCompiler::Options& options,
+                 const NameAttrList& function, int num_constant_args,
                  const std::vector<OptionalTensor>& variable_args,
                  OpKernelContext* ctx,
                  const XlaCompiler::CompilationResult** compilation_result,
                  xla::LocalExecutable** executable);
 
-  xla::Client* client() const { return compiler_.client(); }
+  xla::Client* client() const { return client_; }
+  const DeviceType& device_type() const { return device_type_; }
 
   string DebugString() override;
 
  private:
-  XlaCompiler compiler_;
-  std::unique_ptr<FunctionLibraryRuntime> function_library_runtime_;
+  xla::Client* const client_;
+  const DeviceType device_type_;
 
   // Describes the types, shapes and any compile-time constant arguments
   // to a kernel. Key that uniquely identifies a compilation output.
diff --git a/tensorflow/compiler/jit/xla_device.cc b/tensorflow/compiler/jit/xla_device.cc
index 93f487c36ca5ca8f7e3930cf8f053367400d7920..5e336c5287bd9e2067e93cd8db8a5a1b62b62bd2 100644
--- a/tensorflow/compiler/jit/xla_device.cc
+++ b/tensorflow/compiler/jit/xla_device.cc
@@ -125,7 +125,7 @@ XlaDevice::XlaDevice(const SessionOptions& options,
                      const DeviceType& jit_device_name,
                      perftools::gputools::Platform* platform,
                      Allocator* xla_allocator)
-    : LocalDevice(options, attrs, xla_allocator),
+    : LocalDevice(options, attrs),
       device_ordinal_(device_ordinal),
       jit_device_name_(jit_device_name),
       xla_allocator_(xla_allocator),
diff --git a/tensorflow/compiler/jit/xla_device_ops.h b/tensorflow/compiler/jit/xla_device_ops.h
index a52239df252b2b556987fa9701f43047765c60de..8699006ebc5aacafd46046a7c3f093356f687280 100644
--- a/tensorflow/compiler/jit/xla_device_ops.h
+++ b/tensorflow/compiler/jit/xla_device_ops.h
@@ -63,30 +63,10 @@ class XlaDeviceDummyOp : public OpKernel {
   REGISTER_KERNEL_BUILDER(Name("PlaceholderV2").Device(DEVICE),                \
                           PlaceholderOp);                                      \
                                                                                \
-  REGISTER_KERNEL_BUILDER(Name("ControlTrigger").Device(DEVICE),               \
-                          ControlTriggerOp);                                   \
-  REGISTER_KERNEL_BUILDER(Name("Enter").Device(DEVICE), EnterOp);              \
-  REGISTER_KERNEL_BUILDER(Name("Exit").Device(DEVICE), ExitOp);                \
-  REGISTER_KERNEL_BUILDER(Name("NextIteration").Device(DEVICE),                \
-                          NextIterationOp);                                    \
-  REGISTER_KERNEL_BUILDER(Name("Switch").Device(DEVICE).HostMemory("pred"),    \
-                          SwitchOp);                                           \
-  REGISTER_KERNEL_BUILDER(                                                     \
-      Name("Merge").Device(DEVICE).HostMemory("value_index"), MergeOp);        \
-  REGISTER_KERNEL_BUILDER(Name("LoopCond")                                     \
-                              .Device(DEVICE)                                  \
-                              .HostMemory("input")                             \
-                              .HostMemory("output"),                           \
-                          IdentityOp);                                         \
-                                                                               \
   REGISTER_KERNEL_BUILDER(                                                     \
       Name("VarHandleOp").Device(DEVICE).HostMemory("resource"),               \
       ResourceHandleOp<Var>);
 
-// TODO(b/32507444): the registrations for the control flow operators are
-// temporary and exist primarily to work around a bug in the graph partitioning
-// code.
-
 }  // namespace tensorflow
 
 #endif  // TENSORFLOW_COMPILER_JIT_XLA_DEVICE_OPS_H_
diff --git a/tensorflow/compiler/tests/BUILD b/tensorflow/compiler/tests/BUILD
index 0592e3d4b1993d132aa955171c3b523af9869fee..19f7ff835456855a2b2ab7d5856f1d3e6f7f9733 100644
--- a/tensorflow/compiler/tests/BUILD
+++ b/tensorflow/compiler/tests/BUILD
@@ -65,6 +65,20 @@ tf_xla_py_test(
     ],
 )
 
+tf_xla_py_test(
+    name = "adam_test",
+    size = "small",
+    srcs = ["adam_test.py"],
+    deps = [
+        ":xla_test",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:platform_test",
+        "//tensorflow/python:training",
+    ],
+)
+
 tf_xla_py_test(
     name = "binary_ops_test",
     size = "small",
@@ -156,6 +170,19 @@ tf_xla_py_test(
     ],
 )
 
+tf_xla_py_test(
+    name = "slice_ops_test",
+    size = "small",
+    srcs = ["slice_ops_test.py"],
+    deps = [
+        ":xla_test",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:data_flow_ops",
+        "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:platform_test",
+    ],
+)
+
 tf_xla_py_test(
     name = "function_test",
     size = "small",
diff --git a/tensorflow/compiler/tests/adagrad_test.py b/tensorflow/compiler/tests/adagrad_test.py
index 0a2c9e26c6fbd827d5ab669dea5419f9fa50025b..a5c5885b4284aee167ae4cb18f7e42820c6d251d 100644
--- a/tensorflow/compiler/tests/adagrad_test.py
+++ b/tensorflow/compiler/tests/adagrad_test.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Functional tests for aggregate operations."""
+"""Tests for Adagrad."""
 
 from __future__ import absolute_import
 from __future__ import division
diff --git a/tensorflow/compiler/tests/adam_test.py b/tensorflow/compiler/tests/adam_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..3215dc36e5b2d517aa951db1b0d41188185ef93a
--- /dev/null
+++ b/tensorflow/compiler/tests/adam_test.py
@@ -0,0 +1,176 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for Adam."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.compiler.tests.xla_test import XLATestCase
+from tensorflow.python.framework import constant_op
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import resource_variable_ops
+from tensorflow.python.ops import variable_scope
+from tensorflow.python.ops import variables
+from tensorflow.python.platform import test
+from tensorflow.python.training import adam
+
+
+def adam_update_numpy(param,
+                      g_t,
+                      t,
+                      m,
+                      v,
+                      alpha=0.001,
+                      beta1=0.9,
+                      beta2=0.999,
+                      epsilon=1e-8):
+  alpha_t = alpha * np.sqrt(1 - beta2**t) / (1 - beta1**t)
+
+  m_t = beta1 * m + (1 - beta1) * g_t
+  v_t = beta2 * v + (1 - beta2) * g_t * g_t
+
+  param_t = param - alpha_t * m_t / (np.sqrt(v_t) + epsilon)
+  return param_t, m_t, v_t
+
+
+class AdamOptimizerTest(XLATestCase):
+
+  def testBasic(self):
+    for dtype in self.float_types:
+      with self.test_session(), self.test_scope():
+        variable_scope.get_variable_scope().set_use_resource(True)
+
+        # Initialize variables for numpy implementation.
+        m0, v0, m1, v1 = 0.0, 0.0, 0.0, 0.0
+        var0_np = np.array([1.0, 2.0], dtype=dtype)
+        grads0_np = np.array([0.1, 0.1], dtype=dtype)
+        var1_np = np.array([3.0, 4.0], dtype=dtype)
+        grads1_np = np.array([0.01, 0.01], dtype=dtype)
+
+        var0 = resource_variable_ops.ResourceVariable(var0_np)
+        var1 = resource_variable_ops.ResourceVariable(var1_np)
+        grads0 = array_ops.placeholder(dtype)
+        grads1 = array_ops.placeholder(dtype)
+        opt = adam.AdamOptimizer()
+        update = opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
+        variables.global_variables_initializer().run()
+
+        # Fetch params to validate initial values
+        self.assertAllClose([1.0, 2.0], var0.eval())
+        self.assertAllClose([3.0, 4.0], var1.eval())
+
+        beta1_power, beta2_power = opt._get_beta_accumulators()
+
+        # Run 3 steps of Adam
+        for t in range(1, 4):
+          self.assertAllCloseAccordingToType(0.9**t, beta1_power.eval())
+          self.assertAllCloseAccordingToType(0.999**t, beta2_power.eval())
+          update.run(feed_dict={grads0: grads0_np, grads1: grads1_np})
+
+          var0_np, m0, v0 = adam_update_numpy(var0_np, grads0_np, t, m0, v0)
+          var1_np, m1, v1 = adam_update_numpy(var1_np, grads1_np, t, m1, v1)
+
+          # Validate updated params
+          self.assertAllCloseAccordingToType(var0_np, var0.eval())
+          self.assertAllCloseAccordingToType(var1_np, var1.eval())
+
+  def testTensorLearningRate(self):
+    for dtype in self.float_types:
+      with self.test_session(), self.test_scope():
+        variable_scope.get_variable_scope().set_use_resource(True)
+
+        # Initialize variables for numpy implementation.
+        m0, v0, m1, v1 = 0.0, 0.0, 0.0, 0.0
+        var0_np = np.array([1.0, 2.0], dtype=dtype)
+        grads0_np = np.array([0.1, 0.1], dtype=dtype)
+        var1_np = np.array([3.0, 4.0], dtype=dtype)
+        grads1_np = np.array([0.01, 0.01], dtype=dtype)
+
+        var0 = resource_variable_ops.ResourceVariable(var0_np)
+        var1 = resource_variable_ops.ResourceVariable(var1_np)
+        grads0 = array_ops.placeholder(dtype)
+        grads1 = array_ops.placeholder(dtype)
+        opt = adam.AdamOptimizer(constant_op.constant(0.001))
+        update = opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
+        variables.global_variables_initializer().run()
+
+        # Fetch params to validate initial values
+        self.assertAllClose([1.0, 2.0], var0.eval())
+        self.assertAllClose([3.0, 4.0], var1.eval())
+
+        beta1_power, beta2_power = opt._get_beta_accumulators()
+
+        # Run 3 steps of Adam
+        for t in range(1, 4):
+          self.assertAllCloseAccordingToType(0.9**t, beta1_power.eval())
+          self.assertAllCloseAccordingToType(0.999**t, beta2_power.eval())
+          update.run(feed_dict={grads0: grads0_np, grads1: grads1_np})
+
+          var0_np, m0, v0 = adam_update_numpy(var0_np, grads0_np, t, m0, v0)
+          var1_np, m1, v1 = adam_update_numpy(var1_np, grads1_np, t, m1, v1)
+
+          # Validate updated params
+          self.assertAllCloseAccordingToType(var0_np, var0.eval())
+          self.assertAllCloseAccordingToType(var1_np, var1.eval())
+
+  def testSharing(self):
+    for dtype in self.float_types:
+      with self.test_session(), self.test_scope():
+        variable_scope.get_variable_scope().set_use_resource(True)
+
+        # Initialize variables for numpy implementation.
+        m0, v0, m1, v1 = 0.0, 0.0, 0.0, 0.0
+        var0_np = np.array([1.0, 2.0], dtype=dtype)
+        grads0_np = np.array([0.1, 0.1], dtype=dtype)
+        var1_np = np.array([3.0, 4.0], dtype=dtype)
+        grads1_np = np.array([0.01, 0.01], dtype=dtype)
+
+        var0 = resource_variable_ops.ResourceVariable(var0_np)
+        var1 = resource_variable_ops.ResourceVariable(var1_np)
+        grads0 = array_ops.placeholder(dtype)
+        grads1 = array_ops.placeholder(dtype)
+        opt = adam.AdamOptimizer()
+        update1 = opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
+        update2 = opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
+        variables.global_variables_initializer().run()
+
+        beta1_power, beta2_power = opt._get_beta_accumulators()
+
+        # Fetch params to validate initial values
+        self.assertAllClose([1.0, 2.0], var0.eval())
+        self.assertAllClose([3.0, 4.0], var1.eval())
+
+        # Run 3 steps of intertwined Adam1 and Adam2.
+        for t in range(1, 4):
+          self.assertAllCloseAccordingToType(0.9**t, beta1_power.eval())
+          self.assertAllCloseAccordingToType(0.999**t, beta2_power.eval())
+          if t % 2 == 0:
+            update1.run(feed_dict={grads0: grads0_np, grads1: grads1_np})
+          else:
+            update2.run(feed_dict={grads0: grads0_np, grads1: grads1_np})
+
+          var0_np, m0, v0 = adam_update_numpy(var0_np, grads0_np, t, m0, v0)
+          var1_np, m1, v1 = adam_update_numpy(var1_np, grads1_np, t, m1, v1)
+
+          # Validate updated params
+          self.assertAllCloseAccordingToType(var0_np, var0.eval())
+          self.assertAllCloseAccordingToType(var1_np, var1.eval())
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/compiler/tests/randomized_tests.cc b/tensorflow/compiler/tests/randomized_tests.cc
index 7d91594db009f79475afc30ca4a8972b157806ee..2a71543f3febe3cb692fdcd563772c3bd2d3724a 100644
--- a/tensorflow/compiler/tests/randomized_tests.cc
+++ b/tensorflow/compiler/tests/randomized_tests.cc
@@ -94,7 +94,7 @@ class OpTestBuilder {
   explicit OpTestBuilder(const string& op_name);
 
   // Adds an input 'tensor'.
-  OpTestBuilder& Input(Tensor tensor);
+  OpTestBuilder& Input(const Tensor& tensor);
 
   // Sets an attribute.
   template <class T>
@@ -111,8 +111,8 @@ class OpTestBuilder {
   // sets it to the NodeDef of the operator under test. Fills 'inputs' and
   // 'outputs' with the names of the input placeholder nodes and the output
   // identity nodes, respectively.
-  Status BuildGraph(string name_prefix, string device, bool use_jit,
-                    GraphDef* graphdef, NodeDef** test_node_def,
+  Status BuildGraph(const string& name_prefix, const string& device,
+                    bool use_jit, GraphDef* graphdef, NodeDef** test_node_def,
                     std::vector<string>* inputs,
                     std::vector<string>* outputs) const;
 
@@ -127,7 +127,7 @@ OpTestBuilder::OpTestBuilder(const string& op_name) {
   node_def_.set_op(op_name);
 }
 
-OpTestBuilder& OpTestBuilder::Input(Tensor tensor) {
+OpTestBuilder& OpTestBuilder::Input(const Tensor& tensor) {
   VLOG(1) << "Adding input: " << tensor.DebugString();
   inputs_.push_back(tensor);
   return *this;
@@ -146,9 +146,9 @@ OpTestBuilder& OpTestBuilder::Attr(StringPiece attr_name,
   return *this;
 }
 
-Status OpTestBuilder::BuildGraph(string name_prefix, string device,
-                                 bool use_jit, GraphDef* graphdef,
-                                 NodeDef** test_node_def,
+Status OpTestBuilder::BuildGraph(const string& name_prefix,
+                                 const string& device, bool use_jit,
+                                 GraphDef* graphdef, NodeDef** test_node_def,
                                  std::vector<string>* inputs,
                                  std::vector<string>* outputs) const {
   OpRegistryInterface* op_registry = OpRegistry::Global();
@@ -209,7 +209,7 @@ class OpTest : public ::testing::Test {
 
   // Runs 'fn' up to --tf_xla_test_repetitions times, or until a failure occurs;
   // whichever happens first.
-  void Repeatedly(std::function<void(void)> fn);
+  void Repeatedly(const std::function<void(void)>& fn);
 
   // Select a random element from 'candidates'.
   template <typename T>
@@ -315,7 +315,7 @@ OpTest::OpTest() {
   TF_CHECK_OK(session_->Create(def));
 }
 
-void OpTest::Repeatedly(std::function<void(void)> fn) {
+void OpTest::Repeatedly(const std::function<void(void)>& fn) {
   int const max_repetitions = tf_xla_test_repetitions;
   for (int i = 0; !HasFailure() && i < max_repetitions; ++i) {
     fn();
diff --git a/tensorflow/compiler/tests/slice_ops_test.py b/tensorflow/compiler/tests/slice_ops_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..795885f8302dbf41ef04e37b87abdd0d4bf12727
--- /dev/null
+++ b/tensorflow/compiler/tests/slice_ops_test.py
@@ -0,0 +1,142 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for slicing."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.compiler.tests.xla_test import XLATestCase
+from tensorflow.python.framework import dtypes
+from tensorflow.python.ops import array_ops
+from tensorflow.python.platform import googletest
+
+class SliceTest(XLATestCase):
+
+  def test1D(self):
+    for dtype in self.numeric_types:
+      with self.test_session():
+        i = array_ops.placeholder(dtype, shape=[10])
+        with self.test_scope():
+          o = array_ops.slice(i, [2], [4])
+        params = {
+          i: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
+        }
+        result = o.eval(feed_dict=params)
+
+        self.assertAllEqual([2, 3, 4, 5], result)
+
+  def test3D(self):
+    for dtype in self.numeric_types:
+      with self.test_session():
+        i = array_ops.placeholder(dtype, shape=[3, 3, 10])
+        with self.test_scope():
+          o = array_ops.slice(i, [1, 2, 2], [1, 1, 4])
+        params = {
+          i: [[[0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
+               [9, 8, 7, 6, 5, 4, 3, 2, 1, 0],
+               [5, 3, 1, 7, 9, 2, 4, 6, 8, 0]],
+              [[5, 5, 5, 5, 5, 5, 5, 5, 5, 5],
+               [1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
+               [8, 7, 6, 5, 4, 3, 2, 1, 8, 7]],
+              [[7, 5, 7, 5, 7, 5, 7, 5, 7, 5],
+               [1, 2, 1, 2, 1, 2, 1, 2, 1, 2],
+               [9, 8, 7, 9, 8, 7, 9, 8, 7, 9]]]
+        }
+        result = o.eval(feed_dict=params)
+
+        self.assertAllEqual([[[6, 5, 4, 3]]], result)
+
+class StridedSliceTest(XLATestCase):
+
+  def test1D(self):
+    for dtype in self.numeric_types:
+      with self.test_session():
+        i = array_ops.placeholder(dtype, shape=[10])
+        with self.test_scope():
+          o = array_ops.strided_slice(i, [2], [6], [2])
+        params = {
+          i: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
+        }
+        result = o.eval(feed_dict=params)
+
+        self.assertAllEqual([2, 4], result)
+
+  def test1DNegtiveStride(self):
+    for dtype in self.numeric_types:
+      with self.test_session():
+        i = array_ops.placeholder(dtype, shape=[10])
+        with self.test_scope():
+          o = array_ops.strided_slice(i, [6], [2], [-2])
+        params = {
+          i: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
+        }
+        result = o.eval(feed_dict=params)
+
+        self.assertAllEqual([6, 4], result)
+
+  def test3D(self):
+    for dtype in self.numeric_types:
+      with self.test_session():
+        i = array_ops.placeholder(dtype, shape=[3, 3, 10])
+        with self.test_scope():
+          o = array_ops.strided_slice(i, [0, 2, 2], [2, 3, 6], [1, 1, 2])
+        params = {
+          i: [[[0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
+               [9, 8, 7, 6, 5, 4, 3, 2, 1, 0],
+               [5, 3, 1, 7, 9, 2, 4, 6, 8, 0]],
+              [[5, 5, 5, 5, 5, 5, 5, 5, 5, 5],
+               [1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
+               [8, 7, 6, 5, 4, 3, 2, 1, 8, 7]],
+              [[7, 5, 7, 5, 7, 5, 7, 5, 7, 5],
+               [1, 2, 1, 2, 1, 2, 1, 2, 1, 2],
+               [9, 8, 7, 9, 8, 7, 9, 8, 7, 9]]]
+        }
+        result = o.eval(feed_dict=params)
+
+        self.assertAllEqual([[[1, 9]],
+                             [[6, 4]]], result)
+
+  def test3DNegativeStride(self):
+    for dtype in self.numeric_types:
+      with self.test_session():
+        i = array_ops.placeholder(dtype, shape=[3, 4, 10])
+        with self.test_scope():
+          o = array_ops.strided_slice(i, [2, 2, 6], [0, 0, 2], [-1, -1, -2])
+        params = {
+          i: [[[0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
+               [9, 8, 7, 6, 5, 4, 3, 2, 1, 0],
+               [5, 3, 1, 7, 9, 2, 4, 6, 8, 0],
+               [4, 5, 2, 4, 3, 7, 6, 8, 9, 4]],
+              [[5, 5, 5, 5, 5, 5, 5, 5, 5, 5],
+               [4, 3, 4, 5, 7, 6, 5, 3, 4, 5],
+               [8, 7, 6, 5, 4, 3, 2, 1, 8, 7],
+               [7, 1, 7, 1, 8, 1, 8, 1, 3, 1]],
+              [[7, 5, 7, 5, 7, 5, 7, 5, 7, 5],
+               [1, 2, 1, 2, 1, 2, 1, 2, 1, 2],
+               [9, 8, 7, 9, 8, 7, 9, 8, 7, 9],
+               [9, 9, 5, 5, 6, 6, 3, 3, 6, 6]]]
+        }
+        result = o.eval(feed_dict=params)
+
+        self.assertAllEqual([[[9, 8],
+                              [1, 1]],
+                             [[2, 4],
+                              [5, 7]]], result)
+
+if __name__ == "__main__":
+  googletest.main()
diff --git a/tensorflow/compiler/tests/variable_ops_test.py b/tensorflow/compiler/tests/variable_ops_test.py
index dcb9e2db2f8ca7ef6e89cb9c6493d15dcaacd46e..fef390fd67f38bc2b1a26cb2e80ffa4ca834d98d 100644
--- a/tensorflow/compiler/tests/variable_ops_test.py
+++ b/tensorflow/compiler/tests/variable_ops_test.py
@@ -26,6 +26,7 @@ from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import init_ops
 from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import resource_variable_ops
 from tensorflow.python.ops import state_ops
 from tensorflow.python.ops import variable_scope
 from tensorflow.python.ops import variables
@@ -36,6 +37,21 @@ from tensorflow.python.training.gradient_descent import GradientDescentOptimizer
 class VariableOpsTest(XLATestCase):
   """Test cases for resource variable operators."""
 
+  def testOneWriteOneOutput(self):
+    # Regression test for a bug where computations with one non-constant
+    # output and one variable update were mishandled.
+    for dtype in self.numeric_types:
+      init = np.array([[1, 2], [3, 4]], dtype=dtype)
+      with self.test_session() as sess, self.test_scope():
+        v = resource_variable_ops.ResourceVariable(init)
+        sess.run(variables.variables_initializer([v]))
+        p = array_ops.placeholder(dtype)
+        x = v.assign_add(p)
+        with ops.control_dependencies([x]):
+          y = v.read_value()
+        self.assertAllClose(np.array([[2, 3], [4, 5]], dtype=dtype),
+                            sess.run(y, {p: 1}))
+
   def testReadWrite(self):
     """Tests initialization, reading, and writing a resource variable."""
     with self.test_session() as session:
diff --git a/tensorflow/compiler/tests/xla_device_test.py b/tensorflow/compiler/tests/xla_device_test.py
index 1388a892ba5a1d07c05eedf277085099923ae901..f5c228f8305d740b994dadc34c93b4e0ae32d785 100644
--- a/tensorflow/compiler/tests/xla_device_test.py
+++ b/tensorflow/compiler/tests/xla_device_test.py
@@ -18,15 +18,10 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import numpy as np
-
 from tensorflow.python.client import session as session_lib
-from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import control_flow_ops
-from tensorflow.python.ops import math_ops
 from tensorflow.python.platform import test
 
 
@@ -48,34 +43,6 @@ class XlaDeviceTest(test.TestCase):
       result = sess.run(w, {x: [1.5, 0.5]})
     self.assertAllClose(result, [12., 2.], rtol=1e-3)
 
-  def testLoops(self):
-    """Tests that loops work on XLA devices."""
-
-    with session_lib.Session() as session:
-      x = array_ops.placeholder(dtypes.float32)
-      with ops.device("device:XLA_CPU:0"):
-        c = lambda i, _: math_ops.less(i, 5)
-        b = lambda i, x: (i + 1, x * 2.0 + 1.0)
-        _, y = control_flow_ops.while_loop(c, b, (constant_op.constant(0), x))
-
-      result = session.run(y, {x: np.float32(2)})
-      self.assertAllClose(result, np.float32(95), rtol=1e-3)
-
-  def testCond(self):
-    """Tests that tf.cond works on XLA devices."""
-
-    with session_lib.Session() as session:
-      x = array_ops.placeholder(dtypes.float32)
-      y = array_ops.placeholder(dtypes.float32)
-      c = array_ops.placeholder(dtypes.bool)
-      with ops.device("device:XLA_CPU:0"):
-        z = x + 1.0
-        w = control_flow_ops.cond(c, lambda: z, lambda: y)
-        t = math_ops.add(z, w)
-
-      result = session.run(t, {x: np.float32(2), y: np.float32(4), c: True})
-      self.assertAllClose(result, np.float32(6), rtol=1e-3)
-
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/compiler/tf2xla/const_analysis.cc b/tensorflow/compiler/tf2xla/const_analysis.cc
index 44ff13ca34e740b12f28d4952ab968472e5d1e57..4adc17b8382bd423264a693a09e2cec0803ad9cf 100644
--- a/tensorflow/compiler/tf2xla/const_analysis.cc
+++ b/tensorflow/compiler/tf2xla/const_analysis.cc
@@ -108,7 +108,7 @@ Status BackwardsConstAnalysis(const Graph& g,
     if (must_be_const.find(node) != must_be_const.end()) {
       if (node->type_string() == "_Arg") {
         int index;
-        status = GetNodeAttr(node->def(), "index", &index);
+        status = GetNodeAttr(node->attrs(), "index", &index);
         if (!status.ok()) return;
         compile_time_const_args->at(index) = true;
         return;
@@ -124,8 +124,8 @@ Status BackwardsConstAnalysis(const Graph& g,
     if (range.first == range.second) return;
 
     NameRangeMap input_name_ranges;
-    status = NameRangesForNode(node->def(), node->op_def(), &input_name_ranges,
-                               nullptr);
+    status =
+        NameRangesForNode(*node, node->op_def(), &input_name_ranges, nullptr);
     if (!status.ok()) return;
 
     for (auto it = range.first; it != range.second; ++it) {
diff --git a/tensorflow/compiler/tf2xla/kernels/function_ops.cc b/tensorflow/compiler/tf2xla/kernels/function_ops.cc
index d718f98545f66cb79a77d758a3fb7ee486d87b4b..8dacb6627bde516c92cb07b747207adbe85ada5b 100644
--- a/tensorflow/compiler/tf2xla/kernels/function_ops.cc
+++ b/tensorflow/compiler/tf2xla/kernels/function_ops.cc
@@ -68,7 +68,8 @@ class SymbolicGradientOp : public AsyncOpKernel {
                       done);
 
     OP_REQUIRES_OK_ASYNC(
-        ctx, lib->Instantiate(kGradientOp, def().attr(), &handle_), done);
+        ctx, lib->Instantiate(kGradientOp, AttrSlice(&def().attr()), &handle_),
+        done);
 
     FunctionLibraryRuntime::Options opts;
     opts.step_id = ctx->step_id();
diff --git a/tensorflow/compiler/tf2xla/kernels/gather_op_kernel_float_int32.cc b/tensorflow/compiler/tf2xla/kernels/gather_op_kernel_float_int32.cc
index eff23bd77d23afc882c67f8168270d1cb4413977..ef844cc6c5ae07a3e6331971023a280ee0cafe41 100644
--- a/tensorflow/compiler/tf2xla/kernels/gather_op_kernel_float_int32.cc
+++ b/tensorflow/compiler/tf2xla/kernels/gather_op_kernel_float_int32.cc
@@ -20,6 +20,7 @@ limitations under the License.
 #include "tensorflow/core/framework/tensor_types.h"
 #include "tensorflow/core/kernels/gather_functor.h"
 #include "tensorflow/core/platform/dynamic_annotations.h"
+#include "tensorflow/core/platform/macros.h"
 
 namespace tensorflow {
 
@@ -63,7 +64,7 @@ EIGEN_STRONG_INLINE void gather_float_int32_xla_impl(float* out, void** data) {
 
 // Implements gather on CPU. This is called by an XLA custom call, set up by
 // gather_op.cc.
-extern "C" void __attribute__((visibility("default")))
+extern "C" void TF_EXPORT
 gather_float_int32_xla_impl(float* out, void** data) {
   tensorflow::gather_float_int32_xla_impl(out, data);
 }
diff --git a/tensorflow/compiler/tf2xla/kernels/gather_op_kernel_float_int64.cc b/tensorflow/compiler/tf2xla/kernels/gather_op_kernel_float_int64.cc
index ae31f6f2006959c03941a1eb04b31aecf52424b0..4c8693d1976bf0817a01c2bacbbf4708202ce51e 100644
--- a/tensorflow/compiler/tf2xla/kernels/gather_op_kernel_float_int64.cc
+++ b/tensorflow/compiler/tf2xla/kernels/gather_op_kernel_float_int64.cc
@@ -20,6 +20,7 @@ limitations under the License.
 #include "tensorflow/core/framework/tensor_types.h"
 #include "tensorflow/core/kernels/gather_functor.h"
 #include "tensorflow/core/platform/dynamic_annotations.h"
+#include "tensorflow/core/platform/macros.h"
 
 namespace tensorflow {
 
@@ -63,7 +64,7 @@ EIGEN_STRONG_INLINE void gather_float_int64_xla_impl(float* out, void** data) {
 
 // Implements gather on CPU. This is called by an XLA custom call, set up by
 // gather_op.cc.
-extern "C" void __attribute__((visibility("default")))
+extern "C" void TF_EXPORT
 gather_float_int64_xla_impl(float* out, void** data) {
   tensorflow::gather_float_int64_xla_impl(out, data);
 }
diff --git a/tensorflow/compiler/tf2xla/kernels/index_ops_kernel_argmax_float_1d.cc b/tensorflow/compiler/tf2xla/kernels/index_ops_kernel_argmax_float_1d.cc
index 0033a949a372684caadce70bf46a996a942e9ec4..a71f2fcf0f7755d4e9ed2a9fd8b50a2e07bcfd2f 100644
--- a/tensorflow/compiler/tf2xla/kernels/index_ops_kernel_argmax_float_1d.cc
+++ b/tensorflow/compiler/tf2xla/kernels/index_ops_kernel_argmax_float_1d.cc
@@ -18,6 +18,7 @@ limitations under the License.
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
 #include "tensorflow/core/framework/tensor_types.h"
 #include "tensorflow/core/platform/dynamic_annotations.h"
+#include "tensorflow/core/platform/macros.h"
 #include "tensorflow/core/platform/types.h"
 
 namespace tensorflow {
@@ -43,7 +44,7 @@ EIGEN_STRONG_INLINE void argmax_float_1d_xla_impl(void* out, void** data) {
 
 // Implements argmax on CPU. This is called by an XLA custom call, set up by
 // index_ops.cc.
-extern "C" void __attribute__((visibility("default")))
+extern "C" void TF_EXPORT
 argmax_float_1d_xla_impl(void* out, void** data) {
   tensorflow::argmax_float_1d_xla_impl(out, data);
 }
diff --git a/tensorflow/compiler/tf2xla/kernels/index_ops_kernel_argmax_float_2d.cc b/tensorflow/compiler/tf2xla/kernels/index_ops_kernel_argmax_float_2d.cc
index be8ad2317c9ba6a39f839c4a535440fb94365aa9..f30eb6121fc858c50b9c00255e86105fe8ebcc54 100644
--- a/tensorflow/compiler/tf2xla/kernels/index_ops_kernel_argmax_float_2d.cc
+++ b/tensorflow/compiler/tf2xla/kernels/index_ops_kernel_argmax_float_2d.cc
@@ -18,6 +18,7 @@ limitations under the License.
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
 #include "tensorflow/core/framework/tensor_types.h"
 #include "tensorflow/core/platform/dynamic_annotations.h"
+#include "tensorflow/core/platform/macros.h"
 #include "tensorflow/core/platform/types.h"
 
 namespace tensorflow {
@@ -45,7 +46,7 @@ EIGEN_STRONG_INLINE void argmax_float_2d_xla_impl(void* out, void** data) {
 
 // Implements argmax on CPU. This is called by an XLA custom call, set up by
 // index_ops.cc.
-extern "C" void __attribute__((visibility("default")))
+extern "C" void TF_EXPORT
 argmax_float_2d_xla_impl(void* out, void** data) {
   tensorflow::argmax_float_2d_xla_impl(out, data);
 }
diff --git a/tensorflow/compiler/tf2xla/kernels/strided_slice_op.cc b/tensorflow/compiler/tf2xla/kernels/strided_slice_op.cc
index 03e02299e33a4e2bf62e757b2092db35288b0bea..bbe157bbeac56a396d946685c164867194accb42 100644
--- a/tensorflow/compiler/tf2xla/kernels/strided_slice_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/strided_slice_op.cc
@@ -77,11 +77,9 @@ class StridedSliceOp : public XlaOpKernel {
 
     gtl::InlinedVector<int64, 4> dimensions_to_reverse;
     gtl::InlinedVector<int64, 4> slice_begin, slice_end;
+    bool simple_strides = true;
     for (int i = 0; i < begin.size(); ++i) {
-      // TODO(phawkins): implement strides != 1 when b/30878775 is fixed.
-      OP_REQUIRES(
-          ctx, strides[i] == 1 || strides[i] == -1,
-          errors::Unimplemented("Strides != 1 or -1 are not yet implemented"));
+      simple_strides &= (std::abs(strides[i]) == 1);
       if (strides[i] > 0) {
         slice_begin.push_back(begin[i]);
         slice_end.push_back(end[i]);
@@ -99,6 +97,36 @@ class StridedSliceOp : public XlaOpKernel {
       slice = ctx->builder()->Rev(slice, dimensions_to_reverse);
     }
 
+    // If at least one of the strides is > 1 (or < -1) then use Slice
+    // to pull out each of the strided slices, and Concat to put them
+    // together again.
+    if (!simple_strides) {
+
+      // Re-adjust the begin and end now that the periphery has been
+      // sliced away.
+      for (int d = 0; d < strides.size(); ++d) {
+        slice_end[d] -= slice_begin[d];
+        slice_begin[d] = 0;
+      }
+
+      for (int d = 0; d < strides.size(); ++d) {
+        int64 stride = std::abs(strides[d]);
+        if (stride > 1) {
+          std::vector<xla::ComputationDataHandle> to_concat;
+          int64 end = slice_end[d];
+          for (int64 i = 0; i < end; i += stride) {
+            slice_begin[d] = i;
+            slice_end[d] = i+1;
+            to_concat.push_back(ctx->builder()->Slice(slice, slice_begin,
+                                                      slice_end));
+          }
+          slice = ctx->builder()->ConcatInDim(to_concat, d);
+          slice_begin[d] = 0;
+          slice_end[d] = to_concat.size();
+        }
+      }
+    }
+
     slice = ctx->builder()->Reshape(slice, final_shape.dim_sizes());
     ctx->SetOutput(0, slice);
   }
diff --git a/tensorflow/compiler/tf2xla/kernels/training_ops.cc b/tensorflow/compiler/tf2xla/kernels/training_ops.cc
index f1d81f871423b220c6859c1dedf79b1c36a43e65..ddd81cb490cd76065735a5b7e78d04fd76c05f82 100644
--- a/tensorflow/compiler/tf2xla/kernels/training_ops.cc
+++ b/tensorflow/compiler/tf2xla/kernels/training_ops.cc
@@ -165,6 +165,106 @@ class ResourceApplyAdagrad : public XlaOpKernel {
 };
 REGISTER_XLA_OP(Name("ResourceApplyAdagrad"), ResourceApplyAdagrad);
 
+class ResourceApplyAdam : public XlaOpKernel {
+ public:
+  explicit ResourceApplyAdam(OpKernelConstruction* ctx) : XlaOpKernel(ctx) {
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("T", &dtype_));
+  }
+
+  void Compile(XlaOpKernelContext* ctx) override {
+    DataType var_type, m_type, v_type;
+    TensorShape var_shape, m_shape, v_shape;
+    OP_REQUIRES_OK(ctx, ctx->GetVariableTypeAndShape(0, &var_type, &var_shape));
+    OP_REQUIRES_OK(ctx, ctx->GetVariableTypeAndShape(1, &m_type, &m_shape));
+    OP_REQUIRES_OK(ctx, ctx->GetVariableTypeAndShape(2, &v_type, &v_shape));
+
+    OP_REQUIRES(
+        ctx, dtype_ == var_type && dtype_ == m_type && dtype_ == v_type,
+        errors::InvalidArgument(
+            "Types of variable arguments to ResourceApplyRMSProp must match: ",
+            DataTypeString(dtype_), " vs. ", DataTypeString(var_type), " vs. ",
+            DataTypeString(m_type), " vs. ", DataTypeString(v_type)));
+
+    TensorShape beta1_power_shape = ctx->InputShape(3);
+    TensorShape beta2_power_shape = ctx->InputShape(4);
+    TensorShape lr_shape = ctx->InputShape(5);
+    TensorShape beta1_shape = ctx->InputShape(6);
+    TensorShape beta2_shape = ctx->InputShape(7);
+    TensorShape epsilon_shape = ctx->InputShape(8);
+    TensorShape grad_shape = ctx->InputShape(9);
+
+    OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(beta1_power_shape),
+                errors::InvalidArgument("beta1_power is not a scalar: ",
+                                        beta1_power_shape.DebugString()));
+    OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(beta2_power_shape),
+                errors::InvalidArgument("beta2_power is not a scalar: ",
+                                        beta2_power_shape.DebugString()));
+    OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(lr_shape),
+                errors::InvalidArgument("lr is not a scalar : ",
+                                        lr_shape.DebugString()));
+    OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(beta1_shape),
+                errors::InvalidArgument("beta1 is not a scalar: ",
+                                        beta1_shape.DebugString()));
+    OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(beta2_shape),
+                errors::InvalidArgument("beta2 is not a scalar: ",
+                                        beta2_shape.DebugString()));
+    OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(epsilon_shape),
+                errors::InvalidArgument("epsilon is not a scalar: ",
+                                        epsilon_shape.DebugString()));
+
+    OP_REQUIRES(ctx, var_shape.IsSameSize(m_shape),
+                errors::InvalidArgument("var and m do not have the same shape",
+                                        var_shape.DebugString(), " ",
+                                        m_shape.DebugString()));
+    OP_REQUIRES(ctx, var_shape.IsSameSize(v_shape),
+                errors::InvalidArgument("var and v do not have the same shape",
+                                        var_shape.DebugString(), " ",
+                                        v_shape.DebugString()));
+    OP_REQUIRES(ctx, var_shape.IsSameSize(grad_shape),
+                errors::InvalidArgument(
+                    "var and grad do not have the same shape",
+                    var_shape.DebugString(), " ", grad_shape.DebugString()));
+
+    xla::ComputationDataHandle var, m, v;
+    OP_REQUIRES_OK(ctx, ctx->ReadVariableInput(0, &var));
+    OP_REQUIRES_OK(ctx, ctx->ReadVariableInput(1, &m));
+    OP_REQUIRES_OK(ctx, ctx->ReadVariableInput(2, &v));
+    xla::ComputationDataHandle beta1_power = ctx->Input(3);
+    xla::ComputationDataHandle beta2_power = ctx->Input(4);
+    xla::ComputationDataHandle lr = ctx->Input(5);
+    xla::ComputationDataHandle beta1 = ctx->Input(6);
+    xla::ComputationDataHandle beta2 = ctx->Input(7);
+    xla::ComputationDataHandle epsilon = ctx->Input(8);
+    xla::ComputationDataHandle grad = ctx->Input(9);
+
+    // alpha <- learning_rate * sqrt(1 - beta2^t) / (1 - beta1^t)
+    // m_t <- beta1 * m_{t-1} + (1 - beta1) * g_t
+    // v_t <- beta2 * v_{t-1} + (1 - beta2) * g_t * g_t
+    // variable <- variable - alpha * m_t / (sqrt(v_t) + epsilon)
+
+    xla::ComputationBuilder* b = ctx->builder();
+    xla::ComputationDataHandle half = XlaHelpers::FloatLiteral(b, dtype_, 0.5);
+    xla::ComputationDataHandle one = XlaHelpers::FloatLiteral(b, dtype_, 1.0);
+    xla::ComputationDataHandle two = XlaHelpers::FloatLiteral(b, dtype_, 2.0);
+
+    xla::ComputationDataHandle alpha =
+        b->Div(b->Mul(lr, b->Pow(b->Sub(one, beta2_power), half)),
+               b->Sub(one, beta1_power));
+    m = b->Add(m, b->Mul(b->Sub(grad, m), b->Sub(one, beta1)));
+    v = b->Add(v, b->Mul(b->Sub(b->Pow(grad, two), v), b->Sub(one, beta2)));
+    var =
+        b->Sub(var, b->Div(b->Mul(m, alpha), b->Add(b->Pow(v, half), epsilon)));
+
+    OP_REQUIRES_OK(ctx, ctx->AssignVariable(0, dtype_, var));
+    OP_REQUIRES_OK(ctx, ctx->AssignVariable(1, dtype_, m));
+    OP_REQUIRES_OK(ctx, ctx->AssignVariable(2, dtype_, v));
+  }
+
+ private:
+  DataType dtype_;
+};
+REGISTER_XLA_OP(Name("ResourceApplyAdam"), ResourceApplyAdam);
+
 class ResourceApplyRMSProp : public XlaOpKernel {
  public:
   explicit ResourceApplyRMSProp(OpKernelConstruction* ctx) : XlaOpKernel(ctx) {}
diff --git a/tensorflow/compiler/tf2xla/str_util.cc b/tensorflow/compiler/tf2xla/str_util.cc
index ce25d631271b54a36078cd0d3ac4d318d58db9fa..2b0834fe7b6c4d2199267dbe0ec1f7c2785aa9c7 100644
--- a/tensorflow/compiler/tf2xla/str_util.cc
+++ b/tensorflow/compiler/tf2xla/str_util.cc
@@ -22,7 +22,7 @@ limitations under the License.
 namespace tensorflow {
 namespace str_util {
 
-void ReplaceAll(string* text, StringPiece from, StringPiece to) {
+static void ReplaceAll(string* text, StringPiece from, StringPiece to) {
   size_t pos = 0;
   while ((pos = text->find(from.data(), pos, from.size())) != string::npos) {
     text->replace(pos, from.size(), to.data(), to.size());
diff --git a/tensorflow/compiler/tf2xla/str_util.h b/tensorflow/compiler/tf2xla/str_util.h
index 4920b1a4d4875192d6f06988b810ad388bc6293b..51f25009d7003db0d72296619a469ecbbbb1808d 100644
--- a/tensorflow/compiler/tf2xla/str_util.h
+++ b/tensorflow/compiler/tf2xla/str_util.h
@@ -29,10 +29,6 @@ limitations under the License.
 namespace tensorflow {
 namespace str_util {
 
-// Replace all non-overlapping occurrences of from with to in-place in text.  If
-// from is empty, it matches at the beginning of the text and after every byte.
-void ReplaceAll(string* text, StringPiece from, StringPiece to);
-
 // Replace all non-overlapping occurrences of the given (from,to) pairs in-place
 // in text.  If from is empty, it matches at the beginning of the text and after
 // every byte.  Each (from,to) replacement pair is processed in the order it is
diff --git a/tensorflow/compiler/tf2xla/str_util_test.cc b/tensorflow/compiler/tf2xla/str_util_test.cc
index f992007a34532157f86c90c717a5e24c3923f22d..8817f6902a8e58e796ca5240a9a24d7506d38793 100644
--- a/tensorflow/compiler/tf2xla/str_util_test.cc
+++ b/tensorflow/compiler/tf2xla/str_util_test.cc
@@ -25,36 +25,6 @@ limitations under the License.
 namespace tensorflow {
 namespace str_util {
 
-class ReplaceAllTest : public ::testing::Test {
- protected:
-  void ExpectReplaceAll(string text, StringPiece from, StringPiece to,
-                        StringPiece want) {
-    ReplaceAll(&text, from, to);
-    EXPECT_EQ(text, want);
-  }
-};
-
-TEST_F(ReplaceAllTest, Simple) {
-  ExpectReplaceAll("", "", "", "");
-  ExpectReplaceAll("", "", "X", "X");
-  ExpectReplaceAll("", "", "XYZ", "XYZ");
-  ExpectReplaceAll("banana", "", "", "banana");
-  ExpectReplaceAll("banana", "", "_", "_b_a_n_a_n_a_");
-  ExpectReplaceAll("banana", "", "__", "__b__a__n__a__n__a__");
-  ExpectReplaceAll("banana", "a", "a", "banana");
-  ExpectReplaceAll("banana", "a", "", "bnn");
-  ExpectReplaceAll("banana", "a", "X", "bXnXnX");
-  ExpectReplaceAll("banana", "a", "XX", "bXXnXXnXX");
-  ExpectReplaceAll("banana", "an", "an", "banana");
-  ExpectReplaceAll("banana", "an", "", "ba");
-  ExpectReplaceAll("banana", "an", "X", "bXXa");
-  ExpectReplaceAll("banana", "an", "XY", "bXYXYa");
-  ExpectReplaceAll("banana", "an", "XYZ", "bXYZXYZa");
-  ExpectReplaceAll("foo {{bar}} baz {{bar}}", "{{bar}}", "X", "foo X baz X");
-  ExpectReplaceAll("foo {{bar}} baz {{bar}}", "{{bar}}", "ABCDEFGHIJKLMNOP",
-                   "foo ABCDEFGHIJKLMNOP baz ABCDEFGHIJKLMNOP");
-}
-
 class ReplaceAllPairsTest : public ::testing::Test {
  protected:
   void ExpectReplaceAllPairs(
diff --git a/tensorflow/compiler/tf2xla/xla_compilation_device.cc b/tensorflow/compiler/tf2xla/xla_compilation_device.cc
index d86e741b69e08652bac2dd7b5295c8ab2d94433a..362a1018955f9b6adbdea5ba718b81e9a2389957 100644
--- a/tensorflow/compiler/tf2xla/xla_compilation_device.cc
+++ b/tensorflow/compiler/tf2xla/xla_compilation_device.cc
@@ -76,8 +76,7 @@ XlaCompilationDevice::XlaCompilationDevice(const SessionOptions& options,
           options,
           Device::BuildDeviceAttributes(
               "", type, Bytes(256 << 20), DeviceLocality(),
-              strings::StrCat("device: XLA compilation device ", type.type())),
-          cpu_allocator()),
+              strings::StrCat("device: XLA compilation device ", type.type()))),
       allocator_(new XlaCompilationAllocator()) {}
 
 XlaCompilationDevice::~XlaCompilationDevice() {}
diff --git a/tensorflow/compiler/tf2xla/xla_compiler.cc b/tensorflow/compiler/tf2xla/xla_compiler.cc
index 33b4a43aa1544f883d4242148ce77eebb8a4c54c..d4a917671b9cb9031e04d6840625b034720934c7 100644
--- a/tensorflow/compiler/tf2xla/xla_compiler.cc
+++ b/tensorflow/compiler/tf2xla/xla_compiler.cc
@@ -57,16 +57,37 @@ Status CheckSignature(const DataTypeVector& types,
 
 }  // namespace
 
+bool XlaCompiler::Argument::operator==(
+    const XlaCompiler::Argument& other) const {
+  if (std::tie(kind, type, shape, name) !=
+      std::tie(other.kind, other.type, other.shape, other.name)) {
+    return false;
+  }
+  if (constant_value.shape() != other.constant_value.shape()) {
+    return false;
+  }
+  return constant_value.tensor_data() == other.constant_value.tensor_data();
+}
+
 XlaCompiler::XlaCompiler(XlaCompiler::Options options)
     : options_(std::move(options)),
       initialization_status_(Status::OK()),
       next_step_id_(1),
-      device_(new XlaCompilationDevice(SessionOptions(), options_.device_type)),
+      device_(
+          new XlaCompilationDevice(SessionOptions(), *options_.device_type)),
       device_mgr_({device_}) {
+  // We no longer need the device_type.
+  options_.device_type = nullptr;
+
   if (options_.populate_resource_manager) {
     initialization_status_ =
         (*options_.populate_resource_manager)(device_->resource_manager());
   }
+
+  flib_runtime_.reset(NewFunctionLibraryRuntime(
+      &device_mgr_, Env::Default(), device_, options.graph_def_version,
+      options.flib_def, OptimizerOptions(),
+      nullptr /* custom_kernel_creator */));
 }
 
 XlaCompiler::~XlaCompiler() = default;
@@ -76,37 +97,35 @@ int64 XlaCompiler::NextStepId() {
   return next_step_id_++;
 }
 
-// Prunes any nodes from a function that are not dependencies of the _Retval
-// nodes. Used to prune stateful ops from within a function body, such as
-// variable initializers, that should not be executed unless requested.
-static void PruneUnreachableNodes(Graph* graph) {
-  std::unordered_set<const Node*> nodes;
-  for (Node* node : graph->nodes()) {
-    if (node->type_string() == "_Retval" ||
-        StringPiece(node->type_string()).ends_with("Send")) {
-      nodes.insert(node);
-    }
-  }
-  PruneForReverseReachability(graph, nodes);
+uint64 XlaCompiler::SignatureHash::operator()(
+    const std::pair<string, std::vector<Argument>>& signature) const {
+  return std::hash<string>()(signature.first);
 }
 
 Status XlaCompiler::CompileFunction(
-    FunctionLibraryRuntime* flr, const NameAttrList& function,
+    const XlaCompiler::CompileOptions& options, const NameAttrList& function,
     const std::vector<XlaCompiler::Argument>& args,
     XlaCompiler::CompilationResult* result) {
-  const string function_id = Canonicalize(function.name(), function.attr());
+  const string function_id =
+      Canonicalize(function.name(), AttrSlice(&function.attr()));
   VLOG(1) << "XlaCompiler::CompileFunction " << function_id;
 
+  auto it = cache_.find({function_id, args});
+  if (it != cache_.end()) {
+    *result = it->second;
+    return Status::OK();
+  }
+
   FunctionLibraryRuntime::Handle handle;
-  TF_RETURN_IF_ERROR(
-      flr->Instantiate(function.name(), function.attr(), &handle));
+  TF_RETURN_IF_ERROR(flib_runtime_->Instantiate(
+      function.name(), AttrSlice(&function.attr()), &handle));
 
-  const FunctionBody* fbody = flr->GetFunctionBody(handle);
+  const FunctionBody* fbody = flib_runtime_->GetFunctionBody(handle);
   CHECK(fbody);
 
   TF_RETURN_IF_ERROR(CheckSignature(fbody->arg_types, args));
 
-  std::unique_ptr<Graph> graph(new Graph(flr->GetFunctionLibraryDefinition()));
+  std::unique_ptr<Graph> graph(new Graph(options_.flib_def));
   CopyGraph(*fbody->graph, graph.get());
 
   if (VLOG_IS_ON(1)) {
@@ -115,11 +134,13 @@ Status XlaCompiler::CompileFunction(
   }
 
   // Optimize the graph before running the compiler.
-  // TODO(pbar): The constant folder currently does not simplify int32
-  // operations for devices other than CPU.
   OptimizerOptions opts;
+  opts.set_do_common_subexpression_elimination(true);
+  opts.set_do_function_inlining(true);
+  opts.set_do_constant_folding(true);
   GraphOptimizer optimizer(opts);
-  OptimizeGraph(flr, &graph);
+  optimizer.Optimize(flib_runtime_.get(), flib_runtime_->env(),
+                     /*device=*/nullptr, &graph);
 
   if (VLOG_IS_ON(1)) {
     dump_graph::DumpGraphToFile(
@@ -129,9 +150,10 @@ Status XlaCompiler::CompileFunction(
 
   VLOG(1) << "====================================================";
   TF_RETURN_IF_ERROR(
-      CompileGraph(function_id, std::move(graph), flr, args, result));
+      CompileGraph(options, function_id, std::move(graph), args, result));
   VLOG(1) << "====================================================";
 
+  cache_[{function_id, args}] = *result;
   return Status::OK();
 }
 
@@ -158,7 +180,7 @@ Status XlaCompiler::BuildExecutable(
   build_options.set_has_hybrid_result(
       options_.local_executable_has_hybrid_result);
 
-  auto compile_result = local_client->Compile(result.computation,
+  auto compile_result = local_client->Compile(*result.computation,
                                               argument_layouts, build_options);
   if (!compile_result.ok()) {
     return compile_result.status();
@@ -378,9 +400,9 @@ Status BuildComputation(
 
 }  // namespace
 
-Status XlaCompiler::CompileGraph(string const& name,
+Status XlaCompiler::CompileGraph(const XlaCompiler::CompileOptions& options,
+                                 string const& name,
                                  std::unique_ptr<Graph> graph,
-                                 FunctionLibraryRuntime* flib,
                                  const std::vector<XlaCompiler::Argument>& args,
                                  CompilationResult* result) {
   VLOG(1) << "Executing graph symbolically to populate ComputationBuilder.";
@@ -394,31 +416,29 @@ Status XlaCompiler::CompileGraph(string const& name,
                      options_.resolve_compile_time_constants);
   core::ScopedUnref context_unref(context);
 
-  result->tuple_arg = options_.use_tuple_arg;
+  result->tuple_arg = options.use_tuple_arg;
 
   std::vector<XlaContext::Argument> context_args;
-  TF_RETURN_IF_ERROR(BuildArguments(args, options_.use_tuple_arg, &builder,
+  TF_RETURN_IF_ERROR(BuildArguments(args, options.use_tuple_arg, &builder,
                                     &context_args, &result->input_mapping,
                                     &result->xla_input_shapes));
   context->set_args(std::move(context_args));
 
-  if (options_.prune_unreachable_nodes) {
-    PruneUnreachableNodes(graph.get());
-  }
-
-  TF_RETURN_IF_ERROR(
-      ExecuteGraph(context, std::move(graph), device_, flib, NextStepId()));
+  TF_RETURN_IF_ERROR(ExecuteGraph(context, std::move(graph), device_,
+                                  flib_runtime_.get(), NextStepId()));
 
   int num_nonconst_outputs;
+  result->computation = std::make_shared<xla::Computation>();
   TF_RETURN_IF_ERROR(BuildComputation(
       context->retvals(), context->variables(), context->has_side_effects(),
-      options_.return_updated_values_for_all_variables, &builder,
-      &result->computation, &num_nonconst_outputs, &result->variable_updates));
+      options.return_updated_values_for_all_variables, &builder,
+      result->computation.get(), &num_nonconst_outputs,
+      &result->variable_updates));
 
   result->requires_runtime_context = context->has_context_parameter();
 
   // Tuple arguments and runtime context parameters are incompatible.
-  CHECK(!(options_.use_tuple_arg && result->requires_runtime_context));
+  CHECK(!(options.use_tuple_arg && result->requires_runtime_context));
 
   VLOG(2) << "Outputs: total: " << context->retvals().size()
           << " nonconstant: " << num_nonconst_outputs;
@@ -434,13 +454,13 @@ Status XlaCompiler::CompileGraph(string const& name,
     }
   }
 
-  if (result->computation.IsNull()) {
+  if (result->computation->IsNull()) {
     return Status::OK();
   }
 
   // Compute the output shapes, if there is a computation with non-constant
   // outputs.
-  auto computation_shape = client()->GetComputationShape(result->computation);
+  auto computation_shape = client()->GetComputationShape(*result->computation);
   if (!computation_shape.ok()) {
     return computation_shape.status();
   }
@@ -472,10 +492,10 @@ Status XlaCompiler::CompileGraph(string const& name,
        i < context->retvals().size(); ++i) {
     const XlaContext::HandleOrConstant& retval = context->retvals()[i];
     if (!retval.is_constant) {
-      CHECK_LT(computation_output, num_nonconst_outputs);
+      CHECK_LT(computation_output, num_computation_outputs);
       OutputDescription& output = result->outputs[i];
       output.is_constant = false;
-      if (num_nonconst_outputs > 1) {
+      if (num_computation_outputs > 1) {
         output.shape =
             XLAShapeToTensorShape(xla::ShapeUtil::GetTupleElementShape(
                 result->xla_output_shape, computation_output));
diff --git a/tensorflow/compiler/tf2xla/xla_compiler.h b/tensorflow/compiler/tf2xla/xla_compiler.h
index 3d28ca374609df28647d243544dcbf8cbf33e706..15f723ad782376b99ae7d72a5f15129e7880e9b1 100644
--- a/tensorflow/compiler/tf2xla/xla_compiler.h
+++ b/tensorflow/compiler/tf2xla/xla_compiler.h
@@ -21,6 +21,7 @@ limitations under the License.
 #include "tensorflow/core/common_runtime/device.h"
 #include "tensorflow/core/common_runtime/device_mgr.h"
 #include "tensorflow/core/common_runtime/function.h"
+#include "tensorflow/core/framework/function.h"
 #include "tensorflow/core/platform/env.h"
 #include "tensorflow/core/platform/mutex.h"
 #include "tensorflow/core/platform/notification.h"
@@ -112,6 +113,8 @@ class XlaCompiler {
 
     // The name of this argument, used for debugging.
     string name;
+
+    bool operator==(const Argument& other) const;
   };
 
   struct OutputDescription {
@@ -172,15 +175,22 @@ class XlaCompiler {
 
     // The XLA computation built from the tensorflow subgraph. May be null
     // if the output consists solely of compile-time constants.
-    xla::Computation computation;
+    std::shared_ptr<xla::Computation> computation;
   };
 
   struct Options {
-    // Name of the compilation device to use.
-    DeviceType device_type = DeviceType("");
+    // Name of the compilation device to use. Needs to be live only during
+    // XlaCompiler's constructor.
+    const DeviceType* device_type = nullptr;
 
     xla::Client* client = nullptr;
 
+    // Function library in which to find function definitions. Must be non-null.
+    const FunctionLibraryDefinition* flib_def = nullptr;
+
+    // The graph def version to be compiled.
+    int graph_def_version = TF_GRAPH_DEF_VERSION;
+
     // If 'allow_cpu_custom_calls' is true, kernels may make use of CustomCall()
     // for CPU; additionally, an optional XlaLocalRuntimeContext* may be passed
     // to the computation.
@@ -198,6 +208,19 @@ class XlaCompiler {
     // computation.
     bool resolve_compile_time_constants = true;
 
+    // If not nullptr, populate_resource_manager is called with the
+    // compilation device's resource manager when the compilation
+    // device is created, and can be used to create metadata objects
+    // that can be accessed by XLA op kernels.
+    std::function<Status(ResourceMgr*)>* populate_resource_manager = nullptr;
+  };
+
+  explicit XlaCompiler(Options options);
+  ~XlaCompiler();
+
+  // Options pertaining to an individual call to CompileGraph() or
+  // CompileFunction().
+  struct CompileOptions {
     // If `use_tuple_arg` is true, a single tuple parameter will be used for all
     // arguments; if false, each argument gets its own parameter.
     bool use_tuple_arg = false;
@@ -208,23 +231,8 @@ class XlaCompiler {
     // modified by the computation. Used when compiling loop bodies to ensure
     // the input and output signatures match.
     bool return_updated_values_for_all_variables = false;
-
-    // If 'prune_unreachable_nodes' is true, then nodes that are not
-    // dependencies of graph's _Retval nodes will be pruned before compilation.
-    // This is useful to prune stateful operators that should not be executed
-    // from a function body.
-    bool prune_unreachable_nodes = false;
-
-    // If not nullptr, populate_resource_manager is called with the
-    // compilation device's resource manager when the compilation
-    // device is created, and can be used to create metadata objects
-    // that can be accessed by XLA op kernels.
-    std::function<Status(ResourceMgr*)>* populate_resource_manager = nullptr;
   };
 
-  explicit XlaCompiler(Options options);
-  ~XlaCompiler();
-
   // Compiles a Tensorflow function `fn_name_attrs` into an XLA computation.
   // `args` describes the arguments to the function, each of which must either
   // be a runtime-parameter to the XLA computation, a compile-time constant, or
@@ -235,7 +243,7 @@ class XlaCompiler {
   // arguments are returned as host memory tensors in the output list and are
   // not included in the XLA computation's outputs. The XLA computation is
   // null if there are no data-dependent outputs and no side effects.
-  Status CompileFunction(FunctionLibraryRuntime* flr,
+  Status CompileFunction(const CompileOptions& options,
                          const NameAttrList& fn_name_attrs,
                          const std::vector<Argument>& args,
                          CompilationResult* result);
@@ -243,8 +251,8 @@ class XlaCompiler {
   // Compiles a tensorflow::Graph into an xla::Computation.
   // Similar to CompileFunction, but takes a Graph as input rather than a
   // function.
-  Status CompileGraph(string const& name, std::unique_ptr<Graph> graph,
-                      FunctionLibraryRuntime* flr,
+  Status CompileGraph(const CompileOptions& options, string const& name,
+                      std::unique_ptr<Graph> graph,
                       const std::vector<Argument>& args,
                       CompilationResult* result);
 
@@ -257,6 +265,7 @@ class XlaCompiler {
   xla::Client* client() const { return options_.client; }
   XlaCompilationDevice* device() const { return device_; }
   const DeviceMgr* device_mgr() const { return &device_mgr_; }
+  FunctionLibraryRuntime* flib_runtime() const { return flib_runtime_.get(); }
 
   // Retrieves the channel handle associated with `key`. Allocates
   // a new channel handle if none exists.
@@ -281,6 +290,17 @@ class XlaCompiler {
   XlaCompilationDevice* device_;  // Owned by device_mgr_
   DeviceMgr device_mgr_;
 
+  std::unique_ptr<FunctionLibraryRuntime> flib_runtime_;
+
+  struct SignatureHash {
+    uint64 operator()(
+        const std::pair<string, std::vector<Argument>>& signature) const;
+  };
+
+  std::unordered_map<std::pair<string, std::vector<Argument>>,
+                     CompilationResult, SignatureHash>
+      cache_;
+
   std::unordered_map<string, xla::ChannelHandle> channels_ GUARDED_BY(mu_);
 
   TF_DISALLOW_COPY_AND_ASSIGN(XlaCompiler);
diff --git a/tensorflow/compiler/tf2xla/xla_compiler_test.cc b/tensorflow/compiler/tf2xla/xla_compiler_test.cc
index 1cc7f4abd15798b29fe065c65c618b0166007b7e..58d74057d101cdef89fca24ec6c0858291d825fa 100644
--- a/tensorflow/compiler/tf2xla/xla_compiler_test.cc
+++ b/tensorflow/compiler/tf2xla/xla_compiler_test.cc
@@ -96,6 +96,8 @@ REGISTER_XLA_OP(Name("DummyReadResource"), DummyReadResourceOp);
 
 class XlaCompilerTest : public ::testing::Test {
  protected:
+  XlaCompilerTest() : cpu_device_type_(DEVICE_CPU_XLA_JIT) {}
+
   void SetUp() override {
     client_ = xla::ClientLibrary::LocalClientOrDie();
 
@@ -107,19 +109,13 @@ class XlaCompilerTest : public ::testing::Test {
 
   XlaCompiler::Options DefaultOptions() {
     XlaCompiler::Options options;
-    options.device_type = DeviceType(DEVICE_CPU_XLA_JIT);
+    options.device_type = &cpu_device_type_;
     options.client = client_;
+    options.flib_def = flib_def_.get();
     return options;
   }
 
-  std::unique_ptr<FunctionLibraryRuntime> BuildFunctionLibraryRuntime(
-      const XlaCompiler& compiler) {
-    return std::unique_ptr<FunctionLibraryRuntime>(NewFunctionLibraryRuntime(
-        compiler.device_mgr(), /*env=*/nullptr, compiler.device(),
-        TF_GRAPH_DEF_VERSION, flib_def_.get(), OptimizerOptions(),
-        /*custom_kernel_creator=*/nullptr));
-  }
-
+  DeviceType cpu_device_type_;
   xla::Client* client_;
   std::unique_ptr<FunctionLibraryDefinition> flib_def_;
 };
@@ -127,15 +123,15 @@ class XlaCompilerTest : public ::testing::Test {
 // Tests compilation of an empty graph.
 TEST_F(XlaCompilerTest, EmptyReturnValues) {
   XlaCompiler compiler(DefaultOptions());
-  auto flr = BuildFunctionLibraryRuntime(compiler);
 
   std::unique_ptr<Graph> graph(new Graph(OpRegistry::Global()));
   XlaCompiler::CompilationResult result;
-  TF_ASSERT_OK(compiler.CompileGraph("add", std::move(graph), flr.get(),
+  TF_ASSERT_OK(compiler.CompileGraph(XlaCompiler::CompileOptions(), "add",
+                                     std::move(graph),
                                      /*args=*/{}, &result));
 
   // No computation should be generated.
-  EXPECT_EQ(0, result.computation.handle().handle());
+  EXPECT_EQ(0, result.computation->handle().handle());
 }
 
 // Tests compilation and execution of a graph that adds two tensors.
@@ -160,11 +156,10 @@ TEST_F(XlaCompilerTest, Simple) {
 
   // Compiles the graph.
   XlaCompiler compiler(DefaultOptions());
-  auto flr = BuildFunctionLibraryRuntime(compiler);
 
   XlaCompiler::CompilationResult result;
-  TF_ASSERT_OK(
-      compiler.CompileGraph("add", std::move(graph), flr.get(), args, &result));
+  TF_ASSERT_OK(compiler.CompileGraph(XlaCompiler::CompileOptions(), "add",
+                                     std::move(graph), args, &result));
 
   // Tests that the generated computation works.
   std::unique_ptr<xla::Literal> param0_literal =
@@ -178,7 +173,7 @@ TEST_F(XlaCompilerTest, Simple) {
 
   std::unique_ptr<xla::GlobalData> actual =
       client_
-          ->Execute(result.computation, {param0_data.get(), param1_data.get()})
+          ->Execute(*result.computation, {param0_data.get(), param1_data.get()})
           .ConsumeValueOrDie();
   std::unique_ptr<xla::Literal> actual_literal =
       client_->Transfer(*actual).ConsumeValueOrDie();
@@ -213,14 +208,14 @@ TEST_F(XlaCompilerTest, ConstantOutputs) {
     XlaCompiler::Options options = DefaultOptions();
     options.resolve_compile_time_constants = true;
     XlaCompiler compiler(options);
-    auto flr = BuildFunctionLibraryRuntime(compiler);
 
     std::unique_ptr<Graph> graph_copy(new Graph(OpRegistry::Global()));
     CopyGraph(*graph, graph_copy.get());
 
     XlaCompiler::CompilationResult result;
-    TF_ASSERT_OK(compiler.CompileGraph("constants", std::move(graph_copy),
-                                       flr.get(), args, &result));
+    TF_ASSERT_OK(compiler.CompileGraph(XlaCompiler::CompileOptions(),
+                                       "constants", std::move(graph_copy), args,
+                                       &result));
 
     ASSERT_EQ(2, result.outputs.size());
     EXPECT_TRUE(result.outputs[0].is_constant);
@@ -235,7 +230,7 @@ TEST_F(XlaCompilerTest, ConstantOutputs) {
         client_->TransferToServer(*param0_literal).ConsumeValueOrDie();
 
     std::unique_ptr<xla::GlobalData> actual =
-        client_->Execute(result.computation, {param0_data.get()})
+        client_->Execute(*result.computation, {param0_data.get()})
             .ConsumeValueOrDie();
     std::unique_ptr<xla::Literal> actual_literal =
         client_->Transfer(*actual).ConsumeValueOrDie();
@@ -250,14 +245,14 @@ TEST_F(XlaCompilerTest, ConstantOutputs) {
     XlaCompiler::Options options = DefaultOptions();
     options.resolve_compile_time_constants = false;
     XlaCompiler compiler(options);
-    auto flr = BuildFunctionLibraryRuntime(compiler);
 
     std::unique_ptr<Graph> graph_copy(new Graph(OpRegistry::Global()));
     CopyGraph(*graph, graph_copy.get());
 
     XlaCompiler::CompilationResult result;
-    TF_ASSERT_OK(compiler.CompileGraph("constants", std::move(graph_copy),
-                                       flr.get(), args, &result));
+    TF_ASSERT_OK(compiler.CompileGraph(XlaCompiler::CompileOptions(),
+                                       "constants", std::move(graph_copy), args,
+                                       &result));
 
     ASSERT_EQ(2, result.outputs.size());
     EXPECT_FALSE(result.outputs[0].is_constant);
@@ -270,7 +265,7 @@ TEST_F(XlaCompilerTest, ConstantOutputs) {
         client_->TransferToServer(*param0_literal).ConsumeValueOrDie();
 
     std::unique_ptr<xla::GlobalData> actual =
-        client_->Execute(result.computation, {param0_data.get()})
+        client_->Execute(*result.computation, {param0_data.get()})
             .ConsumeValueOrDie();
     std::unique_ptr<xla::Literal> actual_literal =
         client_->Transfer(*actual).ConsumeValueOrDie();
@@ -312,13 +307,12 @@ TEST_F(XlaCompilerTest, ResourceManager) {
       };
   options.populate_resource_manager = &populate_function;
   XlaCompiler compiler(options);
-  auto flr = BuildFunctionLibraryRuntime(compiler);
 
   EXPECT_EQ(0, resource->Get());
 
   XlaCompiler::CompilationResult result;
-  TF_ASSERT_OK(compiler.CompileGraph("dummy", std::move(graph), flr.get(), args,
-                                     &result));
+  TF_ASSERT_OK(compiler.CompileGraph(XlaCompiler::CompileOptions(), "dummy",
+                                     std::move(graph), args, &result));
 
   EXPECT_EQ(1, resource->Get());
 
diff --git a/tensorflow/compiler/tf2xla/xla_helpers.cc b/tensorflow/compiler/tf2xla/xla_helpers.cc
index 10d8b67bbd2d0e897e3ca55e584f575448a3a4fd..f8589edafc401bb511774ae3fede67f121efbcd7 100644
--- a/tensorflow/compiler/tf2xla/xla_helpers.cc
+++ b/tensorflow/compiler/tf2xla/xla_helpers.cc
@@ -20,6 +20,7 @@ limitations under the License.
 #include "tensorflow/compiler/tf2xla/type_util.h"
 #include "tensorflow/compiler/tf2xla/xla_context.h"
 #include "tensorflow/compiler/xla/client/computation_builder.h"
+#include "tensorflow/compiler/xla/types.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/lib/gtl/array_slice.h"
 
@@ -89,7 +90,9 @@ xla::ComputationDataHandle XlaHelpers::IntegerLiteral(
     case xla::U16:
       LOG(FATAL) << "u16/s16 literals not yet implemented";
     case xla::F16:
-      LOG(FATAL) << "f16 literals not yet implemented";
+      literal = *xla::LiteralUtil::CreateR0<xla::half>(
+              static_cast<xla::half>(value));
+      break;
     case xla::TUPLE:
       LOG(FATAL) << "tuple element type is not integral";
     case xla::OPAQUE:
@@ -107,6 +110,9 @@ xla::ComputationDataHandle XlaHelpers::FloatLiteral(xla::ComputationBuilder* b,
   xla::PrimitiveType type;
   TF_CHECK_OK(DataTypeToPrimitiveType(data_type, &type));
   switch (type) {
+    case xla::F16:
+      return b->ConstantR0<xla::half>(static_cast<xla::half>(value));
+      break;
     case xla::F32:
       return b->ConstantR0<float>(static_cast<float>(value));
       break;
diff --git a/tensorflow/compiler/tf2xla/xla_local_runtime_context.h b/tensorflow/compiler/tf2xla/xla_local_runtime_context.h
index cd773d64ed4154aa2a05ac2d15e9358614239b1f..dca420d6ee3fec45f88ac3b450ab0cb4fb83d38a 100644
--- a/tensorflow/compiler/tf2xla/xla_local_runtime_context.h
+++ b/tensorflow/compiler/tf2xla/xla_local_runtime_context.h
@@ -23,7 +23,7 @@ limitations under the License.
 // actually used.  E.g. some ahead-of-time compiled computations don't need a
 // thread pool.
 namespace Eigen {
-class ThreadPoolDevice;
+struct ThreadPoolDevice;
 }
 
 namespace tensorflow {
diff --git a/tensorflow/compiler/tf2xla/xla_op_kernel.cc b/tensorflow/compiler/tf2xla/xla_op_kernel.cc
index a022de36a26d5f85e11b11ccd8dba4760aa8552f..48831ce4c27dfc644e8cd821e04cce3639ec0af5 100644
--- a/tensorflow/compiler/tf2xla/xla_op_kernel.cc
+++ b/tensorflow/compiler/tf2xla/xla_op_kernel.cc
@@ -379,8 +379,8 @@ void XlaOpKernelContext::SetOpHasSideEffects() {
   XlaContext::Get(context_).AddSideEffects();
 }
 
-const XlaCompiler::Options& XlaOpKernelContext::GetCompilerOptions() const {
-  return XlaContext::Get(context_).compiler()->options();
+XlaCompiler* XlaOpKernelContext::compiler() const {
+  return XlaContext::Get(context_).compiler();
 }
 
 void XlaOpKernelContext::CtxFailure(Status s) { context_->CtxFailure(s); }
diff --git a/tensorflow/compiler/tf2xla/xla_op_kernel.h b/tensorflow/compiler/tf2xla/xla_op_kernel.h
index f97e07bea5d13a8b5c65cbd378aba5a2a76d70d9..0a8a9284186e5b72a8a376ad159eb7b2482699c5 100644
--- a/tensorflow/compiler/tf2xla/xla_op_kernel.h
+++ b/tensorflow/compiler/tf2xla/xla_op_kernel.h
@@ -186,10 +186,9 @@ class XlaOpKernelContext {
   // Returns the underlying OpKernelContext. Use rarely.
   OpKernelContext* op_kernel_context() const { return context_; }
 
-  // Returns the options passed to the XlaCompiler that is being
-  // run. Used for, e.g., While to inherit options needed for nested
-  // computation.
-  const XlaCompiler::Options& GetCompilerOptions() const;
+  // Returns the XlaCompiler that is performing the compilation. Used for, e.g.,
+  // While to compile nested computations.
+  XlaCompiler* compiler() const;
 
   // TODO(phawkins): find a better home for these helpers.
 
diff --git a/tensorflow/compiler/xla/BUILD b/tensorflow/compiler/xla/BUILD
index 7576dff0cd701e06a46ee5a809f376c455fe391e..de09d4b23f8d8b140bbb37f32d651f3cede897ec 100644
--- a/tensorflow/compiler/xla/BUILD
+++ b/tensorflow/compiler/xla/BUILD
@@ -216,7 +216,7 @@ cc_test(
         ":test_helpers",
         ":types",
         ":util",
-        "//tensorflow/core:test",
+        ":xla_data_proto",
         "//tensorflow/core:test_main",
     ],
 )
@@ -256,6 +256,7 @@ cc_library(
         ":array3d",
         ":array4d",
         ":shape_util",
+        ":status_macros",
         ":types",
         ":util",
         ":xla_data_proto",
@@ -274,6 +275,7 @@ cc_test(
         ":test",
         ":types",
         "//tensorflow/core:lib",
+        "//tensorflow/core:test",
         "//tensorflow/core:test_main",
     ],
 )
diff --git a/tensorflow/compiler/xla/client/BUILD b/tensorflow/compiler/xla/client/BUILD
index 3e9dfe2a922c913c528d586413c11e2da8cbdc39..2d96128e259da316a41e83bea221ae201ad88a13 100644
--- a/tensorflow/compiler/xla/client/BUILD
+++ b/tensorflow/compiler/xla/client/BUILD
@@ -99,6 +99,26 @@ cc_library(
     ],
 )
 
+cc_library(
+    name = "compile_only_client",
+    srcs = ["compile_only_client.cc"],
+    hdrs = ["compile_only_client.h"],
+    deps = [
+        ":client",
+        ":computation",
+        "//tensorflow/compiler/xla:status_macros",
+        "//tensorflow/compiler/xla:statusor",
+        "//tensorflow/compiler/xla:util",
+        "//tensorflow/compiler/xla:xla_data_proto",
+        "//tensorflow/compiler/xla/service:compile_only_service",
+        "//tensorflow/compiler/xla/service:compiler",
+        "//tensorflow/compiler/xla/service/llvm_ir:llvm_util",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:stream_executor_no_cuda",
+        "@llvm//:support",
+    ],
+)
+
 # This target is used to instantiate the XLA service in-process and create
 # a client for it.
 cc_library(
@@ -106,12 +126,14 @@ cc_library(
     srcs = ["client_library.cc"],
     hdrs = ["client_library.h"],
     deps = [
+        ":compile_only_client",
         ":local_client",
         "//tensorflow/compiler/xla:status_macros",
         "//tensorflow/compiler/xla:statusor",
         "//tensorflow/compiler/xla:types",
         "//tensorflow/compiler/xla:util",
         "//tensorflow/compiler/xla/service:backend",
+        "//tensorflow/compiler/xla/service:compile_only_service",
         "//tensorflow/compiler/xla/service:device_memory_allocator",
         "//tensorflow/compiler/xla/service:local_service",
         "//tensorflow/compiler/xla/service:platform_util",
diff --git a/tensorflow/compiler/xla/client/client_library.cc b/tensorflow/compiler/xla/client/client_library.cc
index 93437023bc8956e449f828f5bf6dea7a6bff8610..8238261e1c90cadeda9005e437d684d3770bd67b 100644
--- a/tensorflow/compiler/xla/client/client_library.cc
+++ b/tensorflow/compiler/xla/client/client_library.cc
@@ -43,6 +43,16 @@ int LocalClientOptions::number_of_replicas() const {
   return number_of_replicas_;
 }
 
+LocalClientOptions& LocalClientOptions::set_intra_op_parallelism_threads(
+    int num_threads) {
+  intra_op_parallelism_threads_ = num_threads;
+  return *this;
+}
+
+int LocalClientOptions::intra_op_parallelism_threads() const {
+  return intra_op_parallelism_threads_;
+}
+
 /* static */ ClientLibrary& ClientLibrary::Singleton() {
   static ClientLibrary* c = new ClientLibrary;
   return *c;
@@ -69,22 +79,24 @@ ClientLibrary::~ClientLibrary() = default;
     TF_ASSIGN_OR_RETURN(platform, PlatformUtil::GetDefaultPlatform());
   }
 
-  auto it = client_library.instances_.find(platform->id());
-  if (it != client_library.instances_.end()) {
+  auto it = client_library.local_instances_.find(platform->id());
+  if (it != client_library.local_instances_.end()) {
     return it->second->client.get();
   }
 
   ServiceOptions service_options;
   service_options.set_platform(platform);
   service_options.set_number_of_replicas(replica_count);
+  service_options.set_intra_op_parallelism_threads(
+      options.intra_op_parallelism_threads());
 
-  std::unique_ptr<LocalInstance> instance = MakeUnique<LocalInstance>();
+  auto instance = MakeUnique<LocalInstance>();
   TF_ASSIGN_OR_RETURN(instance->service,
                       LocalService::NewService(service_options));
   instance->client = MakeUnique<LocalClient>(instance->service.get());
   LocalClient* cl = instance->client.get();
 
-  client_library.instances_.insert(
+  client_library.local_instances_.insert(
       std::make_pair(platform->id(), std::move(instance)));
   return cl;
 }
@@ -99,9 +111,35 @@ ClientLibrary::~ClientLibrary() = default;
     perftools::gputools::Platform* platform) {
   ClientLibrary& client_library = Singleton();
   tensorflow::mutex_lock lock(client_library.service_mutex_);
-  auto it = client_library.instances_.find(platform->id());
-  CHECK(it != client_library.instances_.end());
+  auto it = client_library.local_instances_.find(platform->id());
+  CHECK(it != client_library.local_instances_.end());
   return it->second->service.get();
 }
 
+/* static */ StatusOr<CompileOnlyClient*>
+ClientLibrary::GetOrCreateCompileOnlyClient(
+    perftools::gputools::Platform* platform) {
+  ClientLibrary& client_library = Singleton();
+  tensorflow::mutex_lock lock(client_library.service_mutex_);
+
+  if (platform == nullptr) {
+    TF_ASSIGN_OR_RETURN(platform, PlatformUtil::GetDefaultPlatform());
+  }
+
+  auto it = client_library.compile_only_instances_.find(platform->id());
+  if (it != client_library.compile_only_instances_.end()) {
+    return it->second->client.get();
+  }
+
+  auto instance = MakeUnique<CompileOnlyInstance>();
+  TF_ASSIGN_OR_RETURN(instance->service,
+                      CompileOnlyService::NewService(platform));
+  instance->client = MakeUnique<CompileOnlyClient>(instance->service.get());
+  CompileOnlyClient* cl = instance->client.get();
+
+  client_library.compile_only_instances_.insert(
+      std::make_pair(platform->id(), std::move(instance)));
+  return cl;
+}
+
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/client/client_library.h b/tensorflow/compiler/xla/client/client_library.h
index 2bc319f9333368635690add017ad3d89947e2551..3ddd235d0efeeb78f49eafbf670d7c74a88960dd 100644
--- a/tensorflow/compiler/xla/client/client_library.h
+++ b/tensorflow/compiler/xla/client/client_library.h
@@ -26,7 +26,9 @@ limitations under the License.
 #include <string>
 #include <vector>
 
+#include "tensorflow/compiler/xla/client/compile_only_client.h"
 #include "tensorflow/compiler/xla/client/local_client.h"
+#include "tensorflow/compiler/xla/service/compile_only_service.h"
 #include "tensorflow/compiler/xla/service/device_memory_allocator.h"
 #include "tensorflow/compiler/xla/service/local_service.h"
 #include "tensorflow/compiler/xla/statusor.h"
@@ -51,9 +53,14 @@ class LocalClientOptions {
   LocalClientOptions& set_number_of_replicas(int number_of_replicas);
   int number_of_replicas() const;
 
+  // Sets the thread pool size for parallel execution of an individual operator.
+  LocalClientOptions& set_intra_op_parallelism_threads(int num_threads);
+  int intra_op_parallelism_threads() const;
+
  private:
   perftools::gputools::Platform* platform_ = nullptr;
   int number_of_replicas_ = -1;
+  int intra_op_parallelism_threads_ = -1;
 };
 
 class ClientLibrary {
@@ -76,6 +83,13 @@ class ClientLibrary {
   // access user computations from client.
   static LocalService* GetXlaService(perftools::gputools::Platform* platform);
 
+  // Singleton constructor-or-accessor for compile-only clients. Arguments:
+  //
+  //   platform : The platform the underlying XLA service should target. If
+  //     null then default platform is used.
+  static StatusOr<CompileOnlyClient*> GetOrCreateCompileOnlyClient(
+      perftools::gputools::Platform* platform = nullptr);
+
  private:
   // Returns the singleton instance of ClientLibrary.
   static ClientLibrary& Singleton();
@@ -90,10 +104,21 @@ class ClientLibrary {
     std::unique_ptr<LocalClient> client;
   };
 
+  struct CompileOnlyInstance {
+    // Service that is wrapped by the singleton client object.
+    std::unique_ptr<CompileOnlyService> service;
+    // Singleton client object.
+    std::unique_ptr<CompileOnlyClient> client;
+  };
+
   tensorflow::mutex service_mutex_;  // Guards the singleton creation state.
   std::unordered_map<perftools::gputools::Platform::Id,
                      std::unique_ptr<LocalInstance>>
-      instances_ GUARDED_BY(service_mutex_);
+      local_instances_ GUARDED_BY(service_mutex_);
+
+  std::unordered_map<perftools::gputools::Platform::Id,
+                     std::unique_ptr<CompileOnlyInstance>>
+      compile_only_instances_ GUARDED_BY(service_mutex_);
 
   TF_DISALLOW_COPY_AND_ASSIGN(ClientLibrary);
 };
diff --git a/tensorflow/compiler/xla/client/compile_only_client.cc b/tensorflow/compiler/xla/client/compile_only_client.cc
new file mode 100644
index 0000000000000000000000000000000000000000..2ff6f0b300f9e2cc776e60bb27a3952356657780
--- /dev/null
+++ b/tensorflow/compiler/xla/client/compile_only_client.cc
@@ -0,0 +1,59 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/client/compile_only_client.h"
+
+#include "external/llvm/include/llvm/ADT/Triple.h"
+#include "tensorflow/compiler/xla/ptr_util.h"
+#include "tensorflow/compiler/xla/service/llvm_ir/llvm_util.h"
+#include "tensorflow/compiler/xla/status_macros.h"
+
+namespace se = ::perftools::gputools;
+
+namespace xla {
+
+StatusOr<std::vector<std::unique_ptr<AotCompilationResult>>>
+CompileOnlyClient::CompileAheadOfTime(
+    const tensorflow::gtl::ArraySlice<AotComputationInstance> computations,
+    const AotCompilationOptions& options) {
+  std::vector<CompileOnlyService::AotComputationInstance> service_instances;
+  service_instances.reserve(computations.size());
+  for (const AotComputationInstance& instance : computations) {
+    service_instances.push_back({});
+    CompileOnlyService::AotComputationInstance& service_instance =
+        service_instances.back();
+    TF_RET_CHECK(instance.computation != nullptr);
+    service_instance.computation = instance.computation->handle();
+    service_instance.argument_layouts = instance.argument_layouts;
+    service_instance.result_layout = instance.result_layout;
+  }
+  return compiler_service_->CompileAheadOfTime(service_instances, options);
+}
+
+int64 CompileOnlyClient::PointerSizeForTriple(
+    tensorflow::StringPiece target_triple) {
+  llvm::Triple triple(
+      llvm::Triple::normalize(llvm_ir::AsStringRef(target_triple)));
+  if (triple.isArch64Bit()) {
+    return 8;
+  } else if (triple.isArch32Bit()) {
+    return 4;
+  } else {
+    CHECK(triple.isArch16Bit());
+    return 2;
+  }
+}
+
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/client/compile_only_client.h b/tensorflow/compiler/xla/client/compile_only_client.h
new file mode 100644
index 0000000000000000000000000000000000000000..5900048711384e0240a3cd502260eb388eb40f51
--- /dev/null
+++ b/tensorflow/compiler/xla/client/compile_only_client.h
@@ -0,0 +1,66 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_COMPILER_XLA_CLIENT_COMPILE_ONLY_CLIENT_H_
+#define TENSORFLOW_COMPILER_XLA_CLIENT_COMPILE_ONLY_CLIENT_H_
+
+#include "tensorflow/compiler/xla/client/client.h"
+#include "tensorflow/compiler/xla/client/computation.h"
+#include "tensorflow/compiler/xla/service/compile_only_service.h"
+#include "tensorflow/compiler/xla/service/compiler.h"
+#include "tensorflow/compiler/xla/statusor.h"
+#include "tensorflow/compiler/xla/xla_data.pb.h"
+#include "tensorflow/core/platform/stream_executor_no_cuda.h"
+
+namespace xla {
+
+// An XLA Client specialization for doing ahead-of-time compilation.  This does
+// not require (or attempt to instantiate) an execution-capable backend for the
+// relevant platform.
+class CompileOnlyClient : public Client {
+ public:
+  explicit CompileOnlyClient(CompileOnlyService* service)
+      : Client(service), compiler_service_(service) {}
+
+  CompileOnlyClient(const CompileOnlyClient&) = delete;
+  void operator=(const CompileOnlyClient&) = delete;
+
+  // A description of a computation to compile using CompileAheadOfTime.
+  struct AotComputationInstance {
+    const Computation* computation;
+    // Inform the compiler of the expected layout for arguments.
+    std::vector<const Shape*> argument_layouts;
+    // Specifies the expected result layout.
+    const Shape* result_layout;
+  };
+
+  // Compiles a list of computations for ahead-of-time execution.  This is
+  // intended for use in static compilation. The |options| parameter describes
+  // the target for which the compiler should emit code.
+  StatusOr<std::vector<std::unique_ptr<AotCompilationResult>>>
+  CompileAheadOfTime(
+      const tensorflow::gtl::ArraySlice<AotComputationInstance> computations,
+      const AotCompilationOptions& options);
+
+  // Returns the size of a pointer in bytes for a given triple.
+  static int64 PointerSizeForTriple(tensorflow::StringPiece triple);
+
+ private:
+  CompileOnlyService* compiler_service_;
+};
+
+}  // namespace xla
+
+#endif  // TENSORFLOW_COMPILER_XLA_CLIENT_COMPILE_ONLY_CLIENT_H_
diff --git a/tensorflow/compiler/xla/client/computation_builder.h b/tensorflow/compiler/xla/client/computation_builder.h
index 87ceb43d1fe6650e1d160f3099b883ea208d8aac..6af69eeec12dec0ea1303826859d4655cf92932e 100644
--- a/tensorflow/compiler/xla/client/computation_builder.h
+++ b/tensorflow/compiler/xla/client/computation_builder.h
@@ -668,6 +668,14 @@ class ComputationBuilder {
   // then Build() should be used instead.
   Computation BuildAndNoteError();
 
+  // Returns the first error that was encountered while building the
+  // computation. When an error is encountered, by default we return a vacuous
+  // ComputationDataHandle and inform the user of the error that occurred while
+  // building the computation when they make a final call to Build().
+  //
+  // See also set_die_immediately_on_error().
+  Status first_error() const { return first_error_; }
+
  private:
   using PopulateLiteral = std::function<void(Literal*)>;
 
diff --git a/tensorflow/compiler/xla/client/global_data.h b/tensorflow/compiler/xla/client/global_data.h
index eb11d91034ba524f093ff80fa7cd0473e04eac2c..b7929357d06032b55c04bf0391f7fa703ee15f17 100644
--- a/tensorflow/compiler/xla/client/global_data.h
+++ b/tensorflow/compiler/xla/client/global_data.h
@@ -23,13 +23,15 @@ limitations under the License.
 
 namespace xla {
 
-// Wraps a GlobalDataHandle with a lifetime.
+// A GlobalData object represents a globally-accessible allocation of
+// data in the associated XLA service.
 class GlobalData {
  public:
   // Gives ownership of the global data handle to this object.
   GlobalData(ServiceInterface* parent, GlobalDataHandle handle);
 
-  // Unregisters the wrapped handle.
+  // Unregisters the wrapped handle, which causes the service to
+  // deallocate the associated data.
   ~GlobalData();
 
   const GlobalDataHandle& handle() const { return handle_; }
diff --git a/tensorflow/compiler/xla/client/local_client.cc b/tensorflow/compiler/xla/client/local_client.cc
index bfd14bc1c010353e3e473f10dd6c030cb0438648..02cf57e7632a2064e646d4dc441e3ec119053564 100644
--- a/tensorflow/compiler/xla/client/local_client.cc
+++ b/tensorflow/compiler/xla/client/local_client.cc
@@ -176,17 +176,24 @@ StatusOr<std::unique_ptr<ShapedBuffer>> LocalExecutable::Run(
   TF_RETURN_IF_ERROR(ValidateExecutionOptions(arguments, options, *backend_));
 
   ExecutableRunOptions actual_options = options;
-  Backend::StreamPtr stream;
   if (options.stream() == nullptr) {
     TF_ASSIGN_OR_RETURN(
-        stream, BorrowStreamForDevice(options.device_ordinal(), backend_));
+        Backend::StreamPtr stream,
+        BorrowStreamForDevice(options.device_ordinal(), backend_));
     actual_options.set_stream(stream.get());
   }
   if (options.allocator() == nullptr) {
     actual_options.set_allocator(backend_->memory_allocator());
   }
-  ServiceExecutableRunOptions service_options(actual_options,
-                                              backend_->StreamBorrower());
+
+  // For local client execution on CPU backends:
+  // *) The thread pool used for eigen CPU ops is from
+  //    ExecutableRunOptions.eigen_intra_op_thread_pool.
+  // *) The thread pool used for XLA CPU ops is from
+  //    backend_->eigen_intra_op_thread_pool().
+  ServiceExecutableRunOptions service_options(
+      actual_options, backend_->StreamBorrower(),
+      backend_->eigen_intra_op_thread_pool());
 
   if (executable_->dumping()) {
     return ExecuteAndDump(&service_options, arguments);
@@ -253,46 +260,6 @@ StatusOr<std::unique_ptr<GlobalData>> LocalClient::AllocateBufferOnDevice(
   return std::unique_ptr<GlobalData>(new GlobalData(local_service_, handle));
 }
 
-tensorflow::Status LocalClient::ResolveArguments(
-    const tensorflow::gtl::ArraySlice<const GlobalDataHandle*> arguments,
-    int device_ordinal,
-    std::vector<perftools::gputools::DeviceMemoryBase>* argument_ptrs) {
-  return local_service_->ResolveArguments(arguments, device_ordinal,
-                                          argument_ptrs);
-}
-
-StatusOr<std::vector<std::unique_ptr<AotCompilationResult>>>
-LocalClient::CompileAheadOfTime(
-    const tensorflow::gtl::ArraySlice<AheadOfTimeComputationInstance>
-        computations,
-    const AotCompilationOptions& options) {
-  std::vector<LocalService::AheadOfTimeComputationInstance> service_instances;
-  service_instances.reserve(computations.size());
-  for (const AheadOfTimeComputationInstance& instance : computations) {
-    service_instances.push_back({});
-    LocalService::AheadOfTimeComputationInstance& service_instance =
-        service_instances.back();
-    TF_RET_CHECK(instance.computation != nullptr);
-    service_instance.computation = instance.computation->handle();
-    service_instance.argument_layouts = instance.argument_layouts;
-    service_instance.result_layout = instance.result_layout;
-  }
-  return local_service_->CompileAheadOfTime(service_instances, options);
-}
-
-int64 LocalClient::PointerSizeForTriple(tensorflow::StringPiece target_triple) {
-  llvm::Triple triple(
-      llvm::Triple::normalize(llvm_ir::AsStringRef(target_triple)));
-  if (triple.isArch64Bit()) {
-    return 8;
-  } else if (triple.isArch32Bit()) {
-    return 4;
-  } else {
-    CHECK(triple.isArch16Bit());
-    return 2;
-  }
-}
-
 se::Platform* LocalClient::platform() const {
   return local_service_->backend().platform();
 }
diff --git a/tensorflow/compiler/xla/client/local_client.h b/tensorflow/compiler/xla/client/local_client.h
index 2c467efcea119b66ad08e0636eca0f1acec3a3b8..49ffed4dde6ba9b6683d42cefec593a0c35bca6e 100644
--- a/tensorflow/compiler/xla/client/local_client.h
+++ b/tensorflow/compiler/xla/client/local_client.h
@@ -148,7 +148,7 @@ class LocalExecutable {
   const ExecutableBuildOptions& build_options_;
 };
 
-// An XLA service client object for use when the client and service run in
+// An XLA Client specialization for use when the client and service run in
 // the same process.
 class LocalClient : public Client {
  public:
@@ -158,14 +158,6 @@ class LocalClient : public Client {
   LocalClient(const LocalClient&) = delete;
   void operator=(const LocalClient&) = delete;
 
-  // For an array of arguments held on the local service, validate
-  // that each is placed on the specified device_ordinal, and return
-  // the DeviceMemoryBase corresponding to each argument.
-  tensorflow::Status ResolveArguments(
-      const tensorflow::gtl::ArraySlice<const GlobalDataHandle*> arguments,
-      int device_ordinal,
-      std::vector<perftools::gputools::DeviceMemoryBase>* argument_ptrs);
-
   // Return a handle to a buffer large enough to hold shape, allocated
   // on device_ordinal on the local service. If
   // allocate_space_for_deep_copy, the buffer is large enough to hold
@@ -182,30 +174,6 @@ class LocalClient : public Client {
       const tensorflow::gtl::ArraySlice<const Shape*> argument_layouts,
       const ExecutableBuildOptions& options);
 
-  // A description of a computation to compile using CompileAheadOfTime.
-  struct AheadOfTimeComputationInstance {
-    const Computation* computation;
-    // Inform the compiler of the expected layout for arguments.
-    std::vector<const Shape*> argument_layouts;
-    // Specifies the expected result layout.
-    const Shape* result_layout;
-  };
-
-  // Compiles a list of computations for ahead-of-time execution.  This is
-  // intended for use in static compilation. The |options| parameter describes
-  // the target for which the compiler should emit code.
-  //
-  // TODO(b/31222190): This doesn't really belong in LocalClient. Move it to its
-  // own library.
-  StatusOr<std::vector<std::unique_ptr<AotCompilationResult>>>
-  CompileAheadOfTime(
-      const tensorflow::gtl::ArraySlice<AheadOfTimeComputationInstance>
-          computations,
-      const AotCompilationOptions& options);
-
-  // Returns the size of a pointer in bytes for a given triple.
-  static int64 PointerSizeForTriple(tensorflow::StringPiece triple);
-
   // Returns the platform that the underlying service targets.
   perftools::gputools::Platform* platform() const;
 
diff --git a/tensorflow/compiler/xla/index_util.cc b/tensorflow/compiler/xla/index_util.cc
index 92aca3cae9e442453d3726972179d126959dca2f..76c0168f370ff1f0749759705b7ecff359a80341 100644
--- a/tensorflow/compiler/xla/index_util.cc
+++ b/tensorflow/compiler/xla/index_util.cc
@@ -131,4 +131,23 @@ namespace xla {
   return false;
 }
 
+/* static */ int64 IndexUtil::GetDimensionStride(const Shape& shape,
+                                                 int64 dimension) {
+  const Layout& layout = shape.layout();
+  int64 pdim_size = layout.padded_dimensions_size();
+  int64 stride = 1;
+  DCHECK(pdim_size == 0 || pdim_size == shape.dimensions_size());
+  for (auto dim : layout.minor_to_major()) {
+    if (dim == dimension) {
+      break;
+    }
+    if (pdim_size == 0) {
+      stride *= shape.dimensions(dim);
+    } else {
+      stride *= layout.padded_dimensions(dim);
+    }
+  }
+  return stride;
+}
+
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/index_util.h b/tensorflow/compiler/xla/index_util.h
index e6a26d622016c89f5459a50bd0f733daef469fae..c9838966a5b67397eb5fc4afe3ab9d98e82eb2b1 100644
--- a/tensorflow/compiler/xla/index_util.h
+++ b/tensorflow/compiler/xla/index_util.h
@@ -61,6 +61,14 @@ class IndexUtil {
   static bool BumpIndices(const Shape& shape,
                           tensorflow::gtl::MutableArraySlice<int64> indices);
 
+  // Calculates the stride size (in number of elements, not byte size) of a
+  // given logical shape dimension (from 0 to rank-1). If available, padded
+  // dimensions are used.
+  // Example:
+  //  GetDimensionStride(F32[5,8,10,4]{3,2,1,0}, 1) ==
+  //    sizeof(dimension(3)) * sizeof(dimension(2)) == 4 * 10
+  static int64 GetDimensionStride(const Shape& shape, int64 dimension);
+
  private:
   TF_DISALLOW_COPY_AND_ASSIGN(IndexUtil);
 };
diff --git a/tensorflow/compiler/xla/literal_util.cc b/tensorflow/compiler/xla/literal_util.cc
index 7091c324d14552d8b7603c3872d0ffc59771d8f7..0f622f9153436f58b05a4b5f4ea1dc0576da3e23 100644
--- a/tensorflow/compiler/xla/literal_util.cc
+++ b/tensorflow/compiler/xla/literal_util.cc
@@ -16,12 +16,15 @@ limitations under the License.
 #include "tensorflow/compiler/xla/literal_util.h"
 
 #include <algorithm>
+#include <cstring>
+#include <functional>
 #include <limits>
 #include <numeric>
 #include <vector>
 
 #include "tensorflow/compiler/xla/index_util.h"
 #include "tensorflow/compiler/xla/shape_util.h"
+#include "tensorflow/compiler/xla/status_macros.h"
 #include "tensorflow/compiler/xla/types.h"
 #include "tensorflow/compiler/xla/util.h"
 #include "tensorflow/core/lib/core/errors.h"
@@ -33,6 +36,137 @@ limitations under the License.
 
 namespace xla {
 
+LiteralUtil::StrideConfig::StrideConfig(
+    const Shape& source_shape, const Shape& dest_shape,
+    tensorflow::gtl::ArraySlice<int64> dimensions)
+    : dimensions(dimensions),
+      base(dimensions.size(), 0),
+      step(dimensions.size(), 1) {
+  if (!dimensions.empty()) {
+    // Selects the shape with the highest minor dimension as the one upon
+    // where to run the tight stride loop.
+    if (source_shape.layout().minor_to_major()[0] >=
+        dest_shape.layout().minor_to_major()[0]) {
+      minor_dimension = source_shape.layout().minor_to_major()[0];
+      dest_stride = IndexUtil::GetDimensionStride(dest_shape, minor_dimension);
+    } else {
+      minor_dimension = dest_shape.layout().minor_to_major()[0];
+      source_stride =
+          IndexUtil::GetDimensionStride(source_shape, minor_dimension);
+    }
+    minor_loop_size = dimensions[minor_dimension];
+    step[minor_dimension] = minor_loop_size;
+  }
+}
+
+/* static */ std::unique_ptr<Literal> LiteralUtil::CreateFromShape(
+    const Shape& shape) {
+  auto literal = MakeUnique<Literal>();
+  *literal->mutable_shape() = shape;
+  Reserve(ShapeUtil::ElementsIn(literal->shape()), literal.get());
+  return literal;
+}
+
+/* static */ std::unique_ptr<Literal> LiteralUtil::CreateFromDimensions(
+    PrimitiveType primitive_type,
+    tensorflow::gtl::ArraySlice<int64> dimensions) {
+  return CreateFromShape(ShapeUtil::MakeShape(primitive_type, dimensions));
+}
+
+template <typename T>
+/* static */ Status LiteralUtil::CopyRange(
+    const Literal& src_literal, tensorflow::gtl::ArraySlice<int64> src_base,
+    Literal* dest_literal, tensorflow::gtl::ArraySlice<int64> dest_base,
+    tensorflow::gtl::ArraySlice<int64> copy_size) {
+  const Shape& src_shape = src_literal.shape();
+  const Shape& dest_shape = dest_literal->shape();
+  tensorflow::gtl::ArraySlice<T> src_data = GetArraySlice<T>(src_literal);
+  tensorflow::gtl::MutableArraySlice<T> dest_data =
+      GetMutableArraySlice<T>(dest_literal);
+
+  TF_RET_CHECK(ShapeUtil::Rank(src_shape) == src_base.size());
+  TF_RET_CHECK(ShapeUtil::Rank(dest_shape) == dest_base.size());
+  if (ShapeUtil::Rank(src_shape) == 0 || ShapeUtil::Rank(dest_shape) == 0) {
+    // If any of the two shapes are scalars, we can just call the StridedCopy()
+    // directly, and we know we will be copying only one value.
+    TF_RET_CHECK(copy_size.empty());
+    StridedCopy(dest_data, LinearIndex(*dest_literal, dest_base), 0, src_data,
+                LinearIndex(src_literal, src_base), 0, 1);
+  } else if (!ShapeUtil::HasZeroElements(dest_shape)) {
+    TF_RET_CHECK(!ShapeUtil::HasZeroElements(src_shape));
+    TF_RET_CHECK(src_base.size() == dest_base.size());
+    TF_RET_CHECK(src_base.size() == copy_size.size());
+
+    // Scan the source from minor, stepping in copy size blocks, then within
+    // the index enumaration functor, do a strided copy advancing source index
+    // by one (walking through the minor dimension), and destination index by
+    // proper stride size at the matching dimension.
+    DimensionVector src_indexes(src_base.size(), 0);
+    DimensionVector dest_indexes(dest_base.size(), 0);
+    StrideConfig stride_config(src_shape, dest_shape, copy_size);
+
+    auto copy_proc = [&](const std::vector<int64>& indexes) {
+      // Map from multi-dimensional index, to source index.
+      std::transform(indexes.begin(), indexes.end(), src_base.begin(),
+                     src_indexes.begin(), std::plus<int64>());
+      // Map from multi-dimensional index, to destination index.
+      std::transform(indexes.begin(), indexes.end(), dest_base.begin(),
+                     dest_indexes.begin(), std::plus<int64>());
+
+      int64 src_index = LinearIndex(src_literal, src_indexes);
+      int64 dest_index = LinearIndex(*dest_literal, dest_indexes);
+
+      StridedCopy(dest_data, dest_index, stride_config.dest_stride, src_data,
+                  src_index, stride_config.source_stride,
+                  stride_config.minor_loop_size);
+      return true;
+    };
+
+    ShapeUtil::ForEachIndex(src_shape, stride_config.base,
+                            stride_config.dimensions, stride_config.step,
+                            copy_proc);
+  }
+  return Status::OK();
+}
+
+/* static */ Status LiteralUtil::Copy(
+    const Literal& src_literal, tensorflow::gtl::ArraySlice<int64> src_base,
+    Literal* dest_literal, tensorflow::gtl::ArraySlice<int64> dest_base,
+    tensorflow::gtl::ArraySlice<int64> copy_size) {
+  TF_RET_CHECK(
+      ShapeUtil::SameElementType(src_literal.shape(), dest_literal->shape()));
+  switch (src_literal.shape().element_type()) {
+    case U32:
+      return CopyRange<uint32>(src_literal, src_base, dest_literal, dest_base,
+                               copy_size);
+    case U64:
+      return CopyRange<uint64>(src_literal, src_base, dest_literal, dest_base,
+                               copy_size);
+    case S32:
+      return CopyRange<int32>(src_literal, src_base, dest_literal, dest_base,
+                              copy_size);
+    case S64:
+      return CopyRange<int64>(src_literal, src_base, dest_literal, dest_base,
+                              copy_size);
+    case F16:
+      return CopyRange<half>(src_literal, src_base, dest_literal, dest_base,
+                             copy_size);
+    case F32:
+      return CopyRange<float>(src_literal, src_base, dest_literal, dest_base,
+                              copy_size);
+    case F64:
+      return CopyRange<double>(src_literal, src_base, dest_literal, dest_base,
+                               copy_size);
+    case PRED:
+      return CopyRange<bool>(src_literal, src_base, dest_literal, dest_base,
+                             copy_size);
+    default:
+      break;
+  }
+  return Unimplemented("Unhandled primitive type %d",
+                       src_literal.shape().element_type());
+}
+
 /* static */ Literal LiteralUtil::Zero(PrimitiveType primitive_type) {
   switch (primitive_type) {
     case U8:
@@ -47,6 +181,8 @@ namespace xla {
       return *LiteralUtil::CreateR0<int32>(0);
     case S64:
       return *LiteralUtil::CreateR0<int64>(0);
+    case F16:
+      return *LiteralUtil::CreateR0<half>(static_cast<half>(0.0f));
     case F32:
       return *LiteralUtil::CreateR0<float>(0);
     case F64:
@@ -56,8 +192,6 @@ namespace xla {
     case S16:
     case U16:
       LOG(FATAL) << "u16/s16 literals not yet implemented";
-    case F16:
-      LOG(FATAL) << "f16 literals not yet implemented";
     case TUPLE:
       LOG(FATAL) << "tuple element type cannot take on value of 0";
     case OPAQUE:
@@ -91,7 +225,7 @@ namespace xla {
     case U16:
       LOG(FATAL) << "u16/s16 literals not yet implemented";
     case F16:
-      LOG(FATAL) << "f16 literals not yet implemented";
+      return *LiteralUtil::CreateR0<half>(static_cast<half>(1.0f));
     case TUPLE:
       LOG(FATAL) << "tuple element type cannot take on value of 1";
     case OPAQUE:
@@ -127,7 +261,8 @@ namespace xla {
     case U16:
       LOG(FATAL) << "u16/s16 literals not yet implemented";
     case F16:
-      LOG(FATAL) << "f16 literals not yet implemented";
+      return *LiteralUtil::CreateR0<half>(
+              static_cast<half>(-std::numeric_limits<float>::infinity()));
     case TUPLE:
       LOG(FATAL) << "tuple element type has no minimum value";
     case OPAQUE:
@@ -163,7 +298,8 @@ namespace xla {
     case U16:
       LOG(FATAL) << "u16/s16 literals not yet implemented";
     case F16:
-      LOG(FATAL) << "f16 literals not yet implemented";
+      return *LiteralUtil::CreateR0<half>(
+              static_cast<half>(std::numeric_limits<float>::infinity()));
     case TUPLE:
       LOG(FATAL) << "tuple element type has no maximum value";
     case OPAQUE:
@@ -197,37 +333,16 @@ namespace xla {
 
 /* static */ std::unique_ptr<Literal> LiteralUtil::Relayout(
     const Literal& original, const Layout& layout) {
-  // Note: if this were a performance bottleneck, we avoid cloning and just make
-  // an uninitialized array instead, since all values are clobbered below.
   std::unique_ptr<Literal> result = CloneToUnique(original);
   *result->mutable_shape()->mutable_layout() = layout;
-  const PrimitiveType primitive_type = original.shape().element_type();
-  switch (primitive_type) {
-    case F32:
-      LiteralUtil::EachCell<float>(
-          original,
-          [&](tensorflow::gtl::ArraySlice<int64> indices, float value) {
-            LiteralUtil::Set<float>(result.get(), indices, value);
-          });
-      return result;
-    case S32:
-      LiteralUtil::EachCell<int32>(
-          original,
-          [&](tensorflow::gtl::ArraySlice<int64> indices, int32 value) {
-            LiteralUtil::Set<int32>(result.get(), indices, value);
-          });
-      return result;
-    case U32:
-      LiteralUtil::EachCell<uint32>(
-          original,
-          [&](tensorflow::gtl::ArraySlice<int64> indices, uint32 value) {
-            LiteralUtil::Set<uint32>(result.get(), indices, value);
-          });
-      return result;
-    default:
-      LOG(FATAL) << "not yet implemented: "
-                 << PrimitiveType_Name(primitive_type);
-  }
+
+  const Shape& shape = original.shape();
+  DimensionVector base(ShapeUtil::Rank(shape), 0);
+  DimensionVector copy_size(shape.dimensions().begin(),
+                            shape.dimensions().end());
+
+  TF_CHECK_OK(Copy(original, base, result.get(), base, copy_size));
+  return result;
 }
 
 /* static */ StatusOr<std::unique_ptr<Literal>> LiteralUtil::Reshape(
@@ -235,25 +350,19 @@ namespace xla {
   if (ShapeUtil::IsTuple(input.shape())) {
     return InvalidArgument("Reshape does not support tuples.");
   }
-
+  std::unique_ptr<Literal> output;
   if (!LayoutUtil::IsMonotonicWithDim0Major(input.shape().layout())) {
-    return Unimplemented(
-        "Input shape must have a monotonic layout where dimension 0 is major, "
-        "was: %s",
-        LayoutUtil::HumanString(input.shape().layout()).c_str());
+    std::vector<int64> minor_to_major(ShapeUtil::Rank(input.shape()));
+    std::iota(minor_to_major.rbegin(), minor_to_major.rend(),
+              static_cast<int64>(0));
+    output = Relayout(input, LayoutUtil::MakeLayout(minor_to_major));
+  } else {
+    output = CloneToUnique(input);
   }
-  std::vector<int64> layout(dimensions.size());
-  std::iota(layout.rbegin(), layout.rend(), 0);
-
   // Because the layout is monotonic, we can simply reuse the same sequence of
   // values without changing their order.
-  std::unique_ptr<Literal> output = CloneToUnique(input);
-  output->clear_shape();
-  output->mutable_shape()->set_element_type(input.shape().element_type());
-  for (int64 dimension : dimensions) {
-    output->mutable_shape()->add_dimensions(dimension);
-  }
-  *output->mutable_shape()->mutable_layout() = LayoutUtil::MakeLayout(layout);
+  *output->mutable_shape() =
+      ShapeUtil::MakeShape(input.shape().element_type(), dimensions);
 
   int64 elements_before = ShapeUtil::ElementsIn(input.shape());
   int64 elements_after = ShapeUtil::ElementsIn(output->shape());
@@ -267,73 +376,42 @@ namespace xla {
   return std::move(output);
 }
 
-namespace {
-
-template <class T>
-void TransposeLiteralInternal(const Literal& original,
-                              tensorflow::gtl::ArraySlice<int64> permutation,
-                              Literal* result) {
-  std::vector<int64> new_indices(ShapeUtil::Rank(original.shape()));
-  LiteralUtil::EachCell<T>(
-      original, [&](tensorflow::gtl::ArraySlice<int64> indices, T value) {
-        for (int64 i = 0; i < indices.size(); ++i) {
-          new_indices[i] = indices[permutation[i]];
-        }
-        LiteralUtil::Set<T>(result, new_indices, value);
-      });
-}
-}  // namespace
-
 /* static */ std::unique_ptr<Literal> LiteralUtil::Transpose(
     const Literal& original, tensorflow::gtl::ArraySlice<int64> permutation) {
   CHECK(!ShapeUtil::IsTuple(original.shape()))
-      << "tuple is not supported for transpose";
-  std::vector<int64> dimension_numbers(ShapeUtil::Rank(original.shape()));
-  std::iota(dimension_numbers.begin(), dimension_numbers.end(), 0);
-  CHECK(std::is_permutation(permutation.begin(), permutation.end(),
-                            dimension_numbers.begin()))
-      << "given permutation is not a permutation of dimension numbers";
-  std::vector<int64> new_dimension_sizes;
-  for (const int64 dim : permutation) {
-    new_dimension_sizes.push_back(original.shape().dimensions(dim));
-  }
-  const auto result_shape = ShapeUtil::MakeShape(
-      original.shape().element_type(), new_dimension_sizes);
-  std::unique_ptr<Literal> result = CloneToUnique(original);
-  *result->mutable_shape() = result_shape;
-  const PrimitiveType primitive_type = original.shape().element_type();
-  switch (primitive_type) {
-    case F32:
-      TransposeLiteralInternal<float>(original, permutation, result.get());
-      return result;
-    case F64:
-      TransposeLiteralInternal<double>(original, permutation, result.get());
-      return result;
-    case PRED:
-      TransposeLiteralInternal<bool>(original, permutation, result.get());
-      return result;
-    case S8:
-      TransposeLiteralInternal<int8>(original, permutation, result.get());
-      return result;
-    case U8:
-      TransposeLiteralInternal<uint8>(original, permutation, result.get());
-      return result;
-    case S32:
-      TransposeLiteralInternal<int32>(original, permutation, result.get());
-      return result;
-    case U32:
-      TransposeLiteralInternal<uint32>(original, permutation, result.get());
-      return result;
-    case S64:
-      TransposeLiteralInternal<int64>(original, permutation, result.get());
-      return result;
-    case U64:
-      TransposeLiteralInternal<uint64>(original, permutation, result.get());
-      return result;
-    default:
-      LOG(FATAL) << "not yet implemented: "
-                 << PrimitiveType_Name(primitive_type);
+      << "Tuple is not supported for transpose";
+  CHECK(IsPermutation(permutation, ShapeUtil::Rank(original.shape())))
+      << "Given permutation is not a permutation of dimension numbers";
+  // To transpose the array, we just permute the dimensions and layout, and
+  // do a straight memory copy of the raw data set.
+  // This is considerably faster than iterating over every array element using
+  // the EachCell<>() and Set<>() APIs.
+  std::vector<int64> inverse_permutation = InversePermutation(permutation);
+  Shape shape =
+      ShapeUtil::PermuteDimensions(inverse_permutation, original.shape());
+  // Replace the layout with one affine to the original shape, such that a
+  // transpose operation can be performed by leaving the flat values
+  // representation intact.
+  // For example, consider the shape F32[11,8]{1,0} under a {1,0} permutation.
+  // The shape with affine layout resulting from that operation will be
+  // F32[8,11]{0,1}, since it leave the original most minor (the 8 sized), the
+  // most minor.
+  // Essentially, given MinMaj(Di) the position of the Di dimension within the
+  // minor to major vector, and given T(Di) the index that the original Di
+  // dimension has within the transposed array, a layout is affine if
+  // MinMaj(Di) == TMinMaj(T(Di)), with TMinMaj() being the minor to major
+  // vector of the affine layout.
+  Layout* layout = shape.mutable_layout();
+  layout->clear_minor_to_major();
+  for (auto index : original.shape().layout().minor_to_major()) {
+    layout->add_minor_to_major(inverse_permutation[index]);
   }
+  std::unique_ptr<Literal> new_literal = CreateFromShape(shape);
+  DCHECK_GE(ShapeUtil::ByteSizeOf(new_literal->shape()),
+            ShapeUtil::ByteSizeOf(original.shape()));
+  std::memcpy(MutableInternalData(new_literal.get()), InternalData(original),
+              ShapeUtil::ByteSizeOf(original.shape()));
+  return new_literal;
 }
 
 /* static */ std::unique_ptr<Literal> LiteralUtil::Slice(
@@ -342,7 +420,7 @@ void TransposeLiteralInternal(const Literal& original,
   CHECK(!ShapeUtil::IsTuple(literal.shape()))
       << "tuple is not supported for reshape";
 
-  std::vector<int64> result_dimensions;
+  DimensionVector result_dimensions;
   for (int64 dnum = 0; dnum < ShapeUtil::Rank(literal.shape()); ++dnum) {
     CHECK_GE(start_indices[dnum], 0);
     CHECK_LE(limit_indices[dnum], literal.shape().dimensions(dnum));
@@ -358,7 +436,7 @@ void TransposeLiteralInternal(const Literal& original,
   *result_literal->mutable_shape() = result_shape;
   Reserve(ShapeUtil::ElementsIn(result_shape), result_literal.get());
 
-  std::vector<int64> new_indices(ShapeUtil::Rank(result_shape));
+  DimensionVector new_indices(ShapeUtil::Rank(result_shape));
   switch (result_shape.element_type()) {
     case F32:
       LiteralUtil::EachCell<float>(
@@ -425,6 +503,8 @@ void TransposeLiteralInternal(const Literal& original,
       return tensorflow::strings::StrCat(Get<float>(literal, multi_index));
     case F64:
       return tensorflow::strings::StrCat(Get<double>(literal, multi_index));
+    case F16:
+      return tensorflow::strings::StrCat(Get<half>(literal, multi_index));
     default:
       return tensorflow::strings::StrCat(
           "[", PrimitiveType_Name(literal.shape().element_type()), "]");
@@ -579,6 +659,8 @@ void TransposeLiteralInternal(const Literal& original,
       return reinterpret_cast<const void*>(literal.f32s().data());
     case F64:
       return reinterpret_cast<const void*>(literal.f64s().data());
+    case F16:
+      return reinterpret_cast<const void*>(literal.f16s().data());
     default:
       LOG(FATAL) << "primitive type not supported in literals: "
                  << PrimitiveType_Name(literal.shape().element_type());
@@ -593,38 +675,33 @@ void TransposeLiteralInternal(const Literal& original,
   CHECK_EQ(ShapeUtil::ElementsIn(literal->shape()), num_elements);
   switch (literal->shape().element_type()) {
     case PRED:
-      GetMutableRepeatedField<bool>(literal)->Resize(num_elements, false);
+      Resize<bool>(num_elements, false, literal);
+      break;
+    case S8:
+      Resize<int8>(num_elements, 0, literal);
       break;
     case U8:
-      // u8s is an optional "bytes", rather than a repeated field. Therefore its
-      // access methods are somewhat different from the others.
-      literal->mutable_u8s()->resize(num_elements, 0);
+      Resize<uint8>(num_elements, 0, literal);
       break;
     case S32:
-      GetMutableRepeatedField<int32>(literal)->Resize(num_elements,
-                                                      /*value=*/0);
+      Resize<int32>(num_elements, 0, literal);
       break;
     case S64:
-      GetMutableRepeatedField<tensorflow::protobuf_int64>(literal)->Resize(
-          num_elements,
-          /*value=*/0);
+      Resize<int64>(num_elements, 0, literal);
       break;
     case U32:
-      GetMutableRepeatedField<uint32>(literal)->Resize(num_elements,
-                                                       /*value=*/0);
+      Resize<uint32>(num_elements, 0, literal);
       break;
     case U64:
-      GetMutableRepeatedField<tensorflow::protobuf_uint64>(literal)->Resize(
-          num_elements,
-          /*value=*/0);
+      Resize<uint64>(num_elements, 0, literal);
       break;
     case F32:
-      GetMutableRepeatedField<float>(literal)->Resize(num_elements,
-                                                      /*value=*/0.0f);
+      Resize<float>(num_elements, 0, literal);
       break;
     case F64:
-      GetMutableRepeatedField<double>(literal)->Resize(num_elements,
-                                                       /*value=*/0.0);
+      Resize<double>(num_elements, 0, literal);
+    case F16:
+      Resize<half>(num_elements, static_cast<half>(0.0f), literal);
       break;
     default:
       LOG(FATAL) << "primitive type not supported in literals: "
@@ -662,6 +739,9 @@ void TransposeLiteralInternal(const Literal& original,
     case F64:
       actual = literal.f64s_size();
       break;
+    case F16:
+      actual = literal.f16s().size() / sizeof(half);
+      break;
     default:
       return tensorflow::errors::Unimplemented(
           "unhandled element type for literal validation: " +
@@ -680,50 +760,16 @@ void TransposeLiteralInternal(const Literal& original,
 
 /* static */ void LiteralUtil::EachCellAsString(
     const Literal& literal,
-    std::function<void(tensorflow::gtl::ArraySlice<int64> indices,
-                       const string& value)>
-        per_cell) {
-  if (ShapeUtil::Rank(literal.shape()) == 1) {
-    for (int64 i0 = 0; i0 < literal.shape().dimensions(0); ++i0) {
-      per_cell({i0}, GetAsString(literal, {i0}));
-    }
-    return;
-  }
-
-  if (ShapeUtil::Rank(literal.shape()) == 2) {
-    for (int64 i0 = 0; i0 < literal.shape().dimensions(0); ++i0) {
-      for (int64 i1 = 0; i1 < literal.shape().dimensions(1); ++i1) {
-        per_cell({i0, i1}, GetAsString(literal, {i0, i1}));
-      }
-    }
+    const std::function<void(tensorflow::gtl::ArraySlice<int64> indices,
+                             const string& value)>& per_cell) {
+  if (ShapeUtil::HasZeroElements(literal.shape())) {
     return;
   }
-
-  if (ShapeUtil::Rank(literal.shape()) == 3) {
-    for (int64 i0 = 0; i0 < literal.shape().dimensions(0); ++i0) {
-      for (int64 i1 = 0; i1 < literal.shape().dimensions(1); ++i1) {
-        for (int64 i2 = 0; i2 < literal.shape().dimensions(2); ++i2) {
-          per_cell({i0, i1, i2}, GetAsString(literal, {i0, i1, i2}));
-        }
-      }
-    }
-    return;
-  }
-
-  if (ShapeUtil::Rank(literal.shape()) == 4) {
-    for (int64 i0 = 0; i0 < literal.shape().dimensions(0); ++i0) {
-      for (int64 i1 = 0; i1 < literal.shape().dimensions(1); ++i1) {
-        for (int64 i2 = 0; i2 < literal.shape().dimensions(2); ++i2) {
-          for (int64 i3 = 0; i3 < literal.shape().dimensions(3); ++i3) {
-            per_cell({i0, i1, i2, i3}, GetAsString(literal, {i0, i1, i2, i3}));
-          }
-        }
-      }
-    }
-    return;
-  }
-
-  LOG(FATAL) << "unhandled rank: " << ShapeUtil::Rank(literal.shape());
+  std::vector<int64> indices = IndexUtil::LinearIndexToMultidimensionalIndex(
+      literal.shape(), /*linear_index=*/0);
+  do {
+    per_cell(indices, GetAsString(literal, indices));
+  } while (IndexUtil::BumpIndices(literal.shape(), &indices));
 }
 
 namespace {
@@ -786,6 +832,8 @@ bool EqualElements(const Literal& literal1, const Literal& literal2,
         return EqualElements<float>(literal1, literal2, 0, &multi_index);
       case F64:
         return EqualElements<double>(literal1, literal2, 0, &multi_index);
+      case F16:
+        return EqualElements<half>(literal1, literal2, 0, &multi_index);
       default:
         LOG(FATAL) << "Unimplemented: LiteralUtil::Equal for type "
                    << PrimitiveType_Name(literal1.shape().element_type());
@@ -794,96 +842,176 @@ bool EqualElements(const Literal& literal1, const Literal& literal2,
 }
 
 template <>
-/* static */ tensorflow::gtl::ArraySlice<bool> LiteralUtil::GetArraySlice<bool>(
-    const Literal& literal) {
-  CHECK(literal.shape().element_type() == PRED);
-  return literal.preds();
+/* static */ tensorflow::gtl::MutableArraySlice<bool>
+LiteralUtil::GetMutableArraySlice(Literal* literal) {
+  auto values = literal->mutable_preds();
+  return tensorflow::gtl::MutableArraySlice<bool>(values->mutable_data(),
+                                                  values->size());
 }
 
 template <>
-/* static */ tensorflow::protobuf::RepeatedField<bool>*
-LiteralUtil::GetMutableRepeatedField<bool>(Literal* literal) {
-  CHECK(literal->shape().element_type() == PRED);
-  return literal->mutable_preds();
+/* static */ tensorflow::gtl::MutableArraySlice<int8>
+LiteralUtil::GetMutableArraySlice(Literal* literal) {
+  // C++11 standard, basic_string 21.4.1.5, values should be stored
+  // contiguously. From C++17 a mutable data() member will be provided.
+  auto values = literal->mutable_u8s();
+  return tensorflow::gtl::MutableArraySlice<int8>(
+      reinterpret_cast<int8*>(&(*values)[0]), values->size());
 }
 
 template <>
-/* static */ tensorflow::gtl::ArraySlice<uint32>
-LiteralUtil::GetArraySlice<uint32>(const Literal& literal) {
-  CHECK(literal.shape().element_type() == U32);
-  return literal.u32s();
+/* static */ tensorflow::gtl::MutableArraySlice<uint8>
+LiteralUtil::GetMutableArraySlice(Literal* literal) {
+  // C++11 standard, basic_string 21.4.1.5, values should be stored
+  // contiguously. From C++17 a mutable data() member will be provided.
+  auto values = literal->mutable_u8s();
+  return tensorflow::gtl::MutableArraySlice<uint8>(
+      reinterpret_cast<uint8*>(&(*values)[0]), values->size());
 }
 
 template <>
-/* static */ tensorflow::protobuf::RepeatedField<uint32>*
-LiteralUtil::GetMutableRepeatedField<uint32>(Literal* literal) {
-  CHECK(literal->shape().element_type() == U32);
-  return literal->mutable_u32s();
+/* static */ tensorflow::gtl::MutableArraySlice<int32>
+LiteralUtil::GetMutableArraySlice(Literal* literal) {
+  auto values = literal->mutable_s32s();
+  return tensorflow::gtl::MutableArraySlice<int32>(values->mutable_data(),
+                                                   values->size());
 }
 
 template <>
-/* static */ tensorflow::gtl::ArraySlice<uint64>
-LiteralUtil::GetArraySlice<uint64>(const Literal& literal) {
-  CHECK(literal.shape().element_type() == U64);
-  return AsUInt64Slice(literal.u64s());
+/* static */ tensorflow::gtl::MutableArraySlice<uint32>
+LiteralUtil::GetMutableArraySlice(Literal* literal) {
+  auto values = literal->mutable_u32s();
+  return tensorflow::gtl::MutableArraySlice<uint32>(values->mutable_data(),
+                                                    values->size());
 }
 
 template <>
-/* static */ tensorflow::protobuf::RepeatedField<tensorflow::protobuf_uint64>*
-LiteralUtil::GetMutableRepeatedField<tensorflow::protobuf_uint64>(
-    Literal* literal) {
-  CHECK(literal->shape().element_type() == U64);
-  return literal->mutable_u64s();
+/* static */ tensorflow::gtl::MutableArraySlice<int64>
+LiteralUtil::GetMutableArraySlice(Literal* literal) {
+  static_assert(sizeof(int64) == sizeof(tensorflow::protobuf_int64) &&
+                    alignof(int64) == alignof(tensorflow::protobuf_int64),
+                "The int64 and tensorflow::protobuf_int64 types are not "
+                "compatible");
+  auto values = literal->mutable_s64s();
+  // Because of the fact that tensorflow::protobuf_int64 is defined as int64_t
+  // while tensorflow::int64 is defined as long long, a reinterpret_cast<> is
+  // necessary from the raw data pointer returned by the mutable_data() API.
+  return tensorflow::gtl::MutableArraySlice<int64>(
+      reinterpret_cast<int64*>(values->mutable_data()), values->size());
 }
 
 template <>
-/* static */ tensorflow::gtl::ArraySlice<int32>
-LiteralUtil::GetArraySlice<int32>(const Literal& literal) {
-  CHECK(literal.shape().element_type() == S32);
-  return literal.s32s();
+/* static */ tensorflow::gtl::MutableArraySlice<uint64>
+LiteralUtil::GetMutableArraySlice(Literal* literal) {
+  static_assert(sizeof(uint64) == sizeof(tensorflow::protobuf_uint64) &&
+                    alignof(uint64) == alignof(tensorflow::protobuf_uint64),
+                "The uint64 and tensorflow::protobuf_uint64 types are not "
+                "compatible");
+  auto values = literal->mutable_u64s();
+  // Because of the fact that tensorflow::protobuf_uint64 is defined as uint64_t
+  // while tensorflow::uint64 is defined as unsigned long long, a
+  // reinterpret_cast<> is necessary from the raw data pointer returned by the
+  // mutable_data() API.
+  return tensorflow::gtl::MutableArraySlice<uint64>(
+      reinterpret_cast<uint64*>(values->mutable_data()), values->size());
 }
 
 template <>
-/* static */ tensorflow::protobuf::RepeatedField<int32>*
-LiteralUtil::GetMutableRepeatedField<int32>(Literal* literal) {
-  CHECK(literal->shape().element_type() == S32);
-  return literal->mutable_s32s();
+/* static */ tensorflow::gtl::MutableArraySlice<float>
+LiteralUtil::GetMutableArraySlice(Literal* literal) {
+  auto values = literal->mutable_f32s();
+  return tensorflow::gtl::MutableArraySlice<float>(values->mutable_data(),
+                                                   values->size());
 }
 
 template <>
-/* static */ tensorflow::gtl::ArraySlice<int64>
-LiteralUtil::GetArraySlice<int64>(const Literal& literal) {
-  CHECK(literal.shape().element_type() == S64);
-  return AsInt64Slice(literal.s64s());
+/* static */ tensorflow::gtl::MutableArraySlice<double>
+LiteralUtil::GetMutableArraySlice(Literal* literal) {
+  auto values = literal->mutable_f64s();
+  return tensorflow::gtl::MutableArraySlice<double>(values->mutable_data(),
+                                                    values->size());
+}
+
+template <>
+/* static */ tensorflow::gtl::MutableArraySlice<half>
+LiteralUtil::GetMutableArraySlice<half>(Literal* literal) {
+  // C++11 standard, basic_string 21.4.1.5, values should be stored
+  // contiguously. From C++17 a mutable data() member will be provided.
+  // TODO - there is an endianess problem here. fix it, or wait for uint16
+  //        support in protobuf
+  auto values = literal->mutable_f16s();
+  return tensorflow::gtl::MutableArraySlice<half>(
+          reinterpret_cast<half*>(&(*values)[0]),
+          values->size() / sizeof(half));
+}
+
+template <>
+/* static */ tensorflow::gtl::ArraySlice<bool> LiteralUtil::GetArraySlice<bool>(
+    const Literal& literal) {
+  CHECK_EQ(literal.shape().element_type(), PRED);
+  return literal.preds();
+}
+
+template <>
+/* static */ tensorflow::gtl::ArraySlice<uint8>
+LiteralUtil::GetArraySlice<uint8>(const Literal& literal) {
+  CHECK_EQ(literal.shape().element_type(), U8);
+  return tensorflow::gtl::ArraySlice<uint8>(
+      reinterpret_cast<const uint8*>(literal.u8s().data()),
+      literal.u8s().size());
 }
 
 template <>
-/* static */ tensorflow::protobuf::RepeatedField<tensorflow::protobuf_int64>*
-LiteralUtil::GetMutableRepeatedField<tensorflow::protobuf_int64>(
-    Literal* literal) {
-  CHECK(literal->shape().element_type() == S64);
-  return literal->mutable_s64s();
+/* static */ tensorflow::gtl::ArraySlice<int8> LiteralUtil::GetArraySlice<int8>(
+    const Literal& literal) {
+  CHECK_EQ(literal.shape().element_type(), S8);
+  return tensorflow::gtl::ArraySlice<int8>(
+      reinterpret_cast<const int8*>(literal.u8s().data()),
+      literal.u8s().size());
 }
 
 template <>
-/* static */ tensorflow::protobuf::RepeatedField<float>*
-LiteralUtil::GetMutableRepeatedField<float>(Literal* literal) {
-  CHECK(literal->shape().element_type() == F32);
-  return literal->mutable_f32s();
+/* static */ tensorflow::gtl::ArraySlice<uint32>
+LiteralUtil::GetArraySlice<uint32>(const Literal& literal) {
+  CHECK_EQ(literal.shape().element_type(), U32);
+  return literal.u32s();
+}
+
+template <>
+/* static */ tensorflow::gtl::ArraySlice<uint64>
+LiteralUtil::GetArraySlice<uint64>(const Literal& literal) {
+  CHECK_EQ(literal.shape().element_type(), U64);
+  return AsUInt64Slice(literal.u64s());
+}
+
+template <>
+/* static */ tensorflow::gtl::ArraySlice<int32>
+LiteralUtil::GetArraySlice<int32>(const Literal& literal) {
+  CHECK_EQ(literal.shape().element_type(), S32);
+  return literal.s32s();
+}
+
+template <>
+/* static */ tensorflow::gtl::ArraySlice<int64>
+LiteralUtil::GetArraySlice<int64>(const Literal& literal) {
+  CHECK_EQ(literal.shape().element_type(), S64);
+  return AsInt64Slice(literal.s64s());
 }
 
 template <>
 /* static */ tensorflow::gtl::ArraySlice<double>
 LiteralUtil::GetArraySlice<double>(const Literal& literal) {
-  CHECK(literal.shape().element_type() == F64);
+  CHECK_EQ(literal.shape().element_type(), F64);
   return literal.f64s();
 }
 
 template <>
-/* static */ tensorflow::protobuf::RepeatedField<double>*
-LiteralUtil::GetMutableRepeatedField<double>(Literal* literal) {
-  CHECK(literal->shape().element_type() == F64);
-  return literal->mutable_f64s();
+/* static */ tensorflow::gtl::ArraySlice<half>
+LiteralUtil::GetArraySlice<half>(const Literal& literal) {
+  CHECK_EQ(literal.shape().element_type(), F16);
+  return tensorflow::gtl::ArraySlice<half>(
+          reinterpret_cast<const half*>(literal.f16s().data()),
+          literal.f16s().size() / sizeof(half));
 }
 
 template <typename NativeT>
@@ -925,6 +1053,8 @@ static bool AllElementsEqualValue(const Literal& literal, NativeT value) {
       return AllElementsEqualValue<float>(literal, value);
     case F64:
       return AllElementsEqualValue<double>(literal, value);
+    case F16:
+      return AllElementsEqualValue<half>(literal, static_cast<half>(value));
     case PRED:
       if (value == 0) {
         return AllElementsEqualValue<bool>(literal, false);
@@ -944,6 +1074,8 @@ static bool AllElementsEqualValue(const Literal& literal, NativeT value) {
       return AllElementsEqualValue<float>(literal, value);
     case F64:
       return AllElementsEqualValue<double>(literal, value);
+    case F16:
+      return AllElementsEqualValue<half>(literal, static_cast<half>(value));
     default:
       return false;
   }
@@ -968,6 +1100,8 @@ static bool AllElementsEqualValue(const Literal& literal, NativeT value) {
       return Get<float>(literal, indices) == 0.0f;
     case F64:
       return Get<double>(literal, indices) == 0.0;
+    case F16:
+      return Get<half>(literal, indices) == static_cast<half>(0.0f);
     case PRED:
       return Get<bool>(literal, indices) == false;
     default:
@@ -976,51 +1110,77 @@ static bool AllElementsEqualValue(const Literal& literal, NativeT value) {
 }
 
 template <>
-/* static */ void LiteralUtil::PopulateWithValue(
-    int64 value, tensorflow::gtl::ArraySlice<int64> dimensions,
-    Literal* literal) {
-  *literal->mutable_shape() = ShapeUtil::MakeShape(
-      primitive_util::NativeToPrimitiveType<int64>(), dimensions);
-  tensorflow::protobuf::RepeatedField<tensorflow::protobuf_int64>*
-      repeated_field =
-          GetMutableRepeatedField<tensorflow::protobuf_int64>(literal);
-  for (int64 i = 0; i < ShapeUtil::ElementsIn(literal->shape()); ++i) {
-    repeated_field->Add(value);
-  }
+/* static */ void LiteralUtil::Resize<bool>(int64 num_elements, bool value,
+                                            Literal* literal) {
+  CHECK_EQ(ShapeUtil::ElementsIn(literal->shape()), num_elements);
+  literal->mutable_preds()->Resize(num_elements, value);
 }
 
 template <>
-/* static */ void LiteralUtil::PopulateWithValue(
-    uint64 value, tensorflow::gtl::ArraySlice<int64> dimensions,
-    Literal* literal) {
-  *literal->mutable_shape() = ShapeUtil::MakeShape(
-      primitive_util::NativeToPrimitiveType<uint64>(), dimensions);
-  tensorflow::protobuf::RepeatedField<tensorflow::protobuf_uint64>*
-      repeated_field =
-          GetMutableRepeatedField<tensorflow::protobuf_uint64>(literal);
-  for (int64 i = 0; i < ShapeUtil::ElementsIn(literal->shape()); ++i) {
-    repeated_field->Add(value);
-  }
+/* static */ void LiteralUtil::Resize<int8>(int64 num_elements, int8 value,
+                                            Literal* literal) {
+  CHECK_EQ(ShapeUtil::ElementsIn(literal->shape()), num_elements);
+  literal->mutable_u8s()->resize(num_elements, value);
+}
+
+template <>
+/* static */ void LiteralUtil::Resize<uint8>(int64 num_elements, uint8 value,
+                                             Literal* literal) {
+  CHECK_EQ(ShapeUtil::ElementsIn(literal->shape()), num_elements);
+  literal->mutable_u8s()->resize(num_elements, value);
 }
 
 template <>
-/* static */ void LiteralUtil::Resize(int64 num_elements, int64 value,
-                                      Literal* literal) {
+/* static */ void LiteralUtil::Resize<int32>(int64 num_elements, int32 value,
+                                             Literal* literal) {
   CHECK_EQ(ShapeUtil::ElementsIn(literal->shape()), num_elements);
-  tensorflow::protobuf::RepeatedField<tensorflow::protobuf_int64>*
-      repeated_field =
-          GetMutableRepeatedField<tensorflow::protobuf_int64>(literal);
-  repeated_field->Resize(num_elements, value);
+  literal->mutable_s32s()->Resize(num_elements, value);
 }
 
 template <>
-/* static */ void LiteralUtil::Resize(int64 num_elements, uint64 value,
-                                      Literal* literal) {
+/* static */ void LiteralUtil::Resize<uint32>(int64 num_elements, uint32 value,
+                                              Literal* literal) {
   CHECK_EQ(ShapeUtil::ElementsIn(literal->shape()), num_elements);
-  tensorflow::protobuf::RepeatedField<tensorflow::protobuf_uint64>*
-      repeated_field =
-          GetMutableRepeatedField<tensorflow::protobuf_uint64>(literal);
-  repeated_field->Resize(num_elements, value);
+  literal->mutable_u32s()->Resize(num_elements, value);
+}
+
+template <>
+/* static */ void LiteralUtil::Resize<int64>(int64 num_elements, int64 value,
+                                             Literal* literal) {
+  CHECK_EQ(ShapeUtil::ElementsIn(literal->shape()), num_elements);
+  literal->mutable_s64s()->Resize(num_elements, value);
+}
+
+template <>
+/* static */ void LiteralUtil::Resize<uint64>(int64 num_elements, uint64 value,
+                                              Literal* literal) {
+  CHECK_EQ(ShapeUtil::ElementsIn(literal->shape()), num_elements);
+  literal->mutable_u64s()->Resize(num_elements, value);
+}
+
+template <>
+/* static */ void LiteralUtil::Resize<float>(int64 num_elements, float value,
+                                             Literal* literal) {
+  CHECK_EQ(ShapeUtil::ElementsIn(literal->shape()), num_elements);
+  literal->mutable_f32s()->Resize(num_elements, value);
+}
+
+template <>
+/* static */ void LiteralUtil::Resize<double>(int64 num_elements, double value,
+                                              Literal* literal) {
+  CHECK_EQ(ShapeUtil::ElementsIn(literal->shape()), num_elements);
+  literal->mutable_f64s()->Resize(num_elements, value);
+}
+
+template <>
+/* static */ void LiteralUtil::Resize<half>(int64 num_elements, half value,
+                                            Literal* literal) {
+  CHECK_EQ(ShapeUtil::ElementsIn(literal->shape()), num_elements);
+  literal->mutable_f16s()->resize(num_elements * sizeof(half));
+  auto data = GetMutableArraySlice<half>(literal);
+  for (int i = 0; i < num_elements; i++) {
+    data[i] = value;
+  }
 }
 
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/literal_util.h b/tensorflow/compiler/xla/literal_util.h
index 21bb2e46cf2ebcd72bcce393a1e5526f41757544..2da010d56e38c18aed1362a4d2cff1708740ffe9 100644
--- a/tensorflow/compiler/xla/literal_util.h
+++ b/tensorflow/compiler/xla/literal_util.h
@@ -33,6 +33,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/primitive_util.h"
 #include "tensorflow/compiler/xla/ptr_util.h"
 #include "tensorflow/compiler/xla/shape_util.h"
+#include "tensorflow/compiler/xla/status_macros.h"
 #include "tensorflow/compiler/xla/types.h"
 #include "tensorflow/compiler/xla/util.h"
 #include "tensorflow/compiler/xla/xla_data.pb.h"
@@ -100,6 +101,31 @@ class LiteralUtil {
           values,
       const Layout& layout);
 
+  // Create a new Literal object with the shape specified as parameter.
+  // The content of the literal values is the default value of the primitive
+  // type of literal itself (0 for numeric types, and false for predicates).
+  static std::unique_ptr<Literal> CreateFromShape(const Shape& shape);
+
+  // Create a new Literal object with its values havings the primitive_type
+  // type, and with dimensions defined by the dimensions parameter.
+  // The content of the literal values is the default value of the primitive
+  // type of literal itself (0 for numeric types, and false for predicates).
+  static std::unique_ptr<Literal> CreateFromDimensions(
+      PrimitiveType primitive_type,
+      tensorflow::gtl::ArraySlice<int64> dimensions);
+
+  // Copies the values from src_literal, starting at src_base shape indexes,
+  // to dest_literal, starting at dest_base, where the copy size in each
+  // dimension is specified by copy_size.
+  // The src_literal and dest_literal must have the same primitive type,
+  // src_base+copy_size must fit the source literal dimensions, as well as
+  // dest_base+copy_size must fit the destination literal dimensions.
+  static Status Copy(const Literal& src_literal,
+                     tensorflow::gtl::ArraySlice<int64> src_base,
+                     Literal* dest_literal,
+                     tensorflow::gtl::ArraySlice<int64> dest_base,
+                     tensorflow::gtl::ArraySlice<int64> copy_size);
+
   // Creates a new value that has the equivalent value as literal, but conforms
   // to new_layout; e.g. a literal matrix that was in {0, 1} minor-to-major
   // dimension layout can be re-layed-out as {1, 0} minor-to-major dimension
@@ -213,6 +239,11 @@ class LiteralUtil {
   // Clones literal into an owned unique_ptr version.
   static std::unique_ptr<Literal> CloneToUnique(const Literal& literal);
 
+  // Returns the linear index of the given index within the literal's
+  // element_type repeated field.
+  static int64 LinearIndex(const Literal& literal,
+                           tensorflow::gtl::ArraySlice<int64> multi_index);
+
   // Gets or sets an element in the literal at the given index. The index is
   // CHECKed against the dimension sizes.
   template <typename NativeT>
@@ -223,6 +254,12 @@ class LiteralUtil {
                   tensorflow::gtl::ArraySlice<int64> multi_index,
                   NativeT value);
 
+  // Retrieves the mutable array slice interface which can be used to manipulate
+  // pre-allocated literal values.
+  template <typename NativeT>
+  static tensorflow::gtl::MutableArraySlice<NativeT> GetMutableArraySlice(
+      Literal* literal);
+
   // Returns the element value at index (0, ..., 0), however many zeroes are
   // required for that index.
   template <typename NativeT>
@@ -257,9 +294,8 @@ class LiteralUtil {
   // like representation in a protobuf).
   static void EachCellAsString(
       const Literal& literal,
-      std::function<void(tensorflow::gtl::ArraySlice<int64> indices,
-                         const string& value)>
-          per_cell);
+      const std::function<void(tensorflow::gtl::ArraySlice<int64> indices,
+                               const string& value)>& per_cell);
   template <typename NativeT>
   static void EachCell(
       const Literal& literal,
@@ -315,6 +351,14 @@ class LiteralUtil {
                                               const Layout& layout,
                                               Literal* literal);
 
+  // Populates literal values by calling the generator function for every cell
+  // in the literal object.
+  template <typename NativeT>
+  static Status Populate(
+      Literal* literal,
+      const std::function<NativeT(tensorflow::gtl::ArraySlice<int64> indexes)>&
+          generator);
+
   // Creates a Literal of the given dimensions with all elements set to the
   // given value.
   template <typename NativeT>
@@ -383,70 +427,73 @@ class LiteralUtil {
     static_assert(!std::is_same<NativeT, NativeT>::value,
                   "Cannot map native type to primitive type.");
   }
-  template <typename NativeT>
-  static tensorflow::protobuf::RepeatedField<NativeT>* GetMutableRepeatedField(
-      Literal* literal) {
-    // Make the expression depend on the template parameter NativeT so
-    // that this compile-time error only apperas if this function is
-    // instantiated with some concrete type that is not specialized
-    // below.
-    static_assert(!std::is_same<NativeT, NativeT>::value,
-                  "Cannot map native type to primitive type.");
-  }
 
-  // Returns the linear index of the given index within the literal's
-  // element_type repeated field.
-  static int64 LinearIndex(const Literal& literal,
-                           tensorflow::gtl::ArraySlice<int64> multi_index);
+  // Internal template helper for the Copy() API, matching its arguments one by
+  // one.
+  template <typename T>
+  static Status CopyRange(const Literal& src_literal,
+                          tensorflow::gtl::ArraySlice<int64> src_base,
+                          Literal* dest_literal,
+                          tensorflow::gtl::ArraySlice<int64> dest_base,
+                          tensorflow::gtl::ArraySlice<int64> copy_size);
+
+  // Utility structure which is used to create the optimal configuration for
+  // a ShapeUtil::ForEachIndex() scan across two literals.
+  struct StrideConfig {
+    StrideConfig(const Shape& source_shape, const Shape& dest_shape,
+                 tensorflow::gtl::ArraySlice<int64> dimensions);
+
+    // The dimensions of the stride operation. Essentially every dimension
+    // will be iterated from base[i] to base[i]+dimensions[i], in step[i]
+    // steps.
+    tensorflow::gtl::ArraySlice<int64> dimensions;
+    DimensionVector base;
+    DimensionVector step;
+    int64 minor_dimension = 0;
+    // The size of the strides for source and destination. One of the two
+    // (the one looping through its most minor dimension) will be 1, while
+    // the other will be the stride size at the dimension matching the other
+    // shape most minor dimension being scanned.
+    int64 dest_stride = 1;
+    int64 source_stride = 1;
+    // The size of the inner loop on the most minor dimension.
+    int64 minor_loop_size = 1;
+  };
 
   TF_DISALLOW_COPY_AND_ASSIGN(LiteralUtil);
 };
 
 // Declarations of template specializations for GetArraySlice and
-// GetMutableRepeatedField. The specializations map native type to XLA primitive
+// GetMutableArraySlice. The specializations map native type to XLA primitive
 // type.
 template <>
 /* static */ tensorflow::gtl::ArraySlice<bool> LiteralUtil::GetArraySlice<bool>(
     const Literal& literal);
 
 template <>
-/* static */ tensorflow::protobuf::RepeatedField<bool>*
-LiteralUtil::GetMutableRepeatedField<bool>(Literal* literal);
+/* static */ tensorflow::gtl::ArraySlice<uint8>
+LiteralUtil::GetArraySlice<uint8>(const Literal& literal);
 
 template <>
-/* static */ tensorflow::gtl::ArraySlice<uint32>
-LiteralUtil::GetArraySlice<uint32>(const Literal& literal);
+/* static */ tensorflow::gtl::ArraySlice<int8> LiteralUtil::GetArraySlice<int8>(
+    const Literal& literal);
 
 template <>
-/* static */ tensorflow::protobuf::RepeatedField<uint32>*
-LiteralUtil::GetMutableRepeatedField<uint32>(Literal* literal);
+/* static */ tensorflow::gtl::ArraySlice<uint32>
+LiteralUtil::GetArraySlice<uint32>(const Literal& literal);
 
 template <>
 /* static */ tensorflow::gtl::ArraySlice<uint64>
 LiteralUtil::GetArraySlice<uint64>(const Literal& literal);
 
-template <>
-/* static */ tensorflow::protobuf::RepeatedField<tensorflow::protobuf_uint64>*
-LiteralUtil::GetMutableRepeatedField<tensorflow::protobuf_uint64>(
-    Literal* literal);
-
 template <>
 /* static */ tensorflow::gtl::ArraySlice<int32>
 LiteralUtil::GetArraySlice<int32>(const Literal& literal);
 
-template <>
-/* static */ tensorflow::protobuf::RepeatedField<int32>*
-LiteralUtil::GetMutableRepeatedField<int32>(Literal* literal);
-
 template <>
 /* static */ tensorflow::gtl::ArraySlice<int64>
 LiteralUtil::GetArraySlice<int64>(const Literal& literal);
 
-template <>
-/* static */ tensorflow::protobuf::RepeatedField<tensorflow::protobuf_int64>*
-LiteralUtil::GetMutableRepeatedField<tensorflow::protobuf_int64>(
-    Literal* literal);
-
 template <>
 /* static */ inline tensorflow::gtl::ArraySlice<float>
 LiteralUtil::GetArraySlice<float>(const Literal& literal) {
@@ -454,22 +501,98 @@ LiteralUtil::GetArraySlice<float>(const Literal& literal) {
   return literal.f32s();
 }
 
-template <>
-/* static */ tensorflow::protobuf::RepeatedField<float>*
-LiteralUtil::GetMutableRepeatedField<float>(Literal* literal);
-
 template <>
 /* static */ tensorflow::gtl::ArraySlice<double>
 LiteralUtil::GetArraySlice<double>(const Literal& literal);
 
 template <>
-/* static */ tensorflow::protobuf::RepeatedField<double>*
-LiteralUtil::GetMutableRepeatedField<double>(Literal* literal);
+/* static */ tensorflow::gtl::ArraySlice<half>
+LiteralUtil::GetArraySlice<half>(const Literal& literal);
+
+template <>
+/* static */ tensorflow::gtl::MutableArraySlice<bool>
+LiteralUtil::GetMutableArraySlice(Literal* literal);
+
+template <>
+/* static */ tensorflow::gtl::MutableArraySlice<int8>
+LiteralUtil::GetMutableArraySlice(Literal* literal);
+
+template <>
+/* static */ tensorflow::gtl::MutableArraySlice<uint8>
+LiteralUtil::GetMutableArraySlice(Literal* literal);
+
+template <>
+/* static */ tensorflow::gtl::MutableArraySlice<int32>
+LiteralUtil::GetMutableArraySlice(Literal* literal);
+
+template <>
+/* static */ tensorflow::gtl::MutableArraySlice<uint32>
+LiteralUtil::GetMutableArraySlice(Literal* literal);
+
+template <>
+/* static */ tensorflow::gtl::MutableArraySlice<int64>
+LiteralUtil::GetMutableArraySlice(Literal* literal);
+
+template <>
+/* static */ tensorflow::gtl::MutableArraySlice<uint64>
+LiteralUtil::GetMutableArraySlice(Literal* literal);
+
+template <>
+/* static */ tensorflow::gtl::MutableArraySlice<float>
+LiteralUtil::GetMutableArraySlice(Literal* literal);
+
+template <>
+/* static */ tensorflow::gtl::MutableArraySlice<double>
+LiteralUtil::GetMutableArraySlice(Literal* literal);
+
+template <>
+/* static */ tensorflow::gtl::MutableArraySlice<half>
+LiteralUtil::GetMutableArraySlice(Literal* literal);
+
+template <>
+/* static */ void LiteralUtil::Resize<bool>(int64 num_elements, bool value,
+                                            Literal* literal);
+
+template <>
+/* static */ void LiteralUtil::Resize<int8>(int64 num_elements, int8 value,
+                                            Literal* literal);
+
+template <>
+/* static */ void LiteralUtil::Resize<uint8>(int64 num_elements, uint8 value,
+                                             Literal* literal);
+
+template <>
+/* static */ void LiteralUtil::Resize<int32>(int64 num_elements, int32 value,
+                                             Literal* literal);
+
+template <>
+/* static */ void LiteralUtil::Resize<uint32>(int64 num_elements, uint32 value,
+                                              Literal* literal);
+
+template <>
+/* static */ void LiteralUtil::Resize<int64>(int64 num_elements, int64 value,
+                                             Literal* literal);
+
+template <>
+/* static */ void LiteralUtil::Resize<uint64>(int64 num_elements, uint64 value,
+                                              Literal* literal);
+
+template <>
+/* static */ void LiteralUtil::Resize<float>(int64 num_elements, float value,
+                                             Literal* literal);
+
+template <>
+/* static */ void LiteralUtil::Resize<double>(int64 num_elements, double value,
+                                              Literal* literal);
+
+template <>
+/* static */ void LiteralUtil::Resize<half>(int64 num_elements, half value,
+                                            Literal* literal);
 
 template <typename NativeT>
 /* static */ std::unique_ptr<Literal> LiteralUtil::CreateR0(NativeT value) {
   auto literal = MakeUnique<Literal>();
-  PopulateR0(value, literal.get());
+  PopulateR0<NativeT>(value, literal.get());
   return literal;
 }
 
@@ -695,12 +818,20 @@ template <>
   return literal.u8s()[linear_index];
 }
 
+template <>
+/* static */ inline half LiteralUtil::Get<half>(
+    const Literal& literal, tensorflow::gtl::ArraySlice<int64> multi_index) {
+  CHECK(literal.shape().element_type() == F16);
+  int64 linear_index = LinearIndex(literal, multi_index);
+  return GetArraySlice<half>(literal)[linear_index];
+}
+
 template <typename NativeT>
 /* static */ void LiteralUtil::Set(
     Literal* literal, tensorflow::gtl::ArraySlice<int64> multi_index,
     NativeT value) {
   int64 linear_index = LinearIndex(*literal, multi_index);
-  GetMutableRepeatedField<NativeT>(literal)->Set(linear_index, value);
+  GetMutableArraySlice<NativeT>(literal).at(linear_index) = value;
 }
 
 template <>
@@ -760,44 +891,12 @@ template <typename NativeT>
 }
 
 template <typename NativeT>
-/* static */ void LiteralUtil::PopulateR0(NativeT value, Literal* literal) {
-  *literal->mutable_shape() = ShapeUtil::MakeShape(
-      primitive_util::NativeToPrimitiveType<NativeT>(), {});
-  tensorflow::protobuf::RepeatedField<NativeT>* repeated_field =
-      GetMutableRepeatedField<NativeT>(literal);
-  repeated_field->Add(value);
-}
-
-template <>
-/* static */ inline void LiteralUtil::PopulateR0<uint8>(uint8 value,
-                                                        Literal* literal) {
+/* static */ inline void LiteralUtil::PopulateR0(NativeT value,
+                                                 Literal* literal) {
   *literal->mutable_shape() =
-      ShapeUtil::MakeShape(primitive_util::NativeToPrimitiveType<uint8>(), {});
-  literal->mutable_u8s()->push_back(value);
-}
-
-template <>
-/* static */ inline void LiteralUtil::PopulateR0<int8>(int8 value,
-                                                       Literal* literal) {
-  *literal->mutable_shape() =
-      ShapeUtil::MakeShape(primitive_util::NativeToPrimitiveType<int8>(), {});
-  literal->mutable_u8s()->push_back(value);
-}
-
-template <>
-/* static */ inline void LiteralUtil::PopulateR0<uint64>(uint64 value,
-                                                         Literal* literal) {
-  *literal->mutable_shape() =
-      ShapeUtil::MakeShape(primitive_util::NativeToPrimitiveType<uint64>(), {});
-  literal->mutable_u64s()->Add(value);
-}
-
-template <>
-/* static */ inline void LiteralUtil::PopulateR0<int64>(int64 value,
-                                                        Literal* literal) {
-  *literal->mutable_shape() =
-      ShapeUtil::MakeShape(primitive_util::NativeToPrimitiveType<int64>(), {});
-  literal->mutable_s64s()->Add(value);
+      ShapeUtil::MakeShape(
+              primitive_util::NativeToPrimitiveType<NativeT>(), {});
+  Resize<NativeT>(1, value, literal);
 }
 
 template <typename NativeT>
@@ -944,65 +1043,72 @@ template <typename NativeT>
                                   literal);
 }
 
+template <typename NativeT>
+/* static */ Status LiteralUtil::Populate(
+    Literal* literal,
+    const std::function<NativeT(tensorflow::gtl::ArraySlice<int64> indexes)>&
+        generator) {
+  const Shape& shape = literal->shape();
+  int64 rank = ShapeUtil::Rank(shape);
+  TF_RET_CHECK(shape.element_type() ==
+               primitive_util::NativeToPrimitiveType<NativeT>());
+  tensorflow::gtl::MutableArraySlice<NativeT> data =
+      GetMutableArraySlice<NativeT>(literal);
+  if (rank > 0) {
+    StrideConfig stride_config(shape, shape, AsInt64Slice(shape.dimensions()));
+    DimensionVector minor_scan_indexes(rank, 0);
+    int64 minor_dimension_size =
+        ShapeUtil::GetDimension(shape, stride_config.minor_dimension);
+
+    auto init_function = [&](const std::vector<int64>& indexes) {
+      int64 index = LinearIndex(*literal, indexes);
+      std::copy(indexes.begin(), indexes.end(), minor_scan_indexes.begin());
+      for (int64 i = 0; i < minor_dimension_size; ++i) {
+        minor_scan_indexes[stride_config.minor_dimension] = i;
+        data.at(index + i) = generator(minor_scan_indexes);
+      }
+      return true;
+    };
+    ShapeUtil::ForEachIndex(shape, stride_config.base, stride_config.dimensions,
+                            stride_config.step, init_function);
+  } else {
+    data.at(0) = generator({});
+  }
+  return Status::OK();
+}
+
 template <typename NativeT>
 /* static */ void LiteralUtil::PopulateWithValue(
     NativeT value, tensorflow::gtl::ArraySlice<int64> dimensions,
     Literal* literal) {
   *literal->mutable_shape() = ShapeUtil::MakeShape(
       primitive_util::NativeToPrimitiveType<NativeT>(), dimensions);
-  tensorflow::protobuf::RepeatedField<NativeT>* repeated_field =
-      GetMutableRepeatedField<NativeT>(literal);
-  for (int64 i = 0; i < ShapeUtil::ElementsIn(literal->shape()); ++i) {
-    repeated_field->Add(value);
-  }
+  Resize<NativeT>(ShapeUtil::ElementsIn(literal->shape()), value, literal);
 }
 
-template <>
-/* static */ void LiteralUtil::PopulateWithValue(
-    int64 value, tensorflow::gtl::ArraySlice<int64> dimensions,
-    Literal* literal);
-
-template <>
-/* static */ void LiteralUtil::PopulateWithValue(
-    uint64 value, tensorflow::gtl::ArraySlice<int64> dimensions,
-    Literal* literal);
-
 template <typename NativeSrcT, typename NativeDestT>
 /* static */ std::unique_ptr<Literal> LiteralUtil::Convert(
     const Literal& literal) {
+  const Shape& shape = literal.shape();
   auto result_literal = MakeUnique<Literal>();
-  Shape result_shape = literal.shape();
-  result_shape.set_element_type(
+  Shape* result_shape = result_literal->mutable_shape();
+  *result_shape = shape;
+  result_shape->set_element_type(
       primitive_util::NativeToPrimitiveType<NativeDestT>());
-  *result_literal->mutable_shape() = result_shape;
-  LiteralUtil::Reserve(ShapeUtil::ElementsIn(result_shape),
+  LiteralUtil::Reserve(ShapeUtil::ElementsIn(*result_shape),
                        result_literal.get());
-  LiteralUtil::EachCell<NativeSrcT>(
-      literal,
-      [&](tensorflow::gtl::ArraySlice<int64> indices, NativeSrcT value) {
-        LiteralUtil::Set<NativeDestT>(result_literal.get(), indices,
-                                      static_cast<NativeDestT>(value));
-      });
+  tensorflow::gtl::ArraySlice<NativeSrcT> src_data =
+      GetArraySlice<NativeSrcT>(literal);
+  tensorflow::gtl::MutableArraySlice<NativeDestT> dest_data =
+      GetMutableArraySlice<NativeDestT>(result_literal.get());
+  int64 num_elements = ShapeUtil::ElementsIn(shape);
+
+  for (int64 i = 0; i < num_elements; ++i) {
+    dest_data[i] = static_cast<NativeDestT>(src_data[i]);
+  }
   return result_literal;
 }
 
-template <typename NativeT>
-/* static */ void LiteralUtil::Resize(int64 num_elements, NativeT value,
-                                      Literal* literal) {
-  CHECK_EQ(ShapeUtil::ElementsIn(literal->shape()), num_elements);
-  tensorflow::protobuf::RepeatedField<NativeT>* repeated_field =
-      GetMutableRepeatedField<NativeT>(literal);
-  repeated_field->Resize(num_elements, value);
-}
-
-template <>
-/* static */ void LiteralUtil::Resize(int64 num_elements, int64 value,
-                                      Literal* literal);
-
-template <>
-/* static */ void LiteralUtil::Resize(int64 num_elements, uint64 value,
-                                      Literal* literal);
-
 template <typename NativeT>
 /* static */ std::unique_ptr<Literal>
 LiteralUtil::CreateFullWithMonotonicDim0MajorLayout(
@@ -1022,10 +1128,7 @@ LiteralUtil::CreateFullWithMonotonicDim0MajorLayout(
 template <typename NativeT>
 /* static */ std::unique_ptr<Literal> LiteralUtil::Replicate(
     const Literal& input, int64 times) {
-  // Ranks greater than 8 are very rare, so use InlinedVector<int64, 8> to store
-  // the bounds and indices.
-  static constexpr int kInlineRank = 8;
-  tensorflow::gtl::InlinedVector<int64, kInlineRank> bounds = {times};
+  DimensionVector bounds = {times};
   bounds.reserve(input.shape().dimensions_size() + 1);
   for (int64 bound : input.shape().dimensions()) {
     bounds.push_back(bound);
@@ -1039,8 +1142,7 @@ template <typename NativeT>
   }
   Reserve(elements, literal.get());
 
-  tensorflow::gtl::InlinedVector<int64, kInlineRank> output_indices(
-      bounds.size(), 0);
+  DimensionVector output_indices(bounds.size(), 0);
   tensorflow::gtl::ArraySlice<int64> input_indices = output_indices;
   input_indices.remove_prefix(1);
 
diff --git a/tensorflow/compiler/xla/literal_util_test.cc b/tensorflow/compiler/xla/literal_util_test.cc
index 91971c3e24c326148322202ffb684285d980d4c7..9a09822174d9c93c8195af193f34017268bbc503 100644
--- a/tensorflow/compiler/xla/literal_util_test.cc
+++ b/tensorflow/compiler/xla/literal_util_test.cc
@@ -23,6 +23,8 @@ limitations under the License.
 #include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/test.h"
 #include "tensorflow/compiler/xla/types.h"
+#include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/platform/macros.h"
 #include "tensorflow/core/platform/types.h"
 
 namespace xla {
@@ -103,6 +105,9 @@ TEST_F(LiteralUtilTest, LiteralScalarToString) {
 
   auto f32_lit = LiteralUtil::CreateR0<float>(3.14f);
   ASSERT_EQ("3.14", LiteralUtil::ToString(*f32_lit));
+
+  auto f16_lit = LiteralUtil::CreateR0<half>(static_cast<half>(0.5f));
+  ASSERT_EQ("0.5", LiteralUtil::ToString(*f16_lit));
 }
 
 TEST_F(LiteralUtilTest, LiteralVectorToString) {
@@ -371,6 +376,15 @@ TEST_F(LiteralUtilTest, IsAll) {
   EXPECT_FALSE(
       LiteralUtil::IsAll(*LiteralUtil::CreateR2<uint64>({{9, 8}, {8, 8}}), 8));
 
+  half h8(8.0f);
+  half h9(9.0f);
+  EXPECT_TRUE(
+      LiteralUtil::IsAll(*LiteralUtil::CreateR2<half>({{h8}, {h8}}), 8));
+  EXPECT_FALSE(
+      LiteralUtil::IsAll(*LiteralUtil::CreateR2<half>({{h8}, {h9}}), 8));
+  EXPECT_FALSE(
+      LiteralUtil::IsAll(*LiteralUtil::CreateR2<half>({{h9}, {h8}}), 8));
+
   auto uint64_max = std::numeric_limits<uint64>::max();
   EXPECT_FALSE(LiteralUtil::IsAll(
       *LiteralUtil::CreateR2<uint64>(
@@ -467,6 +481,26 @@ TEST_F(LiteralUtilTest, ReshapeR4) {
   EXPECT_TRUE(LiteralUtil::Equal(*expected, *reshape));
 }
 
+TEST_F(LiteralUtilTest, ReshapeR4Dim0Minor) {
+  // clang-format off
+  // F32[1x3x2x4]
+  auto original = LiteralUtil::CreateR4WithLayout<float>({{
+     {{10, 11, 12, 13}, {14, 15, 16, 17}},
+     {{18, 19, 20, 21}, {22, 23, 24, 25}},
+     {{26, 27, 28, 29}, {30, 31, 32, 33}},
+  }}, layout_r4_dim0minor_);
+  // F32[1x3x4x2]
+  auto expected = LiteralUtil::CreateR3WithLayout<float>({
+    {{10, 11}, {12, 13}, {14, 15}, {16, 17}},
+    {{18, 19}, {20, 21}, {22, 23}, {24, 25}},
+    {{26, 27}, {28, 29}, {30, 31}, {32, 33}},
+  }, layout_r3_dim0major_);
+  // clang-format on
+  auto reshape = LiteralUtil::Reshape(*original, {3, 4, 2}).ConsumeValueOrDie();
+
+  EXPECT_TRUE(LiteralUtil::Equal(*expected, *reshape));
+}
+
 TEST_F(LiteralUtilTest, TransposeR0) {
   auto original = LiteralUtil::CreateR0<float>(1.7f);
   auto reshape = LiteralUtil::Transpose(*original, /*permutation=*/{});
@@ -637,6 +671,30 @@ TEST_F(LiteralUtilTest, PopulateWithValueR2U64) {
   EXPECT_TRUE(LiteralUtil::Equal(output, *expected));
 }
 
+TEST_F(LiteralUtilTest, PopulateWithValueR0F16) {
+  Literal output;
+  half h(0.25f);
+  LiteralUtil::PopulateWithValue<half>(h, {}, &output);
+  auto expected = LiteralUtil::CreateR0<half>(h);
+  EXPECT_TRUE(LiteralUtil::Equal(output, *expected));
+}
+
+TEST_F(LiteralUtilTest, PopulateWithValueR1F16) {
+  Literal output;
+  half h(0.5f);
+  LiteralUtil::PopulateWithValue<half>(h, {3}, &output);
+  auto expected = LiteralUtil::CreateR1<half>({h, h, h});
+  EXPECT_TRUE(LiteralUtil::Equal(output, *expected));
+}
+
+TEST_F(LiteralUtilTest, PopulateWithValueR2F16) {
+  Literal output;
+  half h(2.0f);
+  LiteralUtil::PopulateWithValue<half>(h, {2, 2}, &output);
+  auto expected = LiteralUtil::CreateR2<half>({{h, h}, {h, h}});
+  EXPECT_TRUE(LiteralUtil::Equal(output, *expected));
+}
+
 TEST_F(LiteralUtilTest, ReplicateR2U32) {
   auto input = LiteralUtil::CreateR2<uint32>(
       {{1, 2, 3, 4}, {5, 6, 7, 8}, {9, 10, 11, 12}});
@@ -648,5 +706,156 @@ TEST_F(LiteralUtilTest, ReplicateR2U32) {
   EXPECT_TRUE(LiteralUtil::Equal(*output, *expected));
 }
 
+TEST_F(LiteralUtilTest, Copy) {
+  const int64 dimensions[] = {17, 15, 34, 21};
+  const int64 layouts[][4] = {
+      {3, 2, 1, 0}, {0, 2, 1, 3}, {0, 1, 2, 3}, {2, 0, 3, 1}, {1, 3, 0, 2}};
+  for (const auto& layout : layouts) {
+    Shape shape = ShapeUtil::MakeShapeWithLayout(
+        primitive_util::NativeToPrimitiveType<uint32>(), dimensions, layout);
+    auto blank = LiteralUtil::CreateFromShape(shape);
+    auto source = LiteralUtil::CreateFromShape(shape);
+    const int64 zero_base[] = {0, 0, 0, 0};
+    const int64 step[] = {1, 1, 1, 1};
+    uint32 seqnr = 0;
+    auto init_proc = [&](const std::vector<int64>& indexes) {
+      LiteralUtil::Set(source.get(), indexes, ++seqnr);
+      return true;
+    };
+
+    ShapeUtil::ForEachIndex(source->shape(), zero_base, dimensions, step,
+                            init_proc);
+
+    const int64 src_base[] = {3, 1, 5, 7};
+    const int64 dest_base[] = {6, 4, 12, 2};
+    const int64 copy_size[] = {7, 8, 11, 9};
+
+    TF_EXPECT_OK(LiteralUtil::Copy(*source, src_base, blank.get(), dest_base,
+                                   copy_size));
+    std::vector<int64> source_indexes(TF_ARRAYSIZE(dimensions), 0);
+    std::vector<int64> blank_indexes(TF_ARRAYSIZE(dimensions), 0);
+    bool matched = true;
+    auto check_proc = [&](const std::vector<int64>& indexes) {
+      std::copy(indexes.begin(), indexes.end(), source_indexes.begin());
+      std::transform(source_indexes.begin(), source_indexes.end(), src_base,
+                     source_indexes.begin(), std::plus<int64>());
+      std::copy(indexes.begin(), indexes.end(), blank_indexes.begin());
+      std::transform(blank_indexes.begin(), blank_indexes.end(), dest_base,
+                     blank_indexes.begin(), std::plus<int64>());
+      auto bval = LiteralUtil::Get<uint32>(*blank, blank_indexes);
+      matched = (bval != 0 &&
+                 bval == LiteralUtil::Get<uint32>(*source, source_indexes));
+      return matched;
+    };
+    ShapeUtil::ForEachIndex(source->shape(), zero_base, copy_size, step,
+                            check_proc);
+    EXPECT_TRUE(matched);
+  }
+}
+
+TEST_F(LiteralUtilTest, CopyScalars) {
+  auto zero = LiteralUtil::CreateR0<uint32>(0);
+  auto nine = LiteralUtil::CreateR0<uint32>(9);
+  TF_EXPECT_OK(LiteralUtil::Copy(*nine, {}, zero.get(), {}, {}));
+  EXPECT_TRUE(LiteralUtil::Equal(*zero, *nine));
+
+  auto vect = LiteralUtil::CreateR1<uint32>({3, 4, 9, 12, 5, 17, 21});
+  TF_EXPECT_OK(LiteralUtil::Copy(*vect, {5}, zero.get(), {}, {}));
+  EXPECT_EQ(LiteralUtil::Get<uint32>(*zero, {}), 17);
+  TF_EXPECT_OK(LiteralUtil::Copy(*zero, {}, vect.get(), {4}, {}));
+  EXPECT_EQ(LiteralUtil::Get<uint32>(*vect, {4}), 17);
+}
+
+TEST_F(LiteralUtilTest, F16) {
+  // Verify that the internal data views are consistent and that they
+  // are in little endian format
+  // TODO - modify if we make the data format machine endianess dependent
+  auto m1 = LiteralUtil::CreateFromShape(ShapeUtil::MakeShape(F16, {2, 2}));
+  Literal* l1 = m1.get();
+  const char* d1 = (const char*)LiteralUtil::InternalData(*l1);
+  EXPECT_EQ(d1[0], 0);
+  EXPECT_EQ(d1[1], 0);
+  EXPECT_EQ(d1[2], 0);
+  EXPECT_EQ(d1[3], 0);
+  EXPECT_EQ(d1[4], 0);
+  EXPECT_EQ(d1[5], 0);
+  EXPECT_EQ(d1[6], 0);
+  EXPECT_EQ(d1[7], 0);
+  EXPECT_EQ(LiteralUtil::InternalData(*l1),
+            LiteralUtil::MutableInternalData(l1));
+
+  half h1(1.0f);
+  half h2(2.0f);
+  auto m2 = LiteralUtil::CreateR2<half>({{h1, h2}, {h2, h1}});
+  Literal* l2 = m2.get();
+  const char* d2 = (const char*)LiteralUtil::InternalData(*l2);
+  EXPECT_EQ(d2[0], 0);
+  EXPECT_EQ(d2[1], 0x3C);
+  EXPECT_EQ(d2[2], 0);
+  EXPECT_EQ(d2[3], 0x40);
+  EXPECT_EQ(d2[4], 0);
+  EXPECT_EQ(d2[5], 0x40);
+  EXPECT_EQ(d2[6], 0);
+  EXPECT_EQ(d2[7], 0x3C);
+  EXPECT_EQ(LiteralUtil::InternalData(*l2),
+            LiteralUtil::MutableInternalData(l2));
+}
+
+TEST_F(LiteralUtilTest, Populate) {
+  struct PopulateData {
+    std::vector<int64> dimensions;
+    std::vector<int64> layout;
+  } populate_data[] = {
+      {{}, {}},
+      {{16}, {0}},
+      {{4, 16}, {1, 0}},
+      {{21, 12}, {0, 1}},
+      {{6, 11, 17}, {2, 0, 1}},
+      {{6, 11, 5, 17}, {3, 2, 0, 1}},
+  };
+  for (const auto& data : populate_data) {
+    Shape shape = ShapeUtil::MakeShapeWithLayout(
+        primitive_util::NativeToPrimitiveType<uint32>(), data.dimensions,
+        data.layout);
+    auto literal = LiteralUtil::CreateFromShape(shape);
+    auto generator = [&](tensorflow::gtl::ArraySlice<int64> indexes) -> uint32 {
+      // Offsets from linear index just to avoid R0 literals to be initialized
+      // with zero.
+      return LiteralUtil::LinearIndex(*literal, indexes) + 17;
+    };
+    TF_EXPECT_OK(LiteralUtil::Populate<uint32>(literal.get(), generator));
+
+    std::vector<int64> zero_base(data.dimensions.size(), 0);
+    std::vector<int64> step(data.dimensions.size(), 1);
+    bool matched = true;
+    auto check_function = [&](const std::vector<int64>& indexes) {
+      auto value = LiteralUtil::Get<uint32>(*literal, indexes);
+      matched = matched && (value == generator(indexes));
+      return matched;
+    };
+    ShapeUtil::ForEachIndex(literal->shape(), zero_base, data.dimensions, step,
+                            check_function);
+    EXPECT_TRUE(matched);
+  }
+}
+
+TEST_F(LiteralUtilTest, ConvertR4) {
+  // clang-format off
+  auto original = LiteralUtil::CreateR4WithLayout<int8>({{
+     {{10, 11, 12, 13}, {14, 15, 16, 17}},
+     {{18, 19, 20, 21}, {22, 23, 24, 25}},
+     {{26, 27, 28, 29}, {30, 31, 32, 33}},
+  }}, layout_r4_dim0major_);
+  auto expected = LiteralUtil::CreateR4WithLayout<uint32>({{
+     {{10, 11, 12, 13}, {14, 15, 16, 17}},
+     {{18, 19, 20, 21}, {22, 23, 24, 25}},
+     {{26, 27, 28, 29}, {30, 31, 32, 33}},
+  }}, layout_r4_dim0major_);
+  // clang-format on
+  auto converted = LiteralUtil::Convert<int8, uint32>(*original);
+
+  EXPECT_TRUE(LiteralUtil::Equal(*expected, *converted));
+}
+
 }  // namespace
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/metric_table_report.cc b/tensorflow/compiler/xla/metric_table_report.cc
index cd7c42f6e17e15b5e1c6ebfa1f24a40a9003a63e..0d4ddc239243b79d47b6a1672b65abe9b23e7b52 100644
--- a/tensorflow/compiler/xla/metric_table_report.cc
+++ b/tensorflow/compiler/xla/metric_table_report.cc
@@ -38,7 +38,8 @@ void MetricTableReport::SetEntryName(string entry_name) {
 
 void MetricTableReport::SetShowAllEntries() {
   max_entries_to_show_ = std::numeric_limits<int64>::max();
-  max_metric_proportion_to_show = 1.1;  // more than 100%
+  max_entries_per_category_to_show_ = std::numeric_limits<int64>::max();
+  max_metric_proportion_to_show_ = 1.1;  // more than 100%
 }
 
 void MetricTableReport::SetShowCategoryTable() { show_category_table_ = true; }
@@ -141,7 +142,7 @@ void MetricTableReport::AppendCategoryTable() {
   int64 categories_shown = 0;
   for (const auto& category : categories) {
     if (categories_shown >= max_entries_to_show_ ||
-        metric_sum / expected_metric_sum_ > max_metric_proportion_to_show) {
+        metric_sum / expected_metric_sum_ > max_metric_proportion_to_show_) {
       break;
     }
     ++categories_shown;
@@ -156,15 +157,14 @@ void MetricTableReport::AppendCategoryTable() {
                                    entry_name_, ")");
     AppendTableRow(text, category.metric_sum, metric_sum);
 
-    // Show the top few entries in the category.
-    const int64 kMaxToShow = 5;
+    // Show the top entries in the category.
     const char* const kIndentPrefix = "                              * ";
-    int64 entries_to_show =
-        std::min<int64>(kMaxToShow, category.entries.size());
-    if (category.entries.size() == kMaxToShow + 1) {
+    int64 entries_to_show = std::min<int64>(max_entries_per_category_to_show_,
+                                            category.entries.size());
+    if (category.entries.size() == entries_to_show + 1) {
       // May as well show the last entry on the line that would otherwise say
       // that there is a single entry not shown.
-      entries_to_show = category.entries.size();
+      ++entries_to_show;
     }
     for (int64 i = 0; i < entries_to_show; ++i) {
       AppendLine(kIndentPrefix, MetricPercent(category.entries[i]->metric), " ",
@@ -193,7 +193,7 @@ void MetricTableReport::AppendEntryTable() {
   int64 entries_shown = 0;
   for (const auto& entry : entries_) {
     if (entries_shown >= max_entries_to_show_ ||
-        metric_sum / expected_metric_sum_ > max_metric_proportion_to_show) {
+        metric_sum / expected_metric_sum_ > max_metric_proportion_to_show_) {
       break;
     }
     ++entries_shown;
diff --git a/tensorflow/compiler/xla/metric_table_report.h b/tensorflow/compiler/xla/metric_table_report.h
index e967627bff4446a695bfae514faac4b1acca4968..818fb1d3fe0b8bbe1a8eba363ff6445e2f3df9d2 100644
--- a/tensorflow/compiler/xla/metric_table_report.h
+++ b/tensorflow/compiler/xla/metric_table_report.h
@@ -103,6 +103,7 @@ class MetricTableReport {
  private:
   static constexpr double kDefaultMaxMetricProportionToShow = 0.99;
   static constexpr int64 kDefaultMaxEntriesToShow = 100;
+  static constexpr int64 kDefaultMaxEntriesPerCategoryToShow = 5;
 
   // Append all parameters to the report.
   template <typename... Args>
@@ -162,7 +163,8 @@ class MetricTableReport {
 
   // These members control how many categories and entries to show in tables.
   int64 max_entries_to_show_ = kDefaultMaxEntriesToShow;
-  double max_metric_proportion_to_show = kDefaultMaxMetricProportionToShow;
+  int64 max_entries_per_category_to_show_ = kDefaultMaxEntriesPerCategoryToShow;
+  double max_metric_proportion_to_show_ = kDefaultMaxMetricProportionToShow;
 
   // The report that is being created.
   string report_;
diff --git a/tensorflow/compiler/xla/port/BUILD b/tensorflow/compiler/xla/port/BUILD
deleted file mode 100644
index 6fc5f1185c9d56075f18928e4b2c8e3819cf9ddd..0000000000000000000000000000000000000000
--- a/tensorflow/compiler/xla/port/BUILD
+++ /dev/null
@@ -1,33 +0,0 @@
-licenses(["notice"])  # Apache 2.0
-
-# Filegroup used to collect source files for dependency checking.
-filegroup(
-    name = "c_srcs",
-    data = glob([
-        "**/*.cc",
-        "**/*.h",
-    ]),
-    visibility = ["//tensorflow/compiler/xla:internal"],
-)
-
-cc_library(
-    name = "initialize",
-    hdrs = ["initialize.h"],
-    visibility = [
-        "//tensorflow/compiler/xla:__subpackages__",
-    ],
-)
-
-# -----------------------------------------------------------------------------
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/compiler/xla/primitive_util.cc b/tensorflow/compiler/xla/primitive_util.cc
index e3909ae8e9736351d3ee91332572b5db62727289..e4e37177a2d74e6da20300f1439942a146ad8d49 100644
--- a/tensorflow/compiler/xla/primitive_util.cc
+++ b/tensorflow/compiler/xla/primitive_util.cc
@@ -78,6 +78,11 @@ PrimitiveType NativeToPrimitiveType<double>() {
   return F64;
 }
 
+template <>
+PrimitiveType NativeToPrimitiveType<half>() {
+  return F16;
+}
+
 bool IsFloatingPointType(PrimitiveType type) {
   return type == F16 || type == F32 || type == F64;
 }
diff --git a/tensorflow/compiler/xla/primitive_util.h b/tensorflow/compiler/xla/primitive_util.h
index 78f0ee6f592d9b9ec2ed85f23297634c5e2e4d41..162a11c7d2966346979b98c804917203f82c806c 100644
--- a/tensorflow/compiler/xla/primitive_util.h
+++ b/tensorflow/compiler/xla/primitive_util.h
@@ -75,6 +75,8 @@ template <>
 PrimitiveType NativeToPrimitiveType<float>();
 template <>
 PrimitiveType NativeToPrimitiveType<double>();
+template <>
+PrimitiveType NativeToPrimitiveType<half>();
 
 bool IsFloatingPointType(PrimitiveType type);
 
@@ -150,6 +152,10 @@ template <>
 struct PrimitiveTypeToNative<F64> {
   using type = double;
 };
+template <>
+struct PrimitiveTypeToNative<F16> {
+  using type = half;
+};
 
 }  // namespace primitive_util
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/reference_util.cc b/tensorflow/compiler/xla/reference_util.cc
index 5630033ac89b3aefbb8503f8e04fe268f9ab4da6..4194d5fc6be0ad552e9fe6dd14b51fa0a67f2eca 100644
--- a/tensorflow/compiler/xla/reference_util.cc
+++ b/tensorflow/compiler/xla/reference_util.cc
@@ -180,14 +180,28 @@ ReferenceUtil::ReduceWindow4DGeneric(
     const tensorflow::gtl::ArraySlice<int64>& stride, Padding padding) {
   std::vector<int64> dim_lengths{operand.n1(), operand.n2(), operand.n3(),
                                  operand.n4()};
-  auto padding_both = xla::MakePadding(dim_lengths, window, stride, padding);
+  return ReduceWindow4DGeneric(
+      operand, init, reduce_func, window, stride,
+      xla::MakePadding(dim_lengths, window, stride, padding));
+}
+
+/* static */ std::unique_ptr<Array4D<float>>
+ReferenceUtil::ReduceWindow4DGeneric(
+    const Array4D<float>& operand, float init,
+    const std::function<float(float, float)>& reduce_func,
+    const tensorflow::gtl::ArraySlice<int64>& window,
+    const tensorflow::gtl::ArraySlice<int64>& stride,
+    const tensorflow::gtl::ArraySlice<std::pair<int64, int64>>& padding) {
+  std::vector<int64> dim_lengths{operand.n1(), operand.n2(), operand.n3(),
+                                 operand.n4()};
 
   std::vector<int64> window_counts(window.size(), 0);
   std::vector<int64> pad_low(window.size(), 0);
   for (int64 i = 0; i < window.size(); ++i) {
+    int64 padded_width = padding[i].first + dim_lengths[i] + padding[i].second;
     window_counts[i] =
-        WindowCount(dim_lengths[i], window[i], stride[i], padding);
-    pad_low[i] = padding_both[i].first;
+        window_util::StridedBound(padded_width, window[i], stride[i]);
+    pad_low[i] = padding[i].first;
   }
   auto result = MakeUnique<Array4D<float>>(window_counts[0], window_counts[1],
                                            window_counts[2], window_counts[3]);
diff --git a/tensorflow/compiler/xla/reference_util.h b/tensorflow/compiler/xla/reference_util.h
index eb1eea7fc4c68a3a29cdf8b7eef9773b990b1bbc..f58f0bdc9f51dff62c10dda4aba7aac03e689ce7 100644
--- a/tensorflow/compiler/xla/reference_util.h
+++ b/tensorflow/compiler/xla/reference_util.h
@@ -162,6 +162,12 @@ class ReferenceUtil {
       const std::function<float(float, float)>& reduce_func,
       const tensorflow::gtl::ArraySlice<int64>& window,
       const tensorflow::gtl::ArraySlice<int64>& stride, Padding padding);
+  static std::unique_ptr<Array4D<float>> ReduceWindow4DGeneric(
+      const Array4D<float>& operand, float init,
+      const std::function<float(float, float)>& reduce_func,
+      const tensorflow::gtl::ArraySlice<int64>& window,
+      const tensorflow::gtl::ArraySlice<int64>& stride,
+      const tensorflow::gtl::ArraySlice<std::pair<int64, int64>>& padding);
 
   // Performs select and scatter with Greater Than or equal as the select, plus
   // as the scatter, and Same Padding.
@@ -400,7 +406,46 @@ class ReferenceUtil {
                                    const PaddingConfig& padding,
                                    const float pad);
 
+  // ApplyElementwise2D(f, x, y, ...) returns the Array2D formed by running
+  // f(x[i], y[i], ...) for each array element in the Array2Ds x, y, ....
+  //
+  // The given arrays must have the same size and element type, and the return
+  // type of f must be implicitly convertible to the arrays' element type.
+  //
+  // Example usage:
+  //
+  //   Array2D<float> x, y, z = ...;
+  //   std::unique_ptr<Array2D> result = ReferenceUtil::ApplyElementwise2D(
+  //     [](float a, float b, float c) { return a * b + c; }, x, y, z);
+  //
+  template <typename F, typename T1, typename... Ts>
+  static std::unique_ptr<Array2D<T1>> ApplyElementwise2D(
+      F&& f, const Array2D<T1>& array1, const Array2D<Ts>&... arrays) {
+    AssertSameSize2D(array1, arrays...);
+    auto result = MakeUnique<Array2D<T1>>(array1.n1(), array1.n2());
+    for (int64 i = 0; i < array1.n1(); ++i) {
+      for (int64 j = 0; j < array1.n2(); ++j) {
+        (*result)(i, j) = f(array1(i, j), arrays(i, j)...);
+      }
+    }
+    return result;
+  }
+
  private:
+  template <typename T1, typename T2, typename... Ts>
+  static void AssertSameSize2D(const Array2D<T1>& array1,
+                               const Array2D<T2>& array2,
+                               const Array2D<Ts>&... arrays) {
+    static_assert(std::is_same<T1, T2>::value, "Args must be same type.");
+    CHECK_EQ(array1.n1(), array2.n1());
+    CHECK_EQ(array1.n2(), array2.n2());
+    AssertSameSize2D(array2, arrays...);
+  }
+
+  // Recursive base case for AssertSameSize2D.
+  template <typename Array1>
+  static void AssertSameSize2D(const Array1& array1) {}
+
   TF_DISALLOW_COPY_AND_ASSIGN(ReferenceUtil);
 };
 
diff --git a/tensorflow/compiler/xla/reference_util_test.cc b/tensorflow/compiler/xla/reference_util_test.cc
index b0aa55840283c011099ef0f4263307a4ef101382..f839ac019df07c5c5e07eed856ea55463bb3efae 100644
--- a/tensorflow/compiler/xla/reference_util_test.cc
+++ b/tensorflow/compiler/xla/reference_util_test.cc
@@ -52,9 +52,9 @@ class ReferenceUtilTest : public ::testing::Test {
 
 TEST_F(ReferenceUtilTest, TransposeArray2D) {
   auto result = ReferenceUtil::TransposeArray2D(*matrix_);
-  auto result_literal = LiteralUtil::CreateR2FromArray2D(*result);
+  auto actual_literal = LiteralUtil::CreateR2FromArray2D(*result);
   LiteralTestUtil::ExpectR2Near<float>({{1.f, 4.f}, {2.f, 5.f}, {3.f, 6.f}},
-                                       *result_literal, ErrorSpec(0.0001));
+                                       *actual_literal, ErrorSpec(0.0001));
 }
 
 TEST_F(ReferenceUtilTest, MatmulArray2D) {
@@ -62,32 +62,32 @@ TEST_F(ReferenceUtilTest, MatmulArray2D) {
       {7.f, 8.f}, {9.f, 10.f}, {11.f, 12.f},
   });
   auto result = ReferenceUtil::MatmulArray2D(*matrix_, rhs);
-  auto result_literal = LiteralUtil::CreateR2FromArray2D(*result);
+  auto actual_literal = LiteralUtil::CreateR2FromArray2D(*result);
   LiteralTestUtil::ExpectR2Near<float>({{58.f, 64.f}, {139.f, 154.f}},
-                                       *result_literal, ErrorSpec(0.0001));
+                                       *actual_literal, ErrorSpec(0.0001));
 }
 
 TEST_F(ReferenceUtilTest, ReduceToColArray2D) {
   auto add = [](float lhs, float rhs) { return lhs + rhs; };
   auto result = ReferenceUtil::ReduceToColArray2D(*matrix_, 0.0f, add);
-  auto result_literal = LiteralUtil::CreateR1<float>(*result);
-  LiteralTestUtil::ExpectR1Near<float>({6.f, 15.f}, *result_literal,
+  auto actual_literal = LiteralUtil::CreateR1<float>(*result);
+  LiteralTestUtil::ExpectR1Near<float>({6.f, 15.f}, *actual_literal,
                                        ErrorSpec(0.0001));
 }
 
 TEST_F(ReferenceUtilTest, ReduceToRowArray2D) {
   auto add = [](float lhs, float rhs) { return lhs + rhs; };
   auto result = ReferenceUtil::ReduceToRowArray2D(*matrix_, 0.0f, add);
-  auto result_literal = LiteralUtil::CreateR1<float>(*result);
-  LiteralTestUtil::ExpectR1Near<float>({5.f, 7.f, 9.f}, *result_literal,
+  auto actual_literal = LiteralUtil::CreateR1<float>(*result);
+  LiteralTestUtil::ExpectR1Near<float>({5.f, 7.f, 9.f}, *actual_literal,
                                        ErrorSpec(0.0001));
 }
 
 TEST_F(ReferenceUtilTest, MapArray2D) {
   auto identity = [](float value) { return log(exp(value)); };
   auto result = ReferenceUtil::MapArray2D(*matrix_, identity);
-  auto result_literal = LiteralUtil::CreateR2FromArray2D(*result);
-  LiteralTestUtil::ExpectR2NearArray2D(*matrix_, *result_literal,
+  auto actual_literal = LiteralUtil::CreateR2FromArray2D(*result);
+  LiteralTestUtil::ExpectR2NearArray2D(*matrix_, *actual_literal,
                                        ErrorSpec(0.0001));
 }
 
@@ -96,9 +96,9 @@ TEST_F(ReferenceUtilTest, MapWithIndexArray2D) {
     return value + row + col;
   };
   auto result = ReferenceUtil::MapWithIndexArray2D(*matrix_, add_index);
-  auto result_literal = LiteralUtil::CreateR2FromArray2D(*result);
+  auto actual_literal = LiteralUtil::CreateR2FromArray2D(*result);
   LiteralTestUtil::ExpectR2Near<float>({{1.f, 3.f, 5.f}, {5.f, 7.f, 9.f}},
-                                       *result_literal, ErrorSpec(0.0001));
+                                       *actual_literal, ErrorSpec(0.0001));
 }
 
 TEST_F(ReferenceUtilTest, MapArray4D) {
@@ -107,11 +107,11 @@ TEST_F(ReferenceUtilTest, MapArray4D) {
   input->FillWithMultiples(1.0f);
   auto multiply_by_two = [](float value) { return 2 * value; };
   auto result = ReferenceUtil::MapArray4D(*input, multiply_by_two);
-  auto result_literal = LiteralUtil::CreateR4FromArray4D(*result);
+  auto actual_literal = LiteralUtil::CreateR4FromArray4D(*result);
 
   Array4D<float> expected(/*planes=*/2, /*depth=*/3, /*height=*/4, /*width=*/5);
   expected.FillWithMultiples(2.0f);
-  LiteralTestUtil::ExpectR4NearArray4D(expected, *result_literal,
+  LiteralTestUtil::ExpectR4NearArray4D(expected, *actual_literal,
                                        ErrorSpec(0.0001));
 }
 
@@ -124,11 +124,11 @@ TEST_F(ReferenceUtilTest, MapWithIndexArray4D) {
     return value - (3 * 4 * 5 * plane + 4 * 5 * depth + 5 * height + width);
   };
   auto result = ReferenceUtil::MapWithIndexArray4D(*input, subtract_index);
-  auto result_literal = LiteralUtil::CreateR4FromArray4D(*result);
+  auto actual_literal = LiteralUtil::CreateR4FromArray4D(*result);
 
   Array4D<float> expected(/*planes=*/2, /*depth=*/3, /*height=*/4, /*width=*/5);
   expected.Fill(0.0f);
-  LiteralTestUtil::ExpectR4NearArray4D(expected, *result_literal,
+  LiteralTestUtil::ExpectR4NearArray4D(expected, *actual_literal,
                                        ErrorSpec(0.0001));
 }
 
@@ -302,5 +302,17 @@ TEST_F(ReferenceUtilTest, ConvGeneralDimensionsWithValidPadding) {
                                               ErrorSpec(0.0001));
 }
 
+TEST_F(ReferenceUtilTest, ApplyElementwise2D) {
+  Array2D<float> a({{1, 2}, {3, 4}});
+  Array2D<float> b({{10, 20}, {30, 40}});
+  Array2D<float> c({{100, 200}, {300, 400}});
+
+  auto actual = ReferenceUtil::ApplyElementwise2D(
+      [](float x, float y, float z) { return 100 * x + 10 * y + z; }, a, b, c);
+  auto actual_literal = LiteralUtil::CreateR2FromArray2D(*actual);
+  LiteralTestUtil::ExpectR2Near({{300.f, 600.f}, {900.f, 1200.f}},
+                                *actual_literal, ErrorSpec(0.0001));
+}
+
 }  // namespace
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/BUILD b/tensorflow/compiler/xla/service/BUILD
index e5a921674f675168b0c30198ce25146d7bc91302..75a0f6f0f3be116343343b6ef45afc3913e35c61 100644
--- a/tensorflow/compiler/xla/service/BUILD
+++ b/tensorflow/compiler/xla/service/BUILD
@@ -195,7 +195,6 @@ cc_library(
     deps = [
         ":hlo",
         "//tensorflow/compiler/xla:status_macros",
-        "//tensorflow/compiler/xla:statusor",
         "//tensorflow/compiler/xla:util",
         "//tensorflow/core:lib",
     ],
@@ -407,6 +406,27 @@ cc_library(
     ],
 )
 
+cc_library(
+    name = "compile_only_service",
+    srcs = ["compile_only_service.cc"],
+    hdrs = ["compile_only_service.h"],
+    deps = [
+        ":backend",
+        ":compiler",
+        ":computation_layout",
+        ":computation_tracker",
+        ":platform_util",
+        ":service",
+        "//tensorflow/compiler/xla:status_macros",
+        "//tensorflow/compiler/xla:statusor",
+        "//tensorflow/compiler/xla:types",
+        "//tensorflow/compiler/xla:util",
+        "//tensorflow/compiler/xla:xla_data_proto",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:stream_executor_no_cuda",
+    ],
+)
+
 cc_library(
     name = "cpu_plugin",
     deps = [
@@ -624,6 +644,7 @@ cc_library(
         "buffer_liveness.h",
     ],
     deps = [
+        ":call_graph",
         ":hlo",
         ":hlo_ordering",
         ":liveness_util",
@@ -664,8 +685,8 @@ cc_library(
     ],
     deps = [
         ":buffer_liveness",
-        ":heap_simulator",
         ":hlo",
+        ":hlo_ordering",
         ":logical_buffer",
         ":tuple_points_to_analysis",
         "//tensorflow/compiler/xla:shape_util",
@@ -705,50 +726,38 @@ cc_test(
     ],
 )
 
-cc_library(
-    name = "heap_simulator",
-    srcs = [
-        "heap_simulator.cc",
-    ],
-    hdrs = [
-        "heap_simulator.h",
-    ],
-    deps = [
-        ":hlo",
-        ":liveness_util",
-        ":logical_buffer",
-        ":tuple_points_to_analysis",
-        "//tensorflow/compiler/xla:statusor",
-        "//tensorflow/compiler/xla:util",
-        "//tensorflow/core:lib",
-    ],
-)
-
 cc_test(
     name = "heap_simulator_test",
     srcs = ["heap_simulator_test.cc"],
     deps = [
-        ":heap_simulator",
         ":hlo",
+        ":hlo_ordering",
         ":logical_buffer",
         ":tuple_points_to_analysis",
+        "//tensorflow/compiler/xla:literal_util",
         "//tensorflow/compiler/xla:status_macros",
         "//tensorflow/compiler/xla/tests:hlo_test_base",
+        "//tensorflow/core:lib",
         "//tensorflow/core:test_main",
     ],
 )
 
+# The hlo_ordering library contains both hlo_ordering and heap_simulator because
+# they are mutually dependent.
 cc_library(
     name = "hlo_ordering",
     srcs = [
+        "heap_simulator.cc",
         "hlo_ordering.cc",
     ],
     hdrs = [
+        "heap_simulator.h",
         "hlo_ordering.h",
     ],
     deps = [
-        ":heap_simulator",
+        ":call_graph",
         ":hlo",
+        ":liveness_util",
         ":logical_buffer",
         ":tuple_points_to_analysis",
         "//tensorflow/compiler/xla:shape_util",
@@ -858,7 +867,9 @@ cc_library(
         ":hlo_pass",
         "//tensorflow/compiler/xla:literal_util",
         "//tensorflow/compiler/xla:shape_util",
+        "//tensorflow/compiler/xla:status_macros",
         "//tensorflow/compiler/xla:util",
+        "//tensorflow/core:lib",
     ],
 )
 
@@ -1190,6 +1201,7 @@ cc_library(
         ":buffer_liveness",
         ":hlo",
         ":hlo_pass",
+        ":liveness_util",
         ":logical_buffer",
         ":tuple_points_to_analysis",
         "//tensorflow/compiler/xla:status_macros",
@@ -1254,6 +1266,7 @@ cc_library(
         ":hlo_cost_analysis",
         ":hlo_dce",
         ":hlo_ordering",
+        ":liveness_util",
         ":logical_buffer",
         ":tuple_points_to_analysis",
         "//tensorflow/compiler/xla:shape_util",
@@ -1271,6 +1284,7 @@ cc_test(
     deps = [
         ":cpu_plugin",
         ":hlo",
+        ":hlo_matchers",
         ":hlo_ordering",
         ":hlo_rematerialization",
         "//tensorflow/compiler/xla:shape_util",
@@ -1384,6 +1398,7 @@ cc_test(
         ":cpu_plugin",
         ":hlo",
         ":hlo_cse",
+        ":hlo_matchers",
         "//tensorflow/compiler/xla:literal_util",
         "//tensorflow/compiler/xla:shape_util",
         "//tensorflow/compiler/xla:types",
@@ -1412,6 +1427,28 @@ cc_library(
     ],
 )
 
+cc_test(
+    name = "hlo_constant_folding_test",
+    srcs = ["hlo_constant_folding_test.cc"],
+    deps = [
+        ":cpu_plugin",
+        ":hlo",
+        ":hlo_constant_folding",
+        ":hlo_matchers",
+        ":hlo_pass",
+        "//tensorflow/compiler/xla:literal_util",
+        "//tensorflow/compiler/xla:shape_util",
+        "//tensorflow/compiler/xla:test",
+        "//tensorflow/compiler/xla:types",
+        "//tensorflow/compiler/xla:util",
+        "//tensorflow/compiler/xla:xla_data_proto",
+        "//tensorflow/compiler/xla/tests:hlo_test_base",
+        "//tensorflow/compiler/xla/tests:literal_test_util",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:test_main",
+    ],
+)
+
 cc_library(
     name = "device_memory_allocator",
     srcs = ["device_memory_allocator.cc"],
diff --git a/tensorflow/compiler/xla/service/algebraic_simplifier.cc b/tensorflow/compiler/xla/service/algebraic_simplifier.cc
index 6acb9bdcbac2e79538d14d94003e15d11058f1a9..3f888b4c2e378bd88fcafa02171fef52ccd758f9 100644
--- a/tensorflow/compiler/xla/service/algebraic_simplifier.cc
+++ b/tensorflow/compiler/xla/service/algebraic_simplifier.cc
@@ -1,3 +1,4 @@
+
 /* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
 
 Licensed under the Apache License, Version 2.0 (the "License");
@@ -51,6 +52,16 @@ bool IsLiteralWithValue(const HloInstruction* operand, int8 value) {
          LiteralUtil::IsAll(operand->literal(), value);
 }
 
+bool IsAll(const HloInstruction* op, int8 value) {
+  if (IsLiteralWithValue(op, value)) {
+    return true;
+  }
+  if (op->opcode() == HloOpcode::kBroadcast && IsAll(op->operand(0), value)) {
+    return true;
+  }
+  return false;
+}
+
 // Returns whether the given transpose produces a result which is bit-wise
 // identical to its operand and thus may be replaced with a bitcast.
 bool TransposeIsBitcast(const HloInstruction* transpose) {
@@ -150,9 +161,17 @@ class AlgebraicSimplifierVisitor : public DfsHloVisitorWithDefault {
                       tensorflow::gtl::ArraySlice<int64> dimensions,
                       HloComputation* function) override;
 
+  Status HandleReduceWindow(HloInstruction* reduce_window,
+                            HloInstruction* operand, const Window& window,
+                            HloComputation* function) override;
+
   Status HandleReverse(HloInstruction* reverse,
                        HloInstruction* operand) override;
   Status HandleSlice(HloInstruction* slice, HloInstruction* operand) override;
+  Status HandleDynamicUpdateSlice(HloInstruction* dynamic_update_slice,
+                                  HloInstruction* operand,
+                                  HloInstruction* update,
+                                  HloInstruction* start_indices) override;
 
   Status HandleTranspose(HloInstruction* transpose) override;
 
@@ -214,6 +233,29 @@ class AlgebraicSimplifierVisitor : public DfsHloVisitorWithDefault {
   StatusOr<bool> TryToSinkReshapeOrBroadcastAfterOpWithUniqueNonScalarOperand(
       HloInstruction* reshape_or_broadcast);
 
+  // Replaces the existing HLO instruction old_instruction, with
+  // new_instruction, and marks the optimizer status as changed.
+  // Returns the Status representing the result of the replace operation.
+  Status ReplaceWithNewInstruction(
+      HloInstruction* old_instruction,
+      std::unique_ptr<HloInstruction> new_instruction) {
+    TF_RETURN_IF_ERROR(computation_->ReplaceWithNewInstruction(
+        old_instruction, std::move(new_instruction)));
+    changed_ = true;
+    return Status::OK();
+  }
+
+  // Replaces the existing HLO instruction old_instruction, with
+  // new_instruction, and marks the optimizer status as changed.
+  // Returns the Status representing the result of the replace operation.
+  Status ReplaceInstruction(HloInstruction* old_instruction,
+                            HloInstruction* new_instruction) {
+    TF_RETURN_IF_ERROR(
+        computation_->ReplaceInstruction(old_instruction, new_instruction));
+    changed_ = true;
+    return Status::OK();
+  }
+
   // Current HloComputation instance the AlgebraicSimplifierVisitor is
   // traversing.
   HloComputation* computation_;
@@ -262,8 +304,7 @@ void AlgebraicSimplifierVisitor::ReplaceWithBitcast(
   auto bitcast = computation_->AddInstruction(
       HloInstruction::CreateUnary(instruction->shape(), HloOpcode::kBitcast,
                                   instruction->mutable_operand(0)));
-  TF_CHECK_OK(computation_->ReplaceInstruction(instruction, bitcast));
-  changed_ = true;
+  TF_CHECK_OK(ReplaceInstruction(instruction, bitcast));
 }
 
 bool AlgebraicSimplifierVisitor::ReplaceInstructionIfSameShape(
@@ -271,9 +312,7 @@ bool AlgebraicSimplifierVisitor::ReplaceInstructionIfSameShape(
   if (!SameShape(old_instruction, new_instruction)) {
     return false;
   }
-  TF_CHECK_OK(
-      computation_->ReplaceInstruction(old_instruction, new_instruction));
-  changed_ = true;
+  TF_CHECK_OK(ReplaceInstruction(old_instruction, new_instruction));
   return true;
 }
 
@@ -282,12 +321,12 @@ Status AlgebraicSimplifierVisitor::HandleAdd(HloInstruction* add,
                                              HloInstruction* rhs) {
   // A + 0 => A
   VLOG(10) << "trying transform [A + 0 => A]: " << add->ToString();
-  if (IsLiteralWithValue(rhs, 0) && ReplaceInstructionIfSameShape(add, lhs)) {
+  if (IsAll(rhs, 0) && ReplaceInstructionIfSameShape(add, lhs)) {
     return Status::OK();
   }
   // 0 + A => A
   VLOG(10) << "trying transform [0 + A => A]: " << add->ToString();
-  if (IsLiteralWithValue(lhs, 0) && ReplaceInstructionIfSameShape(add, rhs)) {
+  if (IsAll(lhs, 0) && ReplaceInstructionIfSameShape(add, rhs)) {
     return Status::OK();
   }
 
@@ -304,9 +343,32 @@ Status AlgebraicSimplifierVisitor::HandleCopy(HloInstruction* copy,
 Status AlgebraicSimplifierVisitor::HandleConcatenate(
     HloInstruction* concatenate,
     tensorflow::gtl::ArraySlice<HloInstruction*> operands) {
-  // Unary concatenates are useless.
   if (operands.size() == 1) {
+    // Unary concatenates are useless.
     ReplaceInstructionIfSameShape(concatenate, operands[0]);
+    return Status::OK();
+  }
+  // Filter out and remove empty operands.
+  std::vector<HloInstruction*> nonempty_operands;
+  for (HloInstruction* operand : operands) {
+    if (!ShapeUtil::HasZeroElements(operand->shape())) {
+      nonempty_operands.push_back(operand);
+    }
+  }
+  if (nonempty_operands.size() < operands.size()) {
+    HloInstruction* replacement;
+    if (nonempty_operands.empty()) {
+      replacement = operands[0];
+    } else if (nonempty_operands.size() == 1) {
+      replacement = nonempty_operands[0];
+    } else {
+      replacement =
+          computation_->AddInstruction(concatenate->CloneWithNewOperands(
+              concatenate->shape(), nonempty_operands));
+    }
+    VLOG(10) << "trying to replace " << concatenate->ToString() << " with "
+             << replacement->ToString();
+    ReplaceInstructionIfSameShape(concatenate, replacement);
   }
   return Status::OK();
 }
@@ -316,7 +378,7 @@ Status AlgebraicSimplifierVisitor::HandleSubtract(HloInstruction* sub,
                                                   HloInstruction* rhs) {
   // A - 0 => A
   VLOG(10) << "trying transform [A - 0 => A]: " << sub->ToString();
-  if (IsLiteralWithValue(rhs, 0) && ReplaceInstructionIfSameShape(sub, lhs)) {
+  if (IsAll(rhs, 0) && ReplaceInstructionIfSameShape(sub, lhs)) {
     return Status::OK();
   }
 
@@ -328,8 +390,7 @@ Status AlgebraicSimplifierVisitor::HandleDivide(HloInstruction* divide,
                                                 HloInstruction* rhs) {
   // A/1 => A
   VLOG(10) << "trying transform [A/1 => A]: " << divide->ToString();
-  if (IsLiteralWithValue(rhs, 1) &&
-      ReplaceInstructionIfSameShape(divide, lhs)) {
+  if (IsAll(rhs, 1) && ReplaceInstructionIfSameShape(divide, lhs)) {
     return Status::OK();
   }
 
@@ -340,8 +401,7 @@ Status AlgebraicSimplifierVisitor::HandleDivide(HloInstruction* divide,
         computation_->AddInstruction(HloInstruction::CreateBinary(
             divide->shape(), HloOpcode::kSubtract, lhs->mutable_operand(0),
             rhs->mutable_operand(0)));
-    changed_ = true;
-    return computation_->ReplaceWithNewInstruction(
+    return ReplaceWithNewInstruction(
         divide, HloInstruction::CreateUnary(divide->shape(), HloOpcode::kExp,
                                             subtract));
   }
@@ -368,8 +428,7 @@ Status AlgebraicSimplifierVisitor::HandleDot(HloInstruction* dot,
       ShapeUtil::HasZeroElements(rhs->shape())) {
     auto zero = computation_->AddInstruction(
         HloInstruction::CreateConstant(LiteralUtil::CreateR0(0.0f)));
-    changed_ = true;
-    return computation_->ReplaceWithNewInstruction(
+    return ReplaceWithNewInstruction(
         dot, HloInstruction::CreateBroadcast(dot->shape(), zero, {}));
   }
 
@@ -378,8 +437,7 @@ Status AlgebraicSimplifierVisitor::HandleDot(HloInstruction* dot,
     auto new_dot = computation_->AddInstruction(HloInstruction::CreateBinary(
         ShapeUtil::PermuteDimensions({1, 0}, dot->shape()), HloOpcode::kDot,
         rhs->mutable_operand(0), lhs->mutable_operand(0)));
-    changed_ = true;
-    return computation_->ReplaceWithNewInstruction(
+    return ReplaceWithNewInstruction(
         dot, HloInstruction::CreateTranspose(dot->shape(), new_dot, {1, 0}));
   }
 
@@ -387,8 +445,7 @@ Status AlgebraicSimplifierVisitor::HandleDot(HloInstruction* dot,
   //
   // A dot(a[M, 1], b[1, N]) = multiply(a [M,1], b [1, N])
   if (ShapeUtil::Rank(rhs->shape()) == 2 && rhs->shape().dimensions(0) == 1) {
-    changed_ = true;
-    return computation_->ReplaceWithNewInstruction(
+    return ReplaceWithNewInstruction(
         dot, HloInstruction::CreateBinary(dot->shape(), HloOpcode::kMultiply,
                                           lhs, rhs));
   }
@@ -412,8 +469,7 @@ Status AlgebraicSimplifierVisitor::HandleDot(HloInstruction* dot,
     auto reduce = computation_->AddInstruction(HloInstruction::CreateReduce(
         ShapeUtil::MakeShape(dot->shape().element_type(), {}), multiply, zero,
         {0}, add_reduce_computation));
-    changed_ = true;
-    return computation_->ReplaceWithNewInstruction(
+    return ReplaceWithNewInstruction(
         dot, HloInstruction::CreateReshape(dot->shape(), reduce));
   }
 
@@ -452,8 +508,7 @@ Status AlgebraicSimplifierVisitor::HandleDot(HloInstruction* dot,
                                {rhs->shape().dimensions(1)}),
           multiply, zero, {0}, add_reduce_computation));
     }
-    changed_ = true;
-    return computation_->ReplaceWithNewInstruction(
+    return ReplaceWithNewInstruction(
         dot, HloInstruction::CreateReshape(dot->shape(), reduce));
   }
 
@@ -479,8 +534,7 @@ Status AlgebraicSimplifierVisitor::HandleDot(HloInstruction* dot,
         ShapeUtil::MakeShape(dot->shape().element_type(),
                              {lhs->shape().dimensions(0)}),
         multiply, zero, {1}, add_reduce_computation));
-    changed_ = true;
-    return computation_->ReplaceWithNewInstruction(
+    return ReplaceWithNewInstruction(
         dot, HloInstruction::CreateReshape(dot->shape(), reduce));
   }
   return Status::OK();
@@ -491,14 +545,12 @@ Status AlgebraicSimplifierVisitor::HandleMultiply(HloInstruction* multiply,
                                                   HloInstruction* rhs) {
   // A*1 => A
   VLOG(10) << "trying transform [A*1 => A]: " << multiply->ToString();
-  if (IsLiteralWithValue(rhs, 1) &&
-      ReplaceInstructionIfSameShape(multiply, lhs)) {
+  if (IsAll(rhs, 1) && ReplaceInstructionIfSameShape(multiply, lhs)) {
     return Status::OK();
   }
   // 1*A => A
   VLOG(10) << "trying transform [1*A => A]: " << multiply->ToString();
-  if (IsLiteralWithValue(lhs, 1) &&
-      ReplaceInstructionIfSameShape(multiply, rhs)) {
+  if (IsAll(lhs, 1) && ReplaceInstructionIfSameShape(multiply, rhs)) {
     return Status::OK();
   }
   return Status::OK();
@@ -619,8 +671,7 @@ Status AlgebraicSimplifierVisitor::HandleBroadcast(HloInstruction* broadcast) {
           ShapeUtil::ElementsIn(operand->shape())) {
     VLOG(10) << "transform broadcast(X) -> reshape(X) where "
                 "n(broadcast(X)) == n(X)";
-    changed_ = true;
-    return computation_->ReplaceWithNewInstruction(
+    return ReplaceWithNewInstruction(
         broadcast, HloInstruction::CreateReshape(broadcast->shape(), operand));
   }
 
@@ -632,8 +683,7 @@ Status AlgebraicSimplifierVisitor::HandleBroadcast(HloInstruction* broadcast) {
           ShapeUtil::ElementsIn(operand->shape())) {
     VLOG(10) << "transform broadcast(X) -> transpose(X) where "
                 "n(broadcast(X)) == n(X)";
-    changed_ = true;
-    return computation_->ReplaceWithNewInstruction(
+    return ReplaceWithNewInstruction(
         broadcast, HloInstruction::CreateTranspose(broadcast->shape(), operand,
                                                    broadcast->dimensions()));
   }
@@ -653,8 +703,7 @@ Status AlgebraicSimplifierVisitor::HandleBroadcast(HloInstruction* broadcast) {
       for (auto inserted_index : inserted_indices) {
         dims.erase(dims.begin() + inserted_index);
       }
-      changed_ = true;
-      return computation_->ReplaceWithNewInstruction(
+      return ReplaceWithNewInstruction(
           broadcast,
           HloInstruction::CreateBroadcast(broadcast->shape(),
                                           operand->mutable_operand(0), dims));
@@ -697,65 +746,6 @@ Status AlgebraicSimplifierVisitor::HandleBroadcast(HloInstruction* broadcast) {
   return Status::OK();
 }
 
-template <PrimitiveType primitive_src_type, PrimitiveType primitive_dest_type>
-static std::unique_ptr<HloInstruction> ConvertIfTypesMatch(
-    const Literal& src_literal) {
-  CHECK_EQ(primitive_src_type, src_literal.shape().element_type());
-
-  return HloInstruction::CreateConstant(
-      LiteralUtil::Convert<typename primitive_util::PrimitiveTypeToNative<
-                               primitive_src_type>::type,
-                           typename primitive_util::PrimitiveTypeToNative<
-                               primitive_dest_type>::type>(src_literal));
-}
-
-template <PrimitiveType primitive_src_type>
-static std::unique_ptr<HloInstruction> ConvertIfDestTypeMatches(
-    const Literal& src_literal, PrimitiveType primitive_dest_type) {
-  switch (primitive_dest_type) {
-#define CONVERT_IF_TYPES_MATCH(type) \
-  case (type):                       \
-    return ConvertIfTypesMatch<primitive_src_type, (type)>(src_literal);
-    CONVERT_IF_TYPES_MATCH(PRED)
-    CONVERT_IF_TYPES_MATCH(S8)
-    CONVERT_IF_TYPES_MATCH(S32)
-    CONVERT_IF_TYPES_MATCH(S64)
-    CONVERT_IF_TYPES_MATCH(U8)
-    CONVERT_IF_TYPES_MATCH(U32)
-    CONVERT_IF_TYPES_MATCH(U64)
-    CONVERT_IF_TYPES_MATCH(F32)
-    CONVERT_IF_TYPES_MATCH(F64)
-#undef CONVERT_IF_TYPES_MATCH
-    // Other types are not yet supported.
-    default:
-      LOG(FATAL) << "Unimplemented: ConvertIfDestTypeMatches for type "
-                 << PrimitiveType_Name(src_literal.shape().element_type());
-  }
-}
-
-static std::unique_ptr<HloInstruction> ConvertIfSrcTypeMatches(
-    const Literal& src_literal, PrimitiveType primitive_dest_type) {
-  switch (src_literal.shape().element_type()) {
-#define CONVERT_IF_DEST_TYPE_MATCHES(type) \
-  case (type):                             \
-    return ConvertIfDestTypeMatches<(type)>(src_literal, primitive_dest_type);
-    CONVERT_IF_DEST_TYPE_MATCHES(PRED)
-    CONVERT_IF_DEST_TYPE_MATCHES(S8)
-    CONVERT_IF_DEST_TYPE_MATCHES(S32)
-    CONVERT_IF_DEST_TYPE_MATCHES(S64)
-    CONVERT_IF_DEST_TYPE_MATCHES(U8)
-    CONVERT_IF_DEST_TYPE_MATCHES(U32)
-    CONVERT_IF_DEST_TYPE_MATCHES(U64)
-    CONVERT_IF_DEST_TYPE_MATCHES(F32)
-    CONVERT_IF_DEST_TYPE_MATCHES(F64)
-#undef CONVERT_IF_DEST_TYPE_MATCHES
-    // Other types are not yet supported.
-    default:
-      LOG(FATAL) << "Unimplemented: ConvertIfSrcTypeMatches for type "
-                 << PrimitiveType_Name(src_literal.shape().element_type());
-  }
-}
-
 // A conversion to the same element type as the operand is a nop and can be
 // removed.  A conversion of a constant can be simplified by making a new
 // constant.
@@ -764,16 +754,7 @@ Status AlgebraicSimplifierVisitor::HandleConvert(HloInstruction* convert,
   PrimitiveType src_type = operand->shape().element_type();
   PrimitiveType dest_type = convert->shape().element_type();
   if (src_type == dest_type) {
-    changed_ = true;
-    return computation_->ReplaceInstruction(convert, operand);
-  }
-  if (operand->opcode() == HloOpcode::kConstant) {
-    const Literal& src_literal = operand->literal();
-    std::unique_ptr<HloInstruction> new_constant =
-        ConvertIfSrcTypeMatches(src_literal, dest_type);
-    changed_ = true;
-    return computation_->ReplaceWithNewInstruction(convert,
-                                                   std::move(new_constant));
+    return ReplaceInstruction(convert, operand);
   }
   return Status::OK();
 }
@@ -859,8 +840,7 @@ Status AlgebraicSimplifierVisitor::HandlePad(HloInstruction* pad) {
 
     std::unique_ptr<HloInstruction> slice = HloInstruction::CreateSlice(
         pad->shape(), nonzero_pad, start_indices, end_indices);
-    changed_ = true;
-    return computation_->ReplaceWithNewInstruction(pad, std::move(slice));
+    return ReplaceWithNewInstruction(pad, std::move(slice));
   }
 
   return Status::OK();
@@ -870,7 +850,7 @@ Status AlgebraicSimplifierVisitor::HandlePower(HloInstruction* power,
                                                HloInstruction* lhs,
                                                HloInstruction* rhs) {
   VLOG(10) << "trying transform [pow(A, 0) => 1]: " << power->ToString();
-  if (IsLiteralWithValue(rhs, 0)) {
+  if (IsAll(rhs, 0)) {
     auto one = HloInstruction::CreateConstant(LiteralUtil::CloneToUnique(
         LiteralUtil::One(power->shape().element_type())));
     std::unique_ptr<HloInstruction> ones;
@@ -880,30 +860,27 @@ Status AlgebraicSimplifierVisitor::HandlePower(HloInstruction* power,
       ones = HloInstruction::CreateBroadcast(
           power->shape(), computation_->AddInstruction(std::move(one)), {});
     }
-    changed_ = true;
-    return computation_->ReplaceWithNewInstruction(power, std::move(ones));
+    return ReplaceWithNewInstruction(power, std::move(ones));
   }
 
   VLOG(10) << "trying transform [pow(A, 1) => A]: " << power->ToString();
-  if (IsLiteralWithValue(rhs, 1) && ReplaceInstructionIfSameShape(power, lhs)) {
+  if (IsAll(rhs, 1) && ReplaceInstructionIfSameShape(power, lhs)) {
     return Status::OK();
   }
 
   VLOG(10) << "trying transform [pow(A, 2) => A*A]: " << power->ToString();
-  if (IsLiteralWithValue(rhs, 2)) {
-    changed_ = true;
-    return computation_->ReplaceWithNewInstruction(
+  if (IsAll(rhs, 2)) {
+    return ReplaceWithNewInstruction(
         power, HloInstruction::CreateBinary(power->shape(),
                                             HloOpcode::kMultiply, lhs, lhs));
   }
 
   VLOG(10) << "trying transform [pow(A, -1) => 1/A]: " << power->ToString();
-  if (IsLiteralWithValue(rhs, -1)) {
+  if (IsAll(rhs, -1)) {
     auto* one = computation_->AddInstruction(
         HloInstruction::CreateConstant(LiteralUtil::CloneToUnique(
             LiteralUtil::One(rhs->shape().element_type()))));
-    changed_ = true;
-    return computation_->ReplaceWithNewInstruction(
+    return ReplaceWithNewInstruction(
         power, HloInstruction::CreateBinary(power->shape(), HloOpcode::kDivide,
                                             one, lhs));
   }
@@ -984,14 +961,12 @@ Status AlgebraicSimplifierVisitor::HandleReshape(HloInstruction* reshape) {
   // Delete no-op reshapes, i.e. where shape = operand shape.
   if (SameShape(reshape, operand)) {
     VLOG(10) << "deleting no-op reshape";
-    changed_ = true;
-    return computation_->ReplaceInstruction(reshape, operand);
+    return ReplaceInstruction(reshape, operand);
   }
 
   // Merge reshapes.
   if (HloOpcode::kReshape == operand->opcode()) {
-    changed_ = true;
-    return computation_->ReplaceWithNewInstruction(
+    return ReplaceWithNewInstruction(
         reshape, HloInstruction::CreateReshape(reshape->shape(),
                                                operand->mutable_operand(0)));
   }
@@ -1000,8 +975,7 @@ Status AlgebraicSimplifierVisitor::HandleReshape(HloInstruction* reshape) {
     auto opt_dims = ReshapeLeavesDimensionsUnmodified(
         reshape, reshape->operand(0)->dimensions());
     if (opt_dims.first) {
-      changed_ = true;
-      return computation_->ReplaceWithNewInstruction(
+      return ReplaceWithNewInstruction(
           reshape,
           HloInstruction::CreateBroadcast(
               reshape->shape(), reshape->mutable_operand(0)->mutable_operand(0),
@@ -1037,8 +1011,7 @@ Status AlgebraicSimplifierVisitor::HandleReverse(HloInstruction* reverse,
   };
   if (std::all_of(reverse->dimensions().begin(), reverse->dimensions().end(),
                   dim_is_one)) {
-    changed_ = true;
-    return computation_->ReplaceInstruction(reverse, operand);
+    return ReplaceInstruction(reverse, operand);
   }
   return Status::OK();
 }
@@ -1052,12 +1025,22 @@ Status AlgebraicSimplifierVisitor::HandleSlice(HloInstruction* slice,
   return Status::OK();
 }
 
+Status AlgebraicSimplifierVisitor::HandleDynamicUpdateSlice(
+    HloInstruction* dynamic_update_slice, HloInstruction* operand,
+    HloInstruction* update, HloInstruction* start_indices) {
+  // DynamicUpdateSlice on a scalar just passes through the update argument.
+  if (ShapeUtil::IsScalar(dynamic_update_slice->shape())) {
+    return ReplaceInstruction(dynamic_update_slice, update);
+  }
+  return Status::OK();
+}
+
 Status AlgebraicSimplifierVisitor::HandleReduce(
     HloInstruction* reduce, HloInstruction* arg, HloInstruction* init_value,
     tensorflow::gtl::ArraySlice<int64> dimensions, HloComputation* function) {
   if (ShapeUtil::HasZeroElements(arg->shape()) ||
       ShapeUtil::HasZeroElements(reduce->shape())) {
-    return computation_->ReplaceWithNewInstruction(
+    return ReplaceWithNewInstruction(
         reduce,
         HloInstruction::CreateBroadcast(reduce->shape(), init_value, {}));
     return Status::OK();
@@ -1070,7 +1053,7 @@ Status AlgebraicSimplifierVisitor::HandleReduce(
     for (auto dim : dimensions) {
       new_reduce_dimensions.push_back(transpose_dimensions[dim]);
     }
-    return computation_->ReplaceWithNewInstruction(
+    return ReplaceWithNewInstruction(
         reduce, HloInstruction::CreateReduce(
                     reduce->shape(), arg->mutable_operand(0), init_value,
                     new_reduce_dimensions, function));
@@ -1114,7 +1097,7 @@ Status AlgebraicSimplifierVisitor::HandleReduce(
           new_reduce_dimensions.push_back(i);
         }
       }
-      return computation_->ReplaceWithNewInstruction(
+      return ReplaceWithNewInstruction(
           reduce, HloInstruction::CreateReduce(
                       reduce->shape(), arg->mutable_operand(0), init_value,
                       new_reduce_dimensions, function));
@@ -1125,27 +1108,84 @@ Status AlgebraicSimplifierVisitor::HandleReduce(
       ShapeUtil::HasZeroElements(arg->shape())) {
     auto reshape = computation_->AddInstruction(
         HloInstruction::CreateReshape(reduce->shape(), arg));
-    changed_ = true;
-    return computation_->ReplaceWithNewInstruction(
+    return ReplaceWithNewInstruction(
         reduce, HloInstruction::CreateMap(reduce->shape(),
                                           {reshape, init_value}, function));
   }
   return Status::OK();
 }
 
+Status AlgebraicSimplifierVisitor::HandleReduceWindow(
+    HloInstruction* reduce_window, HloInstruction* operand,
+    const Window& window, HloComputation* function) {
+  VLOG(10) << "Considering folding Pad: " << operand->ToString()
+           << "\ninto reduce-window: " << reduce_window->ToString();
+
+  // This optimization folds a pad op into reduce_window.
+  if (operand->opcode() != HloOpcode::kPad) {
+    VLOG(10) << "Not folding pad into reduce-window as there is no pad.";
+    return Status::OK();
+  }
+
+  // Do not fold interior padding into ReduceWindow since the backends do not
+  // support it.
+  const PaddingConfig& pad_config = operand->padding_config();
+  if (HasInteriorPadding(pad_config)) {
+    VLOG(10) << "Not folding pad into reduce-window due to interior padding.";
+    return Status::OK();
+  }
+
+  // If reduce_window already has padding, the pad value of the pad op and the
+  // init value of reduce_window must match to allow folding the pad.
+  const HloInstruction* pad_value = operand->operand(1);
+  const HloInstruction* reduce_init_value = reduce_window->operand(1);
+  if (pad_value != reduce_init_value) {
+    // The pad value is usually a constant, so we handle that case and do not
+    // try to get more fancy about proving equivalence in cases beyond that.
+    if (pad_value->opcode() != HloOpcode::kConstant ||
+        reduce_init_value->opcode() != HloOpcode::kConstant ||
+        !LiteralUtil::Equal(pad_value->literal(),
+                            reduce_init_value->literal())) {
+      VLOG(10)
+          << "Not folding pad into reduce-window due to different pad values.";
+      return Status::OK();
+    }
+  }
+
+  // Carry out the folding of the pad into reduce_window.
+  VLOG(10) << "Folding pad into reduce-window.";
+  Window new_window = window;
+  const int64 rank = ShapeUtil::Rank(reduce_window->shape());
+  TF_RET_CHECK(pad_config.dimensions_size() == rank);
+  TF_RET_CHECK(window.dimensions_size() == rank);
+  for (int64 i = 0; i < rank; ++i) {
+    const auto& pad_dim = pad_config.dimensions(i);
+    auto& window_dim = *new_window.mutable_dimensions(i);
+    window_dim.set_padding_low(window_dim.padding_low() +
+                               pad_dim.edge_padding_low());
+    window_dim.set_padding_high(window_dim.padding_high() +
+                                pad_dim.edge_padding_high());
+  }
+  return ReplaceWithNewInstruction(
+      reduce_window, HloInstruction::CreateReduceWindow(
+                         /*shape=*/reduce_window->shape(),
+                         /*operand=*/operand->mutable_operand(0),
+                         /*init_value=*/reduce_window->mutable_operand(1),
+                         /*window=*/new_window,
+                         /*reduce_computation=*/function));
+}
+
 Status AlgebraicSimplifierVisitor::HandleTranspose(HloInstruction* transpose) {
   auto operand = transpose->mutable_operand(0);
 
   if (std::is_sorted(transpose->dimensions().begin(),
                      transpose->dimensions().end())) {
     VLOG(10) << "deleting no-op transpose";
-    changed_ = true;
-    return computation_->ReplaceInstruction(transpose, operand);
+    return ReplaceInstruction(transpose, operand);
   }
 
   if (HloOpcode::kTranspose == operand->opcode()) {
-    changed_ = true;
-    return computation_->ReplaceWithNewInstruction(
+    return ReplaceWithNewInstruction(
         transpose, HloInstruction::CreateTranspose(
                        transpose->shape(), operand->mutable_operand(0),
                        ComposePermutations(operand->dimensions(),
@@ -1272,9 +1312,7 @@ Status AlgebraicSimplifierVisitor::HandleConvolution(
   auto new_rhs = add_bitcast(new_filter_shape, rhs);
   auto dot = computation_->AddInstruction(HloInstruction::CreateBinary(
       dot_output_shape, HloOpcode::kDot, new_lhs, new_rhs));
-  changed_ = true;
-  return computation_->ReplaceInstruction(convolution,
-                                          add_bitcast(convolution_shape, dot));
+  return ReplaceInstruction(convolution, add_bitcast(convolution_shape, dot));
 }
 
 bool AlgebraicSimplifierVisitor::TransformToClampIfSameShape(
@@ -1288,8 +1326,7 @@ bool AlgebraicSimplifierVisitor::TransformToClampIfSameShape(
 
   auto clamp = HloInstruction::CreateTernary(root->shape(), HloOpcode::kClamp,
                                              max_operand, operand, min_operand);
-  TF_CHECK_OK(computation_->ReplaceWithNewInstruction(root, std::move(clamp)));
-  changed_ = true;
+  TF_CHECK_OK(ReplaceWithNewInstruction(root, std::move(clamp)));
   return true;
 }
 
diff --git a/tensorflow/compiler/xla/service/algebraic_simplifier_test.cc b/tensorflow/compiler/xla/service/algebraic_simplifier_test.cc
index 3123ee4f8728a8d76b16bc4b3162962757d3b778..87d8a7165ccfad587474a0c89e9387597e341d8f 100644
--- a/tensorflow/compiler/xla/service/algebraic_simplifier_test.cc
+++ b/tensorflow/compiler/xla/service/algebraic_simplifier_test.cc
@@ -41,6 +41,7 @@ namespace {
 AlgebraicSimplifier::ValidBitcastCallback bitcasting_callback() {
   return [](const Shape&, const Shape&) { return true; };
 }
+
 AlgebraicSimplifier::ValidBitcastCallback non_bitcasting_callback() {
   return [](const Shape&, const Shape&) { return false; };
 }
@@ -69,6 +70,52 @@ TEST_F(AlgebraicSimplifierTest, AddZero) {
   EXPECT_EQ(root, param0);
 }
 
+TEST_F(AlgebraicSimplifierTest, AddBroadcastZeroR0Operand) {
+  Shape r2f32 = ShapeUtil::MakeShape(F32, {3, 2});
+  HloComputation::Builder builder(TestName());
+  HloInstruction* param0 = builder.AddInstruction(
+      HloInstruction::CreateParameter(0, r2f32, "param0"));
+  HloInstruction* zero = builder.AddInstruction(
+      HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(0.0f)));
+  HloInstruction* bcast = builder.AddInstruction(
+      HloInstruction::CreateBroadcast(r2f32, zero, {0, 1}));
+  builder.AddInstruction(
+      HloInstruction::CreateBinary(r2f32, HloOpcode::kAdd, bcast, param0));
+
+  auto module = MakeUnique<HloModule>(TestName());
+  auto computation = module->AddEntryComputation(builder.Build());
+  HloInstruction* root = computation->root_instruction();
+  EXPECT_EQ(root->opcode(), HloOpcode::kAdd);
+  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
+                                 non_bitcasting_callback());
+  ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie());
+  root = computation->root_instruction();
+  EXPECT_EQ(root, param0);
+}
+
+TEST_F(AlgebraicSimplifierTest, AddBroadcastZeroR1Operand) {
+  Shape r2f32 = ShapeUtil::MakeShape(F32, {3, 2});
+  HloComputation::Builder builder(TestName());
+  HloInstruction* param0 = builder.AddInstruction(
+      HloInstruction::CreateParameter(0, r2f32, "param0"));
+  HloInstruction* zero = builder.AddInstruction(
+      HloInstruction::CreateConstant(LiteralUtil::CreateR1<float>({0, 0, 0})));
+  HloInstruction* bcast =
+      builder.AddInstruction(HloInstruction::CreateBroadcast(r2f32, zero, {1}));
+  builder.AddInstruction(
+      HloInstruction::CreateBinary(r2f32, HloOpcode::kAdd, bcast, param0));
+
+  auto module = MakeUnique<HloModule>(TestName());
+  auto computation = module->AddEntryComputation(builder.Build());
+  HloInstruction* root = computation->root_instruction();
+  EXPECT_EQ(root->opcode(), HloOpcode::kAdd);
+  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
+                                 non_bitcasting_callback());
+  ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie());
+  root = computation->root_instruction();
+  EXPECT_EQ(root, param0);
+}
+
 // Test that A - 0 is simplified to A
 TEST_F(AlgebraicSimplifierTest, SubZero) {
   Shape r0f32 = ShapeUtil::MakeShape(F32, {});
@@ -420,115 +467,108 @@ TEST_F(AlgebraicSimplifierTest, ConvertBetweenSameType) {
   EXPECT_THAT(computation->root_instruction(), input);
 }
 
-TEST_F(AlgebraicSimplifierTest, ConvertF32ToS64) {
-  HloComputation::Builder builder(TestName());
-  HloInstruction* input = builder.AddInstruction(
-      HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(42.0f)));
-  builder.AddInstruction(
-      HloInstruction::CreateConvert(ShapeUtil::MakeShape(S64, {}), input));
-
-  auto module = MakeUnique<HloModule>(TestName());
-  auto computation = module->AddEntryComputation(builder.Build());
-
-  EXPECT_THAT(computation->root_instruction(), op::Convert(input));
-
-  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
-                                 non_bitcasting_callback());
-  ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie());
-
-  EXPECT_THAT(computation->root_instruction(), op::Constant());
-  EXPECT_EQ(LiteralUtil::GetFirstElement<int64>(
-                computation->root_instruction()->literal()),
-            42);
-}
-
-TEST_F(AlgebraicSimplifierTest, ConvertS64ToF32) {
+// Test that copies are removed.
+TEST_F(AlgebraicSimplifierTest, RemoveCopy) {
+  Shape r0f32 = ShapeUtil::MakeShape(F32, {});
   HloComputation::Builder builder(TestName());
-  HloInstruction* input = builder.AddInstruction(
-      HloInstruction::CreateConstant(LiteralUtil::CreateR0<int64>(42)));
+  HloInstruction* param0 = builder.AddInstruction(
+      HloInstruction::CreateParameter(0, r0f32, "param0"));
   builder.AddInstruction(
-      HloInstruction::CreateConvert(ShapeUtil::MakeShape(F32, {}), input));
+      HloInstruction::CreateUnary(param0->shape(), HloOpcode::kCopy, param0));
 
   auto module = MakeUnique<HloModule>(TestName());
   auto computation = module->AddEntryComputation(builder.Build());
 
-  EXPECT_THAT(computation->root_instruction(), op::Convert(input));
+  EXPECT_THAT(computation->root_instruction(), op::Copy(param0));
 
   AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
                                  non_bitcasting_callback());
   ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie());
 
-  EXPECT_THAT(computation->root_instruction(), op::Constant());
-  EXPECT_EQ(LiteralUtil::GetFirstElement<float>(
-                computation->root_instruction()->literal()),
-            42.0f);
+  EXPECT_THAT(computation->root_instruction(), param0);
 }
 
-TEST_F(AlgebraicSimplifierTest, ConvertF32ArrayToS64Array) {
+// Test that unary concatenates are removed.
+TEST_F(AlgebraicSimplifierTest, RemoveUnaryConcatenate) {
+  Shape r1f32 = ShapeUtil::MakeShape(F32, {100});
   HloComputation::Builder builder(TestName());
-  HloInstruction* input = builder.AddInstruction(HloInstruction::CreateConstant(
-      LiteralUtil::CreateR1<float>({42.0f, 19.0f})));
+  HloInstruction* param0 = builder.AddInstruction(
+      HloInstruction::CreateParameter(0, r1f32, "param0"));
   builder.AddInstruction(
-      HloInstruction::CreateConvert(ShapeUtil::MakeShape(S64, {2}), input));
+      HloInstruction::CreateConcatenate(param0->shape(), {param0}, 0));
 
   auto module = MakeUnique<HloModule>(TestName());
   auto computation = module->AddEntryComputation(builder.Build());
 
-  EXPECT_THAT(computation->root_instruction(), op::Convert(input));
+  EXPECT_THAT(computation->root_instruction(), op::Concatenate(param0));
 
   AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
                                  non_bitcasting_callback());
   ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie());
 
-  EXPECT_THAT(computation->root_instruction(), op::Constant());
-  EXPECT_EQ(
-      LiteralUtil::Get<int64>(computation->root_instruction()->literal(), {0}),
-      42);
-  EXPECT_EQ(
-      LiteralUtil::Get<int64>(computation->root_instruction()->literal(), {1}),
-      19);
+  EXPECT_THAT(computation->root_instruction(), param0);
 }
 
-// Test that copies are removed.
-TEST_F(AlgebraicSimplifierTest, RemoveCopy) {
-  Shape r0f32 = ShapeUtil::MakeShape(F32, {});
+// Test that empty operands of concatenates are removed.
+TEST_F(AlgebraicSimplifierTest, RemoveEmptyConcatenateOperands) {
+  const int kParamLength = 100;
+  Shape r1f32 = ShapeUtil::MakeShape(F32, {kParamLength});
   HloComputation::Builder builder(TestName());
   HloInstruction* param0 = builder.AddInstruction(
-      HloInstruction::CreateParameter(0, r0f32, "param0"));
-  builder.AddInstruction(
-      HloInstruction::CreateUnary(param0->shape(), HloOpcode::kCopy, param0));
+      HloInstruction::CreateParameter(0, r1f32, "param0"));
+  HloInstruction* param1 = builder.AddInstruction(
+      HloInstruction::CreateParameter(1, r1f32, "param1"));
+  HloInstruction* empty_literal = builder.AddInstruction(
+      HloInstruction::CreateConstant(LiteralUtil::CreateR1<float>({})));
+  HloInstruction* empty_slice =
+      builder.AddInstruction(HloInstruction::CreateSlice(
+          ShapeUtil::MakeShape(F32, {0}), param1, {42}, {42}));
+  Shape result_shape = ShapeUtil::MakeShape(F32, {3 * kParamLength});
+  builder.AddInstruction(HloInstruction::CreateConcatenate(
+      result_shape, {empty_literal, param0, param0, empty_slice, param1}, 0));
 
   auto module = MakeUnique<HloModule>(TestName());
   auto computation = module->AddEntryComputation(builder.Build());
 
-  EXPECT_THAT(computation->root_instruction(), op::Copy(param0));
+  EXPECT_THAT(
+      computation->root_instruction(),
+      op::Concatenate(empty_literal, param0, param0, empty_slice, param1));
 
   AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
                                  non_bitcasting_callback());
   ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie());
 
-  EXPECT_THAT(computation->root_instruction(), param0);
+  EXPECT_THAT(computation->root_instruction(),
+              op::Concatenate(param0, param0, param1));
 }
 
-// Test that unary concatenates are removed.
-TEST_F(AlgebraicSimplifierTest, RemoveUnaryConcatenate) {
-  Shape r1f32 = ShapeUtil::MakeShape(F32, {100});
+// Test a concatenate with only empty operands is removed.
+TEST_F(AlgebraicSimplifierTest, OnlyEmptyConcatenateOperands) {
+  const int kParamLength = 100;
+  Shape r1f32 = ShapeUtil::MakeShape(F32, {kParamLength});
   HloComputation::Builder builder(TestName());
   HloInstruction* param0 = builder.AddInstruction(
       HloInstruction::CreateParameter(0, r1f32, "param0"));
-  builder.AddInstruction(
-      HloInstruction::CreateConcatenate(param0->shape(), {param0}, 0));
+  HloInstruction* empty_literal = builder.AddInstruction(
+      HloInstruction::CreateConstant(LiteralUtil::CreateR1<float>({})));
+  HloInstruction* empty_slice =
+      builder.AddInstruction(HloInstruction::CreateSlice(
+          ShapeUtil::MakeShape(F32, {0}), param0, {42}, {42}));
+  Shape result_shape = ShapeUtil::MakeShape(F32, {0});
+  builder.AddInstruction(HloInstruction::CreateConcatenate(
+      result_shape, {empty_literal, empty_slice}, 0));
 
   auto module = MakeUnique<HloModule>(TestName());
   auto computation = module->AddEntryComputation(builder.Build());
 
-  EXPECT_THAT(computation->root_instruction(), op::Concatenate(param0));
+  EXPECT_THAT(computation->root_instruction(),
+              op::Concatenate(empty_literal, empty_slice));
 
   AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
                                  non_bitcasting_callback());
   ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie());
 
-  EXPECT_THAT(computation->root_instruction(), param0);
+  EXPECT_EQ(computation->root_instruction(), empty_literal);
 }
 
 // Test that a simplification which changes layouts is not performed if layout
@@ -1508,6 +1548,86 @@ TEST_F(AlgebraicSimplifierTest, ScalarBroadcastToTransposeReshape) {
   EXPECT_TRUE(ShapeUtil::Equal(root->shape(), reshape_shape));
 }
 
+// Test that ReduceWindow(Pad(op, x), y) can simplify to ReduceWindow(op, x).
+TEST_F(AlgebraicSimplifierTest, FoldPadIntoReduceWindow) {
+  HloModule module(TestName());
+  HloComputation::Builder builder(TestName());
+
+  // Create operand to the pad.
+  HloInstruction* operand =
+      builder.AddInstruction(HloInstruction::CreateParameter(
+          0, ShapeUtil::MakeShape(F32, {1, 2, 3, 4}), "p0"));
+
+  // Create the pad.
+  PaddingConfig padding = MakeNoPaddingConfig(4);
+  padding.mutable_dimensions(1)->set_edge_padding_low(1);
+  padding.mutable_dimensions(3)->set_edge_padding_high(2);
+
+  HloInstruction* pad_value = builder.AddInstruction(
+      HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(5.0f)));
+  HloInstruction* pad = builder.AddInstruction(HloInstruction::CreatePad(
+      ShapeUtil::MakeShape(F32, {1, 3, 3, 5}), operand, pad_value, padding));
+
+  // Create add computation.
+  HloComputation* add_computation = nullptr;
+  {
+    HloComputation::Builder builder(TestName() + ".add");
+    const Shape scalar_shape = ShapeUtil::MakeShape(F32, {});
+    HloInstruction* p0 = builder.AddInstruction(
+        HloInstruction::CreateParameter(0, scalar_shape, "p0"));
+    HloInstruction* p1 = builder.AddInstruction(
+        HloInstruction::CreateParameter(1, scalar_shape, "p1"));
+    builder.AddInstruction(
+        HloInstruction::CreateBinary(scalar_shape, HloOpcode::kAdd, p0, p1));
+    add_computation = module.AddEmbeddedComputation(builder.Build());
+  }
+
+  // Create the reduce-window.
+  Window window;
+  for (int64 i = 0; i < ShapeUtil::Rank(pad->shape()); ++i) {
+    auto* dim = window.add_dimensions();
+    dim->set_size(1);
+    dim->set_padding_low(10);
+    dim->set_padding_high(100);
+    dim->set_window_dilation(1);
+    dim->set_base_dilation(1);
+  }
+  const Shape reduce_window_shape =
+      ShapeUtil::MakeShape(F32, {111, 113, 113, 115});
+  HloInstruction* reduce_init_value = builder.AddInstruction(
+      HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(5.0f)));
+  HloInstruction* reduce_window =
+      builder.AddInstruction(HloInstruction::CreateReduceWindow(
+          reduce_window_shape, pad, reduce_init_value, window,
+          add_computation));
+
+  // Build the computation and run the simplifier.
+  auto computation = module.AddEntryComputation(builder.Build());
+  HloInstruction* root = computation->root_instruction();
+  EXPECT_EQ(root, reduce_window);
+  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
+                                 non_bitcasting_callback());
+  ASSERT_TRUE(simplifier.Run(&module).ValueOrDie());
+
+  // Running simplification again should not result in any further changes.
+  ASSERT_FALSE(simplifier.Run(&module).ValueOrDie());
+
+  // Verify the result
+  root = computation->root_instruction();
+  EXPECT_THAT(root, op::ReduceWindow(operand, op::Constant()));
+  EXPECT_TRUE(ShapeUtil::Equal(root->shape(), reduce_window_shape))
+      << ShapeUtil::HumanString(root->shape()) << " vs "
+      << ShapeUtil::HumanString(reduce_window_shape);
+  EXPECT_EQ(root->window().dimensions(0).padding_low(), 10);
+  EXPECT_EQ(root->window().dimensions(1).padding_low(), 11);
+  EXPECT_EQ(root->window().dimensions(2).padding_low(), 10);
+  EXPECT_EQ(root->window().dimensions(3).padding_low(), 10);
+  EXPECT_EQ(root->window().dimensions(0).padding_high(), 100);
+  EXPECT_EQ(root->window().dimensions(1).padding_high(), 100);
+  EXPECT_EQ(root->window().dimensions(2).padding_high(), 100);
+  EXPECT_EQ(root->window().dimensions(3).padding_high(), 102);
+}
+
 TEST_F(AlgebraicSimplifierTest, ReversalOfTrivialDimensionsToBitcast) {
   HloComputation::Builder builder(TestName());
   const Shape shape = ShapeUtil::MakeShape(F32, {448, 2048, 1, 1});
diff --git a/tensorflow/compiler/xla/service/allocation_tracker.cc b/tensorflow/compiler/xla/service/allocation_tracker.cc
index 49a621810ef26f76494bab08d087bb4a07472000..83759a7a0c62222b81b82b8a0f8e0396a8f17eff 100644
--- a/tensorflow/compiler/xla/service/allocation_tracker.cc
+++ b/tensorflow/compiler/xla/service/allocation_tracker.cc
@@ -64,8 +64,9 @@ GlobalDataHandle AllocationTracker::RegisterInternal(
     auto& allocation = FindOrDie(handle_to_allocation_, handle);
     int ref_count = allocation->ref_count();
     CHECK_GT(ref_count, 0);
-    VLOG(2) << "ref_count: " << ref_count << " -> " << ref_count + 1;
-    allocation->increment_ref_count();
+    VLOG(2) << "ref_count: " << ref_count << " -> " <<
+            (ref_count + initial_ref_count);
+    allocation->increment_ref_count(initial_ref_count);
   } else {
     handle = next_handle_++;
     VLOG(2) << "ref_count: " << initial_ref_count;
diff --git a/tensorflow/compiler/xla/service/allocation_tracker.h b/tensorflow/compiler/xla/service/allocation_tracker.h
index e00768001620275d702c2f96a89d981526ea81a7..ebbf35b6fe87bc7322ccb99cfe8f8eed56de06b3 100644
--- a/tensorflow/compiler/xla/service/allocation_tracker.h
+++ b/tensorflow/compiler/xla/service/allocation_tracker.h
@@ -63,10 +63,10 @@ class Allocation {
     CHECK_GE(ref_count_, 0);
     return ref_count_;
   }
-  void increment_ref_count() {
+  void increment_ref_count(int inc) {
     CHECK_GT(ref_count_, 0);
-    CHECK_LT(ref_count_, INT_MAX);
-    ++ref_count_;
+    CHECK_LE(ref_count_, INT_MAX - inc);
+    ref_count_ += inc;
   }
   void decrement_ref_count() {
     CHECK_GT(ref_count_, 0);
diff --git a/tensorflow/compiler/xla/service/backend.cc b/tensorflow/compiler/xla/service/backend.cc
index 5c05417c6dcb887b5352d1270c24a4eae62149e3..1913617fecf757a529bbdc803b4227a560c6e1cf 100644
--- a/tensorflow/compiler/xla/service/backend.cc
+++ b/tensorflow/compiler/xla/service/backend.cc
@@ -41,13 +41,39 @@ namespace se = ::perftools::gputools;
 
 namespace xla {
 
+BackendOptions& BackendOptions::set_platform(
+    perftools::gputools::Platform* platform) {
+  platform_ = platform;
+  return *this;
+}
+
+perftools::gputools::Platform* BackendOptions::platform() const {
+  return platform_;
+}
+
+BackendOptions& BackendOptions::set_number_of_replicas(int number_of_replicas) {
+  number_of_replicas_ = number_of_replicas;
+  return *this;
+}
+
+int BackendOptions::number_of_replicas() const { return number_of_replicas_; }
+
+BackendOptions& BackendOptions::set_intra_op_parallelism_threads(
+    int num_threads) {
+  intra_op_parallelism_threads_ = num_threads;
+  return *this;
+}
+
+int BackendOptions::intra_op_parallelism_threads() const {
+  return intra_op_parallelism_threads_;
+}
+
 // Define this in .cc file to avoid having to include eigen or forward declare
 // these types in the header.
 struct Backend::EigenThreadPoolWrapper {
-  explicit EigenThreadPoolWrapper()
-      : pool(new tensorflow::thread::ThreadPool(
-            tensorflow::Env::Default(), "XLAEigen",
-            tensorflow::port::NumSchedulableCPUs())),
+  explicit EigenThreadPoolWrapper(const int num_threads)
+      : pool(new tensorflow::thread::ThreadPool(tensorflow::Env::Default(),
+                                                "XLAEigen", num_threads)),
         wrapper(new tensorflow::EigenThreadPoolWrapper(pool.get())),
         device(new Eigen::ThreadPoolDevice(wrapper.get(),
                                            wrapper->NumThreads())) {}
@@ -58,18 +84,21 @@ struct Backend::EigenThreadPoolWrapper {
 };
 
 /* static */ StatusOr<std::unique_ptr<Backend>> Backend::CreateBackend(
-    perftools::gputools::Platform* platform, int64 replica_count) {
+    const BackendOptions& options) {
+  int64 replica_count = options.number_of_replicas();
   if (replica_count == -1) {
     legacy_flags::BackendFlags* flags = legacy_flags::GetBackendFlags();
     replica_count = flags->xla_replicas;
   }
+  perftools::gputools::Platform* platform = options.platform();
   TF_ASSIGN_OR_RETURN(auto compiler, Compiler::GetForPlatform(platform));
   TF_ASSIGN_OR_RETURN(auto stream_executors,
                       PlatformUtil::GetStreamExecutors(platform));
   TF_ASSIGN_OR_RETURN(auto transfer_manager,
                       TransferManager::GetForPlatform(platform));
-  std::unique_ptr<Backend> backend(new Backend(
-      replica_count, platform, compiler, stream_executors, transfer_manager));
+  std::unique_ptr<Backend> backend(
+      new Backend(replica_count, platform, compiler, stream_executors,
+                  transfer_manager, options.intra_op_parallelism_threads()));
   TF_RETURN_IF_ERROR(backend->PoolStreams(kInitialStreamsToPool,
                                           backend->default_stream_executor()));
   return std::move(backend);
@@ -79,7 +108,9 @@ struct Backend::EigenThreadPoolWrapper {
 Backend::CreateDefaultBackend() {
   TF_ASSIGN_OR_RETURN(se::Platform * platform,
                       PlatformUtil::GetDefaultPlatform());
-  return CreateBackend(platform);
+  BackendOptions backend_options;
+  backend_options.set_platform(platform);
+  return CreateBackend(backend_options);
 }
 
 tensorflow::Status Backend::PoolStreams(int n, se::StreamExecutor* executor) {
@@ -114,7 +145,7 @@ Backend::Backend(
     int64 replica_count, perftools::gputools::Platform* platform,
     Compiler* compiler,
     tensorflow::gtl::ArraySlice<se::StreamExecutor*> stream_executors,
-    TransferManager* transfer_manager)
+    TransferManager* transfer_manager, int intra_op_parallelism_threads)
     : platform_(platform),
       compiler_(compiler),
       transfer_manager_(transfer_manager),
@@ -144,7 +175,11 @@ Backend::Backend(
     inter_op_thread_pool_.reset(new tensorflow::thread::ThreadPool(
         tensorflow::Env::Default(), "xla_inter_op",
         tensorflow::port::NumSchedulableCPUs()));
-    intra_op_thread_pool_wrapper_.reset(new EigenThreadPoolWrapper());
+    const int num_threads = intra_op_parallelism_threads > 0
+                                ? intra_op_parallelism_threads
+                                : tensorflow::port::NumSchedulableCPUs();
+    intra_op_thread_pool_wrapper_.reset(
+        new EigenThreadPoolWrapper(num_threads));
   }
 }
 
@@ -190,10 +225,17 @@ tensorflow::thread::ThreadPool* Backend::inter_op_thread_pool() const {
 
 const Eigen::ThreadPoolDevice* Backend::eigen_intra_op_thread_pool_device()
     const {
-  if (intra_op_thread_pool_wrapper_ == nullptr) return nullptr;
+  if (intra_op_thread_pool_wrapper_ == nullptr) {
+    return nullptr;
+  }
   return intra_op_thread_pool_wrapper_->device.get();
 }
 
+tensorflow::thread::ThreadPool* Backend::eigen_intra_op_thread_pool() const {
+  if (intra_op_thread_pool_wrapper_ == nullptr) return nullptr;
+  return intra_op_thread_pool_wrapper_->pool.get();
+}
+
 StatusOr<perftools::gputools::StreamExecutor*> Backend::stream_executor(
     int device_ordinal) const {
   if (device_ordinal < 0 ||
diff --git a/tensorflow/compiler/xla/service/backend.h b/tensorflow/compiler/xla/service/backend.h
index 9f6829b7d937cec6a67d4016a40506de5df8572d..1068bac2779e9a3dc6c23c0b9fbcc5403fcc2815 100644
--- a/tensorflow/compiler/xla/service/backend.h
+++ b/tensorflow/compiler/xla/service/backend.h
@@ -39,6 +39,31 @@ struct ThreadPoolDevice;
 
 namespace xla {
 
+// Options to configure the backend when it is created.
+class BackendOptions {
+ public:
+  // Set the platform backing the backend, or nullptr for the default platform.
+  BackendOptions& set_platform(perftools::gputools::Platform* platform);
+  perftools::gputools::Platform* platform() const;
+
+  // Set the number of replicas to use when compiling replicated
+  // programs. The default is -1 meaning that the value is read from
+  // the xla_replicas flag.
+  BackendOptions& set_number_of_replicas(int number_of_replicas);
+  int number_of_replicas() const;
+
+  // Sets the thread pool size for parallel execution of an individual operator.
+  // The default value of -1 will result in initializing the thread pool with
+  // the number of threads equal to the number of cores in the system.
+  BackendOptions& set_intra_op_parallelism_threads(int num_threads);
+  int intra_op_parallelism_threads() const;
+
+ private:
+  perftools::gputools::Platform* platform_ = nullptr;
+  int number_of_replicas_ = -1;
+  int intra_op_parallelism_threads_ = -1;
+};
+
 // Class which encapsulates an XLA backend. It includes everything necessary
 // to compile and execute computations on a particular platform.
 //
@@ -53,9 +78,9 @@ class Backend {
   static constexpr int kInitialStreamsToPool = 8;
 
   // Creates a new backend for the given platform with the given number of
-  // replicas. A value of -1 means to use the flag value.
+  // replicas.
   static StatusOr<std::unique_ptr<Backend>> CreateBackend(
-      perftools::gputools::Platform* platform, int64 replica_count = -1);
+      const BackendOptions& options);
 
   // Creates a backend for the default platform. The default platform is defined
   // in PlatformUtil.
@@ -150,6 +175,7 @@ class Backend {
   // For the host platform, returns the configured eigen threadpool device to be
   // used for scheduling work. For other platforms, returns NULL.
   const Eigen::ThreadPoolDevice* eigen_intra_op_thread_pool_device() const;
+  tensorflow::thread::ThreadPool* eigen_intra_op_thread_pool() const;
 
   // Resets the devices associated with this backend.
   Status ResetDevices();
@@ -160,7 +186,7 @@ class Backend {
           Compiler* compiler,
           tensorflow::gtl::ArraySlice<perftools::gputools::StreamExecutor*>
               stream_executors,
-          TransferManager* transfer_manager);
+          TransferManager* transfer_manager, int intra_op_parallelism_threads);
   Backend(const Backend&) = delete;
   Backend& operator=(const Backend&) = delete;
 
diff --git a/tensorflow/compiler/xla/service/buffer_assignment.cc b/tensorflow/compiler/xla/service/buffer_assignment.cc
index 6efa73b1211da9d41c502818a0bc570fa7773fc6..47560fefea855fa7f70ef6268252a9b6d9964f76 100644
--- a/tensorflow/compiler/xla/service/buffer_assignment.cc
+++ b/tensorflow/compiler/xla/service/buffer_assignment.cc
@@ -488,11 +488,9 @@ Status GatherComputationsByAllocationType(
 /* static */
 StatusOr<std::unique_ptr<BufferAssignment>> BufferAssigner::Run(
     const HloModule* module, std::unique_ptr<HloOrdering> hlo_ordering,
-    LogicalBuffer::SizeFunction buffer_size, int64 alignment,
-    const std::vector<const HloInstruction*>* hlos_to_allocate) {
+    LogicalBuffer::SizeFunction buffer_size, int64 alignment) {
   BufferAssigner assigner(std::move(buffer_size), alignment);
-  return assigner.CreateAssignment(module, std::move(hlo_ordering),
-                                   hlos_to_allocate);
+  return assigner.CreateAssignment(module, std::move(hlo_ordering));
 }
 
 bool BufferAssigner::MaybeAssignBuffer(BufferAllocation* allocation,
@@ -545,24 +543,22 @@ bool BufferAssigner::MaybeAssignBuffer(BufferAllocation* allocation,
 
 Status BufferAssigner::AssignBuffersForComputation(
     const HloComputation* computation, bool is_thread_local,
-    const FlatSet<const HloInstruction*>* hlos_to_allocate,
     const FlatSet<const LogicalBuffer*>& colocated_buffers,
     const FlatSet<BufferAllocation::Index>& colocated_allocations,
+    FlatMap<const HloComputation*, FlatSet<const LogicalBuffer*>>*
+        buffers_to_assign_sequentially,
     BufferAssignment* assignment) {
   // Buffers are sorted and assigned to BufferAllocations in decreasing order of
   // size.
   std::vector<const LogicalBuffer*> sorted_buffers;
   for (auto& instruction : computation->instructions()) {
-    if (hlos_to_allocate == nullptr ||
-        hlos_to_allocate->count(instruction.get()) > 0) {
-      // Add all buffers which this instruction defines. Instruction which don't
-      // define buffers (eg, bitcast which just forwards a pointer) don't need
-      // any allocations.
-      for (const LogicalBuffer* buffer :
-           assignment->points_to_analysis().GetBuffersDefinedByInstruction(
-               instruction.get())) {
-        sorted_buffers.push_back(buffer);
-      }
+    // Add all buffers which this instruction defines. Instruction which don't
+    // define buffers (eg, bitcast which just forwards a pointer) don't need
+    // any allocations.
+    for (const LogicalBuffer* buffer :
+         assignment->points_to_analysis().GetBuffersDefinedByInstruction(
+             instruction.get())) {
+      sorted_buffers.push_back(buffer);
     }
   }
 
@@ -578,9 +574,16 @@ Status BufferAssigner::AssignBuffersForComputation(
   // If there is a sequential instruction ordering, we'll delay assignment of
   // temp buffers until after the main assignment loop.
   const BufferLiveness& liveness = assignment->liveness();
-  const std::vector<const HloInstruction*>* sequential_order =
-      liveness.hlo_ordering().SequentialOrder(*computation);
-  FlatSet<const LogicalBuffer*> unassigned_temp_buffers;
+  const bool has_sequential_order =
+      liveness.hlo_ordering().SequentialOrder(*computation) != nullptr;
+  if (has_sequential_order && buffers_to_assign_sequentially != nullptr) {
+    // Every sequential computation must get an entry in the
+    // buffers_to_assign_sequentially map, even if we end up with an empty set
+    // of buffers. This ensures we can correctly determine whether to run
+    // whole-module heap simulation.
+    buffers_to_assign_sequentially->emplace(computation,
+                                            FlatSet<const LogicalBuffer*>());
+  }
 
   // Sort the LogicalBuffers first by size. We assign the larger LogicalBuffers
   // first for simplicity. This means any previously created BufferAllocation is
@@ -599,7 +602,7 @@ Status BufferAssigner::AssignBuffersForComputation(
   // important reuse case where an elementwise instruction reuses one of its
   // operand's buffer. This improves locality.
   std::sort(sorted_buffers.begin(), sorted_buffers.end(),
-            [this, sequential_order, &liveness, &post_order_position](
+            [this, has_sequential_order, &liveness, &post_order_position](
                 const LogicalBuffer* a, const LogicalBuffer* b) {
               // Primary sort is by decreasing buffer size.
               const int64 a_size = buffer_size_(*a);
@@ -609,7 +612,7 @@ Status BufferAssigner::AssignBuffersForComputation(
               }
               // Otherwise live out buffers come before others, if the
               // instructions are sequentially ordered.
-              if (sequential_order != nullptr) {
+              if (has_sequential_order) {
                 const bool a_live_out = liveness.MaybeLiveOut(*a);
                 const bool b_live_out = liveness.MaybeLiveOut(*b);
                 if (a_live_out != b_live_out) {
@@ -746,7 +749,7 @@ Status BufferAssigner::AssignBuffersForComputation(
       }
     }
 
-    if (!assignment->HasAllocation(*buffer) && sequential_order != nullptr &&
+    if (!assignment->HasAllocation(*buffer) && has_sequential_order &&
         !liveness.MaybeLiveOut(*buffer)) {
       // There is a sequential instruction ordering, so we delay assignment of
       // temp buffers until after the loop. We do this right before we decide to
@@ -758,7 +761,7 @@ Status BufferAssigner::AssignBuffersForComputation(
       // for the definition of temp buffers.
       CHECK(!is_entry_parameter) << *buffer;
       CHECK(!is_thread_local) << *buffer;
-      unassigned_temp_buffers.insert(buffer);
+      (*buffers_to_assign_sequentially)[computation].insert(buffer);
       VLOG(3) << "Delaying assignment of temp buffer: " << *buffer;
       continue;
     }
@@ -772,27 +775,68 @@ Status BufferAssigner::AssignBuffersForComputation(
     }
   }
 
-  if (!unassigned_temp_buffers.empty()) {
-    TF_RETURN_IF_ERROR(AssignBuffersWithSequentialOrdering(
-        *sequential_order, unassigned_temp_buffers, *computation, assignment));
-  }
   return Status::OK();
 }
 
 Status BufferAssigner::AssignBuffersWithSequentialOrdering(
-    const std::vector<const HloInstruction*>& sequence,
-    const FlatSet<const LogicalBuffer*>& buffers_to_assign,
-    const HloComputation& computation, BufferAssignment* assignment) {
+    const FlatMap<const HloComputation*, FlatSet<const LogicalBuffer*>>&
+        buffers_to_assign_sequentially,
+    bool run_whole_module_heap_simulation, BufferAssignment* assignment) {
   // Run the sequence of instructions through the heap simulator.  The heuristic
   // that seems to give the best results is lazy-best-fit, with all runs of
   // alloc / free calls sorted in decreasing size order.
-  TF_ASSIGN_OR_RETURN(
-      HeapSimulator::Result result,
-      HeapSimulator::Run(MakeUnique<DecreasingSizeRunsHeap>(
-                             MakeUnique<LazyBestFitHeap>(alignment_)),
-                         sequence, computation,
-                         assignment->points_to_analysis(), buffer_size_,
-                         &buffers_to_assign));
+  const HloOrdering& hlo_ordering = assignment->liveness().hlo_ordering();
+  if (run_whole_module_heap_simulation) {
+    // Run the heap simulation over the whole module. This reduces memory usage,
+    // since buffers for kCall and kWhile sub-computations are only live for the
+    // duration of their calling instructions.
+    VLOG(1) << "Running whole-module heap simulation";
+    SequentialHloOrdering::HloModuleSequence module_sequence;
+    FlatSet<const LogicalBuffer*> all_buffers_to_assign;
+    for (const auto& pair : buffers_to_assign_sequentially) {
+      const HloComputation* computation = pair.first;
+      const FlatSet<const LogicalBuffer*>& buffers_to_assign = pair.second;
+      const std::vector<const HloInstruction*>* instruction_sequence =
+          hlo_ordering.SequentialOrder(*computation);
+      CHECK(instruction_sequence != nullptr) << computation->name();
+      module_sequence[computation] = *instruction_sequence;
+      all_buffers_to_assign.insert(buffers_to_assign.begin(),
+                                   buffers_to_assign.end());
+    }
+    TF_ASSIGN_OR_RETURN(
+        const HeapSimulator::Result result,
+        HeapSimulator::Run(MakeUnique<DecreasingSizeRunsHeap>(
+                               MakeUnique<LazyBestFitHeap>(alignment_)),
+                           assignment->module(), module_sequence,
+                           assignment->points_to_analysis(), buffer_size_,
+                           &all_buffers_to_assign));
+    AssignBuffersFromHeapSimulator(result, assignment);
+  } else {
+    // Run the heap-simulation on a per-computation basis. Buffers for
+    // sub-computations are assigned disjoint BufferAllocations, assuming the
+    // worst-case that they may all be live concurrently.
+    VLOG(1) << "Running per-computation heap simulation";
+    for (const auto& pair : buffers_to_assign_sequentially) {
+      const HloComputation* computation = pair.first;
+      const FlatSet<const LogicalBuffer*>& buffers_to_assign = pair.second;
+      const std::vector<const HloInstruction*>* instruction_sequence =
+          hlo_ordering.SequentialOrder(*computation);
+      CHECK(instruction_sequence != nullptr) << computation->name();
+      TF_ASSIGN_OR_RETURN(
+          const HeapSimulator::Result result,
+          HeapSimulator::Run(MakeUnique<DecreasingSizeRunsHeap>(
+                                 MakeUnique<LazyBestFitHeap>(alignment_)),
+                             *computation, *instruction_sequence,
+                             assignment->points_to_analysis(), buffer_size_,
+                             &buffers_to_assign));
+      AssignBuffersFromHeapSimulator(result, assignment);
+    }
+  }
+  return Status::OK();
+}
+
+void BufferAssigner::AssignBuffersFromHeapSimulator(
+    const HeapSimulator::Result& result, BufferAssignment* assignment) {
   if (assignment->stats_.preallocated_temp_fragmentation_bytes == -1) {
     assignment->stats_.preallocated_temp_fragmentation_bytes =
         result.fragmentation_size;
@@ -801,8 +845,6 @@ Status BufferAssigner::AssignBuffersWithSequentialOrdering(
         result.fragmentation_size;
   }
 
-  // Use the results of the heap simulator to create one allocation per
-  // computation, with LogicalBuffers packed to specific offsets.
   BufferAllocation* allocation = assignment->NewEmptyAllocation(
       result.heap_size, /*is_thread_local=*/false, /*is_reusable=*/true);
   for (const auto& buffer_chunk : result.chunk_map) {
@@ -810,7 +852,6 @@ Status BufferAssigner::AssignBuffersWithSequentialOrdering(
     const HeapSimulator::Chunk& chunk = buffer_chunk.second;
     assignment->AddAssignment(allocation, buffer, chunk.offset, chunk.size);
   }
-  return Status::OK();
 }
 
 // Adds the 'colocated_set' of buffers to 'colocated_buffer_sets', maintaining
@@ -1103,35 +1144,15 @@ void BufferAssigner::AssignColocatedBufferSets(
 }
 
 StatusOr<std::unique_ptr<BufferAssignment>> BufferAssigner::CreateAssignment(
-    const HloModule* module, std::unique_ptr<HloOrdering> hlo_ordering,
-    const std::vector<const HloInstruction*>* hlos_to_allocate) {
+    const HloModule* module, std::unique_ptr<HloOrdering> hlo_ordering) {
   TF_ASSIGN_OR_RETURN(std::unique_ptr<BufferLiveness> liveness,
                       BufferLiveness::Run(module, std::move(hlo_ordering)));
 
-  std::vector<const HloComputation*> thread_local_computations;
-  std::vector<const HloComputation*> global_computations;
   VLOG(1) << "Assigning buffers to module " << module->name();
-  if (hlos_to_allocate != nullptr) {
-    VLOG(3) << "LogicalBuffer assignment restricted to hlos: ";
-    for (auto hlo : *hlos_to_allocate) {
-      VLOG(3) << "  " << hlo->parent()->name() << "::" << hlo->name();
-    }
-  }
-  XLA_VLOG_LINES(3, module->ToString());
+  XLA_VLOG_LINES(2, module->ToString());
   XLA_VLOG_LINES(3, liveness->ToString());
   XLA_VLOG_LINES(3, liveness->points_to_analysis().ToString());
 
-  TF_RETURN_IF_ERROR(GatherComputationsByAllocationType(
-      module, &thread_local_computations, &global_computations));
-
-  // Set of HLO's to allocate if hlos_to_allocate is given. Passed as a set to
-  // AssignBuffersForComputation for fast membership testing.
-  std::unique_ptr<FlatSet<const HloInstruction*>> hlo_set;
-  if (hlos_to_allocate != nullptr) {
-    hlo_set = MakeUnique<FlatSet<const HloInstruction*>>(
-        hlos_to_allocate->begin(), hlos_to_allocate->end());
-  }
-
   // Can't use MakeUnique because BufferAssignment constructor is private.
   std::unique_ptr<BufferAssignment> assignment(
       new BufferAssignment(module, std::move(liveness), alignment_));
@@ -1148,16 +1169,38 @@ StatusOr<std::unique_ptr<BufferAssignment>> BufferAssigner::CreateAssignment(
   AssignColocatedBufferSets(colocated_buffer_sets, assignment.get(),
                             &colocated_buffers, &colocated_allocations);
 
+  std::vector<const HloComputation*> thread_local_computations;
+  std::vector<const HloComputation*> global_computations;
+  TF_RETURN_IF_ERROR(GatherComputationsByAllocationType(
+      module, &thread_local_computations, &global_computations));
+
+  // First assign buffers for global computatations. Temporary buffers for
+  // sequential computations are collected in 'buffers_to_assign_sequentially'.
+  FlatMap<const HloComputation*, FlatSet<const LogicalBuffer*>>
+      buffers_to_assign_sequentially;
   for (auto* computation : global_computations) {
     TF_RETURN_IF_ERROR(AssignBuffersForComputation(
-        computation, /*is_thread_local=*/false, hlo_set.get(),
-        colocated_buffers, colocated_allocations, assignment.get()));
+        computation, /*is_thread_local=*/false, colocated_buffers,
+        colocated_allocations, &buffers_to_assign_sequentially,
+        assignment.get()));
   }
+  // Assign buffers with sequential ordering, if any. If all global computations
+  // are sequential, we can run heap simuation on the whole module, which
+  // reduces memory usage.
+  const bool run_whole_module_heap_simulation =
+      buffers_to_assign_sequentially.size() == global_computations.size();
+  TF_RETURN_IF_ERROR(AssignBuffersWithSequentialOrdering(
+      buffers_to_assign_sequentially, run_whole_module_heap_simulation,
+      assignment.get()));
+
+  // Now assign buffers for thread-local computations. All LogicalBuffers get
+  // their own BufferAllocation.
   for (auto* computation : thread_local_computations) {
     TF_RET_CHECK(computation != module->entry_computation());
     TF_RETURN_IF_ERROR(AssignBuffersForComputation(
-        computation, /*is_thread_local=*/true, hlo_set.get(), colocated_buffers,
-        colocated_allocations, assignment.get()));
+        computation, /*is_thread_local=*/true, colocated_buffers,
+        colocated_allocations, /*buffers_to_assign_sequentially=*/nullptr,
+        assignment.get()));
   }
 
   // Mark all buffers which may be live out of the entry computation as
diff --git a/tensorflow/compiler/xla/service/buffer_assignment.h b/tensorflow/compiler/xla/service/buffer_assignment.h
index 34667c435d5448ab2a518733516e4a5140fb3dc4..9774a3174acfc7dcf219532a3c0eae22ad5f743c 100644
--- a/tensorflow/compiler/xla/service/buffer_assignment.h
+++ b/tensorflow/compiler/xla/service/buffer_assignment.h
@@ -23,6 +23,7 @@ limitations under the License.
 #include <vector>
 
 #include "tensorflow/compiler/xla/service/buffer_liveness.h"
+#include "tensorflow/compiler/xla/service/heap_simulator.h"
 #include "tensorflow/compiler/xla/service/hlo_computation.h"
 #include "tensorflow/compiler/xla/service/hlo_instruction.h"
 #include "tensorflow/compiler/xla/service/hlo_module.h"
@@ -354,6 +355,9 @@ class BufferAssignment {
   void AddAssignment(BufferAllocation* allocation, const LogicalBuffer& buffer,
                      int64 offset, int64 size);
 
+  // Returns the HloModule used to construct this assignment.
+  const HloModule& module() { return *module_; }
+
   // Returns the BufferLiveness object used to construct this assignment.
   const BufferLiveness& liveness() { return *liveness_; }
 
@@ -396,13 +400,10 @@ class BufferAssigner {
   // Build and return a BufferAssignment for the given module. The given
   // HloOrdering is used to determine buffer liveness. buffer_size is a function
   // which returns the size of a LogicalBuffer. Alignment is the the minimum
-  // alignment of any buffer. If hlos_to_allocate is not null then only
-  // instructions in this vector are considered for buffer assignment. If
-  // hlos_to_allocate is null then all instructions are considered.
+  // alignment of any buffer.
   static StatusOr<std::unique_ptr<BufferAssignment>> Run(
       const HloModule* module, std::unique_ptr<HloOrdering> hlo_ordering,
-      LogicalBuffer::SizeFunction buffer_size, int64 alignment,
-      const std::vector<const HloInstruction*>* hlos_to_allocate = nullptr);
+      LogicalBuffer::SizeFunction buffer_size, int64 alignment);
 
  private:
   explicit BufferAssigner(LogicalBuffer::SizeFunction buffer_size,
@@ -412,29 +413,38 @@ class BufferAssigner {
 
   // Create a buffer assignment.
   StatusOr<std::unique_ptr<BufferAssignment>> CreateAssignment(
-      const HloModule* module, std::unique_ptr<HloOrdering> hlo_ordering,
-      const std::vector<const HloInstruction*>* hlos_to_allocate = nullptr);
+      const HloModule* module, std::unique_ptr<HloOrdering> hlo_ordering);
 
   // Assigns buffers to the instructions in the given computation. "assignment"
   // is modified to reflect the new buffer assignments. If is_thread_local is
   // true, then all assigned buffers have the is_thread_local flag set to
-  // true. If hlos_to_allocate is not null it indicates which HLOs to include in
-  // buffer assignment. If null, all instructions in the computation are
-  // included.
+  // true.
   Status AssignBuffersForComputation(
       const HloComputation* computation, bool is_thread_local,
-      const tensorflow::gtl::FlatSet<const HloInstruction*>* hlos_to_allocate,
       const tensorflow::gtl::FlatSet<const LogicalBuffer*>& colocated_buffers,
       const tensorflow::gtl::FlatSet<BufferAllocation::Index>&
           colocated_allocations,
+      tensorflow::gtl::FlatMap<const HloComputation*,
+                               tensorflow::gtl::FlatSet<const LogicalBuffer*>>*
+          buffers_to_assign_sequentially,
       BufferAssignment* assignment);
 
-  // Assigns 'buffers_to_assign' assuming the HLO instructions will be executed
-  // in the given 'sequential_order'.
+  // Assigns 'buffers_to_assign_sequentially' using heap simulation, assuming
+  // the HLO instructions will be executed in the sequential order given by
+  // assignment->liveness().hlo_ordering().SequentialOrder. If
+  // 'run_whole_module_heap_simulation' is true, the heap simulation will be run
+  // assuming all global computations are sequentially ordered.
   Status AssignBuffersWithSequentialOrdering(
-      const std::vector<const HloInstruction*>& sequential_order,
-      const tensorflow::gtl::FlatSet<const LogicalBuffer*>& buffers_to_assign,
-      const HloComputation& computation, BufferAssignment* assignment);
+      const tensorflow::gtl::FlatMap<
+          const HloComputation*,
+          tensorflow::gtl::FlatSet<const LogicalBuffer*>>&
+          buffers_to_assign_sequentially,
+      bool run_whole_module_heap_simulation, BufferAssignment* assignment);
+
+  // Uses the results of the heap simulator to create a single allocation, with
+  // LogicalBuffers packed to specific offsets.
+  void AssignBuffersFromHeapSimulator(const HeapSimulator::Result& result,
+                                      BufferAssignment* assignment);
 
   // Tries to assign the given instruction to the given buffer. Returns if the
   // assignment was successful.
@@ -477,8 +487,6 @@ class BufferAssigner {
       const HloComputation& computation, const BufferLiveness& buffer_liveness,
       std::vector<ColocatedBufferSet>* colocated_buffer_sets);
 
-  const HloModule* module_;
-
   // Function which returns the buffer size for a given logical buffer (shape).
   LogicalBuffer::SizeFunction buffer_size_;
 
diff --git a/tensorflow/compiler/xla/service/buffer_assignment_test.cc b/tensorflow/compiler/xla/service/buffer_assignment_test.cc
index 0d6e89c5c6a4fbe2c7ed1acabcd743939faedc3a..ac1d769010c55ee4430554abe3205391bee5ebf1 100644
--- a/tensorflow/compiler/xla/service/buffer_assignment_test.cc
+++ b/tensorflow/compiler/xla/service/buffer_assignment_test.cc
@@ -856,8 +856,7 @@ TEST_F(BufferAssignmentTest, EmbeddedComputationBuffers) {
   EXPECT_FALSE(map_root_alloc.maybe_live_out());
   EXPECT_TRUE(map_root_alloc.is_thread_local());
 
-  // Allocations for the call computation should not be thread-local and not
-  // live-out.
+  // Allocations for the call computation should not be thread-local.
   auto& call_param_alloc = GetTopLevelAllocation(*assignment, call_param);
   EXPECT_FALSE(call_param_alloc.is_entry_computation_parameter());
   EXPECT_FALSE(call_param_alloc.maybe_live_out());
@@ -865,7 +864,6 @@ TEST_F(BufferAssignmentTest, EmbeddedComputationBuffers) {
 
   auto& call_root_alloc = GetTopLevelAllocation(*assignment, call_root);
   EXPECT_FALSE(call_root_alloc.is_entry_computation_parameter());
-  EXPECT_FALSE(call_root_alloc.maybe_live_out());
   EXPECT_FALSE(call_root_alloc.is_thread_local());
 
   // Entry computation allocations can be marked liveout and
@@ -1445,8 +1443,7 @@ TEST_F(BufferAssignmentTest, TwoCalls) {
     FlattenCallGraph flatten;
     TF_ASSIGN_OR_ASSERT_OK(bool result, flatten.Run(module.get()));
     EXPECT_TRUE(result);
-    TF_ASSIGN_OR_ASSERT_OK(std::unique_ptr<CallGraph> call_graph,
-                           CallGraph::Build(module.get()));
+    std::unique_ptr<CallGraph> call_graph = CallGraph::Build(module.get());
   }
 
   RunCopyInsertion(module.get());
diff --git a/tensorflow/compiler/xla/service/buffer_liveness.cc b/tensorflow/compiler/xla/service/buffer_liveness.cc
index 38c2c8155186877355920042c63b52bf7192c1f6..3be4810490561808df2b34e341cfbd04928f8585 100644
--- a/tensorflow/compiler/xla/service/buffer_liveness.cc
+++ b/tensorflow/compiler/xla/service/buffer_liveness.cc
@@ -45,9 +45,7 @@ StatusOr<std::unique_ptr<BufferLiveness>> BufferLiveness::Run(
 }
 
 tensorflow::Status BufferLiveness::Analyze() {
-  TF_ASSIGN_OR_RETURN(points_to_analysis_,
-                      TuplePointsToAnalysis::Run(
-                          module_, /*include_loop_fusion_instructions=*/true));
+  TF_ASSIGN_OR_RETURN(points_to_analysis_, TuplePointsToAnalysis::Run(module_));
   for (auto& computation : module_->computations()) {
     // Gather all instructions whose buffers might alias other instructions into
     // the set aliased_buffers_.  This includes those contained as a tuple
@@ -117,11 +115,7 @@ bool BufferLiveness::live_range_strictly_before(const LogicalBuffer& a,
 
   // If 'b' is a user of 'a' then the buffers interfere unless 'a.instruction'
   // and 'b.instruction' emit the same shape/layout, and 'b.instruction' meets
-  // one of following qualifications:
-  // *) Is element-wise.
-  // *) Is a loop fusion instruction (with DynamicUpdateSlice fused root) where
-  //    the singleton use of 'a' at 'a.index' is the fused root at operand 0.
-  // *) Use of 'operand' is DynamicUpdateSlice at operand index 0.
+  // the qualifications specified in CanShareOperandBufferWithUser.
   for (const BufferAlias& alias : points_to_analysis_->GetBufferAliases(a)) {
     if (b.instruction()->IsUserOf(alias.instruction()) &&
         !CanShareOperandBufferWithUser(alias.instruction(), alias.index(),
diff --git a/tensorflow/compiler/xla/service/call_graph.cc b/tensorflow/compiler/xla/service/call_graph.cc
index 57d69f5b71b336aba5bb9a8105b66ae5a5baa50a..fa7b2a309525dd80d655e10474c5d49f9da14ea8 100644
--- a/tensorflow/compiler/xla/service/call_graph.cc
+++ b/tensorflow/compiler/xla/service/call_graph.cc
@@ -98,12 +98,12 @@ void CallGraphNode::AddCallerCallSite(const CallSite& caller_callsite) {
   }
 }
 
-Status CallGraphNode::AddCallSiteForInstruction(HloInstruction* instruction) {
-  TF_RET_CHECK(instruction->parent() == computation());
+void CallGraphNode::AddCallSiteForInstruction(HloInstruction* instruction) {
+  CHECK_EQ(instruction->parent(), computation());
   const CallContext context = GetInstructionCallContext(instruction);
   if (!instruction->called_computations().empty()) {
-    TF_RET_CHECK(context == CallContext::kSequential ||
-                 context == CallContext::kParallel);
+    CHECK(context == CallContext::kSequential ||
+          context == CallContext::kParallel);
     callsite_instructions_.insert({instruction, callsites_.size()});
     callsites_.push_back(
         CallSite(instruction, instruction->called_computations(), context));
@@ -116,22 +116,21 @@ Status CallGraphNode::AddCallSiteForInstruction(HloInstruction* instruction) {
       }
     }
   }
-  return Status::OK();
 }
 
 CallGraph::CallGraph(const HloModule* module) : module_(module) {}
 
-StatusOr<const CallGraphNode*> CallGraph::GetNode(
+const CallGraphNode& CallGraph::GetNode(
     const HloComputation* computation) const {
   auto it = node_indices_.find(computation);
-  TF_RET_CHECK(it != node_indices_.end());
-  return &nodes_[it->second];
+  CHECK(it != node_indices_.end());
+  return nodes_[it->second];
 }
 
-StatusOr<CallGraphNode*> CallGraph::GetNode(const HloComputation* computation) {
+CallGraphNode& CallGraph::GetNode(const HloComputation* computation) {
   auto it = node_indices_.find(computation);
-  TF_RET_CHECK(it != node_indices_.end());
-  return &nodes_[it->second];
+  CHECK(it != node_indices_.end());
+  return nodes_[it->second];
 }
 
 namespace {
@@ -154,17 +153,17 @@ CallContext UnionContexts(CallContext a, CallContext b) {
 
 }  // namespace
 
-Status CallGraph::SetCallContexts() {
+void CallGraph::SetCallContexts() {
   std::queue<CallGraphNode*> worklist;
 
   // Initialize worklist with all roots of the call graph (computations without
   // callers).
   for (const std::unique_ptr<HloComputation>& computation :
        module_->computations()) {
-    TF_ASSIGN_OR_RETURN(CallGraphNode * node, GetNode(computation.get()));
-    if (node->callers().empty()) {
-      node->set_context(CallContext::kSequential);
-      worklist.push(node);
+    CallGraphNode& node = GetNode(computation.get());
+    if (node.callers().empty()) {
+      node.set_context(CallContext::kSequential);
+      worklist.push(&node);
     }
   }
 
@@ -174,7 +173,7 @@ Status CallGraph::SetCallContexts() {
 
     for (const CallSite& callsite : node->callsites()) {
       for (const HloComputation* callee : callsite.called_computations()) {
-        TF_ASSIGN_OR_RETURN(CallGraphNode * callee_node, GetNode(callee));
+        CallGraphNode& callee_node = GetNode(callee);
 
         // Update context of callee computation based on the callsite and its
         // current context.
@@ -182,16 +181,16 @@ Status CallGraph::SetCallContexts() {
         if (callsite.context() == CallContext::kParallel) {
           context_to_add = CallContext::kParallel;
         } else {
-          TF_RET_CHECK(callsite.context() == CallContext::kSequential);
+          CHECK_EQ(callsite.context(), CallContext::kSequential);
           context_to_add = node->context();
         }
         CallContext new_context =
-            UnionContexts(context_to_add, callee_node->context());
+            UnionContexts(context_to_add, callee_node.context());
 
-        if (new_context != callee_node->context()) {
+        if (new_context != callee_node.context()) {
           // Context of computation has been changed so add node to worklist.
-          callee_node->set_context(new_context);
-          worklist.push(callee_node);
+          callee_node.set_context(new_context);
+          worklist.push(&callee_node);
         }
       }
     }
@@ -200,14 +199,12 @@ Status CallGraph::SetCallContexts() {
   // No node should have a kNone calling context.
   for (const std::unique_ptr<HloComputation>& computation :
        module_->computations()) {
-    TF_ASSIGN_OR_RETURN(CallGraphNode * node, GetNode(computation.get()));
-    TF_RET_CHECK(node->context() != CallContext::kNone);
+    CHECK_NE(GetNode(computation.get()).context(), CallContext::kNone);
   }
-  return Status::OK();
 }
 
 /* static */
-StatusOr<std::unique_ptr<CallGraph>> CallGraph::Build(const HloModule* module) {
+std::unique_ptr<CallGraph> CallGraph::Build(const HloModule* module) {
   // Constructor for CallGraph is private so MakeUnique can't be used.
   auto call_graph = WrapUnique<CallGraph>(new CallGraph(module));
 
@@ -221,54 +218,49 @@ StatusOr<std::unique_ptr<CallGraph>> CallGraph::Build(const HloModule* module) {
         {computation.get(), call_graph->nodes_.size()});
     // All computations should be unique, so the computation should not already
     // exist in the map.
-    TF_RET_CHECK(it_added.second);
+    CHECK(it_added.second);
     call_graph->nodes_.emplace_back(computation.get());
 
     // Add all callsites in this computation.
     for (const std::unique_ptr<HloInstruction>& instruction :
          computation->instructions()) {
-      TF_RETURN_IF_ERROR(call_graph->nodes_.back().AddCallSiteForInstruction(
-          instruction.get()));
+      call_graph->nodes_.back().AddCallSiteForInstruction(instruction.get());
     }
   }
 
   // Add caller callsites to each node.
   for (const std::unique_ptr<HloComputation>& computation :
        module->computations()) {
-    TF_ASSIGN_OR_RETURN(CallGraphNode * caller_node,
-                        call_graph->GetNode(computation.get()));
-    for (const CallSite& callsite : caller_node->callsites()) {
+    for (const CallSite& callsite :
+         call_graph->GetNode(computation.get()).callsites()) {
       for (auto* callee : callsite.called_computations()) {
         // Add caller callsites.
-        TF_ASSIGN_OR_RETURN(CallGraphNode * callee_node,
-                            call_graph->GetNode(callee));
-        callee_node->AddCallerCallSite(callsite);
+        call_graph->GetNode(callee).AddCallerCallSite(callsite);
       }
     }
   }
 
-  TF_RETURN_IF_ERROR(call_graph->SetCallContexts());
-
+  call_graph->SetCallContexts();
   XLA_VLOG_LINES(1, call_graph->ToString());
 
-  return std::move(call_graph);
+  return call_graph;
 }
 
 Status CallGraph::VisitNodesInternal(
-    const VisitorFunction& visitor_func, const CallGraphNode* node,
+    const VisitorFunction& visitor_func, const CallGraphNode& node,
     tensorflow::gtl::FlatSet<const CallGraphNode*>* visited) const {
-  auto pair = visited->insert(node);
+  auto pair = visited->insert(&node);
   if (!pair.second) {
     // Node was not inserted. Node has already been visited.
     return Status::OK();
   }
 
-  for (const HloComputation* computation : node->callees()) {
-    TF_ASSIGN_OR_RETURN(const CallGraphNode* callee_node, GetNode(computation));
-    TF_RETURN_IF_ERROR(VisitNodesInternal(visitor_func, callee_node, visited));
+  for (const HloComputation* computation : node.callees()) {
+    TF_RETURN_IF_ERROR(
+        VisitNodesInternal(visitor_func, GetNode(computation), visited));
   }
 
-  return visitor_func(*node);
+  return visitor_func(node);
 }
 
 Status CallGraph::VisitNodes(const VisitorFunction& visitor_func,
@@ -278,14 +270,13 @@ Status CallGraph::VisitNodes(const VisitorFunction& visitor_func,
     // Traverse from all roots in the call graph.
     for (const CallGraphNode& node : nodes()) {
       if (node.callers().empty()) {
-        TF_RETURN_IF_ERROR(VisitNodesInternal(visitor_func, &node, &visited));
+        TF_RETURN_IF_ERROR(VisitNodesInternal(visitor_func, node, &visited));
       }
     }
   } else {
     // Traverse only from the entry computation.
-    TF_ASSIGN_OR_RETURN(const CallGraphNode* entry_node,
-                        GetNode(module_->entry_computation()));
-    TF_RETURN_IF_ERROR(VisitNodesInternal(visitor_func, entry_node, &visited));
+    TF_RETURN_IF_ERROR(VisitNodesInternal(
+        visitor_func, GetNode(module_->entry_computation()), &visited));
   }
 
   return Status::OK();
diff --git a/tensorflow/compiler/xla/service/call_graph.h b/tensorflow/compiler/xla/service/call_graph.h
index 62d12f8f91b099b452143c98427fdd1e6867ac7d..7f9990f06d4fee4c52fa516fc2f6031f5dab2bb9 100644
--- a/tensorflow/compiler/xla/service/call_graph.h
+++ b/tensorflow/compiler/xla/service/call_graph.h
@@ -23,7 +23,6 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/hlo_computation.h"
 #include "tensorflow/compiler/xla/service/hlo_instruction.h"
 #include "tensorflow/compiler/xla/service/hlo_module.h"
-#include "tensorflow/compiler/xla/statusor.h"
 #include "tensorflow/core/lib/gtl/flatmap.h"
 #include "tensorflow/core/lib/gtl/flatset.h"
 
@@ -138,7 +137,7 @@ class CallGraphNode {
   // If instruction calls any computations adds a call site for this instruction
   // to the call graph node. If the instruction calls no computations then no
   // call site is added.
-  Status AddCallSiteForInstruction(HloInstruction* instruction);
+  void AddCallSiteForInstruction(HloInstruction* instruction);
 
   // Computation represented by this call graph node.
   HloComputation* computation_;
@@ -174,12 +173,11 @@ class CallGraph {
   using VisitorFunction = std::function<Status(const CallGraphNode&)>;
 
   // Builds and returns a call graph for the given HLO module.
-  static StatusOr<std::unique_ptr<CallGraph>> Build(const HloModule* module);
+  static std::unique_ptr<CallGraph> Build(const HloModule* module);
 
   // Returns the node associated with the given computation.
-  StatusOr<const CallGraphNode*> GetNode(
-      const HloComputation* computation) const;
-  StatusOr<CallGraphNode*> GetNode(const HloComputation* computation);
+  const CallGraphNode& GetNode(const HloComputation* computation) const;
+  CallGraphNode& GetNode(const HloComputation* computation);
 
   // Returns the vector of all nodes in the call graph.
   const std::vector<CallGraphNode>& nodes() const { return nodes_; }
@@ -197,14 +195,14 @@ class CallGraph {
   CallGraph(const HloModule* module);
 
   // Sets the call contexts for every node in the graph.
-  Status SetCallContexts();
+  void SetCallContexts();
 
   // Helper method for VisitNodes(). Traverses the call graph from 'node' in DFS
   // post order (callee before caller) calling visitor_func on each node. Adds
   // nodes to 'visited' as each node is visited. Skips nodes already in
   // 'visited'.
   Status VisitNodesInternal(
-      const VisitorFunction& visitor_func, const CallGraphNode* node,
+      const VisitorFunction& visitor_func, const CallGraphNode& node,
       tensorflow::gtl::FlatSet<const CallGraphNode*>* visited) const;
 
   // The HLO module represented by this call graph.
diff --git a/tensorflow/compiler/xla/service/call_graph_test.cc b/tensorflow/compiler/xla/service/call_graph_test.cc
index f71a5d01afa20e6c4e86ad8ef7a3a68c5e23e210..ab0ea47d024d871be88bfcab957810deb1ecac99 100644
--- a/tensorflow/compiler/xla/service/call_graph_test.cc
+++ b/tensorflow/compiler/xla/service/call_graph_test.cc
@@ -95,17 +95,15 @@ TEST_F(CallGraphTest, SingletonComputation) {
   HloModule module(TestName());
   HloComputation* computation =
       module.AddEntryComputation(MakeScalarComputation());
-  TF_ASSIGN_OR_ASSERT_OK(std::unique_ptr<CallGraph> call_graph,
-                         CallGraph::Build(&module));
+  std::unique_ptr<CallGraph> call_graph = CallGraph::Build(&module);
   EXPECT_EQ(1, call_graph->nodes().size());
-  TF_ASSIGN_OR_ASSERT_OK(const CallGraphNode* node,
-                         call_graph->GetNode(computation));
-  EXPECT_EQ(computation, node->computation());
-  EXPECT_TRUE(node->callsites().empty());
-  EXPECT_TRUE(node->callees().empty());
-  EXPECT_TRUE(node->caller_callsites().empty());
-  EXPECT_TRUE(node->callers().empty());
-  EXPECT_EQ(CallContext::kSequential, node->context());
+  const CallGraphNode& node = call_graph->GetNode(computation);
+  EXPECT_EQ(computation, node.computation());
+  EXPECT_TRUE(node.callsites().empty());
+  EXPECT_TRUE(node.callees().empty());
+  EXPECT_TRUE(node.caller_callsites().empty());
+  EXPECT_TRUE(node.callers().empty());
+  EXPECT_EQ(CallContext::kSequential, node.context());
 }
 
 TEST_F(CallGraphTest, UnreachableComputation) {
@@ -117,19 +115,17 @@ TEST_F(CallGraphTest, UnreachableComputation) {
   HloComputation* unreachable_computation =
       module.AddEmbeddedComputation(MakeScalarComputation());
 
-  TF_ASSIGN_OR_ASSERT_OK(std::unique_ptr<CallGraph> call_graph,
-                         CallGraph::Build(&module));
+  std::unique_ptr<CallGraph> call_graph = CallGraph::Build(&module);
   EXPECT_EQ(2, call_graph->nodes().size());
 
-  TF_ASSIGN_OR_ASSERT_OK(const CallGraphNode* entry_node,
-                         call_graph->GetNode(entry_computation));
-  EXPECT_EQ(entry_computation, entry_node->computation());
-  EXPECT_EQ(CallContext::kSequential, entry_node->context());
+  const CallGraphNode& entry_node = call_graph->GetNode(entry_computation);
+  EXPECT_EQ(entry_computation, entry_node.computation());
+  EXPECT_EQ(CallContext::kSequential, entry_node.context());
 
-  TF_ASSIGN_OR_ASSERT_OK(const CallGraphNode* unreachable_node,
-                         call_graph->GetNode(unreachable_computation));
-  EXPECT_EQ(unreachable_computation, unreachable_node->computation());
-  EXPECT_EQ(CallContext::kSequential, unreachable_node->context());
+  const CallGraphNode& unreachable_node =
+      call_graph->GetNode(unreachable_computation);
+  EXPECT_EQ(unreachable_computation, unreachable_node.computation());
+  EXPECT_EQ(CallContext::kSequential, unreachable_node.context());
 }
 
 TEST_F(CallGraphTest, ParallelComputation) {
@@ -141,27 +137,24 @@ TEST_F(CallGraphTest, ParallelComputation) {
   HloComputation* entry_computation = module.AddEntryComputation(
       MakeMappingComputation(map_computation, /*callsites=*/5));
 
-  TF_ASSIGN_OR_ASSERT_OK(std::unique_ptr<CallGraph> call_graph,
-                         CallGraph::Build(&module));
+  std::unique_ptr<CallGraph> call_graph = CallGraph::Build(&module);
   EXPECT_EQ(2, call_graph->nodes().size());
 
-  TF_ASSIGN_OR_ASSERT_OK(const CallGraphNode* entry_node,
-                         call_graph->GetNode(entry_computation));
-  EXPECT_EQ(entry_computation, entry_node->computation());
-  EXPECT_EQ(CallContext::kSequential, entry_node->context());
-  EXPECT_EQ(5, entry_node->callsites().size());
-  EXPECT_EQ(1, entry_node->callees().size());
-  EXPECT_TRUE(entry_node->caller_callsites().empty());
-  EXPECT_TRUE(entry_node->callers().empty());
-
-  TF_ASSIGN_OR_ASSERT_OK(const CallGraphNode* map_node,
-                         call_graph->GetNode(map_computation));
-  EXPECT_EQ(map_computation, map_node->computation());
-  EXPECT_EQ(CallContext::kParallel, map_node->context());
-  EXPECT_TRUE(map_node->callsites().empty());
-  EXPECT_TRUE(map_node->callees().empty());
-  EXPECT_EQ(5, map_node->caller_callsites().size());
-  EXPECT_EQ(1, map_node->callers().size());
+  const CallGraphNode& entry_node = call_graph->GetNode(entry_computation);
+  EXPECT_EQ(entry_computation, entry_node.computation());
+  EXPECT_EQ(CallContext::kSequential, entry_node.context());
+  EXPECT_EQ(5, entry_node.callsites().size());
+  EXPECT_EQ(1, entry_node.callees().size());
+  EXPECT_TRUE(entry_node.caller_callsites().empty());
+  EXPECT_TRUE(entry_node.callers().empty());
+
+  const CallGraphNode& map_node = call_graph->GetNode(map_computation);
+  EXPECT_EQ(map_computation, map_node.computation());
+  EXPECT_EQ(CallContext::kParallel, map_node.context());
+  EXPECT_TRUE(map_node.callsites().empty());
+  EXPECT_TRUE(map_node.callees().empty());
+  EXPECT_EQ(5, map_node.caller_callsites().size());
+  EXPECT_EQ(1, map_node.callers().size());
 }
 
 TEST_F(CallGraphTest, SequentialComputations) {
@@ -173,27 +166,24 @@ TEST_F(CallGraphTest, SequentialComputations) {
   HloComputation* entry_computation = module.AddEntryComputation(
       MakeCallingComputation(called_computation, /*callsites=*/3));
 
-  TF_ASSIGN_OR_ASSERT_OK(std::unique_ptr<CallGraph> call_graph,
-                         CallGraph::Build(&module));
+  std::unique_ptr<CallGraph> call_graph = CallGraph::Build(&module);
   EXPECT_EQ(2, call_graph->nodes().size());
 
-  TF_ASSIGN_OR_ASSERT_OK(const CallGraphNode* entry_node,
-                         call_graph->GetNode(entry_computation));
-  EXPECT_EQ(entry_computation, entry_node->computation());
-  EXPECT_EQ(CallContext::kSequential, entry_node->context());
-  EXPECT_EQ(3, entry_node->callsites().size());
-  EXPECT_EQ(1, entry_node->callees().size());
-  EXPECT_TRUE(entry_node->caller_callsites().empty());
-  EXPECT_TRUE(entry_node->callers().empty());
-
-  TF_ASSIGN_OR_ASSERT_OK(const CallGraphNode* called_node,
-                         call_graph->GetNode(called_computation));
-  EXPECT_EQ(called_computation, called_node->computation());
-  EXPECT_EQ(CallContext::kSequential, called_node->context());
-  EXPECT_TRUE(called_node->callsites().empty());
-  EXPECT_TRUE(called_node->callees().empty());
-  EXPECT_EQ(3, called_node->caller_callsites().size());
-  EXPECT_EQ(1, called_node->callers().size());
+  const CallGraphNode& entry_node = call_graph->GetNode(entry_computation);
+  EXPECT_EQ(entry_computation, entry_node.computation());
+  EXPECT_EQ(CallContext::kSequential, entry_node.context());
+  EXPECT_EQ(3, entry_node.callsites().size());
+  EXPECT_EQ(1, entry_node.callees().size());
+  EXPECT_TRUE(entry_node.caller_callsites().empty());
+  EXPECT_TRUE(entry_node.callers().empty());
+
+  const CallGraphNode& called_node = call_graph->GetNode(called_computation);
+  EXPECT_EQ(called_computation, called_node.computation());
+  EXPECT_EQ(CallContext::kSequential, called_node.context());
+  EXPECT_TRUE(called_node.callsites().empty());
+  EXPECT_TRUE(called_node.callees().empty());
+  EXPECT_EQ(3, called_node.caller_callsites().size());
+  EXPECT_EQ(1, called_node.callers().size());
 }
 
 TEST_F(CallGraphTest, ContextBothComputations) {
@@ -213,32 +203,29 @@ TEST_F(CallGraphTest, ContextBothComputations) {
   HloComputation* entry_computation =
       module.AddEntryComputation(builder.Build());
 
-  TF_ASSIGN_OR_ASSERT_OK(std::unique_ptr<CallGraph> call_graph,
-                         CallGraph::Build(&module));
+  std::unique_ptr<CallGraph> call_graph = CallGraph::Build(&module);
   EXPECT_EQ(2, call_graph->nodes().size());
 
-  TF_ASSIGN_OR_ASSERT_OK(const CallGraphNode* entry_node,
-                         call_graph->GetNode(entry_computation));
-  EXPECT_EQ(entry_computation, entry_node->computation());
-  EXPECT_EQ(2, entry_node->callsites().size());
+  const CallGraphNode& entry_node = call_graph->GetNode(entry_computation);
+  EXPECT_EQ(entry_computation, entry_node.computation());
+  EXPECT_EQ(2, entry_node.callsites().size());
 
-  const CallSite& call_callsite = entry_node->callsites()[0];
+  const CallSite& call_callsite = entry_node.callsites()[0];
   EXPECT_EQ(call, call_callsite.instruction());
   EXPECT_THAT(call_callsite.called_computations(),
               UnorderedElementsAre(subcomputation));
   EXPECT_EQ(CallContext::kSequential, call_callsite.context());
-  EXPECT_EQ(entry_node->GetCallSite(call), &call_callsite);
+  EXPECT_EQ(entry_node.GetCallSite(call), &call_callsite);
 
-  const CallSite& map_callsite = entry_node->callsites()[1];
+  const CallSite& map_callsite = entry_node.callsites()[1];
   EXPECT_EQ(map, map_callsite.instruction());
   EXPECT_THAT(map_callsite.called_computations(),
               UnorderedElementsAre(subcomputation));
   EXPECT_EQ(CallContext::kParallel, map_callsite.context());
-  EXPECT_EQ(entry_node->GetCallSite(map), &map_callsite);
+  EXPECT_EQ(entry_node.GetCallSite(map), &map_callsite);
 
-  TF_ASSIGN_OR_ASSERT_OK(const CallGraphNode* sub_node,
-                         call_graph->GetNode(subcomputation));
-  EXPECT_EQ(CallContext::kBoth, sub_node->context());
+  const CallGraphNode& sub_node = call_graph->GetNode(subcomputation);
+  EXPECT_EQ(CallContext::kBoth, sub_node.context());
 }
 
 TEST_F(CallGraphTest, ComplexGraph) {
@@ -284,27 +271,24 @@ TEST_F(CallGraphTest, ComplexGraph) {
     entry_computation = module.AddEntryComputation(builder.Build());
   }
 
-  TF_ASSIGN_OR_ASSERT_OK(std::unique_ptr<CallGraph> call_graph,
-                         CallGraph::Build(&module));
+  std::unique_ptr<CallGraph> call_graph = CallGraph::Build(&module);
   EXPECT_EQ(5, call_graph->nodes().size());
 
   // Entry computation has one while instruction calling two computations
   // (cond_computation and a_computation).
-  TF_ASSIGN_OR_ASSERT_OK(const CallGraphNode* entry_node,
-                         call_graph->GetNode(entry_computation));
-  ASSERT_EQ(1, entry_node->callsites().size());
+  const CallGraphNode& entry_node = call_graph->GetNode(entry_computation);
+  ASSERT_EQ(1, entry_node.callsites().size());
   const std::vector<HloComputation*>& called_computations =
-      entry_node->callsites()[0].called_computations();
+      entry_node.callsites()[0].called_computations();
   EXPECT_THAT(called_computations,
               UnorderedElementsAre(cond_computation, a_computation));
-  EXPECT_EQ(CallContext::kSequential, entry_node->context());
+  EXPECT_EQ(CallContext::kSequential, entry_node.context());
 
-  TF_ASSIGN_OR_ASSERT_OK(const CallGraphNode* c_node,
-                         call_graph->GetNode(c_computation));
-  EXPECT_TRUE(c_node->callsites().empty());
-  EXPECT_THAT(c_node->callers(),
+  const CallGraphNode& c_node = call_graph->GetNode(c_computation);
+  EXPECT_TRUE(c_node.callsites().empty());
+  EXPECT_THAT(c_node.callers(),
               UnorderedElementsAre(a_computation, b_computation));
-  EXPECT_EQ(CallContext::kBoth, c_node->context());
+  EXPECT_EQ(CallContext::kBoth, c_node.context());
 
   // Visit the graph and verify nodes were visited in callee-before-caller
   // order.
@@ -337,8 +321,7 @@ TEST_F(CallGraphTest, VisitSingletonComputation) {
   HloModule module(TestName());
   HloComputation* computation =
       module.AddEntryComputation(MakeScalarComputation());
-  TF_ASSIGN_OR_ASSERT_OK(std::unique_ptr<CallGraph> call_graph,
-                         CallGraph::Build(&module));
+  std::unique_ptr<CallGraph> call_graph = CallGraph::Build(&module);
 
   std::vector<HloComputation*> visited;
   TF_ASSERT_OK(call_graph->VisitNodes([&visited](const CallGraphNode& node) {
@@ -355,8 +338,7 @@ TEST_F(CallGraphTest, VisitUnreachableComputation) {
       module.AddEntryComputation(MakeScalarComputation());
   HloComputation* unreachable_computation =
       module.AddEmbeddedComputation(MakeScalarComputation());
-  TF_ASSIGN_OR_ASSERT_OK(std::unique_ptr<CallGraph> call_graph,
-                         CallGraph::Build(&module));
+  std::unique_ptr<CallGraph> call_graph = CallGraph::Build(&module);
 
   // Test visitation of only reachable nodes.
   {
@@ -390,8 +372,7 @@ TEST_F(CallGraphTest, VisitWithError) {
   // Test that the call graph visitor properly propagates errors.
   HloModule module(TestName());
   module.AddEntryComputation(MakeScalarComputation());
-  TF_ASSIGN_OR_ASSERT_OK(std::unique_ptr<CallGraph> call_graph,
-                         CallGraph::Build(&module));
+  std::unique_ptr<CallGraph> call_graph = CallGraph::Build(&module);
 
   Status status = call_graph->VisitNodes(
       [](const CallGraphNode&) { return InternalError("Visitation failed"); });
diff --git a/tensorflow/compiler/xla/service/compile_only_service.cc b/tensorflow/compiler/xla/service/compile_only_service.cc
new file mode 100644
index 0000000000000000000000000000000000000000..ac1906c88c47a1efff305f2a45de66b84048af37
--- /dev/null
+++ b/tensorflow/compiler/xla/service/compile_only_service.cc
@@ -0,0 +1,131 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/service/compile_only_service.h"
+
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "tensorflow/compiler/xla/service/backend.h"
+#include "tensorflow/compiler/xla/service/computation_layout.h"
+#include "tensorflow/compiler/xla/service/computation_tracker.h"
+#include "tensorflow/compiler/xla/service/platform_util.h"
+#include "tensorflow/compiler/xla/status_macros.h"
+#include "tensorflow/compiler/xla/types.h"
+#include "tensorflow/compiler/xla/util.h"
+#include "tensorflow/core/lib/gtl/cleanup.h"
+#include "tensorflow/core/lib/strings/strcat.h"
+#include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/platform/stream_executor_no_cuda.h"
+
+namespace se = ::perftools::gputools;
+
+namespace xla {
+
+/* static */ StatusOr<std::unique_ptr<CompileOnlyService>>
+CompileOnlyService::NewService(perftools::gputools::Platform* platform) {
+  ServiceOptions default_options;
+  default_options.set_platform(platform);
+  return NewService(default_options);
+}
+
+/* static */ StatusOr<std::unique_ptr<CompileOnlyService>>
+CompileOnlyService::NewService(const ServiceOptions& options) {
+  perftools::gputools::Platform* platform = options.platform();
+  if (platform == nullptr) {
+    TF_ASSIGN_OR_RETURN(platform, PlatformUtil::GetDefaultPlatform());
+  }
+
+  TF_ASSIGN_OR_RETURN(auto compiler, Compiler::GetForPlatform(platform));
+
+  TF_ASSIGN_OR_RETURN(std::unique_ptr<Backend> compute_constant_backend,
+                      CreateComputeConstantBackend());
+  std::unique_ptr<CompileOnlyService> service(
+      new CompileOnlyService(compiler, std::move(compute_constant_backend)));
+  return std::move(service);
+}
+
+CompileOnlyService::CompileOnlyService(
+    Compiler* compiler, std::unique_ptr<Backend> compute_constant_backend)
+    : Service(/*backend=*/nullptr, std::move(compute_constant_backend)),
+      compiler_(compiler) {
+  runs_in_client_process_ = true;
+}
+
+StatusOr<std::vector<std::unique_ptr<AotCompilationResult>>>
+CompileOnlyService::CompileAheadOfTime(
+    const tensorflow::gtl::ArraySlice<AotComputationInstance> computations,
+    const AotCompilationOptions& options) {
+  std::vector<std::unique_ptr<HloModule>> hlo_modules;
+  std::vector<std::unique_ptr<HloModuleConfig>> module_configs;
+  for (const AotComputationInstance& instance : computations) {
+    TF_ASSIGN_OR_RETURN(UserComputation * user_computation,
+                        computation_tracker_.Resolve(instance.computation));
+    VersionedComputationHandle versioned_handle =
+        user_computation->GetVersionedHandle();
+
+    // Dump computation proto state if flag is set.
+    legacy_flags::ServiceFlags* flags = legacy_flags::GetServiceFlags();
+    const string& directory_path = flags->xla_dump_computations_to;
+    if (!directory_path.empty()) {
+      TF_ASSIGN_OR_RETURN(
+          std::unique_ptr<SessionModule> session_module,
+          computation_tracker_.SnapshotComputation(versioned_handle.handle));
+      string filename = tensorflow::strings::StrCat(
+          "computation_", versioned_handle.handle.handle(), "__",
+          session_module->entry().name(), "__version_",
+          versioned_handle.version);
+      TF_RETURN_IF_ERROR(Executable::DumpToDirectory(directory_path, filename,
+                                                     *session_module));
+    }
+
+    TF_ASSIGN_OR_RETURN(std::unique_ptr<HloModule> hlo_module,
+                        computation_tracker_.BuildHloModule(
+                            versioned_handle,
+                            /*include_unreachable_instructions=*/true));
+    hlo_modules.push_back(std::move(hlo_module));
+
+    TF_ASSIGN_OR_RETURN(
+        std::shared_ptr<const ProgramShape> program_shape,
+        user_computation->ComputeProgramShape(versioned_handle.version));
+
+    module_configs.push_back(MakeUnique<HloModuleConfig>(*program_shape));
+    HloModuleConfig* module_config = module_configs.back().get();
+    auto* computation_layout =
+        module_config->mutable_entry_computation_layout();
+    if (flags->xla_hlo_profile) {
+      module_config->enable_hlo_profiling(true);
+    }
+    for (int i = 0; i < instance.argument_layouts.size(); ++i) {
+      const Shape& argument_layout = *instance.argument_layouts[i];
+      if (ShapeUtil::IsTuple(argument_layout)) {
+        return Unimplemented("tuple arguments not supported yet");
+      }
+      TF_RETURN_IF_ERROR(
+          computation_layout->mutable_parameter_layout(i)->CopyLayoutFromShape(
+              argument_layout));
+    }
+    TF_RETURN_IF_ERROR(
+        computation_layout->mutable_result_layout()->CopyLayoutFromShape(
+            *instance.result_layout));
+  }
+
+  return compiler_->CompileAheadOfTime(std::move(hlo_modules),
+                                       std::move(module_configs),
+                                       MakeHloDumper(), options);
+}
+
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/service/compile_only_service.h b/tensorflow/compiler/xla/service/compile_only_service.h
new file mode 100644
index 0000000000000000000000000000000000000000..6dae49e3e1acf144847d44af4507880d8bf2efc4
--- /dev/null
+++ b/tensorflow/compiler/xla/service/compile_only_service.h
@@ -0,0 +1,125 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_COMPILE_ONLY_SERVICE_H_
+#define TENSORFLOW_COMPILER_XLA_SERVICE_COMPILE_ONLY_SERVICE_H_
+
+#include "tensorflow/compiler/xla/service/backend.h"
+#include "tensorflow/compiler/xla/service/compiler.h"
+#include "tensorflow/compiler/xla/service/service.h"
+#include "tensorflow/compiler/xla/statusor.h"
+#include "tensorflow/compiler/xla/xla_data.pb.h"
+#include "tensorflow/core/platform/stream_executor_no_cuda.h"
+
+namespace xla {
+
+// An XLA Service specialization for ahead-of-time compilation.  This only
+// instantiates a Compiler object for the relevant platform; it does not
+// instantiate or require an execution backend.
+class CompileOnlyService : public Service {
+ public:
+  // Factory for creating a CompileOnlyService. The parameter platform is the
+  // platform that the service should target. If platform is null then the
+  // default platform is used.
+  static StatusOr<std::unique_ptr<CompileOnlyService>> NewService(
+      perftools::gputools::Platform* platform);
+  static StatusOr<std::unique_ptr<CompileOnlyService>> NewService(
+      const ServiceOptions& options);
+
+  // A description of a computation to compile using CompileAheadOfTime.
+  struct AotComputationInstance {
+    ComputationHandle computation;
+    std::vector<const Shape*> argument_layouts;
+    const Shape* result_layout = nullptr;
+  };
+
+  // Compiles a list of computations for ahead-of-time execution.  This is
+  // intended for use in static compilation.  See
+  // |CompileOnlyClient::CompileAheadOfTime| for additional details.
+  StatusOr<std::vector<std::unique_ptr<AotCompilationResult>>>
+  CompileAheadOfTime(
+      const tensorflow::gtl::ArraySlice<AotComputationInstance> computations,
+      const AotCompilationOptions& Options);
+
+  // Override Service methods that require or imply the existence of an
+  // execute backend.  Note that this does not include TransferToClient and
+  // TransferToClientInProcess, as computing contants produces global data
+  // that we may wish to transfer.
+  tensorflow::Status Execute(const ExecuteRequest* arg,
+                             ExecuteResponse* result) override {
+    return Unimplemented("CompileOnlyService does not support execution.");
+  }
+  tensorflow::Status ExecuteParallel(const ExecuteParallelRequest* arg,
+                                     ExecuteParallelResponse* result) override {
+    return Unimplemented("CompileOnlyService does not support execution.");
+  }
+  tensorflow::Status GetDeviceHandles(
+      const GetDeviceHandlesRequest* arg,
+      GetDeviceHandlesResponse* result) override {
+    return Unimplemented("CompileOnlyService does not support devices.");
+  }
+  tensorflow::Status ExecuteAsync(const ExecuteAsyncRequest* arg,
+                                  ExecuteAsyncResponse* result) override {
+    return Unimplemented("CompileOnlyService does not support execution.");
+  }
+  tensorflow::Status WaitForExecution(
+      const WaitForExecutionRequest* arg,
+      WaitForExecutionResponse* result) override {
+    return Unimplemented("CompileOnlyService does not support execution.");
+  }
+  tensorflow::Status TransferToServer(
+      const TransferToServerRequest* arg,
+      TransferToServerResponse* result) override {
+    return Unimplemented(
+        "CompileOnlyService does not support device data transfers.");
+  }
+  tensorflow::Status TransferToInfeed(
+      const TransferToInfeedRequest* arg,
+      TransferToInfeedResponse* result) override {
+    return Unimplemented(
+        "CompileOnlyService does not support device data transfers.");
+  }
+  tensorflow::Status TransferFromOutfeed(
+      const TransferFromOutfeedRequest* arg,
+      TransferFromOutfeedResponse* result) override {
+    return Unimplemented(
+        "CompileOnlyService does not support device data transfers.");
+  }
+  tensorflow::Status TransferToServerInProcess(
+      const TransferToServerInProcessRequest* arg,
+      TransferToServerInProcessResponse* result) override {
+    return Unimplemented(
+        "CompileOnlyService does not support device data transfers.");
+  }
+  tensorflow::Status ResetDevice(const ResetDeviceRequest* arg,
+                                 ResetDeviceResponse* result) override {
+    return Unimplemented("CompileOnlyService does not support devices.");
+  }
+
+ private:
+  explicit CompileOnlyService(
+      Compiler* compiler, std::unique_ptr<Backend> compute_constant_backend);
+  CompileOnlyService(const CompileOnlyService&) = delete;
+  void operator=(const CompileOnlyService&) = delete;
+
+  // The compiler for the target platform.  This is included in place of
+  // the Service::execute_backend_'s compiler, since execute_backend_ is a
+  // nullptr in CompileOnlyService.
+  Compiler* compiler_;
+};
+
+}  // namespace xla
+
+#endif  // TENSORFLOW_COMPILER_XLA_SERVICE_COMPILE_ONLY_SERVICE_H_
diff --git a/tensorflow/compiler/xla/service/copy_insertion.cc b/tensorflow/compiler/xla/service/copy_insertion.cc
index 7db28aed3cd2045d6c1e94a390ce632bf3bbe9de..907b0307d4b61018814b02737fba4837c2e1d668 100644
--- a/tensorflow/compiler/xla/service/copy_insertion.cc
+++ b/tensorflow/compiler/xla/service/copy_insertion.cc
@@ -21,6 +21,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/hlo_instruction.h"
 #include "tensorflow/compiler/xla/service/hlo_module.h"
 #include "tensorflow/compiler/xla/service/hlo_opcode.h"
+#include "tensorflow/compiler/xla/service/liveness_util.h"
 #include "tensorflow/compiler/xla/service/logical_buffer.h"
 #include "tensorflow/compiler/xla/service/tuple_points_to_analysis.h"
 #include "tensorflow/compiler/xla/status_macros.h"
@@ -319,6 +320,7 @@ Status InstructionCopier::RecordIndicesWhichInterfereWithOtherInstruction(
           if (liveness.MayInterfere(*instruction_buffer, *other_buffer)) {
             VLOG(2) << "Adding copy of buffer for instruction: "
                     << instruction_->name()
+                    << " instruction_buffer: " << instruction_buffer->ToString()
                     << " at index: " << tensorflow::str_util::Join(index, ",")
                     << " because of interference with buffer: "
                     << other_buffer->ToString();
@@ -351,6 +353,11 @@ Status InstructionCopier::RecordControlPredecessors(
           for (const BufferAlias& alias :
                points_to_analysis.GetBufferAliases(*buffer)) {
             for (HloInstruction* user : alias.instruction()->users()) {
+              if (DoesNotUseOperandBuffer(alias.instruction(), alias.index(),
+                                          user, points_to_analysis)) {
+                continue;
+              }
+
               if (user != instruction_) {
                 control_predecessors_.mutable_element(index)->push_back(user);
               }
diff --git a/tensorflow/compiler/xla/service/cpu/BUILD b/tensorflow/compiler/xla/service/cpu/BUILD
index 6e24506b383c61a3e346d3e3250511cd6a2d4940..affb5f99066d8278c583c469d97e78646d52f3c6 100644
--- a/tensorflow/compiler/xla/service/cpu/BUILD
+++ b/tensorflow/compiler/xla/service/cpu/BUILD
@@ -53,7 +53,6 @@ cc_library(
         "//tensorflow/compiler/xla:util",
         "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/compiler/xla/legacy_flags:cpu_compiler_flags",
-        "//tensorflow/compiler/xla/port:initialize",
         "//tensorflow/compiler/xla/service:algebraic_simplifier",
         "//tensorflow/compiler/xla/service:buffer_assignment",
         "//tensorflow/compiler/xla/service:buffer_liveness",
@@ -98,6 +97,7 @@ cc_library(
     name = "simple_orc_jit",
     srcs = ["simple_orc_jit.cc"],
     hdrs = ["simple_orc_jit.h"],
+    linkopts = ["-ldl"],
     deps = [
         ":compiler_functor",
         ":cpu_runtime",
diff --git a/tensorflow/compiler/xla/service/cpu/conv_canonicalization_test.cc b/tensorflow/compiler/xla/service/cpu/conv_canonicalization_test.cc
index f717d57839f4cfc59121b9e8e39b5b9c63c9b60d..b42702dbe1abe3db838159bda2665743e416a2d5 100644
--- a/tensorflow/compiler/xla/service/cpu/conv_canonicalization_test.cc
+++ b/tensorflow/compiler/xla/service/cpu/conv_canonicalization_test.cc
@@ -20,6 +20,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/hlo_computation.h"
 #include "tensorflow/compiler/xla/service/hlo_instruction.h"
 #include "tensorflow/compiler/xla/service/hlo_module.h"
+#include "tensorflow/compiler/xla/test.h"
 #include "tensorflow/compiler/xla/tests/hlo_test_base.h"
 #include "tensorflow/compiler/xla/util.h"
 
@@ -28,6 +29,8 @@ limitations under the License.
 namespace xla {
 namespace cpu {
 
+using ::testing::ElementsAre;
+
 class ConvCanonicalizationTest : public HloTestBase {
  public:
   ConvCanonicalizationTest() {
@@ -96,14 +99,14 @@ TEST_F(ConvCanonicalizationTest, NonCanonicalToCanonical) {
 
   // The input is in CNHW order. input_reshape should produce
   // NHWC for the convolution to hit the Eigen fast path.
-  EXPECT_TRUE(ContainersEqual(input_reshape->dimensions(), {1, 2, 3, 0}));
+  EXPECT_THAT(input_reshape->dimensions(), ElementsAre(1, 2, 3, 0));
   // The kernel is in OIHW order. kernel_reshape should produce
   // HWIO for the convolution to hit the Eigen fast path.
-  EXPECT_TRUE(ContainersEqual(kernel_reshape->dimensions(), {2, 3, 1, 0}));
+  EXPECT_THAT(kernel_reshape->dimensions(), ElementsAre(2, 3, 1, 0));
   // The output of the canonical convolution is in NHWC order (the same as
   // input_reshape's order). output_reshape should restore that order to the
   // order of the computation root (CNHW).
-  EXPECT_TRUE(ContainersEqual(output_reshape->dimensions(), {3, 0, 1, 2}));
+  EXPECT_THAT(output_reshape->dimensions(), ElementsAre(3, 0, 1, 2));
 }
 
 TEST_F(ConvCanonicalizationTest, CanonicalStaysTheSame) {
diff --git a/tensorflow/compiler/xla/service/cpu/cpu_compiler.cc b/tensorflow/compiler/xla/service/cpu/cpu_compiler.cc
index 3e6be5a7a2374d4d274f95b7b8e2d814f8ace8b1..1ba45e59838c10ab5c050cb74e263eca70783fb0 100644
--- a/tensorflow/compiler/xla/service/cpu/cpu_compiler.cc
+++ b/tensorflow/compiler/xla/service/cpu/cpu_compiler.cc
@@ -39,7 +39,6 @@ limitations under the License.
 #include "tensorflow/compiler/xla/legacy_flags/cpu_compiler_flags.h"
 #include "tensorflow/compiler/xla/literal_util.h"
 #include "tensorflow/compiler/xla/map_util.h"
-#include "tensorflow/compiler/xla/port/initialize.h"
 #include "tensorflow/compiler/xla/protobuf_util.h"
 #include "tensorflow/compiler/xla/ptr_util.h"
 #include "tensorflow/compiler/xla/service/algebraic_simplifier.h"
@@ -683,8 +682,10 @@ int64 CpuCompiler::ShapeSizeBytes(const Shape& shape) const {
 }  // namespace cpu
 }  // namespace xla
 
-REGISTER_MODULE_INITIALIZER(cpu_compiler, {
+static bool InitModule() {
   xla::Compiler::RegisterCompilerFactory(se::host::kHostPlatformId, []() {
     return xla::MakeUnique<xla::cpu::CpuCompiler>();
   });
-});
+  return true;
+}
+static bool module_initialized = InitModule();
diff --git a/tensorflow/compiler/xla/service/cpu/cpu_instruction_fusion.cc b/tensorflow/compiler/xla/service/cpu/cpu_instruction_fusion.cc
index 240da35ef190eb7080947ab7d1da91d8d2dd8973..dc002846e9e6b07c767ddc8af939657c4c51bf23 100644
--- a/tensorflow/compiler/xla/service/cpu/cpu_instruction_fusion.cc
+++ b/tensorflow/compiler/xla/service/cpu/cpu_instruction_fusion.cc
@@ -24,6 +24,11 @@ bool CpuInstructionFusion::ShouldFuse(HloInstruction* consumer,
                                       int64 operand_index) {
   HloInstruction* producer = consumer->mutable_operand(operand_index);
 
+  // Output fusion is not currently supported on CPUs.
+  if (producer->opcode() == HloOpcode::kFusion) {
+    return false;
+  }
+
   // Condition for consumer: must be elementwise or a fusion op
   // (which necessarily only contains elementwise operations)
   if (!(consumer->opcode() == HloOpcode::kFusion ||
diff --git a/tensorflow/compiler/xla/service/cpu/cpu_runtime.cc b/tensorflow/compiler/xla/service/cpu/cpu_runtime.cc
index 8e06f0520edfb05c7ec606dcb8e85c5ef997c2c0..253de20f25127bf0ac23d5969e0f16c143396e47 100644
--- a/tensorflow/compiler/xla/service/cpu/cpu_runtime.cc
+++ b/tensorflow/compiler/xla/service/cpu/cpu_runtime.cc
@@ -15,7 +15,6 @@ limitations under the License.
 
 #include "tensorflow/compiler/xla/service/cpu/cpu_runtime.h"
 
-#include <sched.h>
 #include <functional>
 
 #include "tensorflow/core/platform/logging.h"
diff --git a/tensorflow/compiler/xla/service/cpu/ir_emitter.cc b/tensorflow/compiler/xla/service/cpu/ir_emitter.cc
index 1c704fd1ee77f3effad2b460e955efe53e441310..1e34de9e4bde992154ece2b8ff0783c9fb2b8a1a 100644
--- a/tensorflow/compiler/xla/service/cpu/ir_emitter.cc
+++ b/tensorflow/compiler/xla/service/cpu/ir_emitter.cc
@@ -201,7 +201,8 @@ void IrEmitter::InitializeIrFunction(const string& function_name,
     if (&argument == retval) {
       continue;
     }
-    compute_function_->setDoesNotAlias(argument.getArgNo() + 1);
+    compute_function_->addAttribute(argument.getArgNo() + 1,
+                                    llvm::Attribute::NoAlias);
   }
 
   ir_builder_.SetInsertPoint(llvm::BasicBlock::Create(
diff --git a/tensorflow/compiler/xla/service/cpu/parallel_cpu_executable.cc b/tensorflow/compiler/xla/service/cpu/parallel_cpu_executable.cc
index 7a4723e8d75588d8ccb711892b4082024695e444..cadad10910132c716eefd4ecfba53f3d7e02df99 100644
--- a/tensorflow/compiler/xla/service/cpu/parallel_cpu_executable.cc
+++ b/tensorflow/compiler/xla/service/cpu/parallel_cpu_executable.cc
@@ -146,7 +146,7 @@ Status ParallelCpuExecutable::AllocateBuffers(
 }
 
 Status ParallelCpuExecutable::ExecuteComputeFunctions(
-    const ExecutableRunOptions* run_options,
+    const ServiceExecutableRunOptions* run_options,
     tensorflow::gtl::ArraySlice<const ShapedBuffer*> arguments,
     tensorflow::gtl::ArraySlice<se::DeviceMemoryBase> buffers,
     HloExecutionProfile* hlo_execution_profile) {
@@ -160,7 +160,7 @@ Status ParallelCpuExecutable::ExecuteComputeFunctions(
 }
 
 Status ParallelCpuExecutable::ExecuteComputeFunctions(
-    const ExecutableRunOptions* run_options,
+    const ServiceExecutableRunOptions* run_options,
     tensorflow::gtl::ArraySlice<se::DeviceMemoryBase> arguments,
     tensorflow::gtl::ArraySlice<se::DeviceMemoryBase> buffers,
     HloExecutionProfile* hlo_execution_profile) {
@@ -214,7 +214,7 @@ Status ParallelCpuExecutable::ExecuteComputeFunctions(
 
   void** temps_array = buffer_pointers.data();
   uint64* profile_counters_array = profile_counters.data();
-  auto* thread_pool = CHECK_NOTNULL(run_options->inter_op_thread_pool());
+  auto* thread_pool = CHECK_NOTNULL(run_options->xla_intra_op_thread_pool());
   tensorflow::mutex completion_queue_lock;
   tensorflow::condition_variable completion_queue_cv;
   std::deque<HloInstruction*> completion_queue;
@@ -251,11 +251,12 @@ Status ParallelCpuExecutable::ExecuteComputeFunctions(
                      });
       auto function = FindOrDie(functions, instruction);
       // The thread pool entry takes ownership of |operand_buffers|.
+      const auto* exec_run_options = &run_options->run_options();
       thread_pool->Schedule([instruction, &completion_queue,
                              &completion_queue_lock, &completion_queue_cv,
-                             result_buffer, run_options, operand_buffers,
+                             result_buffer, exec_run_options, operand_buffers,
                              temps_array, profile_counters_array, function] {
-        function(result_buffer, run_options, operand_buffers, temps_array,
+        function(result_buffer, exec_run_options, operand_buffers, temps_array,
                  profile_counters_array);
         delete[] operand_buffers;
         // Push the completed HLO instruction on the queue, the main thread
@@ -345,9 +346,8 @@ ParallelCpuExecutable::ExecuteOnStream(
   const BufferAllocation::Index result_index = result_slice.index();
   VLOG(3) << "result index: " << result_index;
 
-  TF_RETURN_IF_ERROR(ExecuteComputeFunctions(&run_options->run_options(),
-                                             arguments, device_allocations,
-                                             hlo_execution_profile));
+  TF_RETURN_IF_ERROR(ExecuteComputeFunctions(
+      run_options, arguments, device_allocations, hlo_execution_profile));
 
   // Mark the buffers that are actually live (used in the output) when the
   // computation finishes executing.
@@ -400,8 +400,8 @@ StatusOr<std::unique_ptr<ShapedBuffer>> ParallelCpuExecutable::ExecuteOnStream(
   TF_RETURN_IF_ERROR(AllocateBuffers(
       memory_allocator, stream->parent()->device_ordinal(), &buffers));
 
-  TF_RETURN_IF_ERROR(ExecuteComputeFunctions(
-      &run_options->run_options(), arguments, buffers, hlo_execution_profile));
+  TF_RETURN_IF_ERROR(ExecuteComputeFunctions(run_options, arguments, buffers,
+                                             hlo_execution_profile));
 
   // Copy DeviceMemoryBase values which contain the array(s) of the result into
   // the respective location in ShapedBuffer which is returned to the caller.
diff --git a/tensorflow/compiler/xla/service/cpu/parallel_cpu_executable.h b/tensorflow/compiler/xla/service/cpu/parallel_cpu_executable.h
index 7223de9f0798365138cdb26ca9dce07cd0e474e3..6e1239d590c1f5698066cd77b5637912e14264e7 100644
--- a/tensorflow/compiler/xla/service/cpu/parallel_cpu_executable.h
+++ b/tensorflow/compiler/xla/service/cpu/parallel_cpu_executable.h
@@ -96,14 +96,14 @@ class ParallelCpuExecutable : public Executable {
   // Calls the generated functions in 'function_names_', performing the
   // computation with the given arguments using the supplied buffers.
   Status ExecuteComputeFunctions(
-      const ExecutableRunOptions* run_options,
+      const ServiceExecutableRunOptions* run_options,
       tensorflow::gtl::ArraySlice<perftools::gputools::DeviceMemoryBase>
           arguments,
       tensorflow::gtl::ArraySlice<perftools::gputools::DeviceMemoryBase>
           buffers,
       HloExecutionProfile* hlo_execution_profile);
   Status ExecuteComputeFunctions(
-      const ExecutableRunOptions* run_options,
+      const ServiceExecutableRunOptions* run_options,
       tensorflow::gtl::ArraySlice<const ShapedBuffer*> arguments,
       tensorflow::gtl::ArraySlice<perftools::gputools::DeviceMemoryBase>
           buffers,
diff --git a/tensorflow/compiler/xla/service/cpu/runtime_matmul.cc b/tensorflow/compiler/xla/service/cpu/runtime_matmul.cc
index 677080a8623224cdd65e35b3116ae57b7b3b3ca2..332f4216dc7b970cb985719ef82d5aa82bb86d3d 100644
--- a/tensorflow/compiler/xla/service/cpu/runtime_matmul.cc
+++ b/tensorflow/compiler/xla/service/cpu/runtime_matmul.cc
@@ -53,8 +53,8 @@ void MatMul(const void* run_options_ptr, T* out, T* lhs, T* rhs, int64 m,
   typedef typename Eigen::Tensor<T, 2>::DimensionPair DimPair;
   int lhs_contract_dim = transpose_lhs ? 0 : 1;
   int rhs_contract_dim = transpose_rhs ? 1 : 0;
-  const Eigen::array<DimPair, 1> dims(
-      DimPair(lhs_contract_dim, rhs_contract_dim));
+  const Eigen::array<DimPair, 1> dims({
+      DimPair(lhs_contract_dim, rhs_contract_dim) });
 
   // Matrix multiply is a special case of the "contract" operation where
   // the contraction is performed along dimension 1 of the lhs and dimension
diff --git a/tensorflow/compiler/xla/service/cpu/runtime_single_threaded_matmul.cc b/tensorflow/compiler/xla/service/cpu/runtime_single_threaded_matmul.cc
index 384a978873de89526f43556296aaa51c46ac1d3f..e45329c4ef52090c4d8b50c1afc452d0dadceb35 100644
--- a/tensorflow/compiler/xla/service/cpu/runtime_single_threaded_matmul.cc
+++ b/tensorflow/compiler/xla/service/cpu/runtime_single_threaded_matmul.cc
@@ -47,8 +47,8 @@ void MatMul(const void* run_options_ptr, T* out, T* lhs, T* rhs, int64 m,
   typedef typename Eigen::Tensor<T, 2>::DimensionPair DimPair;
   int lhs_contract_dim = transpose_lhs ? 0 : 1;
   int rhs_contract_dim = transpose_rhs ? 1 : 0;
-  const Eigen::array<DimPair, 1> dims(
-      DimPair(lhs_contract_dim, rhs_contract_dim));
+  const Eigen::array<DimPair, 1> dims({
+      DimPair(lhs_contract_dim, rhs_contract_dim)});
 
   // Matrix multiply is a special case of the "contract" operation where
   // the contraction is performed along dimension 1 of the lhs and dimension
diff --git a/tensorflow/compiler/xla/service/elemental_ir_emitter.cc b/tensorflow/compiler/xla/service/elemental_ir_emitter.cc
index a04815dad94484a6f01ebd27d3ec73f547086722..bea1da4044669f5e910af09ba1b65416a69367b5 100644
--- a/tensorflow/compiler/xla/service/elemental_ir_emitter.cc
+++ b/tensorflow/compiler/xla/service/elemental_ir_emitter.cc
@@ -240,14 +240,18 @@ StatusOr<llvm::Value*> ElementalIrEmitter::EmitFloatBinaryOp(
       return ir_builder_->CreateFDiv(lhs_value, rhs_value);
     case HloOpcode::kRemainder:
       return ir_builder_->CreateFRem(lhs_value, rhs_value);
-
-    // The 'O' prefix on the LLVM ops means "ordered" compare where comparisons
-    // with NAN always return false.
+    // LLVM comparisons can be "unordered" (U) or "ordered" (O) -- ordered
+    // comparisons always return false when one of the operands is NaN, whereas
+    // unordered comparisons return true.
+    //
+    // We use ordered comparisons for everything except kNe, where we use an
+    // unordered comparison.  This makes x != y equivalent to !(x == y), and
+    // matches C++'s semantics.
     case HloOpcode::kEq:
       return llvm_ir::EmitComparison(llvm::CmpInst::FCMP_OEQ, lhs_value,
                                      rhs_value, ir_builder_);
     case HloOpcode::kNe:
-      return llvm_ir::EmitComparison(llvm::CmpInst::FCMP_ONE, lhs_value,
+      return llvm_ir::EmitComparison(llvm::CmpInst::FCMP_UNE, lhs_value,
                                      rhs_value, ir_builder_);
     case HloOpcode::kLt:
       return llvm_ir::EmitComparison(llvm::CmpInst::FCMP_OLT, lhs_value,
@@ -739,11 +743,11 @@ llvm_ir::ElementGenerator ElementalIrEmitter::MakeElementGenerator(
           const HloInstruction* operand = hlo->operand(operand_idx);
           auto true_block = llvm_ir::CreateBasicBlock(
               exit_block, tensorflow::strings::StrCat(
-                              "concat_index_from_operand", operand_idx),
+                      "concat_index_from_operand", operand_idx),
               ir_builder_);
           auto false_block = llvm_ir::CreateBasicBlock(
               exit_block, tensorflow::strings::StrCat(
-                              "concat_index_not_from_operand", operand_idx),
+                      "concat_index_not_from_operand", operand_idx),
               ir_builder_);
           auto concat_dim_size =
               llvm::ConstantInt::get(source_index[concat_dim]->getType(),
diff --git a/tensorflow/compiler/xla/service/flatten_call_graph.cc b/tensorflow/compiler/xla/service/flatten_call_graph.cc
index 3c41fe870f2109a16f4d47aee5195a5537380bcb..297a4f7599f9c127386b2f53f7ffb987befc456e 100644
--- a/tensorflow/compiler/xla/service/flatten_call_graph.cc
+++ b/tensorflow/compiler/xla/service/flatten_call_graph.cc
@@ -102,8 +102,7 @@ Status FlattenNode(const CallGraphNode& node) {
 StatusOr<bool> FlattenCallGraph::Run(HloModule* module) {
   XLA_VLOG_LINES(3, "Before flatten call graph:\n" + module->ToString());
 
-  TF_ASSIGN_OR_RETURN(std::unique_ptr<CallGraph> call_graph,
-                      CallGraph::Build(module));
+  std::unique_ptr<CallGraph> call_graph = CallGraph::Build(module);
   TF_RETURN_IF_ERROR(call_graph->VisitNodes(FlattenNode));
 
   XLA_VLOG_LINES(3, "After flatten call graph:\n" + module->ToString());
diff --git a/tensorflow/compiler/xla/service/flatten_call_graph_test.cc b/tensorflow/compiler/xla/service/flatten_call_graph_test.cc
index 6c4a48bbe8e096c00b4ebc2e991a8ff38c06a07b..4e03a96fb3f03710cd3062a79aa4955311cf19c1 100644
--- a/tensorflow/compiler/xla/service/flatten_call_graph_test.cc
+++ b/tensorflow/compiler/xla/service/flatten_call_graph_test.cc
@@ -141,11 +141,9 @@ TEST_F(FlattenCallGraphTest, ComplexGraph) {
   {
     TF_ASSIGN_OR_ASSERT_OK(bool result, RunFlattenCallGraph(&module));
     EXPECT_TRUE(result);
-    TF_ASSIGN_OR_ASSERT_OK(std::unique_ptr<CallGraph> flat_call_graph,
-                           CallGraph::Build(&module));
-    TF_ASSIGN_OR_ASSERT_OK(const CallGraphNode* c_node,
-                           flat_call_graph->GetNode(c_computation));
-    EXPECT_EQ(1, c_node->caller_callsites().size());
+    std::unique_ptr<CallGraph> flat_call_graph = CallGraph::Build(&module);
+    const CallGraphNode& c_node = flat_call_graph->GetNode(c_computation);
+    EXPECT_EQ(1, c_node.caller_callsites().size());
   }
 }
 
@@ -178,21 +176,17 @@ TEST_F(FlattenCallGraphTest, SharedWhileConditionAndBody) {
   }
 
   {
-    TF_ASSIGN_OR_ASSERT_OK(std::unique_ptr<CallGraph> call_graph,
-                           CallGraph::Build(&module));
-    TF_ASSIGN_OR_ASSERT_OK(const CallGraphNode* cond_node,
-                           call_graph->GetNode(cond_computation));
-    EXPECT_EQ(2, cond_node->caller_callsites().size());
+    std::unique_ptr<CallGraph> call_graph = CallGraph::Build(&module);
+    const CallGraphNode& cond_node = call_graph->GetNode(cond_computation);
+    EXPECT_EQ(2, cond_node.caller_callsites().size());
   }
 
   {
     TF_ASSIGN_OR_ASSERT_OK(bool result, RunFlattenCallGraph(&module));
     EXPECT_TRUE(result);
-    TF_ASSIGN_OR_ASSERT_OK(std::unique_ptr<CallGraph> call_graph,
-                           CallGraph::Build(&module));
-    TF_ASSIGN_OR_ASSERT_OK(const CallGraphNode* cond_node,
-                           call_graph->GetNode(cond_computation));
-    EXPECT_EQ(1, cond_node->caller_callsites().size());
+    std::unique_ptr<CallGraph> call_graph = CallGraph::Build(&module);
+    const CallGraphNode& cond_node = call_graph->GetNode(cond_computation);
+    EXPECT_EQ(1, cond_node.caller_callsites().size());
   }
 }
 
@@ -219,17 +213,14 @@ TEST_F(FlattenCallGraphTest, FlattenCalls) {
 
   TF_ASSIGN_OR_ASSERT_OK(bool result, RunFlattenCallGraph(&module));
   EXPECT_TRUE(result);
-  TF_ASSIGN_OR_ASSERT_OK(std::unique_ptr<CallGraph> call_graph,
-                         CallGraph::Build(&module));
+  std::unique_ptr<CallGraph> call_graph = CallGraph::Build(&module);
   EXPECT_EQ(7, module.computations().size());
 
-  TF_ASSIGN_OR_ASSERT_OK(const CallGraphNode* c_node,
-                         call_graph->GetNode(c_computation));
-  EXPECT_EQ(1, c_node->caller_callsites().size());
+  const CallGraphNode& c_node = call_graph->GetNode(c_computation);
+  EXPECT_EQ(1, c_node.caller_callsites().size());
 
-  TF_ASSIGN_OR_ASSERT_OK(const CallGraphNode* b_node,
-                         call_graph->GetNode(b_computation));
-  EXPECT_EQ(1, b_node->caller_callsites().size());
+  const CallGraphNode& b_node = call_graph->GetNode(b_computation);
+  EXPECT_EQ(1, b_node.caller_callsites().size());
 }
 
 }  // namespace
diff --git a/tensorflow/compiler/xla/service/gpu/BUILD b/tensorflow/compiler/xla/service/gpu/BUILD
index 1fdbcfe5641ab3c0cda63268082069df765bf4e6..d26f415fd4bdfec597c70b760942cc406a0d6cfa 100644
--- a/tensorflow/compiler/xla/service/gpu/BUILD
+++ b/tensorflow/compiler/xla/service/gpu/BUILD
@@ -264,6 +264,8 @@ cc_library(
         "//tensorflow/compiler/xla/service:tuple_points_to_analysis",
         "//tensorflow/core:lib",
         "//tensorflow/core:stream_executor_no_cuda",
+        "//tensorflow/core/platform/default/build_config:cublas_plugin",
+        "//tensorflow/core/platform/default/build_config:cudnn_plugin",
         "//tensorflow/core/platform/default/build_config:stream_executor_cuda",
     ],
 )
diff --git a/tensorflow/compiler/xla/service/gpu/elemental_ir_emitter.cc b/tensorflow/compiler/xla/service/gpu/elemental_ir_emitter.cc
index 1667ab36792c91cbbf3c6396a673bedff2208045..e57eb0bdee64948290d5eaf15965afcdc8bea0ad 100644
--- a/tensorflow/compiler/xla/service/gpu/elemental_ir_emitter.cc
+++ b/tensorflow/compiler/xla/service/gpu/elemental_ir_emitter.cc
@@ -113,7 +113,7 @@ StatusOr<llvm::Value*> GpuElementalIrEmitter::EmitMathCall(
     tensorflow::gtl::ArraySlice<llvm::Value*> operands,
     tensorflow::gtl::ArraySlice<PrimitiveType> input_types,
     PrimitiveType output_type) const {
-  // Binary math functions tranform are of type [T] -> T.
+  // Binary math functions transform are of type [T] -> T.
   for (PrimitiveType input_type : input_types) {
     if (output_type != input_type) {
       return Unimplemented("Input type ≠ output type: %s ≠ %s",
diff --git a/tensorflow/compiler/xla/service/gpu/instruction_fusion.cc b/tensorflow/compiler/xla/service/gpu/instruction_fusion.cc
index 34a44ad40548272a0c2a87efadfa1ab2aca7b979..a36dcbbd2faf3258ec2790f51bb2aec3ce834a6c 100644
--- a/tensorflow/compiler/xla/service/gpu/instruction_fusion.cc
+++ b/tensorflow/compiler/xla/service/gpu/instruction_fusion.cc
@@ -46,6 +46,11 @@ bool GpuInstructionFusion::ShouldFuse(HloInstruction* consumer,
                                       int64 operand_index) {
   HloInstruction* producer = consumer->mutable_operand(operand_index);
 
+  // Output fusion is not currently supported on GPUs.
+  if (producer->opcode() == HloOpcode::kFusion) {
+    return false;
+  }
+
   // RNG operations are not currently parallel-friendly on GPU.
   if (producer->opcode() == HloOpcode::kRng) {
     return false;
diff --git a/tensorflow/compiler/xla/service/gpu/ir_emission_utils.cc b/tensorflow/compiler/xla/service/gpu/ir_emission_utils.cc
index e8378a7f447cebf8d491e98595188d2391333c58..c6e8a2f78b5a398d9e9d5a684ac4d42520ec20c8 100644
--- a/tensorflow/compiler/xla/service/gpu/ir_emission_utils.cc
+++ b/tensorflow/compiler/xla/service/gpu/ir_emission_utils.cc
@@ -59,6 +59,11 @@ bool AreValidGemmShapes(const Shape& lhs_shape, const Shape& rhs_shape,
 }  // namespace
 
 bool ImplementedAsGemm(const HloInstruction& hlo) {
+  // We can only do this if the HLO is unnested.
+  if (hlo.parent() != hlo.GetModule()->entry_computation()) {
+    return false;
+  }
+
   // For certain types of Dot, we can call pre-canned BLAS gemm.
   if (hlo.opcode() == HloOpcode::kDot) {
     const Shape& lhs_shape = hlo.operand(0)->shape();
@@ -85,6 +90,11 @@ bool ImplementedAsGemm(const HloInstruction& hlo) {
 }
 
 bool ImplementedAsDnnConvolution(const HloInstruction& hlo) {
+  // We can only do this if the HLO is unnested.
+  if (hlo.parent() != hlo.GetModule()->entry_computation()) {
+    return false;
+  }
+
   // Forward convolution.
   if (hlo.opcode() == HloOpcode::kConvolution) {
     const ConvolutionDimensionNumbers& dnums =
diff --git a/tensorflow/compiler/xla/service/gpu/ir_emitter.cc b/tensorflow/compiler/xla/service/gpu/ir_emitter.cc
index e60978df0a2a9c7911c71314e5325ee0fbfd67e0..36619a845413b19ec2d559252409dae1b96b76e4 100644
--- a/tensorflow/compiler/xla/service/gpu/ir_emitter.cc
+++ b/tensorflow/compiler/xla/service/gpu/ir_emitter.cc
@@ -399,7 +399,7 @@ Status IrEmitter::HandleDot(HloInstruction* dot,
   llvm::Type* accum_type = target_array.GetElementLlvmType();
   llvm::Value* accum_address = llvm_ir::EmitAllocaAtFunctionEntry(
       accum_type,       // The pointee type of the alloca instruction.
-      "accum_address",  // The name of the alloca instuction.
+      "accum_address",  // The name of the alloca instruction.
       &ir_builder_);
 
   // Initialize the accumulator in the preheader to zero.
diff --git a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc
index 04babcca0c822700e4e47c66433e8d3ea6ac3d39..e52e55a1a8199019e2c149a777a4e948f830ce0e 100644
--- a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc
+++ b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc
@@ -196,7 +196,7 @@ llvm::Function* IrEmitterUnnested::BuildKernelPrototype(
           ir_emitter_context_->buffer_assignment().GetTempAllocation()) {
     kernel->addDereferenceableAttr(temp_buffer_arg_no + 1, allocation->size());
   }
-  kernel->setDoesNotAlias(temp_buffer_arg_no + 1);
+  kernel->addAttribute(temp_buffer_arg_no + 1, llvm::Attribute::NoAlias);
 
   // Add the declaration of this kernel to llvm.nvvm.annotations so that NVPTX
   // treats it as a CUDA kernel.
diff --git a/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/gpu_backend_lib.cc b/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/gpu_backend_lib.cc
index 485216837dc727bfe8565ff22678dd2fa470bc40..383729185df14404c4479993a7cdec771a63b26e 100644
--- a/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/gpu_backend_lib.cc
+++ b/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/gpu_backend_lib.cc
@@ -396,7 +396,7 @@ StatusOr<string> CompileModuleToPtx(llvm::Module* module,
 
   // The LLVM IR verifier performs sanity checking on the IR. This helps
   // discover problems and report them in a meaningful manner, rather than let
-  // later passes report obscure assertions becasue of unfulfilled invariants.
+  // later passes report obscure assertions because of unfulfilled invariants.
   module_passes.add(llvm::createVerifierPass());
 
   // Create the function-level pass manager. It needs data layout information
diff --git a/tensorflow/compiler/xla/service/gpu/while_transformer.cc b/tensorflow/compiler/xla/service/gpu/while_transformer.cc
index 46a5d303b74af6b312b9e7d774dd484336322b4e..61bc6f6055740a3632ddd1cad94491de97309ae6 100644
--- a/tensorflow/compiler/xla/service/gpu/while_transformer.cc
+++ b/tensorflow/compiler/xla/service/gpu/while_transformer.cc
@@ -168,7 +168,7 @@ class MatcherBase {
   virtual ~MatcherBase() {}
 
   // Attempts to match each ExprTree in 'expr_trees_'.
-  // Returns OK on the first succesful match, error status otherwise.
+  // Returns OK on the first successful match, error status otherwise.
   virtual tensorflow::Status Run() {
     Status status;
     for (const ExprTree& expr_tree : expr_trees_) {
diff --git a/tensorflow/compiler/xla/service/heap_simulator.cc b/tensorflow/compiler/xla/service/heap_simulator.cc
index 9c4899a67debfebf72b93b412a07ad60993fd819..d7aa5664df40f24d17b48e846839c22cf7922f75 100644
--- a/tensorflow/compiler/xla/service/heap_simulator.cc
+++ b/tensorflow/compiler/xla/service/heap_simulator.cc
@@ -53,12 +53,44 @@ std::vector<const LogicalBuffer*> UniqueOperandSourceBuffers(
 
 /*static*/
 StatusOr<HeapSimulator::Result> HeapSimulator::Run(
-    std::unique_ptr<HeapAlgorithm> algorithm,
+    std::unique_ptr<HeapAlgorithm> algorithm, const HloModule& module,
+    const SequentialHloOrdering::HloModuleSequence& module_sequence,
+    const TuplePointsToAnalysis& points_to_analysis,
+    const LogicalBuffer::SizeFunction& size_fn,
+    const FlatSet<const LogicalBuffer*>* buffers_to_assign) {
+  HeapSimulator heap(std::move(algorithm), size_fn, buffers_to_assign);
+  const HloComputation* entry_computation = module.entry_computation();
+  const std::vector<const HloInstruction*>& instruction_sequence =
+      FindOrDie(module_sequence, entry_computation);
+  TF_RETURN_IF_ERROR(heap.RunComputation(*entry_computation,
+                                         instruction_sequence,
+                                         points_to_analysis, &module_sequence));
+  return heap.Finish();
+}
+
+/*static*/
+StatusOr<HeapSimulator::Result> HeapSimulator::Run(
+    std::unique_ptr<HeapAlgorithm> algorithm, const HloComputation& computation,
     const std::vector<const HloInstruction*>& instruction_sequence,
-    const HloComputation& computation,
     const TuplePointsToAnalysis& points_to_analysis,
     const LogicalBuffer::SizeFunction& size_fn,
     const FlatSet<const LogicalBuffer*>* buffers_to_assign) {
+  HeapSimulator heap(std::move(algorithm), size_fn, buffers_to_assign);
+  TF_RETURN_IF_ERROR(heap.RunComputation(computation, instruction_sequence,
+                                         points_to_analysis,
+                                         /*module_sequence=*/nullptr));
+  return heap.Finish();
+}
+
+// Runs a heap simulation for the given 'computation', assuming the given
+// 'instruction_sequence'. If 'module_sequence' is non-null, it is used to find
+// kCall and kWhile sub-computations, and the heap simulation for those
+// sub-computations will be run recursively.
+Status HeapSimulator::RunComputation(
+    const HloComputation& computation,
+    const std::vector<const HloInstruction*>& instruction_sequence,
+    const TuplePointsToAnalysis& points_to_analysis,
+    const SequentialHloOrdering::HloModuleSequence* module_sequence) {
   // The goal here is to minimize memory usage, assuming the given sequential
   // ordering of instructions.  The strategy is to walk through the instruction
   // sequence, calling Alloc and Free on the underlying heap algorithm.  The
@@ -67,7 +99,6 @@ StatusOr<HeapSimulator::Result> HeapSimulator::Run(
   // 'live_buffers' tracks the liveness of each buffer that we assign, by
   // associating it with a set of HloInstructions that need to be visited.  When
   // the set becomes empty, the buffer is no longer used, and can be freed.
-  HeapSimulator heap(std::move(algorithm), size_fn, buffers_to_assign);
   FlatMap<const LogicalBuffer*, FlatSet<const HloInstruction*>> live_buffers;
 
   const HloInstruction* root = computation.root_instruction();
@@ -90,7 +121,7 @@ StatusOr<HeapSimulator::Result> HeapSimulator::Run(
     // lifetime of buffers that aren't already connected by a data dependency.
     std::vector<const LogicalBuffer*> dead_buffers_to_free;
     for (const LogicalBuffer* buffer : buffers_defined_by_instruction) {
-      if (heap.IgnoreBuffer(buffer)) {
+      if (IgnoreBuffer(buffer)) {
         continue;
       }
       for (const BufferAlias& alias :
@@ -127,7 +158,7 @@ StatusOr<HeapSimulator::Result> HeapSimulator::Run(
     std::vector<const LogicalBuffer*> operand_buffers_to_free;
     for (const LogicalBuffer* operand_buffer :
          UniqueOperandSourceBuffers(instruction, points_to_analysis)) {
-      if (heap.IgnoreBuffer(operand_buffer)) {
+      if (IgnoreBuffer(operand_buffer)) {
         continue;
       }
       live_buffers[operand_buffer].erase(instruction);
@@ -142,10 +173,10 @@ StatusOr<HeapSimulator::Result> HeapSimulator::Run(
     // happen before dead or operand buffers are freed; the instruction reads
     // the operand buffers to produce its output.
     //
-    // INVARIANT: Either heap.Alloc or heap.ShareBuffer will be called for each
-    // buffer that we should assign.
+    // INVARIANT: Either Alloc or ShareBuffer will be called for each buffer
+    // that we should assign.
     for (const LogicalBuffer* buffer : buffers_defined_by_instruction) {
-      if (heap.IgnoreBuffer(buffer)) {
+      if (IgnoreBuffer(buffer)) {
         continue;
       }
 
@@ -159,24 +190,50 @@ StatusOr<HeapSimulator::Result> HeapSimulator::Run(
             CanShareOperandBufferWithUser(
                 operand_buffer->instruction(), operand_buffer->index(),
                 buffer->instruction(), buffer->index(), points_to_analysis)) {
-          heap.ShareBuffer(buffer, operand_buffer);
+          ShareBuffer(buffer, operand_buffer);
           shared = true;
           break;
         }
       }
 
       if (!shared) {
-        heap.Alloc(buffer);
+        Alloc(buffer);
       }
     }
 
+    // If the whole module is sequential, we can save memory by running the
+    // heap-simulation for sub-computations inline. E.g. the buffers for the
+    // condition and body of a kWhile instruction are only live for the duration
+    // of the instruction itself.
+    //
+    // The order that the sub-computations are simulated does not affect
+    // correctness; since the whole module is sequential, we know that the
+    // sub-computations will never be run concurrently.
+    if (module_sequence != nullptr) {
+      if (instruction->opcode() == HloOpcode::kCall ||
+          instruction->opcode() == HloOpcode::kWhile) {
+        for (const HloComputation* called_computation :
+             instruction->called_computations()) {
+          const std::vector<const HloInstruction*>& called_sequence =
+              FindOrDie(*module_sequence, called_computation);
+          TF_RETURN_IF_ERROR(RunComputation(*called_computation,
+                                            called_sequence, points_to_analysis,
+                                            module_sequence));
+        }
+      }
+
+      // Other sub-computations (e.g. Map, Reduce, ...) are skipped; they are
+      // assigned "thread-local" allocations, meaning their buffers are not
+      // allocated up-front at the beginning of the computation.
+    }
+
     // Free buffers that are no longer live.  This is the earliest point that we
     // can de-allocate; right after the last use of the buffer.
     for (const LogicalBuffer* buffer : dead_buffers_to_free) {
-      heap.Free(buffer);
+      Free(buffer);
     }
     for (const LogicalBuffer* buffer : operand_buffers_to_free) {
-      heap.Free(buffer);
+      Free(buffer);
     }
   }
 
@@ -187,10 +244,10 @@ StatusOr<HeapSimulator::Result> HeapSimulator::Run(
     const FlatSet<const HloInstruction*>& pending = buffer_pending.second;
     CHECK_EQ(pending.size(), 1) << *buffer;
     CHECK(*pending.begin() == nullptr) << *buffer;
-    heap.Free(buffer);
+    Free(buffer);
   }
 
-  return heap.Finish();
+  return Status::OK();
 }
 
 HeapSimulator::HeapSimulator(
@@ -309,6 +366,11 @@ HeapSimulator::Result HeapSimulator::Finish() {
         result.chunk_map.emplace(buffer, chunk);
       }
     }
+    // If we were told to assign specific buffers, make sure we've assigned
+    // exactly that many buffers.
+    if (buffers_to_assign_ != nullptr) {
+      CHECK_EQ(buffers_to_assign_->size(), result.chunk_map.size());
+    }
   }
 
   // Fragmentation is the difference between the actual and ideal sizes.
diff --git a/tensorflow/compiler/xla/service/heap_simulator.h b/tensorflow/compiler/xla/service/heap_simulator.h
index 0ce2906767898bcace45e296d76f958c50a2b3a7..3d98046261902b41a17a8ab0f9a349634a1e4545 100644
--- a/tensorflow/compiler/xla/service/heap_simulator.h
+++ b/tensorflow/compiler/xla/service/heap_simulator.h
@@ -23,6 +23,7 @@ limitations under the License.
 
 #include "tensorflow/compiler/xla/service/hlo_computation.h"
 #include "tensorflow/compiler/xla/service/hlo_instruction.h"
+#include "tensorflow/compiler/xla/service/hlo_ordering.h"
 #include "tensorflow/compiler/xla/service/logical_buffer.h"
 #include "tensorflow/compiler/xla/service/tuple_points_to_analysis.h"
 #include "tensorflow/compiler/xla/statusor.h"
@@ -63,17 +64,32 @@ class HeapSimulator {
   };
 
   // Run the heap simulation with the given algorithm, assuming the given
-  // sequential ordering of instructions.  The 'instruction_sequence' must
-  // contain a topologically-consistent total ordering of all instructions in
-  // the computation.  The result is invalid if instructions are not run in
-  // exactly this sequence.
+  // module_sequence, which must contain a topologically-consistent total
+  // ordering of all instructions within each computation. The result is invalid
+  // if instructions are not run in exactly this sequence.
+  //
+  // Running heap simulation on the whole module tends to save memory, compared
+  // to running on a per-computation basis, since we can re-use buffer space for
+  // called sub-computations.
   //
   // If 'buffers_to_assign' is provided, only those buffers are assigned
   // offsets, otherwise all buffers defined by the instructions are assigned.
+  static StatusOr<Result> Run(
+      std::unique_ptr<HeapAlgorithm> algorithm, const HloModule& module,
+      const SequentialHloOrdering::HloModuleSequence& module_sequence,
+      const TuplePointsToAnalysis& points_to_analysis,
+      const LogicalBuffer::SizeFunction& size_fn,
+      const tensorflow::gtl::FlatSet<const LogicalBuffer*>* buffers_to_assign =
+          nullptr);
+
+  // Same as above, but runs on a single computation. The 'instruction_sequence'
+  // must contain a topologically-consistent total ordering of all instructions
+  // in the computation. The result is invalid if instructions are not run in
+  // exactly this sequence.
   static StatusOr<Result> Run(
       std::unique_ptr<HeapAlgorithm> algorithm,
-      const std::vector<const HloInstruction*>& instruction_sequence,
       const HloComputation& computation,
+      const std::vector<const HloInstruction*>& instruction_sequence,
       const TuplePointsToAnalysis& points_to_analysis,
       const LogicalBuffer::SizeFunction& size_fn,
       const tensorflow::gtl::FlatSet<const LogicalBuffer*>* buffers_to_assign =
@@ -86,6 +102,12 @@ class HeapSimulator {
       const tensorflow::gtl::FlatSet<const LogicalBuffer*>* buffers_to_assign);
   ~HeapSimulator();
 
+  Status RunComputation(
+      const HloComputation& computation,
+      const std::vector<const HloInstruction*>& instruction_sequence,
+      const TuplePointsToAnalysis& points_to_analysis,
+      const SequentialHloOrdering::HloModuleSequence* module_sequence);
+
   bool IgnoreBuffer(const LogicalBuffer* buffer) const;
   void Alloc(const LogicalBuffer* buffer);
   void Free(const LogicalBuffer* buffer);
diff --git a/tensorflow/compiler/xla/service/heap_simulator_test.cc b/tensorflow/compiler/xla/service/heap_simulator_test.cc
index 874bd5f1060c179d5547510c351909069aa935b8..0a6900f73304f7a7b1209807fd3a1e8220484e03 100644
--- a/tensorflow/compiler/xla/service/heap_simulator_test.cc
+++ b/tensorflow/compiler/xla/service/heap_simulator_test.cc
@@ -19,13 +19,16 @@ limitations under the License.
 #include <utility>
 #include <vector>
 
+#include "tensorflow/compiler/xla/literal_util.h"
 #include "tensorflow/compiler/xla/service/hlo_computation.h"
 #include "tensorflow/compiler/xla/service/hlo_instruction.h"
 #include "tensorflow/compiler/xla/service/hlo_module.h"
+#include "tensorflow/compiler/xla/service/hlo_ordering.h"
 #include "tensorflow/compiler/xla/service/logical_buffer.h"
 #include "tensorflow/compiler/xla/service/tuple_points_to_analysis.h"
 #include "tensorflow/compiler/xla/status_macros.h"
 #include "tensorflow/compiler/xla/tests/hlo_test_base.h"
+#include "tensorflow/core/lib/gtl/flatmap.h"
 
 namespace xla {
 namespace {
@@ -69,6 +72,7 @@ class HeapCallRecorder : public HeapAlgorithm {
 // sequence against an expected sequence.
 class HeapSimulatorTracker {
  public:
+  // Constructor for testing a single entry computation.
   HeapSimulatorTracker(
       const string& name, std::unique_ptr<HloComputation> computation,
       const std::vector<const HloInstruction*>& instruction_sequence) {
@@ -83,12 +87,48 @@ class HeapSimulatorTracker {
     auto zero_size = [](const LogicalBuffer& buffer) { return 0; };
     auto algorithm = MakeUnique<DecreasingSizeRunsHeap>(
         MakeUnique<HeapCallRecorder>(&actual_calls_));
-    result_ = HeapSimulator::Run(std::move(algorithm), instruction_sequence,
-                                 *module_->entry_computation(),
-                                 *points_to_analysis_, zero_size)
+    result_ = HeapSimulator::Run(
+                  std::move(algorithm), *module_->entry_computation(),
+                  instruction_sequence, *points_to_analysis_, zero_size)
                   .ConsumeValueOrDie();
   }
 
+  explicit HeapSimulatorTracker(const string& name) {
+    module_ = MakeUnique<HloModule>(name);
+  }
+
+  // Similar to the single entry computation constructor above, but runs the
+  // simulation over the entire module.
+  void RunWholeModule(
+      const std::vector<const HloInstruction*>& full_module_sequence) {
+    points_to_analysis_ =
+        TuplePointsToAnalysis::Run(module_.get()).ConsumeValueOrDie();
+
+    // Construct the module sequence grouped by computation.
+    SequentialHloOrdering::HloModuleSequence module_sequence;
+    tensorflow::gtl::FlatMap<const HloInstruction*, int> reverse_position;
+    for (int i = 0; i < full_module_sequence.size(); ++i) {
+      const HloInstruction* instruction = full_module_sequence[i];
+      module_sequence[instruction->parent()].push_back(instruction);
+      reverse_position[instruction] = full_module_sequence.size() - i;
+    }
+
+    // Hack the size_fn so that it returns a decreasing value as we step through
+    // the sequence. This lets us ensure the Alloc calls are in the sequence
+    // order. The Free calls are sorted by LogicalBuffer.id, which is at least
+    // deterministic.
+    auto size_fn = [&reverse_position](const LogicalBuffer& buffer) {
+      return reverse_position[buffer.instruction()];
+    };
+    auto algorithm = MakeUnique<DecreasingSizeRunsHeap>(
+        MakeUnique<HeapCallRecorder>(&actual_calls_));
+    result_ = HeapSimulator::Run(std::move(algorithm), *module_,
+                                 module_sequence, *points_to_analysis_, size_fn)
+                  .ConsumeValueOrDie();
+  }
+
+  HloModule* module() { return module_.get(); }
+
   // Returns the buffer defined at the given instruction and index.
   const LogicalBuffer* BufferAt(const HloInstruction* instruction,
                                 const ShapeIndex& index) const {
@@ -358,6 +398,86 @@ TEST_F(HeapSimulatorTest, MultiplyDotDotTuple) {
   });
 }
 
+TEST_F(HeapSimulatorTest, WholeModule) {
+  HeapSimulatorTracker tracker(TestName());
+
+  const Shape scalar_shape = ShapeUtil::MakeShape(xla::F32, {});
+  const Shape tuple_shape =
+      ShapeUtil::MakeTupleShape({scalar_shape, scalar_shape});
+
+  auto cond_builder = HloComputation::Builder("WhileCond");
+  HloInstruction* cond_param = cond_builder.AddInstruction(
+      HloInstruction::CreateParameter(0, tuple_shape, "cond_param"));
+  HloInstruction* cond_iter = cond_builder.AddInstruction(
+      HloInstruction::CreateGetTupleElement(scalar_shape, cond_param, 0));
+  HloInstruction* cond_data = cond_builder.AddInstruction(
+      HloInstruction::CreateGetTupleElement(scalar_shape, cond_param, 1));
+  HloInstruction* cond_lt = cond_builder.AddInstruction(
+      HloInstruction::CreateBinary(ShapeUtil::MakeShape(PRED, {}),
+                                   HloOpcode::kLt, cond_iter, cond_data));
+  HloComputation* cond_computation =
+      tracker.module()->AddEmbeddedComputation(cond_builder.Build());
+
+  auto body_builder = HloComputation::Builder("WhileBody");
+  HloInstruction* body_param = body_builder.AddInstruction(
+      HloInstruction::CreateParameter(0, tuple_shape, "body_param"));
+  HloComputation* body_computation =
+      tracker.module()->AddEmbeddedComputation(body_builder.Build());
+
+  auto builder = HloComputation::Builder(TestName());
+  HloInstruction* param = builder.AddInstruction(
+      HloInstruction::CreateParameter(0, tuple_shape, "param"));
+  HloInstruction* while_op = builder.AddInstruction(HloInstruction::CreateWhile(
+      tuple_shape, cond_computation, body_computation, param));
+  tracker.module()->AddEntryComputation(builder.Build());
+
+  tracker.RunWholeModule(
+      {param, while_op, body_param, cond_param, cond_iter, cond_data, cond_lt});
+  tracker.ExpectCallSequence({
+      // The entry computation param and while_op are allocated first.
+      {kAlloc, tracker.BufferAt(param, {})},
+      {kAlloc, tracker.BufferAt(param, {0})},
+      {kAlloc, tracker.BufferAt(param, {1})},
+      {kAlloc, tracker.BufferAt(while_op, {})},
+      {kAlloc, tracker.BufferAt(while_op, {0})},
+      {kAlloc, tracker.BufferAt(while_op, {1})},
+
+      // Now the while body param is allocated and freed.
+      {kAlloc, tracker.BufferAt(body_param, {})},
+      {kAlloc, tracker.BufferAt(body_param, {0})},
+      {kAlloc, tracker.BufferAt(body_param, {1})},
+      {kFree, tracker.BufferAt(body_param, {})},
+      {kFree, tracker.BufferAt(body_param, {0})},
+      {kFree, tracker.BufferAt(body_param, {1})},
+
+      // Now the while cond param is allocated. The GTE instructions just alias
+      // the param elements, so the param tuple can immediately be freed.
+      {kAlloc, tracker.BufferAt(cond_param, {})},
+      {kAlloc, tracker.BufferAt(cond_param, {0})},
+      {kAlloc, tracker.BufferAt(cond_param, {1})},
+      {kFree, tracker.BufferAt(cond_param, {})},
+
+      // Now the final cond less-than buffer is allocated.
+      {kAlloc, tracker.BufferAt(cond_lt, {})},
+
+      // The order of the remaining Free calls is based on the LogicalBuffer.id,
+      // which is deterministic, but not obvious.
+      {kFree, tracker.BufferAt(param, {})},
+      {kFree, tracker.BufferAt(param, {0})},
+      {kFree, tracker.BufferAt(param, {1})},
+
+      {kFree, tracker.BufferAt(while_op, {})},
+      {kFree, tracker.BufferAt(while_op, {0})},
+      {kFree, tracker.BufferAt(while_op, {1})},
+
+      {kFree, tracker.BufferAt(cond_param, {0})},
+      {kFree, tracker.BufferAt(cond_param, {1})},
+      {kFree, tracker.BufferAt(cond_lt, {})},
+
+      {kFinish, nullptr},
+  });
+}
+
 // Base class for heap algorithm tests.
 class HeapAlgorithmTestBase : public ::testing::Test {
  protected:
diff --git a/tensorflow/compiler/xla/service/hlo_computation.cc b/tensorflow/compiler/xla/service/hlo_computation.cc
index 89371e44973a55811af436b1f1d42f8f40b02159..a749814f0dfbfbacb7c09be815ef572bb00687c0 100644
--- a/tensorflow/compiler/xla/service/hlo_computation.cc
+++ b/tensorflow/compiler/xla/service/hlo_computation.cc
@@ -35,10 +35,14 @@ limitations under the License.
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/lib/gtl/flatset.h"
+#include "tensorflow/core/lib/strings/str_util.h"
+#include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/platform/logging.h"
 
 namespace xla {
 
+using ::tensorflow::strings::StrCat;
+
 std::unique_ptr<HloComputation> HloComputation::Builder::Build(
     HloInstruction* root_instruction) {
   int parameter_count = 0;
@@ -91,12 +95,7 @@ HloInstruction* HloComputation::AddInstruction(
 HloInstruction* HloComputation::AddInstructionInternal(
     std::unique_ptr<HloInstruction> instruction) {
   // Generate a unique name for the instruction.
-  instruction->set_name(
-      instruction_name_uniquer_.GetUniqueName(instruction->name()));
-  if (instruction->opcode() == HloOpcode::kParameter) {
-    instruction->set_parameter_name(
-        instruction_name_uniquer_.GetUniqueName(instruction->parameter_name()));
-  }
+  instruction->UniquifyName(&instruction_name_uniquer_);
   Reparent(instruction.get());
   HloInstruction* pinst = instruction.get();
   instruction_iterators_[pinst] =
@@ -131,9 +130,24 @@ Status HloComputation::RemoveParameter(int64 param_no) {
 
   while (param_no < param_instructions_.size()) {
     param_instruction = param_instructions_[param_no];
-    HloInstruction* new_instr = AddInstructionInternal(
-        HloInstruction::CreateParameter(param_no, param_instruction->shape(),
-                                        param_instruction->parameter_name()));
+    string param_name = param_instruction->name();
+    // Fusion parameters are named foo.param_1, bar.param_2, etc. We are
+    // renumbering the parameters so replace the final number in the name with
+    // the updated value.
+    const string param_underscore = ".param_";
+    size_t index = param_name.rfind(param_underscore);
+    if (index == string::npos) {
+      string after_param = name().substr(index + param_underscore.size());
+      int64 numeric_suffix;
+      if (tensorflow::strings::safe_strto64(after_param, &numeric_suffix)) {
+        param_name =
+            StrCat(param_name.substr(0, index), param_underscore, param_no);
+      }
+    }
+
+    HloInstruction* new_instr =
+        AddInstructionInternal(HloInstruction::CreateParameter(
+            param_no, param_instruction->shape(), param_name));
     TF_RETURN_IF_ERROR(param_instruction->ReplaceAllUsesWith(new_instr));
     new_instr->SetParentFusion(root_instruction_->fusion_instruction());
     param_instructions_[param_no] = new_instr;
@@ -672,4 +686,8 @@ std::unique_ptr<HloComputation> HloComputation::Clone(const string& suffix) {
   return result;
 }
 
+void HloComputation::UniquifyName(NameUniquer* name_uniquer) {
+  name_ = name_uniquer->GetUniqueName(name_);
+}
+
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/hlo_computation.h b/tensorflow/compiler/xla/service/hlo_computation.h
index fa274cfc6331343eeb22684c0d3f5c7f284dec76..62e00a24fbb523e1e30f08141f9e026407a2015d 100644
--- a/tensorflow/compiler/xla/service/hlo_computation.h
+++ b/tensorflow/compiler/xla/service/hlo_computation.h
@@ -133,7 +133,10 @@ class HloComputation {
   }
 
   const string& name() const { return name_; }
-  void set_name(const string& name) { name_ = name; }
+
+  // Use the given NameUniquer to select a unique name for the computation based
+  // on the computation's existing name.
+  void UniquifyName(NameUniquer* name_uniquer);
 
   // Return a string representation of the computation.
   string ToString(int nested_level = 0) const;
diff --git a/tensorflow/compiler/xla/service/hlo_constant_folding.cc b/tensorflow/compiler/xla/service/hlo_constant_folding.cc
index 9a5345dc13d6db42553e9c343f7c81cd0e6c9d0e..cb0a99d773c57ba9a2fedc2842fe17cd5fe3571e 100644
--- a/tensorflow/compiler/xla/service/hlo_constant_folding.cc
+++ b/tensorflow/compiler/xla/service/hlo_constant_folding.cc
@@ -15,16 +15,14 @@ limitations under the License.
 
 #include "tensorflow/compiler/xla/service/hlo_constant_folding.h"
 
-#include <list>
-#include <map>
 #include <memory>
-#include <set>
 #include <string>
 #include <utility>
 #include <vector>
 
 #include "tensorflow/compiler/xla/layout_util.h"
 #include "tensorflow/compiler/xla/literal_util.h"
+#include "tensorflow/compiler/xla/service/dfs_hlo_visitor_with_default.h"
 #include "tensorflow/compiler/xla/service/hlo_computation.h"
 #include "tensorflow/compiler/xla/service/hlo_instruction.h"
 #include "tensorflow/compiler/xla/service/hlo_opcode.h"
@@ -34,52 +32,222 @@ limitations under the License.
 #include "tensorflow/core/lib/core/errors.h"
 
 namespace xla {
+namespace {
+
+template <PrimitiveType primitive_src_type, PrimitiveType primitive_dest_type>
+static std::unique_ptr<Literal> ConvertIfTypesMatch(
+    const Literal& src_literal) {
+  CHECK_EQ(primitive_src_type, src_literal.shape().element_type());
+  return LiteralUtil::Convert<
+      typename primitive_util::PrimitiveTypeToNative<primitive_src_type>::type,
+      typename primitive_util::PrimitiveTypeToNative<
+          primitive_dest_type>::type>(src_literal);
+}
+
+template <PrimitiveType primitive_src_type>
+static std::unique_ptr<Literal> ConvertIfDestTypeMatches(
+    const Literal& src_literal, PrimitiveType primitive_dest_type) {
+  switch (primitive_dest_type) {
+#define CONVERT_IF_TYPES_MATCH(type) \
+  case (type):                       \
+    return ConvertIfTypesMatch<primitive_src_type, (type)>(src_literal);
+    CONVERT_IF_TYPES_MATCH(PRED)
+    CONVERT_IF_TYPES_MATCH(S8)
+    CONVERT_IF_TYPES_MATCH(S32)
+    CONVERT_IF_TYPES_MATCH(S64)
+    CONVERT_IF_TYPES_MATCH(U8)
+    CONVERT_IF_TYPES_MATCH(U32)
+    CONVERT_IF_TYPES_MATCH(U64)
+    CONVERT_IF_TYPES_MATCH(F32)
+    CONVERT_IF_TYPES_MATCH(F64)
+#undef CONVERT_IF_TYPES_MATCH
+    // Other types are not yet supported.
+    default:
+      LOG(FATAL) << "Unimplemented: ConvertIfDestTypeMatches for type "
+                 << PrimitiveType_Name(src_literal.shape().element_type());
+  }
+}
+
+static std::unique_ptr<Literal> ConvertIfSrcTypeMatches(
+    const Literal& src_literal, PrimitiveType primitive_dest_type) {
+  switch (src_literal.shape().element_type()) {
+#define CONVERT_IF_DEST_TYPE_MATCHES(type) \
+  case (type):                             \
+    return ConvertIfDestTypeMatches<(type)>(src_literal, primitive_dest_type);
+    CONVERT_IF_DEST_TYPE_MATCHES(PRED)
+    CONVERT_IF_DEST_TYPE_MATCHES(S8)
+    CONVERT_IF_DEST_TYPE_MATCHES(S32)
+    CONVERT_IF_DEST_TYPE_MATCHES(S64)
+    CONVERT_IF_DEST_TYPE_MATCHES(U8)
+    CONVERT_IF_DEST_TYPE_MATCHES(U32)
+    CONVERT_IF_DEST_TYPE_MATCHES(U64)
+    CONVERT_IF_DEST_TYPE_MATCHES(F32)
+    CONVERT_IF_DEST_TYPE_MATCHES(F64)
+#undef CONVERT_IF_DEST_TYPE_MATCHES
+    // Other types are not yet supported.
+    default:
+      LOG(FATAL) << "Unimplemented: ConvertIfSrcTypeMatches for type "
+                 << PrimitiveType_Name(src_literal.shape().element_type());
+  }
+}
+
+}  // namespace
+
+// ConstantFolderVisitor traverses the HLO computation and reduces certain
+// constant graph sections, to literals.
+class ConstantFolderVisitor : public DfsHloVisitorWithDefault {
+ public:
+  // Default visitor action is to do nothing and return OK.
+  Status DefaultAction(HloInstruction* /*hlo_instruction*/) override {
+    return Status::OK();
+  }
+
+  Status HandleConcatenate(
+      HloInstruction* concatenate,
+      tensorflow::gtl::ArraySlice<HloInstruction*> operands) override;
+
+  Status HandleConvert(HloInstruction* convert,
+                       HloInstruction* operand) override;
+
+  Status HandleReshape(HloInstruction* reshape) override;
+
+  Status HandleSlice(HloInstruction* slice, HloInstruction* operand) override;
+
+  Status HandleTranspose(HloInstruction* transpose) override;
+
+  // Returns whether a constant folding operation has occurred.
+  const bool changed() const { return changed_; }
+
+  // Runs the visitor on a computation and returns whether any changes were
+  // performed.
+  static StatusOr<bool> Run(HloComputation* computation);
+
+ private:
+  ConstantFolderVisitor() = default;
+
+  // Replaces the existing HLO instruction old_instruction, with a literal,
+  // and marks the optimizer status as changed.
+  // Returns the Status representing the result of the replace operation.
+  Status ReplaceWithConstant(HloInstruction* old_instruction,
+                             std::unique_ptr<Literal> literal) {
+    TF_RETURN_IF_ERROR(old_instruction->parent()->ReplaceWithNewInstruction(
+        old_instruction, HloInstruction::CreateConstant(std::move(literal))));
+    changed_ = true;
+    return Status::OK();
+  }
+
+  // Whether any constant folding operations have occurred.
+  bool changed_ = false;
+};
+
+StatusOr<bool> ConstantFolderVisitor::Run(HloComputation* computation) {
+  ConstantFolderVisitor visitor;
+  TF_RETURN_IF_ERROR(computation->Accept(&visitor));
+  return visitor.changed();
+}
 
 StatusOr<bool> HloConstantFolding::Run(HloModule* module) {
+  XLA_VLOG_LINES(2,
+                 "HloConstantFolding::Run(), before:\n" + module->ToString());
   bool changed = false;
-  for (auto& computation : module->computations()) {
-    for (auto instruction : computation->MakeInstructionPostOrder()) {
-      // Skip dead code.
-      if (instruction->user_count() == 0 &&
-          computation->root_instruction() != instruction) {
-        continue;
-      }
-      // Depending on the opcode, choose how to handle constant operands.
-      //
-      // TODO(b/35975797): Fold constant computations for more than reshapes and
-      // transposes.
-      switch (instruction->opcode()) {
-        case HloOpcode::kReshape: {
-          if (instruction->operand(0)->opcode() == HloOpcode::kConstant) {
-            TF_ASSIGN_OR_RETURN(
-                auto reshaped_literal,
-                LiteralUtil::Reshape(
-                    instruction->operand(0)->literal(),
-                    AsInt64Slice(instruction->shape().dimensions())));
-            TF_CHECK_OK(computation->ReplaceWithNewInstruction(
-                instruction,
-                HloInstruction::CreateConstant(std::move(reshaped_literal))));
-            changed = true;
-          }
-          break;
-        }
-        case HloOpcode::kTranspose: {
-          if (instruction->operand(0)->opcode() == HloOpcode::kConstant) {
-            auto transposed_literal = LiteralUtil::Transpose(
-                instruction->operand(0)->literal(), instruction->dimensions());
-            TF_CHECK_OK(computation->ReplaceWithNewInstruction(
-                instruction,
-                HloInstruction::CreateConstant(std::move(transposed_literal))));
-            changed = true;
-          }
-          break;
-        }
-        default:
-          break;
+  for (auto& comp : module->computations()) {
+    TF_ASSIGN_OR_RETURN(bool result, ConstantFolderVisitor::Run(comp.get()));
+    changed = changed || result;
+  }
+  XLA_VLOG_LINES(2, "HloConstantFolding::Run(), after:\n" + module->ToString());
+  return changed;
+}
+
+Status ConstantFolderVisitor::HandleReshape(HloInstruction* reshape) {
+  if (reshape->operand(0)->opcode() == HloOpcode::kConstant) {
+    TF_ASSIGN_OR_RETURN(
+        auto reshaped_literal,
+        LiteralUtil::Reshape(reshape->operand(0)->literal(),
+                             AsInt64Slice(reshape->shape().dimensions())));
+    return ReplaceWithConstant(reshape, std::move(reshaped_literal));
+  }
+  return Status::OK();
+}
+
+Status ConstantFolderVisitor::HandleTranspose(HloInstruction* transpose) {
+  if (transpose->operand(0)->opcode() == HloOpcode::kConstant) {
+    auto transposed_literal = LiteralUtil::Transpose(
+        transpose->operand(0)->literal(), transpose->dimensions());
+    return ReplaceWithConstant(transpose, std::move(transposed_literal));
+  }
+  return Status::OK();
+}
+
+Status ConstantFolderVisitor::HandleConcatenate(
+    HloInstruction* concatenate,
+    tensorflow::gtl::ArraySlice<HloInstruction*> operands) {
+  if (operands[0]->opcode() == HloOpcode::kConstant) {
+    // If all the operands of a concatenate are constant, fold them into a
+    // single constant tensor.
+    // The result concatenate dimension is going to be the sum of all the
+    // concatenate dimensions of the arrays taking part of the operation.
+    int64 concat_dim = concatenate->dimensions()[0];
+    const Shape& reference_shape = operands[0]->shape();
+    CHECK(!ShapeUtil::IsTuple(reference_shape));
+    int64 rank = ShapeUtil::Rank(reference_shape);
+    std::vector<int64> concat_dimensions(reference_shape.dimensions().begin(),
+                                         reference_shape.dimensions().end());
+    if (concat_dim < 0) {
+      concat_dim += rank;
+    }
+    for (int64 i = 1; i < operands.size(); ++i) {
+      const Shape& operand_shape = operands[i]->shape();
+      CHECK(!ShapeUtil::IsTuple(operand_shape));
+      if (operands[i]->opcode() != HloOpcode::kConstant) {
+        return Status::OK();
       }
+      // Accumulate the concat dimension from all tensors taking part to the
+      // operation.
+      concat_dimensions[concat_dim] +=
+          ShapeUtil::GetDimension(operand_shape, concat_dim);
+    }
+
+    auto literal = LiteralUtil::CreateFromDimensions(
+        reference_shape.element_type(), concat_dimensions);
+    std::vector<int64> source_indices(rank, 0);
+    std::vector<int64> dest_indices(concat_dimensions.size(), 0);
+    for (auto operand : operands) {
+      const Shape& operand_shape = operand->shape();
+      TF_RETURN_IF_ERROR(LiteralUtil::Copy(
+          operand->literal(), source_indices, literal.get(), dest_indices,
+          AsInt64Slice(operand_shape.dimensions())));
+      dest_indices[concat_dim] +=
+          ShapeUtil::GetDimension(operand_shape, concat_dim);
     }
+    return ReplaceWithConstant(concatenate, std::move(literal));
   }
-  return changed;
+  return Status::OK();
+}
+
+Status ConstantFolderVisitor::HandleSlice(HloInstruction* slice,
+                                          HloInstruction* operand) {
+  if (operand->opcode() == HloOpcode::kConstant) {
+    const Shape& shape = slice->shape();
+    auto literal = LiteralUtil::CreateFromDimensions(
+        shape.element_type(), AsInt64Slice(shape.dimensions()));
+    std::vector<int64> dest_indices(slice->slice_starts().size(), 0);
+    TF_RETURN_IF_ERROR(LiteralUtil::Copy(
+        operand->literal(), slice->slice_starts(), literal.get(), dest_indices,
+        AsInt64Slice(shape.dimensions())));
+    TF_RETURN_IF_ERROR(ReplaceWithConstant(slice, std::move(literal)));
+  }
+  return Status::OK();
+}
+
+Status ConstantFolderVisitor::HandleConvert(HloInstruction* convert,
+                                            HloInstruction* operand) {
+  if (operand->opcode() == HloOpcode::kConstant) {
+    const Literal& src_literal = operand->literal();
+    std::unique_ptr<Literal> new_constant =
+        ConvertIfSrcTypeMatches(src_literal, convert->shape().element_type());
+    return ReplaceWithConstant(convert, std::move(new_constant));
+  }
+  return Status::OK();
 }
 
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/hlo_constant_folding.h b/tensorflow/compiler/xla/service/hlo_constant_folding.h
index 514bb8164c1e1fa10a36ceeeac63dc946de2ab5a..f45eccf825389609323eed9c5180dc385edc3092 100644
--- a/tensorflow/compiler/xla/service/hlo_constant_folding.h
+++ b/tensorflow/compiler/xla/service/hlo_constant_folding.h
@@ -25,12 +25,10 @@ namespace xla {
 // computation on constants.
 class HloConstantFolding : public HloPassInterface {
  public:
-  explicit HloConstantFolding() {}
-  ~HloConstantFolding() override {}
   tensorflow::StringPiece name() const override { return "constant_folding"; }
 
-  // Run ConstantFolding on the given module. Returns whether the module was
-  // changed (common subexpressions were found and eliminated).
+  // Run constant folding operations on the given module. Returns whether the
+  // module was changed (constant expressions folded).
   StatusOr<bool> Run(HloModule* module) override;
 };
 
diff --git a/tensorflow/compiler/xla/service/hlo_constant_folding_test.cc b/tensorflow/compiler/xla/service/hlo_constant_folding_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..a56225da156dfc0a44b6a4b99191a3c7e706561f
--- /dev/null
+++ b/tensorflow/compiler/xla/service/hlo_constant_folding_test.cc
@@ -0,0 +1,213 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/service/hlo_constant_folding.h"
+
+#include <memory>
+#include <utility>
+
+#include "tensorflow/compiler/xla/layout_util.h"
+#include "tensorflow/compiler/xla/literal_util.h"
+#include "tensorflow/compiler/xla/service/hlo_computation.h"
+#include "tensorflow/compiler/xla/service/hlo_instruction.h"
+#include "tensorflow/compiler/xla/service/hlo_matchers.h"
+#include "tensorflow/compiler/xla/service/hlo_opcode.h"
+#include "tensorflow/compiler/xla/service/hlo_pass_fix.h"
+#include "tensorflow/compiler/xla/shape_util.h"
+#include "tensorflow/compiler/xla/test.h"
+#include "tensorflow/compiler/xla/tests/hlo_test_base.h"
+#include "tensorflow/compiler/xla/tests/literal_test_util.h"
+#include "tensorflow/compiler/xla/types.h"
+
+namespace op = xla::testing::opcode_matchers;
+
+namespace xla {
+namespace {
+
+using HloConstantFoldingTest = HloTestBase;
+
+TEST_F(HloConstantFoldingTest, ConvertF32ToS64) {
+  HloComputation::Builder builder(TestName());
+  HloInstruction* input = builder.AddInstruction(
+      HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(42.0f)));
+  builder.AddInstruction(
+      HloInstruction::CreateConvert(ShapeUtil::MakeShape(S64, {}), input));
+
+  auto module = MakeUnique<HloModule>(TestName());
+  auto computation = module->AddEntryComputation(builder.Build());
+
+  EXPECT_THAT(computation->root_instruction(), op::Convert(input));
+
+  HloConstantFolding const_folder;
+  TF_ASSIGN_OR_ASSERT_OK(bool result, const_folder.Run(module.get()));
+  EXPECT_TRUE(result);
+
+  EXPECT_THAT(computation->root_instruction(), op::Constant());
+  EXPECT_EQ(LiteralUtil::GetFirstElement<int64>(
+                computation->root_instruction()->literal()),
+            42);
+}
+
+TEST_F(HloConstantFoldingTest, ConvertS64ToF32) {
+  HloComputation::Builder builder(TestName());
+  HloInstruction* input = builder.AddInstruction(
+      HloInstruction::CreateConstant(LiteralUtil::CreateR0<int64>(42)));
+  builder.AddInstruction(
+      HloInstruction::CreateConvert(ShapeUtil::MakeShape(F32, {}), input));
+
+  auto module = MakeUnique<HloModule>(TestName());
+  auto computation = module->AddEntryComputation(builder.Build());
+
+  EXPECT_THAT(computation->root_instruction(), op::Convert(input));
+
+  HloConstantFolding const_folder;
+  TF_ASSIGN_OR_ASSERT_OK(bool result, const_folder.Run(module.get()));
+  EXPECT_TRUE(result);
+
+  EXPECT_THAT(computation->root_instruction(), op::Constant());
+  EXPECT_EQ(LiteralUtil::GetFirstElement<float>(
+                computation->root_instruction()->literal()),
+            42.0f);
+}
+
+TEST_F(HloConstantFoldingTest, ConvertF32ArrayToS64Array) {
+  HloComputation::Builder builder(TestName());
+  HloInstruction* input = builder.AddInstruction(HloInstruction::CreateConstant(
+      LiteralUtil::CreateR1<float>({42.0f, 19.0f})));
+  builder.AddInstruction(
+      HloInstruction::CreateConvert(ShapeUtil::MakeShape(S64, {2}), input));
+
+  auto module = MakeUnique<HloModule>(TestName());
+  auto computation = module->AddEntryComputation(builder.Build());
+
+  EXPECT_THAT(computation->root_instruction(), op::Convert(input));
+
+  HloConstantFolding const_folder;
+  TF_ASSIGN_OR_ASSERT_OK(bool result, const_folder.Run(module.get()));
+  EXPECT_TRUE(result);
+
+  EXPECT_THAT(computation->root_instruction(), op::Constant());
+  EXPECT_EQ(
+      LiteralUtil::Get<int64>(computation->root_instruction()->literal(), {0}),
+      42);
+  EXPECT_EQ(
+      LiteralUtil::Get<int64>(computation->root_instruction()->literal(), {1}),
+      19);
+}
+
+TEST_F(HloConstantFoldingTest, Concatenate) {
+  const struct TestConfig {
+    int concat_dimension;
+    tensorflow::gtl::ArraySlice<int64> dimensions;
+    tensorflow::gtl::ArraySlice<int64> concat_sizes;
+  } test_configs[] = {
+      {1, {11, 0, 7, 5, 9}, {2, 5, 7, 11}},
+      {3, {1, 4, 17, 0, 8}, {1, 3, 9, 12}},
+  };
+
+  for (auto& test_config : test_configs) {
+    HloComputation::Builder builder(TestName());
+    std::vector<int64> dimensions(test_config.dimensions.begin(),
+                                  test_config.dimensions.end());
+    int64 concat_size = 0;
+    std::vector<HloInstruction*> operands;
+    for (auto csize : test_config.concat_sizes) {
+      dimensions[test_config.concat_dimension] = csize;
+      concat_size += csize;
+      auto literal = LiteralUtil::CreateFromDimensions(F32, dimensions);
+      HloInstruction* insn = builder.AddInstruction(
+          HloInstruction::CreateConstant(std::move(literal)));
+      operands.push_back(insn);
+    }
+    dimensions[test_config.concat_dimension] = concat_size;
+    Shape shape = ShapeUtil::MakeShape(F32, dimensions);
+    builder.AddInstruction(HloInstruction::CreateConcatenate(
+        shape, operands, test_config.concat_dimension));
+    auto module = MakeUnique<HloModule>(TestName());
+    auto computation = module->AddEntryComputation(builder.Build());
+
+    HloConstantFolding const_folder;
+    TF_ASSIGN_OR_ASSERT_OK(bool result, const_folder.Run(module.get()));
+    EXPECT_TRUE(result);
+
+    HloInstruction* root = computation->root_instruction();
+    EXPECT_THAT(root, op::Constant());
+    EXPECT_TRUE(ShapeUtil::Equal(root->shape(), shape));
+  }
+}
+
+TEST_F(HloConstantFoldingTest, Slice) {
+  HloComputation::Builder builder(TestName());
+  const int64 dimensions[] = {11, 8, 7, 5, 9};
+  const int64 slice_start[] = {4, 2, 3, 1, 5};
+  const int64 slice_limits[] = {10, 8, 6, 5, 9};
+  TF_ASSIGN_OR_ASSERT_OK(auto literal,
+                         LiteralTestUtil::CreateRandomLiteral<F32>(
+                             ShapeUtil::MakeShape(F32, dimensions), 0.0, 1.0));
+  HloInstruction* literal_instruction = builder.AddInstruction(
+      HloInstruction::CreateConstant(std::move(literal)));
+  Shape shape = ShapeUtil::MakeShape(F32, {6, 6, 3, 4, 4});
+  builder.AddInstruction(HloInstruction::CreateSlice(
+      shape, literal_instruction, slice_start, slice_limits));
+  auto module = MakeUnique<HloModule>(TestName());
+  auto computation = module->AddEntryComputation(builder.Build());
+
+  HloConstantFolding const_folder;
+  TF_ASSIGN_OR_ASSERT_OK(bool result, const_folder.Run(module.get()));
+  EXPECT_TRUE(result);
+
+  HloInstruction* root = computation->root_instruction();
+  EXPECT_THAT(root, op::Constant());
+  EXPECT_TRUE(ShapeUtil::Equal(root->shape(), shape));
+}
+
+TEST_F(HloConstantFoldingTest, TransposeConstantFold) {
+  HloComputation::Builder builder(TestName());
+  const int64 dimensions[] = {11, 8, 7, 5, 9};
+  TF_ASSIGN_OR_ASSERT_OK(auto literal,
+                         LiteralTestUtil::CreateRandomLiteral<F32>(
+                             ShapeUtil::MakeShape(F32, dimensions), 0.0, 1.0));
+  auto literal_clone = LiteralUtil::CloneToUnique(*literal);
+  HloInstruction* literal_instruction = builder.AddInstruction(
+      HloInstruction::CreateConstant(std::move(literal)));
+  Shape shape = ShapeUtil::MakeShape(F32, {8, 7, 11, 9, 5});
+  const int64 permutation[] = {1, 2, 0, 4, 3};
+  builder.AddInstruction(
+      HloInstruction::CreateTranspose(shape, literal_instruction, permutation));
+  auto module = MakeUnique<HloModule>(TestName());
+  auto computation = module->AddEntryComputation(builder.Build());
+
+  HloConstantFolding const_folder;
+  TF_ASSIGN_OR_ASSERT_OK(bool result, const_folder.Run(module.get()));
+  EXPECT_TRUE(result);
+
+  HloInstruction* root = computation->root_instruction();
+  EXPECT_THAT(root, op::Constant());
+  EXPECT_TRUE(ShapeUtil::Compatible(root->shape(), shape));
+
+  using NativeT = typename primitive_util::PrimitiveTypeToNative<F32>::type;
+  bool matched = true;
+  LiteralUtil::EachCell<NativeT>(
+      root->literal(),
+      [&](tensorflow::gtl::ArraySlice<int64> indices, NativeT value) {
+        std::vector<int64> rindexes = Permute(permutation, indices);
+        matched = matched && (value == LiteralUtil::Get<NativeT>(*literal_clone,
+                                                                 rindexes));
+      });
+  EXPECT_TRUE(matched);
+}
+
+}  // namespace
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/service/hlo_cse_test.cc b/tensorflow/compiler/xla/service/hlo_cse_test.cc
index ec8161f55fd56c95bb088a0c539255aed2fe6993..9444382b5270b0f76fa33b598297d24572e5b2c9 100644
--- a/tensorflow/compiler/xla/service/hlo_cse_test.cc
+++ b/tensorflow/compiler/xla/service/hlo_cse_test.cc
@@ -25,6 +25,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/ptr_util.h"
 #include "tensorflow/compiler/xla/service/hlo_computation.h"
 #include "tensorflow/compiler/xla/service/hlo_instruction.h"
+#include "tensorflow/compiler/xla/service/hlo_matchers.h"
 #include "tensorflow/compiler/xla/service/hlo_module.h"
 #include "tensorflow/compiler/xla/service/hlo_opcode.h"
 #include "tensorflow/compiler/xla/shape_util.h"
@@ -36,6 +37,8 @@ limitations under the License.
 #include "tensorflow/compiler/xla/types.h"
 #include "tensorflow/core/platform/types.h"
 
+namespace op = xla::testing::opcode_matchers;
+
 namespace xla {
 namespace {
 
@@ -88,13 +91,15 @@ TEST_F(HloCseTest, CombineTwoConstantsDifferentLayoutsAndInsensitive) {
   auto computation = module->AddEntryComputation(builder.Build());
 
   EXPECT_EQ(3, computation->instruction_count());
-  EXPECT_NE(add->operand(0), add->operand(1));
+  EXPECT_THAT(add, op::Add(constant1, constant2));
 
   HloCSE cse(/*is_layout_sensitive=*/false);
   EXPECT_TRUE(cse.Run(module.get()).ValueOrDie());
 
   EXPECT_EQ(2, computation->instruction_count());
-  EXPECT_EQ(add->operand(0), add->operand(1));
+  auto first_operand = add->operand(0);
+  EXPECT_THAT(first_operand, ::testing::AnyOf(constant1, constant2));
+  EXPECT_THAT(add, op::Add(first_operand, first_operand));
 
   auto result = ExecuteAndTransfer(std::move(module), {});
   auto expected = LiteralUtil::CreateR2<float>({{2.0, 4.0}, {6.0, 8.0}});
@@ -118,15 +123,13 @@ TEST_F(HloCseTest, CombineTwoConstantsDifferentLayoutsAndSensitive) {
   auto computation = module->AddEntryComputation(builder.Build());
 
   EXPECT_EQ(3, computation->instruction_count());
-  EXPECT_EQ(constant1, add->operand(0));
-  EXPECT_EQ(constant2, add->operand(1));
+  EXPECT_THAT(add, op::Add(constant1, constant2));
 
   HloCSE cse(/*is_layout_sensitive=*/true);
   EXPECT_FALSE(cse.Run(module.get()).ValueOrDie());
 
   EXPECT_EQ(3, computation->instruction_count());
-  EXPECT_EQ(constant1, add->operand(0));
-  EXPECT_EQ(constant2, add->operand(1));
+  EXPECT_THAT(add, op::Add(constant1, constant2));
 
   auto result = ExecuteAndTransfer(std::move(module), {});
   auto expected = LiteralUtil::CreateR2<float>({{2.0, 4.0}, {6.0, 8.0}});
@@ -185,16 +188,18 @@ TEST_F(HloCseTest, NonscalarConstants) {
   auto computation = module.AddEntryComputation(builder.Build());
 
   EXPECT_EQ(4, computation->instruction_count());
+  EXPECT_THAT(tuple,
+              op::Tuple(common_constant1, common_constant2, uncommon_constant));
 
   HloCSE cse(/*is_layout_sensitive=*/false);
   EXPECT_TRUE(cse.Run(&module).ValueOrDie());
 
   EXPECT_EQ(3, computation->instruction_count());
-
-  EXPECT_EQ(tuple->operand(0), tuple->operand(1));
-  EXPECT_EQ(uncommon_constant, tuple->operand(2));
-  EXPECT_TRUE(tuple->operand(0) == common_constant1 ||
-              tuple->operand(0) == common_constant2);
+  auto first_operand = tuple->operand(0);
+  EXPECT_THAT(first_operand,
+              ::testing::AnyOf(common_constant1, common_constant2));
+  EXPECT_THAT(tuple,
+              op::Tuple(first_operand, first_operand, uncommon_constant));
 }
 
 TEST_F(HloCseTest, IdenticalInstructions) {
@@ -215,16 +220,15 @@ TEST_F(HloCseTest, IdenticalInstructions) {
   auto computation = module.AddEntryComputation(builder.Build());
 
   EXPECT_EQ(5, computation->instruction_count());
-  EXPECT_NE(tuple->operand(0), tuple->operand(1));
-  EXPECT_NE(tuple->operand(1), tuple->operand(2));
-  EXPECT_NE(tuple->operand(0), tuple->operand(2));
+  EXPECT_THAT(tuple, op::Tuple(exp1, exp2, exp3));
 
   HloCSE cse(/*is_layout_sensitive=*/false);
   EXPECT_TRUE(cse.Run(&module).ValueOrDie());
 
   EXPECT_EQ(3, computation->instruction_count());
-  EXPECT_EQ(tuple->operand(0), tuple->operand(1));
-  EXPECT_EQ(tuple->operand(1), tuple->operand(2));
+  auto first_operand = tuple->operand(0);
+  EXPECT_THAT(first_operand, ::testing::AnyOf(exp1, exp2, exp3));
+  EXPECT_THAT(tuple, op::Tuple(first_operand, first_operand, first_operand));
 }
 
 TEST_F(HloCseTest, IdenticalInstructionsDifferentLayoutsSensitive) {
@@ -249,13 +253,13 @@ TEST_F(HloCseTest, IdenticalInstructionsDifferentLayoutsSensitive) {
   auto computation = module.AddEntryComputation(builder.Build());
 
   EXPECT_EQ(4, computation->instruction_count());
-  EXPECT_NE(tuple->operand(0), tuple->operand(1));
+  EXPECT_THAT(tuple, op::Tuple(exp1, exp2));
 
   HloCSE cse(/*is_layout_sensitive=*/true);
   EXPECT_FALSE(cse.Run(&module).ValueOrDie());
 
   EXPECT_EQ(4, computation->instruction_count());
-  EXPECT_NE(tuple->operand(0), tuple->operand(1));
+  EXPECT_THAT(tuple, op::Tuple(exp1, exp2));
 }
 
 TEST_F(HloCseTest, IdenticalInstructionsDifferentLayoutsInsensitive) {
@@ -280,13 +284,15 @@ TEST_F(HloCseTest, IdenticalInstructionsDifferentLayoutsInsensitive) {
   auto computation = module.AddEntryComputation(builder.Build());
 
   EXPECT_EQ(4, computation->instruction_count());
-  EXPECT_NE(tuple->operand(0), tuple->operand(1));
+  EXPECT_THAT(tuple, op::Tuple(exp1, exp2));
 
   HloCSE cse(/*is_layout_sensitive=*/false);
   EXPECT_TRUE(cse.Run(&module).ValueOrDie());
 
   EXPECT_EQ(3, computation->instruction_count());
-  EXPECT_EQ(tuple->operand(0), tuple->operand(1));
+  auto first_operand = tuple->operand(0);
+  EXPECT_THAT(first_operand, ::testing::AnyOf(exp1, exp2));
+  EXPECT_THAT(tuple, op::Tuple(first_operand, first_operand));
 }
 
 TEST_F(HloCseTest, IdenticalExpressions) {
@@ -328,14 +334,15 @@ TEST_F(HloCseTest, IdenticalExpressions) {
   auto computation = module.AddEntryComputation(builder.Build());
 
   EXPECT_EQ(8, computation->instruction_count());
-  EXPECT_NE(tuple->operand(0), tuple->operand(1));
+  EXPECT_THAT(tuple, op::Tuple(op::Add(negate1, exp1), op::Add(negate2, exp2)));
 
   HloCSE cse(/*is_layout_sensitive=*/false);
   EXPECT_TRUE(cse.Run(&module).ValueOrDie());
 
   EXPECT_EQ(5, computation->instruction_count());
-  EXPECT_EQ(tuple->operand(0), tuple->operand(1));
-  EXPECT_EQ(HloOpcode::kAdd, tuple->operand(0)->opcode());
+  auto operand = tuple->operand(0);
+  EXPECT_THAT(tuple, op::Tuple(operand, operand));
+  EXPECT_THAT(operand, op::Add(op::Negate(), op::Exp()));
 }
 
 TEST_F(HloCseTest, DoNotCombineRng) {
@@ -351,12 +358,16 @@ TEST_F(HloCseTest, DoNotCombineRng) {
   auto rng2 = builder.AddInstruction(HloInstruction::CreateRng(
       ShapeUtil::MakeShape(F32, {}), RandomDistribution::RNG_UNIFORM,
       {constant1, constant2}));
+
   builder.AddInstruction(HloInstruction::CreateBinary(
       constant1->shape(), HloOpcode::kAdd, rng1, rng2));
 
   auto module = MakeUnique<HloModule>(TestName());
   auto computation = module->AddEntryComputation(builder.Build());
 
+  HloInstruction* root = computation->root_instruction();
+  EXPECT_THAT(root, op::Add(rng1, rng2));
+
   uint32 count_before = computation->instruction_count();
 
   HloCSE cse(/*is_layout_sensitive=*/false);
@@ -364,11 +375,8 @@ TEST_F(HloCseTest, DoNotCombineRng) {
 
   uint32 count_after = computation->instruction_count();
   EXPECT_EQ(count_before, count_after);
-  HloInstruction* root = computation->root_instruction();
-  EXPECT_EQ(root->opcode(), HloOpcode::kAdd);
-  EXPECT_EQ(root->operand(0)->opcode(), HloOpcode::kRng);
-  EXPECT_EQ(root->operand(1)->opcode(), HloOpcode::kRng);
-  EXPECT_NE(root->operand(0), root->operand(1));
+  root = computation->root_instruction();
+  EXPECT_THAT(root, op::Add(rng1, rng2));
 }
 
 // TODO(b/28245743): Handle impure functions correctly in CSE.
@@ -412,16 +420,17 @@ TEST_F(HloCseTest, DISABLED_DoNotCombineCallsToImpureFunctions) {
   }
 
   EXPECT_EQ(4, computation->instruction_count());
+  HloInstruction* root = computation->root_instruction();
+  EXPECT_THAT(root, op::Add(op::Map(), op::Map()));
 
   HloCSE cse(/*is_layout_sensitive=*/false);
   EXPECT_TRUE(cse.Run(module.get()).ValueOrDie());
 
   EXPECT_EQ(4, computation->instruction_count());
-  HloInstruction* root = computation->root_instruction();
-  EXPECT_EQ(root->opcode(), HloOpcode::kAdd);
-  EXPECT_EQ(root->operand(0)->opcode(), HloOpcode::kMap);
-  EXPECT_EQ(root->operand(1)->opcode(), HloOpcode::kMap);
-  EXPECT_NE(root->operand(0), root->operand(1));
+  root = computation->root_instruction();
+  auto operand = root->operand(0)->operand(0);
+  EXPECT_THAT(operand, op::Map());
+  EXPECT_THAT(root, op::Add(operand, operand));
 }
 
 }  // namespace
diff --git a/tensorflow/compiler/xla/service/hlo_evaluator.cc b/tensorflow/compiler/xla/service/hlo_evaluator.cc
index ebe74280525010436423163f746ddee6a23dc7e1..e0447d69aa2229e2cb391aac8b2afa8fde6145c1 100644
--- a/tensorflow/compiler/xla/service/hlo_evaluator.cc
+++ b/tensorflow/compiler/xla/service/hlo_evaluator.cc
@@ -26,342 +26,532 @@ limitations under the License.
 #include "tensorflow/compiler/xla/index_util.h"
 #include "tensorflow/compiler/xla/layout_util.h"
 #include "tensorflow/compiler/xla/literal_util.h"
+#include "tensorflow/compiler/xla/map_util.h"
 #include "tensorflow/compiler/xla/primitive_util.h"
 #include "tensorflow/compiler/xla/ptr_util.h"
 #include "tensorflow/compiler/xla/service/hlo_opcode.h"
 #include "tensorflow/compiler/xla/service/hlo_query.h"
 #include "tensorflow/compiler/xla/shape_util.h"
+#include "tensorflow/compiler/xla/status.h"
+#include "tensorflow/compiler/xla/status_macros.h"
 #include "tensorflow/compiler/xla/types.h"
 #include "tensorflow/compiler/xla/util.h"
 #include "tensorflow/core/lib/core/bitmap.h"
+#include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/lib/core/stringpiece.h"
-#include "tensorflow/core/lib/gtl/array_slice.h"
-#include "tensorflow/core/lib/gtl/inlined_vector.h"
 #include "tensorflow/core/platform/logging.h"
-#include "tensorflow/core/platform/macros.h"
 #include "tensorflow/core/platform/protobuf.h"
 #include "tensorflow/core/platform/types.h"
 
 namespace xla {
 
-namespace {
+template <typename ReturnT>
+class HloEvaluator::TypedVisitor : public DfsHloVisitorWithDefault {
+ public:
+  explicit TypedVisitor(HloEvaluator* p) : parent_(p) {}
 
-template <typename NativeT>
-std::unique_ptr<Literal> ElementWiseUnaryOp(
-    const Shape& shape, std::function<NativeT(NativeT)>&& unary_op,
-    const Literal& operand) {
-  DCHECK(ShapeUtil::SameDimensions(shape, operand.shape()));
+  Status DefaultAction(HloInstruction* hlo_instruction) override {
+    return Unimplemented("unhandled HLO ops for HloEvaluator: %s.",
+                         HloOpcodeString(hlo_instruction->opcode()).c_str());
+  };
 
-  auto result = MakeUnique<Literal>();
-  *result->mutable_shape() = shape;
-  LiteralUtil::Reserve(ShapeUtil::ElementsIn(shape), result.get());
+  // TODO(b/35950897): many of the stl functions used in the handlers are not
+  // overloaded for every XLA primitive types.
 
-  std::vector<int64> multi_index(ShapeUtil::Rank(result->shape()), 0);
-  do {
-    LiteralUtil::Set<NativeT>(
-        result.get(), multi_index,
-        unary_op(LiteralUtil::Get<NativeT>(operand, multi_index)));
-  } while (IndexUtil::BumpIndices(result->shape(), &multi_index));
+  template <typename NativeT,
+            typename std::enable_if<std::is_unsigned<NativeT>::value>::type* =
+                nullptr>
+  Status HandleAbs(HloInstruction* abs, HloInstruction* operand) {
+    TF_ASSIGN_OR_RETURN(parent_->evaluated_[abs],
+                        ElementWiseUnaryOp(abs, [](NativeT elem_operand) {
+                          return elem_operand;
+                        }));
+    return Status::OK();
+  };
 
-  return result;
-}
+  template <
+      typename NativeT,
+      typename std::enable_if<std::is_signed<NativeT>::value>::type* = nullptr>
+  Status HandleAbs(HloInstruction* abs, HloInstruction* operand) {
+    TF_ASSIGN_OR_RETURN(parent_->evaluated_[abs],
+                        ElementWiseUnaryOp(abs, [](NativeT elem_operand) {
+                          return std::abs(elem_operand);
+                        }));
+    return Status::OK();
+  };
 
-template <typename NativeT>
-std::unique_ptr<Literal> ElementWiseBinaryOp(
-    const Shape& shape, std::function<NativeT(NativeT, NativeT)>&& binary_op,
-    const Literal& lhs, const Literal& rhs) {
-  DCHECK(ShapeUtil::SameDimensions(shape, rhs.shape()));
-  DCHECK(ShapeUtil::SameDimensions(lhs.shape(), rhs.shape()));
-
-  auto result = MakeUnique<Literal>();
-  *result->mutable_shape() = shape;
-  LiteralUtil::Reserve(ShapeUtil::ElementsIn(shape), result.get());
-
-  std::vector<int64> multi_index(ShapeUtil::Rank(result->shape()), 0);
-  do {
-    LiteralUtil::Set<NativeT>(
-        result.get(), multi_index,
-        binary_op(LiteralUtil::Get<NativeT>(lhs, multi_index),
-                  LiteralUtil::Get<NativeT>(rhs, multi_index)));
-  } while (IndexUtil::BumpIndices(result->shape(), &multi_index));
-
-  return result;
-}
+  Status HandleAbs(HloInstruction* abs, HloInstruction* operand) override {
+    return HandleAbs<ReturnT>(abs, operand);
+  };
 
-template <typename NativeT, typename LhsType, typename RhsType,
-          typename EhsType>
-std::unique_ptr<Literal> ElementWiseTernaryOp(
-    const Shape& shape,
-    std::function<NativeT(LhsType lhs, RhsType rhs, EhsType ehs)>&& ternary_op,
-    const Literal& lhs, const Literal& rhs, const Literal& ehs) {
-  DCHECK(ShapeUtil::SameDimensions(shape, lhs.shape()));
-  DCHECK(ShapeUtil::SameDimensions(lhs.shape(), rhs.shape()));
-  DCHECK(ShapeUtil::SameDimensions(rhs.shape(), ehs.shape()));
-
-  auto result = MakeUnique<Literal>();
-  *result->mutable_shape() = shape;
-  LiteralUtil::Reserve(ShapeUtil::ElementsIn(shape), result.get());
-
-  std::vector<int64> multi_index(ShapeUtil::Rank(result->shape()), 0);
-  do {
-    LiteralUtil::Set<NativeT>(
-        result.get(), multi_index,
-        ternary_op(LiteralUtil::Get<LhsType>(lhs, multi_index),
-                   LiteralUtil::Get<RhsType>(rhs, multi_index),
-                   LiteralUtil::Get<EhsType>(ehs, multi_index)));
-  } while (IndexUtil::BumpIndices(result->shape(), &multi_index));
-
-  return result;
-}
+  Status HandleCeil(HloInstruction* ceil, HloInstruction* operand) override {
+    TF_ASSIGN_OR_RETURN(parent_->evaluated_[ceil],
+                        ElementWiseUnaryOp(ceil, [](ReturnT elem_operand) {
+                          return std::ceil(elem_operand);
+                        }));
+    return Status::OK();
+  };
 
-// Templated abs so that unsigned types can be passed in without warning.
-template <
-    typename NativeT,
-    typename std::enable_if<std::is_unsigned<NativeT>::value>::type* = nullptr>
-NativeT AbsoluteVal(NativeT value) {
-  return value;
-}
+  Status HandleCopy(HloInstruction* copy, HloInstruction* operand) override {
+    TF_ASSIGN_OR_RETURN(parent_->evaluated_[copy],
+                        ElementWiseUnaryOp(copy, [](ReturnT elem_operand) {
+                          return elem_operand;
+                        }));
+    return Status::OK();
+  };
 
-template <
-    typename NativeT,
-    typename std::enable_if<std::is_signed<NativeT>::value>::type* = nullptr>
-NativeT AbsoluteVal(NativeT value) {
-  return std::abs(value);
-}
+  Status HandleExp(HloInstruction* exp, HloInstruction* operand) override {
+    TF_ASSIGN_OR_RETURN(parent_->evaluated_[exp],
+                        ElementWiseUnaryOp(exp, [](ReturnT elem_operand) {
+                          return std::exp(elem_operand);
+                        }));
+    return Status::OK();
+  };
 
-template <typename NativeT>
-StatusOr<std::unique_ptr<Literal>> EvaluateOpForLiteralInternal(
-    HloInstruction* instruction) {
-  DCHECK(hlo_query::AllOperandsAreConstants(*instruction));
-
-  const std::vector<HloInstruction*>& operands = instruction->operands();
-  HloOpcode opcode = instruction->opcode();
-  const Shape& shape = instruction->shape();
-
-  switch (opcode) {
-    // TODO(b/35950897): many of the stl function used here are not overloaded
-    // for all XLA primitive types.
-    // Unary element-wise ops.
-    case HloOpcode::kAbs:
-      CHECK_EQ(operands.size(), 1);
-      return ElementWiseUnaryOp<NativeT>(
-          shape, [](NativeT operand) { return AbsoluteVal(operand); },
-          operands[0]->literal());
-    case HloOpcode::kCeil:
-      CHECK_EQ(operands.size(), 1);
-      return ElementWiseUnaryOp<NativeT>(
-          shape, [](NativeT operand) { return std::ceil(operand); },
-          operands[0]->literal());
-    case HloOpcode::kConvert:
-      CHECK_EQ(operands.size(), 1);
-      // TODO(b/35950897): implement Convert.
-      return Unimplemented("unhandled HLO ops for HloEvaluator: %s.",
-                           HloOpcodeString(opcode).c_str());
-    case HloOpcode::kCopy:
-      CHECK_EQ(operands.size(), 1);
-      return ElementWiseUnaryOp<NativeT>(
-          shape, [](NativeT operand) { return operand; },
-          operands[0]->literal());
-    case HloOpcode::kExp:
-      CHECK_EQ(operands.size(), 1);
-      return ElementWiseUnaryOp<NativeT>(
-          shape, [](NativeT operand) { return std::exp(operand); },
-          operands[0]->literal());
-    case HloOpcode::kFloor:
-      CHECK_EQ(operands.size(), 1);
-      return ElementWiseUnaryOp<NativeT>(
-          shape, [](NativeT operand) { return std::floor(operand); },
-          operands[0]->literal());
-    case HloOpcode::kIsFinite:
-      CHECK_EQ(operands.size(), 1);
-      return ElementWiseUnaryOp<NativeT>(
-          shape, [](NativeT operand) { return std::isfinite(operand); },
-          operands[0]->literal());
-    case HloOpcode::kLog:
-      CHECK_EQ(operands.size(), 1);
-      return ElementWiseUnaryOp<NativeT>(
-          shape, [](NativeT operand) { return std::log(operand); },
-          operands[0]->literal());
-    case HloOpcode::kLogicalNot:
-      CHECK_EQ(operands.size(), 1);
-      return ElementWiseUnaryOp<NativeT>(
-          shape, [](NativeT operand) { return !operand; },
-          operands[0]->literal());
-    case HloOpcode::kNegate:
-      CHECK_EQ(operands.size(), 1);
-      return ElementWiseUnaryOp<NativeT>(
-          shape, [](NativeT operand) { return -operand; },
-          operands[0]->literal());
-    case HloOpcode::kSign:
-      CHECK_EQ(operands.size(), 1);
-      CHECK(primitive_util::IsIntegralType(shape.element_type()));
-      return ElementWiseUnaryOp<int>(shape,
-                                     [](NativeT operand) {
-                                       return (NativeT(0) < operand) -
-                                              (operand < NativeT(0));
-                                     },
-                                     operands[0]->literal());
-    case HloOpcode::kTanh:
-      CHECK_EQ(operands.size(), 1);
-      return ElementWiseUnaryOp<NativeT>(
-          shape, [](NativeT operand) { return std::tanh(operand); },
-          operands[0]->literal());
-    // Binary element-wise ops.
-    case HloOpcode::kAdd:
-      CHECK_EQ(operands.size(), 2);
-      return ElementWiseBinaryOp<NativeT>(
-          shape, [](NativeT lhs, NativeT rhs) { return lhs + rhs; },
-          operands[0]->literal(), operands[1]->literal());
-    case HloOpcode::kDivide:
-      CHECK_EQ(operands.size(), 2);
-      return ElementWiseBinaryOp<NativeT>(
-          shape, [](NativeT lhs, NativeT rhs) { return lhs / rhs; },
-          operands[0]->literal(), operands[1]->literal());
-    case HloOpcode::kMultiply:
-      CHECK_EQ(operands.size(), 2);
-      return ElementWiseBinaryOp<NativeT>(
-          shape, [](NativeT lhs, NativeT rhs) { return lhs * rhs; },
-          operands[0]->literal(), operands[1]->literal());
-    case HloOpcode::kSubtract:
-      CHECK_EQ(operands.size(), 2);
-      return ElementWiseBinaryOp<NativeT>(
-          shape, [](NativeT lhs, NativeT rhs) { return lhs - rhs; },
-          operands[0]->literal(), operands[1]->literal());
-    case HloOpcode::kEq:
-      CHECK_EQ(operands.size(), 2);
-      return ElementWiseBinaryOp<bool>(
-          shape, [](NativeT lhs, NativeT rhs) { return lhs == rhs; },
-          operands[0]->literal(), operands[1]->literal());
-    case HloOpcode::kGe:
-      CHECK_EQ(operands.size(), 2);
-      return ElementWiseBinaryOp<bool>(
-          shape, [](NativeT lhs, NativeT rhs) { return lhs >= rhs; },
-          operands[0]->literal(), operands[1]->literal());
-    case HloOpcode::kGt:
-      CHECK_EQ(operands.size(), 2);
-      return ElementWiseBinaryOp<bool>(
-          shape, [](NativeT lhs, NativeT rhs) { return lhs > rhs; },
-          operands[0]->literal(), operands[1]->literal());
-    case HloOpcode::kLe:
-      CHECK_EQ(operands.size(), 2);
-      return ElementWiseBinaryOp<bool>(
-          shape, [](NativeT lhs, NativeT rhs) { return lhs <= rhs; },
-          operands[0]->literal(), operands[1]->literal());
-    case HloOpcode::kLt:
-      CHECK_EQ(operands.size(), 2);
-      return ElementWiseBinaryOp<bool>(
-          shape, [](NativeT lhs, NativeT rhs) { return lhs < rhs; },
-          operands[0]->literal(), operands[1]->literal());
-    case HloOpcode::kNe:
-      CHECK_EQ(operands.size(), 2);
-      return ElementWiseBinaryOp<bool>(
-          shape, [](NativeT lhs, NativeT rhs) { return lhs != rhs; },
-          operands[0]->literal(), operands[1]->literal());
-    case HloOpcode::kMaximum:
-      CHECK_EQ(operands.size(), 2);
-      return ElementWiseBinaryOp<NativeT>(
-          shape, [](NativeT lhs, NativeT rhs) { return std::max(lhs, rhs); },
-          operands[0]->literal(), operands[1]->literal());
-    case HloOpcode::kMinimum:
-      CHECK_EQ(operands.size(), 2);
-      return ElementWiseBinaryOp<NativeT>(
-          shape, [](NativeT lhs, NativeT rhs) { return std::min(lhs, rhs); },
-          operands[0]->literal(), operands[1]->literal());
-    case HloOpcode::kPower:
-      CHECK_EQ(operands.size(), 2);
-      return ElementWiseBinaryOp<NativeT>(
-          shape, [](NativeT lhs, NativeT rhs) { return std::pow(lhs, rhs); },
-          operands[0]->literal(), operands[1]->literal());
-    case HloOpcode::kRemainder:
-      CHECK_EQ(operands.size(), 2);
-      return ElementWiseBinaryOp<NativeT>(
-          shape,
-          [](NativeT lhs, NativeT rhs) { return std::remainder(lhs, rhs); },
-          operands[0]->literal(), operands[1]->literal());
-    case HloOpcode::kLogicalAnd:
-      CHECK_EQ(operands.size(), 2);
-      return ElementWiseBinaryOp<NativeT>(
-          shape, [](NativeT lhs, NativeT rhs) { return lhs && rhs; },
-          operands[0]->literal(), operands[1]->literal());
-    case HloOpcode::kLogicalOr:
-      CHECK_EQ(operands.size(), 2);
-      return ElementWiseBinaryOp<NativeT>(
-          shape, [](NativeT lhs, NativeT rhs) { return lhs || rhs; },
-          operands[0]->literal(), operands[1]->literal());
-    // Ternary element-wise ops.
-    case HloOpcode::kClamp: {
-      CHECK_EQ(operands.size(), 3);
-      std::function<NativeT(NativeT, NativeT, NativeT)> clamp_op =
-          [](NativeT low, NativeT high, NativeT value) {
-            return std::max(low, std::min(value, high));
-          };
-      return ElementWiseTernaryOp<NativeT, NativeT, NativeT, NativeT>(
-          shape, std::move(clamp_op), operands[0]->literal(),
-          operands[1]->literal(), operands[2]->literal());
-    } break;
-    case HloOpcode::kSelect: {
-      CHECK_EQ(operands.size(), 3);
-      CHECK(!ShapeUtil::IsTuple(shape));
-      std::function<NativeT(bool, NativeT, NativeT)> select_op =
-          [](bool pred, NativeT on_true, NativeT on_false) {
-            if (pred) {
-              return on_true;
-            }
-            return on_false;
-          };
-      return ElementWiseTernaryOp<NativeT, bool, NativeT, NativeT>(
-          shape, std::move(select_op), operands[0]->literal(),
-          operands[1]->literal(), operands[2]->literal());
-    } break;
-    default:
-      return Unimplemented("unhandled HLO ops for HloEvaluator: %s.",
-                           HloOpcodeString(opcode).c_str());
-  }
+  Status HandleFloor(HloInstruction* floor, HloInstruction* operand) override {
+    TF_ASSIGN_OR_RETURN(parent_->evaluated_[floor],
+                        ElementWiseUnaryOp(floor, [](ReturnT elem_operand) {
+                          return std::floor(elem_operand);
+                        }));
+    return Status::OK();
+  };
+
+  Status HandleIsFinite(HloInstruction* is_finite,
+                        HloInstruction* operand) override {
+    TF_ASSIGN_OR_RETURN(parent_->evaluated_[is_finite],
+                        ElementWiseUnaryOp(is_finite, [](ReturnT elem_operand) {
+                          return std::isfinite(elem_operand);
+                        }));
+    return Status::OK();
+  };
+
+  Status HandleLog(HloInstruction* log, HloInstruction* operand) override {
+    TF_ASSIGN_OR_RETURN(parent_->evaluated_[log],
+                        ElementWiseUnaryOp(log, [](ReturnT elem_operand) {
+                          return std::log(elem_operand);
+                        }));
+    return Status::OK();
+  };
+
+  Status HandleLogicalNot(HloInstruction* logical_not,
+                          HloInstruction* operand) override {
+    TF_ASSIGN_OR_RETURN(
+        parent_->evaluated_[logical_not],
+        ElementWiseUnaryOp(logical_not,
+                           [](ReturnT elem_operand) { return !elem_operand; }));
+    return Status::OK();
+  };
+
+  Status HandleNegate(HloInstruction* negate,
+                      HloInstruction* operand) override {
+    TF_ASSIGN_OR_RETURN(parent_->evaluated_[negate],
+                        ElementWiseUnaryOp(negate, [](ReturnT elem_operand) {
+                          return -elem_operand;
+                        }));
+    return Status::OK();
+  };
+
+  Status HandleSign(HloInstruction* sign, HloInstruction* operand) override {
+    TF_ASSIGN_OR_RETURN(parent_->evaluated_[sign],
+                        ElementWiseUnaryOp(sign, [](ReturnT elem_operand) {
+                          return (ReturnT(0) < elem_operand) -
+                                 (elem_operand < ReturnT(0));
+                        }));
+    return Status::OK();
+  };
+
+  Status HandleTanh(HloInstruction* tanh, HloInstruction* operand) override {
+    TF_ASSIGN_OR_RETURN(parent_->evaluated_[tanh],
+                        ElementWiseUnaryOp(tanh, [](ReturnT elem_operand) {
+                          return std::tanh(elem_operand);
+                        }));
+    return Status::OK();
+  };
+
+  Status HandleMultiply(HloInstruction* multiply, HloInstruction* lhs,
+                        HloInstruction* rhs) override {
+    TF_ASSIGN_OR_RETURN(
+        parent_->evaluated_[multiply],
+        ElementWiseBinaryOp(multiply, [](ReturnT lhs_elem, ReturnT rhs_elem) {
+          return lhs_elem * rhs_elem;
+        }));
+    return Status::OK();
+  };
+
+  Status HandleSubtract(HloInstruction* subtract, HloInstruction* lhs,
+                        HloInstruction* rhs) override {
+    TF_ASSIGN_OR_RETURN(
+        parent_->evaluated_[subtract],
+        ElementWiseBinaryOp(subtract, [](ReturnT lhs_elem, ReturnT rhs_elem) {
+          return lhs_elem - rhs_elem;
+        }));
+    return Status::OK();
+  };
+
+  Status HandleAdd(HloInstruction* add, HloInstruction* lhs,
+                   HloInstruction* rhs) override {
+    TF_ASSIGN_OR_RETURN(
+        parent_->evaluated_[add],
+        ElementWiseBinaryOp(add, [](ReturnT lhs_elem, ReturnT rhs_elem) {
+          return lhs_elem + rhs_elem;
+        }));
+    return Status::OK();
+  };
+
+  Status HandleDivide(HloInstruction* divide, HloInstruction* lhs,
+                      HloInstruction* rhs) override {
+    TF_ASSIGN_OR_RETURN(
+        parent_->evaluated_[divide],
+        ElementWiseBinaryOp(divide, [](ReturnT lhs_elem, ReturnT rhs_elem) {
+          return lhs_elem / rhs_elem;
+        }));
+    return Status::OK();
+  };
+
+  Status HandleCompare(HloInstruction* compare, HloOpcode opcode,
+                       HloInstruction* lhs, HloInstruction* rhs) override {
+    std::function<bool(ReturnT, ReturnT)> compare_op;
+    switch (opcode) {
+      case HloOpcode::kEq:
+        compare_op = [](ReturnT lhs_el, ReturnT rhs_el) {
+          return lhs_el == rhs_el;
+        };
+        break;
+      case HloOpcode::kNe:
+        compare_op = [](ReturnT lhs_el, ReturnT rhs_el) {
+          return lhs_el != rhs_el;
+        };
+        break;
+      case HloOpcode::kGe:
+        compare_op = [](ReturnT lhs_el, ReturnT rhs_el) {
+          return lhs_el >= rhs_el;
+        };
+        break;
+      case HloOpcode::kGt:
+        compare_op = [](ReturnT lhs_el, ReturnT rhs_el) {
+          return lhs_el > rhs_el;
+        };
+        break;
+      case HloOpcode::kLe:
+        compare_op = [](ReturnT lhs_el, ReturnT rhs_el) {
+          return lhs_el <= rhs_el;
+        };
+        break;
+      case HloOpcode::kLt:
+        compare_op = [](ReturnT lhs_el, ReturnT rhs_el) {
+          return lhs_el < rhs_el;
+        };
+        break;
+      default:
+        LOG(FATAL) << "unhandled HLO opcode for conversion to Comparison: "
+                   << HloOpcodeString(opcode);
+    }
+
+    // TODO(b/35950897, b/27796129): add DCHECK back once implicit broadcast is
+    // removed.
+    if (!(ShapeUtil::SameDimensions(compare->shape(), rhs->shape()) &&
+          ShapeUtil::SameDimensions(lhs->shape(), rhs->shape()))) {
+      return Unimplemented(
+          "Compare operation with mismatched dimensions, likely due to "
+          "broadcasting is unsupported.");
+    }
+
+    const Literal& lhs_literal = parent_->GetEvaluatedLiteralFor(lhs);
+    const Literal& rhs_literal = parent_->GetEvaluatedLiteralFor(rhs);
+
+    auto result = LiteralUtil::CreateFromShape(compare->shape());
+    std::vector<int64> multi_index(ShapeUtil::Rank(result->shape()), 0);
+    do {
+      LiteralUtil::Set<bool>(
+          result.get(), multi_index,
+          compare_op(LiteralUtil::Get<ReturnT>(lhs_literal, multi_index),
+                     LiteralUtil::Get<ReturnT>(rhs_literal, multi_index)));
+    } while (IndexUtil::BumpIndices(result->shape(), &multi_index));
+
+    parent_->evaluated_[compare] = std::move(result);
+
+    return Status::OK();
+  };
+
+  Status HandleMaximum(HloInstruction* maximum, HloInstruction* lhs,
+                       HloInstruction* rhs) override {
+    TF_ASSIGN_OR_RETURN(
+        parent_->evaluated_[maximum],
+        ElementWiseBinaryOp(maximum, [](ReturnT lhs, ReturnT rhs) {
+          return std::max(lhs, rhs);
+        }));
+    return Status::OK();
+  };
+
+  Status HandleMinimum(HloInstruction* minimum, HloInstruction* lhs,
+                       HloInstruction* rhs) override {
+    TF_ASSIGN_OR_RETURN(
+        parent_->evaluated_[minimum],
+        ElementWiseBinaryOp(minimum, [](ReturnT lhs_el, ReturnT rhs_el) {
+          return std::min(lhs_el, rhs_el);
+        }));
+    return Status::OK();
+  };
+
+  Status HandlePower(HloInstruction* power, HloInstruction* lhs,
+                     HloInstruction* rhs) override {
+    TF_ASSIGN_OR_RETURN(
+        parent_->evaluated_[power],
+        ElementWiseBinaryOp(power, [](ReturnT lhs_el, ReturnT rhs_el) {
+          return std::pow(lhs_el, rhs_el);
+        }));
+    return Status::OK();
+  };
+
+  Status HandleRemainder(HloInstruction* remainder, HloInstruction* lhs,
+                         HloInstruction* rhs) override {
+    TF_ASSIGN_OR_RETURN(
+        parent_->evaluated_[remainder],
+        ElementWiseBinaryOp(remainder, [](ReturnT lhs_el, ReturnT rhs_el) {
+          return std::remainder(lhs_el, rhs_el);
+        }));
+    return Status::OK();
+  };
+
+  Status HandleLogicalAnd(HloInstruction* logical_and, HloInstruction* lhs,
+                          HloInstruction* rhs) override {
+    TF_ASSIGN_OR_RETURN(
+        parent_->evaluated_[logical_and],
+        ElementWiseBinaryOp(logical_and, [](ReturnT lhs_el, ReturnT rhs_el) {
+          return lhs_el && rhs_el;
+        }));
+    return Status::OK();
+  };
+
+  Status HandleLogicalOr(HloInstruction* logical_or, HloInstruction* lhs,
+                         HloInstruction* rhs) override {
+    TF_ASSIGN_OR_RETURN(
+        parent_->evaluated_[logical_or],
+        ElementWiseBinaryOp(logical_or, [](ReturnT lhs_el, ReturnT rhs_el) {
+          return lhs_el || rhs_el;
+        }));
+    return Status::OK();
+  };
+
+  Status HandleClamp(HloInstruction* clamp, HloInstruction* min,
+                     HloInstruction* arg, HloInstruction* max) override {
+    std::function<ReturnT(ReturnT, ReturnT, ReturnT)> clamp_op =
+        [](ReturnT low, ReturnT high, ReturnT value) {
+          return std::max(low, std::min(value, high));
+        };
+    TF_ASSIGN_OR_RETURN(parent_->evaluated_[clamp],
+                        ElementWiseTernaryOp(clamp, std::move(clamp_op)));
+    return Status::OK();
+  };
+
+  Status HandleSelect(HloInstruction* select, HloInstruction* pred,
+                      HloInstruction* on_true,
+                      HloInstruction* on_false) override {
+    CHECK(!ShapeUtil::IsTuple(select->shape()));
+    std::function<ReturnT(bool, ReturnT, ReturnT)> select_op =
+        [](bool pred, ReturnT on_true, ReturnT on_false) {
+          if (pred) {
+            return on_true;
+          }
+          return on_false;
+        };
+    TF_ASSIGN_OR_RETURN(parent_->evaluated_[select],
+                        ElementWiseTernaryOp(select, std::move(select_op)));
+    return Status::OK();
+  };
+
+  Status Preprocess(HloInstruction* hlo) override {
+    VLOG(2) << hlo->ToString();
+    return Status::OK();
+  };
+
+ private:
+  StatusOr<std::unique_ptr<Literal>> ElementWiseUnaryOp(
+      HloInstruction* instruction,
+      const std::function<ReturnT(ReturnT)>& unary_op) {
+    const auto shape = instruction->shape();
+    const auto* operand = instruction->operand(0);
+
+    // TODO(b/35950897, b/27796129): add DCHECK back once implicit broadcast is
+    // removed.
+    if (!ShapeUtil::SameDimensions(shape, operand->shape())) {
+      return Unimplemented(
+          "Implicit broadcasting is currently unsupported in HLO evaluator "
+          "Shape Mismatch: %s vs %s",
+          ShapeUtil::HumanString(shape).c_str(),
+          ShapeUtil::HumanString(operand->shape()).c_str());
+    }
+
+    const Literal& operand_literal = parent_->GetEvaluatedLiteralFor(operand);
+
+    auto result = LiteralUtil::CreateFromShape(shape);
+
+    std::vector<int64> multi_index(ShapeUtil::Rank(result->shape()), 0);
+    do {
+      LiteralUtil::Set<ReturnT>(
+          result.get(), multi_index,
+          unary_op(LiteralUtil::Get<ReturnT>(operand_literal, multi_index)));
+    } while (IndexUtil::BumpIndices(result->shape(), &multi_index));
+
+    return std::move(result);
+  };
+
+  StatusOr<std::unique_ptr<Literal>> ElementWiseBinaryOp(
+      HloInstruction* instruction,
+      const std::function<ReturnT(ReturnT, ReturnT)>& binary_op) {
+    const auto shape = instruction->shape();
+    const auto* lhs = instruction->operand(0);
+    const auto* rhs = instruction->operand(1);
+
+    // TODO(b/35950897, b/27796129): add DCHECK back once implicit broadcast is
+    // removed.
+    if (!(ShapeUtil::SameDimensions(shape, rhs->shape()) &&
+          ShapeUtil::SameDimensions(lhs->shape(), rhs->shape()))) {
+      return Unimplemented(
+          "Implicit broadcasting is currently unsupported in HLO evaluator "
+          "Shape Mismatch: %s vs %s vs %s: ",
+          ShapeUtil::HumanString(shape).c_str(),
+          ShapeUtil::HumanString(lhs->shape()).c_str(),
+          ShapeUtil::HumanString(rhs->shape()).c_str());
+    }
+
+    const Literal& lhs_literal = parent_->GetEvaluatedLiteralFor(lhs);
+    const Literal& rhs_literal = parent_->GetEvaluatedLiteralFor(rhs);
+
+    auto result = LiteralUtil::CreateFromShape(shape);
+    std::vector<int64> multi_index(ShapeUtil::Rank(result->shape()), 0);
+    do {
+      LiteralUtil::Set<ReturnT>(
+          result.get(), multi_index,
+          binary_op(LiteralUtil::Get<ReturnT>(lhs_literal, multi_index),
+                    LiteralUtil::Get<ReturnT>(rhs_literal, multi_index)));
+    } while (IndexUtil::BumpIndices(result->shape(), &multi_index));
+
+    return std::move(result);
+  };
+
+  template <typename LhsType, typename RhsType, typename EhsType>
+  StatusOr<std::unique_ptr<Literal>> ElementWiseTernaryOp(
+      HloInstruction* instruction,
+      const std::function<ReturnT(LhsType, RhsType, EhsType)>& ternary_op) {
+    const auto shape = instruction->shape();
+    const auto* lhs = instruction->operand(0);
+    const auto* rhs = instruction->operand(1);
+    const auto* ehs = instruction->operand(2);
+
+    // TODO(b/35950897, b/27796129): add DCHECK back once implicit broadcast is
+    // removed.
+    if (!(ShapeUtil::SameDimensions(shape, lhs->shape()) &&
+          ShapeUtil::SameDimensions(lhs->shape(), rhs->shape()) &&
+          ShapeUtil::SameDimensions(rhs->shape(), ehs->shape()))) {
+      return Unimplemented(
+          "Implicit broadcasting is currently unsupported in HLO evaluator "
+          "Shape Mismatch: %s vs %s vs %s vs %s: ",
+          ShapeUtil::HumanString(shape).c_str(),
+          ShapeUtil::HumanString(lhs->shape()).c_str(),
+          ShapeUtil::HumanString(rhs->shape()).c_str(),
+          ShapeUtil::HumanString(ehs->shape()).c_str());
+    }
+
+    const Literal& lhs_literal = parent_->GetEvaluatedLiteralFor(lhs);
+    const Literal& rhs_literal = parent_->GetEvaluatedLiteralFor(rhs);
+    const Literal& ehs_literal = parent_->GetEvaluatedLiteralFor(ehs);
+
+    auto result = LiteralUtil::CreateFromShape(shape);
+    std::vector<int64> multi_index(ShapeUtil::Rank(result->shape()), 0);
+    do {
+      LiteralUtil::Set<ReturnT>(
+          result.get(), multi_index,
+          ternary_op(LiteralUtil::Get<LhsType>(lhs_literal, multi_index),
+                     LiteralUtil::Get<RhsType>(rhs_literal, multi_index),
+                     LiteralUtil::Get<EhsType>(ehs_literal, multi_index)));
+    } while (IndexUtil::BumpIndices(result->shape(), &multi_index));
+
+    return std::move(result);
+  };
+
+  HloEvaluator* parent_;
+};
+
+HloEvaluator::HloEvaluator() {
+  typed_visitors_[PRED] = MakeUnique<TypedVisitor<bool>>(this);
+  typed_visitors_[U8] = MakeUnique<TypedVisitor<uint8>>(this);
+  typed_visitors_[U16] = MakeUnique<FunctionVisitor>([](HloInstruction*) {
+    return Unimplemented("unhandled primitive type: U16.");
+  });
+  typed_visitors_[U32] = MakeUnique<TypedVisitor<uint32>>(this);
+  typed_visitors_[U64] = MakeUnique<TypedVisitor<uint64>>(this);
+  typed_visitors_[S8] = MakeUnique<TypedVisitor<int8>>(this);
+  typed_visitors_[S16] = MakeUnique<FunctionVisitor>([](HloInstruction*) {
+    return Unimplemented("unhandled primitive type: S16.");
+  });
+  typed_visitors_[S32] = MakeUnique<TypedVisitor<int32>>(this);
+  typed_visitors_[S64] = MakeUnique<TypedVisitor<int64>>(this);
+  typed_visitors_[F16] = MakeUnique<FunctionVisitor>([](HloInstruction*) {
+    return Unimplemented("unhandled primitive type: F16.");
+  });
+  typed_visitors_[F32] = MakeUnique<TypedVisitor<float>>(this);
+  typed_visitors_[F64] = MakeUnique<TypedVisitor<double>>(this);
 }
 
-}  // namespace
+StatusOr<std::unique_ptr<Literal>> HloEvaluator::Evaluate(
+    HloComputation* computation,
+    tensorflow::gtl::ArraySlice<const Literal*> args) {
+  arg_literals_ = args;
+  evaluated_.clear();
 
-/* static */ StatusOr<std::unique_ptr<Literal>>
-HloEvaluator::EvaluateOpForLiteral(HloInstruction* instruction) {
-  DCHECK(hlo_query::AllOperandsAreConstants(*instruction));
+  TF_RETURN_IF_ERROR(computation->Accept(this));
+  return std::move(FindOrDie(evaluated_, computation->root_instruction()));
+}
 
+StatusOr<std::unique_ptr<Literal>> HloEvaluator::Evaluate(
+    HloInstruction* instruction,
+    tensorflow::gtl::ArraySlice<const Literal*> operands) {
+  DCHECK(hlo_query::AllOperandsAreParametersOrConstants(*instruction));
   Shape shape = instruction->shape();
   TF_CHECK_OK(ShapeUtil::ValidateShape(shape));
 
-  // REVIEW QUESTION: other than a few operations, do we need to handle the
-  // general case of operands being of different types in the context of the
-  // evaluator?
-
-  switch (shape.element_type()) {
-    case PRED:
-      return EvaluateOpForLiteralInternal<bool>(instruction);
-    case U8:
-      return EvaluateOpForLiteralInternal<uint8>(instruction);
-    case U16:
-      LOG(FATAL) << "U16/uint16 is unimplemented.";
-    case U32:
-      return EvaluateOpForLiteralInternal<uint32>(instruction);
-    case U64:
-      return EvaluateOpForLiteralInternal<uint64>(instruction);
-    case S8:
-      return EvaluateOpForLiteralInternal<int8>(instruction);
-    case S16:
-      LOG(FATAL) << "S16/int16 is unimplemented.";
-    case S32:
-      return EvaluateOpForLiteralInternal<int32>(instruction);
-    case S64:
-      return EvaluateOpForLiteralInternal<int64>(instruction);
-    case F16:
-      LOG(FATAL) << "F16 is unimplemented.";
-    case F32:
-      return EvaluateOpForLiteralInternal<float>(instruction);
-    case F64:
-      return EvaluateOpForLiteralInternal<double>(instruction);
-    default:
-      return Unimplemented("unhandled primitive type: %s.",
-                           PrimitiveType_Name(shape.element_type()).c_str());
+  arg_literals_ = operands;
+  evaluated_.clear();
+
+  // Evaluate operands of Parameter type against the input literals which
+  // caches the evaluated literal results.
+  for (const auto operand : instruction->operands()) {
+    if (operand->opcode() == HloOpcode::kParameter) {
+      const Literal* input_literal = arg_literals_[operand->parameter_number()];
+      VLOG(2) << "Parameter operand evaluated to: "
+              << LiteralUtil::ToString(*input_literal);
+      TF_RET_CHECK(ShapeUtil::Equal(operand->shape(), input_literal->shape()));
+
+      evaluated_[operand] = MakeUnique<Literal>(*input_literal);
+    } else if (operand->opcode() == HloOpcode::kConstant) {
+      evaluated_[operand] = MakeUnique<Literal>(operand->literal());
+    }
   }
+
+  TF_RETURN_IF_ERROR(instruction->Visit(this));
+  return std::move(FindOrDie(evaluated_, instruction));
+}
+
+Status HloEvaluator::HandleParameter(HloInstruction* parameter) {
+  VLOG(2) << "HandleParameter: " << parameter->ToString();
+  const Literal* input_literal = arg_literals_[parameter->parameter_number()];
+  VLOG(2) << "Parameter evaluated to: "
+          << LiteralUtil::ToString(*input_literal);
+  DCHECK(ShapeUtil::Equal(parameter->shape(), input_literal->shape()));
+
+  evaluated_[parameter] = MakeUnique<Literal>(*input_literal);
+  return Status::OK();
+}
+
+Status HloEvaluator::HandleConstant(HloInstruction* constant,
+                                    const Literal& literal) {
+  VLOG(2) << "HandleConstant: " << constant->ToString();
+  DCHECK(ShapeUtil::Equal(constant->shape(), literal.shape()));
+
+  evaluated_[constant] = MakeUnique<Literal>(literal);
+  return Status::OK();
 }
 
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/hlo_evaluator.h b/tensorflow/compiler/xla/service/hlo_evaluator.h
index c6ec650d674117f8bcbc9517a76b16c5940981d2..50cb32eb85c04d8b3abe4cd0b46a4f8c10e9c568 100644
--- a/tensorflow/compiler/xla/service/hlo_evaluator.h
+++ b/tensorflow/compiler/xla/service/hlo_evaluator.h
@@ -18,22 +18,105 @@ limitations under the License.
 
 #include <memory>
 
+#include "tensorflow/compiler/xla/service/dfs_hlo_visitor_with_default.h"
+#include "tensorflow/compiler/xla/service/hlo_computation.h"
 #include "tensorflow/compiler/xla/service/hlo_instruction.h"
 #include "tensorflow/compiler/xla/statusor.h"
+#include "tensorflow/compiler/xla/util.h"
 #include "tensorflow/compiler/xla/xla_data.pb.h"
+#include "tensorflow/core/lib/gtl/array_slice.h"
+#include "tensorflow/core/lib/gtl/flatmap.h"
+#include "tensorflow/core/platform/macros.h"
 
 namespace xla {
 
-// Responsible for evaluating a HLO instruction with constant operands.
-class HloEvaluator {
+// Responsible for evaluating HLO and obtain literal as the evaluation results.
+//
+// This class is not thread-safe.
+class HloEvaluator : public DfsHloVisitorWithDefault {
  public:
-  // Evaluates a single HLO instruction for constants and return the result as a
-  // Literal.
-  // Precondition: all operands of the instruction are constants, instruction is
-  // valid with corresponding number of operands for the given operator.
+  HloEvaluator();
+  // Evaluates a HLO computation and an array of pointers to literals.
+  // Return the evaluated result as literal if successful.
+  // Precondition: argument literals are corresponds to the input computation's
+  // parameters in their post-ordering. For e.g., consider the following graph:
+  //
+  //                *
+  //            /       \
+  //            +     Parameter1
+  //        /      \
+  //       /        \
+  //    Parameter0  Constant
+  //
+  // The input literals array will have its first literal map to Parameter0 and
+  // the second map to Parameter1.
+  StatusOr<std::unique_ptr<Literal>> Evaluate(
+      HloComputation* computation,
+      tensorflow::gtl::ArraySlice<const Literal*> arg_literals);
+
+  // Evaluates a single HLO instruction and an array of pointers to literals.
+  // Return the evaluated result as literal if successful.
+  // Precondition:
+  // 1. argument literals are corresponds to the input instruction's
+  // parameters in their post-orderring.
+  // 2. the instruction's operands must be of either Parameter or Constant type.
   // TODO(b/35950897): implement more ops other than element-wise ops.
-  static StatusOr<std::unique_ptr<Literal>> EvaluateOpForLiteral(
-      HloInstruction* instruction);
+  StatusOr<std::unique_ptr<Literal>> Evaluate(
+      HloInstruction* instruction,
+      tensorflow::gtl::ArraySlice<const Literal*> arg_literals);
+
+ protected:
+  // Templated DfsHloVisitor. Typically ReturnT here indicates the resulting
+  // literal type of each evaluated Handle* method of a TypedVisitor. One
+  // exception to this is HandleCompare, where the resulting literal type is
+  // always boolean.
+  // Note the forward declaration here is necessary to enable TypedVisitor to
+  // access parent members.
+  template <typename ReturnT>
+  class TypedVisitor;
+
+  // Wraps around instruction handling to infer types before dispatching to
+  // the corresponding typed Visitor.
+  Status DefaultAction(HloInstruction* hlo) override {
+    return hlo->Visit(typed_visitors_.at(hlo->shape().element_type()).get());
+  }
+
+  Status HandleParameter(HloInstruction* parameter) override;
+
+  Status HandleConstant(HloInstruction* constant,
+                        const Literal& literal) override;
+
+ private:
+  // Returns the already-evaluated literal result for the instruction.
+  // Crash with log if the given instruction has not been evaluated previously.
+  const Literal& GetEvaluatedLiteralFor(const HloInstruction* hlo) {
+    auto it = evaluated_.find(hlo);
+    CHECK(it != evaluated_.end())
+        << "could not find evaluated value for: " << hlo->ToString();
+    return *(it->second);
+  }
+
+  // Map from a primitive type to its associated (templated) DfsHloVisitor.
+  // Note: the hash function here is only needed because current gcc std::hash
+  // does not specialize for enum types. This should however be fixed in the
+  // future: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=60970#c5
+  tensorflow::gtl::FlatMap<PrimitiveType, std::unique_ptr<DfsHloVisitor>,
+                           std::hash<int>>
+      typed_visitors_;
+
+  // Tracks the HLO instruciton and its evaluated literal result.
+  // TODO(b/35950897): have better memory management here to free instructions
+  // that are no longer a parent for any other subsequent instruction in
+  // post-orderring.
+  tensorflow::gtl::FlatMap<const HloInstruction*, std::unique_ptr<Literal>>
+      evaluated_;
+
+  // Stores input literals, assuming they are in post-order. Literals are not
+  // owned by this class, and they must outlive the lifetime of the instance of
+  // this class.
+  tensorflow::gtl::ArraySlice<const Literal*> arg_literals_;
+
+  TF_DISALLOW_COPY_AND_ASSIGN(HloEvaluator);
 };
 
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/hlo_evaluator_test.cc b/tensorflow/compiler/xla/service/hlo_evaluator_test.cc
index 585fe65def3fef4f271b5cfbbb500d3f7a0eba59..443e5ad4f4290ff10b867887ac5ed359a0c8f73a 100644
--- a/tensorflow/compiler/xla/service/hlo_evaluator_test.cc
+++ b/tensorflow/compiler/xla/service/hlo_evaluator_test.cc
@@ -14,10 +14,13 @@ limitations under the License.
 ==============================================================================*/
 #include "tensorflow/compiler/xla/service/hlo_evaluator.h"
 
+#include <memory>
 #include <string>
 #include <utility>
+#include <vector>
 
 #include "tensorflow/compiler/xla/literal_util.h"
+#include "tensorflow/compiler/xla/service/hlo_computation.h"
 #include "tensorflow/compiler/xla/service/hlo_instruction.h"
 #include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/statusor.h"
@@ -29,9 +32,16 @@ limitations under the License.
 namespace xla {
 namespace {
 
+class HloEvaluatorTest : public ::testing::Test {
+ protected:
+  HloEvaluatorTest() { evaluator_ = MakeUnique<HloEvaluator>(); }
+
+  std::unique_ptr<HloEvaluator> evaluator_;
+};
+
 // Verifies that HloEvaluator evaluates a HLO instruction that performs clamp
 // with 3 operands.
-TEST(HloEvaluatorTest, DoesClamp) {
+TEST_F(HloEvaluatorTest, DoesClamp) {
   auto low = LiteralUtil::CreateR2<float>({{0.f, 2.f}, {2.f, 4.f}});
   auto high = LiteralUtil::CreateR2<float>({{2.f, 4.f}, {4.f, 4.f}});
   auto value = LiteralUtil::CreateR2<float>({{0.f, 5.f}, {0.f, 4.f}});
@@ -44,7 +54,7 @@ TEST(HloEvaluatorTest, DoesClamp) {
       shape, HloOpcode::kClamp, c1.get(), c2.get(), c3.get());
 
   std::unique_ptr<Literal> result =
-      HloEvaluator::EvaluateOpForLiteral(instruction.get()).ConsumeValueOrDie();
+      evaluator_->Evaluate(instruction.get(), {}).ConsumeValueOrDie();
 
   auto expected = LiteralUtil::CreateR2<float>({{0, 4}, {2, 4}});
 
@@ -53,7 +63,7 @@ TEST(HloEvaluatorTest, DoesClamp) {
 
 // Verifies that HloEvaluator evaluates a HLO instruction that performs select
 // with 3 operands.
-TEST(HloEvaluatorTest, DoesSelect) {
+TEST_F(HloEvaluatorTest, DoesSelect) {
   auto pred = LiteralUtil::CreateR2<bool>({{true, false}, {false, true}});
   auto on_true = LiteralUtil::CreateR2<float>({{2.f, 4.f}, {4.f, 4.f}});
   auto on_false = LiteralUtil::CreateR2<float>({{0.f, 5.f}, {0.f, 4.f}});
@@ -66,7 +76,7 @@ TEST(HloEvaluatorTest, DoesSelect) {
       shape, HloOpcode::kSelect, c1.get(), c2.get(), c3.get());
 
   std::unique_ptr<Literal> result =
-      HloEvaluator::EvaluateOpForLiteral(instruction.get()).ConsumeValueOrDie();
+      evaluator_->Evaluate(instruction.get(), {}).ConsumeValueOrDie();
 
   auto expected = LiteralUtil::CreateR2<float>({{2, 5}, {0, 4}});
 
@@ -75,7 +85,7 @@ TEST(HloEvaluatorTest, DoesSelect) {
 
 // Verifies that HloEvaluator evaluates a HLO instruction that performs
 // element-wise addition with 2 operands.
-TEST(HloEvaluatorTest, DoesAdd) {
+TEST_F(HloEvaluatorTest, DoesAdd) {
   auto lhs = LiteralUtil::CreateR2<int64>({{1, 0}, {-100, 4}});
   auto rhs = LiteralUtil::CreateR2<int64>({{2, 4}, {4, 4}});
 
@@ -86,7 +96,7 @@ TEST(HloEvaluatorTest, DoesAdd) {
       HloInstruction::CreateBinary(shape, HloOpcode::kAdd, c1.get(), c2.get());
 
   std::unique_ptr<Literal> result =
-      HloEvaluator::EvaluateOpForLiteral(instruction.get()).ConsumeValueOrDie();
+      evaluator_->Evaluate(instruction.get(), {}).ConsumeValueOrDie();
 
   auto expected = LiteralUtil::CreateR2<int64>({{3, 4}, {-96, 8}});
 
@@ -95,7 +105,7 @@ TEST(HloEvaluatorTest, DoesAdd) {
 
 // Verifies that HloEvaluator evaluates a HLO instruction that performs
 // element-wise divide with 2 operands.
-TEST(HloEvaluatorTest, DoesDivide) {
+TEST_F(HloEvaluatorTest, DoesDivide) {
   auto lhs_s64 = LiteralUtil::CreateR2<int64>({{1, 0}, {-100, 4}});
   auto rhs_s64 = LiteralUtil::CreateR2<int64>({{2, 4}, {4, 4}});
 
@@ -106,7 +116,7 @@ TEST(HloEvaluatorTest, DoesDivide) {
                                                   c1_s64.get(), c2_s64.get());
 
   std::unique_ptr<Literal> result =
-      HloEvaluator::EvaluateOpForLiteral(instruction.get()).ConsumeValueOrDie();
+      evaluator_->Evaluate(instruction.get(), {}).ConsumeValueOrDie();
 
   auto expected = LiteralUtil::CreateR2<int64>({{0, 0}, {-25, 1}});
 
@@ -121,8 +131,7 @@ TEST(HloEvaluatorTest, DoesDivide) {
   instruction = HloInstruction::CreateBinary(shape_f64, HloOpcode::kDivide,
                                              c1_f64.get(), c2_f64.get());
 
-  result =
-      HloEvaluator::EvaluateOpForLiteral(instruction.get()).ConsumeValueOrDie();
+  result = evaluator_->Evaluate(instruction.get(), {}).ConsumeValueOrDie();
 
   expected =
       LiteralUtil::CreateR2<double>({{0.45454545454545453, 0}, {-25, 1}});
@@ -132,21 +141,51 @@ TEST(HloEvaluatorTest, DoesDivide) {
 
 // Verifies that HloEvaluator evaluates a HLO instruction that performs
 // element-wise abs op with 1 operand.
-TEST(HloEvaluatorTest, DoesAbs) {
+TEST_F(HloEvaluatorTest, DoesAbs) {
   auto operand = LiteralUtil::CreateR2<int64>({{1, -20}, {-100, 4}});
-
   Shape shape = ShapeUtil::MakeShape(S64, {2, 2});
   auto c1 = HloInstruction::CreateConstant(std::move(operand));
   auto instruction =
       HloInstruction::CreateUnary(shape, HloOpcode::kAbs, c1.get());
 
   std::unique_ptr<Literal> result =
-      HloEvaluator::EvaluateOpForLiteral(instruction.get()).ConsumeValueOrDie();
+      evaluator_->Evaluate(instruction.get(), {}).ConsumeValueOrDie();
 
   auto expected = LiteralUtil::CreateR2<int64>({{1, 20}, {100, 4}});
 
   EXPECT_TRUE(LiteralUtil::Equal(*result, *expected));
 }
 
+// Verifies that HloEvaluator evaluates a HLO Computation with non-parameter nor
+// constant operands.
+TEST_F(HloEvaluatorTest, DoesTraveseInstructions) {
+  HloComputation::Builder builder(
+      ::testing::UnitTest::GetInstance()->current_test_info()->name());
+
+  auto lhs = LiteralUtil::CreateR2<int64>({{1, 0}, {-100, 4}});
+  auto rhs = LiteralUtil::CreateR2<int64>({{2, 4}, {4, 4}});
+  auto rhs2 = LiteralUtil::CreateR2<int64>({{1, -20}, {-100, 4}});
+  std::vector<const Literal*> args = {lhs.get(), rhs.get(), rhs2.get()};
+
+  Shape shape = ShapeUtil::MakeShape(S64, {2, 2});
+
+  auto param_lhs = HloInstruction::CreateParameter(0, shape, "lhs");
+  auto param_rhs = HloInstruction::CreateParameter(1, shape, "rhs");
+  auto lhs_instruction = HloInstruction::CreateBinary(
+      shape, HloOpcode::kAdd, param_lhs.get(), param_rhs.get());
+
+  auto param_rhs2 = HloInstruction::CreateParameter(2, shape, "rhs2");
+  auto root_instruction = HloInstruction::CreateBinary(
+      shape, HloOpcode::kAdd, lhs_instruction.get(), param_rhs2.get());
+
+  builder.AddInstruction(std::move(root_instruction));
+  std::unique_ptr<Literal> result =
+      evaluator_->Evaluate(builder.Build().get(), args).ConsumeValueOrDie();
+
+  auto expected = LiteralUtil::CreateR2<int64>({{4, -16}, {-196, 12}});
+
+  EXPECT_TRUE(LiteralUtil::Equal(*result, *expected));
+}
+
 }  // namespace
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/hlo_graph_dumper.cc b/tensorflow/compiler/xla/service/hlo_graph_dumper.cc
index 5ce52af5b4616050988d2dba653c23d8acedf0d8..eb2e5dfb37f33fd138e20ee930a2242cb1db89ea 100644
--- a/tensorflow/compiler/xla/service/hlo_graph_dumper.cc
+++ b/tensorflow/compiler/xla/service/hlo_graph_dumper.cc
@@ -48,6 +48,73 @@ namespace xla {
 namespace hlo_graph_dumper {
 namespace {
 
+// Node color schemes, used by NodeColorAttributes.
+enum ColorScheme {
+  kBlue,
+  kBrown,
+  kDarkBlue,
+  kDarkGreen,
+  kDarkRed,
+  kGray,
+  kGreen,
+  kOrange,
+  kPurple,
+  kRed,
+  kWhite,
+  kYellow,
+};
+
+// Given a ColorScheme, returns an attribute string for a node of that color.
+// Sets the node's fill, stroke, and text colors.
+//
+// Colors are from https://material.io/color.
+string NodeColorAttributes(ColorScheme color) {
+  using std::make_tuple;
+
+  const char *fill_color, *stroke_color, *font_color;
+  std::tie(fill_color, stroke_color, font_color) =
+      [color]() -> std::tuple<const char*, const char*, const char*> {
+    switch (color) {
+      case kBlue:
+        return make_tuple("#bbdefb", "#8aacc8", "black");
+      case kBrown:
+        return make_tuple("#bcaaa4", "#8c7b75", "black");
+      case kDarkBlue:
+        return make_tuple("#1565c0", "#003c8f", "white");
+      case kDarkGreen:
+        return make_tuple("#2e7d32", "#005005", "white");
+      case kDarkRed:
+        return make_tuple("#b71c1c", "#7f0000", "white");
+      case kGray:
+        return make_tuple("#cfd8dc", "#9ea7aa", "black");
+      case kGreen:
+        return make_tuple("#c8e6c9", "#97b498", "black");
+      case kOrange:
+        return make_tuple("#ffe0b2", "#cbae82", "black");
+      case kPurple:
+        return make_tuple("#e1bee7", "#af8eb5", "black");
+      case kRed:
+        return make_tuple("#ffcdd2", "#cb9ca1", "black");
+      case kWhite:
+        return make_tuple("white", "black", "black");
+      case kYellow:
+        return make_tuple("#fff9c4", "#cbc693", "black");
+    }
+  }();
+
+  return Printf(
+      "style=filled, fontcolor=\"%s\", color=\"%s\", fillcolor=\"%s\"",
+      font_color, stroke_color, fill_color);
+}
+
+// Replaces <> with &lt;&gt;, so that this string is safe(er) for use in a
+// graphviz HTML-like string.
+string HtmlLikeStringSanitize(tensorflow::StringPiece s) {
+  return tensorflow::str_util::StringReplace(
+      tensorflow::str_util::StringReplace(s, "<", "&lt;", /*replace_all=*/true),
+      ">", "&gt;", /*replace_all=*/true);
+}
+
 // Returns the dot graph identifier for the given instruction.
 string InstructionId(const HloInstruction* instruction) {
   return Printf("%lld", reinterpret_cast<uint64>(instruction));
@@ -102,30 +169,36 @@ string InstructionSequenceGraph(
       param_ports.push_back(
           Printf("<%s> %s", InstructionId(param).c_str(), label.c_str()));
     }
-    StrAppend(&graph_body, param_node_name,
-              " [shape=record,style=filled,fillcolor=\"lightblue1\",",
-              "label=\"{parameters | {", Join(param_ports, "|"), "}}\"];\n");
+    // (If we wanted the word "parameters" to be bold like the other op names,
+    // we'd have to make this into an HTML-like table.  It is possible but
+    // complicated; see http://www.graphviz.org/doc/info/shapes.html#html.)
+    StrAppend(&graph_body, param_node_name, " [shape=record ",
+              NodeColorAttributes(kOrange), "label=\"{parameters | {",
+              Join(param_ports, "|"), "}}\"];\n");
   }
 
   for (auto& instruction : instructions) {
-    string color = "peachpuff";
-    string shape = "ellipse";
-    string name = instruction->ExtendedOpcodeStr();
+    ColorScheme color = kYellow;
+    string shape = "box";
+    string name =
+        StrCat("<b>", HtmlLikeStringSanitize(instruction->ExtendedOpcodeStr()),
+               "</b> ", HtmlLikeStringSanitize(instruction->name()));
     if (HloOpcode::kConvolution == instruction->opcode()) {
-      name += ":\\n" + instruction->ConvolutionDimensionNumbersToString() +
-              "\\n" + window_util::ToString(instruction->window());
+      StrAppend(
+          &name, "<br/>",
+          HtmlLikeStringSanitize(
+              instruction->ConvolutionDimensionNumbersToString()),
+          "<br/>",
+          HtmlLikeStringSanitize(window_util::ToString(instruction->window())));
     }
 
-    name += "\\n" + instruction->name();
-    if (!instruction->metadata().op_type().empty()) {
-      StrAppend(&name, "\\n", instruction->metadata().op_type());
-    }
     if (!instruction->metadata().op_name().empty()) {
-      StrAppend(&name, "\\n", instruction->metadata().op_name());
+      StrAppend(&name, "<br/>",
+                HtmlLikeStringSanitize(instruction->metadata().op_name()));
     }
     if (!instruction->metadata().source_file().empty() &&
         instruction->metadata().source_line() != 0) {
-      StrAppend(&name, "\\n", instruction->metadata().source_file(), ":",
+      StrAppend(&name, "<br/>", instruction->metadata().source_file(), ":",
                 instruction->metadata().source_line());
     }
 
@@ -140,11 +213,8 @@ string InstructionSequenceGraph(
       case HloOpcode::kAdd:
       case HloOpcode::kCeil:
       case HloOpcode::kClamp:
-      case HloOpcode::kConcatenate:
       case HloOpcode::kConvert:
       case HloOpcode::kDivide:
-      case HloOpcode::kDynamicSlice:
-      case HloOpcode::kDynamicUpdateSlice:
       case HloOpcode::kEq:
       case HloOpcode::kExp:
       case HloOpcode::kFloor:
@@ -163,64 +233,49 @@ string InstructionSequenceGraph(
       case HloOpcode::kMultiply:
       case HloOpcode::kNe:
       case HloOpcode::kNegate:
-      case HloOpcode::kPad:
       case HloOpcode::kPower:
       case HloOpcode::kRemainder:
-      case HloOpcode::kReshape:
-      case HloOpcode::kReverse:
       case HloOpcode::kSelect:
       case HloOpcode::kSign:
       case HloOpcode::kSlice:
       case HloOpcode::kSort:
       case HloOpcode::kSubtract:
       case HloOpcode::kTanh:
-      case HloOpcode::kTuple:
-      case HloOpcode::kUpdate:
-        break;
-
-      case HloOpcode::kBroadcast:
-      case HloOpcode::kTranspose:
-        StrAppend(&name, "\\n", "dims={", Join(instruction->dimensions(), ","),
-                  "}");
-        break;
-      case HloOpcode::kGetTupleElement:
-        StrAppend(&name, "\\nindex=", instruction->tuple_index());
         break;
       case HloOpcode::kRng:
-        StrAppend(&name, "\\n",
+        StrAppend(&name, "<br/>",
                   RandomDistribution_Name(instruction->random_distribution()));
         break;
-      case HloOpcode::kConstant:
-        shape = "box";
-        color = "palegreen";
-        if (ShapeUtil::IsScalar(instruction->shape())) {
-          StrAppend(&name, "\\n", "value=", LiteralUtil::GetAsString(
-                                                instruction->literal(), {}));
-        }
+      case HloOpcode::kBroadcast:
+      case HloOpcode::kTranspose:
+        StrAppend(&name, "<br/>", "dims={",
+                  Join(instruction->dimensions(), ","), "}");
         break;
       case HloOpcode::kBitcast:
-      case HloOpcode::kCopy:
-        color = "white";
-        break;
-      case HloOpcode::kCall:
-        color = "tomato";
-        break;
-      case HloOpcode::kCustomCall:
-        color = "tomato4";
-        StrAppend(&name, "\\n",
-                  "custom_call_target=", instruction->custom_call_target());
+      case HloOpcode::kTuple:
+      case HloOpcode::kTrace:
+        color = kWhite;
         break;
-      case HloOpcode::kDot:
-        color = "slateblue";
+      case HloOpcode::kGetTupleElement:
+        color = kWhite;
+        StrAppend(&name, "<br/>index=", instruction->tuple_index());
         break;
-      case HloOpcode::kSend:
-        color = "purple";
+      case HloOpcode::kConcatenate:
+      case HloOpcode::kCopy:
+      case HloOpcode::kDynamicSlice:
+      case HloOpcode::kDynamicUpdateSlice:
+      case HloOpcode::kPad:
+      case HloOpcode::kReshape:
+      case HloOpcode::kReverse:
+      case HloOpcode::kUpdate:
+        color = kGreen;
         break;
-      case HloOpcode::kRecv:
-        color = "orange";
+      case HloOpcode::kConstant:
+        color = kBlue;
         break;
-      case HloOpcode::kMap:
-        color = "palevioletred";
+      case HloOpcode::kConvolution:
+      case HloOpcode::kDot:
+        color = kDarkBlue;
         break;
       case HloOpcode::kParameter:
         // A single record node is created for all the parameter nodes with a
@@ -229,38 +284,54 @@ string InstructionSequenceGraph(
         continue;
       case HloOpcode::kReduce:
         StrAppend(&name, " dims=", Join(instruction->dimensions(), ","));
-        color = "lightsalmon";
+        color = kPurple;
         break;
       case HloOpcode::kSelectAndScatter:
       case HloOpcode::kReduceWindow:
-        color = "lightsalmon";
-        break;
-      case HloOpcode::kTrace:
-        color = "white";
+        color = kPurple;
         break;
       case HloOpcode::kWhile:
-        color = "forestgreen";
+        shape = "ellipse";
+        color = kDarkGreen;
         break;
+      case HloOpcode::kMap:
       case HloOpcode::kFusion:
-        color = "gray";
-        break;
-      case HloOpcode::kConvolution:
-        color = "red";
-        break;
-      case HloOpcode::kCrossReplicaSum:
-        color = "turquoise";
+        color = kGray;
         break;
+      case HloOpcode::kSend:
+      case HloOpcode::kRecv:
       case HloOpcode::kInfeed:
       case HloOpcode::kOutfeed:
-        color = "blue";
+      case HloOpcode::kCrossReplicaSum:
+        color = kBrown;
+        break;
+      case HloOpcode::kCall:
+        color = kDarkGreen;
+        break;
+      case HloOpcode::kCustomCall:
+        color = kDarkGreen;
+        StrAppend(&name, "<br/>",
+                  "custom_call_target=", instruction->custom_call_target());
         break;
     }
 
     // Create instruction node with appropriate label, shape, and color.
+    // label is interpreted as an HTML-like string, so newlines must be
+    // delimited with <br/>, rather than \n.
     string label =
-        StrCat(name, "\\n", ShapeUtil::HumanString(instruction->shape()));
+        StrCat(name, "<br/>", ShapeUtil::HumanString(instruction->shape()));
+
+    if (instruction->opcode() == HloOpcode::kConstant &&
+        ShapeUtil::IsEffectiveScalar(instruction->shape())) {
+      auto elem_idx = IndexUtil::LinearIndexToMultidimensionalIndex(
+          instruction->shape(), /*linear_index=*/0);
+      StrAppend(&label, " = {",
+                LiteralUtil::GetAsString(instruction->literal(), elem_idx),
+                "}");
+    }
+
     if (show_addresses) {
-      Appendf(&label, "\\n[%p]", instruction.get());
+      Appendf(&label, "<br/>[%p]", instruction.get());
     }
     if (show_layouts && LayoutUtil::HasLayout(instruction->shape())) {
       string layout_string;
@@ -272,7 +343,7 @@ string InstructionSequenceGraph(
         layout_string =
             Join(instruction->shape().layout().minor_to_major(), ",");
       }
-      StrAppend(&label, "\\nlayout={", layout_string, "}");
+      StrAppend(&label, "<br/>layout={", layout_string, "}");
     }
     if (hlo_execution_profile != nullptr) {
       auto hlo_cycles_executed =
@@ -280,16 +351,16 @@ string InstructionSequenceGraph(
       auto total_cycles_executed =
           hlo_execution_profile->total_cycles_executed(*instruction->parent());
       if (hlo_cycles_executed > 0 && total_cycles_executed > 0) {
-        Appendf(&label, "\\n%% of cycles executed=%.2f",
+        Appendf(&label, "<br/>%% of cycles executed=%.2f",
                 (static_cast<double>(hlo_cycles_executed) /
                  static_cast<double>(total_cycles_executed)) *
                     100);
       }
     }
-    Appendf(&graph_body,
-            "%s [label=\"%s\", shape=%s, style=filled, fillcolor=%s];\n",
+
+    Appendf(&graph_body, "%s [label=<%s>, shape=%s, %s];\n",
             InstructionId(instruction.get()).c_str(), label.c_str(),
-            shape.c_str(), color.c_str());
+            shape.c_str(), NodeColorAttributes(color).c_str());
 
     // Create edges from the instruction's operands to the instruction.
     int64 operand_number = 0;
@@ -319,7 +390,7 @@ string InstructionSequenceGraph(
           StrCat("cluster_", InstructionId(instruction.get()));
       StrAppend(&graph_body, "subgraph ", cluster_name, " {\n");
       StrAppend(&graph_body,
-                "label=\"fused expression\";\nstyle=filled;\n"
+                "label=<<b>fused expression</b>>;\nstyle=\"rounded,filled\";\n"
                 "color=lightgrey;\n");
       StrAppend(&graph_body, InstructionSequenceGraph(
                                  instruction->fused_instructions(),
@@ -349,19 +420,39 @@ string InstructionSequenceGraph(
   return graph_body;
 }
 
+// DOT graphs accept a stylesheet as a URL.  So naturally, an inline stylesheet
+// is a data URI!
+//
+// We don't perform any escaping on this string, so be careful not to use double
+// quotes inside.
+static const char* dot_stylesheet = R"(
+data:text/css,
+@import url(https://fonts.googleapis.com/css?family=Roboto:400,700);
+svg text {
+  font-family: 'Roboto';
+  font-size: 12px;
+}
+)";
+
 string ComputationToDotGraph(const HloComputation& computation,
                              const string& label, bool show_addresses,
                              bool show_layouts,
                              const HloExecutionProfile* hlo_execution_profile) {
-  string graph_label = StrCat(label, "\\n", computation.name());
+  string graph_label = StrCat(label, "<br/>", computation.name());
   if (hlo_execution_profile != nullptr) {
     auto cycles = hlo_execution_profile->total_cycles_executed(computation);
-    Appendf(&graph_label, "\\ntotal cycles = %lld (%s)", cycles,
+    Appendf(&graph_label, "<br/>total cycles = %lld (%s)", cycles,
             tensorflow::strings::HumanReadableNum(cycles).c_str());
   }
-  string graph =
-      Printf("digraph G {\nrankdir=TB;\ncompound=true;\nlabel=\"%s\"\n",
-             graph_label.c_str());
+  string graph = Printf(
+      R"(digraph G {
+rankdir=TB;
+compound=true;
+label=<<b>%s</b>>;
+labelloc=t;
+stylesheet="%s"
+)",
+      graph_label.c_str(), dot_stylesheet);
 
   // Emit embedded computations as subgraph clusters.
   std::vector<string> intercomputation_edges;
@@ -369,7 +460,9 @@ string ComputationToDotGraph(const HloComputation& computation,
     string graph_body = InstructionSequenceGraph(
         embedded->instructions(), show_addresses, show_layouts,
         &intercomputation_edges, hlo_execution_profile);
-    Appendf(&graph, "subgraph cluster_%s {\nlabel=\"%s\";\n%s}\n",
+    Appendf(&graph,
+            "subgraph cluster_%s "
+            "{\nstyle=rounded;label=<<b>%s</b>>;labelloc=t;\n%s}\n",
             ComputationId(embedded).c_str(), embedded->name().c_str(),
             graph_body.c_str());
   }
diff --git a/tensorflow/compiler/xla/service/hlo_instruction.cc b/tensorflow/compiler/xla/service/hlo_instruction.cc
index c9722b942b957e8cb9788d221a79910e9f4c6539..10ab60cc8449a59ef3aefcc12f67e4738d63b900 100644
--- a/tensorflow/compiler/xla/service/hlo_instruction.cc
+++ b/tensorflow/compiler/xla/service/hlo_instruction.cc
@@ -213,10 +213,10 @@ HloInstruction::CreateGetTupleElement(const Shape& shape,
   auto instruction =
       WrapUnique(new HloInstruction(HloOpcode::kConvolution, shape));
   if (window_util::HasBaseDilation(window)) {
-    instruction->set_name(instruction->name() + "-base-dilated");
+    instruction->name_ = instruction->name() + "-base-dilated";
   }
   if (window_util::HasWindowDilation(window)) {
-    instruction->set_name(instruction->name() + "-window-dilated");
+    instruction->name_ = instruction->name() + "-window-dilated";
   }
   instruction->AppendOperand(lhs);
   instruction->AppendOperand(rhs);
@@ -410,7 +410,9 @@ HloInstruction::CreateSelectAndScatter(
 /* static */ std::unique_ptr<HloInstruction> HloInstruction::CreateReshape(
     const Shape& shape, HloInstruction* operand) {
   CHECK_EQ(ShapeUtil::ElementsIn(shape),
-           ShapeUtil::ElementsIn(operand->shape()));
+           ShapeUtil::ElementsIn(operand->shape()))
+      << "shape: " << ShapeUtil::HumanString(shape)
+      << " operand: " << ShapeUtil::HumanString(operand->shape());
   auto instruction = WrapUnique(new HloInstruction(HloOpcode::kReshape, shape));
   instruction->AppendOperand(operand);
   return instruction;
@@ -505,16 +507,8 @@ HloInstruction* HloInstruction::CloneAndFuseInternal(
   HloInstruction* clone = nullptr;
   if (fused_instructions_computation_ == nullptr) {
     // New fusion instruction.
-    string computation_name;
-    HloModule* module = GetModule();
-    if (module) {
-      computation_name = module->GetUniqueCompuationName(
-          instruction_to_fuse->name() + ".fusion");
-    } else {
-      computation_name = instruction_to_fuse->name() + ".fusion";
-    }
-    auto builder = HloComputation::Builder(computation_name, true);
-    builder.AddInstruction(instruction_to_fuse->Clone());
+    auto builder = HloComputation::Builder("fused_computation", true);
+    builder.AddInstruction(instruction_to_fuse->Clone(/*suffix=*/""));
     fused_instructions_computation_ = builder.Build();
     clone = fused_expression_root();
     clone->parent_fusion_instruction_ = this;
@@ -522,7 +516,7 @@ HloInstruction* HloInstruction::CloneAndFuseInternal(
     CHECK(fused_instructions_computation_ != nullptr &&
           fused_instructions_computation_->IsFusionComputation());
     clone = fused_instructions_computation_->AddInstruction(
-        instruction_to_fuse->Clone());
+        instruction_to_fuse->Clone(/*suffix=*/""));
     clone->parent_fusion_instruction_ = this;
     // instruction_to_fuse is necessarily an operand of the fusion instruction.
     // After fusion this will no longer be the case. Remove the operand from the
@@ -578,8 +572,13 @@ HloInstruction* HloInstruction::CloneAndFuseInternal(
       // instruction. Add it as an operand and add a corresponding fused
       // parameter instruction.
       int64 param_no = fused_parameters_.size();
-      std::unique_ptr<HloInstruction> param_instruction = CreateParameter(
-          param_no, operand->shape(), StrCat("fusion_param.", param_no));
+      // Name the parameter after the instruction it represents in the outer
+      // (non-fusion) computation. Strip the leading "%" from the operand name
+      // to avoid a double %%.
+      string param_name =
+          StrCat(operand->name().substr(1), ".param_", param_no);
+      std::unique_ptr<HloInstruction> param_instruction =
+          CreateParameter(param_no, operand->shape(), param_name);
 
       param_instruction->parent_fusion_instruction_ = this;
       fused_param = fused_instructions_computation_->AddParameter(
@@ -858,32 +857,36 @@ HloInstruction::~HloInstruction() {}
 std::unique_ptr<HloInstruction> HloInstruction::Clone(const string& suffix) {
   std::unique_ptr<HloInstruction> clone =
       CloneWithNewOperands(shape_, operands_);
-  // If an instruction is cloned multiple times avoid names like
-  // foo.suffix.suffix.suffix. Instead of repeating the suffix add a numeric
-  // suffix. Specifically, the clone of foo.suffix is named foo.suffix2, the
-  // clone of foo.suffix2 is named foo.suffix3 and so on.
-  const string dot_suffix = "." + suffix;
-  size_t index = name().rfind(dot_suffix);
-  if (index == string::npos) {
-    // Existing name does not include ".suffix".
-    clone->name_ = name() + dot_suffix;
+  if (suffix.empty()) {
+    clone->name_ = name();
   } else {
-    // Existing name includes ".suffix". Determine if substring after ".suffix"
-    // is numeric and should be replaced with an incremented number.
-    string after_suffix = name().substr(index + dot_suffix.size());
-    if (after_suffix.empty()) {
-      // Existing name ends in ".suffix". New name should end in ".suffix2".
-      clone->name_ = name() + "2";
+    // If an instruction is cloned multiple times avoid names like
+    // foo.suffix.suffix.suffix. Instead of repeating the suffix add a numeric
+    // suffix. Specifically, the clone of foo.suffix is named foo.suffix2, the
+    // clone of foo.suffix2 is named foo.suffix3 and so on.
+    const string dot_suffix = "." + suffix;
+    size_t index = name().rfind(dot_suffix);
+    if (index == string::npos) {
+      // Existing name does not include ".suffix".
+      clone->name_ = name() + dot_suffix;
     } else {
-      // If names ends with .suffix[0-9]+ then replace with a suffix with the
-      // numeric value incremented.
-      int64 numeric_suffix;
-      if (tensorflow::strings::safe_strto64(after_suffix, &numeric_suffix)) {
-        clone->name_ =
-            StrCat(name().substr(0, index), dot_suffix, numeric_suffix + 1);
+      // Existing name includes ".suffix". Determine if substring after
+      // ".suffix" is numeric and should be replaced with an incremented number.
+      string after_suffix = name().substr(index + dot_suffix.size());
+      if (after_suffix.empty()) {
+        // Existing name ends in ".suffix". New name should end in ".suffix2".
+        clone->name_ = name() + "2";
       } else {
-        // Substring after ".suffix" is non-numeric.
-        clone->name_ = name() + dot_suffix;
+        // If names ends with .suffix[0-9]+ then replace with a suffix with the
+        // numeric value incremented.
+        int64 numeric_suffix;
+        if (tensorflow::strings::safe_strto64(after_suffix, &numeric_suffix)) {
+          clone->name_ =
+              StrCat(name().substr(0, index), dot_suffix, numeric_suffix + 1);
+        } else {
+          // Substring after ".suffix" is non-numeric.
+          clone->name_ = name() + dot_suffix;
+        }
       }
     }
   }
@@ -1080,7 +1083,7 @@ bool HloInstruction::Identical(
   // general, there is no need to check shape because shape is inferred from the
   // shape of the operands.
   if (opcode() != other.opcode() ||
-      !ContainersEqual(operands(), other.operands(), eq_operands)) {
+      !ContainersEqual(operands(), other.operands(), std::move(eq_operands))) {
     return false;
   }
 
@@ -1427,7 +1430,8 @@ string HloInstruction::ExtendedOpcodeStr() const {
   return opc_name;
 }
 
-string HloInstruction::ToString(bool compact_operands) const {
+string HloInstruction::ToString(bool compact_operands,
+                                bool include_metadata) const {
   string operands;
   if (opcode() == HloOpcode::kConstant) {
     // For constants, show the actual value in place of an empty operand list.
@@ -1508,8 +1512,9 @@ string HloInstruction::ToString(bool compact_operands) const {
   if (opcode() == HloOpcode::kGetTupleElement) {
     StrAppend(&extra, ", index=", tuple_index());
   }
-  if (!metadata_.op_type().empty() || !metadata_.op_name().empty() ||
-      !metadata_.source_file().empty()) {
+  if (include_metadata &&
+      (!metadata_.op_type().empty() || !metadata_.op_name().empty() ||
+       !metadata_.source_file().empty())) {
     StrAppend(&extra, " # metadata=", metadata_.ShortDebugString());
   }
 
@@ -1565,7 +1570,9 @@ string HloInstruction::ToCategory() const {
           return "non-elementwise fusion";
         }
       case FusionKind::kInput:
-        return "reduce fusion";
+        return "input fusion";
+      case FusionKind::kOutput:
+        return "output fusion";
       case FusionKind::kTransposeDot:
         return "dot fusion";
       case FusionKind::kConvBackwardFilter:
@@ -1613,7 +1620,6 @@ bool HloInstruction::IsFusable() const {
 
   // Some kinds of instructions don't make sense to fuse.
   switch (opcode_) {
-    case HloOpcode::kFusion:
     case HloOpcode::kInfeed:
     case HloOpcode::kOutfeed:
     case HloOpcode::kParameter:
@@ -2181,6 +2187,8 @@ string ToString(HloInstruction::FusionKind kind) {
       return "kLoop";
     case HloInstruction::FusionKind::kInput:
       return "kInput";
+    case HloInstruction::FusionKind::kOutput:
+      return "kOutput";
     case HloInstruction::FusionKind::kTransposeDot:
       return "kTransposeDot";
     case HloInstruction::FusionKind::kConvBackwardFilter:
@@ -2256,4 +2264,9 @@ HloModule* HloInstruction::GetModule() const {
   }
   return nullptr;
 }
+
+void HloInstruction::UniquifyName(NameUniquer* name_uniquer) {
+  name_ = name_uniquer->GetUniqueName(name_);
+}
+
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/hlo_instruction.h b/tensorflow/compiler/xla/service/hlo_instruction.h
index 3a03f28975e67880871c1e9f7d1d140e4b328c16..d300d99adec5201b70b0fe4eb65ef5b84362b018 100644
--- a/tensorflow/compiler/xla/service/hlo_instruction.h
+++ b/tensorflow/compiler/xla/service/hlo_instruction.h
@@ -34,6 +34,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/dfs_hlo_visitor.h"
 #include "tensorflow/compiler/xla/service/dfs_hlo_visitor_with_default.h"
 #include "tensorflow/compiler/xla/service/hlo_opcode.h"
+#include "tensorflow/compiler/xla/service/name_uniquer.h"
 #include "tensorflow/compiler/xla/types.h"
 #include "tensorflow/compiler/xla/xla_data.pb.h"
 #include "tensorflow/core/lib/core/status.h"
@@ -53,7 +54,10 @@ class HloInstruction {
  public:
   enum class FusionKind {
     kLoop,                // Fused into a loop.
-    kInput,               // Fused into a reduction kernel.
+    kInput,               // Op's input is fused into the op itself.
+    kOutput,              // Op's output is fused into the op itself.
+                          // REQUIRES: At least one operand buffer must be able
+                          // to alias the output buffer.
     kTransposeDot,        // Fused into a dot with transposed operands.
     kConvBackwardFilter,  // Fused into a backward filter convolution.
     kConvBackwardInput,   // Fused into a backward input convolution.
@@ -488,7 +492,10 @@ class HloInstruction {
   string SignatureString() const;
 
   // Returns a debugging string that represents this instruction.
-  string ToString(bool compact_operands = false) const;
+  string ToString(bool compact_operands = false,
+                  bool include_metadata = true) const;
+
+  string ToStringNoMetadata() const { return ToString(false, false); }
 
   // As ToString, but returns a shorter string.
   string ToShortString() const;
@@ -497,7 +504,9 @@ class HloInstruction {
   // or "elementwise".
   string ToCategory() const;
 
-  // Returns the string concatenation of parent name and this instructions name.
+  // Returns the string concatenation of parent name and this instructions
+  // name. This name is guaranteed to be unique among all instructions in the
+  // HloModule.
   string FullyQualifiedName() const;
 
   // Returns a logging instruction, if the output of this instruction is logged.
@@ -721,8 +730,9 @@ class HloInstruction {
   // this instruction.
   const string& name() const { return name_; }
 
-  // Sets the string identifier for this instruction.
-  void set_name(const string& name) { name_ = name; }
+  // Use the given NameUniquer to select a unique name for the instruction based
+  // on the instruction's existing name.
+  void UniquifyName(NameUniquer* name_uniquer);
 
   // Sets the debug metadata for this instruction.
   void set_metadata(const OpMetadata& metadata) { metadata_ = metadata; }
diff --git a/tensorflow/compiler/xla/service/hlo_instruction_test.cc b/tensorflow/compiler/xla/service/hlo_instruction_test.cc
index d0101ef19c34f27628a8a48607aad78f85e6d0f3..a226ab0d0c43e6df6216e4b0f58ed4270cb03d40 100644
--- a/tensorflow/compiler/xla/service/hlo_instruction_test.cc
+++ b/tensorflow/compiler/xla/service/hlo_instruction_test.cc
@@ -966,34 +966,39 @@ TEST_F(HloInstructionTest, CloneSuffixNames) {
   // Test that the suffix string added to cloned instructions is not
   // duplicated. Rather a numeric incrementing value should be appended. That
   // is, we want "foo.clone2", not "foo.clone.clone".
-  auto foo = HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(42.f));
-  foo->set_name("foo");
 
   // Test cloning the same instruction multiple times.
-  EXPECT_EQ(foo->Clone()->name(), "foo.clone");
-  EXPECT_EQ(foo->Clone()->Clone()->name(), "foo.clone2");
-  EXPECT_EQ(foo->Clone()->Clone()->Clone()->name(), "foo.clone3");
+  auto foo =
+      HloInstruction::CreateParameter(0, ShapeUtil::MakeShape(F32, {}), "foo");
+  EXPECT_EQ(foo->Clone()->name(), "%foo.clone");
+  EXPECT_EQ(foo->Clone()->Clone()->name(), "%foo.clone2");
+  EXPECT_EQ(foo->Clone()->Clone()->Clone()->name(), "%foo.clone3");
 
   // Test custom suffixes.
-  EXPECT_EQ(foo->Clone("bar")->name(), "foo.bar");
-  EXPECT_EQ(foo->Clone("bar")->Clone("bar")->name(), "foo.bar2");
-  EXPECT_EQ(foo->Clone("bar")->Clone("bar")->Clone()->name(), "foo.bar2.clone");
+  EXPECT_EQ(foo->Clone("bar")->name(), "%foo.bar");
+  EXPECT_EQ(foo->Clone("bar")->Clone("bar")->name(), "%foo.bar2");
+  EXPECT_EQ(foo->Clone("bar")->Clone("bar")->Clone()->name(),
+            "%foo.bar2.clone");
 
   // Test instruction name with a dot.
-  foo->set_name("foo.baz");
-  EXPECT_EQ(foo->Clone()->name(), "foo.baz.clone");
+  auto foo_baz = HloInstruction::CreateParameter(
+      0, ShapeUtil::MakeShape(F32, {}), "foo.baz");
+  EXPECT_EQ(foo_baz->Clone()->name(), "%foo.baz.clone");
 
   // Test incrementing a large number after the suffix.
-  foo->set_name("foo.clone234");
-  EXPECT_EQ(foo->Clone()->name(), "foo.clone235");
+  auto foo_clone234 = HloInstruction::CreateParameter(
+      0, ShapeUtil::MakeShape(F32, {}), "foo.clone234");
+  EXPECT_EQ(foo_clone234->Clone()->name(), "%foo.clone235");
 
   // Test a non-numeric string after the cloning suffix.
-  foo->set_name("foo.clonexyz");
-  EXPECT_EQ(foo->Clone()->name(), "foo.clonexyz.clone");
+  auto foo_clonexyz = HloInstruction::CreateParameter(
+      0, ShapeUtil::MakeShape(F32, {}), "foo.clonexyz");
+  EXPECT_EQ(foo_clonexyz->Clone()->name(), "%foo.clonexyz.clone");
 
   // Test a name with multiple appearances of the suffix.
-  foo->set_name("foo.clone.clone3");
-  EXPECT_EQ(foo->Clone()->name(), "foo.clone.clone4");
+  auto foo_clone_clone3 = HloInstruction::CreateParameter(
+      0, ShapeUtil::MakeShape(F32, {}), "foo.clone.clone3");
+  EXPECT_EQ(foo_clone_clone3->Clone()->name(), "%foo.clone.clone4");
 }
 
 }  // namespace
diff --git a/tensorflow/compiler/xla/service/hlo_module.cc b/tensorflow/compiler/xla/service/hlo_module.cc
index 8ed672aa9b8fb73cc120f55d93530b3124519fcb..f5e13b4367bed5b029862f76ce2dd9eeb2b42c49 100644
--- a/tensorflow/compiler/xla/service/hlo_module.cc
+++ b/tensorflow/compiler/xla/service/hlo_module.cc
@@ -46,8 +46,7 @@ HloModule::HloModule(const string& name)
 
 HloComputation* HloModule::AddComputationInternal(
     std::unique_ptr<HloComputation> computation) {
-  computation->set_name(
-      computation_name_uniquer_.GetUniqueName(computation->name()));
+  computation->UniquifyName(&computation_name_uniquer_);
   computation->set_parent(this);
   computations_.push_back(std::move(computation));
   return computations_.back().get();
diff --git a/tensorflow/compiler/xla/service/hlo_ordering.cc b/tensorflow/compiler/xla/service/hlo_ordering.cc
index b3168ed40ece3ea65c6b26b96250f2ea77969953..725ce17d6640fbbddbf11f4ca50c50c8c57e9bd3 100644
--- a/tensorflow/compiler/xla/service/hlo_ordering.cc
+++ b/tensorflow/compiler/xla/service/hlo_ordering.cc
@@ -34,15 +34,95 @@ limitations under the License.
 
 namespace xla {
 
-PredecessorHloOrdering::PredecessorHloOrdering(const HloModule* module)
-    : module_(module) {}
+namespace {
+
+// Returns the nearest call graph ancestors of instructions 'a' and 'b' for
+// which the ancestors are in the same computation. An instruction is an call
+// graph ancestor of 'a' if the instruction calls the computation containing 'a'
+// either directly or transitively. Degeneratively an instruction is an ancestor
+// of itself. nullptr is returned if there is no common ancestor or if the
+// caller chain of 'a' or 'b' diverges (has multiple callers) before the nearest
+// common ancestor.
+//
+// Example:
+//
+// Entry computation:
+//   %x = Call(A, {Constant(42.0)})
+//   %y = Call(B, {%x})
+//
+// Computation A:
+//   %a = Negate(Param())
+//
+// Computation B:
+//   %b = Exp(Param());
+//
+// If called with %a and %b, this function would return (%x, %y). %x is an
+// ancestor of %a, and %y is an ancestor of %b, and %x and %y are in the same
+// computation.
+std::pair<const HloInstruction*, const HloInstruction*>
+GetNearestCallGraphAncestorsInSameComputation(const HloInstruction* a,
+                                              const HloInstruction* b,
+                                              const CallGraph& call_graph) {
+  // Lambda which returns the next instruction in the callee->caller chain in
+  // the call graph. This is the unique instruction which calls the computation
+  // containing 'instruction'. If more than one instruction calls the
+  // computation containing 'instruction' or no instructions call the
+  // computation then nullptr is returned.
+  auto next_caller =
+      [&call_graph](
+          const HloInstruction* instruction) -> const HloInstruction* {
+    const CallGraphNode& node = call_graph.GetNode(instruction->parent());
+    if (node.caller_callsites().size() != 1) {
+      return nullptr;
+    }
+    return node.caller_callsites()[0].instruction();
+  };
+
+  // Iterate through the callee->caller chains and find the earliest common
+  // element.
+  for (const HloInstruction* a_ancestor = a; a_ancestor != nullptr;
+       a_ancestor = next_caller(a_ancestor)) {
+    for (const HloInstruction* b_ancestor = b; b_ancestor != nullptr;
+         b_ancestor = next_caller(b_ancestor)) {
+      if (a_ancestor->parent() == b_ancestor->parent()) {
+        return {a_ancestor, b_ancestor};
+      }
+    }
+  }
+  return {nullptr, nullptr};
+}
+
+}  // namespace
 
-bool PredecessorHloOrdering::ExecutesBefore(const HloInstruction* a,
-                                            const HloInstruction* b) const {
-  // Instructions in different computations are unordered.
-  if (a->parent() != b->parent()) {
+bool HloOrdering::ExecutesBefore(const HloInstruction* a,
+                                 const HloInstruction* b) const {
+  // 'a' and 'b' may be in different computations. In this case, find the
+  // callgraph ancestor instructions which call (potentially transitively) the
+  // computations containing 'a' and 'b' and use these ancestor instructions to
+  // compare order.
+  const HloInstruction* a_ancestor;
+  const HloInstruction* b_ancestor;
+  std::tie(a_ancestor, b_ancestor) =
+      GetNearestCallGraphAncestorsInSameComputation(a, b, *call_graph_);
+
+  if (a_ancestor == nullptr) {
+    // Ancestors in a common computation could not be found so consider the
+    // instructions 'a' and 'b' to be unordered.
     return false;
   }
+  // a_ancestor and b_ancestor must be either both null or both non-null.
+  CHECK_NE(b_ancestor, nullptr);
+  CHECK_EQ(a_ancestor->parent(), b_ancestor->parent());
+  return ExecutesBeforeInSameComputation(a_ancestor, b_ancestor);
+}
+
+PredecessorHloOrdering::PredecessorHloOrdering(const HloModule* module)
+    : HloOrdering(module) {}
+
+bool PredecessorHloOrdering::ExecutesBeforeInSameComputation(
+    const HloInstruction* a, const HloInstruction* b) const {
+  CHECK_EQ(a->parent(), b->parent());
+
   // 'a' executes before 'b' if 'a' is in the strict predecessor set of 'b'.
   return strict_predecessors_.at(b->parent())->IsReachable(b, a);
 }
@@ -86,7 +166,7 @@ string DependencyHloOrdering::ToString() const {
 
 SequentialHloOrdering::SequentialHloOrdering(
     const HloModule* module, const HloModuleSequence& module_sequence)
-    : module_(module), module_sequence_(module_sequence) {
+    : HloOrdering(module), module_sequence_(module_sequence) {
   // Create a map from instruction to its order position.
   for (auto computation_order : module_sequence_) {
     const std::vector<const HloInstruction*>& order = computation_order.second;
@@ -97,12 +177,9 @@ SequentialHloOrdering::SequentialHloOrdering(
   }
 }
 
-bool SequentialHloOrdering::ExecutesBefore(const HloInstruction* a,
-                                           const HloInstruction* b) const {
-  // Instructions in different computations are unordered.
-  if (a->parent() != b->parent()) {
-    return false;
-  }
+bool SequentialHloOrdering::ExecutesBeforeInSameComputation(
+    const HloInstruction* a, const HloInstruction* b) const {
+  CHECK_EQ(a->parent(), b->parent());
   // If either instruction is not in the order, then 'a' and 'b' are unordered.
   if (order_position_.count(a) == 0 || order_position_.count(b) == 0) {
     return false;
@@ -144,23 +221,6 @@ string SequentialHloOrdering::ToString() const {
   return tensorflow::str_util::Join(pieces, "\n");
 }
 
-namespace {
-StatusOr<int64> MinimumMemoryForSequence(
-    const HloComputation& computation,
-    const std::vector<const HloInstruction*>& sequence,
-    const TuplePointsToAnalysis& points_to_analysis,
-    const LogicalBuffer::SizeFunction& size_function) {
-  // The absolute minimum memory required for a given sequence of instructions
-  // is determined by the sequence of Alloc and Free calls on a simulated heap,
-  // ignoring fragmentation.
-  TF_ASSIGN_OR_RETURN(
-      HeapSimulator::Result result,
-      HeapSimulator::Run(MakeUnique<NoFragmentationStatsHeap>(), sequence,
-                         computation, points_to_analysis, size_function));
-  return result.heap_size;
-}
-}  // namespace
-
 StatusOr<int64> MinimumMemoryForSequence(
     const SequentialHloOrdering::HloModuleSequence& module_sequence,
     const LogicalBuffer::SizeFunction& size_function) {
@@ -172,17 +232,16 @@ StatusOr<int64> MinimumMemoryForSequence(
   TF_ASSIGN_OR_RETURN(std::unique_ptr<TuplePointsToAnalysis> points_to_analysis,
                       TuplePointsToAnalysis::Run(module));
 
-  int64 total_memory = 0;
-  for (const auto& pair : module_sequence) {
-    const HloComputation* computation = pair.first;
-    const std::vector<const HloInstruction*>& sequence = pair.second;
-    TF_ASSIGN_OR_RETURN(
-        const int64 memory,
-        MinimumMemoryForSequence(*computation, sequence, *points_to_analysis,
-                                 size_function));
-    total_memory += memory;
-  }
-  return total_memory;
+  // The absolute minimum memory required for a given sequence of instructions
+  // is determined by the sequence of Alloc and Free calls on a simulated heap,
+  // ignoring fragmentation. We run the heap simulation on the whole module,
+  // rather than summing each computation, since it gives us a better lower
+  // bound, by minimizing the liveness of sub-computations.
+  TF_ASSIGN_OR_RETURN(
+      HeapSimulator::Result result,
+      HeapSimulator::Run(MakeUnique<NoFragmentationStatsHeap>(), *module,
+                         module_sequence, *points_to_analysis, size_function));
+  return result.heap_size;
 }
 
 namespace {
@@ -439,6 +498,18 @@ StatusOr<std::vector<const HloInstruction*>> RunDFSMemoryScheduler(
   return sequence;
 }
 
+StatusOr<int64> MinimumMemoryForComputation(
+    const HloComputation& computation,
+    const std::vector<const HloInstruction*>& sequence,
+    const TuplePointsToAnalysis& points_to_analysis,
+    const LogicalBuffer::SizeFunction& size_function) {
+  TF_ASSIGN_OR_RETURN(
+      HeapSimulator::Result result,
+      HeapSimulator::Run(MakeUnique<NoFragmentationStatsHeap>(), computation,
+                         sequence, points_to_analysis, size_function));
+  return result.heap_size;
+}
+
 StatusOr<std::vector<const HloInstruction*>> CreateMemoryMinimizingSequence(
     const HloComputation& computation,
     const TuplePointsToAnalysis& points_to_analysis,
@@ -446,13 +517,17 @@ StatusOr<std::vector<const HloInstruction*>> CreateMemoryMinimizingSequence(
   // We try both a list-scheduler based ordering and a DFS based ordering, and
   // choose whichever returns a lower min-memory, not accounting for
   // fragmentation.
+  //
+  // Note that this is just a heuristic. One obvious inaccuracy is that the
+  // memory required for sub-computations might be different when considered
+  // within the caller's context. But it's good enough for now.
   TF_ASSIGN_OR_RETURN(
       std::vector<const HloInstruction*> list_sequence,
       ListScheduler::Run(computation, points_to_analysis, size_function));
   TF_ASSIGN_OR_RETURN(
       const int64 list_memory,
-      MinimumMemoryForSequence(computation, list_sequence, points_to_analysis,
-                               size_function));
+      MinimumMemoryForComputation(computation, list_sequence,
+                                  points_to_analysis, size_function));
   VLOG(2) << "Min-memory list sequence: " << list_memory << " bytes";
 
   TF_ASSIGN_OR_RETURN(
@@ -460,8 +535,8 @@ StatusOr<std::vector<const HloInstruction*>> CreateMemoryMinimizingSequence(
       RunDFSMemoryScheduler(computation, points_to_analysis, size_function));
   TF_ASSIGN_OR_RETURN(
       const int64 dfs_memory,
-      MinimumMemoryForSequence(computation, dfs_sequence, points_to_analysis,
-                               size_function));
+      MinimumMemoryForComputation(computation, dfs_sequence, points_to_analysis,
+                                  size_function));
   VLOG(2) << "Min-memory dfs sequence: " << dfs_memory << " bytes";
 
   if (list_memory <= dfs_memory) {
diff --git a/tensorflow/compiler/xla/service/hlo_ordering.h b/tensorflow/compiler/xla/service/hlo_ordering.h
index e964c4c51ae14f89d1f1b0450990cfc50c8a74be..d2db18be0009b1ca62b538d3975e1a0a105c5e83 100644
--- a/tensorflow/compiler/xla/service/hlo_ordering.h
+++ b/tensorflow/compiler/xla/service/hlo_ordering.h
@@ -20,6 +20,7 @@ limitations under the License.
 #include <string>
 #include <utility>
 
+#include "tensorflow/compiler/xla/service/call_graph.h"
 #include "tensorflow/compiler/xla/service/hlo_instruction.h"
 #include "tensorflow/compiler/xla/service/hlo_module.h"
 #include "tensorflow/compiler/xla/service/tuple_points_to_analysis.h"
@@ -36,13 +37,13 @@ namespace xla {
 // buffers.
 class HloOrdering {
  public:
-  HloOrdering() = default;
+  HloOrdering(const HloModule* module)
+      : module_(module), call_graph_(CallGraph::Build(module)) {}
   virtual ~HloOrdering() = default;
 
   // Returns true if instruction 'a' executes before instruction 'b'. This is
   // not reflexive, that is, an instruction does not execute before itself.
-  virtual bool ExecutesBefore(const HloInstruction* a,
-                              const HloInstruction* b) const = 0;
+  bool ExecutesBefore(const HloInstruction* a, const HloInstruction* b) const;
 
   // Returns the sequential instruction order for the given computation, or
   // nullptr if the computation does not have a sequential ordering.
@@ -50,6 +51,21 @@ class HloOrdering {
       const HloComputation& computation) const = 0;
 
   virtual string ToString() const = 0;
+
+ protected:
+  // Returns true if instruction 'a' executes before instruction 'b'.
+  // Precondition: 'a' and 'b' are in the same computation.
+  //
+  // Derived classes should implement this method for determining order of
+  // instructions in the same comptuation. ExecutesBefore() analyzes the
+  // callgraph and uses this method to determine ordering of instructions in
+  // different computations.
+  virtual bool ExecutesBeforeInSameComputation(
+      const HloInstruction* a, const HloInstruction* b) const = 0;
+
+  const HloModule* module_;
+
+  std::unique_ptr<CallGraph> call_graph_;
 };
 
 // Base class for partial orderings implemented by a map of strict predecessors
@@ -58,11 +74,6 @@ class PredecessorHloOrdering : public HloOrdering {
  public:
   ~PredecessorHloOrdering() override = default;
 
-  // Returns true if instruction 'a' executes before instruction 'b'.
-  // Instructions in different computations are not ordered.
-  bool ExecutesBefore(const HloInstruction* a,
-                      const HloInstruction* b) const override;
-
   // Returns nullptr indicating the computation does not have a sequential
   // ordering.
   const std::vector<const HloInstruction*>* SequentialOrder(
@@ -74,11 +85,12 @@ class PredecessorHloOrdering : public HloOrdering {
   explicit PredecessorHloOrdering(const HloModule* module);
   string ToStringHelper(const string& name) const;
 
-  const HloModule* module_;
+  bool ExecutesBeforeInSameComputation(const HloInstruction* a,
+                                       const HloInstruction* b) const override;
 
-  // For each each computation in the module, this is the set of the
-  // instruction's strict predecessors. An instruction is not an element of its
-  // own strict predecessor set.
+  // For each computation in the module, this is the set of the instruction's
+  // strict predecessors. An instruction is not an element of its own strict
+  // predecessor set.
   //
   // Subclasses should fill this in to define the desired ordering.
   tensorflow::gtl::FlatMap<const HloComputation*,
@@ -150,12 +162,6 @@ class SequentialHloOrdering : public HloOrdering {
                         const HloModuleSequence& module_sequence);
   ~SequentialHloOrdering() override = default;
 
-  // Instruction 'a' executes before 'b' if 'a' appears before 'b' in the
-  // instruction sequence for the computation. Instructions in different
-  // computations are unordered.
-  bool ExecutesBefore(const HloInstruction* a,
-                      const HloInstruction* b) const override;
-
   // Returns the sequential instruction order for the given computation.
   const std::vector<const HloInstruction*>* SequentialOrder(
       const HloComputation& computation) const override;
@@ -163,7 +169,9 @@ class SequentialHloOrdering : public HloOrdering {
   string ToString() const override;
 
  protected:
-  const HloModule* module_;
+  bool ExecutesBeforeInSameComputation(const HloInstruction* a,
+                                       const HloInstruction* b) const override;
+
   const HloModuleSequence module_sequence_;
 
   // The position of every instruction in the HLO module in its respective
diff --git a/tensorflow/compiler/xla/service/hlo_ordering_test.cc b/tensorflow/compiler/xla/service/hlo_ordering_test.cc
index 425bee601a8d6357e21d3d00f8ccf5d69af03862..c387fbb89b196c340852db057754f85e3e5435f3 100644
--- a/tensorflow/compiler/xla/service/hlo_ordering_test.cc
+++ b/tensorflow/compiler/xla/service/hlo_ordering_test.cc
@@ -78,6 +78,142 @@ TEST_F(HloOrderingTest, LastUseScheduledFirst) {
   EXPECT_TRUE(ordering.ExecutesBefore(add, negate));
 }
 
+TEST_F(HloOrderingTest, InstructionsInDifferentComputations) {
+  // Tests the ordering of instructions in different computations using the
+  // following HLO code:
+  //
+  // Entry computation:
+  //   %x = Call(A, {})
+  //   %y = Call(B, {%x})
+  //
+  // Computation A:
+  //   %a = Call(C, {})
+  //
+  // Computation B:
+  //   %b = Call(C, {})
+  //
+  // Computation C:
+  //   %c = Constant(42.0f)
+  //
+  // This results in a diamond-shaped callgraph.
+  HloModule module(TestName());
+  const Shape scalar_shape = ShapeUtil::MakeShape(xla::F32, {});
+
+  auto builder_c = HloComputation::Builder("C");
+  HloInstruction* c = builder_c.AddInstruction(
+      HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(42.0f)));
+  HloComputation* computation_c =
+      module.AddEmbeddedComputation(builder_c.Build());
+
+  auto builder_b = HloComputation::Builder("B");
+  builder_b.AddInstruction(
+      HloInstruction::CreateParameter(0, scalar_shape, "param"));
+  HloInstruction* b = builder_b.AddInstruction(
+      HloInstruction::CreateCall(scalar_shape, {}, computation_c));
+  HloComputation* computation_b =
+      module.AddEmbeddedComputation(builder_b.Build());
+
+  auto builder_a = HloComputation::Builder("A");
+  HloInstruction* a = builder_a.AddInstruction(
+      HloInstruction::CreateCall(scalar_shape, {}, computation_c));
+  HloComputation* computation_a =
+      module.AddEmbeddedComputation(builder_a.Build());
+
+  auto builder = HloComputation::Builder(TestName());
+  HloInstruction* x = builder.AddInstruction(
+      HloInstruction::CreateCall(scalar_shape, {}, computation_a));
+  HloInstruction* y = builder.AddInstruction(
+      HloInstruction::CreateCall(scalar_shape, {x}, computation_b));
+  module.AddEntryComputation(builder.Build());
+
+  DependencyHloOrdering ordering(&module);
+  EXPECT_TRUE(ordering.ExecutesBefore(x, y));
+  EXPECT_FALSE(ordering.ExecutesBefore(y, x));
+
+  EXPECT_TRUE(ordering.ExecutesBefore(a, b));
+  EXPECT_FALSE(ordering.ExecutesBefore(b, a));
+
+  EXPECT_FALSE(ordering.ExecutesBefore(a, x));
+  EXPECT_TRUE(ordering.ExecutesBefore(a, y));
+  EXPECT_FALSE(ordering.ExecutesBefore(x, a));
+  EXPECT_FALSE(ordering.ExecutesBefore(y, a));
+
+  EXPECT_FALSE(ordering.ExecutesBefore(b, x));
+  EXPECT_FALSE(ordering.ExecutesBefore(b, y));
+  EXPECT_TRUE(ordering.ExecutesBefore(x, b));
+  EXPECT_FALSE(ordering.ExecutesBefore(y, b));
+
+  // Instruction 'c' is called from multiple callsites and should be unordered
+  // relative to all other instructions in the module.
+  EXPECT_FALSE(ordering.ExecutesBefore(c, a));
+  EXPECT_FALSE(ordering.ExecutesBefore(c, b));
+  EXPECT_FALSE(ordering.ExecutesBefore(c, x));
+  EXPECT_FALSE(ordering.ExecutesBefore(c, y));
+  EXPECT_FALSE(ordering.ExecutesBefore(a, c));
+  EXPECT_FALSE(ordering.ExecutesBefore(b, c));
+  EXPECT_FALSE(ordering.ExecutesBefore(x, c));
+  EXPECT_FALSE(ordering.ExecutesBefore(y, c));
+}
+
+class MinimumMemoryForSequenceTest : public HloTestBase {};
+
+TEST_F(MinimumMemoryForSequenceTest, MultiComputation) {
+  HloModule module(TestName());
+  const Shape scalar_shape = ShapeUtil::MakeShape(xla::F32, {});
+  const Shape tuple_shape =
+      ShapeUtil::MakeTupleShape({scalar_shape, scalar_shape});
+
+  auto cond_builder = HloComputation::Builder("WhileCond");
+  // Tuple param: 24 bytes (each elem has 8 byte pointer, 4 byte element)
+  HloInstruction* cond_param = cond_builder.AddInstruction(
+      HloInstruction::CreateParameter(0, tuple_shape, "cond_param"));
+  HloInstruction* cond_iter = cond_builder.AddInstruction(
+      HloInstruction::CreateGetTupleElement(scalar_shape, cond_param, 0));
+  HloInstruction* cond_data = cond_builder.AddInstruction(
+      HloInstruction::CreateGetTupleElement(scalar_shape, cond_param, 1));
+  // Free cond_param[] (16 bytes), Alloc PRED[] (1 byte)
+  HloInstruction* cond_lt = cond_builder.AddInstruction(
+      HloInstruction::CreateBinary(ShapeUtil::MakeShape(PRED, {}),
+                                   HloOpcode::kLt, cond_iter, cond_data));
+  HloComputation* cond_computation =
+      module.AddEmbeddedComputation(cond_builder.Build());
+
+  auto body_builder = HloComputation::Builder("WhileBody");
+  // Tuple param: 24 bytes (each elem has 8 byte pointer, 4 byte element)
+  HloInstruction* body_param = body_builder.AddInstruction(
+      HloInstruction::CreateParameter(0, tuple_shape, "body_param"));
+  HloComputation* body_computation =
+      module.AddEmbeddedComputation(body_builder.Build());
+
+  auto builder = HloComputation::Builder(TestName());
+  // Entry params: 8 bytes (4 bytes per param), TOTAL=8
+  HloInstruction* iter = builder.AddInstruction(
+      HloInstruction::CreateParameter(0, scalar_shape, "param_iter"));
+  HloInstruction* data = builder.AddInstruction(
+      HloInstruction::CreateParameter(1, scalar_shape, "param_data"));
+  // Tuple: 16 bytes (8 bytes per pointer), TOTAL=24
+  HloInstruction* tuple =
+      builder.AddInstruction(HloInstruction::CreateTuple({iter, data}));
+  // While: 8 bytes (4 bytes per element), TOTAL=32
+  // Both cond and body use a max of 24 bytes, TOTAL=56
+  HloInstruction* while_op = builder.AddInstruction(HloInstruction::CreateWhile(
+      tuple_shape, cond_computation, body_computation, tuple));
+  HloComputation* entry_computation =
+      module.AddEntryComputation(builder.Build());
+
+  auto size_fn = [](const LogicalBuffer& buffer) {
+    return ShapeUtil::ByteSizeOf(buffer.shape(), /*pointer_size=*/8);
+  };
+
+  SequentialHloOrdering::HloModuleSequence module_sequence;
+  module_sequence[cond_computation] = {cond_param, cond_iter, cond_data,
+                                       cond_lt};
+  module_sequence[body_computation] = {body_param};
+  module_sequence[entry_computation] = {iter, data, tuple, while_op};
+  EXPECT_EQ(56,
+            MinimumMemoryForSequence(module_sequence, size_fn).ValueOrDie());
+}
+
 }  // namespace
 
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/hlo_pass_pipeline.cc b/tensorflow/compiler/xla/service/hlo_pass_pipeline.cc
index eb7fe467b32a330d9b8ad6000ad47849288b6b7e..78aebe9c36dfb5f63099f5e2df7bffe8529b08de 100644
--- a/tensorflow/compiler/xla/service/hlo_pass_pipeline.cc
+++ b/tensorflow/compiler/xla/service/hlo_pass_pipeline.cc
@@ -42,11 +42,17 @@ void DumpModule(const Compiler::HloDumper& dumper_, const HloModule& module,
 StatusOr<bool> HloPassPipeline::Run(HloModule* module) {
   run_called_ = true;
 
+  VLOG(1) << "Running HLO pass pipeline " << name();
+
   legacy_flags::HloPassPipelineFlags* flags =
       legacy_flags::GetHloPassPipelineFlags();
   std::vector<string> tmp =
       tensorflow::str_util::Split(flags->xla_disable_hlo_passes, ',');
   tensorflow::gtl::FlatSet<string> disabled_passes(tmp.begin(), tmp.end());
+  if (!disabled_passes.empty()) {
+    VLOG(1) << "Passes disabled by --xla_disable_hlo_passes: "
+            << tensorflow::str_util::Join(disabled_passes, ", ");
+  }
 
   auto run_invariant_checkers = [this, module]() -> Status {
     for (auto& invariant_checker : invariant_checkers_) {
@@ -62,9 +68,13 @@ StatusOr<bool> HloPassPipeline::Run(HloModule* module) {
   for (auto& pass : passes_) {
     if (!disabled_passes.empty() &&
         disabled_passes.count(pass->name().ToString()) > 0) {
+      VLOG(1) << "  Skipping HLO pass " << pass->name()
+              << ", disabled by --xla_disable_hlo_passes";
       continue;
     }
 
+    VLOG(1) << "  HLO pass " << pass->name();
+
     // Emit label containing: "after foo-pass, before bar-pass".
     message.clear();
     StrAppend(&message, prefix, ", before ", pass->name());
diff --git a/tensorflow/compiler/xla/service/hlo_query.cc b/tensorflow/compiler/xla/service/hlo_query.cc
index d6997378642cf480402b2edf8f40ed875fefa517..a153d73dbd838663c0d7e0d72ad54668f243f2c2 100644
--- a/tensorflow/compiler/xla/service/hlo_query.cc
+++ b/tensorflow/compiler/xla/service/hlo_query.cc
@@ -32,6 +32,16 @@ bool IsConstantR0F32(HloInstruction* instruction, float* out) {
   return false;
 }
 
+bool AllOperandsAreParametersOrConstants(const HloInstruction& instruction) {
+  for (const auto& operand : instruction.operands()) {
+    if (operand->opcode() != HloOpcode::kParameter &&
+        operand->opcode() != HloOpcode::kConstant) {
+      return false;
+    }
+  }
+  return true;
+}
+
 bool AllOperandsAreParameters(const HloInstruction& instruction) {
   for (const auto& operand : instruction.operands()) {
     if (operand->opcode() != HloOpcode::kParameter) {
diff --git a/tensorflow/compiler/xla/service/hlo_query.h b/tensorflow/compiler/xla/service/hlo_query.h
index 56f3cfd863ce0b9004d14e6c43d41f21b6e7a3bf..c79347bbf9d6146943b7b787f713369cb37fadee 100644
--- a/tensorflow/compiler/xla/service/hlo_query.h
+++ b/tensorflow/compiler/xla/service/hlo_query.h
@@ -28,6 +28,10 @@ namespace hlo_query {
 // Precondition: out != nullptr
 bool IsConstantR0F32(HloInstruction* instruction, float* out);
 
+// Returns whether all of an instruction's operands are of the types constants
+// and parameters.
+bool AllOperandsAreParametersOrConstants(const HloInstruction& instruction);
+
 // Returns whether all of an instruction's operands are parameters.
 bool AllOperandsAreParameters(const HloInstruction& instruction);
 
diff --git a/tensorflow/compiler/xla/service/hlo_rematerialization.cc b/tensorflow/compiler/xla/service/hlo_rematerialization.cc
index 101c9076f8d1cb3e079ad665177751ccccfe65d9..5d4fd7c2deae7e1b03f49f123e2aff174ab34667 100644
--- a/tensorflow/compiler/xla/service/hlo_rematerialization.cc
+++ b/tensorflow/compiler/xla/service/hlo_rematerialization.cc
@@ -29,8 +29,8 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/hlo_module.h"
 #include "tensorflow/compiler/xla/service/hlo_opcode.h"
 #include "tensorflow/compiler/xla/service/hlo_ordering.h"
+#include "tensorflow/compiler/xla/service/liveness_util.h"
 #include "tensorflow/compiler/xla/service/logical_buffer.h"
-#include "tensorflow/compiler/xla/service/tuple_points_to_analysis.h"
 #include "tensorflow/compiler/xla/status_macros.h"
 #include "tensorflow/compiler/xla/statusor.h"
 #include "tensorflow/compiler/xla/types.h"
@@ -46,63 +46,58 @@ namespace xla {
 
 namespace {
 
-// Returns a vector of the operands of 'instruction' with repeated elements
-// removed.
-std::vector<HloInstruction*> UniqueOperands(const HloInstruction* instruction) {
-  std::vector<HloInstruction*> unique_operands;
-  for (HloInstruction* operand : instruction->operands()) {
-    if (std::find(unique_operands.begin(), unique_operands.end(), operand) ==
-        unique_operands.end()) {
-      unique_operands.push_back(operand);
-    }
-  }
-  return unique_operands;
-}
-
 // Returns true if the given instruction is rematerializable.
 bool IsRematerializable(const HloInstruction* instruction) {
+  // Conservatively, don't rematerialize instruction with control
+  // dependencies. For one, control dependencies are added to prevent
+  // interference of aliased buffers (say, in while bodies) and
+  // rematerialization is ignorant of liveness and may break the intended
+  // ordering.
+  if (!instruction->control_predecessors().empty() ||
+      !instruction->control_successors().empty()) {
+    return false;
+  }
+
   // Don't rematerialize instructions with side effects, those with a cost that
   // might not be captured by HloCostAnalysis, or instructions which cannot be
   // cloned safely.
   switch (instruction->opcode()) {
     case HloOpcode::kCall:
+    case HloOpcode::kConstant:
     case HloOpcode::kCrossReplicaSum:
     case HloOpcode::kCustomCall:
     case HloOpcode::kOutfeed:
     case HloOpcode::kInfeed:
+    case HloOpcode::kParameter:
     case HloOpcode::kRecv:
     case HloOpcode::kSend:
     case HloOpcode::kTrace:
     case HloOpcode::kWhile:
       return false;
     default:
-      break;
-  }
-
-  // Skip tuple shapes because we do not currently account for buffer aliasing
-  // properly which results in improperly accounting of rematerialization cost
-  // for these shapes.
-  if (ShapeUtil::IsTuple(instruction->shape())) {
-    return false;
-  }
-  for (auto* operand : instruction->operands()) {
-    if (ShapeUtil::IsTuple(operand->shape())) {
-      return false;
-    }
+      return true;
   }
-
-  return true;
 }
 
-// Class which maintains an ordered list of instructions with fast insertion and
-// removal of arbitrary elements.
+// Class which maintains an ordered list of instructions with fast insertion
+// before arbitrary elements.
 class InstructionList {
  public:
   explicit InstructionList(const std::vector<const HloInstruction*> order) {
+    int64 position = 0;
     for (const HloInstruction* inst : order) {
       instructions_.push_back(const_cast<HloInstruction*>(inst));
       instruction_iterators_.insert({const_cast<HloInstruction*>(inst),
                                      std::next(instructions_.end(), -1)});
+      // Initially position numbers are uniquely assigned in order. Later as
+      // instructions are added with InsertBefore* methods, some instructions
+      // may have duplicate position numbers, but the values will be guaranteed
+      // to be monotonically increasing through the list, and so is still useful
+      // for quickly(-ish) determining the order of arbitrary instructions in
+      // the list.
+      position_number_[inst] = position;
+      first_at_position_[position] = inst;
+      position++;
     }
   }
 
@@ -111,22 +106,63 @@ class InstructionList {
     return instructions_;
   }
 
-  // Insert instruction 'to_insert' before instruction 'before' in the list.
-  Status InsertBefore(HloInstruction* to_insert, HloInstruction* before) {
+  // Insert instruction 'to_insert' immediately before instruction 'before' in
+  // the list.
+  void InsertBefore(HloInstruction* to_insert, HloInstruction* before) {
+    VLOG(3) << "InsertBefore: " << to_insert->name() << " before "
+            << before->name();
     auto it = instruction_iterators_.find(before);
-    TF_RET_CHECK(it != instruction_iterators_.end());
+    CHECK(it != instruction_iterators_.end());
     instruction_iterators_.insert(
         {to_insert, instructions_.insert(it->second, to_insert)});
-    return Status::OK();
+    // Assign the same position number to the newly added instruction as
+    // 'before'. This guarantees monotonicity of the position numbers, but not
+    // uniqueness.
+    int64 pos = position_number_.at(before);
+    position_number_[to_insert] = pos;
+    if (first_at_position_.at(pos) == before) {
+      first_at_position_[pos] = to_insert;
+    }
   }
 
-  // Removes instruction from the list.
-  Status Remove(HloInstruction* instruction) {
-    auto it = instruction_iterators_.find(instruction);
-    TF_RET_CHECK(it != instruction_iterators_.end());
-    instructions_.erase(it->second);
-    instruction_iterators_.erase(it);
-    return Status::OK();
+  // Insert instruction 'to_insert' immediately before the earliest instruction
+  // in 'before_instructions'.
+  void InsertBeforeInstructions(
+      HloInstruction* to_insert,
+      tensorflow::gtl::ArraySlice<HloInstruction*> before_instructions) {
+    VLOG(3) << "InsertBeforeInstructions: " << to_insert->name() << " before {"
+            << tensorflow::str_util::Join(
+                   before_instructions, ", ",
+                   [](string* out, HloInstruction* inst) {
+                     tensorflow::strings::StrAppend(out, inst->name());
+                   })
+            << "}";
+
+    // Find the minimal position number of any instruction in
+    // 'before_instructions'.
+    CHECK(!before_instructions.empty());
+    int64 min_position_number = std::numeric_limits<int64>::max();
+    for (const HloInstruction* instruction : before_instructions) {
+      min_position_number =
+          std::min(min_position_number, position_number_.at(instruction));
+    }
+
+    // Because more than one instruction in 'before_instructions' may have a
+    // position number of 'min_position_number', find the first such instruction
+    // with position number 'min_position_number'.
+    for (auto it = instruction_iterators_.at(
+             first_at_position_.at(min_position_number));
+         it != instructions_.end() &&
+         position_number_.at(*it) == min_position_number;
+         ++it) {
+      if (std::find(before_instructions.begin(), before_instructions.end(),
+                    *it) != before_instructions.end()) {
+        return InsertBefore(to_insert, *it);
+      }
+    }
+    LOG(FATAL) << "Expected to find instruction in before_instructions with "
+                  "position number "
+               << min_position_number;
   }
 
  private:
@@ -137,283 +173,630 @@ class InstructionList {
   tensorflow::gtl::FlatMap<const HloInstruction*,
                            std::list<HloInstruction*>::iterator>
       instruction_iterators_;
+
+  // A number assigned to each instruction which increases monotonically through
+  // 'instructions_'. Used to facilitate fast insertion of an instruction before
+  // the earliest instruction in a set of instructions
+  // (InsertBeforeInstructions) by enabling fast-ish ordering queries between
+  // instructions. If position_number_[a] < position_number_[b] then 'a' comes
+  // before 'b' in the list. If the position numbers are the same then nothing
+  // can be said about their order without examining the list.
+  //
+  // On object construction this value is precisely the instruction's ordinal
+  // position in the list. Instructions inserted via InsertBefore receive
+  // duplicate values. However, monotonicity is preserved.
+  tensorflow::gtl::FlatMap<const HloInstruction*, int64> position_number_;
+
+  // The first instruction in the list assigned a particular position number.
+  tensorflow::gtl::FlatMap<int64, const HloInstruction*> first_at_position_;
 };
 
+// Return the HloInstructions which use the given LogicalBuffer. Sets
+// has_indirect_users to whether any of the uses is indirect. A use is indirect
+// if the instruction defining logical_buffer is not an operand of the use. This
+// can happen via buffer aliasing (eg, tuples).
+std::vector<const HloInstruction*> GetUsers(
+    const LogicalBuffer* logical_buffer,
+    const TuplePointsToAnalysis& points_to_analysis, bool* has_indirect_users) {
+  std::vector<const HloInstruction*> users;
+  // To identify uses iterate through all HloInstruction users of the
+  // BufferAliases of the logical buffer.
+  *has_indirect_users = false;
+  for (const BufferAlias& buffer_alias :
+       points_to_analysis.GetBufferAliases(*logical_buffer)) {
+    for (const HloInstruction* user : buffer_alias.instruction()->users()) {
+      if (DoesNotUseOperandBuffer(buffer_alias.instruction(),
+                                  buffer_alias.index(), user,
+                                  points_to_analysis)) {
+        // The alias may be an operand of 'user', but the LogicalBuffer cannot
+        // possibly be used by the instruction so ignore 'user'. This is the
+        // case, for example, for the tuple element buffers in a GetTupleElement
+        // instruction (the GTE instruction only uses the pointer vector).
+        continue;
+      }
+      if (buffer_alias.instruction() != logical_buffer->instruction()) {
+        *has_indirect_users = true;
+      }
+      // A buffer may be used by the instruction via more than one alias. For
+      // example, a buffer which appears in more than one element of a tuple.
+      if (std::find(users.begin(), users.end(), user) == users.end()) {
+        users.push_back(user);
+      }
+    }
+  }
+  return users;
+}
+
 // Class for tracking memory usage of a computation as the instructions are
-// placed sequentially. Memory usage is the sum of live values at the current
-// point in the instruction sequence.
+// placed sequentially. Memory usage is the sum of the sizes of live values
+// (LogicalBuffers) at the current point in the instruction sequence.
 class MemoryUsageTracker {
  public:
   MemoryUsageTracker(
       const HloComputation* computation,
-      const HloRematerialization::ShapeSizeFunction& size_function)
-      : computation_(computation), size_function_(size_function) {
-    for (const std::unique_ptr<HloInstruction>& instruction :
-         computation->instructions()) {
-      // Initially only live-in values occupy memory.
-      if (IsLiveIn(instruction.get())) {
-        memory_usage_ += TotalSizeBytes(instruction->shape());
-      }
+      const HloRematerialization::ShapeSizeFunction& size_function,
+      const TuplePointsToAnalysis& points_to_analysis,
+      const InstructionList& instruction_list);
+
+  // Starts the placement of the given instruction. This adds the sizes of the
+  // LogicalBuffers defined by the instruction to the current memory
+  // usage. Placement is broken into two steps (BeginInstruction and
+  // EndInstruction) to accurately model memory usage. At BeginInstruction the
+  // memory for the output value(s) of the current instruction is allocated. At
+  // EndInstruction memory for dead operand(s) is freed.
+  Status BeginInstruction(const HloInstruction* instruction);
+
+  // Finishes the placement of the current instruction. This frees any dead
+  // operands or dead result of the instruction. This must be called after
+  // each call to BeginInstruction.
+  Status EndInstruction();
+
+  // Returns the number of bytes that the current memory usage will be reduced
+  // if the given instruction is rematerialized.
+  int64 MemoryReducedIfRematerialized(const HloInstruction* instruction) const;
+
+  // Adjusts memory usage to account for the rematerialization of
+  // original_instruction for all remaining unplaced uses. The rematerialization
+  // is remat_instruction. This method should be called after the HLO graph has
+  // been transformed (rematerialization instruction created and connected to
+  // uses).
+  Status AddRematerializedInstruction(HloInstruction* original_instruction,
+                                      HloInstruction* remat_instruction);
+
+  // Returns whether the given instruction has been placed (BeginInstruction
+  // has been called with 'instruction' as the argument).
+  bool IsPlaced(const HloInstruction* instruction) const {
+    return ContainsKey(placed_instructions_, instruction);
+  }
+
+  // Returns the current memory usage. This is the sum of sizes of all live
+  // values.
+  int64 memory_usage() const { return memory_usage_; }
+
+  // Returns the current instruction being placed.
+  const HloInstruction* in_progress_instruction() const {
+    return in_progress_instruction_;
+  }
+
+  // Check invariants of the data structure. This is expensive to call.
+  bool Check() const;
+
+  string ToString() const;
+
+ private:
+  // Type holding a unique identifier for each Buffer object.
+  using BufferId = int64;
+
+  // A Buffer represents a single LogicalBuffer in the computation including
+  // various metadata useful for tracking liveness of the value. A LogicalBuffer
+  // is not used directly because the HLO graph is transformed and
+  // TuplePointsToAnalysis which owns all LogicalBuffers cannot be updated after
+  // HLO graph transformations.
+  struct Buffer {
+    // The unique id of this Buffer. This value is equal to the buffer's index
+    // in the vector buffers_.
+    const BufferId id;
+
+    // The instruction which defines this buffer.
+    const HloInstruction* defining_instruction;
+
+    // The materialized size of the buffer in bytes.
+    const int64 size;
+
+    // Whether this buffer is live-out of the computation.
+    bool live_out;
+
+    // Whether this buffer has indirect uses. Ie, an instruction which is not a
+    // user of defining_instruction uses this buffer. This can occur due to
+    // buffer aliasing (eg, tuples).
+    bool has_indirect_uses;
+
+    // The instructions which use this buffer.
+    std::vector<const HloInstruction*> users;
+
+    // The number of users (HloInstructions) of this buffer which have not yet
+    // been placed in the sequence.
+    int64 unfinished_user_count;
+
+    string ToString() const {
+      return tensorflow::strings::StrCat("Buffer ", id, " (defined by ",
+                                         defining_instruction->name(),
+                                         ", size ", size, " bytes)");
     }
+  };
+
+  // Creates a Buffer representing the given logical buffer. The buffer is added
+  // to buffers_ and a reference is returned.
+  Buffer& CreateBufferFromLogicalBuffer(
+      const LogicalBuffer* logical_buffer,
+      const TuplePointsToAnalysis& points_to_analysis,
+      const HloRematerialization::ShapeSizeFunction& size_function,
+      bool live_out) {
+    bool has_indirect_uses = false;
+    std::vector<const HloInstruction*> users =
+        GetUsers(logical_buffer, points_to_analysis, &has_indirect_uses);
+    return NewBuffer(logical_buffer->instruction(),
+                     size_function(logical_buffer->shape()), std::move(users),
+                     live_out, has_indirect_uses);
   }
 
-  // Starts the placement of the given instruction. This adds the output size of
-  // the instruction to the current memory usage. Placement is broken into two
-  // steps (BeginInstruction and EndInstruction) to accurately model memory
-  // usage. At BeginInstruction the memory for the output value of the current
-  // instruction is allocated. At EndInstruction memory for dead operands is
-  // freed.
-  Status BeginInstruction(const HloInstruction* instruction) {
-    VLOG(3) << "BeginInstruction " << instruction->name();
-    TF_RET_CHECK(in_progress_instruction_ == nullptr);
-    in_progress_instruction_ = instruction;
-
-    // Add instruction to remaining_uses_.
-    TF_RET_CHECK(!ContainsKey(remaining_uses_, instruction));
-    std::vector<HloInstruction*>& instruction_uses =
-        remaining_uses_[instruction];
-    instruction_uses.insert(instruction_uses.begin(),
-                            instruction->users().begin(),
-                            instruction->users().end());
-
-    if (!IsLiveIn(instruction)) {
-      // Instruction was not previously live so add output size to memory usage.
-      memory_usage_ += TotalSizeBytes(instruction->shape());
+  // Create a new buffer representing a rematerialization of given buffer for
+  // the given uses.
+  Buffer& RematerializeBuffer(
+      const Buffer& original_buffer, const HloInstruction* remat_instruction,
+      std::vector<const HloInstruction*>&& rematerialized_uses) {
+    CHECK(IsPlaced(original_buffer.defining_instruction));
+    CHECK(!original_buffer.has_indirect_uses);
+    CHECK(!original_buffer.live_out);
+    for (const HloInstruction* use : rematerialized_uses) {
+      CHECK(!IsPlaced(use));
     }
+    return NewBuffer(remat_instruction, original_buffer.size,
+                     std::move(rematerialized_uses), /*live_out=*/false,
+                     /*has_indirect_uses=*/false);
+  }
+
+  // Return number of bytes allocated for the buffer with the given id. Buffers
+  // allocated by the calling computation (eg, parameter and output buffers) are
+  // considered to have zero bytes because the memory is accounted for in a
+  // different computation.
+  int64 AllocatedSize(BufferId buffer_id) const {
+    const Buffer& buffer = buffers_.at(buffer_id);
+    HloOpcode def_opcode = buffer.defining_instruction->opcode();
+    if (buffer.live_out || def_opcode == HloOpcode::kParameter) {
+      return 0;
+    } else {
+      return buffer.size;
+    }
+  }
 
-    VLOG(3) << "  memory usage = " << memory_usage_;
-    VLOG(10) << ToString();
-    return Status::OK();
+  // Returns true if BeginInstruction and EndInstruction has been called for the
+  // given instruction.
+  bool IsFinished(const HloInstruction* instruction) const {
+    return IsPlaced(instruction) && instruction != in_progress_instruction_;
   }
 
-  // Finishes the placement of the current instruction. This frees any dead
-  // operands or dead result of the instruction. This must be called after each
-  // call to BeginInstruction.
-  Status EndInstruction() {
-    TF_RET_CHECK(in_progress_instruction_ != nullptr);
-    VLOG(3) << "EndInstruction " << in_progress_instruction_->name();
-
-    for (HloInstruction* operand : UniqueOperands(in_progress_instruction_)) {
-      TF_RET_CHECK(ContainsKey(remaining_uses_, operand));
-      std::vector<HloInstruction*>& uses = remaining_uses_.at(operand);
-      auto it = std::find(uses.begin(), uses.end(), in_progress_instruction_);
-      TF_RET_CHECK(it != uses.end());
-      uses.erase(it);
-
-      if (uses.empty()) {
-        // Operand is dead.
-        int64 operand_size = TotalSizeBytes(operand->shape());
-        if (!IsLiveOut(operand)) {
-          VLOG(4) << operand->name() << " ("
-                  << HumanReadableNumBytes(operand_size) << ") is dead";
-          memory_usage_ -= operand_size;
-          TF_RET_CHECK(memory_usage_ >= 0);
+  // Returns whether the given buffer is being used by the in-progress
+  // instruction.
+  bool IsInUse(BufferId buffer_id) const {
+    if (in_progress_instruction_ == nullptr) {
+      return false;
+    }
+    const std::vector<BufferId>& in_progress_uses =
+        buffers_used_by_instruction_.at(in_progress_instruction_);
+    return std::find(in_progress_uses.begin(), in_progress_uses.end(),
+                     buffer_id) != in_progress_uses.end();
+  }
+
+  // Returns whether the given instruction is live at the current program
+  // point.
+  bool IsCurrentlyLive(BufferId buffer_id) const {
+    const Buffer& buffer = buffers_[buffer_id];
+    return (IsPlaced(buffer.defining_instruction) &&
+            buffer.unfinished_user_count > 0);
+  }
+
+  // Create a new buffer, add it to buffers_, and return a reference.
+  Buffer& NewBuffer(const HloInstruction* defining_instruction, int64 size,
+                    std::vector<const HloInstruction*>&& users, bool live_out,
+                    bool has_indirect_uses) {
+    int buffer_id = buffers_.size();
+    buffers_.push_back(Buffer{buffer_id, defining_instruction, size, live_out,
+                              has_indirect_uses, users,
+                              static_cast<int64>(users.size())});
+    return buffers_.back();
+  }
+
+  const HloComputation* computation_;
+
+  // Instruction list containing the ordering of instructions in
+  // computation_. This is the order in which instructions are placed
+  // (BeginInstruction/EndInstruction calls).
+  const InstructionList& instruction_list_;
+
+  // Memory usage at the currently placed instruction.
+  int64 memory_usage_ = 0;
+
+  // The instruction currently being placed. This value is non-null only
+  // between the calling of BeginInstruction and EndInstruction.
+  const HloInstruction* in_progress_instruction_ = nullptr;
+
+  // The buffers defined by each instruction.
+  std::unordered_map<const HloInstruction*, std::vector<BufferId>>
+      buffers_defined_by_instruction_;
+
+  // The buffers used by each instruction.
+  std::unordered_map<const HloInstruction*, std::vector<BufferId>>
+      buffers_used_by_instruction_;
+
+  // The set of instructions which have been placed. That is, BeginInstruction
+  // has been called with the instruction as an argument.
+  tensorflow::gtl::FlatSet<const HloInstruction*> placed_instructions_;
+
+  // All buffers in the computation.
+  std::vector<Buffer> buffers_;
+};
+
+MemoryUsageTracker::MemoryUsageTracker(
+    const HloComputation* computation,
+    const HloRematerialization::ShapeSizeFunction& size_function,
+    const TuplePointsToAnalysis& points_to_analysis,
+    const InstructionList& instruction_list)
+    : computation_(computation), instruction_list_(instruction_list) {
+  // Iterate through all LogicalBuffers in the computation and gather the
+  // instructions which define them in buffers_defined_by_instruction_ and the
+  // instructions which use them in buffers_used_by_instruction_.
+  for (auto& instruction : computation_->instructions()) {
+    // Initialize empty vectors for defs and uses of each instruction.
+    buffers_used_by_instruction_[instruction.get()];
+    buffers_defined_by_instruction_[instruction.get()];
+  }
+
+  tensorflow::gtl::FlatSet<const LogicalBuffer*> live_out_set =
+      points_to_analysis.GetPointsToSet(computation_->root_instruction())
+          .CreateFlattenedSet();
+  tensorflow::gtl::FlatMap<const LogicalBuffer*, BufferId>
+      logical_buffer_to_buffer_id;
+
+  for (const HloInstruction* instruction : instruction_list_.instructions()) {
+    for (const LogicalBuffer* logical_buffer :
+         points_to_analysis.GetBuffersDefinedByInstruction(instruction)) {
+      Buffer* buffer;
+      if (instruction->opcode() == HloOpcode::kWhile) {
+        // The while instruction defines no new buffers. Instead it reuses the
+        // buffers of its operand. Find the Buffer of its operand at the
+        // proper ShapeIndex.
+        const PointsToSet& operand_points_to =
+            points_to_analysis.GetPointsToSet(instruction->operand(0));
+        CHECK_EQ(operand_points_to.element(logical_buffer->index()).size(), 1);
+        const LogicalBuffer* source_logical_buffer =
+            operand_points_to.element(logical_buffer->index())[0];
+        buffer =
+            &buffers_.at(logical_buffer_to_buffer_id.at(source_logical_buffer));
+
+        // Mark buffer as has indirect use and live out.
+        buffer->has_indirect_uses = true;
+        buffer->live_out =
+            buffer->live_out || ContainsKey(live_out_set, logical_buffer);
+
+        // Add users of while to Buffer users.
+        bool unused;
+        for (const HloInstruction* user :
+             GetUsers(logical_buffer, points_to_analysis, &unused)) {
+          if (std::find(buffer->users.begin(), buffer->users.end(), user) ==
+              buffer->users.end()) {
+            buffer->users.push_back(user);
+            buffer->unfinished_user_count++;
+            buffers_used_by_instruction_.at(user).push_back(buffer->id);
+          }
+        }
+      } else {
+        buffer = &CreateBufferFromLogicalBuffer(
+            logical_buffer, points_to_analysis, size_function,
+            ContainsKey(live_out_set, logical_buffer));
+        buffers_defined_by_instruction_.at(instruction).push_back(buffer->id);
+        for (const HloInstruction* user : buffer->users) {
+          buffers_used_by_instruction_.at(user).push_back(buffer->id);
         }
       }
-    }
 
-    // Value is dead if the instruction has no uses and is not live out.
-    if (in_progress_instruction_->users().empty() &&
-        !IsLiveOut(in_progress_instruction_)) {
-      memory_usage_ -= TotalSizeBytes(in_progress_instruction_->shape());
-      TF_RET_CHECK(memory_usage_ >= 0);
+      logical_buffer_to_buffer_id[logical_buffer] = buffer->id;
     }
+  }
+  XLA_VLOG_LINES(10, ToString());
+  DCHECK(Check());
+}
+
+Status MemoryUsageTracker::BeginInstruction(const HloInstruction* instruction) {
+  VLOG(3) << "BeginInstruction " << instruction->name();
+  TF_RET_CHECK(in_progress_instruction_ == nullptr);
+  in_progress_instruction_ = instruction;
 
-    in_progress_instruction_ = nullptr;
+  placed_instructions_.insert(in_progress_instruction_);
 
-    VLOG(3) << "  memory usage = " << memory_usage_;
-    VLOG(10) << ToString();
-    return Status::OK();
+  // All buffers defined by this instruction need memory.
+  for (BufferId buffer_id : buffers_defined_by_instruction_.at(instruction)) {
+    VLOG(3) << "  Buffer " << buffers_.at(buffer_id).ToString()
+            << " is now live.";
+    memory_usage_ += AllocatedSize(buffer_id);
   }
 
-  // Adjusts memory usage to account for the rematerialization of
-  // original_instruction for the given use. The rematerialization is
-  // remat_instruction. This method should be called after the HLO graph has
-  // been transformed (rematerialization instruction created and connected to
-  // its use).
-  Status RematerializeInstructionForUse(HloInstruction* original_instruction,
-                                        HloInstruction* remat_instruction,
-                                        HloInstruction* use) {
-    VLOG(3) << "RematerializeInstructionForUse: original_instruction = "
-            << original_instruction->name()
-            << ", remat_instruction = " << remat_instruction->name()
-            << ", use = " << use->name();
-
-    TF_RET_CHECK(in_progress_instruction_ != nullptr);
-    TF_RET_CHECK(IsPlaced(original_instruction));
-    TF_RET_CHECK(!IsPlaced(remat_instruction));
-    TF_RET_CHECK(!IsPlaced(use));
-    TF_RET_CHECK(IsCurrentlyLive(original_instruction));
-
-    // Remove 'use' from remaining uses of original_instruction.
-    auto it = std::find(remaining_uses_[original_instruction].begin(),
-                        remaining_uses_[original_instruction].end(), use);
-    TF_RET_CHECK(it != remaining_uses_[original_instruction].end());
-    remaining_uses_[original_instruction].erase(it);
-
-    // If original_instruction is no longer live ('use' was its last use) then
-    // deduct original_instruction's memory usage.
-    if (!IsCurrentlyLive(original_instruction)) {
-      memory_usage_ -= TotalSizeBytes(original_instruction->shape());
-      TF_RET_CHECK(memory_usage_ >= 0);
+  // TODO(b/37686934): Elementwise instructions can share the buffer of a (dead)
+  // operand. Account for this potential reuse here.
+
+  VLOG(3) << "  memory usage = " << memory_usage_;
+  VLOG(10) << ToString();
+
+  DCHECK(Check());
+  return Status::OK();
+}
+
+Status MemoryUsageTracker::EndInstruction() {
+  TF_RET_CHECK(in_progress_instruction_ != nullptr);
+  VLOG(3) << "EndInstruction " << in_progress_instruction_->name();
+
+  for (BufferId buffer_id :
+       buffers_used_by_instruction_.at(in_progress_instruction_)) {
+    Buffer& buffer = buffers_.at(buffer_id);
+    buffer.unfinished_user_count--;
+    CHECK_GE(buffer.unfinished_user_count, 0)
+        << buffer.ToString() << " has negative unfinished use count.";
+    if (buffer.unfinished_user_count == 0) {
+      // Buffer is now dead.
+      VLOG(3) << "  " << buffer.ToString() << " is now dead.";
+      memory_usage_ -= AllocatedSize(buffer_id);
+      CHECK_GE(memory_usage_, 0);
     }
+  }
 
-    // Add the new remat_instruction to the remaining uses of its operands.
-    for (auto* operand : UniqueOperands(remat_instruction)) {
-      // Rematerialization may extend the lifetime of the operand so account for
-      // this in memory_usage_.
-      TF_RET_CHECK(IsPlaced(operand));
-      if (!IsCurrentlyLive(operand)) {
-        memory_usage_ += TotalSizeBytes(operand->shape());
-      }
-      remaining_uses_.at(operand).push_back(remat_instruction);
+  // If any buffer defined by this instruction has no uses, then memory can be
+  // reclaimed immediately.
+  for (BufferId buffer_id :
+       buffers_defined_by_instruction_.at(in_progress_instruction_)) {
+    const Buffer& buffer = buffers_.at(buffer_id);
+    if (buffer.unfinished_user_count == 0) {
+      VLOG(3) << "  " << buffer.ToString() << " is immediately dead.";
+      memory_usage_ -= AllocatedSize(buffer_id);
+      CHECK_GE(memory_usage_, 0);
     }
+  }
+
+  in_progress_instruction_ = nullptr;
+
+  VLOG(3) << "  memory usage = " << memory_usage_;
+  VLOG(10) << ToString();
+
+  DCHECK(Check());
 
-    VLOG(3) << "  memory usage = " << memory_usage_;
-    VLOG(10) << ToString();
-    return Status::OK();
+  return Status::OK();
+}
+
+int64 MemoryUsageTracker::MemoryReducedIfRematerialized(
+    const HloInstruction* instruction) const {
+  CHECK_NE(in_progress_instruction_, nullptr);
+  if (!IsPlaced(instruction) || instruction == in_progress_instruction_) {
+    return 0;
   }
 
-  // Returns the number of bytes that the current memory usage will be reduced
-  // if the given instruction is rematerialized.
-  int64 MemoryReducedIfRematerialized(const HloInstruction* instruction) const {
-    // To reduce memory consumption 'instruction' must be currently live and
-    // rematerialization must make 'instruction' not live.
-    if (IsLiveIn(instruction) || IsLiveOut(instruction) ||
-        !IsCurrentlyLive(instruction)) {
+  // TODO(b/37687140): Rematerialization can increase peak memory consumption at
+  // an earlier point in the program if rematerialization extends the live range
+  // of the operand of the instruction being rematerialized across the live
+  // range of the value of instruction being rematerialized. Don't rematerialize
+  // in this case (ie, return 0 here).
+
+  // Compute the amount of memory reduced (if any) by rematerializing
+  // 'instruction'. The LogicalBuffers defined by 'instruction' will no longer
+  // be live at this program point, so initially set memory_reduced to the
+  // size of its defined values.
+  int64 memory_reduced = 0;
+  for (BufferId buffer_id : buffers_defined_by_instruction_.at(instruction)) {
+    // Avoid rematerializing instructions with indirect uses as it is difficult
+    // to reason about liveness after rematerializing the instruction.
+    // TODO(b/37714814): Consider rematerialzing instructions with indirect
+    // uses.
+    if (buffers_.at(buffer_id).has_indirect_uses) {
       return 0;
     }
 
-    // If the in-progress instruction is a user of 'instruction' (or
-    // 'instruction' itself) then rematerializing 'instruction' cannot reduce
-    // memory usage because the value is required to be live at this program
-    // point.
-    if (in_progress_instruction_ == instruction ||
-        in_progress_instruction_->IsUserOf(instruction)) {
-      return 0;
+    if (IsCurrentlyLive(buffer_id) && !IsInUse(buffer_id)) {
+      memory_reduced += AllocatedSize(buffer_id);
     }
+  }
 
-    // Compute the amount of memory reduced (if any) by rematerializing
-    // 'instruction'. 'instruction' will no longer be live at this program
-    // point, so initially set memory_reduced to the size of its output value.
-    int64 memory_reduced = TotalSizeBytes(instruction->shape());
-
-    // Account for any operands whose live range must be extended across this
-    // program point.
-    for (const HloInstruction* operand : UniqueOperands(instruction)) {
-      if (!IsCurrentlyLive(operand)) {
-        // This operand of candidate is not live at this program
-        // point. Rematerializing 'instruction' will extend the operand's live
-        // range across this program point.
-        memory_reduced -= TotalSizeBytes(operand->shape());
-      }
+  // Account for any logical buffers whose live range must be extended across
+  // this program point.
+  for (BufferId buffer_id : buffers_used_by_instruction_.at(instruction)) {
+    if (!IsCurrentlyLive(buffer_id)) {
+      // This logical buffer is used by 'instruction' but is not live at this
+      // program point. Rematerializing 'instruction' will extend the buffer's
+      // live range across this program point.
+      memory_reduced -= AllocatedSize(buffer_id);
     }
-    return memory_reduced;
   }
 
-  // Returns the remaining unplaced uses of the given instruction.
-  const std::vector<HloInstruction*>& RemainingUses(
-      const HloInstruction* instruction) const {
-    return remaining_uses_.at(instruction);
+  return memory_reduced;
+}
+
+Status MemoryUsageTracker::AddRematerializedInstruction(
+    HloInstruction* original_instruction, HloInstruction* remat_instruction) {
+  VLOG(3) << "AddRematerializedInstruction: original_instruction = "
+          << original_instruction->name()
+          << ", remat_instruction = " << remat_instruction->name();
+
+  TF_RET_CHECK(in_progress_instruction_ != nullptr);
+  TF_RET_CHECK(IsPlaced(original_instruction));
+  TF_RET_CHECK(!IsPlaced(remat_instruction));
+  CHECK(!ContainsKey(buffers_defined_by_instruction_, remat_instruction));
+  CHECK(!ContainsKey(buffers_used_by_instruction_, remat_instruction));
+
+  // Construct the list of buffers used and defined by the rematerialization.
+  buffers_defined_by_instruction_[remat_instruction];
+  buffers_used_by_instruction_[remat_instruction] =
+      buffers_used_by_instruction_.at(original_instruction);
+
+  // Account for the additional buffer uses created by the new rematerialization
+  // instruction. Update memory usage if the rematerialization makes a dead
+  // buffer live again.
+  for (BufferId buffer_id :
+       buffers_used_by_instruction_.at(original_instruction)) {
+    Buffer& buffer = buffers_.at(buffer_id);
+    if (buffer.unfinished_user_count == 0) {
+      // Buffer used by this instruction was dead, now is alive.
+      memory_usage_ += AllocatedSize(buffer.id);
+    }
+
+    buffer.unfinished_user_count++;
+    buffer.users.push_back(remat_instruction);
   }
 
-  // Returns whether the given instruction has been placed (BeginInstruction has
-  // been called with 'instruction' as the argument).
-  bool IsPlaced(const HloInstruction* instruction) const {
-    return ContainsKey(remaining_uses_, instruction);
-  }
-
-  // Returns whether the given instruction is live at the current program point.
-  bool IsCurrentlyLive(const HloInstruction* instruction) const {
-    return (!IsPlaced(instruction) && IsLiveIn(instruction)) ||
-           (IsPlaced(instruction) &&
-            (!RemainingUses(instruction).empty() || IsLiveOut(instruction)));
-  }
-
-  string ToString() const {
-    string output = tensorflow::strings::StrCat("MemoryUsageTracker for ",
-                                                computation_->name(), "\n");
-    tensorflow::strings::StrAppend(&output, "memory usage = ", memory_usage(),
-                                   "\n");
-    tensorflow::strings::StrAppend(&output, "Live values:\n");
-    for (const auto& pair : remaining_uses_) {
-      const HloInstruction* instruction = pair.first;
-      const std::vector<HloInstruction*>& uses = pair.second;
-      tensorflow::strings::StrAppend(
-          &output, "  ", instruction->name(), "; remaining uses: ",
-          tensorflow::str_util::Join(uses, ", ",
-                                     [](string* out, HloInstruction* use) {
-                                       tensorflow::strings::StrAppend(
-                                           out, use->name());
-                                     }),
-          "\n");
+  // Create a new set of Buffers defined by the new rematerialization
+  // instruction. Update the internal data structures and memory use to account
+  // for them.
+  for (BufferId old_buffer_id :
+       buffers_defined_by_instruction_.at(original_instruction)) {
+    Buffer& old_buffer = buffers_.at(old_buffer_id);
+
+    std::vector<const HloInstruction*> placed_users;
+    std::vector<const HloInstruction*> unplaced_users;
+    for (const HloInstruction* user : old_buffer.users) {
+      if (IsPlaced(user)) {
+        CHECK(IsFinished(user));
+        placed_users.push_back(user);
+      } else {
+        unplaced_users.push_back(user);
+      }
+    }
+    old_buffer.users = std::move(placed_users);
+    old_buffer.unfinished_user_count = 0;
+
+    // Buffer is now dead.
+    memory_usage_ -= AllocatedSize(old_buffer.id);
+
+    Buffer& new_buffer = RematerializeBuffer(old_buffer, remat_instruction,
+                                             std::move(unplaced_users));
+
+    buffers_defined_by_instruction_.at(remat_instruction)
+        .push_back(new_buffer.id);
+    for (const HloInstruction* user : new_buffer.users) {
+      std::vector<BufferId>& buffers_used =
+          buffers_used_by_instruction_.at(user);
+      std::replace(buffers_used.begin(), buffers_used.end(), old_buffer_id,
+                   new_buffer.id);
     }
-    return output;
   }
 
-  // Returns the current memory usage. This is the sum of sizes of all live
-  // values.
-  int64 memory_usage() const { return memory_usage_; }
+  VLOG(3) << "  memory usage = " << memory_usage_;
+  XLA_VLOG_LINES(10, ToString());
 
-  // Returns the current instruction being placed.
-  const HloInstruction* in_progress_instruction() const {
-    return in_progress_instruction_;
-  }
+  DCHECK(Check());
 
- private:
-  // Returns the total size of the shape (including nested elements) in bytes.
-  int64 TotalSizeBytes(const Shape& shape) const {
-    int64 total_size = 0;
-    ShapeUtil::ForEachSubshape(
-        shape,
-        [this, &total_size](const Shape& subshape,
-                            const ShapeIndex& /*index*/) {
-          total_size += size_function_(subshape);
-          return Status::OK();
-        })
-        .IgnoreError();
-    return total_size;
-  }
-
-  // Returns true if the value of given instruction is live into the
-  // computation.
-  bool IsLiveIn(const HloInstruction* instruction) const {
-    return instruction->opcode() == HloOpcode::kConstant ||
-           instruction->opcode() == HloOpcode::kParameter;
-  }
-
-  // Returns true if the value of given instruction is live out of the
-  // computation.
-  bool IsLiveOut(const HloInstruction* instruction) const {
-    return instruction->opcode() == HloOpcode::kConstant ||
-           instruction->opcode() == HloOpcode::kParameter ||
-           instruction == instruction->parent()->root_instruction();
+  return Status::OK();
+}
+
+string MemoryUsageTracker::ToString() const {
+  string output = tensorflow::strings::StrCat("MemoryUsageTracker for ",
+                                              computation_->name(), "\n");
+  tensorflow::strings::StrAppend(
+      &output, "Memory usage: ", HumanReadableNumBytes(memory_usage()), " (",
+      memory_usage(), " bytes)");
+  for (const HloInstruction* instruction : instruction_list_.instructions()) {
+    string inprogress =
+        instruction == in_progress_instruction_ ? " in-progress" : "";
+    string placed = IsPlaced(instruction) ? " placed" : "";
+    tensorflow::strings::StrAppend(&output, "  ", instruction->name(),
+                                   inprogress, placed, "\n    Defines:\n");
+    for (BufferId buffer_id : buffers_defined_by_instruction_.at(instruction)) {
+      const Buffer& buffer = buffers_[buffer_id];
+      string live = IsCurrentlyLive(buffer_id) ? " live" : "";
+      tensorflow::strings::StrAppend(&output, "      ", buffer.ToString(), live,
+                                     ", ", buffer.unfinished_user_count,
+                                     " unfinished uses\n");
+    }
+    tensorflow::strings::StrAppend(&output, "    Uses:\n");
+    for (BufferId buffer_id : buffers_used_by_instruction_.at(instruction)) {
+      tensorflow::strings::StrAppend(&output, "      ",
+                                     buffers_[buffer_id].ToString(), "\n");
+    }
   }
+  return output;
+}
 
-  const HloComputation* computation_;
+bool MemoryUsageTracker::Check() const {
+  auto elements_are_unique = [](const std::vector<BufferId>& vec) {
+    return vec.size() == std::set<BufferId>(vec.begin(), vec.end()).size();
+  };
+
+  // Verify buffers_defined_by_instruction_.
+  for (auto& instruction : computation_->instructions()) {
+    const std::vector<BufferId>& defined_buffers =
+        buffers_defined_by_instruction_.at(instruction.get());
+    CHECK(elements_are_unique(defined_buffers))
+        << "Instruction " << instruction->name()
+        << " does not have unique defined buffers: "
+        << tensorflow::str_util::Join(
+               defined_buffers, ", ", [this](string* out, BufferId buffer_id) {
+                 tensorflow::strings::StrAppend(
+                     out, buffers_.at(buffer_id).ToString());
+               });
 
-  // Function which computes the size of the top-level buffer of a shape.
-  const HloRematerialization::ShapeSizeFunction size_function_;
+    for (const Buffer& buffer : buffers_) {
+      if (buffer.defining_instruction == instruction.get()) {
+        CHECK(std::find(defined_buffers.begin(), defined_buffers.end(),
+                        buffer.id) != defined_buffers.end())
+            << "Instruction " << instruction->name()
+            << " defined buffers is missing: " << buffer.ToString();
+      }
+    }
+  }
 
-  // Memory usage at the currently placed instruction.
-  int64 memory_usage_ = 0;
+  // Verify buffers_used_by_instruction_.
+  for (auto& instruction : computation_->instructions()) {
+    const std::vector<BufferId>& used_buffers =
+        buffers_used_by_instruction_.at(instruction.get());
+    CHECK(elements_are_unique(used_buffers))
+        << "Instruction " << instruction->name()
+        << " does not have unique used buffers: "
+        << tensorflow::str_util::Join(
+               used_buffers, ", ", [this](string* out, BufferId buffer_id) {
+                 tensorflow::strings::StrAppend(
+                     out, buffers_.at(buffer_id).ToString());
+               });
+  }
+  for (const Buffer& buffer : buffers_) {
+    int64 unfinished_uses = 0;
+    for (const HloInstruction* user : buffer.users) {
+      const std::vector<BufferId>& used_buffers =
+          buffers_used_by_instruction_.at(user);
+      CHECK(std::find(used_buffers.begin(), used_buffers.end(), buffer.id) !=
+            used_buffers.end())
+          << "Instruction " << user->name() << " used buffers is missing "
+          << buffer.ToString();
+      if (!IsFinished(user)) {
+        unfinished_uses++;
+      }
+    }
+    CHECK_EQ(buffer.unfinished_user_count, unfinished_uses)
+        << "Incorrect unplaced use count for " << buffer.ToString();
+  }
 
-  // The instruction currently being placed. This value is non-null only between
-  // the calling of BeginInstruction and EndInstruction.
-  const HloInstruction* in_progress_instruction_ = nullptr;
+  // Verify live set size against memory_usage_.
+  int64 live_size = 0;
+  for (const Buffer& buffer : buffers_) {
+    // The while instruction reuses its input buffers as output buffers so
+    // don't double count its buffers if it is currently executing.
+    if (IsCurrentlyLive(buffer.id) &&
+        !(buffer.defining_instruction == in_progress_instruction_ &&
+          in_progress_instruction_->opcode() == HloOpcode::kWhile)) {
+      live_size += AllocatedSize(buffer.id);
+    }
+  }
+  CHECK_EQ(live_size, memory_usage_);
 
-  // remaining_uses is a vector of uses of the HLO instruction's value which
-  // have not yet been visited by in the rematerialization loop. Use to track
-  // liveness of HLO instructions.
-  // TODO(b/35212854): Track values using logical buffers rather than HLO
-  // instructions. Using HLO instructions over-estimates memory usage because
-  // buffer aliasing is ignored.
-  tensorflow::gtl::FlatMap<const HloInstruction*, std::vector<HloInstruction*>>
-      remaining_uses_;
-};
+  return true;
+}
 
-// Computes and returns the cost of rematerializing the given instruction. Cost
-// per rematerialized instruction is defined as:
+// Computes and returns the cost of rematerializing the given instruction.
+// Cost per rematerialized instruction is defined as:
 //
 // (flop_count + transcendental_count + element_count) / memory_reduced
 //
@@ -425,33 +808,36 @@ class MemoryUsageTracker {
 //     instruction.
 //
 // This is a rough estimate of the extra execution time per byte saved by
-// rematerializing this instruction for its remaining uses. In general, we want
-// the most memory saving for the least latency penalty which is captured by
-// this heuristic.
+// rematerializing this instruction for its remaining uses. In general, we
+// want the most memory saving for the least latency penalty which is captured
+// by this heuristic.
 int64 RematerializationCost(const HloInstruction* instruction,
                             const MemoryUsageTracker& memory_tracker,
                             const HloCostAnalysis& cost_analysis,
                             int64 memory_reduced) {
-  const int64 bytes_accessed = cost_analysis.bytes_accessed(*instruction);
-  const int64 elements_accessed =
-      bytes_accessed /
-      ShapeUtil::ByteSizeOfPrimitiveType(instruction->shape().element_type());
-
-  // A duplicate of the rematerialized instruction will be created at each
-  // remaining use.
-  int64 duplication = memory_tracker.RemainingUses(instruction).size();
-  if (duplication == instruction->users().size()) {
-    // All remaining uses of instruction are after this point so we can remove
-    // the original instruciton after rematerialization.
-    duplication -= 1;
+  // If none of the users of 'instruction' have been placed in the sequence (as
+  // tracked by memory_tracker), then rematerialization of 'instruction' is a
+  // zero-cost move of 'instruction' in the sequence.
+  if (!std::any_of(instruction->users().begin(), instruction->users().end(),
+                   [&memory_tracker](const HloInstruction* inst) {
+                     return memory_tracker.IsPlaced(inst);
+                   })) {
+    return 0;
   }
+
   CHECK_GT(memory_reduced, 0);
+  const int64 bytes_accessed = cost_analysis.bytes_accessed(*instruction);
+  const int64 elements_accessed =
+      ShapeUtil::IsTuple(instruction->shape())
+          ? bytes_accessed
+          : bytes_accessed / ShapeUtil::ByteSizeOfPrimitiveType(
+                                 instruction->shape().element_type());
 
   // Multiply by 256 to improve precision of cost. Without this factor,
   // many instructions such as many elementwise instructions would have
   // zero cost because the bytes reduced can be several times greater than
   // the element count.
-  return 256 * duplication *
+  return 256 *
          (cost_analysis.flop_count(*instruction) +
           cost_analysis.transcendental_count(*instruction) +
           elements_accessed) /
@@ -467,7 +853,7 @@ HloInstruction* PickRematerializationCandidate(
     const MemoryUsageTracker& memory_tracker,
     const InstructionList& instruction_list,
     const HloCostAnalysis& cost_analysis,
-    const tensorflow::gtl::FlatSet<const HloInstruction*>& remat_instructions) {
+    const tensorflow::gtl::FlatSet<const HloInstruction*>& blacklist) {
   HloInstruction* best = nullptr;
   int64 best_cost = 0;
 
@@ -482,11 +868,11 @@ HloInstruction* PickRematerializationCandidate(
     }
     VLOG(5) << "considering rematerialization candidate " << candidate->name();
 
-    if (ContainsKey(remat_instructions, candidate)) {
-      // Skip instructions which are rematerialization clones to avoid infinite
-      // loops of rematerializing the same instruction(s) repeatedly.
+    if (ContainsKey(blacklist, candidate)) {
+      // Skip instructions on the blacklist to avoid infinite loops of
+      // rematerializing the same instruction(s) repeatedly.
       VLOG(5) << "candidate " << candidate->name()
-              << " not viable: is a rematerialized instruction";
+              << " is excluded from rematerialization";
       continue;
     }
 
@@ -525,7 +911,9 @@ HloInstruction* PickRematerializationCandidate(
 StatusOr<int64> HloRematerialization::ComputePeakMemory(
     const HloComputation* computation,
     const std::vector<const HloInstruction*>& order) const {
-  MemoryUsageTracker tracker(computation, size_function_);
+  InstructionList instruction_list(order);
+  MemoryUsageTracker tracker(computation, size_function_, *points_to_analysis_,
+                             instruction_list);
   int64 peak_memory = tracker.memory_usage();
   for (const HloInstruction* instruction : order) {
     TF_RETURN_IF_ERROR(tracker.BeginInstruction(instruction));
@@ -542,9 +930,8 @@ StatusOr<int64> HloRematerialization::ComputePeakMemory(
 
 StatusOr<int64> HloRematerialization::CalledComputationsMemoryUsage(
     const HloInstruction* instruction) const {
-  TF_ASSIGN_OR_RETURN(const CallGraphNode* node,
-                      call_graph_->GetNode(instruction->parent()));
-  const CallSite* callsite = node->GetCallSite(instruction);
+  const CallSite* callsite =
+      call_graph_->GetNode(instruction->parent()).GetCallSite(instruction);
   if (callsite == nullptr || callsite->context() == CallContext::kParallel) {
     return 0;
   }
@@ -564,15 +951,24 @@ StatusOr<bool> HloRematerialization::RematerializeComputation(
           << " with limit " << HumanReadableNumBytes(memory_limit_bytes);
   VLOG(1) << "peak memory usage is "
           << HumanReadableNumBytes(computation_peak_memory_.at(computation));
+  CHECK(!ContainsKey(rematerialized_computations_, computation));
 
   InstructionList instruction_list(sequence->at(computation));
-  MemoryUsageTracker memory_tracker(computation, size_function_);
+  MemoryUsageTracker memory_tracker(computation, size_function_,
+                                    *points_to_analysis_, instruction_list);
   bool changed = false;
 
-  // Set of instruction clones (not the originals) created during
-  // rematerialization. A record is kept to avoid rematerializing an instruction
-  // more than once to avoid looping infinitely during rematerialization.
-  tensorflow::gtl::FlatSet<const HloInstruction*> remat_instructions;
+  // To avoid an infinite loop rematerializing the same set of instructions ad
+  // infinitum, keep a blacklist of instructions which should not be
+  // rematerialized.
+  tensorflow::gtl::FlatSet<const HloInstruction*> blacklist;
+
+  // If the rematerialization makes the source instruction dead, then the
+  // rematerialization is added to 'remat_move_instructions' (the
+  // rematerialization is essentially a move). If the next rematerialization of
+  // the instruction is also a move then the rematerialization is added to the
+  // blacklist.
+  tensorflow::gtl::FlatSet<const HloInstruction*> remat_move_instructions;
 
   // The peak memory of the computation at any point in the instruction
   // sequence.
@@ -584,12 +980,12 @@ StatusOr<bool> HloRematerialization::RematerializeComputation(
   // instructions which are dead.
   int64 net_instructions_added = 0;
 
-  TF_ASSIGN_OR_RETURN(const CallGraphNode* call_graph_node,
-                      call_graph_->GetNode(computation));
+  const CallGraphNode& call_graph_node = call_graph_->GetNode(computation);
 
   // Iterate through all instructions in the sequence. At each instruction
   // (program point) if memory_usage exceeds the specified limit then
   // rematerialize HLO instructions until memory_usage is reduced.
+  int64 instruction_index = 0;
   for (auto list_it = instruction_list.instructions().begin();
        list_it != instruction_list.instructions().end(); ++list_it) {
     HloInstruction* instruction = *list_it;
@@ -599,7 +995,9 @@ StatusOr<bool> HloRematerialization::RematerializeComputation(
 
     VLOG(2) << "Program point at " << instruction->name()
             << ", memory usage = " << memory_tracker.memory_usage()
-            << ", callee usage = " << callee_usage;
+            << ", callee usage = " << callee_usage << ", [" << instruction_index
+            << "/" << instruction_list.instructions().size() << "]";
+    instruction_index++;
 
     while (memory_tracker.memory_usage() + callee_usage > memory_limit_bytes) {
       VLOG(2) << "Over memory limit at instruction " << instruction->name()
@@ -609,7 +1007,7 @@ StatusOr<bool> HloRematerialization::RematerializeComputation(
               << ", limit is " << HumanReadableNumBytes(memory_limit_bytes);
 
       HloInstruction* best = PickRematerializationCandidate(
-          memory_tracker, instruction_list, cost_analysis_, remat_instructions);
+          memory_tracker, instruction_list, cost_analysis_, blacklist);
 
       if (best == nullptr) {
         VLOG(3) << "Unable to find rematerialization candidate at program "
@@ -620,44 +1018,42 @@ StatusOr<bool> HloRematerialization::RematerializeComputation(
         break;
       }
 
-      VLOG(1) << "Rematerializing instruction " << best->name();
+      VLOG(1) << "Rematerializing instruction " << best->name() << " (saving "
+              << memory_tracker.MemoryReducedIfRematerialized(best) << ")";
       changed = true;
       remat_count++;
 
-      // Create a rematerialized copy of the candidate at each remaining use.
-      // Make a copy of remaining uses because RematerializeInstructionForUse
-      // modifies the remaining uses vector in memory_tracker.
-      // TODO(b/35213652): It may be profitable to share one rematerialized copy
-      // amongst more than one use.
-      std::vector<HloInstruction*> remaining_uses_copy =
-          memory_tracker.RemainingUses(best);
-      for (HloInstruction* use : remaining_uses_copy) {
-        // Create a new rematerialized instruction in the HLO graph.
-        HloInstruction* remat =
-            computation->AddInstruction(best->Clone(/*suffix=*/"remat"));
-
-        VLOG(3) << "Replacing use of " << best->name() << " in " << use->name()
-                << " with rematerialization " << remat->name();
+      HloInstruction* remat =
+          computation->AddInstruction(best->Clone(/*suffix=*/"remat"));
 
-        TF_RETURN_IF_ERROR(best->ReplaceUseWith(use, remat));
-
-        // Account for the rematerialization in the memory tracker.
-        TF_RETURN_IF_ERROR(
-            memory_tracker.RematerializeInstructionForUse(best, remat, use));
-
-        // Insert rematerialized instruction right before its use.
-        TF_RETURN_IF_ERROR(instruction_list.InsertBefore(remat, use));
-
-        // Add rematerialized instruction to remat_instructions so the
-        // rematerialized instruction is not rematerialized again.
-        remat_instructions.insert(remat);
-
-        net_instructions_added++;
+      // Replace each remaining use of 'best' with the rematerialization.
+      std::vector<HloInstruction*> best_users_copy = best->users();
+      for (HloInstruction* user : best_users_copy) {
+        if (!memory_tracker.IsPlaced(user)) {
+          VLOG(2) << "  Replacing use of " << best->name() << " in "
+                  << user->name() << " with " << remat->name();
+          TF_RETURN_IF_ERROR(best->ReplaceUseWith(user, remat));
+        }
       }
 
-      // Original instruction should no longer be live at this point. All
-      // of its remaining uses are fed by rematerialized instructions.
-      TF_RET_CHECK(!memory_tracker.IsCurrentlyLive(best));
+      // Account for the rematerialization in the memory tracker.
+      TF_RETURN_IF_ERROR(
+          memory_tracker.AddRematerializedInstruction(best, remat));
+
+      // Insert rematerialized instruction right before the earliest unplaced
+      // use of the instruction *and* the earliest unplaced last use of any
+      // operands of remat. Unplaced uses of the remat's operands are included
+      // because we don't want to extend the live range of remat's operands as
+      // this could increase memory usage.
+      std::vector<HloInstruction*> place_before = remat->users();
+      for (auto* operand : remat->operands()) {
+        for (auto* operand_user : operand->users()) {
+          if (!memory_tracker.IsPlaced(operand_user) && operand_user != remat) {
+            place_before.push_back(operand_user);
+          }
+        }
+      }
+      instruction_list.InsertBeforeInstructions(remat, place_before);
 
       // If the rematerialized instruction is dead then rematerialization is
       // essentially a move. Don't delete the instruction now because we don't
@@ -665,15 +1061,24 @@ StatusOr<bool> HloRematerialization::RematerializeComputation(
       // transformation because we keep maps with HloInstruction* values as
       // keys.
       if (best->users().empty()) {
-        VLOG(3) << best->name() << " is now dead";
-        net_instructions_added--;
+        VLOG(2) << best->name() << " is now dead";
+        if (ContainsKey(remat_move_instructions, best)) {
+          // Previously, 'best' was a rematerialization which killed the
+          // instruction it was a copying of. Now 'remat' is a rematerialization
+          // of 'best' and kills 'best'. Stop rematerializing this instruction
+          // to avoid an infinite loop.
+          blacklist.insert(remat);
+        }
+        remat_move_instructions.insert(remat);
+      } else {
+        net_instructions_added++;
       }
 
       VLOG(3) << "memory_usage after rematerialization = "
               << memory_tracker.memory_usage();
     }
 
-    const CallSite* callsite = call_graph_node->GetCallSite(instruction);
+    const CallSite* callsite = call_graph_node.GetCallSite(instruction);
     if (callsite != nullptr &&
         callsite->context() == CallContext::kSequential &&
         memory_tracker.memory_usage() + callee_usage > memory_limit_bytes) {
@@ -687,21 +1092,22 @@ StatusOr<bool> HloRematerialization::RematerializeComputation(
 
       // Recompute callee usage to account for any rematerialization performed
       // in the callee computations.
-      callee_usage = 0;
       for (HloComputation* called_computation :
            callsite->called_computations()) {
-        // Memory limit for the subcomputation is the memory limit less the
-        // amount of memory used at this point in the computation.
-        int64 subcomputation_memory_limit_bytes = std::max<int64>(
-            0, memory_limit_bytes - memory_tracker.memory_usage());
-        TF_ASSIGN_OR_RETURN(
-            bool subcomputation_changed,
-            RematerializeComputation(called_computation, sequence,
-                                     subcomputation_memory_limit_bytes));
-        changed |= subcomputation_changed;
-
-        callee_usage += computation_peak_memory_.at(called_computation);
+        if (!ContainsKey(rematerialized_computations_, called_computation)) {
+          // Memory limit for the subcomputation is the memory limit less the
+          // amount of memory used at this point in the computation.
+          int64 subcomputation_memory_limit_bytes = std::max<int64>(
+              0, memory_limit_bytes - memory_tracker.memory_usage());
+          TF_ASSIGN_OR_RETURN(
+              bool subcomputation_changed,
+              RematerializeComputation(called_computation, sequence,
+                                       subcomputation_memory_limit_bytes));
+          changed |= subcomputation_changed;
+        }
       }
+      TF_ASSIGN_OR_RETURN(callee_usage,
+                          CalledComputationsMemoryUsage(instruction));
     }
 
     peak_memory = std::max<int64>(peak_memory,
@@ -711,37 +1117,33 @@ StatusOr<bool> HloRematerialization::RematerializeComputation(
     TF_RETURN_IF_ERROR(memory_tracker.EndInstruction());
   }
 
-  if (peak_memory > memory_limit_bytes) {
-    LOG(WARNING) << "Can't reduce memory use of computation "
-                 << computation->name() << " below "
-                 << HumanReadableNumBytes(memory_limit_bytes)
-                 << " by rematerialization (only reduced to "
-                 << HumanReadableNumBytes(peak_memory) << ")";
-  }
-
-  // Verify that there are no more remaining uses.
+  // Verify some invariants on the memory tracker.
+  CHECK_EQ(memory_tracker.memory_usage(), 0);
   for (auto& instruction : computation->instructions()) {
-    auto& remaining_uses = memory_tracker.RemainingUses(instruction.get());
-    CHECK(remaining_uses.empty())
-        << instruction->name() << " has remaining uses: "
-        << tensorflow::str_util::Join(
-               remaining_uses, ", ", [](string* out, HloInstruction* inst) {
-                 tensorflow::strings::StrAppend(out, inst->name());
-               });
+    CHECK(memory_tracker.IsPlaced(instruction.get()));
   }
 
-  VLOG(1) << "Rematerialized " << remat_count << " instructions; "
-          << net_instructions_added << " net instructions added";
-  VLOG(1) << "peak memory usage now " << HumanReadableNumBytes(peak_memory);
+  VLOG(1) << "In computation " << computation->name() << " rematerialized "
+          << remat_count << " instructions; " << net_instructions_added
+          << " net instructions added";
+  VLOG(1) << "  peak memory usage now " << HumanReadableNumBytes(peak_memory)
+          << " (was "
+          << HumanReadableNumBytes(computation_peak_memory_.at(computation))
+          << ")";
 
   // Update peak memory used by computation.
-  computation_peak_memory_[computation] = peak_memory;
+  computation_peak_memory_.at(computation) = peak_memory;
 
   // Update order to include rematerialized instructions.
   sequence->at(computation)
       .assign(instruction_list.instructions().begin(),
               instruction_list.instructions().end());
 
+  rematerialized_computations_.insert(computation);
+
+  instructions_rematerialized_ += remat_count;
+  net_instructions_added_ += net_instructions_added;
+
   return changed;
 }
 
@@ -754,6 +1156,28 @@ StatusOr<bool> HloRematerialization::Run(
   VLOG(1) << "HloRematerialization() with memory limit of "
           << HumanReadableNumBytes(memory_limit_bytes);
 
+  TF_ASSIGN_OR_RETURN(points_to_analysis_, TuplePointsToAnalysis::Run(module));
+
+  // Adjust memory limit to account for the output of the entry
+  // computation. This is necessary because the per-computation accounting in
+  // MemoryUsageTracker do not include output as these are typically allocated
+  // by the caller.
+  int64 module_output_size = 0;
+  ShapeUtil::ForEachSubshape(
+      module->entry_computation()->root_instruction()->shape(),
+      [&module_output_size, this](const Shape& subshape,
+                                  const ShapeIndex& /*index*/) {
+        module_output_size += size_function_(subshape);
+        return Status::OK();
+      })
+      .IgnoreError();
+
+  const int64 adjusted_memory_limit_bytes =
+      memory_limit_bytes - module_output_size;
+  VLOG(1) << "Adjusted memory limit accounting for output ("
+          << HumanReadableNumBytes(module_output_size)
+          << "): " << HumanReadableNumBytes(adjusted_memory_limit_bytes);
+
   XLA_VLOG_LINES(3, "Before HloRematerialization:\n" + module->ToString());
   // Create initial sequence of HLO instructions.
   TF_ASSIGN_OR_RETURN(*sequence,
@@ -761,10 +1185,9 @@ StatusOr<bool> HloRematerialization::Run(
                           *module, [this](const LogicalBuffer& buffer) {
                             return size_function_(buffer.shape());
                           }));
-
   // Compute peak memory usage of all computations in the module called in a
   // sequential context.
-  TF_ASSIGN_OR_RETURN(call_graph_, CallGraph::Build(module));
+  call_graph_ = CallGraph::Build(module);
   TF_RETURN_IF_ERROR(call_graph_->VisitNodes(
       [this, sequence](const CallGraphNode& node) -> Status {
         if (node.context() == CallContext::kSequential) {
@@ -776,9 +1199,15 @@ StatusOr<bool> HloRematerialization::Run(
         return Status::OK();
       }));
 
+  // The peak memory usage of the module equals the peak memory use of the entry
+  // computation plus the output size of the computation. This is because the
+  // peak memory for a computation does not include the output as this is
+  // typically accounted for in the caller.
+  const int64 before_peak_memory =
+      computation_peak_memory_.at(module->entry_computation()) +
+      module_output_size;
   VLOG(1) << "Peak memory usage of module (before): "
-          << HumanReadableNumBytes(
-                 computation_peak_memory_[module->entry_computation()]);
+          << HumanReadableNumBytes(before_peak_memory);
 
   // Run cost analysis. Operation cost is used in the heuristic for selecting
   // instructions for rematerialization.
@@ -787,9 +1216,9 @@ StatusOr<bool> HloRematerialization::Run(
 
   // Subcomputations called by the entry computation will also be
   // rematerialized.
-  TF_ASSIGN_OR_RETURN(bool changed,
-                      RematerializeComputation(module->entry_computation(),
-                                               sequence, memory_limit_bytes));
+  TF_ASSIGN_OR_RETURN(bool changed, RematerializeComputation(
+                                        module->entry_computation(), sequence,
+                                        adjusted_memory_limit_bytes));
 
   // Rematerialization can introduce dead code. This occurs if all uses of an
   // instruction are replaced with rematerializations of the instruction.
@@ -824,19 +1253,38 @@ StatusOr<bool> HloRematerialization::Run(
                    computation->instruction_count());
     }
   }
-
-  VLOG(1) << "Peak memory usage of module (after): "
-          << HumanReadableNumBytes(
-                 computation_peak_memory_[module->entry_computation()]);
+  VLOG(1) << "Rematerialized " << instructions_rematerialized_
+          << " instructions in module " << module->name() << "; "
+          << net_instructions_added_ << " net instructions added";
+  const int64 current_peak_memory =
+      computation_peak_memory_.at(module->entry_computation()) +
+      module_output_size;
+  VLOG(1) << "Peak memory usage of module now "
+          << HumanReadableNumBytes(current_peak_memory) << " ("
+          << current_peak_memory << " bytes), was "
+          << HumanReadableNumBytes(before_peak_memory) << " ("
+          << before_peak_memory << " bytes)";
+  const int64 reduced_peak_memory = before_peak_memory - current_peak_memory;
+  VLOG(1) << "Reduced peak memory by "
+          << HumanReadableNumBytes(reduced_peak_memory) << " ("
+          << reduced_peak_memory << " bytes)";
 
   XLA_VLOG_LINES(3, "After HloRematerialization:\n" + module->ToString());
 
+  if (current_peak_memory > memory_limit_bytes) {
+    LOG(WARNING) << "Can't reduce memory use below "
+                 << HumanReadableNumBytes(memory_limit_bytes)
+                 << " by rematerialization (only reduced to "
+                 << HumanReadableNumBytes(current_peak_memory) << ")";
+  }
+
   return changed;
 }
 
 /* static */ StatusOr<bool> HloRematerialization::RematerializeAndSchedule(
-    const ShapeSizeFunction& size_function, int64 memory_limit_bytes,
-    HloModule* hlo_module, SequentialHloOrdering::HloModuleSequence* sequence) {
+    const HloRematerialization::ShapeSizeFunction& size_function,
+    int64 memory_limit_bytes, HloModule* hlo_module,
+    SequentialHloOrdering::HloModuleSequence* sequence) {
   HloRematerialization remat(size_function);
   return remat.Run(hlo_module, sequence, memory_limit_bytes);
 }
diff --git a/tensorflow/compiler/xla/service/hlo_rematerialization.h b/tensorflow/compiler/xla/service/hlo_rematerialization.h
index 86e1998b89454f75b1c10d0de2118fd1034c134d..1693f93183bc59c343e3c765cb4051566d4377ef 100644
--- a/tensorflow/compiler/xla/service/hlo_rematerialization.h
+++ b/tensorflow/compiler/xla/service/hlo_rematerialization.h
@@ -21,6 +21,7 @@
 #include "tensorflow/compiler/xla/service/hlo_cost_analysis.h"
 #include "tensorflow/compiler/xla/service/hlo_instruction.h"
 #include "tensorflow/compiler/xla/service/hlo_module.h"
+#include "tensorflow/compiler/xla/service/tuple_points_to_analysis.h"
 
 namespace xla {
 
@@ -108,6 +109,23 @@ class HloRematerialization {
   // occurs.
   tensorflow::gtl::FlatMap<const HloComputation*, int64>
       computation_peak_memory_;
+
+  std::unique_ptr<TuplePointsToAnalysis> points_to_analysis_;
+
+  // Set of computations which have had rematerialization
+  // applied. Rematerialization is only applied once per computation.
+  tensorflow::gtl::FlatSet<const HloComputation*> rematerialized_computations_;
+
+  // Count of the total instructions rematerialized.
+  int64 instructions_rematerialized_ = 0;
+
+  // Count of the net instructions added to the HLO module by
+  // rematerialization. This can be different than instructions_rematerialized_
+  // because some rematerializations are effectively moves in the HLO
+  // schedule. In these cases, the rematerialization instruction replaces all
+  // uses of the original instruction and the original instruction is
+  // dead. Hence, no net instructions were added.
+  int64 net_instructions_added_ = 0;
 };
 
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/hlo_rematerialization_test.cc b/tensorflow/compiler/xla/service/hlo_rematerialization_test.cc
index 0a4f2776891cfc932b4fc0627daaa9b5408f420a..2a1d728bc84067e6ad7f1f622216ab39b2b474d3 100644
--- a/tensorflow/compiler/xla/service/hlo_rematerialization_test.cc
+++ b/tensorflow/compiler/xla/service/hlo_rematerialization_test.cc
@@ -20,6 +20,7 @@ limitations under the License.
 
 #include "tensorflow/compiler/xla/service/hlo_computation.h"
 #include "tensorflow/compiler/xla/service/hlo_instruction.h"
+#include "tensorflow/compiler/xla/service/hlo_matchers.h"
 #include "tensorflow/compiler/xla/service/hlo_opcode.h"
 #include "tensorflow/compiler/xla/service/hlo_ordering.h"
 #include "tensorflow/compiler/xla/shape_util.h"
@@ -30,12 +31,16 @@ limitations under the License.
 namespace xla {
 namespace {
 
-class HloOrderingTest : public HloTestBase {
+namespace op = xla::testing::opcode_matchers;
+
+using ::testing::_;
+
+class HloRematerializationTest : public HloTestBase {
  protected:
   // Creates and returns a computation which can benefit from
   // rematerialization. The computation looks like:
   //
-  //   F32[1] %param = {...}
+  //   F32[] %param = {...}
   //   F32[1024] %bcast = broadcast(%param)
   //   F32[1024] %negate = negate(%bcast)
   //   F32[2048] %concat_1 = concat({%negate, %negate})
@@ -52,7 +57,7 @@ class HloOrderingTest : public HloTestBase {
       const string& suffix = "") {
     auto builder = HloComputation::Builder(TestName() + suffix);
     auto param = builder.AddInstruction(
-        HloInstruction::CreateParameter(0, vec1_shape_, "param"));
+        HloInstruction::CreateParameter(0, scalar_shape_, "param"));
     auto bcast = builder.AddInstruction(
         HloInstruction::CreateBroadcast(vec1024_shape_, param, {}));
     auto negate = builder.AddInstruction(
@@ -77,7 +82,7 @@ class HloOrderingTest : public HloTestBase {
   // Creates and returns a computation which includes a while and can benefit
   // from rematerialization. The computation looks like:
   //
-  //   F32[1] %param = {...}
+  //   F32[] %param = {...}
   //   F32[1024] %bcast = broadcast(%param)
   //   F32[1] %slice_1 = slice(%bcast, {0:1})
   //   F32[1] %while = while(%slice_1, while_body, while_cond)
@@ -93,7 +98,7 @@ class HloOrderingTest : public HloTestBase {
       const string& suffix = "") {
     auto builder = HloComputation::Builder(TestName() + suffix);
     auto param = builder.AddInstruction(
-        HloInstruction::CreateParameter(0, vec1_shape_, "param"));
+        HloInstruction::CreateParameter(0, scalar_shape_, "param"));
     auto bcast = builder.AddInstruction(
         HloInstruction::CreateBroadcast(vec1024_shape_, param, {}));
     auto slice_1 = builder.AddInstruction(
@@ -127,13 +132,14 @@ class HloOrderingTest : public HloTestBase {
   }
 
   // Various shapes used in the canned computations.
+  const Shape scalar_shape_ = ShapeUtil::MakeShape(xla::F32, {});
   const Shape vec1_shape_ = ShapeUtil::MakeShape(xla::F32, {1});
   const Shape vec1024_shape_ = ShapeUtil::MakeShape(xla::F32, {1024});
 };
 
 // Test rematerialization of a single computation produced by
 // MakeRematerializableComputation.
-TEST_F(HloOrderingTest, SingleComputation) {
+TEST_F(HloRematerializationTest, SingleComputation) {
   HloModule module(TestName());
   HloComputation* computation =
       module.AddEntryComputation(MakeRematerializableComputation());
@@ -141,11 +147,9 @@ TEST_F(HloOrderingTest, SingleComputation) {
   // Find and save the original broadcast instruction which should be
   // rematerialized.
   const HloInstruction* slice = computation->root_instruction();
-  ASSERT_EQ(HloOpcode::kSlice, slice->opcode());
+  ASSERT_THAT(slice, op::Slice(op::Concatenate(op::Broadcast(_), _)));
   const HloInstruction* concat = slice->operand(0);
-  ASSERT_EQ(HloOpcode::kConcatenate, concat->opcode());
   const HloInstruction* bcast = concat->operand(0);
-  ASSERT_EQ(HloOpcode::kBroadcast, bcast->opcode());
 
   SequentialHloOrdering::HloModuleSequence sequence;
   // Computation requires 16KB without rematerialization, but uses only 12KB
@@ -161,8 +165,7 @@ TEST_F(HloOrderingTest, SingleComputation) {
 
   // The broadcast should have been rematerialized.
   const HloInstruction* remat_bcast = concat->operand(0);
-  EXPECT_EQ(HloOpcode::kBroadcast, remat_bcast->opcode());
-  EXPECT_NE(bcast, remat_bcast);
+  EXPECT_THAT(remat_bcast, op::Broadcast(::testing::Ne(bcast)));
 
   // The rematerialized broadcast should be immediate before the concat in the
   // sequence.
@@ -175,7 +178,7 @@ TEST_F(HloOrderingTest, SingleComputation) {
 // Test rematerialization of a single computation produced by
 // MakeRematerializableComputation but with a sufficiently high memory limit
 // such that no instructions are rematerialized.
-TEST_F(HloOrderingTest, SingleComputationNoRematerialization) {
+TEST_F(HloRematerializationTest, SingleComputationNoRematerialization) {
   HloModule module(TestName());
   HloComputation* computation =
       module.AddEntryComputation(MakeRematerializableComputation());
@@ -199,7 +202,7 @@ TEST_F(HloOrderingTest, SingleComputationNoRematerialization) {
 // only one computation needs to have an instruction rematerialized. The entry
 // computation should be the one chosen because rematerialization in the while
 // will presumably be more expensive.
-TEST_F(HloOrderingTest, RematerializeAroundWhile) {
+TEST_F(HloRematerializationTest, RematerializeAroundWhile) {
   HloModule module(TestName());
 
   auto cond_builder = HloComputation::Builder(TestName() + ".cond");
@@ -237,7 +240,7 @@ TEST_F(HloOrderingTest, RematerializeAroundWhile) {
 // Test rematerialization of a computation which calls another computation via a
 // while. Both the entry computation and while body computation should have
 // computations rematerialized.
-TEST_F(HloOrderingTest, RematerializeEntryAndWhileBody) {
+TEST_F(HloRematerializationTest, RematerializeEntryAndWhileBody) {
   HloModule module(TestName());
 
   auto cond_builder = HloComputation::Builder(TestName() + ".cond");
@@ -271,7 +274,7 @@ TEST_F(HloOrderingTest, RematerializeEntryAndWhileBody) {
 
 // Test rematerialization of a doubly nested computation. All computations
 // should have an instruction rematerialized.
-TEST_F(HloOrderingTest, RematerializeNestedComputations) {
+TEST_F(HloRematerializationTest, RematerializeNestedComputations) {
   HloModule module(TestName());
 
   auto cond_builder = HloComputation::Builder(TestName() + ".cond");
@@ -311,6 +314,203 @@ TEST_F(HloOrderingTest, RematerializeNestedComputations) {
   EXPECT_EQ(inner_computation->instruction_count(), 8);
 }
 
+TEST_F(HloRematerializationTest, InstructionRematerializedMultipleTimes) {
+  // Test that a single instruction is rematerialized several times. Module:
+  //
+  // Entry computation:
+  //   F32[] %param = {...}
+  //   F32[1024] %bcast = broadcast(%param)
+  //   F32[1024] %add_1 = add(%bcast, bcast)
+  //   F32[1024] %call_1 = call(Subcomputation, {%add_1})
+  //   F32[1024] %add_2 = add(%bcast, call_1)
+  //   F32[1024] %call_2 = call(SubComputation, {%add_2})
+  //   F32[1024] %add_3 = add(%bcast, call_2)
+  //   F32[1024] %call_3 = call(Subcomputation, {%add_3})
+  //   F32[1024] %add_4 = add(%bcast, call_3)
+  //
+  // Subcomputation:
+  //   F32[1024] %param = {...}
+  //   F32[2048] %concat = concat({%param, %param})
+  //   F32[1024] %slice = slice(%concat)
+  //
+  // The value %bcast is live across each call of Subcomputation (which requires
+  // 8KB) though the value is not used in the calls. Rematerializing %bcast
+  // across these calls reduces peak memory use from ~20KB down to ~16KB.
+  HloModule module(TestName());
+
+  HloComputation* subcomputation = nullptr;
+  {
+    auto builder = HloComputation::Builder(TestName() + ".subcomputation");
+    auto param = builder.AddInstruction(
+        HloInstruction::CreateParameter(0, vec1024_shape_, "param"));
+    auto concat = builder.AddInstruction(HloInstruction::CreateConcatenate(
+        ShapeUtil::MakeShape(xla::F32, {2048}), {param, param},
+        /*dimension=*/0));
+    builder.AddInstruction(HloInstruction::CreateSlice(
+        vec1024_shape_, concat, /*start_indices=*/{0},
+        /*limit_indices=*/{1024}));
+    subcomputation = module.AddEmbeddedComputation(builder.Build());
+  }
+
+  auto builder = HloComputation::Builder(TestName());
+  auto param = builder.AddInstruction(
+      HloInstruction::CreateParameter(0, scalar_shape_, "param"));
+  auto bcast = builder.AddInstruction(
+      HloInstruction::CreateBroadcast(vec1024_shape_, param, {}));
+  auto add_1 = builder.AddInstruction(HloInstruction::CreateBinary(
+      vec1024_shape_, HloOpcode::kAdd, bcast, bcast));
+  auto call_1 = builder.AddInstruction(
+      HloInstruction::CreateCall(vec1024_shape_, {add_1}, subcomputation));
+  auto add_2 = builder.AddInstruction(HloInstruction::CreateBinary(
+      vec1024_shape_, HloOpcode::kAdd, bcast, call_1));
+  auto call_2 = builder.AddInstruction(
+      HloInstruction::CreateCall(vec1024_shape_, {add_2}, subcomputation));
+  auto add_3 = builder.AddInstruction(HloInstruction::CreateBinary(
+      vec1024_shape_, HloOpcode::kAdd, bcast, call_2));
+  auto call_3 = builder.AddInstruction(
+      HloInstruction::CreateCall(vec1024_shape_, {add_3}, subcomputation));
+  auto add_4 = builder.AddInstruction(HloInstruction::CreateBinary(
+      vec1024_shape_, HloOpcode::kAdd, bcast, call_3));
+  HloComputation* entry_computation =
+      module.AddEntryComputation(builder.Build());
+
+  auto count_broadcasts = [](const HloComputation* computation) {
+    int64 bcast_count = 0;
+    for (auto& instruction : computation->instructions()) {
+      if (instruction->opcode() == HloOpcode::kBroadcast) {
+        bcast_count++;
+      }
+    }
+    return bcast_count;
+  };
+
+  // Before rematerialization there should be a single broadcast instruction in
+  // the graph.
+  EXPECT_EQ(count_broadcasts(entry_computation), 1);
+  EXPECT_EQ(entry_computation->instruction_count(), 9);
+
+  EXPECT_EQ(add_2->operand(0), bcast);
+  EXPECT_EQ(add_3->operand(0), bcast);
+  EXPECT_EQ(add_4->operand(0), bcast);
+
+  SequentialHloOrdering::HloModuleSequence sequence;
+  // Pick a memory limit some where between 24KB (initial peak memory including
+  // parameter and output) and 20KB (peak memory possible with
+  // rematerialization).
+  TF_ASSIGN_OR_ASSERT_OK(
+      bool changed, HloRematerialization::RematerializeAndSchedule(
+                        ByteSizeOf,
+                        /*memory_limit_bytes=*/22 * 1024, &module, &sequence));
+  EXPECT_TRUE(changed);
+
+  // The broadcast should have been rematerialized 3 times.
+  EXPECT_EQ(count_broadcasts(entry_computation), 4);
+  EXPECT_EQ(entry_computation->instruction_count(), 12);
+
+  // The operands of add_2, add_3, and add_4 should all be rematerialized
+  // broadcasts.
+  EXPECT_NE(add_2->operand(0), bcast);
+  EXPECT_THAT(add_2->operand(0), op::Broadcast(param));
+  EXPECT_NE(add_3->operand(0), bcast);
+  EXPECT_THAT(add_3->operand(0), op::Broadcast(param));
+  EXPECT_NE(add_4->operand(0), bcast);
+  EXPECT_THAT(add_4->operand(0), op::Broadcast(param));
+}
+
+class IndirectUseTest : public HloRematerializationTest,
+                        public ::testing::WithParamInterface<bool> {};
+
+TEST_P(IndirectUseTest, IndirectUseNotRematerialized) {
+  // Test that an rematerializable instruction is not rematerialized if it has
+  // an indirect use. Test is parameterized on whether the value has an indirect
+  // use, and the instruction should be rematerialized iff the value has no
+  // indirect use. Module:
+  //
+  // Entry computation:
+  //   F32[] %param = {...}
+  //   F32[1024] %bcast = broadcast(%param)
+  //   F32[1024] %add_1 = add(%bcast, bcast)
+  //   F32[1024] %call = call(Subcomputation, {%add_1})
+  //   F32[1024] %add_2 = add(%bcast, call)
+  //   {F32[1024], F32[1024]} %tuple = tuple(%bcast, %add_2)
+  //   F32[1024] %gte = GetTupleElememt(%tuple, 0)
+  //   F32[1024] %negate = negate(%gte)
+  //
+  // Subcomputation:
+  //   F32[1024] %param = {...}
+  //   F32[2048] %concat = concat({%param, %param})
+  //   F32[1024] %slice = slice(%concat)
+  //
+  // The value %bcast is live across the call and rematerialization of %bcast
+  // across that point would reduce peak memory use by 4KB. However, %bcast is
+  // used indirectly in the %negate so rematerialization should not happen.
+  //
+  // This test is parameterized on whether the broadcast has an indirect use or
+  // not. The indirect use is controlled by the index of the GetTupleElement
+  // instruction. If the element is 0, then the %negate operand aliases %bcast
+  // (ie %bcast is used indirectly by %negate), otherwise the %negate operand
+  // aliases %add_2.
+  const bool indirectly_used = GetParam();
+  HloModule module(TestName());
+
+  HloComputation* subcomputation = nullptr;
+  {
+    auto builder = HloComputation::Builder(TestName() + ".subcomputation");
+    auto param = builder.AddInstruction(
+        HloInstruction::CreateParameter(0, vec1024_shape_, "param"));
+    auto concat = builder.AddInstruction(HloInstruction::CreateConcatenate(
+        ShapeUtil::MakeShape(xla::F32, {2048}), {param, param},
+        /*dimension=*/0));
+    builder.AddInstruction(HloInstruction::CreateSlice(
+        vec1024_shape_, concat, /*start_indices=*/{0},
+        /*limit_indices=*/{1024}));
+    subcomputation = module.AddEmbeddedComputation(builder.Build());
+  }
+
+  auto builder = HloComputation::Builder(TestName());
+  auto param = builder.AddInstruction(
+      HloInstruction::CreateParameter(0, scalar_shape_, "param"));
+  auto bcast = builder.AddInstruction(
+      HloInstruction::CreateBroadcast(vec1024_shape_, param, {}));
+  auto add_1 = builder.AddInstruction(HloInstruction::CreateBinary(
+      vec1024_shape_, HloOpcode::kAdd, bcast, bcast));
+  auto call_1 = builder.AddInstruction(
+      HloInstruction::CreateCall(vec1024_shape_, {add_1}, subcomputation));
+  auto add_2 = builder.AddInstruction(HloInstruction::CreateBinary(
+      vec1024_shape_, HloOpcode::kAdd, bcast, call_1));
+  auto tuple =
+      builder.AddInstruction(HloInstruction::CreateTuple({bcast, add_2}));
+  auto gte = builder.AddInstruction(HloInstruction::CreateGetTupleElement(
+      vec1024_shape_, tuple, indirectly_used ? 0 : 1));
+  builder.AddInstruction(
+      HloInstruction::CreateUnary(vec1024_shape_, HloOpcode::kNegate, gte));
+  HloComputation* entry_computation =
+      module.AddEntryComputation(builder.Build());
+
+  EXPECT_EQ(entry_computation->instruction_count(), 8);
+
+  SequentialHloOrdering::HloModuleSequence sequence;
+  // Pick a memory limit some where between 24KB (initial peak memory including
+  // parameter and output) and 20KB (peak memory possible with
+  // rematerialization).
+  TF_ASSIGN_OR_ASSERT_OK(
+      bool changed, HloRematerialization::RematerializeAndSchedule(
+                        ByteSizeOf,
+                        /*memory_limit_bytes=*/22 * 1024, &module, &sequence));
+  // Rematerialization should only occur if the rematerializable instruction has
+  // no indirect uses.
+  if (indirectly_used) {
+    EXPECT_FALSE(changed);
+    EXPECT_EQ(entry_computation->instruction_count(), 8);
+  } else {
+    EXPECT_TRUE(changed);
+    EXPECT_EQ(entry_computation->instruction_count(), 9);
+  }
+}
+
+INSTANTIATE_TEST_CASE_P(IndirectUseTestInstantiation, IndirectUseTest,
+                        ::testing::Values(true, false));
+
 }  // namespace
 
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/hlo_tfgraph_builder.cc b/tensorflow/compiler/xla/service/hlo_tfgraph_builder.cc
index fdc1c0ba2d78bed66ead05cf71177ddabbe80108..2b14eca5d1b36fbe8b863cb32d64c79fb56ce761 100644
--- a/tensorflow/compiler/xla/service/hlo_tfgraph_builder.cc
+++ b/tensorflow/compiler/xla/service/hlo_tfgraph_builder.cc
@@ -68,9 +68,8 @@ void CleanNodeName(string* name) {
 }
 
 Status HloTfGraphBuilder::AddComputation(const HloComputation& computation) {
-  LOG(INFO) << "Adding computation " << computation.name();
+  VLOG(2) << "Adding computation " << computation.name();
   for (auto embedded : computation.MakeEmbeddedComputationsList()) {
-    LOG(INFO) << "Adding embedded computation " << embedded->name();
     for (auto& instruction : embedded->instructions()) {
       TF_RETURN_IF_ERROR(AddInstruction(instruction.get()));
     }
@@ -88,12 +87,18 @@ const string& HloTfGraphBuilder::GetNodeNameForInstruction(
   if (ContainsKey(instruction_to_node_name_, instruction)) {
     return instruction_to_node_name_[instruction];
   }
+  string node_name;
   // If an instruction is fused, put it in the subgraph of the fusion;
   // otherwise, put it in the computation subgraph.
-  string node_name =
-      instruction->IsFused()
-          ? GetNodeNameForInstruction(instruction->fusion_instruction())
-          : instruction->parent()->name();
+  if (instruction->IsFused()) {
+    node_name = GetNodeNameForInstruction(instruction->fusion_instruction());
+  } else {
+    node_name = instruction->parent()->name();
+    if (!instruction->metadata().op_name().empty()) {
+      // Always make computations contain TF ops but not the other way around.
+      StrAppend(&node_name, "/", instruction->metadata().op_name());
+    }
+  }
   string instruction_name = instruction->name();
   if (instruction->opcode() == HloOpcode::kParameter) {
     StrAppend(&instruction_name, ".", instruction->parameter_number());
diff --git a/tensorflow/compiler/xla/service/hlo_tfgraph_builder_test.cc b/tensorflow/compiler/xla/service/hlo_tfgraph_builder_test.cc
index df664080228e6e5a682aa1772e89f3380c898852..6041debc4ae0ccbaad99bec9a461b640aeffbccf 100644
--- a/tensorflow/compiler/xla/service/hlo_tfgraph_builder_test.cc
+++ b/tensorflow/compiler/xla/service/hlo_tfgraph_builder_test.cc
@@ -137,6 +137,28 @@ TEST_F(HloTfGraphBuilderTest, GreaterThanOrEqualTo) {
   EXPECT_EQ(graph_def.node(2).op(), "HloGreaterThanOrEqualTo");
 }
 
+TEST_F(HloTfGraphBuilderTest, IncorparateTfOpsStructure) {
+  auto builder = HloComputation::Builder("GE");
+  auto param_1 = builder.AddInstruction(
+      HloInstruction::CreateParameter(0, r0f32_, "param0"));
+  auto param_2 = builder.AddInstruction(
+      HloInstruction::CreateParameter(1, r0f32_, "param1"));
+  auto ge = builder.AddInstruction(
+      HloInstruction::CreateBinary(r0f32_, HloOpcode::kGe, param_1, param_2));
+  OpMetadata metadata;
+  metadata.set_op_name("x/y");
+  metadata.set_op_type("Y");
+  ge->set_metadata(metadata);
+  TF_CHECK_OK(generator_.AddComputation(*builder.Build()));
+  GraphDef graph_def = generator_.GetGraphDef();
+  EXPECT_EQ(graph_def.node_size(), 3);
+  EXPECT_EQ(graph_def.node(0).name(), "GE/param0.0");
+  EXPECT_EQ(graph_def.node(1).name(), "GE/param1.1");
+  EXPECT_EQ(graph_def.node(2).input_size(), 2);
+  EXPECT_EQ(graph_def.node(2).name(), "GE/x/y/greater-than-or-equal-to");
+  EXPECT_EQ(graph_def.node(2).op(), "HloGreaterThanOrEqualTo");
+}
+
 TEST_F(HloTfGraphBuilderTest, EmbeddedComputationsDiamond) {
   // Create computations with a diamond-shaped callgraph.
   auto negate_computation = CreateNegateComputation();
diff --git a/tensorflow/compiler/xla/service/layout_assignment.cc b/tensorflow/compiler/xla/service/layout_assignment.cc
index 5e7bd4a7ce8a1152973979d4a8fdb790a7fbd219..6384f737b601000d5a9cc2386e5c896ca3a74b50 100644
--- a/tensorflow/compiler/xla/service/layout_assignment.cc
+++ b/tensorflow/compiler/xla/service/layout_assignment.cc
@@ -659,44 +659,6 @@ LayoutAssignment::LayoutAssignment(ComputationLayout* entry_computation_layout)
   }
 }
 
-namespace {
-
-// Given a pemutation of `{0, 1, ..., n}` `indices`, returns a permutation of
-// `{0, 1, ..., n - to_delete.size() + to_insert.size()}` by deleting the
-// indices `to_delete` wherever in `indices` they are, and inserting the indices
-// `to_insert` arbitrarily at the back.
-tensorflow::protobuf::RepeatedField<tensorflow::protobuf_int64>
-DeleteAndInsertIndices(
-    std::vector<int64> to_delete, std::vector<int64> to_insert,
-    tensorflow::protobuf::RepeatedField<tensorflow::protobuf_int64> indices) {
-  std::sort(to_delete.begin(), to_delete.end(), std::greater<int64>());
-  std::sort(to_insert.begin(), to_insert.end(), std::less<int64>());
-  for (auto index : to_delete) {
-    auto i = indices.begin();
-    while (i != indices.end()) {
-      if (*i == index) {
-        i = indices.erase(i);
-      } else {
-        if (*i > index) {
-          (*i)--;
-        }
-        ++i;
-      }
-    }
-  }
-  for (auto index : to_insert) {
-    for (auto i = indices.begin(); i != indices.end(); ++i) {
-      if (*i >= index) {
-        (*i)++;
-      }
-    }
-    indices.Add(index);
-  }
-  return indices;
-}
-
-}  // namespace
-
 std::unique_ptr<Layout> LayoutAssignment::ChooseOperandLayoutFromOutputLayout(
     const Layout& output_layout, const HloInstruction* instruction,
     int64 operand_no) {
@@ -705,7 +667,8 @@ std::unique_ptr<Layout> LayoutAssignment::ChooseOperandLayoutFromOutputLayout(
   CHECK(ShapeUtil::IsArray(instruction->shape()) &&
         ShapeUtil::IsArray(operand->shape()));
 
-  if (instruction->IsElementwiseOnOperand(operand_no) &&
+  if ((instruction->IsElementwiseOnOperand(operand_no) ||
+       InstructionRequiresInputLayoutEqualToOutputLayout(instruction)) &&
       !ShapeUtil::IsScalar(operand->shape()) &&
       ShapeUtil::Rank(operand->shape()) ==
           ShapeUtil::Rank(instruction->shape())) {
@@ -719,21 +682,32 @@ std::unique_ptr<Layout> LayoutAssignment::ChooseOperandLayoutFromOutputLayout(
   }
 
   if (instruction->opcode() == HloOpcode::kReshape) {
-    // Pick the operand layout that makes the reshape a bitcast. If the reshape
-    // only inserts or deletes degenerate dimensions, we can easily compute the
-    // desired layout by accordingly inserting and deleting the elements in the
-    // minor-to-major list.
-    bool merely_inserts_or_deletes_1_sized_dims;
-    std::vector<int64> inserted_indices, deleted_indices;
-    std::tie(merely_inserts_or_deletes_1_sized_dims, deleted_indices,
-             inserted_indices) =
-        instruction->ReshapeMerelyInsertsOrDeletes1SizedDimensions();
-    if (merely_inserts_or_deletes_1_sized_dims) {
-      Layout operand_layout = LayoutUtil::MakeLayout(
-          AsInt64Slice(DeleteAndInsertIndices(inserted_indices, deleted_indices,
-                                              output_layout.minor_to_major())));
+    // Prefer the operand layout that makes the reshape an bitcast. If any
+    // dimension bound is 1 in the operand shape, there may be several such
+    // layouts. So if 'output_layout' is a MajorToMinor layout, try if the
+    // reshape is a bitcast when using the same layout. This may avoid copy
+    // operations.
+    const Shape& output_shape = instruction->shape();
+    Shape output_shape_with_layout = ShapeUtil::MakeShapeWithLayout(
+        output_shape.element_type(), AsInt64Slice(output_shape.dimensions()),
+        AsInt64Slice(output_layout.minor_to_major()));
+    const Shape& operand_shape = operand->shape();
+    if (LayoutUtil::IsMonotonicWithDim0Major(output_layout)) {
+      Shape operand_shape_with_layout =
+          ShapeUtil::MakeShapeWithMonotonicDim0MajorLayout(
+              operand_shape.element_type(),
+              AsInt64Slice(operand_shape.dimensions()));
+      if (ShapeUtil::ReshapeIsBitcast(operand_shape_with_layout,
+                                      output_shape_with_layout)) {
+        return MakeUnique<Layout>(operand_shape_with_layout.layout());
+      }
+    }
+    auto aligned_operand_shape =
+        ShapeUtil::AlignLayouts(output_shape_with_layout, operand_shape);
+    if (aligned_operand_shape) {
+      auto operand_layout = aligned_operand_shape.value().layout();
       TF_CHECK_OK(
-          LayoutUtil::ValidateLayoutForShape(operand_layout, operand->shape()));
+          LayoutUtil::ValidateLayoutForShape(operand_layout, operand_shape));
       return MakeUnique<Layout>(operand_layout);
     }
   }
@@ -768,18 +742,32 @@ std::unique_ptr<Layout> LayoutAssignment::ChooseOutputLayoutFromOperandLayout(
   }
 
   if (user->opcode() == HloOpcode::kReshape) {
-    // Pick the user layout that makes the reshape a bitcast.
-    bool merely_inserts_or_deletes_1_sized_dims;
-    std::vector<int64> inserted_indices, deleted_indices;
-    std::tie(merely_inserts_or_deletes_1_sized_dims, deleted_indices,
-             inserted_indices) =
-        user->ReshapeMerelyInsertsOrDeletes1SizedDimensions();
-    if (merely_inserts_or_deletes_1_sized_dims) {
-      Layout user_layout = LayoutUtil::MakeLayout(AsInt64Slice(
-          DeleteAndInsertIndices(deleted_indices, inserted_indices,
-                                 operand_layout.minor_to_major())));
+    // Prefer the user layout that makes the reshape an bitcast. If any
+    // dimension bound is 1 in the user shape, there may be several such
+    // layouts. So if 'operand_layout' is a MajorToMinor layout, try if the
+    // reshape is a bitcast when using the same layout. This may avoid copy
+    // operations.
+    Shape operand_shape_with_layout = ShapeUtil::MakeShapeWithLayout(
+        operand->shape().element_type(),
+        AsInt64Slice(operand->shape().dimensions()),
+        AsInt64Slice(operand_layout.minor_to_major()));
+    const Shape& output_shape = user->shape();
+    if (LayoutUtil::IsMonotonicWithDim0Major(operand_layout)) {
+      Shape output_shape_with_layout =
+          ShapeUtil::MakeShapeWithMonotonicDim0MajorLayout(
+              output_shape.element_type(),
+              AsInt64Slice(output_shape.dimensions()));
+      if (ShapeUtil::ReshapeIsBitcast(output_shape_with_layout,
+                                      operand_shape_with_layout)) {
+        return MakeUnique<Layout>(output_shape_with_layout.layout());
+      }
+    }
+    auto aligned_user_shape =
+        ShapeUtil::AlignLayouts(operand_shape_with_layout, output_shape);
+    if (aligned_user_shape) {
+      auto user_layout = aligned_user_shape.value().layout();
       TF_CHECK_OK(
-          LayoutUtil::ValidateLayoutForShape(user_layout, user->shape()));
+          LayoutUtil::ValidateLayoutForShape(user_layout, output_shape));
       return MakeUnique<Layout>(user_layout);
     }
   }
@@ -1040,7 +1028,7 @@ StatusOr<Layout> InferArrayLayout(
                                   *first_buffer_layout)) {
       // The points-to set is ambiguous for this index and the different source
       // buffers have different layouts. This case is possible in valid XLA
-      // computations because we do not propagate BufferLayoutConstaints to all
+      // computations because we do not propagate BufferLayoutConstraints to all
       // LogicalBuffers which may alias the constrained LogicalBuffer at some
       // point in the computation.
       return FailedPrecondition(
@@ -1253,7 +1241,7 @@ Status LayoutAssignment::RunOnComputation(
   TF_ASSIGN_OR_RETURN(auto points_to_analysis,
                       TuplePointsToAnalysis::Run(computation->parent()));
 
-  // Construct LayoutConstaints with all layout constraints of the computation.
+  // Construct LayoutConstraints with all layout constraints of the computation.
   LayoutConstraints constraints(*points_to_analysis, computation);
 
   // Add constraints required for correctness on all backends (eg, entry
diff --git a/tensorflow/compiler/xla/service/layout_assignment.h b/tensorflow/compiler/xla/service/layout_assignment.h
index 61dc7b120752d57cf09423f38546441de2fc8dd9..4f586c334dcdcb02cd7586750d39d6663c0f2703 100644
--- a/tensorflow/compiler/xla/service/layout_assignment.h
+++ b/tensorflow/compiler/xla/service/layout_assignment.h
@@ -248,6 +248,15 @@ class LayoutAssignment : public HloPassInterface {
     return Status::OK();
   }
 
+  // This method can be overriden to mark instructions as requiring the operands
+  // to have the same layout as the result, for performance or correctness. This
+  // will propagate constraints through the instruction from the result into the
+  // operands.
+  virtual bool InstructionRequiresInputLayoutEqualToOutputLayout(
+      const HloInstruction* instruction) {
+    return false;
+  }
+
   // Construct contraints and assign layouts to all instructions in the
   // computation satisfying the given ComputationLayout. Layouts constraints are
   // added, then propagated until all LogicalBuffers in the computation are
diff --git a/tensorflow/compiler/xla/service/layout_assignment_test.cc b/tensorflow/compiler/xla/service/layout_assignment_test.cc
index b6451738bdb4df8ce06efc2becd9f14aef92254d..c6df9839c33a86ee4d96ccece6ffdf4f496bc6fc 100644
--- a/tensorflow/compiler/xla/service/layout_assignment_test.cc
+++ b/tensorflow/compiler/xla/service/layout_assignment_test.cc
@@ -45,6 +45,8 @@ namespace op = xla::testing::opcode_matchers;
 namespace xla {
 namespace {
 
+using ::testing::ElementsAre;
+
 class LayoutAssignmentTest : public HloTestBase {
  protected:
   void AssignLayouts(HloModule* module,
@@ -317,7 +319,7 @@ TEST_F(LayoutAssignmentTest, ElementwiseAndReshape) {
   // param -> log -> reshape -> tanh
   auto builder = HloComputation::Builder(TestName());
   Shape ashape = ShapeUtil::MakeShape(F32, {1, 2, 3, 1});
-  Shape bshape = ShapeUtil::MakeShape(F32, {2, 1, 3});
+  Shape bshape = ShapeUtil::MakeShape(F32, {3, 1, 2});
   auto param = builder.AddInstruction(
       HloInstruction::CreateParameter(0, ashape, "param"));
   auto log = builder.AddInstruction(
@@ -332,8 +334,8 @@ TEST_F(LayoutAssignmentTest, ElementwiseAndReshape) {
 
   Shape ashape_with_layout(ashape);
   Shape bshape_with_layout(bshape);
-  *ashape_with_layout.mutable_layout() = LayoutUtil::MakeLayout({0, 1, 2, 3});
-  *bshape_with_layout.mutable_layout() = LayoutUtil::MakeLayout({0, 1, 2});
+  *ashape_with_layout.mutable_layout() = LayoutUtil::MakeLayout({0, 2, 1, 3});
+  *bshape_with_layout.mutable_layout() = LayoutUtil::MakeLayout({2, 1, 0});
 
   ComputationLayout computation_layout(computation->ComputeProgramShape());
   *computation_layout.mutable_parameter_layout(0) =
@@ -343,12 +345,12 @@ TEST_F(LayoutAssignmentTest, ElementwiseAndReshape) {
 
   auto log_minor_to_major =
       AsInt64Slice(log->shape().layout().minor_to_major());
-  EXPECT_LT(PositionInContainer(log_minor_to_major, 1),
+  EXPECT_GT(PositionInContainer(log_minor_to_major, 1),
             PositionInContainer(log_minor_to_major, 2));
 
   auto reshape_minor_to_major =
       AsInt64Slice(reshape->shape().layout().minor_to_major());
-  EXPECT_LT(PositionInContainer(reshape_minor_to_major, 0),
+  EXPECT_GT(PositionInContainer(reshape_minor_to_major, 0),
             PositionInContainer(reshape_minor_to_major, 2));
 }
 
@@ -421,8 +423,8 @@ TEST_F(LayoutAssignmentTest, BroadcastAndTranspose) {
       ShapeLayout(output_shape_with_layout);
   AssignLayouts(&module, &computation_layout);
 
-  EXPECT_TRUE(
-      ContainersEqual(broadcast->shape().layout().minor_to_major(), {0, 1, 2}));
+  EXPECT_THAT(broadcast->shape().layout().minor_to_major(),
+              ElementsAre(0, 1, 2));
 }
 
 TEST_F(LayoutAssignmentTest, ReshapeOperandHasMultipleUsers) {
@@ -474,11 +476,9 @@ TEST_F(LayoutAssignmentTest, ReshapeOperandHasMultipleUsers) {
           {transpose_shape_with_layout, broadcast2_shape_with_layout}));
   AssignLayouts(&module, &computation_layout);
 
-  EXPECT_TRUE(
-      ContainersEqual(broadcast->shape().layout().minor_to_major(), {0, 1}));
-  EXPECT_TRUE(
-      ContainersEqual(transpose->shape().layout().minor_to_major(), {1, 0}));
-  EXPECT_TRUE(ContainersEqual(tanh->shape().layout().minor_to_major(), {0, 1}));
+  EXPECT_THAT(broadcast->shape().layout().minor_to_major(), ElementsAre(0, 1));
+  EXPECT_THAT(transpose->shape().layout().minor_to_major(), ElementsAre(1, 0));
+  EXPECT_THAT(tanh->shape().layout().minor_to_major(), ElementsAre(0, 1));
 }
 
 // Add test which fails due to copy tuple.
diff --git a/tensorflow/compiler/xla/service/liveness_util.cc b/tensorflow/compiler/xla/service/liveness_util.cc
index 6c5f185ed1ba544e4132777216b9594b5cad7904..16e11ca6c6b3c5ef4dea3cbab5ba6c284e716add 100644
--- a/tensorflow/compiler/xla/service/liveness_util.cc
+++ b/tensorflow/compiler/xla/service/liveness_util.cc
@@ -28,8 +28,9 @@ limitations under the License.
 
 namespace xla {
 
-bool DoesNotUseOperandBuffer(HloInstruction* operand, const ShapeIndex& index,
-                             HloInstruction* user,
+bool DoesNotUseOperandBuffer(const HloInstruction* operand,
+                             const ShapeIndex& index,
+                             const HloInstruction* user,
                              const TuplePointsToAnalysis& points_to_analysis) {
   CHECK(user->IsUserOf(operand))
       << "user: " << user->ToString() << " operand: " << operand->ToString();
@@ -98,6 +99,41 @@ std::vector<std::pair<HloInstruction*, int64>> GetAllUsesOfInstructionAtIndex(
   return uses;
 }
 
+// Returns true if there is exactly one use of 'operand' at 'operand_index'
+// in 'fusion.fused_instructions', where the singleton use is the fused
+// root at operand index 'use_operand_index'. Returns false otherwise.
+//
+// REQUIRES: 'fusion' opcode is a kFusion instruction.
+bool HasUniqueFusedUseOfOperandAt(
+    HloInstruction* operand, const ShapeIndex& operand_index,
+    HloInstruction* fusion, const int64 use_operand_index,
+    const TuplePointsToAnalysis& points_to_analysis) {
+  CHECK_EQ(HloOpcode::kFusion, fusion->opcode());
+  // Check that 'operand' is unique in the operand list of 'fusion'.
+  if (fusion->OperandIndices(operand).size() > 1) {
+    return false;
+  }
+  // Find fusion parameter associated with 'operand'.
+  const auto& fused_params = fusion->fused_parameters();
+  auto fused_param_it = std::find_if(
+      fused_params.begin(), fused_params.end(),
+      [&](HloInstruction* fused_param) {
+        return fusion->operand(fused_param->parameter_number()) == operand;
+      });
+  if (fused_param_it == fused_params.end()) {
+    return false;
+  }
+  auto* fused_param = *fused_param_it;
+  // Get all uses of 'operand' at 'index' from 'fusion.fused_instructions'.
+  auto fused_param_uses = GetAllUsesOfInstructionAtIndex(
+      fused_param, operand_index, points_to_analysis);
+  // Return true iff there is exactly one use of 'operand' at 'index', and
+  // this singleton use is the fused root (at index in 'use_operand_indices').
+  return fused_param_uses.size() == 1 &&
+         fused_param_uses[0].first == fusion->fused_expression_root() &&
+         fused_param_uses[0].second == use_operand_index;
+}
+
 }  // namespace
 
 // User and operand can share buffers iff both instructions emit the same shape
@@ -106,6 +142,9 @@ std::vector<std::pair<HloInstruction*, int64>> GetAllUsesOfInstructionAtIndex(
 // *) Is a loop fusion instruction where the only use of 'operand' at 'index'
 //    in the set 'user.fused_instructions' is a DynamicUpdateSlice fused root
 //    at operand 0. Or...
+// *) Is a kDot -> kAdd (or fused kTransposeDot -> kAdd) output fusion
+//    instruction where the only use of 'operand' at 'index' in the set
+//    'user.fused_instructions' is a kAdd fused root at operand 0 or 1. Or...
 // *) The 'user' of 'operand' is DynamicUpdateSlice or While at operand index 0.
 bool CanShareOperandBufferWithUser(
     HloInstruction* operand, const ShapeIndex& operand_index,
@@ -125,30 +164,46 @@ bool CanShareOperandBufferWithUser(
   if (user->opcode() == HloOpcode::kCopy) {
     return false;
   }
-  // Check if 'user' is a loop fusion instruction with a kDynamicUpdateSlice
-  // fused root instruction.
-  if (user->opcode() == HloOpcode::kFusion &&
-      user->fusion_kind() == HloInstruction::FusionKind::kLoop &&
-      user->fused_expression_root()->opcode() ==
-          HloOpcode::kDynamicUpdateSlice) {
-    for (auto& fused_param : user->fused_parameters()) {
-      // Find fusion parameter associated with 'operand'.
-      if (user->operand(fused_param->parameter_number()) != operand) {
-        continue;
-      }
-      // Get all uses of 'operand' at 'index' from 'user.fused_instructions'.
-      auto fused_param_uses = GetAllUsesOfInstructionAtIndex(
-          fused_param, operand_index, points_to_analysis);
-      // Return true iff there is exactly one use of 'operand' at 'index', and
-      // this singleton use is the fused root at operand index 0.
-      if (fused_param_uses.size() == 1 &&
-          fused_param_uses[0].first == user->fused_expression_root() &&
-          fused_param_uses[0].second == 0) {
-        return true;
+  if (user->opcode() == HloOpcode::kFusion) {
+    if (user->fusion_kind() == HloInstruction::FusionKind::kLoop &&
+        user->fused_expression_root()->opcode() ==
+            HloOpcode::kDynamicUpdateSlice) {
+      // Loop fusion with kDynamicUpdateSlice fused root.
+      //
+      // Returns true iff there is exactly one use of 'operand' at shape index
+      // 'operand_index', and this singleton use is the fused root at operand
+      // index 0.
+      return HasUniqueFusedUseOfOperandAt(operand, operand_index, user, 0,
+                                          points_to_analysis);
+    } else if (user->fusion_kind() == HloInstruction::FusionKind::kOutput &&
+               user->fused_expression_root()->opcode() == HloOpcode::kAdd) {
+      // Output fusion with kAdd fused root.
+
+      // Check if one operand of kAdd fused root is either kDot, or nested
+      // kFusion of kind kTransposeDot.
+      auto* add = user->fused_expression_root();
+      auto add_operand_it =
+          std::find_if(add->operands().begin(), add->operands().end(),
+                       [&](HloInstruction* operand) {
+                         return operand->opcode() == HloOpcode::kDot ||
+                                (operand->opcode() == HloOpcode::kFusion &&
+                                 operand->fusion_kind() ==
+                                     HloInstruction::FusionKind::kTransposeDot);
+                       });
+      if (add_operand_it == add->operands().end()) {
+        return false;
       }
-      break;
+      auto* matched_add_operand = *add_operand_it;
+      // Calculate operand index of 'add' operand which was not matched above.
+      const int64 other_add_operand_index =
+          matched_add_operand == add->operand(0) ? 1 : 0;
+      // Returns true iff there is exactly one use of 'operand' at shape index
+      // 'operand_index', and this singleton use is the fused root (at operand
+      // index 'other_add_operand_index').
+      return HasUniqueFusedUseOfOperandAt(operand, operand_index, user,
+                                          other_add_operand_index,
+                                          points_to_analysis);
     }
-    return false;
   }
   if (user->opcode() == HloOpcode::kDynamicUpdateSlice ||
       user->opcode() == HloOpcode::kWhile) {
diff --git a/tensorflow/compiler/xla/service/liveness_util.h b/tensorflow/compiler/xla/service/liveness_util.h
index 410a7b1b519e117f21c01938cb8e4a5b1c358ad2..52de282ca6b444867c865f845ce794196c98b277 100644
--- a/tensorflow/compiler/xla/service/liveness_util.h
+++ b/tensorflow/compiler/xla/service/liveness_util.h
@@ -32,8 +32,9 @@ namespace xla {
 // 'operand'. Returns false otherwise.
 //
 // REQUIRES: 'operand' is an operand of 'user'.
-bool DoesNotUseOperandBuffer(HloInstruction* operand, const ShapeIndex& index,
-                             HloInstruction* user,
+bool DoesNotUseOperandBuffer(const HloInstruction* operand,
+                             const ShapeIndex& index,
+                             const HloInstruction* user,
                              const TuplePointsToAnalysis& points_to_analysis);
 
 // Returns true if 'user' (at 'user_index') can share a buffer with its operand
diff --git a/tensorflow/compiler/xla/service/liveness_util_test.cc b/tensorflow/compiler/xla/service/liveness_util_test.cc
index 1ee02925117f846ee3ad41e151b125e57db22904..49c2c2d4a268d1237ae04903416cf1f6708609d3 100644
--- a/tensorflow/compiler/xla/service/liveness_util_test.cc
+++ b/tensorflow/compiler/xla/service/liveness_util_test.cc
@@ -34,9 +34,7 @@ class PointsToAnalysisTestBase : public HloTestBase {
   void RunAnalysis() {
     CHECK_NOTNULL(module_.get());
     points_to_analysis_ =
-        TuplePointsToAnalysis::Run(module_.get(),
-                                   /*include_loop_fusion_instructions=*/true)
-            .ConsumeValueOrDie();
+        TuplePointsToAnalysis::Run(module_.get()).ConsumeValueOrDie();
   }
 
   void BuildModuleAndRunAnalysis(std::unique_ptr<HloComputation> computation) {
@@ -231,6 +229,100 @@ TEST_F(CanShareOperandBufferWithUserTest, DynamicUpdateSliceCanShare) {
       CanShareOperandBufferWithUser(starts, {}, dus, {}, *points_to_analysis_));
 }
 
+TEST_F(CanShareOperandBufferWithUserTest, FusedDotAdd) {
+  auto builder = HloComputation::Builder(TestName());
+  Shape data_shape = ShapeUtil::MakeShape(F32, {2, 2});
+
+  auto a = builder.AddInstruction(HloInstruction::CreateConstant(
+      LiteralUtil::CreateR2<float>({{1.0, 0.0}, {0.0, 1.0}})));
+  auto b = builder.AddInstruction(HloInstruction::CreateConstant(
+      LiteralUtil::CreateR2<float>({{2.0, 2.0}, {2.0, 2.0}})));
+
+  auto dot = builder.AddInstruction(
+      HloInstruction::CreateBinary(data_shape, HloOpcode::kDot, a, b));
+
+  auto one = builder.AddInstruction(
+      HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(1.0)));
+  auto add_operand = builder.AddInstruction(
+      HloInstruction::CreateBroadcast(data_shape, one, {1}));
+
+  auto add = builder.AddInstruction(HloInstruction::CreateBinary(
+      data_shape, HloOpcode::kAdd, dot, add_operand));
+
+  BuildModule(builder.Build());
+  auto fusion = computation_->CreateFusionInstruction(
+      {add, dot}, HloInstruction::FusionKind::kOutput);
+  RunAnalysis();
+
+  // Output fused dot add should be able to share buffer with 'add_operand'.
+  EXPECT_TRUE(CanShareOperandBufferWithUser(add_operand, {}, fusion, {},
+                                            *points_to_analysis_));
+}
+
+TEST_F(CanShareOperandBufferWithUserTest, FusedTransposeDotAdd) {
+  auto builder = HloComputation::Builder(TestName());
+  Shape data_shape = ShapeUtil::MakeShape(F32, {2, 2});
+
+  auto a = builder.AddInstruction(HloInstruction::CreateConstant(
+      LiteralUtil::CreateR2<float>({{1.0, 0.0}, {0.0, 1.0}})));
+  auto b = builder.AddInstruction(HloInstruction::CreateConstant(
+      LiteralUtil::CreateR2<float>({{2.0, 2.0}, {2.0, 2.0}})));
+  auto b_t = builder.AddInstruction(
+      HloInstruction::CreateTranspose(data_shape, b, {1, 0}));
+
+  auto dot = builder.AddInstruction(
+      HloInstruction::CreateBinary(data_shape, HloOpcode::kDot, a, b_t));
+
+  auto one = builder.AddInstruction(
+      HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(1.0)));
+  auto add_operand = builder.AddInstruction(
+      HloInstruction::CreateBroadcast(data_shape, one, {1}));
+
+  auto add = builder.AddInstruction(HloInstruction::CreateBinary(
+      data_shape, HloOpcode::kAdd, dot, add_operand));
+
+  BuildModule(builder.Build());
+
+  auto nested_fusion = computation_->CreateFusionInstruction(
+      {dot, b_t}, HloInstruction::FusionKind::kTransposeDot);
+
+  auto fusion = computation_->CreateFusionInstruction(
+      {add, nested_fusion}, HloInstruction::FusionKind::kOutput);
+  RunAnalysis();
+
+  // Output fused transpose-dot-add should be share buffer with 'add_operand'.
+  EXPECT_TRUE(CanShareOperandBufferWithUser(add_operand, {}, fusion, {},
+                                            *points_to_analysis_));
+}
+
+TEST_F(CanShareOperandBufferWithUserTest, OutputFusionCantAliasOperandBuffer) {
+  auto builder = HloComputation::Builder(TestName());
+  Shape data_shape = ShapeUtil::MakeShape(F32, {2, 2});
+
+  auto one = builder.AddInstruction(
+      HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(1.0)));
+  auto operand = builder.AddInstruction(
+      HloInstruction::CreateBroadcast(data_shape, one, {1}));
+
+  auto reverse = builder.AddInstruction(
+      HloInstruction::CreateReverse(data_shape, operand, {0, 1}));
+
+  auto two = builder.AddInstruction(HloInstruction::CreateConstant(
+      LiteralUtil::CreateR2<float>({{2.0, 2.0}, {2.0, 2.0}})));
+
+  auto add = builder.AddInstruction(
+      HloInstruction::CreateBinary(data_shape, HloOpcode::kAdd, reverse, two));
+
+  BuildModule(builder.Build());
+  auto fusion = computation_->CreateFusionInstruction(
+      {add, two, reverse}, HloInstruction::FusionKind::kOutput);
+  RunAnalysis();
+
+  // Output fused operand->reverse->add cannot alias operand buffer 'operand'.
+  EXPECT_FALSE(CanShareOperandBufferWithUser(operand, {}, fusion, {},
+                                             *points_to_analysis_));
+}
+
 TEST_F(CanShareOperandBufferWithUserTest, WhileCanShare) {
   Shape data_shape = ShapeUtil::MakeShape(F32, {8});
 
diff --git a/tensorflow/compiler/xla/service/local_service.cc b/tensorflow/compiler/xla/service/local_service.cc
index 17d7b97b21bd3296711295e0779b0a273c9917e0..78d21233c765ec8f18a865f55b752d418ad126d6 100644
--- a/tensorflow/compiler/xla/service/local_service.cc
+++ b/tensorflow/compiler/xla/service/local_service.cc
@@ -60,9 +60,12 @@ namespace xla {
     TF_ASSIGN_OR_RETURN(platform, PlatformUtil::GetDefaultPlatform());
   }
 
-  TF_ASSIGN_OR_RETURN(
-      std::unique_ptr<Backend> backend,
-      Backend::CreateBackend(platform, options.number_of_replicas()));
+  BackendOptions backend_options;
+  backend_options.set_platform(platform)
+      .set_number_of_replicas(options.number_of_replicas())
+      .set_intra_op_parallelism_threads(options.intra_op_parallelism_threads());
+  TF_ASSIGN_OR_RETURN(std::unique_ptr<Backend> backend,
+                      Backend::CreateBackend(backend_options));
 
   TF_ASSIGN_OR_RETURN(std::unique_ptr<Backend> compute_constant_backend,
                       CreateComputeConstantBackend());
@@ -77,21 +80,6 @@ LocalService::LocalService(std::unique_ptr<Backend> execute_backend,
   runs_in_client_process_ = true;
 }
 
-tensorflow::Status LocalService::ResolveArguments(
-    const tensorflow::gtl::ArraySlice<const GlobalDataHandle*> arguments,
-    int device_ordinal,
-    std::vector<perftools::gputools::DeviceMemoryBase>* argument_ptrs) {
-  TF_ASSIGN_OR_RETURN(std::vector<const Allocation*> arg_allocations,
-                      ResolveAndValidateArguments(
-                          arguments, execute_backend_.get(), device_ordinal));
-  argument_ptrs->resize(arg_allocations.size());
-  for (int i = 0; i < arguments.size(); ++i) {
-    const Allocation& allocation = *arg_allocations[i];
-    (*argument_ptrs)[i] = allocation.device_memory();
-  }
-  return tensorflow::Status::OK();
-}
-
 namespace {
 // Returns the space required to allocate a shape. If
 // allocate_space_for_deep_copy the space includes all sub-buffers of
@@ -128,70 +116,6 @@ StatusOr<GlobalDataHandle> LocalService::AllocateBufferOnDevice(
                                   allocation_size));
 }
 
-StatusOr<std::vector<std::unique_ptr<AotCompilationResult>>>
-LocalService::CompileAheadOfTime(
-    const tensorflow::gtl::ArraySlice<AheadOfTimeComputationInstance>
-        computations,
-    const AotCompilationOptions& options) {
-  std::vector<std::unique_ptr<HloModule>> hlo_modules;
-  std::vector<std::unique_ptr<HloModuleConfig>> module_configs;
-  for (const AheadOfTimeComputationInstance& instance : computations) {
-    TF_ASSIGN_OR_RETURN(UserComputation * user_computation,
-                        computation_tracker_.Resolve(instance.computation));
-    VersionedComputationHandle versioned_handle =
-        user_computation->GetVersionedHandle();
-
-    // Dump computation proto state if flag is set.
-    legacy_flags::ServiceFlags* flags = legacy_flags::GetServiceFlags();
-    const string& directory_path = flags->xla_dump_computations_to;
-    if (!directory_path.empty()) {
-      TF_ASSIGN_OR_RETURN(
-          std::unique_ptr<SessionModule> session_module,
-          computation_tracker_.SnapshotComputation(versioned_handle.handle));
-      string filename = tensorflow::strings::StrCat(
-          "computation_", versioned_handle.handle.handle(), "__",
-          session_module->entry().name(), "__version_",
-          versioned_handle.version);
-      TF_RETURN_IF_ERROR(Executable::DumpToDirectory(directory_path, filename,
-                                                     *session_module));
-    }
-
-    TF_ASSIGN_OR_RETURN(std::unique_ptr<HloModule> hlo_module,
-                        computation_tracker_.BuildHloModule(
-                            versioned_handle,
-                            /*include_unreachable_instructions=*/true));
-    hlo_modules.push_back(std::move(hlo_module));
-
-    TF_ASSIGN_OR_RETURN(
-        std::shared_ptr<const ProgramShape> program_shape,
-        user_computation->ComputeProgramShape(versioned_handle.version));
-
-    module_configs.push_back(MakeUnique<HloModuleConfig>(*program_shape));
-    HloModuleConfig* module_config = module_configs.back().get();
-    auto* computation_layout =
-        module_config->mutable_entry_computation_layout();
-    if (flags->xla_hlo_profile) {
-      module_config->enable_hlo_profiling(true);
-    }
-    for (int i = 0; i < instance.argument_layouts.size(); ++i) {
-      const Shape& argument_layout = *instance.argument_layouts[i];
-      if (ShapeUtil::IsTuple(argument_layout)) {
-        return Unimplemented("tuple arguments not supported yet");
-      }
-      TF_RETURN_IF_ERROR(
-          computation_layout->mutable_parameter_layout(i)->CopyLayoutFromShape(
-              argument_layout));
-    }
-    TF_RETURN_IF_ERROR(
-        computation_layout->mutable_result_layout()->CopyLayoutFromShape(
-            *instance.result_layout));
-  }
-
-  return execute_backend_->compiler()->CompileAheadOfTime(
-      std::move(hlo_modules), std::move(module_configs), MakeHloDumper(),
-      options);
-}
-
 StatusOr<std::unique_ptr<Executable>> LocalService::CompileExecutable(
     const ComputationHandle& computation,
     const tensorflow::gtl::ArraySlice<const Shape*> argument_layouts,
diff --git a/tensorflow/compiler/xla/service/local_service.h b/tensorflow/compiler/xla/service/local_service.h
index df27f0a7a60dca99caf09994f417f1bc45ec15de..767a3ab697febb283af448b25369445152381a5e 100644
--- a/tensorflow/compiler/xla/service/local_service.h
+++ b/tensorflow/compiler/xla/service/local_service.h
@@ -43,14 +43,6 @@ class LocalService : public Service {
   static StatusOr<std::unique_ptr<LocalService>> NewService(
       const ServiceOptions& options);
 
-  // For an array of arguments, validate that each is placed on the
-  // specified device_ordinal, and return the DeviceMemoryBase
-  // corresponding to each argument.
-  tensorflow::Status ResolveArguments(
-      const tensorflow::gtl::ArraySlice<const GlobalDataHandle*> arguments,
-      int device_ordinal,
-      std::vector<perftools::gputools::DeviceMemoryBase>* argument_ptrs);
-
   // Return a handle to a buffer large enough to hold shape, allocated
   // on device_ordinal. If allocate_space_for_deep_copy, the buffer is
   // large enough to hold all sub-buffers of a tuple shape, otherwise
@@ -59,22 +51,6 @@ class LocalService : public Service {
       const Shape& shape, int device_ordinal,
       bool allocate_space_for_deep_copy);
 
-  // A description of a computation to compile using CompileAheadOfTime.
-  struct AheadOfTimeComputationInstance {
-    ComputationHandle computation;
-    std::vector<const Shape*> argument_layouts;
-    const Shape* result_layout = nullptr;
-  };
-
-  // Compiles a list of computations for ahead-of-time execution.  This is
-  // intended for use in static compilation.  See
-  // |LocalClient::CompileAheadOfTime| for additional details.
-  StatusOr<std::vector<std::unique_ptr<AotCompilationResult>>>
-  CompileAheadOfTime(
-      const tensorflow::gtl::ArraySlice<AheadOfTimeComputationInstance>
-          computations,
-      const AotCompilationOptions& Options);
-
   // Builds an Executable with the given argument layouts and options. If
   // result_layout is non-null, then the executable is compiled to produce a
   // result of the given layout.
diff --git a/tensorflow/compiler/xla/service/reshape_mover.cc b/tensorflow/compiler/xla/service/reshape_mover.cc
index b72ef95a6a7964aa1f41cd2ceef4cdee76e9f708..768977ba6bba2f9af55fcd467aa3d91488e4bf0f 100644
--- a/tensorflow/compiler/xla/service/reshape_mover.cc
+++ b/tensorflow/compiler/xla/service/reshape_mover.cc
@@ -13,17 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/compiler/xla/service/reshape_mover.h"
-
-#include <algorithm>
-#include "tensorflow/compiler/xla/literal_util.h"
-#include "tensorflow/compiler/xla/shape_util.h"
-#include "tensorflow/compiler/xla/util.h"
-
-namespace xla {
-
-namespace {
-
+// Implementation note:
+//
 // The general idea behind this pass is that we're converting from this:
 //   %param.A = OldShape
 //   %param.B = OldShape
@@ -44,6 +35,19 @@ namespace {
 // only implicit scalar broadcast is on Pred, not on A or B. Since reshapes or
 // transposes to a scalar should be cheap, we simply never move them.
 
+#include "tensorflow/compiler/xla/service/reshape_mover.h"
+
+#include <algorithm>
+#include "tensorflow/compiler/xla/literal_util.h"
+#include "tensorflow/compiler/xla/shape_util.h"
+#include "tensorflow/compiler/xla/status_macros.h"
+#include "tensorflow/compiler/xla/util.h"
+#include "tensorflow/core/lib/core/errors.h"
+
+namespace xla {
+
+namespace {
+
 // Finds the first non-scalar operand of an instruction that is a reshape or
 // transpose and returns the operand if it is found or nullptr if not found.
 HloInstruction* FirstNonScalarReshapeOperand(const HloInstruction* hlo) {
@@ -51,6 +55,9 @@ HloInstruction* FirstNonScalarReshapeOperand(const HloInstruction* hlo) {
     if (!ShapeUtil::IsScalar(operand->shape()) &&
         (operand->opcode() == HloOpcode::kReshape ||
          operand->opcode() == HloOpcode::kTranspose)) {
+      VLOG(5) << "Found first non-scalar reshape operand of "
+              << hlo->ToStringNoMetadata() << ":\n\t"
+              << operand->ToStringNoMetadata();
       return operand;
     }
   }
@@ -70,6 +77,9 @@ bool OperandCanTrivallyChangeShape(const HloInstruction* instruction,
   // A constant can trivially reshape the literal it holds.
   if (operand->opcode() == HloOpcode::kConstant &&
       ShapeUtil::SameDimensions(operand->shape(), instruction->shape())) {
+    VLOG(5) << "Constant had same dimensions as instruction:\n\toperand: "
+            << operand->ToStringNoMetadata()
+            << "\n\tinstruction: " << instruction->ToStringNoMetadata();
     return true;
   }
 
@@ -116,119 +126,159 @@ bool IsElementwiseOfEquivalentReshapesOrTransposes(
   if (!first_reshape_operand) {
     return false;
   }
-  return (instruction->user_count() > 0 ||
-          instruction == instruction->parent()->root_instruction()) &&
-         instruction->IsElementwise() && !operands.empty() &&
-         // Check whether all operands:
-         //    1. are all reshapes or transposes that have the same input and
-         //    output shapes as all other reshaped or transposed operands.
-         //      or
-         //    2. can be any shape like kConstant, kRng, and scalars.
-         std::all_of(
-             operands.begin(), operands.end(),
-             [instruction,
-              first_reshape_operand](const HloInstruction* operand) {
-               return AreEquivalentReshapes(first_reshape_operand, operand) ||
-                      OperandCanTrivallyChangeShape(instruction, operand);
-             });
+  VLOG(3) << "** Checking whether instruction is an elementwise operation of "
+             "equivalent reshapes/transposes: "
+          << instruction->ToStringNoMetadata();
+  bool result =
+      (instruction->user_count() > 0 ||
+       instruction == instruction->parent()->root_instruction()) &&
+      instruction->IsElementwise() && !operands.empty() &&
+      // Check whether all operands:
+      //    0. Have the same dimensions as the output -- if not, it may be
+      //       implicitly broadcast, which can confound the movement's
+      //       correctness.
+      //    1. Are all reshapes or transposes that have the same input and
+      //       output shapes as all other reshaped or transposed operands.
+      //     or
+      //    2. Can be any shape like kConstant, kRng, and scalars.
+      std::all_of(
+          operands.begin(), operands.end(),
+          [instruction, first_reshape_operand](const HloInstruction* operand) {
+            if (!ShapeUtil::SameDimensions(operand->shape(),
+                                           instruction->shape())) {
+              VLOG(5) << "Operand shape differs from output shape; may be "
+                         "implicitly broadcast, so preventing "
+                         "movement\n\toperand: "
+                      << operand->ToStringNoMetadata() << "\n\tinstruction: "
+                      << instruction->ToStringNoMetadata();
+              return false;
+            }
+            if (AreEquivalentReshapes(first_reshape_operand, operand)) {
+              VLOG(5) << "Are equivalent reshapes:\n\tfirst_reshape_operand: "
+                      << first_reshape_operand->ToStringNoMetadata()
+                      << "\n\toperand: " << operand->ToStringNoMetadata();
+              return true;
+            }
+            if (OperandCanTrivallyChangeShape(instruction, operand)) {
+              VLOG(5) << "Operand can trivially change shape: "
+                      << operand->ToStringNoMetadata();
+              return true;
+            }
+            return false;
+          });
+  VLOG(3) << "ElementwiseOfEquivalentReshapesOrTransposes result for "
+          << instruction->ToStringNoMetadata() << ": " << result;
+  return result;
 }
 
 // Try to sink any reshape or transpose operands of `instruction` across it. We
 // do so if `instruction` is elementwise and all operands are equivalent
 // reshapes or transposes.
-bool TrySinkReshapeOrTranspose(HloComputation* computation,
-                               HloInstruction* instruction) {
-  if (IsElementwiseOfEquivalentReshapesOrTransposes(instruction)) {
-    std::vector<HloInstruction*> operands = instruction->operands();
-    HloInstruction* old_reshape = FirstNonScalarReshapeOperand(instruction);
-    CHECK(old_reshape != nullptr);
-    Shape new_elementwise_shape = old_reshape->operand(0)->shape();
-    for (size_t i = 0; i < operands.size(); ++i) {
-      // All scalar operands remain as-is, even if they're reshape or transpose,
-      // to simplify handling wrt special scalar broadcast rules for ops like
-      // Select. Scalar reshapes should be cheap anyways.
-      if (ShapeUtil::IsScalar(operands[i]->shape())) {
-        continue;
-      }
-      auto element_type = operands[i]->shape().element_type();
-      switch (operands[i]->opcode()) {
-        case HloOpcode::kConstant: {
-          if (old_reshape->opcode() == HloOpcode::kReshape) {
-            operands[i] = instruction->parent()->AddInstruction(
-                HloInstruction::CreateReshape(
-                    ShapeUtil::ChangeElementType(new_elementwise_shape,
-                                                 element_type),
-                    operands[i]));
-          } else {
-            CHECK_EQ(old_reshape->opcode(), HloOpcode::kTranspose);
-            std::vector<int64> inverse_permutation =
-                InversePermutation(old_reshape->dimensions());
-            operands[i] = instruction->parent()->AddInstruction(
-                HloInstruction::CreateTranspose(
-                    ShapeUtil::ChangeElementType(new_elementwise_shape,
-                                                 element_type),
-                    operands[i], inverse_permutation));
-          }
-          break;
-        }
-        case HloOpcode::kRng: {
-          CHECK_EQ(operands[i]->user_count(), 1);
+StatusOr<bool> TrySinkReshapeOrTranspose(HloComputation* computation,
+                                         HloInstruction* instruction) {
+  if (!IsElementwiseOfEquivalentReshapesOrTransposes(instruction)) {
+    return false;
+  }
+
+  std::vector<HloInstruction*> operands = instruction->operands();
+  HloInstruction* old_reshape = FirstNonScalarReshapeOperand(instruction);
+  TF_RET_CHECK(old_reshape != nullptr);
+  Shape new_elementwise_shape = old_reshape->operand(0)->shape();
+
+  VLOG(3) << "** Trying to sink reshape or transpose: "
+          << instruction->ToStringNoMetadata()
+          << "\n\told reshape: " << old_reshape->ToStringNoMetadata()
+          << "\n\tnew elementwise shape: "
+          << ShapeUtil::HumanString(new_elementwise_shape);
+  for (size_t i = 0; i < operands.size(); ++i) {
+    // All scalar operands remain as-is, even if they're reshape or transpose,
+    // to simplify handling wrt special scalar broadcast rules for ops like
+    // Select. Scalar reshapes should be cheap anyways.
+    if (ShapeUtil::IsScalar(operands[i]->shape())) {
+      continue;
+    }
+    PrimitiveType element_type = operands[i]->shape().element_type();
+    switch (operands[i]->opcode()) {
+      case HloOpcode::kConstant: {
+        if (old_reshape->opcode() == HloOpcode::kReshape) {
+          VLOG(3) << "Creating reshape for kConstant operand " << i << ": "
+                  << operands[i]->ToStringNoMetadata();
           operands[i] = instruction->parent()->AddInstruction(
-              operands[i]->CloneWithNewOperands(
+              HloInstruction::CreateReshape(
                   ShapeUtil::ChangeElementType(new_elementwise_shape,
                                                element_type),
-                  operands[i]->operands()));
-          break;
+                  operands[i]));
+        } else {
+          TF_RET_CHECK(old_reshape->opcode() == HloOpcode::kTranspose);
+          std::vector<int64> inverse_permutation =
+              InversePermutation(old_reshape->dimensions());
+          operands[i] = instruction->parent()->AddInstruction(
+              HloInstruction::CreateTranspose(
+                  ShapeUtil::ChangeElementType(new_elementwise_shape,
+                                               element_type),
+                  operands[i], inverse_permutation));
         }
-        case HloOpcode::kReshape:
-        case HloOpcode::kTranspose:
-          operands[i] = operands[i]->mutable_operand(0);
-          break;
-        default:
-          LOG(FATAL) << "Unexpected opcode while trying to sink reshapes or "
-                        "transposes.";
+        break;
       }
-    }
-    if (HloOpcode::kFusion == instruction->opcode()) {
-      // Here we already know `instruction` is elementwise, and no operand is
-      // implicit broadcast as if it were the operands would not be equivalent
-      // reshapes, so all the fused instructions have the same dimensions.
-      for (const auto& fused_instruction : instruction->fused_instructions()) {
-        Shape* shape = fused_instruction->mutable_shape();
-        *shape->mutable_dimensions() = new_elementwise_shape.dimensions();
-        *shape->mutable_layout() = new_elementwise_shape.layout();
+      case HloOpcode::kRng: {
+        CHECK_EQ(operands[i]->user_count(), 1);
+        operands[i] = instruction->parent()->AddInstruction(
+            operands[i]->CloneWithNewOperands(
+                ShapeUtil::ChangeElementType(new_elementwise_shape,
+                                             element_type),
+                operands[i]->operands()));
+        break;
       }
-    }
-    auto new_elementwise =
-        computation->AddInstruction(instruction->CloneWithNewOperands(
-            // `instruction` may change the element type, e.g., from
-            //   operands[0] -> reshape -> convert (`instruction`)
-            // to
-            //   operands[0] -> convert' -> reshape'
-            //
-            // In this case, convert' should have the same element type as
-            // `convert` and the same dimensions as operands[0].
-            ShapeUtil::ChangeElementType(new_elementwise_shape,
-                                         instruction->shape().element_type()),
-            operands));
-    std::unique_ptr<HloInstruction> new_reshape;
-    switch (old_reshape->opcode()) {
       case HloOpcode::kReshape:
-        new_reshape = HloInstruction::CreateReshape(instruction->shape(),
-                                                    new_elementwise);
-        break;
       case HloOpcode::kTranspose:
-        new_reshape = HloInstruction::CreateTranspose(
-            instruction->shape(), new_elementwise, old_reshape->dimensions());
+        operands[i] = operands[i]->mutable_operand(0);
         break;
       default:
-        LOG(FATAL) << "Bad opcode";
+        LOG(FATAL) << "Unexpected opcode while trying to sink reshapes or "
+                      "transposes.";
     }
-    TF_CHECK_OK(computation->ReplaceWithNewInstruction(instruction,
-                                                       std::move(new_reshape)));
-    return true;
   }
-  return false;
+  if (HloOpcode::kFusion == instruction->opcode()) {
+    // Here we already know `instruction` is elementwise, and no operand is
+    // implicit broadcast as if it were the operands would not be equivalent
+    // reshapes, so all the fused instructions have the same dimensions.
+    for (const auto& fused_instruction : instruction->fused_instructions()) {
+      Shape* shape = fused_instruction->mutable_shape();
+      *shape->mutable_dimensions() = new_elementwise_shape.dimensions();
+      *shape->mutable_layout() = new_elementwise_shape.layout();
+    }
+  }
+  HloInstruction* new_elementwise =
+      computation->AddInstruction(instruction->CloneWithNewOperands(
+          // `instruction` may change the element type, e.g., from
+          //   operands[0] -> reshape -> convert (`instruction`)
+          // to
+          //   operands[0] -> convert' -> reshape'
+          //
+          // In this case, convert' should have the same element type as
+          // `convert` and the same dimensions as operands[0].
+          ShapeUtil::ChangeElementType(new_elementwise_shape,
+                                       instruction->shape().element_type()),
+          operands));
+
+  std::unique_ptr<HloInstruction> new_reshape;
+  switch (old_reshape->opcode()) {
+    case HloOpcode::kReshape:
+      VLOG(3) << "Creating new reshape for new elementwise op: "
+              << new_elementwise->ToStringNoMetadata();
+      new_reshape =
+          HloInstruction::CreateReshape(instruction->shape(), new_elementwise);
+      break;
+    case HloOpcode::kTranspose:
+      new_reshape = HloInstruction::CreateTranspose(
+          instruction->shape(), new_elementwise, old_reshape->dimensions());
+      break;
+    default:
+      LOG(FATAL) << "Bad opcode";
+  }
+  TF_RETURN_IF_ERROR(computation->ReplaceWithNewInstruction(
+      instruction, std::move(new_reshape)));
+  return true;
 }
 
 }  // namespace
@@ -237,9 +287,9 @@ StatusOr<bool> ReshapeMover::Run(HloModule* module) {
   bool changed = false;
   for (const auto& comp : module->computations()) {
     for (HloInstruction* instruction : comp->MakeInstructionPostOrder()) {
-      if (TrySinkReshapeOrTranspose(comp.get(), instruction)) {
-        changed = true;
-      }
+      TF_ASSIGN_OR_RETURN(bool did_change,
+                          TrySinkReshapeOrTranspose(comp.get(), instruction));
+      changed |= did_change;
     }
   }
   return changed;
diff --git a/tensorflow/compiler/xla/service/reshape_mover_test.cc b/tensorflow/compiler/xla/service/reshape_mover_test.cc
index 1831d775d4a0d8e4e60a31eb91dd1ca4393ec398..5217e85d4fc12e2adc412644b8f11fd11a58039a 100644
--- a/tensorflow/compiler/xla/service/reshape_mover_test.cc
+++ b/tensorflow/compiler/xla/service/reshape_mover_test.cc
@@ -234,6 +234,58 @@ TEST_F(ReshapeMoverTest, ScalarReshapeNotMovedAcrossSelect) {
   EXPECT_EQ(select, computation->root_instruction());
 }
 
+// Tree looks like:
+//
+// param0 [1,128,1]
+//  |
+// reshape [128,1]          constant [128,1024]
+//   \                         /
+//     multiply w/implicit broadcast [128,1024]
+//
+// The reshape mover would like to sink the reshape below the multiply.
+//
+// Previously we would attempt to insert a reshape of the constant to [1,128,1]
+// (which is unsound, because it has a different number of elements) as
+// preparation for sinking the reshape.
+//
+// To eliminate the unsoundness, we outlaw reshape sinking when one of the
+// operands is implicitly broadcast in the elementwise consumer.
+//
+// TODO(b/37799338) However, it would be possible in this case to do a more
+// in-depth analysis to get reshape movement to occur:
+//
+// 1. Note that the broadcast dimension (logical dimension 1) in the operands
+//    would map back to logical dimension 2 in the param0 node.
+// 2. Match rank of the constant to the param0 node (by prepending a trivial 1
+//    dimension).
+// 3. Reshape to [128,1024] at the root.
+//
+// But this is not currently done.
+TEST_F(ReshapeMoverTest, ImplicitlyBroadcastReshapeIsNotMovedBug37787999) {
+  HloComputation::Builder builder(TestName());
+  auto param0 = builder.AddInstruction(HloInstruction::CreateParameter(
+      0, ShapeUtil::MakeShape(F32, {1, 128, 1}), "param0"));
+  auto reshape = builder.AddInstruction(HloInstruction::CreateReshape(
+      ShapeUtil::MakeShape(F32, {128, 1}), param0));
+  Array2D<float> a(128, 1024);
+  auto literal = LiteralUtil::CreateR2FromArray2D<float>(a);
+  auto constant = builder.AddInstruction(
+      HloInstruction::CreateConstant(std::move(literal)));
+  auto multiply = builder.AddInstruction(HloInstruction::CreateBinary(
+      constant->shape(), HloOpcode::kMultiply, constant, reshape));
+
+  auto module = MakeUnique<HloModule>(TestName());
+  auto computation = module->AddEntryComputation(builder.Build());
+  EXPECT_THAT(computation->root_instruction(),
+              op::Multiply(op::Constant(), op::Reshape(param0)));
+
+  EXPECT_FALSE(ReshapeMover().Run(module.get()).ValueOrDie());
+
+  EXPECT_THAT(computation->root_instruction(),
+              op::Multiply(op::Constant(), op::Reshape(param0)));
+  EXPECT_EQ(multiply, computation->root_instruction());
+}
+
 // Tree looks like this:
 //
 // add1
diff --git a/tensorflow/compiler/xla/service/service.cc b/tensorflow/compiler/xla/service/service.cc
index 451bb8c7eadf3e2210788a722d8f75aa3050e30f..42450dfcae4be71af1002efb72b75857d5c80015 100644
--- a/tensorflow/compiler/xla/service/service.cc
+++ b/tensorflow/compiler/xla/service/service.cc
@@ -112,6 +112,16 @@ ServiceOptions& ServiceOptions::set_number_of_replicas(int number_of_replicas) {
 
 int ServiceOptions::number_of_replicas() const { return number_of_replicas_; }
 
+ServiceOptions& ServiceOptions::set_intra_op_parallelism_threads(
+    int num_threads) {
+  intra_op_parallelism_threads_ = num_threads;
+  return *this;
+}
+
+int ServiceOptions::intra_op_parallelism_threads() const {
+  return intra_op_parallelism_threads_;
+}
+
 /* static */ StatusOr<std::unique_ptr<Service>> Service::NewService(
     perftools::gputools::Platform* platform) {
   ServiceOptions default_options;
@@ -126,9 +136,10 @@ int ServiceOptions::number_of_replicas() const { return number_of_replicas_; }
   if (platform == nullptr) {
     TF_ASSIGN_OR_RETURN(platform, PlatformUtil::GetDefaultPlatform());
   }
-  TF_ASSIGN_OR_RETURN(
-      execute_backend,
-      Backend::CreateBackend(platform, options.number_of_replicas()));
+  BackendOptions backend_options;
+  backend_options.set_platform(platform);
+  backend_options.set_number_of_replicas(options.number_of_replicas());
+  TF_ASSIGN_OR_RETURN(execute_backend, Backend::CreateBackend(backend_options));
   TF_ASSIGN_OR_RETURN(std::unique_ptr<Backend> compute_constant_backend,
                       CreateComputeConstantBackend());
   std::unique_ptr<Service> service(new Service(
@@ -142,7 +153,10 @@ Service::CreateComputeConstantBackend() {
                       PlatformUtil::GetSupportedPlatforms());
   for (auto* platform : platforms) {
     if (platform->id() == se::host::kHostPlatformId) {
-      return Backend::CreateBackend(platform, /*replica_count=*/1);
+      BackendOptions backend_options;
+      backend_options.set_platform(platform);
+      backend_options.set_number_of_replicas(1);
+      return Backend::CreateBackend(backend_options);
     }
   }
   return NotFound("CPU platform not found");
@@ -180,20 +194,24 @@ Service::Service(std::unique_ptr<Backend> execute_backend,
                  std::unique_ptr<Backend> compute_constant_backend)
     : execute_backend_(std::move(execute_backend)),
       compute_constant_backend_(std::move(compute_constant_backend)) {
-  LOG(INFO) << Printf(
-      "XLA service %p executing computations on platform %s. Devices:", this,
-      execute_backend_->platform()->Name().c_str());
-  for (int i = 0; i < execute_backend_->device_count(); ++i) {
-    if (execute_backend_->device_ordinal_supported(i)) {
-      se::StreamExecutor* executor =
-          execute_backend_->stream_executor(i).ValueOrDie();
-      const auto& description = executor->GetDeviceDescription();
-      LOG(INFO) << Printf("  StreamExecutor device (%d): %s, %s", i,
-                          description.name().c_str(),
-                          description.platform_version().c_str());
-    } else {
-      LOG(INFO) << Printf("  StreamExecutor device (%d) not supported", i);
+  if (execute_backend_) {
+    LOG(INFO) << Printf(
+        "XLA service %p executing computations on platform %s. Devices:", this,
+        execute_backend_->platform()->Name().c_str());
+    for (int i = 0; i < execute_backend_->device_count(); ++i) {
+      if (execute_backend_->device_ordinal_supported(i)) {
+        se::StreamExecutor* executor =
+            execute_backend_->stream_executor(i).ValueOrDie();
+        const auto& description = executor->GetDeviceDescription();
+        LOG(INFO) << Printf("  StreamExecutor device (%d): %s, %s", i,
+                            description.name().c_str(),
+                            description.platform_version().c_str());
+      } else {
+        LOG(INFO) << Printf("  StreamExecutor device (%d) not supported", i);
+      }
     }
+  } else {
+    VLOG(1) << "XLA compile-only service constructed";
   }
 }
 
@@ -286,7 +304,7 @@ StatusOr<std::vector<const Allocation*>> Service::ResolveAndValidateArguments(
 StatusOr<std::unique_ptr<HloModuleConfig>> Service::CreateModuleConfig(
     const ProgramShape& program_shape,
     tensorflow::gtl::ArraySlice<const Allocation*> arguments,
-    const ExecutionOptions& execution_options) {
+    const ExecutionOptions& execution_options, Backend* backend) {
   auto module_config = MakeUnique<HloModuleConfig>(program_shape);
   auto* computation_layout = module_config->mutable_entry_computation_layout();
 
@@ -326,7 +344,7 @@ StatusOr<std::unique_ptr<HloModuleConfig>> Service::CreateModuleConfig(
     module_config->enable_hlo_profiling(true);
   }
 
-  module_config->set_replica_count(execute_backend_->Replicas().size());
+  module_config->set_replica_count(backend->Replicas().size());
   module_config->set_fast_math_disabled(execution_options.disable_fast_math());
   module_config->set_seed(execution_options.seed());
 
@@ -474,7 +492,7 @@ StatusOr<std::shared_ptr<Executable>> Service::BuildAndCacheExecutable(
       std::unique_ptr<Executable> executable_unique_ptr,
       BuildExecutable(versioned_handle, std::move(module_config),
                       /*executable_for_compute_constant=*/false, arguments,
-                      execute_backend_.get(), executor));
+                      backend, executor));
 
   if (profile != nullptr) {
     uint64 end_micros = tensorflow::Env::Default()->NowMicros();
@@ -569,21 +587,21 @@ StatusOr<GlobalDataHandle> Service::ExecuteAndRegisterResult(
     options.set_inter_op_thread_pool(backend->inter_op_thread_pool());
     options.set_intra_op_thread_pool(
         backend->eigen_intra_op_thread_pool_device());
-    run_options.emplace_back(options, backend->StreamBorrower());
+    run_options.emplace_back(options, backend->StreamBorrower(),
+                             backend->inter_op_thread_pool());
   }
 
   perftools::gputools::DeviceMemoryBase result;
   if (backend->Replicas().size() == 1) {
     TF_ASSIGN_OR_RETURN(
-        result,
-        ExecuteOnStreamWrapper<StatusOr<se::DeviceMemoryBase>>(
-            executable, &run_options[0], profile, execute_backend_.get(),
-            [&arguments](Executable* executable,
-                         const ServiceExecutableRunOptions* run_options,
-                         HloExecutionProfile* hlo_execution_profile) {
-              return executable->ExecuteOnStream(run_options, arguments,
-                                                 hlo_execution_profile);
-            }));
+        result, ExecuteOnStreamWrapper<StatusOr<se::DeviceMemoryBase>>(
+                    executable, &run_options[0], profile, backend,
+                    [&arguments](Executable* executable,
+                                 const ServiceExecutableRunOptions* run_options,
+                                 HloExecutionProfile* hlo_execution_profile) {
+                      return executable->ExecuteOnStream(run_options, arguments,
+                                                         hlo_execution_profile);
+                    }));
   } else {
     std::vector<
         tensorflow::gtl::ArraySlice<perftools::gputools::DeviceMemoryBase>>
@@ -666,7 +684,8 @@ tensorflow::Status Service::ExecuteParallel(const ExecuteParallelRequest* arg,
     // the program and the argument allocations.
     TF_ASSIGN_OR_RETURN(std::unique_ptr<HloModuleConfig> module_config,
                         CreateModuleConfig(*program_shape, arg_allocations,
-                                           request.execution_options()));
+                                           request.execution_options(),
+                                           execute_backend_.get()));
     VLOG(3) << "ExecuteParallel created HloModuleConfig computation layout: "
             << module_config->entry_computation_layout().ToString();
 
@@ -751,9 +770,10 @@ tensorflow::Status Service::Execute(const ExecuteRequest* arg,
       ResolveAndValidateArguments(arg->arguments(), execute_backend_.get(),
                                   execute_backend_->default_device_ordinal()));
 
-  TF_ASSIGN_OR_RETURN(std::unique_ptr<HloModuleConfig> module_config,
-                      CreateModuleConfig(*program_shape, arg_allocations,
-                                         arg->execution_options()));
+  TF_ASSIGN_OR_RETURN(
+      std::unique_ptr<HloModuleConfig> module_config,
+      CreateModuleConfig(*program_shape, arg_allocations,
+                         arg->execution_options(), execute_backend_.get()));
 
   VLOG(3) << "Execute created HloModuleConfig computation layout: "
           << module_config->entry_computation_layout().ToString();
@@ -818,9 +838,10 @@ tensorflow::Status Service::ExecuteAsync(const ExecuteAsyncRequest* arg,
       ResolveAndValidateArguments(arg->arguments(), execute_backend_.get(),
                                   execute_backend_->default_device_ordinal()));
 
-  TF_ASSIGN_OR_RETURN(std::unique_ptr<HloModuleConfig> module_config,
-                      CreateModuleConfig(*program_shape, arg_allocations,
-                                         arg->execution_options()));
+  TF_ASSIGN_OR_RETURN(
+      std::unique_ptr<HloModuleConfig> module_config,
+      CreateModuleConfig(*program_shape, arg_allocations,
+                         arg->execution_options(), execute_backend_.get()));
 
   VLOG(3) << "ExecuteAsync created HloModuleConfig computation layout: "
           << module_config->entry_computation_layout().ToString();
@@ -1141,7 +1162,8 @@ tensorflow::Status Service::ComputeConstant(const ComputeConstantRequest* arg,
   }
 
   TF_ASSIGN_OR_RETURN(std::unique_ptr<HloModuleConfig> module_config,
-                      CreateModuleConfig(program_shape, {}, execution_options));
+                      CreateModuleConfig(program_shape, {}, execution_options,
+                                         compute_constant_backend_.get()));
 
   TF_ASSIGN_OR_RETURN(
       std::shared_ptr<Executable> executable,
diff --git a/tensorflow/compiler/xla/service/service.h b/tensorflow/compiler/xla/service/service.h
index 9600f6989a40c9180d00ccabbeb29cb37a28900a..05a955137f8dfe7aa085058c5a6673ce8f2f77f1 100644
--- a/tensorflow/compiler/xla/service/service.h
+++ b/tensorflow/compiler/xla/service/service.h
@@ -63,9 +63,14 @@ class ServiceOptions {
   ServiceOptions& set_number_of_replicas(int number_of_replicas);
   int number_of_replicas() const;
 
+  // Sets the thread pool size for parallel execution of an individual operator.
+  ServiceOptions& set_intra_op_parallelism_threads(int num_threads);
+  int intra_op_parallelism_threads() const;
+
  private:
   perftools::gputools::Platform* platform_ = nullptr;
   int number_of_replicas_ = -1;
+  int intra_op_parallelism_threads_ = -1;
 };
 
 // The XLA service object, which is the same across all
@@ -265,11 +270,11 @@ class Service : public ServiceInterface {
       tensorflow::gtl::ArraySlice<const GlobalDataHandle*> arguments,
       const Backend* backend, int device_ordinal);
 
-  // Create a Hlo module config foe the given program shape and arguments.
+  // Create a Hlo module config for the given program shape and arguments.
   StatusOr<std::unique_ptr<HloModuleConfig>> CreateModuleConfig(
       const ProgramShape& program_shape,
       tensorflow::gtl::ArraySlice<const Allocation*> arguments,
-      const ExecutionOptions& execution_options);
+      const ExecutionOptions& execution_options, Backend* backend);
 
   // Builds an Executable for the given parameters. If
   // executable_for_compute_constant is true, then the executable is intended to
diff --git a/tensorflow/compiler/xla/service/service_executable_run_options.h b/tensorflow/compiler/xla/service/service_executable_run_options.h
index 0d4b214f5f3624971ae68e23f0f4fdba846f9178..017e5ef09ed2f52b862821e9408540d188a1edf5 100644
--- a/tensorflow/compiler/xla/service/service_executable_run_options.h
+++ b/tensorflow/compiler/xla/service/service_executable_run_options.h
@@ -30,10 +30,12 @@ class ServiceExecutableRunOptions {
   using StreamBorrower =
       std::function<StatusOr<Pool<perftools::gputools::Stream>::SmartPtr>(int)>;
 
-  explicit ServiceExecutableRunOptions(ExecutableRunOptions run_options,
-                                       StreamBorrower borrow_stream = nullptr)
+  explicit ServiceExecutableRunOptions(
+      ExecutableRunOptions run_options, StreamBorrower borrow_stream = nullptr,
+      tensorflow::thread::ThreadPool* xla_intra_op_thread_pool = nullptr)
       : run_options_(std::move(run_options)),
-        borrow_stream_(std::move(borrow_stream)) {}
+        borrow_stream_(std::move(borrow_stream)),
+        xla_intra_op_thread_pool_(xla_intra_op_thread_pool) {}
 
   // Returns reference or pointer to `ExecutableRunOptions` member.
   const ExecutableRunOptions& run_options() const { return run_options_; }
@@ -53,9 +55,15 @@ class ServiceExecutableRunOptions {
                : Status(tensorflow::error::UNIMPLEMENTED, "No stream cache");
   }
 
+  // Returns reference to thread pool for execution of XLA ops on CPU backend.
+  tensorflow::thread::ThreadPool* xla_intra_op_thread_pool() const {
+    return xla_intra_op_thread_pool_;
+  }
+
  private:
   ExecutableRunOptions run_options_;
   StreamBorrower borrow_stream_;
+  tensorflow::thread::ThreadPool* xla_intra_op_thread_pool_;
 };
 
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/shape_inference.cc b/tensorflow/compiler/xla/service/shape_inference.cc
index 338d63f1a002b490ac3017afafdf3743eb29b503..b2ef8ed486b5ab4643cb0e26fa6c18e1f3894a4b 100644
--- a/tensorflow/compiler/xla/service/shape_inference.cc
+++ b/tensorflow/compiler/xla/service/shape_inference.cc
@@ -244,8 +244,11 @@ StatusOr<Shape> InferWindowOutputShape(const Shape& base_shape,
     }
     if (ShapeUtil::Rank(*arg_shape) != ShapeUtil::Rank(*shape)) {
       return InvalidArgument(
-          "cannot concatenate arrays with different ranks: %lld vs %lld",
-          ShapeUtil::Rank(*arg_shape), ShapeUtil::Rank(*shape));
+          "Cannot concatenate arrays with different ranks: %lld (%s) vs %lld "
+          "(%s)",
+          ShapeUtil::Rank(*arg_shape),
+          ShapeUtil::HumanString(*arg_shape).c_str(), ShapeUtil::Rank(*shape),
+          ShapeUtil::HumanString(*shape).c_str());
     }
     if (arg_shape->element_type() != shape->element_type()) {
       return InvalidArgument(
diff --git a/tensorflow/compiler/xla/service/transpose_folding.cc b/tensorflow/compiler/xla/service/transpose_folding.cc
index cfb90e6e1d49ff49572977d938a53593970ad912..a0c88c6bbc23972bb6a0f3729e51ee0eaee72bc7 100644
--- a/tensorflow/compiler/xla/service/transpose_folding.cc
+++ b/tensorflow/compiler/xla/service/transpose_folding.cc
@@ -76,8 +76,7 @@ using InstructionOperandsPair =
 // the parent HLO computation of `dot`.
 //
 // Returns whether the module is changed.
-bool FoldTransposeIntoDot(InstructionOperandsPair pair,
-                          HloComputation* computation) {
+bool FoldTransposeIntoDot(InstructionOperandsPair pair) {
   auto* dot = pair.first;
   std::vector<HloInstruction*> instructions_to_fuse(1, dot);
   for (const int64 operand_index : pair.second) {
@@ -89,7 +88,7 @@ bool FoldTransposeIntoDot(InstructionOperandsPair pair,
     return false;
   }
 
-  computation->CreateFusionInstruction(
+  dot->parent()->CreateFusionInstruction(
       instructions_to_fuse, HloInstruction::FusionKind::kTransposeDot);
   return true;
 }
@@ -98,8 +97,7 @@ bool FoldTransposeIntoDot(InstructionOperandsPair pair,
 // `computation` is the parent HLO computation of `convolution`.
 //
 // Returns whether the module is changed.
-bool FoldTransposeIntoConvolution(InstructionOperandsPair pair,
-                                  HloComputation* computation) {
+bool FoldTransposeIntoConvolution(InstructionOperandsPair pair) {
   auto& convolution = *pair.first;
 
   // We only support fusing the RHS transpose into convolution.
@@ -135,8 +133,8 @@ bool FoldTransposeIntoConvolution(InstructionOperandsPair pair,
   auto new_conv = HloInstruction::CreateConvolve(
       convolution.shape(), convolution.mutable_operand(0), &transpose_operand,
       convolution.window(), new_dnums);
-  TF_CHECK_OK(computation->ReplaceWithNewInstruction(&convolution,
-                                                     std::move(new_conv)));
+  TF_CHECK_OK(convolution.parent()->ReplaceWithNewInstruction(
+      &convolution, std::move(new_conv)));
 
   return true;
 }
@@ -152,8 +150,6 @@ TransposeFolding::TransposeFolding(
 StatusOr<bool> TransposeFolding::Run(HloModule* module) {
   // Modifying the graph while traversing is dangerous, so we find all folding
   // opportunities before actually folding them.
-  HloComputation* entry_computation = module->entry_computation();
-
   std::vector<std::pair<HloInstruction*, OperandIndices>> foldable_dots;
   std::vector<std::pair<HloInstruction*, OperandIndices>> foldable_convolutions;
   auto visit_fn = [this, &foldable_dots,
@@ -175,14 +171,17 @@ StatusOr<bool> TransposeFolding::Run(HloModule* module) {
     }
     return tensorflow::Status::OK();
   };
-  TF_RETURN_IF_ERROR(entry_computation->root_instruction()->Accept(visit_fn));
+
+  for (auto& comp : module->computations()) {
+    TF_RETURN_IF_ERROR(comp->Accept(visit_fn));
+  }
 
   bool changed = false;
   for (InstructionOperandsPair& pair : foldable_dots) {
-    changed |= FoldTransposeIntoDot(pair, entry_computation);
+    changed |= FoldTransposeIntoDot(pair);
   }
   for (InstructionOperandsPair& pair : foldable_convolutions) {
-    changed |= FoldTransposeIntoConvolution(pair, entry_computation);
+    changed |= FoldTransposeIntoConvolution(pair);
   }
   return changed;
 }
diff --git a/tensorflow/compiler/xla/service/transpose_folding_test.cc b/tensorflow/compiler/xla/service/transpose_folding_test.cc
index 6643f541daeb5f3dd3f36e1063eea951e604ad69..c72d127ea86e4e9daf99dff4335c538c081f0605 100644
--- a/tensorflow/compiler/xla/service/transpose_folding_test.cc
+++ b/tensorflow/compiler/xla/service/transpose_folding_test.cc
@@ -41,9 +41,7 @@ class TransposeFoldingTest : public ::testing::Test {
     TransposeFolding transpose_folding(
         [](const HloInstruction& dot,
            const TransposeFolding::OperandIndices& candidate_operands) {
-          return gpu::ImplementedAsGemm(dot)
-                     ? candidate_operands
-                     : TransposeFolding::OperandIndices{};
+          return candidate_operands;
         },
         [](const HloInstruction& convolution,
            const TransposeFolding::OperandIndices& candidate_operands) {
@@ -159,6 +157,50 @@ TEST_F(TransposeFoldingTest, FuseDotWithConstantOperands) {
   EXPECT_EQ(6, callee_computation->instructions().size());
 }
 
+TEST_F(TransposeFoldingTest, FoldDotTransposeInWhile) {
+  auto builder = HloComputation::Builder("entry_computation");
+  HloInstruction* x = builder.AddInstruction(HloInstruction::CreateParameter(
+      /*parameter_number=*/0, ShapeUtil::MakeShape(F32, {2, 3}),
+      /*name=*/"x"));
+  HloInstruction* y = builder.AddInstruction(HloInstruction::CreateParameter(
+      /*parameter_number=*/1, ShapeUtil::MakeShape(F32, {2, 3}),
+      /*name=*/"y"));
+  HloInstruction* transpose_y =
+      builder.AddInstruction(HloInstruction::CreateTranspose(
+          ShapeUtil::MakeShape(F32, {3, 2}), y, {1, 0}));
+  HloInstruction* dot = builder.AddInstruction(HloInstruction::CreateBinary(
+      ShapeUtil::MakeShape(F32, {2, 2}), /*opcode=*/HloOpcode::kDot,
+      /*lhs=*/x, /*rhs=*/transpose_y));
+
+  HloModule module("test_module");
+  HloComputation* entry_computation =
+      module.AddEntryComputation(builder.Build(dot));
+
+  HloInstruction* call = module.OutlineExpressionFromComputation(
+      {transpose_y, dot}, "outlined", entry_computation);
+
+  FoldTranspose(&module);
+
+  // Instructions after folding: x, y, and the fusion.
+  std::unordered_set<HloInstruction*> instruction_set;
+  for (auto& instruction : entry_computation->instructions()) {
+    instruction_set.insert(instruction.get());
+  }
+  CHECK_EQ(1, instruction_set.erase(x)) << "x is not in entry_computation.";
+  CHECK_EQ(1, instruction_set.erase(y)) << "y is not in entry_computation.";
+  CHECK_EQ(1, instruction_set.erase(call))
+      << "call is not in entry_computation.";
+  CHECK(instruction_set.empty())
+      << "entry_computation should contain exactly 3 instructions.";
+  HloInstruction* fusion =
+      call->called_computations().front()->root_instruction();
+  EXPECT_EQ(HloOpcode::kFusion, fusion->opcode());
+
+  // The fusion instruction should contain two parameters, one transpose and
+  // one dot.
+  EXPECT_EQ(4, fusion->fused_instructions().size());
+}
+
 // Test that a two dimension swap of the kernel gets folded into convolution.
 TEST_F(TransposeFoldingTest, FoldConvDimSwapTransposeRhs) {
   auto builder = HloComputation::Builder("entry_computation");
diff --git a/tensorflow/compiler/xla/service/tuple_points_to_analysis.cc b/tensorflow/compiler/xla/service/tuple_points_to_analysis.cc
index 98c51b48f9022c5f2d1e23b59a6ce775f3a48e0b..554adaf0e32f7cb896e07a59d5235ff84a11bb92 100644
--- a/tensorflow/compiler/xla/service/tuple_points_to_analysis.cc
+++ b/tensorflow/compiler/xla/service/tuple_points_to_analysis.cc
@@ -131,10 +131,9 @@ void PointsToSet::add_tuple_source(const ShapeIndex& index,
 }
 
 /* static */ StatusOr<std::unique_ptr<TuplePointsToAnalysis>>
-TuplePointsToAnalysis::Run(const HloModule* module,
-                           const bool include_loop_fusion_instructions) {
+TuplePointsToAnalysis::Run(const HloModule* module) {
   std::unique_ptr<TuplePointsToAnalysis> analysis(
-      new TuplePointsToAnalysis(module, include_loop_fusion_instructions));
+      new TuplePointsToAnalysis(module));
   TF_RETURN_IF_ERROR(analysis->Analyze());
   return std::move(analysis);
 }
@@ -145,17 +144,14 @@ Status TuplePointsToAnalysis::Analyze() {
     TF_RETURN_IF_ERROR(computation->Accept(this));
     TF_RETURN_IF_ERROR(
         PopulateDefinedBuffersAndAliases(computation->instructions()));
-    if (include_loop_fusion_instructions_) {
-      // Run points-to analysis on loop fusion instructions in 'computation'.
-      for (auto& instruction : computation->instructions()) {
-        if (instruction->opcode() != HloOpcode::kFusion ||
-            instruction->fusion_kind() != HloInstruction::FusionKind::kLoop) {
-          continue;
-        }
-        TF_RETURN_IF_ERROR(instruction->fused_expression_root()->Accept(this));
-        TF_RETURN_IF_ERROR(PopulateDefinedBuffersAndAliases(
-            instruction->fused_instructions()));
+    // Run points-to analysis on fusion instructions in 'computation'.
+    for (auto& instruction : computation->instructions()) {
+      if (instruction->opcode() != HloOpcode::kFusion) {
+        continue;
       }
+      TF_RETURN_IF_ERROR(instruction->fused_expression_root()->Accept(this));
+      TF_RETURN_IF_ERROR(
+          PopulateDefinedBuffersAndAliases(instruction->fused_instructions()));
     }
   }
 
@@ -482,9 +478,7 @@ string TuplePointsToAnalysis::ToString() const {
     for (const HloInstruction* instruction :
          computation->MakeInstructionPostOrder()) {
       InstructionToString(instruction, &output);
-      if (include_loop_fusion_instructions_ &&
-          instruction->opcode() == HloOpcode::kFusion &&
-          instruction->fusion_kind() == HloInstruction::FusionKind::kLoop) {
+      if (instruction->opcode() == HloOpcode::kFusion) {
         for (auto& fused : instruction->fused_instructions()) {
           InstructionToString(fused.get(), &output);
         }
diff --git a/tensorflow/compiler/xla/service/tuple_points_to_analysis.h b/tensorflow/compiler/xla/service/tuple_points_to_analysis.h
index a384529171a7371c848ca8949d22cb6717d83a78..85a71b56ce5e9fb1a3441c302e18bd1fa7b68864 100644
--- a/tensorflow/compiler/xla/service/tuple_points_to_analysis.h
+++ b/tensorflow/compiler/xla/service/tuple_points_to_analysis.h
@@ -148,12 +148,9 @@ std::ostream& operator<<(std::ostream& out, const BufferAlias& buffer_alias);
 // the potential sources of each buffer in each instruction's output.
 class TuplePointsToAnalysis : public DfsHloVisitorWithDefault {
  public:
-  // Runs points-to analysis on 'module'. If 'include_loop_fusion_instructions'
-  // is true, includes fused instructions from each loop fusion instruction
-  // in 'module' in the points-to analysis.
+  // Runs points-to analysis on 'module'.
   static StatusOr<std::unique_ptr<TuplePointsToAnalysis>> Run(
-      const HloModule* module,
-      const bool include_loop_fusion_instructions = false);
+      const HloModule* module);
 
   // Return the points-to set of an instruction. This describes the potential
   // sources of each buffer in the instruction's output.
@@ -218,10 +215,7 @@ class TuplePointsToAnalysis : public DfsHloVisitorWithDefault {
   string ToString() const;
 
  private:
-  explicit TuplePointsToAnalysis(const HloModule* module,
-                                 const bool include_loop_fusion_instructions)
-      : module_(module),
-        include_loop_fusion_instructions_(include_loop_fusion_instructions) {}
+  explicit TuplePointsToAnalysis(const HloModule* module) : module_(module) {}
 
   // Perform the analysis. Should be called immediately after constructing the
   // object and before calling GetPointsToSet.
@@ -261,9 +255,6 @@ class TuplePointsToAnalysis : public DfsHloVisitorWithDefault {
   // The module this analysis is performed on.
   const HloModule* module_;
 
-  // Whether to run points-to analysis on loop fusion instructions in 'module_'.
-  const bool include_loop_fusion_instructions_;
-
   // A map containing a PointsToSet for every HLO instruction.
   tensorflow::gtl::FlatMap<const HloInstruction*, std::unique_ptr<PointsToSet>>
       points_to_;
diff --git a/tensorflow/compiler/xla/service/tuple_points_to_analysis_test.cc b/tensorflow/compiler/xla/service/tuple_points_to_analysis_test.cc
index 808050bdabd188a51d03141fc7ebe3500b2cf110..87e1b058b79c0dc327cc1ad63a8cffa97c190df4 100644
--- a/tensorflow/compiler/xla/service/tuple_points_to_analysis_test.cc
+++ b/tensorflow/compiler/xla/service/tuple_points_to_analysis_test.cc
@@ -52,11 +52,10 @@ class TuplePointsToAnalysisTest : public HloTestBase {
     module_->AddEntryComputation(std::move(computation));
   }
 
-  void RunAnalysis(const bool include_loop_fusion_instructions = false) {
+  void RunAnalysis() {
     CHECK_NOTNULL(module_.get());
-    points_to_analysis_ = TuplePointsToAnalysis::Run(
-                              module_.get(), include_loop_fusion_instructions)
-                              .ConsumeValueOrDie();
+    points_to_analysis_ =
+        TuplePointsToAnalysis::Run(module_.get()).ConsumeValueOrDie();
   }
 
   // Returns the LogicalBuffer defined at the given instruction and
@@ -609,7 +608,7 @@ class FusionPointsToAnalysisTest : public TuplePointsToAnalysisTest {
     auto* fusion = module_->entry_computation()->root_instruction();
     EXPECT_THAT(fusion, op::Fusion(tuple_param0));
     // Run points-to analysis (should include fused instructions from 'fusion').
-    RunAnalysis(/*include_loop_fusion_instructions=*/true);
+    RunAnalysis();
 
     // Check points-to set of fusion parameter associated with 'tuple_param0'.
     auto* fusion_param = GetFusionParameterForOperand(fusion, tuple_param0);
diff --git a/tensorflow/compiler/xla/service/user_computation.cc b/tensorflow/compiler/xla/service/user_computation.cc
index 34e8ee8acade129f2f43a399cb807b2032cd95a6..e9fcc9fa6666bb2e3c24252e1c0f5e8d763a5d48 100644
--- a/tensorflow/compiler/xla/service/user_computation.cc
+++ b/tensorflow/compiler/xla/service/user_computation.cc
@@ -1928,6 +1928,12 @@ HloInstruction* ComputationLowerer::Visit(
 
   const OperationRequest& request =
       session_computation_.requests().at(handle.handle());
+  auto add_instruction = [&](std::unique_ptr<HloInstruction> instruction) {
+    HloInstruction* hlo_instruction =
+        hlo_builder_.AddInstruction(std::move(instruction));
+    hlo_instruction->set_metadata(request.request().metadata());
+    return hlo_instruction;
+  };
   HloInstruction* hlo_instruction;
   switch (request.request().op_case()) {
     case OpRequest::kRngRequest: {
@@ -1936,7 +1942,7 @@ HloInstruction* ComputationLowerer::Visit(
       for (const ComputationDataHandle& param : rng_request.parameter()) {
         parameters.push_back(Visit(param, visited));
       }
-      hlo_instruction = hlo_builder_.AddInstruction(HloInstruction::CreateRng(
+      hlo_instruction = add_instruction(HloInstruction::CreateRng(
           request.output_shape(), rng_request.distribution(), parameters));
       break;
     }
@@ -1944,9 +1950,8 @@ HloInstruction* ComputationLowerer::Visit(
     case OpRequest::kConstantRequest: {
       const ConstantRequest& constant_request =
           request.request().constant_request();
-      hlo_instruction =
-          hlo_builder_.AddInstruction(HloInstruction::CreateConstant(
-              LiteralUtil::CloneToUnique(constant_request.literal())));
+      hlo_instruction = add_instruction(HloInstruction::CreateConstant(
+          LiteralUtil::CloneToUnique(constant_request.literal())));
       break;
     }
 
@@ -1955,17 +1960,15 @@ HloInstruction* ComputationLowerer::Visit(
           request.request().get_tuple_element_request();
       HloInstruction* operand =
           Visit(get_tuple_element_request.operand(), visited);
-      hlo_instruction =
-          hlo_builder_.AddInstruction(HloInstruction::CreateGetTupleElement(
-              request.output_shape(), operand,
-              get_tuple_element_request.index()));
+      hlo_instruction = add_instruction(HloInstruction::CreateGetTupleElement(
+          request.output_shape(), operand, get_tuple_element_request.index()));
       break;
     }
 
     case OpRequest::kSliceRequest: {
       const SliceRequest& slice_request = request.request().slice_request();
       HloInstruction* operand = Visit(slice_request.operand(), visited);
-      hlo_instruction = hlo_builder_.AddInstruction(HloInstruction::CreateSlice(
+      hlo_instruction = add_instruction(HloInstruction::CreateSlice(
           request.output_shape(), operand,
           AsInt64Slice(slice_request.start_indices()),
           AsInt64Slice(slice_request.limit_indices())));
@@ -1979,10 +1982,9 @@ HloInstruction* ComputationLowerer::Visit(
       HloInstruction* start_indices =
           Visit(dynamic_slice_request.start_indices(), visited);
 
-      hlo_instruction =
-          hlo_builder_.AddInstruction(HloInstruction::CreateDynamicSlice(
-              request.output_shape(), operand, start_indices,
-              AsInt64Slice(dynamic_slice_request.slice_sizes())));
+      hlo_instruction = add_instruction(HloInstruction::CreateDynamicSlice(
+          request.output_shape(), operand, start_indices,
+          AsInt64Slice(dynamic_slice_request.slice_sizes())));
       break;
     }
 
@@ -1996,7 +1998,7 @@ HloInstruction* ComputationLowerer::Visit(
       HloInstruction* start_indices =
           Visit(dynamic_update_slice_request.start_indices(), visited);
       hlo_instruction =
-          hlo_builder_.AddInstruction(HloInstruction::CreateDynamicUpdateSlice(
+          add_instruction(HloInstruction::CreateDynamicUpdateSlice(
               request.output_shape(), operand, update, start_indices));
       break;
     }
@@ -2010,9 +2012,8 @@ HloInstruction* ComputationLowerer::Visit(
         HloInstruction* operand = Visit(handle, visited);
         operands.push_back(operand);
       }
-      hlo_instruction = hlo_builder_.AddInstruction(
-          HloInstruction::CreateConcatenate(request.output_shape(), operands,
-                                            concatenate_request.dimension()));
+      hlo_instruction = add_instruction(HloInstruction::CreateConcatenate(
+          request.output_shape(), operands, concatenate_request.dimension()));
       break;
     }
 
@@ -2021,10 +2022,9 @@ HloInstruction* ComputationLowerer::Visit(
           request.request().convolve_request();
       HloInstruction* lhs = Visit(convolve_request.lhs(), visited);
       HloInstruction* rhs = Visit(convolve_request.rhs(), visited);
-      hlo_instruction =
-          hlo_builder_.AddInstruction(HloInstruction::CreateConvolve(
-              request.output_shape(), lhs, rhs, convolve_request.window(),
-              convolve_request.dimension_numbers()));
+      hlo_instruction = add_instruction(HloInstruction::CreateConvolve(
+          request.output_shape(), lhs, rhs, convolve_request.window(),
+          convolve_request.dimension_numbers()));
       break;
     }
 
@@ -2033,17 +2033,15 @@ HloInstruction* ComputationLowerer::Visit(
           request.request().cross_replica_sum_request();
       HloInstruction* operand =
           Visit(cross_replica_sum_request.operand(), visited);
-      hlo_instruction =
-          hlo_builder_.AddInstruction(HloInstruction::CreateCrossReplicaSum(
-              request.output_shape(), operand));
+      hlo_instruction = add_instruction(HloInstruction::CreateCrossReplicaSum(
+          request.output_shape(), operand));
       break;
     }
 
     case OpRequest::kInfeedRequest: {
       const InfeedRequest& infeed_request = request.request().infeed_request();
-      hlo_instruction =
-          hlo_builder_.AddInstruction(HloInstruction::CreateInfeed(
-              request.output_shape(), infeed_request.config()));
+      hlo_instruction = add_instruction(HloInstruction::CreateInfeed(
+          request.output_shape(), infeed_request.config()));
       break;
     }
 
@@ -2051,9 +2049,8 @@ HloInstruction* ComputationLowerer::Visit(
       const OutfeedRequest& outfeed_request =
           request.request().outfeed_request();
       HloInstruction* operand = Visit(outfeed_request.operand(), visited);
-      hlo_instruction = hlo_builder_.AddInstruction(
-          HloInstruction::CreateOutfeed(outfeed_request.shape(), operand,
-                                        outfeed_request.outfeed_config()));
+      hlo_instruction = add_instruction(HloInstruction::CreateOutfeed(
+          outfeed_request.shape(), operand, outfeed_request.outfeed_config()));
       break;
     }
 
@@ -2069,7 +2066,7 @@ HloInstruction* ComputationLowerer::Visit(
           request.embedded_computation_versions(0);
       HloComputation* map_computation =
           ResolveComputation(map_request.to_apply(), map_version);
-      hlo_instruction = hlo_builder_.AddInstruction(HloInstruction::CreateMap(
+      hlo_instruction = add_instruction(HloInstruction::CreateMap(
           request.output_shape(), operands, map_computation));
       break;
     }
@@ -2083,10 +2080,9 @@ HloInstruction* ComputationLowerer::Visit(
           request.embedded_computation_versions(0);
       HloComputation* reduce_computation =
           ResolveComputation(reduce_request.to_apply(), reduce_version);
-      hlo_instruction =
-          hlo_builder_.AddInstruction(HloInstruction::CreateReduce(
-              request.output_shape(), operand, init_value,
-              AsInt64Slice(reduce_request.dimensions()), reduce_computation));
+      hlo_instruction = add_instruction(HloInstruction::CreateReduce(
+          request.output_shape(), operand, init_value,
+          AsInt64Slice(reduce_request.dimensions()), reduce_computation));
       break;
     }
 
@@ -2101,10 +2097,9 @@ HloInstruction* ComputationLowerer::Visit(
           request.embedded_computation_versions(0);
       HloComputation* reduce_window_computation = ResolveComputation(
           reduce_window_request.to_apply(), reduce_window_version);
-      hlo_instruction =
-          hlo_builder_.AddInstruction(HloInstruction::CreateReduceWindow(
-              request.output_shape(), operand, init_value,
-              reduce_window_request.window(), reduce_window_computation));
+      hlo_instruction = add_instruction(HloInstruction::CreateReduceWindow(
+          request.output_shape(), operand, init_value,
+          reduce_window_request.window(), reduce_window_computation));
       break;
     }
 
@@ -2126,11 +2121,10 @@ HloInstruction* ComputationLowerer::Visit(
           select_and_scatter_request.select(), select_version);
       HloComputation* scatter_computation = ResolveComputation(
           select_and_scatter_request.scatter(), scatter_version);
-      hlo_instruction =
-          hlo_builder_.AddInstruction(HloInstruction::CreateSelectAndScatter(
-              request.output_shape(), operand, select_computation,
-              select_and_scatter_request.window(), source, init_value,
-              scatter_computation));
+      hlo_instruction = add_instruction(HloInstruction::CreateSelectAndScatter(
+          request.output_shape(), operand, select_computation,
+          select_and_scatter_request.window(), source, init_value,
+          scatter_computation));
       break;
     }
 
@@ -2151,9 +2145,8 @@ HloInstruction* ComputationLowerer::Visit(
                                        ShapeUtil::Rank(request.output_shape()) -
                                        ShapeUtil::Rank(operand->shape()));
       }
-      hlo_instruction =
-          hlo_builder_.AddInstruction(HloInstruction::CreateBroadcast(
-              request.output_shape(), operand, broadcast_dimensions));
+      hlo_instruction = add_instruction(HloInstruction::CreateBroadcast(
+          request.output_shape(), operand, broadcast_dimensions));
       break;
     }
 
@@ -2165,14 +2158,13 @@ HloInstruction* ComputationLowerer::Visit(
       if (IsIdentityPermutation(AsInt64Slice(reshape_request.dimensions()))) {
         transposed = operand;
       } else {
-        transposed =
-            hlo_builder_.AddInstruction(HloInstruction::CreateTranspose(
-                ShapeUtil::PermuteDimensions(InversePermutation(AsInt64Slice(
-                                                 reshape_request.dimensions())),
-                                             operand->shape()),
-                operand, AsInt64Slice(reshape_request.dimensions())));
+        transposed = add_instruction(HloInstruction::CreateTranspose(
+            ShapeUtil::PermuteDimensions(
+                InversePermutation(AsInt64Slice(reshape_request.dimensions())),
+                operand->shape()),
+            operand, AsInt64Slice(reshape_request.dimensions())));
       }
-      hlo_instruction = hlo_builder_.AddInstruction(
+      hlo_instruction = add_instruction(
           HloInstruction::CreateReshape(request.output_shape(), transposed));
       break;
     }
@@ -2181,12 +2173,11 @@ HloInstruction* ComputationLowerer::Visit(
       const TransposeRequest& transpose_request =
           request.request().transpose_request();
       HloInstruction* operand = Visit(transpose_request.operand(), visited);
-      hlo_instruction =
-          hlo_builder_.AddInstruction(HloInstruction::CreateTranspose(
-              ShapeUtil::PermuteDimensions(InversePermutation(AsInt64Slice(
-                                               transpose_request.dimensions())),
-                                           operand->shape()),
-              operand, AsInt64Slice(transpose_request.dimensions())));
+      hlo_instruction = add_instruction(HloInstruction::CreateTranspose(
+          ShapeUtil::PermuteDimensions(
+              InversePermutation(AsInt64Slice(transpose_request.dimensions())),
+              operand->shape()),
+          operand, AsInt64Slice(transpose_request.dimensions())));
       break;
     }
 
@@ -2194,10 +2185,9 @@ HloInstruction* ComputationLowerer::Visit(
       const ReverseRequest& reverse_request =
           request.request().reverse_request();
       HloInstruction* operand = Visit(reverse_request.operand(), visited);
-      hlo_instruction =
-          hlo_builder_.AddInstruction(HloInstruction::CreateReverse(
-              request.output_shape(), operand,
-              AsInt64Slice(reverse_request.dimensions())));
+      hlo_instruction = add_instruction(HloInstruction::CreateReverse(
+          request.output_shape(), operand,
+          AsInt64Slice(reverse_request.dimensions())));
       break;
     }
 
@@ -2206,7 +2196,7 @@ HloInstruction* ComputationLowerer::Visit(
       HloInstruction* operand = Visit(pad_request.operand(), visited);
       HloInstruction* padding_value =
           Visit(pad_request.padding_value(), visited);
-      hlo_instruction = hlo_builder_.AddInstruction(HloInstruction::CreatePad(
+      hlo_instruction = add_instruction(HloInstruction::CreatePad(
           request.output_shape(), operand, padding_value,
           pad_request.padding_config()));
       break;
@@ -2214,7 +2204,7 @@ HloInstruction* ComputationLowerer::Visit(
 
     case OpRequest::kRecvRequest: {
       const RecvRequest& recv_request = request.request().recv_request();
-      hlo_instruction = hlo_builder_.AddInstruction(HloInstruction::CreateRecv(
+      hlo_instruction = add_instruction(HloInstruction::CreateRecv(
           request.output_shape(), recv_request.channel_handle().handle()));
       break;
     }
@@ -2222,10 +2212,9 @@ HloInstruction* ComputationLowerer::Visit(
     case OpRequest::kParameterRequest: {
       const ParameterRequest& parameter_request =
           request.request().parameter_request();
-      hlo_instruction =
-          hlo_builder_.AddInstruction(HloInstruction::CreateParameter(
-              parameter_request.parameter(), request.output_shape(),
-              parameter_request.name()));
+      hlo_instruction = add_instruction(HloInstruction::CreateParameter(
+          parameter_request.parameter(), request.output_shape(),
+          parameter_request.name()));
       break;
     }
 
@@ -2233,7 +2222,7 @@ HloInstruction* ComputationLowerer::Visit(
       const ConvertRequest& convert_request =
           request.request().convert_request();
       HloInstruction* operand = Visit(convert_request.operand(), visited);
-      hlo_instruction = hlo_builder_.AddInstruction(
+      hlo_instruction = add_instruction(
           HloInstruction::CreateConvert(request.output_shape(), operand));
       break;
     }
@@ -2250,7 +2239,7 @@ HloInstruction* ComputationLowerer::Visit(
       HloComputation* body =
           ResolveComputation(while_request.body(), body_version);
       HloInstruction* init = Visit(while_request.init(), visited);
-      hlo_instruction = hlo_builder_.AddInstruction(HloInstruction::CreateWhile(
+      hlo_instruction = add_instruction(HloInstruction::CreateWhile(
           request.output_shape(), condition, body, init));
       break;
     }
@@ -2262,9 +2251,8 @@ HloInstruction* ComputationLowerer::Visit(
       HloInstruction* rhs = Visit(ternary_op_request.rhs(), visited);
       HloInstruction* ehs = Visit(ternary_op_request.ehs(), visited);
       auto hlo_opcode = TernaryOperationToHloOpcode(ternary_op_request.triop());
-      hlo_instruction =
-          hlo_builder_.AddInstruction(HloInstruction::CreateTernary(
-              request.output_shape(), hlo_opcode, lhs, rhs, ehs));
+      hlo_instruction = add_instruction(HloInstruction::CreateTernary(
+          request.output_shape(), hlo_opcode, lhs, rhs, ehs));
       break;
     }
 
@@ -2279,9 +2267,8 @@ HloInstruction* ComputationLowerer::Visit(
       }
       auto hlo_opcode =
           VariadicOperationToHloOpcode(variadic_op_request.varop());
-      hlo_instruction =
-          hlo_builder_.AddInstruction(HloInstruction::CreateVariadic(
-              request.output_shape(), hlo_opcode, operands));
+      hlo_instruction = add_instruction(HloInstruction::CreateVariadic(
+          request.output_shape(), hlo_opcode, operands));
       break;
     }
 
@@ -2296,7 +2283,7 @@ HloInstruction* ComputationLowerer::Visit(
           request.embedded_computation_versions(0);
       HloComputation* call_computation =
           ResolveComputation(call_request.to_apply(), call_version);
-      hlo_instruction = hlo_builder_.AddInstruction(HloInstruction::CreateCall(
+      hlo_instruction = add_instruction(HloInstruction::CreateCall(
           request.output_shape(), operands, call_computation));
       break;
     }
@@ -2308,9 +2295,8 @@ HloInstruction* ComputationLowerer::Visit(
       for (const ComputationDataHandle& operand : cc_request.operands()) {
         operands.push_back(Visit(operand, visited));
       }
-      hlo_instruction =
-          hlo_builder_.AddInstruction(HloInstruction::CreateCustomCall(
-              cc_request.shape(), operands, cc_request.call_target_name()));
+      hlo_instruction = add_instruction(HloInstruction::CreateCustomCall(
+          cc_request.shape(), operands, cc_request.call_target_name()));
       break;
     }
 
@@ -2319,7 +2305,7 @@ HloInstruction* ComputationLowerer::Visit(
           request.request().unary_op_request();
       HloInstruction* operand = Visit(unary_op_request.operand(), visited);
       auto hlo_opcode = UnaryOperationToHloOpcode(unary_op_request.unop());
-      hlo_instruction = hlo_builder_.AddInstruction(HloInstruction::CreateUnary(
+      hlo_instruction = add_instruction(HloInstruction::CreateUnary(
           request.output_shape(), hlo_opcode, operand));
       break;
     }
@@ -2347,23 +2333,22 @@ HloInstruction* ComputationLowerer::Visit(
         // identical to the HLO broadcast semantics so the broadcast_dimensions
         // field can just be passed to the instruction builder.
         HloInstruction* broadcasted_operand =
-            hlo_builder_.AddInstruction(HloInstruction::CreateBroadcast(
+            add_instruction(HloInstruction::CreateBroadcast(
                 broadcast_shape, operand_to_broadcast,
                 AsInt64Slice(binary_op_request.broadcast_dimensions())));
 
         lhs = (lhs == operand_to_broadcast) ? broadcasted_operand : lhs;
         rhs = (rhs == operand_to_broadcast) ? broadcasted_operand : rhs;
       }
-      hlo_instruction =
-          hlo_builder_.AddInstruction(HloInstruction::CreateBinary(
-              request.output_shape(), hlo_opcode, lhs, rhs));
+      hlo_instruction = add_instruction(HloInstruction::CreateBinary(
+          request.output_shape(), hlo_opcode, lhs, rhs));
       break;
     }
 
     case OpRequest::kTraceRequest: {
       const TraceRequest& trace_request = request.request().trace_request();
       HloInstruction* operand = Visit(trace_request.operand(), visited);
-      hlo_instruction = hlo_builder_.AddInstruction(
+      hlo_instruction = add_instruction(
           HloInstruction::CreateTrace(trace_request.tag(), operand));
       operand->set_tracing(hlo_instruction);
       break;
@@ -2372,7 +2357,7 @@ HloInstruction* ComputationLowerer::Visit(
     case OpRequest::kSendRequest: {
       const SendRequest& send_request = request.request().send_request();
       HloInstruction* operand = Visit(send_request.operand(), visited);
-      hlo_instruction = hlo_builder_.AddInstruction(HloInstruction::CreateSend(
+      hlo_instruction = add_instruction(HloInstruction::CreateSend(
           operand, send_request.channel_handle().handle()));
       break;
     }
@@ -2383,7 +2368,6 @@ HloInstruction* ComputationLowerer::Visit(
     default:
       LOG(FATAL) << "Unexpected request type: " << request.request().op_case();
   }
-  hlo_instruction->set_metadata(request.request().metadata());
   (*visited)[handle.handle()] = hlo_instruction;
   return hlo_instruction;
 }
diff --git a/tensorflow/compiler/xla/service/user_computation_test.cc b/tensorflow/compiler/xla/service/user_computation_test.cc
index 032b5cfac604a92bdf150a7fcee57e91bee65508..cf04cfde5003d70e26ce0a1543039c18c19282c9 100644
--- a/tensorflow/compiler/xla/service/user_computation_test.cc
+++ b/tensorflow/compiler/xla/service/user_computation_test.cc
@@ -59,6 +59,9 @@ TEST_F(UserComputationTest, SimpleComputation) {
   param_request.set_name("param0");
   TF_ASSIGN_OR_ASSERT_OK(ComputationDataHandle param_handle,
                          computation.AddParameterInstruction(param_request));
+  OpMetadata metadata;
+  metadata.set_op_name("meta");
+  TF_ASSERT_OK(computation.SetOpMetadata(param_handle, metadata));
 
   OutfeedRequest outfeed_request;
   *outfeed_request.mutable_operand() = constant_handle;
@@ -135,6 +138,8 @@ TEST_F(UserComputationTest, SimpleComputation) {
     // The root of the instruction should be the parameter instruction (not the
     // outfeed).
     EXPECT_THAT(hlo_computation->root_instruction(), op::Parameter());
+    EXPECT_EQ(hlo_computation->root_instruction()->metadata().op_name(),
+              "meta");
   }
 }
 
diff --git a/tensorflow/compiler/xla/service_interface.h b/tensorflow/compiler/xla/service_interface.h
index 2159386152b34e4f9b59ca14faa756e37551d724..c8851d2ca512450b4022e0f70d55399323b2fa08 100644
--- a/tensorflow/compiler/xla/service_interface.h
+++ b/tensorflow/compiler/xla/service_interface.h
@@ -21,7 +21,10 @@ limitations under the License.
 
 namespace xla {
 
-// Defines the interface for an XLA service.
+// Defines the interface for an XLA service on the client side. This service
+// helps abstract around the actual implementation of a service - the service
+// can be local (running in the same process), or remote - in which case an RPC
+// stub is used as the implementation.
 class ServiceInterface {
  public:
   ServiceInterface() {}
diff --git a/tensorflow/compiler/xla/shape_util.cc b/tensorflow/compiler/xla/shape_util.cc
index 57d91e4bfc1145faa25c9b5c57422c7653d4a163..2b32b78f0b7c39dbf16b61f17d98c81027d013b0 100644
--- a/tensorflow/compiler/xla/shape_util.cc
+++ b/tensorflow/compiler/xla/shape_util.cc
@@ -18,6 +18,7 @@ limitations under the License.
 #include <algorithm>
 #include <functional>
 #include <numeric>
+#include <utility>
 #include <vector>
 
 #include "tensorflow/compiler/xla/index_util.h"
@@ -28,6 +29,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/util.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/core/stringpiece.h"
+#include "tensorflow/core/lib/gtl/optional.h"
 #include "tensorflow/core/lib/strings/numbers.h"
 #include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/lib/strings/strcat.h"
@@ -675,7 +677,7 @@ namespace {
 // Helper for ForEachSubshape which visits the subshapes of the given shape in
 // DFS pre-order starting with the index.
 Status ForEachSubshapeHelper(const Shape& shape,
-                             const ShapeUtil::VisitorFunction func,
+                             const ShapeUtil::VisitorFunction& func,
                              ShapeIndex* index) {
   TF_RETURN_IF_ERROR(func(shape, *index));
   if (ShapeUtil::IsTuple(shape)) {
@@ -692,7 +694,7 @@ Status ForEachSubshapeHelper(const Shape& shape,
 // Helper for ForEachMutableSubshape which visits the subshapes of the given
 // shape in DFS pre-order starting with the index.
 Status ForEachMutableSubshapeHelper(
-    Shape* shape, const ShapeUtil::MutatingVisitorFunction func,
+    Shape* shape, const ShapeUtil::MutatingVisitorFunction& func,
     ShapeIndex* index) {
   TF_RETURN_IF_ERROR(func(shape, *index));
   if (ShapeUtil::IsTuple(*shape)) {
@@ -709,13 +711,13 @@ Status ForEachMutableSubshapeHelper(
 }  // namespace
 
 /* static */ Status ShapeUtil::ForEachSubshape(const Shape& shape,
-                                               VisitorFunction func) {
+                                               const VisitorFunction& func) {
   ShapeIndex index;
   return ForEachSubshapeHelper(shape, func, &index);
 }
 
 /* static */ Status ShapeUtil::ForEachMutableSubshape(
-    Shape* shape, MutatingVisitorFunction func) {
+    Shape* shape, const MutatingVisitorFunction& func) {
   ShapeIndex index;
   return ForEachMutableSubshapeHelper(shape, func, &index);
 }
@@ -728,9 +730,17 @@ Status ForEachMutableSubshapeHelper(
     new_shape.add_dimensions(dim);
   }
   if (shape.has_layout()) {
-    new_shape.mutable_layout()->clear_minor_to_major();
+    Layout* new_layout = new_shape.mutable_layout();
+    new_layout->clear_minor_to_major();
     for (auto index : Permute(permutation, shape.layout().minor_to_major())) {
-      new_shape.mutable_layout()->add_minor_to_major(index);
+      new_layout->add_minor_to_major(index);
+    }
+    if (shape.layout().padded_dimensions_size() > 0) {
+      new_layout->clear_padded_dimensions();
+      for (auto dim :
+           Permute(permutation, shape.layout().padded_dimensions())) {
+        new_layout->add_padded_dimensions(dim);
+      }
     }
   }
   return new_shape;
@@ -1013,6 +1023,144 @@ ShapeUtil::DimensionsUnmodifiedByReshape(const Shape& input_shape,
          check_input_unit_indices(output_shape, input_shape);
 }
 
+/* static */ tensorflow::gtl::optional<Shape> ShapeUtil::AlignLayouts(
+    const Shape& input_shape, const Shape& output_shape) {
+  int64 input_rank = ShapeUtil::Rank(input_shape);
+  int64 output_rank = ShapeUtil::Rank(output_shape);
+
+  // First, calculate an alignment of the dimensions. A consecutive sequence of
+  // input dimensions and output dimensions belong to the same alignment part if
+  // the products of their dimension bounds are the same. In the easiest case,
+  // an alignment part consists of one input dimension and one output dimension
+  // which both have the same dimension bound. An alignment part specifies which
+  // dimensions need to be kept together in a physical layout if we want a
+  // reshape to be a bitcast. The order of the alignment parts is defined by the
+  // physical layout of the input shape, so when we construct the layout for the
+  // output shape we just process the alignment parts in this order, and then
+  // layout the dimensions belonging to each part in descending (major to minor)
+  // order.
+
+  // Stores the input and output dimension numbers where each alignment part
+  // starts.
+  std::vector<std::pair<int64, int64>> alignment;
+  alignment.push_back({0, 0});
+
+  // Stores a mapping from the input dimension to the alignment part it belongs
+  // to.
+  std::vector<int64> dimension_to_alignment_index(input_rank);
+  int64 input_dimension_product = 1, output_dimension_product = 1;
+  for (int64 i = 0, j = 0; i < input_rank || j < output_rank;) {
+    // Check if we have reached the end of an alignment part.
+    if (input_dimension_product == output_dimension_product &&
+        input_dimension_product > 1) {
+      alignment.push_back({i, j});
+      input_dimension_product = output_dimension_product = 1;
+    }
+    if (input_dimension_product < output_dimension_product ||
+        j == output_rank) {
+      if (i == input_rank) {
+        return tensorflow::gtl::nullopt;
+      }
+      dimension_to_alignment_index[i] = alignment.size() - 1;
+      input_dimension_product *= input_shape.dimensions(i);
+      ++i;
+    } else {
+      output_dimension_product *= output_shape.dimensions(j);
+      ++j;
+    }
+  }
+  if (input_dimension_product != output_dimension_product) {
+    return tensorflow::gtl::nullopt;
+  }
+  // We also need to store an end element so that we know where the last
+  // alignment part ends.
+  alignment.push_back({input_rank, output_rank});
+
+  // Now check if the physical layout can potentially be aligned to the output
+  // shape by changing the physical layout of the output shape. We need to check
+  // that all dimension numbers that belong to the same alignment part appear
+  // consecutively, and are in descending order. However we can ignore any
+  // trivial dimension bounds of 1, because they can be placed anywhere.
+  auto input_dimension_numbers = input_shape.layout().minor_to_major();
+  std::vector<int64> output_layout;
+  output_layout.reserve(output_rank);
+  for (int64 i = 0; i < input_rank;) {
+    int64 current_dimension_number = input_dimension_numbers[i];
+
+    // Skip trivial dimensions with a bound of 1.
+    if (input_shape.dimensions(current_dimension_number) == 1) {
+      ++i;
+      continue;
+    }
+
+    // Calculate the number of non-trivial dimension bounds in the input shape
+    // belonging to the current alignment part.
+    const int64 current_alignment_index =
+        dimension_to_alignment_index[current_dimension_number];
+    // Because of the special end element that we added, we can be sure that
+    // 'current_alignment_index' is < alignment.size() - 1.
+    CHECK_LT(current_alignment_index, alignment.size() - 1);
+    int64 num_non_trivial_dimensions_in_alignment_part = 0;
+    for (int64 j = alignment[current_alignment_index].first;
+         j < alignment[current_alignment_index + 1].first; ++j) {
+      if (input_shape.dimensions(j) != 1) {
+        ++num_non_trivial_dimensions_in_alignment_part;
+      }
+    }
+
+    // Check that the following 'num_non_trivial_dimensions_in_alignment_part'
+    // dimension numbers (ignoring dimension numbers with dimension bound 1) are
+    // in descending order and belong to the current alignment part.
+    for (int64 j = 0; j < num_non_trivial_dimensions_in_alignment_part;
+         ++i, ++j) {
+      if (i == input_rank) {
+        return tensorflow::gtl::nullopt;
+      }
+      // Skip trivial dimensions with a bound of 1.
+      if (input_shape.dimensions(input_dimension_numbers[i]) == 1) {
+        --j;
+        continue;
+      }
+      // If the current dimension number belongs to a different alignment part,
+      // or the dimension numbers are not in descending order, we can return
+      // early.
+      if (dimension_to_alignment_index[input_dimension_numbers[i]] !=
+              current_alignment_index ||
+          input_dimension_numbers[i] > current_dimension_number) {
+        return tensorflow::gtl::nullopt;
+      }
+      current_dimension_number = input_dimension_numbers[i];
+    }
+
+    // The output dimension numbers that belong to the current alignment part
+    // need to appear in the same descending order as in the input. Again, we
+    // can skip dimensions with a bound of 1.
+    for (int64 j = alignment[current_alignment_index + 1].second - 1;
+         j >= alignment[current_alignment_index].second; --j) {
+      if (output_shape.dimensions(j) != 1) {
+        output_layout.push_back(j);
+      }
+    }
+  }
+  // Now add all the dimensions with dimension bound 1 at the end of
+  // 'output_layout'.
+  for (int64 i = 0; i < output_rank; ++i) {
+    if (output_shape.dimensions(i) == 1) {
+      output_layout.push_back(i);
+    }
+  }
+  CHECK_EQ(output_layout.size(), output_rank);
+  std::vector<int64> dimension_sizes;
+  for (int64 i = 0; i < output_rank; ++i) {
+    dimension_sizes.push_back(output_shape.dimensions(i));
+  }
+  Shape output_shape_with_layout = MakeShapeWithLayout(
+      output_shape.element_type(), AsInt64Slice(output_shape.dimensions()),
+      output_layout);
+  CHECK(ReshapeIsBitcast(input_shape, output_shape_with_layout));
+  return output_shape_with_layout;
+}
+
 /* static */ Shape ShapeUtil::DeleteDimension(int64 dim_to_delete,
                                               Shape shape) {
   shape.mutable_dimensions()->erase(shape.dimensions().begin() + dim_to_delete);
@@ -1047,4 +1195,31 @@ ShapeUtil::DimensionsUnmodifiedByReshape(const Shape& input_shape,
   return shape;
 }
 
+/* static */ void ShapeUtil::ForEachIndex(
+    const Shape& shape, tensorflow::gtl::ArraySlice<int64> base,
+    tensorflow::gtl::ArraySlice<int64> count,
+    tensorflow::gtl::ArraySlice<int64> incr,
+    const IndexVisitorFunction& visitor_function) {
+  DCHECK_EQ(Rank(shape), base.size());
+  DCHECK_EQ(incr.size(), base.size());
+  DCHECK_EQ(count.size(), base.size());
+  const Layout& layout = shape.layout();
+  int64 rank = layout.minor_to_major_size();
+  // Allows handling R0 arrays, such that the visitor function will be called
+  // once with the proper empty indexes.
+  int64 n = -1;
+  std::vector<int64> indexes(base.begin(), base.end());
+  while (n < rank && visitor_function(indexes)) {
+    // Increments dimensions in minor to major order.
+    for (n = 0; n < rank; ++n) {
+      int64 dim = layout.minor_to_major(n);
+      indexes[dim] += incr[dim];
+      if (indexes[dim] < base[dim] + count[dim]) {
+        break;
+      }
+      indexes[dim] = base[dim];
+    }
+  }
+}
+
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/shape_util.h b/tensorflow/compiler/xla/shape_util.h
index 68e138e6aca9d2cf157466eca1ea6960e3c448e8..aaf8e84cfecb89080d690c66acd4f8d50ee17d56 100644
--- a/tensorflow/compiler/xla/shape_util.h
+++ b/tensorflow/compiler/xla/shape_util.h
@@ -26,6 +26,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/types.h"
 #include "tensorflow/compiler/xla/xla_data.pb.h"
 #include "tensorflow/core/lib/gtl/array_slice.h"
+#include "tensorflow/core/lib/gtl/optional.h"
 #include "tensorflow/core/platform/macros.h"
 #include "tensorflow/core/platform/types.h"
 
@@ -299,13 +300,14 @@ class ShapeUtil {
   // pre-order starting with the entire shape (index {}).
   using VisitorFunction = std::function<Status(const Shape& /*subshape*/,
                                                const ShapeIndex& /*index*/)>;
-  static Status ForEachSubshape(const Shape& shape, VisitorFunction func);
+  static Status ForEachSubshape(const Shape& shape,
+                                const VisitorFunction& func);
 
   // Mutating variant of ForEachSubshape.
   using MutatingVisitorFunction =
       std::function<Status(Shape* /*subshape*/, const ShapeIndex& /*index*/)>;
   static Status ForEachMutableSubshape(Shape* shape,
-                                       MutatingVisitorFunction func);
+                                       const MutatingVisitorFunction& func);
 
   // Removes all degenerate dimensions (size one) from the given shape. The
   // stripped minor_to_major preserves the relative ordering of non-degenerate
@@ -377,6 +379,15 @@ class ShapeUtil {
   static bool ReshapeIsBitcast(const Shape& input_shape,
                                const Shape& output_shape);
 
+  // Find a physical layout for 'output_shape' such that
+  // ShapeUtil::ReshapeIsBitcast(input_shape, output_shape_with_layout) returns
+  // true (where 'output_shape_with_layout' is 'output_shape' with the found
+  // layout). The layout of 'input_shape' is kept fixed. Returns
+  // 'output_shape_with_layout' if such a layout can be found, and an error
+  // otherwise.
+  static tensorflow::gtl::optional<Shape> AlignLayouts(
+      const Shape& input_shape, const Shape& output_shape);
+
   // Returns a shape with the given dimension deleted.
   // For example:
   // • `DeleteDimension(1, T[m, n, k]) = T[m, k]`
@@ -390,6 +401,19 @@ class ShapeUtil {
   static Shape FilterDimensions(const std::function<bool(int64)>& p,
                                 Shape shape);
 
+  // Iterates through all the shape indexes, in minor to major order, starting
+  // from the base indexes, incrementing by the incr steps, up to count
+  // (index[i] < base[i] + count[i]), and calls the visitor_function with the
+  // current index.
+  // The visitor_function visitor function should return true if it wants to
+  // continue, or false otherwise.
+  using IndexVisitorFunction = std::function<bool(const std::vector<int64>&)>;
+  static void ForEachIndex(const Shape& shape,
+                           tensorflow::gtl::ArraySlice<int64> base,
+                           tensorflow::gtl::ArraySlice<int64> count,
+                           tensorflow::gtl::ArraySlice<int64> incr,
+                           const IndexVisitorFunction& visitor_function);
+
  private:
   // Validates all of the non-layout properties of the shape -- this is a helper
   // used by both the layout-optional and layout-required public method.
diff --git a/tensorflow/compiler/xla/shape_util_test.cc b/tensorflow/compiler/xla/shape_util_test.cc
index b0a4b0c9a71ae8564d80d41169a4b3ab6af82e79..73538b8b88ecf14c00854d3c31715af8189bc21d 100644
--- a/tensorflow/compiler/xla/shape_util_test.cc
+++ b/tensorflow/compiler/xla/shape_util_test.cc
@@ -20,10 +20,13 @@ limitations under the License.
 #include "tensorflow/compiler/xla/test_helpers.h"
 #include "tensorflow/compiler/xla/types.h"
 #include "tensorflow/compiler/xla/util.h"
+#include "tensorflow/compiler/xla/xla_data.pb.h"
 
 namespace xla {
 namespace {
 
+using ::testing::ElementsAre;
+
 TEST(ShapeUtilTest, GetDimensionHelperCanNegativeIndex) {
   Shape matrix = ShapeUtil::MakeShape(F32, {2, 3});
   EXPECT_EQ(3, ShapeUtil::GetDimension(matrix, -1));
@@ -446,21 +449,21 @@ TEST(ShapeUtilTest, InsertedOrDeleted1SizedDimensions) {
 TEST(ShapeUtilTest, DimensionsUnmodifiedByReshape_1x1x1x1_to_1x1x1) {
   // All output dimensions should be unmodified. One of the input dimensions is
   // modified because the input rank is larger by one.
-  EXPECT_EQ(3,
-            ShapeUtil::DimensionsUnmodifiedByReshape(
-                ShapeUtil::MakeShape(S32, {1, 1, 1, 1}),
-                ShapeUtil::MakeShape(S32, {1, 1, 1}))
-                .size());
+  EXPECT_THAT(ShapeUtil::DimensionsUnmodifiedByReshape(
+                  ShapeUtil::MakeShape(S32, {1, 1, 1, 1}),
+                  ShapeUtil::MakeShape(S32, {1, 1, 1})),
+              ElementsAre(std::make_pair(0, 0), std::make_pair(1, 1),
+                          std::make_pair(2, 2)));
 }
 
 TEST(ShapeUtilTest, DimensionsUnmodifiedByReshape_1x1x1_to_1x1x1x1) {
   // All input dimensions should be unmodified. One of the output dimensions is
   // modified because the output rank is larger by one.
-  EXPECT_EQ(3,
-            ShapeUtil::DimensionsUnmodifiedByReshape(
-                ShapeUtil::MakeShape(S32, {1, 1, 1}),
-                ShapeUtil::MakeShape(S32, {1, 1, 1, 1}))
-                .size());
+  EXPECT_THAT(ShapeUtil::DimensionsUnmodifiedByReshape(
+                  ShapeUtil::MakeShape(S32, {1, 1, 1}),
+                  ShapeUtil::MakeShape(S32, {1, 1, 1, 1})),
+              ElementsAre(std::make_pair(0, 0), std::make_pair(1, 1),
+                          std::make_pair(2, 2)));
 }
 
 TEST(ShapeUtilTest, DimensionsUnmodifiedByReshape_4x1x3x5x6x7_to_2x6x1x5x1x42) {
@@ -468,11 +471,10 @@ TEST(ShapeUtilTest, DimensionsUnmodifiedByReshape_4x1x3x5x6x7_to_2x6x1x5x1x42) {
   // 4, 1, 3, 5, 6, 7
   //          |
   // 2, 6, 1, 5, 1, 42
-  EXPECT_TRUE(
-      ContainersEqual(ShapeUtil::DimensionsUnmodifiedByReshape(
-                          ShapeUtil::MakeShape(S32, {4, 1, 3, 5, 6, 7}),
-                          ShapeUtil::MakeShape(S32, {2, 6, 1, 5, 1, 42})),
-                      std::vector<std::pair<int64, int64>>({{3, 3}})));
+  EXPECT_THAT(ShapeUtil::DimensionsUnmodifiedByReshape(
+                  ShapeUtil::MakeShape(S32, {4, 1, 3, 5, 6, 7}),
+                  ShapeUtil::MakeShape(S32, {2, 6, 1, 5, 1, 42})),
+              ElementsAre(std::make_pair(3, 3)));
 }
 
 TEST(ShapeUtilTest, ReshapeIsBitcast_3x4_6x2) {
@@ -521,5 +523,58 @@ TEST(AlgebraicSimplifierTest, ReshapeIsBitcast_3x2x2_6x2_Dim0IsMostMinor) {
       ShapeUtil::MakeShapeWithLayout(F32, {6, 2}, {0, 1})));
 }
 
+TEST(AlignmentTest, AlignLayoutsWithoutTrivialDimensions) {
+  Shape input = ShapeUtil::MakeShapeWithLayout(xla::F32, {3, 8, 5, 7, 11},
+                                               {3, 2, 1, 0, 4});
+  auto aligned_shape = ShapeUtil::AlignLayouts(
+      input, ShapeUtil::MakeShape(xla::F32, {4, 3, 2, 7, 5, 11}));
+  EXPECT_TRUE(aligned_shape);
+  EXPECT_THAT(aligned_shape.value().layout().minor_to_major(),
+              ElementsAre(4, 3, 2, 1, 0, 5));
+  EXPECT_TRUE(ShapeUtil::ReshapeIsBitcast(input, aligned_shape.value()));
+
+  aligned_shape = ShapeUtil::AlignLayouts(
+      input, ShapeUtil::MakeShape(xla::F32, {3, 2, 4, 35, 11}));
+  EXPECT_TRUE(aligned_shape);
+  EXPECT_THAT(aligned_shape.value().layout().minor_to_major(),
+              ElementsAre(3, 2, 1, 0, 4));
+  EXPECT_TRUE(ShapeUtil::ReshapeIsBitcast(input, aligned_shape.value()));
+}
+
+TEST(AlignmentTest, AlignLayoutsWithTrivialDimensions) {
+  Shape input =
+      ShapeUtil::MakeShapeWithLayout(xla::F32, {1, 3, 8, 1, 5, 7, 1, 11, 1, 1},
+                                     {5, 0, 4, 2, 1, 3, 6, 7, 9, 8});
+  auto aligned_shape = ShapeUtil::AlignLayouts(
+      input, ShapeUtil::MakeShape(xla::F32, {1, 4, 1, 3, 2, 7, 5, 11, 1}));
+  EXPECT_TRUE(aligned_shape);
+  EXPECT_THAT(aligned_shape.value().layout().minor_to_major(),
+              ElementsAre(6, 5, 4, 3, 1, 7, 0, 2, 8));
+  EXPECT_TRUE(ShapeUtil::ReshapeIsBitcast(input, aligned_shape.value()));
+}
+
+// A test case where the consecutive elements of the input shape belonging to
+// the same layout part are not in descending order.
+TEST(AlignmentTest, AlignLayoutsWithoutTrivialDimensionsWrongInputLayout) {
+  // Same physical layout as in AlignLayoutsWithoutTrivialDimensions, except
+  // that the first two dimension numbers are exchanged.
+  Shape input = ShapeUtil::MakeShapeWithLayout(xla::F32, {3, 8, 5, 7, 11},
+                                               {2, 3, 1, 0, 4});
+  auto aligned_shape = ShapeUtil::AlignLayouts(
+      input, ShapeUtil::MakeShape(xla::F32, {4, 3, 2, 7, 5, 11}));
+  EXPECT_FALSE(aligned_shape);
+}
+
+// A test case where the physical layout of the input shape does not place all
+// dimensions that belong to the same alignment part consecutively.
+TEST(AlignmentTest,
+     AlignLayoutsWithoutTrivialDimensionsNonConsecutiveAlignmentPart) {
+  Shape input = ShapeUtil::MakeShapeWithLayout(xla::F32, {3, 8, 5, 7, 11},
+                                               {3, 2, 1, 0, 4});
+  auto aligned_shape = ShapeUtil::AlignLayouts(
+      input, ShapeUtil::MakeShape(xla::F32, {4, 3, 2, 5, 77}));
+  EXPECT_FALSE(aligned_shape);
+}
+
 }  // namespace
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/status_macros_test.cc b/tensorflow/compiler/xla/status_macros_test.cc
index 5563159776d11fda83aef86efb2480952689ef9d..dead17cdfa1e9f19e0ecfbc071e74e159ae82b5f 100644
--- a/tensorflow/compiler/xla/status_macros_test.cc
+++ b/tensorflow/compiler/xla/status_macros_test.cc
@@ -73,7 +73,7 @@ Status ReturnStatusError() { return (tensorflow::errors::Internal("foobar")); }
 
 using StatusReturningFunction = std::function<Status()>;
 
-StatusOr<int> CallStatusReturningFunction(StatusReturningFunction func) {
+StatusOr<int> CallStatusReturningFunction(const StatusReturningFunction& func) {
   TF_RETURN_IF_ERROR(func());
   return 42;
 }
diff --git a/tensorflow/compiler/xla/tests/BUILD b/tensorflow/compiler/xla/tests/BUILD
index 12bc1e995611c244d830c6306725f6b34fdafd12..e0c2b9ab09c28a7b7a31917b9250bdca8016d1e0 100644
--- a/tensorflow/compiler/xla/tests/BUILD
+++ b/tensorflow/compiler/xla/tests/BUILD
@@ -200,11 +200,13 @@ cc_library(
         "//tensorflow/compiler/xla/service:device_memory_allocator",
         "//tensorflow/compiler/xla/service:local_service",
         "//tensorflow/compiler/xla/service:platform_util",
+        "//tensorflow/compiler/xla/service:pool",
         "//tensorflow/compiler/xla/service:shaped_buffer",
         "//tensorflow/compiler/xla/service:transfer_manager",
         "//tensorflow/compiler/xla/tests:client_library_test_base",
         "//tensorflow/core:lib",
         "//tensorflow/core:stream_executor_no_cuda",
+        "//third_party/eigen3",
     ],
 )
 
@@ -891,6 +893,7 @@ xla_test(
     name = "copy_test",
     srcs = ["copy_test.cc"],
     deps = [
+        ":client_library_test_base",
         "//tensorflow/compiler/xla:array2d",
         "//tensorflow/compiler/xla:literal_util",
         "//tensorflow/compiler/xla:util",
@@ -1206,12 +1209,12 @@ xla_test(
         "//tensorflow/compiler/xla:statusor",
         "//tensorflow/compiler/xla:test_helpers",
         "//tensorflow/compiler/xla:xla_data_proto",
+        "//tensorflow/compiler/xla/client:client_library",
         "//tensorflow/compiler/xla/client:computation",
         "//tensorflow/compiler/xla/client:computation_builder",
         "//tensorflow/compiler/xla/client:global_data",
-        "//tensorflow/compiler/xla/client:local_client",
         "//tensorflow/compiler/xla/legacy_flags:cpu_compiler_flags",
-        "//tensorflow/compiler/xla/tests:client_library_test_base",
+        "//tensorflow/compiler/xla/legacy_flags:hlo_pass_pipeline_flags",
         "//tensorflow/compiler/xla/tests:literal_test_util",
         "//tensorflow/compiler/xla/tests:test_utils",
         "//tensorflow/core:lib",
@@ -1361,6 +1364,7 @@ cc_test(
         "//tensorflow/compiler/xla/client:computation_builder",
         "//tensorflow/compiler/xla/service:computation_tracker",
         "//tensorflow/compiler/xla/service:local_service",
+        "//tensorflow/core:lib",
         "//tensorflow/core:test_main",
     ],
 )
diff --git a/tensorflow/compiler/xla/tests/array_elementwise_ops_test.cc b/tensorflow/compiler/xla/tests/array_elementwise_ops_test.cc
index e682f285e03b8b48cbb1aae34edd738fc723a944..2c748b6a7ee5bcd53fa89dbc9064eef8e5ee94a3 100644
--- a/tensorflow/compiler/xla/tests/array_elementwise_ops_test.cc
+++ b/tensorflow/compiler/xla/tests/array_elementwise_ops_test.cc
@@ -486,6 +486,18 @@ XLA_TEST_F(ArrayElementwiseOpTest, CompareEqZeroElementS32s) {
   ComputeAndCompareR1<bool>(&builder, {}, {});
 }
 
+TEST_F(ArrayElementwiseOpTest, CompareNeF32s) {
+  // Disable fast-math because we're operating on NaNs.
+  SetFastMathDisabled(true);
+
+  ComputationBuilder builder(client_, TestName());
+  auto lhs = builder.ConstantR1<float>({-2.5f, 25.5f, 2.25f, NAN, 6.0f});
+  auto rhs = builder.ConstantR1<float>({10.0f, 25.5f, 1.0f, 10.0f, NAN});
+  auto compare = builder.Ne(lhs, rhs);
+
+  ComputeAndCompareR1<bool>(&builder, {true, false, true, true, true}, {});
+}
+
 TEST_F(ArrayElementwiseOpTest, CompareNeS32s) {
   const int32 min = std::numeric_limits<int32>::min();
   const int32 max = std::numeric_limits<int32>::max();
diff --git a/tensorflow/compiler/xla/tests/broadcast_simple_test.cc b/tensorflow/compiler/xla/tests/broadcast_simple_test.cc
index c8030aade8c7c4d96658045f996801380289f2bf..0ad1cf3e8cfa69b07db18a80be093e44144b953c 100644
--- a/tensorflow/compiler/xla/tests/broadcast_simple_test.cc
+++ b/tensorflow/compiler/xla/tests/broadcast_simple_test.cc
@@ -127,6 +127,251 @@ XLA_TEST_F(BroadcastSimpleTest, InDimensionAndDegenerateBroadcasting) {
   ComputeAndCompareLiteral(&b, *expected, {}, ErrorSpec(0.0001));
 }
 
+struct R3ImplicitBroadcastSpec {
+  std::array<int64, 3> output_bounds;
+  std::array<int64, 3> minor2major_layout;
+  std::array<int64, 3> input_bounds;
+  HloOpcode op;
+} kR3ImplicitBroadcastTestCases[] = {
+    {{{1, 1, 1}}, {{2, 1, 0}}, {{1, 1, 1}}, HloOpcode::kAdd},
+    {{{3, 4, 5}}, {{2, 1, 0}}, {{1, 1, 5}}, HloOpcode::kMaximum},
+    {{{3, 4, 5}}, {{2, 1, 0}}, {{1, 4, 1}}, HloOpcode::kMinimum},
+    {{{3, 4, 5}}, {{2, 1, 0}}, {{3, 1, 1}}, HloOpcode::kMultiply},
+    {{{3, 4, 5}}, {{2, 1, 0}}, {{1, 1, 1}}, HloOpcode::kAdd},
+    {{{3, 4, 5}}, {{2, 1, 0}}, {{1, 4, 5}}, HloOpcode::kAdd},
+    {{{3, 4, 5}}, {{2, 1, 0}}, {{3, 4, 1}}, HloOpcode::kAdd},
+    {{{3, 4, 5}}, {{2, 1, 0}}, {{3, 1, 5}}, HloOpcode::kAdd},
+    {{{3, 199, 5}}, {{2, 1, 0}}, {{1, 199, 1}}, HloOpcode::kMinimum},
+    {{{3, 4, 199}}, {{2, 1, 0}}, {{1, 1, 199}}, HloOpcode::kAdd},
+};
+
+class BroadcastR3ImplicitTest
+    : public BroadcastSimpleTest,
+      public ::testing::WithParamInterface<R3ImplicitBroadcastSpec> {};
+
+XLA_TEST_P(BroadcastR3ImplicitTest, Doit) {
+  const R3ImplicitBroadcastSpec& spec = GetParam();
+  ComputationBuilder builder(client_, TestName());
+  const Shape r3_shape = ShapeUtil::MakeShapeWithLayout(
+      F32, spec.output_bounds, spec.minor2major_layout);
+  Array3D<float> r3_array(spec.output_bounds[0], spec.output_bounds[1],
+                          spec.output_bounds[2]);
+  r3_array.FillRandom(1.0, 2.5, 56789);
+  auto r3_input =
+      LiteralUtil::Relayout(*LiteralUtil::CreateR3FromArray3D(r3_array),
+                            LayoutUtil::MakeLayout(spec.minor2major_layout));
+  std::unique_ptr<GlobalData> r3_global_data =
+      client_->TransferToServer(*r3_input).ConsumeValueOrDie();
+
+  const Shape r3_implicit_shape = ShapeUtil::MakeShapeWithLayout(
+      F32, spec.input_bounds, spec.minor2major_layout);
+  Array3D<float> r3_implicit_array(spec.input_bounds[0], spec.input_bounds[1],
+                                   spec.input_bounds[2]);
+  r3_implicit_array.FillRandom(1.0, 0.2, 56789);
+  auto r3_implicit_input = LiteralUtil::Relayout(
+      *LiteralUtil::CreateR3FromArray3D(r3_implicit_array),
+      LayoutUtil::MakeLayout(spec.minor2major_layout));
+  std::unique_ptr<GlobalData> r3_implicit_global_data =
+      client_->TransferToServer(*r3_implicit_input).ConsumeValueOrDie();
+
+  auto r3_implicit_parameter = builder.Parameter(0, r3_implicit_shape, "input");
+  auto r3_parameter = builder.Parameter(1, r3_shape, "input");
+  ComputationDataHandle op;
+  switch (spec.op) {
+    case HloOpcode::kMinimum: {
+      auto tmp_op = builder.Min(r3_implicit_parameter, r3_parameter);
+      op.Swap(&tmp_op);
+      break;
+    }
+    case HloOpcode::kMaximum: {
+      auto tmp_op = builder.Max(r3_implicit_parameter, r3_parameter);
+      op.Swap(&tmp_op);
+      break;
+    }
+    case HloOpcode::kMultiply: {
+      auto tmp_op = builder.Mul(r3_implicit_parameter, r3_parameter);
+      op.Swap(&tmp_op);
+      break;
+    }
+    default: {
+      // Default to Add
+      auto tmp_op = builder.Add(r3_implicit_parameter, r3_parameter);
+      op.Swap(&tmp_op);
+    }
+  }
+
+  Array3D<float> expected_array(spec.output_bounds[0], spec.output_bounds[1],
+                                spec.output_bounds[2]);
+  auto Each = ([&](tensorflow::gtl::ArraySlice<int64> indices, float* value) {
+    float r3_implicit = r3_implicit_array(indices[0] % spec.input_bounds[0],
+                                          indices[1] % spec.input_bounds[1],
+                                          indices[2] % spec.input_bounds[2]);
+    float r3 = r3_array(indices[0], indices[1], indices[2]);
+    switch (spec.op) {
+      case HloOpcode::kMinimum: {
+        *value = std::min(r3_implicit, r3);
+        break;
+      }
+      case HloOpcode::kMaximum: {
+        *value = std::max(r3_implicit, r3);
+        break;
+      }
+      case HloOpcode::kMultiply: {
+        *value = r3_implicit * r3;
+        break;
+      }
+      default: {
+        // Default to Add
+        *value = r3_implicit + r3;
+        break;
+      }
+    }
+  });
+
+  int n1 = expected_array.n1();
+  int n2 = expected_array.n2();
+  int n3 = expected_array.n3();
+  for (int64 i = 0; i < n1; i++) {
+    for (int64 j = 0; j < n2; j++) {
+      for (int64 k = 0; k < n3; k++) {
+        Each({i, j, k}, &expected_array(i, j, k));
+      }
+    }
+  }
+  auto expected = LiteralUtil::CreateR3FromArray3D(expected_array);
+  ComputeAndCompareLiteral(
+      &builder, *expected,
+      {r3_implicit_global_data.get(), r3_global_data.get()},
+      ErrorSpec(1e-7, 1e-7));
+}
+
+INSTANTIATE_TEST_CASE_P(BroadcastR3ImplicitTestInstances,
+                        BroadcastR3ImplicitTest,
+                        ::testing::ValuesIn(kR3ImplicitBroadcastTestCases));
+
+// r1 and r3's dim0 matches, and r1's dim1 and dim2 have size 1:
+XLA_TEST_F(BroadcastSimpleTest, Add3DTo3DDegenerate_1_2) {
+  ComputationBuilder b(client_, TestName());
+  ComputationDataHandle r1h;
+  ComputationDataHandle r3h;
+
+  Array3D<float> r1d = {{{1}}, {{2}}};
+  Array3D<float> r3d = {{{1, 2}, {3, 4}}, {{5, 6}, {7, 8}}};
+  auto r1 = CreateR3Parameter(r1d, 1, "r1", &b, &r1h);
+  auto r3 = CreateR3Parameter(r3d, 0, "r3", &b, &r3h);
+
+  b.Add(r3h, r1h);
+
+  auto expected =
+      LiteralUtil::CreateR3<float>({{{2, 3}, {4, 5}}, {{7, 8}, {9, 10}}});
+
+  ComputeAndCompareLiteral(&b, *expected, {r3.get(), r1.get()},
+                           ErrorSpec(0.0001));
+}
+
+XLA_TEST_F(BroadcastSimpleTest, Add3DTo3DDegenerate_0_1) {
+  ComputationBuilder b(client_, TestName());
+  auto r1 = b.ConstantLiteral(*LiteralUtil::CreateR3<float>({{{1, 2}}}));
+  auto r3 = b.ConstantLiteral(
+      *LiteralUtil::CreateR3<float>({{{1, 2}, {3, 4}}, {{5, 6}, {7, 8}}}));
+  b.Add(r3, r1);
+
+  auto expected =
+      LiteralUtil::CreateR3<float>({{{2, 4}, {4, 6}}, {{6, 8}, {8, 10}}});
+
+  ComputeAndCompareLiteral(&b, *expected, {}, ErrorSpec(0.0001));
+}
+
+XLA_TEST_F(BroadcastSimpleTest, Add3DTo3DDegenerate_0_2) {
+  ComputationBuilder b(client_, TestName());
+  auto r1 = b.ConstantLiteral(*LiteralUtil::CreateR3<float>({{{1}, {2}}}));
+  auto r3 = b.ConstantLiteral(
+      *LiteralUtil::CreateR3<float>({{{1, 2}, {3, 4}}, {{5, 6}, {7, 8}}}));
+  b.Add(r3, r1);
+
+  auto expected =
+      LiteralUtil::CreateR3<float>({{{2, 3}, {5, 6}}, {{6, 7}, {9, 10}}});
+
+  ComputeAndCompareLiteral(&b, *expected, {}, ErrorSpec(0.0001));
+}
+
+XLA_TEST_F(BroadcastSimpleTest, Add3DTo3DDegenerate_0) {
+  ComputationBuilder b(client_, TestName());
+  auto r1 =
+      b.ConstantLiteral(*LiteralUtil::CreateR3<float>({{{1, 2}, {3, 4}}}));
+  auto r3 = b.ConstantLiteral(
+      *LiteralUtil::CreateR3<float>({{{1, 2}, {3, 4}}, {{5, 6}, {7, 8}}}));
+  b.Add(r3, r1);
+
+  auto expected =
+      LiteralUtil::CreateR3<float>({{{2, 4}, {6, 8}}, {{6, 8}, {10, 12}}});
+
+  ComputeAndCompareLiteral(&b, *expected, {}, ErrorSpec(0.0001));
+}
+
+XLA_TEST_F(BroadcastSimpleTest, Add3DTo3DDegenerate_1) {
+  ComputationBuilder b(client_, TestName());
+  auto r1 =
+      b.ConstantLiteral(*LiteralUtil::CreateR3<float>({{{1, 2}}, {{3, 4}}}));
+  auto r3 = b.ConstantLiteral(
+      *LiteralUtil::CreateR3<float>({{{1, 2}, {3, 4}}, {{5, 6}, {7, 8}}}));
+  b.Add(r3, r1);
+
+  auto expected =
+      LiteralUtil::CreateR3<float>({{{2, 4}, {4, 6}}, {{8, 10}, {10, 12}}});
+
+  ComputeAndCompareLiteral(&b, *expected, {}, ErrorSpec(0.0001));
+}
+
+XLA_TEST_F(BroadcastSimpleTest, Add3DTo3DDegenerate_2) {
+  ComputationBuilder b(client_, TestName());
+  auto r1 = b.ConstantLiteral(
+      *LiteralUtil::CreateR3<float>({{{1}, {2}}, {{3}, {4}}}));
+  auto r3 = b.ConstantLiteral(
+      *LiteralUtil::CreateR3<float>({{{1, 2}, {3, 4}}, {{5, 6}, {7, 8}}}));
+  b.Add(r3, r1);
+
+  auto expected =
+      LiteralUtil::CreateR3<float>({{{2, 3}, {5, 6}}, {{8, 9}, {11, 12}}});
+
+  ComputeAndCompareLiteral(&b, *expected, {}, ErrorSpec(0.0001));
+}
+
+XLA_TEST_F(BroadcastSimpleTest, Add3DTo3DDegenerate_0_1_2) {
+  ComputationBuilder b(client_, TestName());
+  auto r1 = b.ConstantLiteral(*LiteralUtil::CreateR3<float>({{{1}}}));
+  auto r3 = b.ConstantLiteral(
+      *LiteralUtil::CreateR3<float>({{{1, 2}, {3, 4}}, {{5, 6}, {7, 8}}}));
+  b.Add(r3, r1);
+
+  auto expected =
+      LiteralUtil::CreateR3<float>({{{2, 3}, {4, 5}}, {{6, 7}, {8, 9}}});
+
+  ComputeAndCompareLiteral(&b, *expected, {}, ErrorSpec(0.0001));
+}
+
+XLA_TEST_F(BroadcastSimpleTest, Add2DTo2DDegenerate_0) {
+  ComputationBuilder b(client_, TestName());
+  auto r1 = b.ConstantLiteral(*LiteralUtil::CreateR2<float>({{1, 2}}));
+  auto r2 = b.ConstantLiteral(*LiteralUtil::CreateR2<float>({{1, 2}, {3, 4}}));
+  b.Add(r2, r1);
+
+  auto expected = LiteralUtil::CreateR2<float>({{2, 4}, {4, 6}});
+
+  ComputeAndCompareLiteral(&b, *expected, {}, ErrorSpec(0.0001));
+}
+
+XLA_TEST_F(BroadcastSimpleTest, Add2DTo2DDegenerate_1) {
+  ComputationBuilder b(client_, TestName());
+  auto r1 = b.ConstantLiteral(*LiteralUtil::CreateR2<float>({{1}, {2}}));
+  auto r2 = b.ConstantLiteral(*LiteralUtil::CreateR2<float>({{1, 2}, {3, 4}}));
+  b.Add(r2, r1);
+
+  auto expected = LiteralUtil::CreateR2<float>({{2, 3}, {5, 6}});
+
+  ComputeAndCompareLiteral(&b, *expected, {}, ErrorSpec(0.0001));
+}
+
 XLA_TEST_F(BroadcastSimpleTest, Add1DTo3DInDim0) {
   ComputationBuilder b(client_, TestName());
   auto r1 = b.ConstantR1<float>({10, 20});
diff --git a/tensorflow/compiler/xla/tests/compute_constant_test.cc b/tensorflow/compiler/xla/tests/compute_constant_test.cc
index 4170e0f4e2942bc71ddfa3d0f3a9d86ce2ecc823..1d998fe33ebf71a2b35f99a51038e874edacc046 100644
--- a/tensorflow/compiler/xla/tests/compute_constant_test.cc
+++ b/tensorflow/compiler/xla/tests/compute_constant_test.cc
@@ -17,18 +17,18 @@ limitations under the License.
 #include <utility>
 #include <vector>
 
+#include "tensorflow/compiler/xla/client/client_library.h"
 #include "tensorflow/compiler/xla/client/computation.h"
 #include "tensorflow/compiler/xla/client/computation_builder.h"
 #include "tensorflow/compiler/xla/client/global_data.h"
-#include "tensorflow/compiler/xla/client/local_client.h"
 #include "tensorflow/compiler/xla/layout_util.h"
 #include "tensorflow/compiler/xla/legacy_flags/cpu_compiler_flags.h"
+#include "tensorflow/compiler/xla/legacy_flags/hlo_pass_pipeline_flags.h"
 #include "tensorflow/compiler/xla/literal_util.h"
 #include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/status_macros.h"
 #include "tensorflow/compiler/xla/statusor.h"
 #include "tensorflow/compiler/xla/test.h"
-#include "tensorflow/compiler/xla/tests/client_library_test_base.h"
 #include "tensorflow/compiler/xla/tests/literal_test_util.h"
 #include "tensorflow/compiler/xla/tests/test_macros.h"
 #include "tensorflow/compiler/xla/tests/test_utils.h"
@@ -38,21 +38,60 @@ limitations under the License.
 namespace xla {
 namespace {
 
-class ComputeConstantTest : public ClientLibraryTestBase {
+// An enumerator for the client types that we want to iterate over in
+// the various tests.
+enum class ClientType { kLocal, kCompileOnly };
+ClientType client_types[] = {ClientType::kLocal, ClientType::kCompileOnly};
+
+class ComputeConstantTest : public ::testing::Test {
  public:
+  explicit ComputeConstantTest(
+      perftools::gputools::Platform* platform = nullptr,
+      tensorflow::gtl::ArraySlice<string> disabled_pass_names = {})
+      : platform_(platform) {
+    legacy_flags::HloPassPipelineFlags* flags =
+        legacy_flags::GetHloPassPipelineFlags();
+    flags->xla_disable_hlo_passes =
+        tensorflow::str_util::Join(disabled_pass_names, ",");
+  }
+
+  string TestName() const {
+    return ::testing::UnitTest::GetInstance()->current_test_info()->name();
+  }
+
+  Client* ClientOrDie(::perftools::gputools::Platform* platform,
+                      ClientType client_type) {
+    if (client_type == ClientType::kLocal) {
+      StatusOr<Client*> result =
+          ClientLibrary::GetOrCreateLocalClient(platform);
+      TF_CHECK_OK(result.status())
+          << "could not create LocalClient for testing";
+      return result.ValueOrDie();
+    } else if (client_type == ClientType::kCompileOnly) {
+      StatusOr<Client*> result =
+          ClientLibrary::GetOrCreateCompileOnlyClient(platform);
+      TF_CHECK_OK(result.status())
+          << "could not create CompileOnlyClient for testing";
+      return result.ValueOrDie();
+    }
+    LOG(FATAL) << "invalid client_type value";
+  }
+
   StatusOr<std::unique_ptr<Literal>> ComputeConstantLiteral(
-      ComputationDataHandle operand, ComputationBuilder* builder,
-      Layout* output_layout = nullptr) {
+      Client* client, const ComputationDataHandle& operand,
+      ComputationBuilder* builder, Layout* output_layout = nullptr) {
     TF_ASSIGN_OR_RETURN(auto remote_computed,
                         builder->ComputeConstant(operand, output_layout));
-    TF_ASSIGN_OR_RETURN(auto computed, client_->Transfer(*remote_computed));
+    TF_ASSIGN_OR_RETURN(auto computed, client->Transfer(*remote_computed));
     return std::move(computed);
   }
 
   template <class Scalar>
-  StatusOr<Scalar> ComputeConstantScalar(ComputationDataHandle operand,
+  StatusOr<Scalar> ComputeConstantScalar(Client* client,
+                                         const ComputationDataHandle& operand,
                                          ComputationBuilder* builder) {
-    TF_ASSIGN_OR_RETURN(auto literal, ComputeConstantLiteral(operand, builder));
+    TF_ASSIGN_OR_RETURN(auto literal,
+                        ComputeConstantLiteral(client, operand, builder));
     return LiteralUtil::Get<Scalar>(*literal, {});
   }
 
@@ -63,163 +102,188 @@ class ComputeConstantTest : public ClientLibraryTestBase {
     return result.ok() ? result.ValueOrDie() : false;
   }
 
-  template <class Scalar>
-  void ExpectConstantComputedScalar(ComputationDataHandle operand,
-                                    Scalar expected,
-                                    ComputationBuilder* builder) {
-    Scalar computed = ComputeConstantScalar<Scalar>(operand, builder);
-    ASSERT_TRUE(computed.ok()) << computed.status();
-    std::unique_ptr<Literal> expected_literal = LiteralUtil::CreateR0(expected);
-    LiteralTestUtil::ExpectEqual(*expected_literal, *computed);
-  }
+  perftools::gputools::Platform* platform_;
 };
 
 TEST_F(ComputeConstantTest, ScalarInt32Literal) {
-  ComputationBuilder b(client_, TestName());
-  auto computation = b.ConstantR0<int32>(42);
-  EXPECT_TRUE(IsConstant(computation, &b));
-
-  auto value = ComputeConstantScalar<int32>(computation, &b);
-  ASSERT_TRUE(value.ok()) << value.status();
-  EXPECT_EQ(value.ValueOrDie(), 42);
+  for (ClientType client_type : client_types) {
+    Client* client = ClientOrDie(platform_, client_type);
+    ComputationBuilder b(client, TestName());
+    auto computation = b.ConstantR0<int32>(42);
+    EXPECT_TRUE(IsConstant(computation, &b));
+
+    auto value = ComputeConstantScalar<int32>(client, computation, &b);
+    ASSERT_TRUE(value.ok()) << value.status();
+    EXPECT_EQ(value.ValueOrDie(), 42);
+  }
 }
 
 TEST_F(ComputeConstantTest, ScalarFloatAdd) {
-  ComputationBuilder b(client_, TestName());
-  auto computation =
-      b.Add(b.ConstantR0<float>(42.5f), b.ConstantR0<float>(1.5f));
-  EXPECT_TRUE(IsConstant(computation, &b));
-
-  auto value = ComputeConstantScalar<float>(computation, &b);
-  ASSERT_TRUE(value.ok()) << value.status();
-  EXPECT_EQ(value.ValueOrDie(), 44.0f);
+  for (ClientType client_type : client_types) {
+    Client* client = ClientOrDie(platform_, client_type);
+    ComputationBuilder b(client, TestName());
+    auto computation =
+        b.Add(b.ConstantR0<float>(42.5f), b.ConstantR0<float>(1.5f));
+    EXPECT_TRUE(IsConstant(computation, &b));
+
+    auto value = ComputeConstantScalar<float>(client, computation, &b);
+    ASSERT_TRUE(value.ok()) << value.status();
+    EXPECT_EQ(value.ValueOrDie(), 44.0f);
+  }
 }
 
 TEST_F(ComputeConstantTest, ScalarRng) {
-  ComputationBuilder b(client_, TestName());
-  auto computation =
-      b.RngUniform(b.ConstantR0<float>(1.1f), b.ConstantR0<float>(2.1f),
-                   ShapeUtil::MakeShape(F32, {}));
-  EXPECT_FALSE(IsConstant(computation, &b));
-
-  auto value = ComputeConstantScalar<float>(computation, &b);
-  ASSERT_FALSE(value.ok())
-      << "computing a RNG value should not be considered a constant";
+  for (ClientType client_type : client_types) {
+    Client* client = ClientOrDie(platform_, client_type);
+    ComputationBuilder b(client, TestName());
+    auto computation =
+        b.RngUniform(b.ConstantR0<float>(1.1f), b.ConstantR0<float>(2.1f),
+                     ShapeUtil::MakeShape(F32, {}));
+    EXPECT_FALSE(IsConstant(computation, &b));
+
+    auto value = ComputeConstantScalar<float>(client, computation, &b);
+    ASSERT_FALSE(value.ok())
+        << "computing a RNG value should not be considered a constant";
+  }
 }
 
 TEST_F(ComputeConstantTest, DirectParam) {
-  ComputationBuilder b(client_, TestName());
-  auto computation = b.Parameter(0, ShapeUtil::MakeShape(F32, {}), "param");
-  EXPECT_FALSE(IsConstant(computation, &b));
-
-  auto value = ComputeConstantScalar<float>(computation, &b);
-  EXPECT_TRUE(tensorflow::StringPiece(value.status().ToString())
-                  .contains("depends on parameter"))
-      << value.status();
+  for (ClientType client_type : client_types) {
+    Client* client = ClientOrDie(platform_, client_type);
+    ComputationBuilder b(client, TestName());
+    auto computation = b.Parameter(0, ShapeUtil::MakeShape(F32, {}), "param");
+    EXPECT_FALSE(IsConstant(computation, &b));
+
+    auto value = ComputeConstantScalar<float>(client, computation, &b);
+    EXPECT_TRUE(tensorflow::StringPiece(value.status().ToString())
+                    .contains("depends on parameter"))
+        << value.status();
+  }
 }
 
 TEST_F(ComputeConstantTest, IndirectParam) {
-  ComputationBuilder b(client_, TestName());
-  auto computation =
-      b.Add(b.ConstantR0<float>(1.0f),
-            b.Parameter(0, ShapeUtil::MakeShape(F32, {}), "param"));
-  EXPECT_FALSE(IsConstant(computation, &b));
-
-  auto value = ComputeConstantScalar<float>(computation, &b);
-  EXPECT_TRUE(tensorflow::StringPiece(value.status().ToString())
-                  .contains("depends on parameter"))
-      << value.status();
+  for (ClientType client_type : client_types) {
+    Client* client = ClientOrDie(platform_, client_type);
+    ComputationBuilder b(client, TestName());
+    auto computation =
+        b.Add(b.ConstantR0<float>(1.0f),
+              b.Parameter(0, ShapeUtil::MakeShape(F32, {}), "param"));
+    EXPECT_FALSE(IsConstant(computation, &b));
+
+    auto value = ComputeConstantScalar<float>(client, computation, &b);
+    EXPECT_TRUE(tensorflow::StringPiece(value.status().ToString())
+                    .contains("depends on parameter"))
+        << value.status();
+  }
 }
 
 // Test computation of an expression interspersed with param nodes but
 // the expression does not depend on the param nodes.
 TEST_F(ComputeConstantTest, UnrelatedParam) {
-  ComputationBuilder b(client_, TestName());
+  for (ClientType client_type : client_types) {
+    Client* client = ClientOrDie(platform_, client_type);
+    ComputationBuilder b(client, TestName());
 
-  auto param_a = b.Parameter(10, ShapeUtil::MakeShape(F32, {}), "param0");
-  auto constant_4 = b.Add(b.ConstantR0<float>(2.5f), b.ConstantR0<float>(1.5f));
-  auto not_constant_a = b.Add(constant_4, param_a);
+    auto param_a = b.Parameter(10, ShapeUtil::MakeShape(F32, {}), "param0");
+    auto constant_4 =
+        b.Add(b.ConstantR0<float>(2.5f), b.ConstantR0<float>(1.5f));
+    auto not_constant_a = b.Add(constant_4, param_a);
 
-  auto param_b = b.Parameter(1, ShapeUtil::MakeShape(F32, {}), "param1");
-  auto constant_9 = b.Mul(b.ConstantR0<float>(2.0f), b.ConstantR0<float>(4.5f));
-  auto not_constant_b = b.Add(param_b, constant_9);
+    auto param_b = b.Parameter(1, ShapeUtil::MakeShape(F32, {}), "param1");
+    auto constant_9 =
+        b.Mul(b.ConstantR0<float>(2.0f), b.ConstantR0<float>(4.5f));
+    auto not_constant_b = b.Add(param_b, constant_9);
 
-  auto constant_13 = b.Add(constant_4, constant_9);
-  b.Add(not_constant_b, b.Add(constant_13, not_constant_a));
+    auto constant_13 = b.Add(constant_4, constant_9);
+    b.Add(not_constant_b, b.Add(constant_13, not_constant_a));
 
-  EXPECT_TRUE(IsConstant(constant_13, &b));
+    EXPECT_TRUE(IsConstant(constant_13, &b));
 
-  auto value = ComputeConstantScalar<float>(constant_13, &b);
-  ASSERT_TRUE(value.ok()) << value.status();
-  EXPECT_EQ(value.ValueOrDie(), 13.0f);
+    auto value = ComputeConstantScalar<float>(client, constant_13, &b);
+    ASSERT_TRUE(value.ok()) << value.status();
+    EXPECT_EQ(value.ValueOrDie(), 13.0f);
+  }
 }
 
 TEST_F(ComputeConstantTest, NonScalarAdd) {
-  ComputationBuilder b(client_, TestName());
+  for (ClientType client_type : client_types) {
+    Client* client = ClientOrDie(platform_, client_type);
+    ComputationBuilder b(client, TestName());
 
-  auto computation =
-      b.Add(b.ConstantR1<int32>({1, 2}), b.ConstantR1<int32>({3, 4}));
-  EXPECT_TRUE(IsConstant(computation, &b));
+    auto computation =
+        b.Add(b.ConstantR1<int32>({1, 2}), b.ConstantR1<int32>({3, 4}));
+    EXPECT_TRUE(IsConstant(computation, &b));
 
-  auto computed = ComputeConstantLiteral(computation, &b);
-  ASSERT_TRUE(computed.ok()) << computed.status();
-  std::unique_ptr<Literal> expected_literal =
-      LiteralUtil::CreateR1<int32>({4, 6});
-  LiteralTestUtil::ExpectEqual(*expected_literal, *computed.ValueOrDie());
+    auto computed = ComputeConstantLiteral(client, computation, &b);
+    ASSERT_TRUE(computed.ok()) << computed.status();
+    std::unique_ptr<Literal> expected_literal =
+        LiteralUtil::CreateR1<int32>({4, 6});
+    LiteralTestUtil::ExpectEqual(*expected_literal, *computed.ValueOrDie());
+  }
 }
 
 TEST_F(ComputeConstantTest, IntegerDivide) {
-  ComputationBuilder b(client_, TestName());
-  auto computation = b.Div(b.ConstantR0<int32>(15), b.ConstantR0<int32>(3));
-  EXPECT_TRUE(IsConstant(computation, &b));
-
-  auto computed = ComputeConstantLiteral(computation, &b);
-  ASSERT_TRUE(computed.ok()) << computed.status();
-  std::unique_ptr<Literal> expected_literal = LiteralUtil::CreateR0<int32>(5);
-  LiteralTestUtil::ExpectEqual(*expected_literal, *computed.ValueOrDie());
-}
+  for (ClientType client_type : client_types) {
+    Client* client = ClientOrDie(platform_, client_type);
+    ComputationBuilder b(client, TestName());
+    auto computation = b.Div(b.ConstantR0<int32>(15), b.ConstantR0<int32>(3));
+    EXPECT_TRUE(IsConstant(computation, &b));
 
-XLA_TEST_F(ComputeConstantTest, Layout) {
-  ComputationBuilder b(client_, TestName());
-
-  std::vector<std::vector<int64>> layouts = {{0, 1}, {1, 0}};
-  for (const std::vector<int64>& layout : layouts) {
-    auto layout_proto = LayoutUtil::MakeLayout(layout);
-    auto computed =
-        ComputeConstantLiteral(b.Add(b.ConstantR2<int32>({{1, 2}, {3, 4}}),
-                                     b.ConstantR2<int32>({{10, 20}, {30, 40}})),
-                               &b, &layout_proto);
+    auto computed = ComputeConstantLiteral(client, computation, &b);
     ASSERT_TRUE(computed.ok()) << computed.status();
-
-    std::unique_ptr<Literal> expected_literal =
-        test_utils::CreateR2LiteralWithLayout<int32>({{11, 22}, {33, 44}},
-                                                     layout);
-    LiteralTestUtil::AssertEqualShapesAndLayouts(
-        expected_literal->shape(), computed.ValueOrDie()->shape());
+    std::unique_ptr<Literal> expected_literal = LiteralUtil::CreateR0<int32>(5);
     LiteralTestUtil::ExpectEqual(*expected_literal, *computed.ValueOrDie());
   }
 }
 
+XLA_TEST_F(ComputeConstantTest, Layout) {
+  for (ClientType client_type : client_types) {
+    Client* client = ClientOrDie(platform_, client_type);
+    ComputationBuilder b(client, TestName());
+
+    std::vector<std::vector<int64>> layouts = {{0, 1}, {1, 0}};
+    for (const std::vector<int64>& layout : layouts) {
+      auto layout_proto = LayoutUtil::MakeLayout(layout);
+      auto computed = ComputeConstantLiteral(
+          client,
+          b.Add(b.ConstantR2<int32>({{1, 2}, {3, 4}}),
+                b.ConstantR2<int32>({{10, 20}, {30, 40}})),
+          &b, &layout_proto);
+      ASSERT_TRUE(computed.ok()) << computed.status();
+
+      std::unique_ptr<Literal> expected_literal =
+          test_utils::CreateR2LiteralWithLayout<int32>({{11, 22}, {33, 44}},
+                                                       layout);
+      LiteralTestUtil::AssertEqualShapesAndLayouts(
+          expected_literal->shape(), computed.ValueOrDie()->shape());
+      LiteralTestUtil::ExpectEqual(*expected_literal, *computed.ValueOrDie());
+    }
+  }
+}
+
 // This test is permanently disabled on CPU because it requires that the
 // backend used for execution is different than the backend used for
 // ComputeConstant which is always cpu.
 TEST_F(ComputeConstantTest, DISABLED_ON_CPU(ReuseComputedConstant)) {
   // Compute a trivial constant, then try to use the value in an Execute
   // call. This should fail because the constant resides on the CPU and the
-  // Execute call is executed on a different backend.
-  ComputationBuilder constant_b(client_, TestName());
+  // Execute call is executed on a different backend.  This test only makes
+  // sense with LocalClient, since CompileOnlyClient does not support
+  // execution.
+  Client* client = ClientOrDie(platform_, ClientType::kLocal);
+  ComputationBuilder constant_b(client, TestName());
   auto constant = constant_b.ConstantR0<int32>(42);
   auto handle = constant_b.ComputeConstant(constant).ConsumeValueOrDie();
-  auto literal = client_->Transfer(*handle).ConsumeValueOrDie();
+  auto literal = client->Transfer(*handle).ConsumeValueOrDie();
   LiteralTestUtil::ExpectR0Equal(42, *literal);
 
   // Build trivial computation which takes one parameter.
-  ComputationBuilder b(client_, TestName());
+  ComputationBuilder b(client, TestName());
   b.Neg(b.Parameter(0, ShapeUtil::MakeShape(S32, {}), "param0"));
   auto computation = b.Build().ConsumeValueOrDie();
 
   // Try to use value from ComputeConstant in Execute.
-  auto execute_status = client_->Execute(computation, {handle.get()});
+  auto execute_status = client->Execute(computation, {handle.get()});
   EXPECT_FALSE(execute_status.ok());
   EXPECT_THAT(
       execute_status.status().error_message(),
diff --git a/tensorflow/compiler/xla/tests/concat_test.cc b/tensorflow/compiler/xla/tests/concat_test.cc
index e645e2336190c706912f94c0662bca08f5dc281a..63bfac441d3c1f7aa257a7f9fc81df98f47551d5 100644
--- a/tensorflow/compiler/xla/tests/concat_test.cc
+++ b/tensorflow/compiler/xla/tests/concat_test.cc
@@ -57,6 +57,15 @@ XLA_TEST_F(ConcatTest, Concat_R1_With_Nothing) {
   ComputeAndCompareR1<float>(&builder, expected, {}, ErrorSpec(0.0001));
 }
 
+XLA_TEST_F(ConcatTest, Concat_R1_L0_With_Nothing) {
+  ComputationBuilder builder(client_, TestName());
+  auto a = builder.ConstantR1<float>({});
+  auto concatenated = builder.ConcatInDim({a}, 0);
+
+  std::vector<float> expected = {};
+  ComputeAndCompareR1<float>(&builder, expected, {}, ErrorSpec(0.0001));
+}
+
 // Show that we can't concatenate R0 with R0 because we can't name the dimension
 // to concatenate on.
 XLA_TEST_F(ConcatTest, CannotConcatR0WithR0) {
diff --git a/tensorflow/compiler/xla/tests/copy_test.cc b/tensorflow/compiler/xla/tests/copy_test.cc
index 29e29505333b64926cdd0b3e9fe7ef3407eaaec2..8ea97e67d640d97baa70cddf60f3336a8849552a 100644
--- a/tensorflow/compiler/xla/tests/copy_test.cc
+++ b/tensorflow/compiler/xla/tests/copy_test.cc
@@ -24,6 +24,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/hlo_instruction.h"
 #include "tensorflow/compiler/xla/service/hlo_module.h"
 #include "tensorflow/compiler/xla/service/hlo_opcode.h"
+#include "tensorflow/compiler/xla/tests/client_library_test_base.h"
 #include "tensorflow/compiler/xla/tests/hlo_test_base.h"
 #include "tensorflow/compiler/xla/tests/literal_test_util.h"
 #include "tensorflow/compiler/xla/tests/test_macros.h"
@@ -256,6 +257,22 @@ XLA_TEST_F(CopyOpTest, CopyConstantR4Layout0312_MultipleTilesPerLayer) {
   TestCopyConstantLayoutR4(2, 14, 5, 35, {0, 3, 1, 2});
 }
 
+using CopyOpClientTest = ClientLibraryTestBase;
+
+XLA_TEST_F(CopyOpClientTest, Copy0x0) {
+  Shape in_shape = ShapeUtil::MakeShapeWithLayout(F32, {0, 0}, {0, 1});
+  Shape out_shape = ShapeUtil::MakeShapeWithLayout(F32, {0, 0}, {1, 0});
+  auto empty = LiteralUtil::CreateFromShape(in_shape);
+
+  ComputationBuilder builder(client_, TestName());
+  auto param0 = builder.Parameter(0, in_shape, "input");
+  auto input_data = client_->TransferToServer(*empty).ConsumeValueOrDie();
+
+  auto actual = ExecuteAndTransfer(&builder, {input_data.get()}, &out_shape)
+                    .ConsumeValueOrDie();
+  LiteralTestUtil::ExpectEqual(*empty, *actual);
+}
+
 }  // namespace
 }  // namespace xla
 
diff --git a/tensorflow/compiler/xla/tests/custom_call_test.cc b/tensorflow/compiler/xla/tests/custom_call_test.cc
index dc54c9defec2394049c38781a8d02fc8bd05158a..8b5b38b0b4b9d91f9491648e9c6ee6301ed74ff7 100644
--- a/tensorflow/compiler/xla/tests/custom_call_test.cc
+++ b/tensorflow/compiler/xla/tests/custom_call_test.cc
@@ -29,22 +29,23 @@ limitations under the License.
 #include "tensorflow/compiler/xla/tests/test_macros.h"
 #include "tensorflow/compiler/xla/xla_data.pb.h"
 #include "tensorflow/core/platform/dynamic_annotations.h"
+#include "tensorflow/core/platform/macros.h"
 #include "tensorflow/core/platform/test.h"
 
-extern "C" void __attribute__((visibility("default")))
+extern "C" void TF_EXPORT
 R0F32Add2(float* out, float** in) {
   TF_ANNOTATE_MEMORY_IS_INITIALIZED(in, sizeof(float*));
   *out = **in + 2.0f;
 }
 
-extern "C" void __attribute__((visibility("default")))
+extern "C" void TF_EXPORT
 R2F32ReduceSum(float* out, float** in) {
   TF_ANNOTATE_MEMORY_IS_INITIALIZED(in, sizeof(float) * 4);
   float* array = in[0];
   *out = array[0] + array[1] + array[2] + array[3];
 }
 
-extern "C" void __attribute__((visibility("default")))
+extern "C" void TF_EXPORT
 Add1ToValues(float* out, float** in) {
   TF_ANNOTATE_MEMORY_IS_INITIALIZED(in, sizeof(float) * 4);
   float* array = in[0];
diff --git a/tensorflow/compiler/xla/tests/deallocation_test.cc b/tensorflow/compiler/xla/tests/deallocation_test.cc
index 94f34f753b7ff8799cf9b505e1a762c9ba640389..cc3c4a2a5e115d7791e8574f4ead17f77dcd5e7c 100644
--- a/tensorflow/compiler/xla/tests/deallocation_test.cc
+++ b/tensorflow/compiler/xla/tests/deallocation_test.cc
@@ -52,7 +52,7 @@ TEST_F(DeallocationTest, DeallocateScalar) {
   builder.ConstantR0<float>(42.0);
   auto global_data = ExecuteAndCheckTransfer(&builder, {});
 
-  // A result can be transfered an arbitrary number of times.  Add an extra
+  // A result can be transferred an arbitrary number of times.  Add an extra
   // transfer here so we're not just testing that a second call to Transfer
   // fails.
   ASSERT_IS_OK(client_->Transfer(*global_data).status());
diff --git a/tensorflow/compiler/xla/tests/dynamic_ops_test.cc b/tensorflow/compiler/xla/tests/dynamic_ops_test.cc
index 180e8514102d115a169b327a26a544bbeb1c8499..cdb4498f4ed1e4f7fb2ad7a29a1cec4e26b76ed3 100644
--- a/tensorflow/compiler/xla/tests/dynamic_ops_test.cc
+++ b/tensorflow/compiler/xla/tests/dynamic_ops_test.cc
@@ -109,7 +109,7 @@ class DynamicSliceTest : public ClientLibraryTestBase {
   template <typename IndexT>
   void RunR1(const std::vector<float>& input_values,
              const std::vector<IndexT> slice_starts,
-             const std::vector<int64> slice_sizes,
+             const std::vector<int64>& slice_sizes,
              const std::vector<float>& expected_values) {
     ComputationBuilder builder(client_, TestName());
     // Initialize and transfer dynamic slice start indices parameter.
@@ -127,7 +127,7 @@ class DynamicSliceTest : public ClientLibraryTestBase {
   template <typename IndexT>
   void RunR2(const Array2D<float>& input_values,
              const std::vector<IndexT> slice_starts,
-             const std::vector<int64> slice_sizes,
+             const std::vector<int64>& slice_sizes,
              const Array2D<float>& expected_values) {
     ComputationBuilder builder(client_, TestName());
     // Initialize and transfer dynamic slice start indices parameter.
@@ -145,7 +145,7 @@ class DynamicSliceTest : public ClientLibraryTestBase {
   template <typename IndexT>
   void RunR3(const Array3D<float>& input_values,
              const std::vector<IndexT> slice_starts,
-             const std::vector<int64> slice_sizes,
+             const std::vector<int64>& slice_sizes,
              const Array3D<float>& expected_values) {
     ComputationBuilder builder(client_, TestName());
     // Initialize and transfer dynamic slice start indices parameter.
diff --git a/tensorflow/compiler/xla/tests/hlo_test_base.cc b/tensorflow/compiler/xla/tests/hlo_test_base.cc
index 4e956bc00c8fcbf0cd200bc2ae5b8f4ccfe63694..f741ff38b55933291e6b0c942efc4a37c61a8f4b 100644
--- a/tensorflow/compiler/xla/tests/hlo_test_base.cc
+++ b/tensorflow/compiler/xla/tests/hlo_test_base.cc
@@ -111,8 +111,9 @@ StatusOr<se::DeviceMemoryBase> HloTestBase::Execute(
       backend_->eigen_intra_op_thread_pool_device());
 
   HloExecutionProfile hlo_execution_profile;
-  ServiceExecutableRunOptions service_run_options(run_options,
-                                                  backend_->StreamBorrower());
+  ServiceExecutableRunOptions service_run_options(
+      run_options, backend_->StreamBorrower(),
+      backend_->inter_op_thread_pool());
   TF_ASSIGN_OR_RETURN(
       se::DeviceMemoryBase result,
       executable->ExecuteOnStream(&service_run_options, arguments,
diff --git a/tensorflow/compiler/xla/tests/literal_test_util.cc b/tensorflow/compiler/xla/tests/literal_test_util.cc
index ef81db6fd66502f9debf180b418d9c30917109aa..23453db57bc4a5db0d3a4f7c327e3313333d1ae2 100644
--- a/tensorflow/compiler/xla/tests/literal_test_util.cc
+++ b/tensorflow/compiler/xla/tests/literal_test_util.cc
@@ -314,7 +314,7 @@ class NearComparator {
 
  private:
   // EXPECTs that the two given scalar values are within the error bound. Keeps
-  // track of how many mismatches have occured to keep the size of the output
+  // track of how many mismatches have occurred to keep the size of the output
   // manageable.
   template <typename NativeT>
   bool ExpectValuesNear(NativeT expected, NativeT actual) {
diff --git a/tensorflow/compiler/xla/tests/literal_test_util.h b/tensorflow/compiler/xla/tests/literal_test_util.h
index aeadc023cc0649cb8e69c3aa981d7f347b3a1a1f..4f98083033310baf6ec95de0d2331d1aff8f3f7d 100644
--- a/tensorflow/compiler/xla/tests/literal_test_util.h
+++ b/tensorflow/compiler/xla/tests/literal_test_util.h
@@ -18,6 +18,7 @@ limitations under the License.
 
 #include <initializer_list>
 #include <memory>
+#include <random>
 #include <string>
 
 #include "tensorflow/compiler/xla/array2d.h"
@@ -171,6 +172,36 @@ class LiteralTestUtil {
       tensorflow::gtl::ArraySlice<int64> minor_to_major,
       const Literal& literal);
 
+  // Creates a literal with the supplied shape, and uses the provided value
+  // generator to populate the literal's values.
+  // Returns the new literal object, or an error Status if failed.
+  template <
+      PrimitiveType type,
+      typename T = typename primitive_util::PrimitiveTypeToNative<type>::type>
+  static StatusOr<std::unique_ptr<Literal>> CreateRandomLiteral(
+      const Shape& shape,
+      const std::function<T(tensorflow::gtl::ArraySlice<int64>)>& generator);
+
+  // Creates a literal with the supplied shape, and initializes the literal
+  // values using a normal distribution with given mean and stddev standard
+  // deviation, and using the engine as entropy generator.
+  // Returns the new literal object, or an error Status if failed.
+  template <
+      PrimitiveType type, typename E,
+      typename T = typename primitive_util::PrimitiveTypeToNative<type>::type>
+  static StatusOr<std::unique_ptr<Literal>> CreateRandomLiteral(
+      const Shape& shape, E* engine, T mean, T stddev);
+
+  // Creates a literal with the supplied shape, and initializes the literal
+  // values using a normal distribution with given mean and stddev standard
+  // deviation.
+  // Returns the new literal object, or an error Status if failed.
+  template <
+      PrimitiveType type,
+      typename T = typename primitive_util::PrimitiveTypeToNative<type>::type>
+  static StatusOr<std::unique_ptr<Literal>> CreateRandomLiteral(
+      const Shape& shape, T mean, T stddev);
+
  private:
   TF_DISALLOW_COPY_AND_ASSIGN(LiteralTestUtil);
 };
@@ -270,6 +301,40 @@ template <typename NativeT>
   ExpectNear(*LiteralUtil::CreateR4FromArray4D(expected), actual, error);
 }
 
+template <PrimitiveType type, typename T>
+/* static */ StatusOr<std::unique_ptr<Literal>>
+LiteralTestUtil::CreateRandomLiteral(
+    const Shape& shape,
+    const std::function<T(tensorflow::gtl::ArraySlice<int64>)>& generator) {
+  using NativeT = typename primitive_util::PrimitiveTypeToNative<type>::type;
+  TF_RET_CHECK(shape.element_type() == type);
+  std::unique_ptr<Literal> literal = LiteralUtil::CreateFromShape(shape);
+  TF_RETURN_IF_ERROR(LiteralUtil::Populate<NativeT>(
+      literal.get(), [&](tensorflow::gtl::ArraySlice<int64> indexes) {
+        return generator(indexes);
+      }));
+  return std::move(literal);
+}
+
+template <PrimitiveType type, typename E, typename T>
+/* static */ StatusOr<std::unique_ptr<Literal>>
+LiteralTestUtil::CreateRandomLiteral(const Shape& shape, E* engine, T mean,
+                                     T stddev) {
+  using NativeT = typename primitive_util::PrimitiveTypeToNative<type>::type;
+  std::normal_distribution<NativeT> generator(mean, stddev);
+  return CreateRandomLiteral<type, NativeT>(
+      shape, [&](tensorflow::gtl::ArraySlice<int64> /*indexes*/) {
+        return generator(*engine);
+      });
+}
+
+template <PrimitiveType type, typename T>
+/* static */ StatusOr<std::unique_ptr<Literal>>
+LiteralTestUtil::CreateRandomLiteral(const Shape& shape, T mean, T stddev) {
+  std::minstd_rand0 engine;
+  return CreateRandomLiteral<type>(shape, &engine, mean, stddev);
+}
+
 }  // namespace xla
 
 #endif  // TENSORFLOW_COMPILER_XLA_TESTS_LITERAL_TEST_UTIL_H_
diff --git a/tensorflow/compiler/xla/tests/local_client_aot_test_helper.cc b/tensorflow/compiler/xla/tests/local_client_aot_test_helper.cc
index 7ea83a9e956ca8b5bb26ea6aaa844d2b63107328..52816dc72cc4d094054b2aea72f0cc63c7ff478d 100644
--- a/tensorflow/compiler/xla/tests/local_client_aot_test_helper.cc
+++ b/tensorflow/compiler/xla/tests/local_client_aot_test_helper.cc
@@ -42,7 +42,7 @@ xla::Computation Doubler(xla::Client* client) {
 int main(int argc, char** argv) {
   tensorflow::port::InitMain(argv[0], &argc, &argv);
 
-  auto client = xla::ClientLibrary::LocalClientOrDie();
+  auto client = xla::ClientLibrary::GetOrCreateCompileOnlyClient().ValueOrDie();
 
   xla::ComputationBuilder builder(client, "aot_test_helper");
   auto opaque_shape = xla::ShapeUtil::MakeOpaqueShape();
@@ -74,7 +74,7 @@ int main(int argc, char** argv) {
   llvm::Triple triple(xla::llvm_ir::AsStringRef(triple_string));
 
   xla::Computation computation = builder.Build().ConsumeValueOrDie();
-  xla::LocalClient::AheadOfTimeComputationInstance instance{
+  xla::CompileOnlyClient::AotComputationInstance instance{
       &computation, /*argument_layouts=*/{&opaque_shape}, &r0f32};
 
   xla::cpu::CpuAotCompilationOptions options(
diff --git a/tensorflow/compiler/xla/tests/local_client_test_base.cc b/tensorflow/compiler/xla/tests/local_client_test_base.cc
index 7fe4c9020f4c67ecc9888425cf0a2c358ad49e6d..7fcf687655a98d3ee972f8d3b784be655410a313 100644
--- a/tensorflow/compiler/xla/tests/local_client_test_base.cc
+++ b/tensorflow/compiler/xla/tests/local_client_test_base.cc
@@ -17,12 +17,19 @@ limitations under the License.
 
 #include <vector>
 
+#define EIGEN_USE_THREADS
+
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
 #include "tensorflow/compiler/xla/client/local_client.h"
 #include "tensorflow/compiler/xla/map_util.h"
 #include "tensorflow/compiler/xla/ptr_util.h"
 #include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/status_macros.h"
 #include "tensorflow/compiler/xla/test_helpers.h"
+#include "tensorflow/core/common_runtime/eigen_thread_pool.h"
+#include "tensorflow/core/lib/core/threadpool.h"
+#include "tensorflow/core/platform/cpu_info.h"
+#include "tensorflow/core/platform/env.h"
 #include "tensorflow/core/platform/logging.h"
 
 namespace xla {
@@ -91,16 +98,34 @@ int64 TestAllocator::deallocation_count(int device_ordinal) const {
   return allocator_;
 }
 
+// Define this in .cc file to avoid having to include eigen or forward declare
+// these types in the header.
+struct LocalClientTestBase::EigenThreadPoolWrapper {
+  explicit EigenThreadPoolWrapper()
+      : pool(new tensorflow::thread::ThreadPool(
+            tensorflow::Env::Default(), "XLAEigenTest", /*num_threads=*/2)),
+        wrapper(new tensorflow::EigenThreadPoolWrapper(pool.get())),
+        device(new Eigen::ThreadPoolDevice(wrapper.get(),
+                                           wrapper->NumThreads())) {}
+
+  std::unique_ptr<tensorflow::thread::ThreadPool> pool;
+  std::unique_ptr<tensorflow::EigenThreadPoolWrapper> wrapper;
+  std::unique_ptr<Eigen::ThreadPoolDevice> device;
+};
+
 LocalClientTestBase::LocalClientTestBase(
     perftools::gputools::Platform* platform)
     : local_client_(
-          ClientLibrary::GetOrCreateLocalClient(platform).ValueOrDie()) {
+          ClientLibrary::GetOrCreateLocalClient(platform).ValueOrDie()),
+      thread_pool_wrapper_(new EigenThreadPoolWrapper()) {
   stream_executor_ = PlatformUtil::GetStreamExecutors(local_client_->platform())
                          .ValueOrDie()[local_client_->default_device_ordinal()];
   transfer_manager_ =
       TransferManager::GetForPlatform(local_client_->platform()).ValueOrDie();
 }
 
+LocalClientTestBase::~LocalClientTestBase() {}
+
 std::unique_ptr<ScopedShapedBuffer>
 LocalClientTestBase::LiteralToScopedShapedBuffer(const Literal& literal) {
   return LiteralToScopedShapedBuffer(literal,
@@ -190,8 +215,7 @@ ExecutableRunOptions LocalClientTestBase::DefaultExecutableRunOptions() const {
   ExecutableRunOptions run_options;
   run_options.set_inter_op_thread_pool(
       local_client_->backend().inter_op_thread_pool());
-  run_options.set_intra_op_thread_pool(
-      local_client_->backend().eigen_intra_op_thread_pool_device());
+  run_options.set_intra_op_thread_pool(thread_pool_wrapper_->device.get());
   run_options.set_allocator(GetOrCreateAllocator(local_client_->platform()));
   return run_options;
 }
diff --git a/tensorflow/compiler/xla/tests/local_client_test_base.h b/tensorflow/compiler/xla/tests/local_client_test_base.h
index 4e7b05cea60887eec628ce9b4848321e721030e5..e3c3bb46cf26cc742b7abb39a3e457d823d829ec 100644
--- a/tensorflow/compiler/xla/tests/local_client_test_base.h
+++ b/tensorflow/compiler/xla/tests/local_client_test_base.h
@@ -74,8 +74,10 @@ class TestAllocator : public StreamExecutorMemoryAllocator {
 // A base class for tests which exercise the LocalClient interface.
 class LocalClientTestBase : public ::testing::Test {
  protected:
+  struct EigenThreadPoolWrapper;
   explicit LocalClientTestBase(
       perftools::gputools::Platform* platform = nullptr);
+  virtual ~LocalClientTestBase();
 
   static TestAllocator* GetOrCreateAllocator(
       perftools::gputools::Platform* platform);
@@ -142,6 +144,8 @@ class LocalClientTestBase : public ::testing::Test {
   TransferManager* transfer_manager_;
 
   LocalClient* local_client_;
+
+  std::unique_ptr<EigenThreadPoolWrapper> thread_pool_wrapper_;
 };
 
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/tests/prng_test.cc b/tensorflow/compiler/xla/tests/prng_test.cc
index 0cd0f97b0621d771ae039f0be6bd6c67161b49a4..5a6aa467e54f31b57d04b9c1f0cf82cd6295903d 100644
--- a/tensorflow/compiler/xla/tests/prng_test.cc
+++ b/tensorflow/compiler/xla/tests/prng_test.cc
@@ -21,6 +21,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/literal_util.h"
 #include "tensorflow/compiler/xla/primitive_util.h"
 #include "tensorflow/compiler/xla/shape_util.h"
+#include "tensorflow/compiler/xla/test.h"
 #include "tensorflow/compiler/xla/tests/client_library_test_base.h"
 #include "tensorflow/compiler/xla/tests/test_macros.h"
 #include "tensorflow/compiler/xla/util.h"
@@ -55,7 +56,7 @@ void PrngTest::UniformTest(T a, T b, tensorflow::gtl::ArraySlice<int64> dims) {
 
   SetSeed(42);
   auto actual = ExecuteAndTransferOrDie(&builder, /*arguments=*/{});
-  EXPECT_TRUE(ContainersEqual(dims, actual->shape().dimensions()));
+  EXPECT_THAT(dims, ::testing::ElementsAreArray(actual->shape().dimensions()));
   LiteralUtil::EachCell<T>(*actual,
                            [=](tensorflow::gtl::ArraySlice<int64>, T value) {
                              EXPECT_LE(a, value);
@@ -75,7 +76,7 @@ void PrngTest::BernoulliTest(float p, tensorflow::gtl::ArraySlice<int64> dims) {
       auto actual,
       client_->ExecuteAndTransfer(computation, /*arguments=*/{},
                                   &execution_options));
-  EXPECT_TRUE(ContainersEqual(dims, actual->shape().dimensions()));
+  EXPECT_THAT(dims, ::testing::ElementsAreArray(actual->shape().dimensions()));
   int32 sum = 0;
   LiteralUtil::EachCell<uint32>(
       *actual, [&sum](tensorflow::gtl::ArraySlice<int64>, uint32 value) {
diff --git a/tensorflow/compiler/xla/tests/reduce_test.cc b/tensorflow/compiler/xla/tests/reduce_test.cc
index d00a3175344dffcab08116678a8c46782aa8cf64..feb2b465fca6b1ffda190025568470e8daf297a3 100644
--- a/tensorflow/compiler/xla/tests/reduce_test.cc
+++ b/tensorflow/compiler/xla/tests/reduce_test.cc
@@ -61,7 +61,7 @@ namespace {
 class ReduceTest : public ClientLibraryTestBase {
  protected:
   ReduceTest() {
-    // Implementation note: layed out z >> y >> x by default.
+    // Implementation note: laid out z >> y >> x by default.
     // clang-format off
     literal_2d_ = LiteralUtil::CreateR2<float>({
       // x0   x1   x2
diff --git a/tensorflow/compiler/xla/tests/reduce_window_test.cc b/tensorflow/compiler/xla/tests/reduce_window_test.cc
index 56501e43b5c5d965ea4305f2ca88909b253ed273..c3b768579a401706eff4a2a24d840da84080d26d 100644
--- a/tensorflow/compiler/xla/tests/reduce_window_test.cc
+++ b/tensorflow/compiler/xla/tests/reduce_window_test.cc
@@ -43,7 +43,7 @@ class ReduceWindowTest : public ClientLibraryTestBase {
  public:
   ReduceWindowTest() : builder_(client_, TestName()) {}
 
-  void ReduceWindowAdd(ComputationDataHandle input,
+  void ReduceWindowAdd(const ComputationDataHandle& input,
                        tensorflow::gtl::ArraySlice<int64> window_dimensions,
                        tensorflow::gtl::ArraySlice<int64> window_strides,
                        Padding padding) {
@@ -52,7 +52,7 @@ class ReduceWindowTest : public ClientLibraryTestBase {
                           window_dimensions, window_strides, padding);
   }
 
-  void ReduceWindowMax(ComputationDataHandle input,
+  void ReduceWindowMax(const ComputationDataHandle& input,
                        tensorflow::gtl::ArraySlice<int64> window_dimensions,
                        tensorflow::gtl::ArraySlice<int64> window_strides,
                        Padding padding) {
@@ -61,7 +61,7 @@ class ReduceWindowTest : public ClientLibraryTestBase {
         CreateScalarMax(), window_dimensions, window_strides, padding);
   }
 
-  void ReduceWindowMin(ComputationDataHandle input,
+  void ReduceWindowMin(const ComputationDataHandle& input,
                        tensorflow::gtl::ArraySlice<int64> window_dimensions,
                        tensorflow::gtl::ArraySlice<int64> window_strides,
                        Padding padding) {
@@ -182,6 +182,7 @@ TEST_F(ReduceWindowTest, DISABLED_AmongMajor2DimsMediumSizeLargePadding) {
 
   ComputeAndCompareR4<float>(&builder_, *result, {}, ErrorSpec(1e-3, 1e-3));
 }
+
 // TODO(b/31809540): Implement minor dim reduction to reduce num of reshapes.
 TEST_F(ReduceWindowTest, ReduceR4AmongXYMinorSmall) {
   Array4D<float> input_array(2, 2, 4, 16);
@@ -368,6 +369,16 @@ TEST_F(ReduceWindowTest, Add2x2In2x2Disjoint) {
   ComputeAndCompareR2<float>(&builder_, expected, {}, ErrorSpec(0.0001));
 }
 
+TEST_F(ReduceWindowTest, Add1x2In2x2Same) {
+  Array2D<float> input_array({{1.0f, 2.0f}, {3.0f, 4.0f}});
+  auto input = builder_.ConstantR2FromArray2D<float>(input_array);
+  ReduceWindowAdd(input, {1, 2}, {1, 1}, Padding::kSame);
+  Array2D<float> expected({
+      {3.0f, 2.0f}, {7.0f, 4.0f},
+  });
+  ComputeAndCompareR2<float>(&builder_, expected, {}, ErrorSpec(0.0001));
+}
+
 XLA_TEST_F(ReduceWindowTest, Add1x1x2In2x1x2) {
   Array3D<float> input_array(2, 1, 2);
   input_array(0, 0, 0) = 1000;
diff --git a/tensorflow/compiler/xla/tests/reshape_test.cc b/tensorflow/compiler/xla/tests/reshape_test.cc
index 839ae42a194381396e387f0e6e8a018d6fbd5cff..c5f20b9ca1db1812f52a4d6f568ff9093016a90b 100644
--- a/tensorflow/compiler/xla/tests/reshape_test.cc
+++ b/tensorflow/compiler/xla/tests/reshape_test.cc
@@ -67,6 +67,22 @@ XLA_TEST_F(ReshapeTest, SingleElementArrayToScalar) {
   ComputeAndCompareR0<float>(&builder, 1.0f, {}, zero_error_spec_);
 }
 
+XLA_TEST_F(ReshapeTest, ScalarToSingleElementArray) {
+  ComputationBuilder builder(client_, TestName());
+
+  std::unique_ptr<Literal> param0_literal = LiteralUtil::CreateR0<float>(1.0f);
+  std::unique_ptr<GlobalData> param0_data =
+      client_->TransferToServer(*param0_literal).ConsumeValueOrDie();
+
+  auto a = builder.Parameter(0, ShapeUtil::MakeShape(F32, {}), "param0");
+  a = builder.Neg(a);
+  auto reshape =
+      builder.Reshape(/*operand=*/a, /*dimensions=*/{}, /*new_sizes=*/{1});
+
+  ComputeAndCompareR1<float>(&builder, {-1.0f}, {param0_data.get()},
+                             zero_error_spec_);
+}
+
 XLA_TEST_F(ReshapeTest, Trivial0x3) {
   ComputationBuilder builder(client_, TestName());
   auto a = builder.ConstantR2FromArray2D<float>(Array2D<float>(0, 3));
@@ -75,6 +91,24 @@ XLA_TEST_F(ReshapeTest, Trivial0x3) {
   ComputeAndCompareR1<float>(&builder, {}, {}, zero_error_spec_);
 }
 
+// TODO(b/29185393): Make this work with the GPU backend. The GPU backend
+// does not handle zero-sized shapes correctly. Failed last on 2017-05-15
+// with an incorrect result rank.
+XLA_TEST_F(ReshapeTest, DISABLED_ON_GPU(Trivial0x3WithParameter)) {
+  ComputationBuilder builder(client_, TestName());
+
+  std::unique_ptr<Literal> param0_literal =
+      LiteralUtil::CreateR2FromArray2D<float>(Array2D<float>(0, 3));
+  std::unique_ptr<GlobalData> param0_data =
+      client_->TransferToServer(*param0_literal).ConsumeValueOrDie();
+
+  auto a = builder.Parameter(0, ShapeUtil::MakeShape(F32, {0, 3}), "param0");
+  auto result = builder.Collapse(/*operand=*/a, /*dimensions=*/{0, 1});
+
+  ComputeAndCompareR1<float>(&builder, {}, {param0_data.get()},
+                             zero_error_spec_);
+}
+
 XLA_TEST_F(ReshapeTest, Trivial3x0) {
   ComputationBuilder builder(client_, TestName());
   auto a = builder.ConstantR2FromArray2D<float>(Array2D<float>(3, 0));
diff --git a/tensorflow/compiler/xla/tools/BUILD b/tensorflow/compiler/xla/tools/BUILD
index 1d9baf5de102752fe4b47af22ce127ba934a2579..535e5b605b4f68671c9b6a8af4a12732f88e744e 100644
--- a/tensorflow/compiler/xla/tools/BUILD
+++ b/tensorflow/compiler/xla/tools/BUILD
@@ -153,6 +153,7 @@ cc_binary(
         "//tensorflow/compiler/xla/client:computation",
         "//tensorflow/compiler/xla/client:local_client",
         "//tensorflow/compiler/xla/service",
+        "//tensorflow/compiler/xla/service:computation_tracker",
         "//tensorflow/compiler/xla/service:session_proto",
         "//tensorflow/core:lib",
     ],
diff --git a/tensorflow/compiler/xla/tools/dumped_computation_to_text.cc b/tensorflow/compiler/xla/tools/dumped_computation_to_text.cc
index 8b96e13489774539b50022808975db56c5ddc6f7..1f0ca31d6d6d57507c8639bec83d66f36cb44ab8 100644
--- a/tensorflow/compiler/xla/tools/dumped_computation_to_text.cc
+++ b/tensorflow/compiler/xla/tools/dumped_computation_to_text.cc
@@ -21,6 +21,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/client/client_library.h"
 #include "tensorflow/compiler/xla/client/computation.h"
 #include "tensorflow/compiler/xla/client/local_client.h"
+#include "tensorflow/compiler/xla/service/computation_tracker.h"
 #include "tensorflow/compiler/xla/service/service.h"
 #include "tensorflow/compiler/xla/service/session.pb.h"
 #include "tensorflow/compiler/xla/statusor.h"
@@ -34,7 +35,7 @@ limitations under the License.
 namespace xla {
 namespace tools {
 
-void RealMain(tensorflow::gtl::ArraySlice<char*> args) {
+void RealMain(tensorflow::gtl::ArraySlice<char*> args, bool compile) {
   LocalClient* client = ClientLibrary::LocalClientOrDie();
   LocalService* local_service =
       ClientLibrary::GetXlaService(client->platform());
@@ -50,23 +51,35 @@ void RealMain(tensorflow::gtl::ArraySlice<char*> args) {
     }
     Computation computation = computation_status.ConsumeValueOrDie();
 
-    std::unique_ptr<ProgramShape> program_shape =
-        client->GetComputationShape(computation).ConsumeValueOrDie();
+    if (compile) {
+      std::unique_ptr<ProgramShape> program_shape =
+          client->GetComputationShape(computation).ConsumeValueOrDie();
 
-    std::vector<const Shape*> layouts;
-    for (int i = 0; i < program_shape->parameters_size(); ++i) {
-      layouts.push_back(&program_shape->parameters(i));
-    }
-    StatusOr<std::unique_ptr<Executable>> executable =
-        local_service->CompileExecutable(
-            computation.handle(), layouts, &program_shape->result(),
-            /*device_ordinal=*/0, /*has_hybrid_result=*/true);
+      std::vector<const Shape*> layouts;
+      for (int i = 0; i < program_shape->parameters_size(); ++i) {
+        layouts.push_back(&program_shape->parameters(i));
+      }
+      StatusOr<std::unique_ptr<Executable>> executable =
+          local_service->CompileExecutable(
+              computation.handle(), layouts, &program_shape->result(),
+              /*device_ordinal=*/0, /*has_hybrid_result=*/true);
+
+      const HloModule& module = executable.ValueOrDie()->module();
 
-    const HloModule& module = executable.ValueOrDie()->module();
+      fprintf(stdout, "HLO compiled for %s backend:\n%s\n",
+              local_service->backend().platform()->Name().c_str(),
+              module.ToString().c_str());
+    } else {
+      const ComputationTracker& tracker = local_service->computation_tracker();
+      UserComputation* user_computation =
+          tracker.Resolve(computation.handle()).ConsumeValueOrDie();
+      VersionedComputationHandle versioned_handle =
+          user_computation->GetVersionedHandle();
+      std::unique_ptr<HloModule> module =
+          tracker.BuildHloModule(versioned_handle).ConsumeValueOrDie();
 
-    fprintf(stdout, "HLO for %s backend:\n%s\n",
-            local_service->backend().platform()->Name().c_str(),
-            module.ToString().c_str());
+      fprintf(stdout, "%s\n", module->ToString().c_str());
+    }
   }
 }
 
@@ -74,10 +87,21 @@ void RealMain(tensorflow::gtl::ArraySlice<char*> args) {
 }  // namespace xla
 
 int main(int argc, char** argv) {
-  tensorflow::port::InitMain(argv[0], &argc, &argv);
+  bool compile = false;
+  std::vector<tensorflow::Flag> flag_list = {
+      {"compile", &compile,
+       "If true, compile the computation using the default client before "
+       "dumping the HLO. Otherwise dump the raw (uncompiled) HLO."},
+  };
+  const xla::string usage = tensorflow::Flags::Usage(argv[0], flag_list);
+  bool parsed_flags_ok = tensorflow::Flags::Parse(&argc, argv, flag_list);
+  QCHECK(parsed_flags_ok) << "\n" << usage;
+
+  tensorflow::port::InitMain(usage.c_str(), &argc, &argv);
+  QCHECK(argc > 1) << "\nERROR: must specify at least one module\n" << usage;
 
   tensorflow::gtl::ArraySlice<char*> args(argv, argc);
   args.pop_front();  // Pop off the binary name, argv[0]
-  xla::tools::RealMain(args);
+  xla::tools::RealMain(args, compile);
   return 0;
 }
diff --git a/tensorflow/compiler/xla/types.h b/tensorflow/compiler/xla/types.h
index 8258031a2c5119d085a483a0826f7284897dcee3..8d8e66715a3626825195f875a5942e1b1db67f92 100644
--- a/tensorflow/compiler/xla/types.h
+++ b/tensorflow/compiler/xla/types.h
@@ -18,6 +18,8 @@ limitations under the License.
 
 #include "tensorflow/core/platform/types.h"
 
+#include <Eigen/Core>
+
 namespace xla {
 
 using ::tensorflow::string;
@@ -32,6 +34,8 @@ using ::tensorflow::uint16;
 using ::tensorflow::uint32;
 using ::tensorflow::uint64;
 
+using ::Eigen::half;
+
 }  // namespace xla
 
 #endif  // TENSORFLOW_COMPILER_XLA_TYPES_H_
diff --git a/tensorflow/compiler/xla/util.cc b/tensorflow/compiler/xla/util.cc
index a711b5035d842cd26945b2dac1159392813d56ab..d467178cb528a93b2c1030fc72d054cc0edf95b6 100644
--- a/tensorflow/compiler/xla/util.cc
+++ b/tensorflow/compiler/xla/util.cc
@@ -33,7 +33,7 @@ namespace {
 // Adds a backtrace to the provided status iff the xla_status_add_backtrace flag
 // is set. This is useful for quickly tracing status errors observed coming out
 // of the service.
-Status MaybeAddBacktrace(Status prior) {
+Status MaybeAddBacktrace(const Status& prior) {
   DCHECK(!prior.ok());
   if (legacy_flags::GetUtilFlags()->xla_status_add_backtrace) {
     return Status{prior.code(),
@@ -153,16 +153,26 @@ string Reindent(tensorflow::StringPiece original,
       });
 }
 
+bool IsPermutation(tensorflow::gtl::ArraySlice<int64> permutation, int64 rank) {
+  if (rank != permutation.size()) {
+    return false;
+  }
+  std::vector<int64> output(permutation.size(), -1);
+  for (auto index : permutation) {
+    CHECK_GE(index, 0);
+    CHECK_LT(index, rank);
+    output[index] = 0;
+  }
+  return std::find(output.begin(), output.end(), -1) == output.end();
+}
+
 std::vector<int64> InversePermutation(
     tensorflow::gtl::ArraySlice<int64> input_permutation) {
+  DCHECK(IsPermutation(input_permutation, input_permutation.size()));
   std::vector<int64> output_permutation(input_permutation.size(), -1);
   for (size_t i = 0; i < input_permutation.size(); ++i) {
     output_permutation[input_permutation[i]] = i;
   }
-  DCHECK_EQ(
-      0, std::count(output_permutation.begin(), output_permutation.end(), -1));
-  DCHECK(std::is_permutation(input_permutation.begin(), input_permutation.end(),
-                             output_permutation.begin()));
   return output_permutation;
 }
 
@@ -196,6 +206,15 @@ PaddingConfig MakeNoPaddingConfig(int64 rank) {
   return padding_config;
 }
 
+bool HasInteriorPadding(const PaddingConfig& config) {
+  for (const auto& dim : config.dimensions()) {
+    if (dim.interior_padding() != 0) {
+      return true;
+    }
+  }
+  return false;
+}
+
 string HumanReadableNumFlops(double flops, double nanoseconds) {
   if (nanoseconds == 0) {
     return "NaN FLOP/s";
diff --git a/tensorflow/compiler/xla/util.h b/tensorflow/compiler/xla/util.h
index 32b5fbba0032c04117c2109b5452e098b03e0947..42d5c1d15501fb912551a044414e6fa0c83283b8 100644
--- a/tensorflow/compiler/xla/util.h
+++ b/tensorflow/compiler/xla/util.h
@@ -39,6 +39,13 @@ limitations under the License.
 
 namespace xla {
 
+// Ranks greater than 8 are very rare, so use InlinedVector<int64, 8> to store
+// the bounds and indices. And for the rare cases of ranks greater than 8,
+// the InlinedVector will just behave like an std::vector<> and allocate the
+// memory to store its values.
+static constexpr int kInlineRank = 8;
+using DimensionVector = tensorflow::gtl::InlinedVector<int64, kInlineRank>;
+
 // RAII timer that logs with a given label the wall clock time duration in human
 // readable form. This differs from base's ElapsedTimer primarily in that it
 // spits out the human-readable duration form.
@@ -139,6 +146,18 @@ bool ContainersEqual(const Container1T& c1, const Container2T& c2,
           std::equal(std::begin(c1), std::end(c1), std::begin(c2), p));
 }
 
+// Performs a copy of count values from src to dest, using different strides for
+// source and destination. The source starting index is src_base, while the
+// destination one is dest_base.
+template <typename D, typename S>
+void StridedCopy(tensorflow::gtl::MutableArraySlice<D> dest, int64 dest_base,
+                 int64 dest_stride, tensorflow::gtl::ArraySlice<S> src,
+                 int64 src_base, int64 src_stride, int64 count) {
+  for (; count > 0; --count, dest_base += dest_stride, src_base += src_stride) {
+    dest[dest_base] = static_cast<D>(src[src_base]);
+  }
+}
+
 // Adds some context information to the error message in a
 // Status.  This is useful as Statuses are
 // propagated upwards.
@@ -165,6 +184,9 @@ Status Unavailable(const char* format, ...) TF_PRINTF_ATTRIBUTE(1, 2);
 string Reindent(tensorflow::StringPiece original,
                 tensorflow::StringPiece indentation);
 
+// Checks whether permutation is a permutation of the [0, rank) integer range.
+bool IsPermutation(tensorflow::gtl::ArraySlice<int64> permutation, int64 rank);
+
 // Applies `permutation` on `input` and returns the permuted array.
 // For each i, output[permutation[i]] = input[i].
 //
@@ -175,12 +197,11 @@ template <template <typename...> class C, typename T>
 std::vector<T> Permute(tensorflow::gtl::ArraySlice<int64> permutation,
                        C<T> input_) {
   tensorflow::gtl::ArraySlice<T> input(input_);
-  CHECK_EQ(permutation.size(), input.size());
+  CHECK(IsPermutation(permutation, input.size()));
   std::vector<T> output(input.size());
   for (size_t i = 0; i < permutation.size(); ++i) {
     output[permutation[i]] = input[i];
   }
-  DCHECK(std::is_permutation(input.begin(), input.end(), output.begin()));
   return output;
 }
 
@@ -244,6 +265,10 @@ string VectorString(const std::initializer_list<T>& c) {
 // Returns a PaddingConfig object that represents no padding for the given rank.
 PaddingConfig MakeNoPaddingConfig(int64 rank);
 
+// Returns true if the padding configuration has at least one dimension with
+// non-zero interior padding.
+bool HasInteriorPadding(const PaddingConfig& config);
+
 // Imports the templated FloorOfRatio math function from the TensorFlow
 // namespace, as it is very commonly used.
 template <typename T>
diff --git a/tensorflow/compiler/xla/xla_data.proto b/tensorflow/compiler/xla/xla_data.proto
index e5b94fcefe3c543715aa7f87cecc2cb672ee7ac7..52189fb5d707151a8c573b62a8c55fd8cb1022b9 100644
--- a/tensorflow/compiler/xla/xla_data.proto
+++ b/tensorflow/compiler/xla/xla_data.proto
@@ -286,6 +286,7 @@ message Literal {
   repeated float f32s = 8;
   repeated double f64s = 9;
   repeated Literal tuple_literals = 10;
+  bytes f16s = 11; // Note: the F16s are encoded in little endian byte order
 }
 
 message WindowDimension {
diff --git a/tensorflow/contrib/BUILD b/tensorflow/contrib/BUILD
index 7949f62d196435bc647553f48763b752315b05db..616ceb5ca17e458f7cee1b3433b3d9c45a826111 100755
--- a/tensorflow/contrib/BUILD
+++ b/tensorflow/contrib/BUILD
@@ -13,6 +13,7 @@ py_library(
     srcs_version = "PY2AND3",
     visibility = ["//visibility:public"],
     deps = [
+        "//tensorflow/contrib/batching:batch_py",
         "//tensorflow/contrib/bayesflow:bayesflow_py",
         "//tensorflow/contrib/cloud:cloud_py",
         "//tensorflow/contrib/compiler:compiler_py",
@@ -74,10 +75,11 @@ cc_library(
     name = "contrib_kernels",
     visibility = ["//visibility:public"],
     deps = [
+        "//tensorflow/contrib/batching:batch_ops_kernels",
         "//tensorflow/contrib/factorization/kernels:all_kernels",
         "//tensorflow/contrib/input_pipeline:input_pipeline_ops_kernels",
-        "//tensorflow/contrib/layers:bucketization_op_kernel",
         "//tensorflow/contrib/layers:sparse_feature_cross_op_kernel",
+        "//tensorflow/contrib/nccl:nccl_kernels",
         "//tensorflow/contrib/tensor_forest:tensor_forest_kernels",
     ],
 )
@@ -86,11 +88,12 @@ cc_library(
     name = "contrib_ops_op_lib",
     visibility = ["//visibility:public"],
     deps = [
+        "//tensorflow/contrib/batching:batch_ops_op_lib",
         "//tensorflow/contrib/factorization:all_ops",
         "//tensorflow/contrib/framework:all_ops",
         "//tensorflow/contrib/input_pipeline:input_pipeline_ops_op_lib",
-        "//tensorflow/contrib/layers:bucketization_op_op_lib",
         "//tensorflow/contrib/layers:sparse_feature_cross_op_op_lib",
+        "//tensorflow/contrib/nccl:nccl_ops_op_lib",
         "//tensorflow/contrib/tensor_forest:tensor_forest_ops_op_lib",
     ],
 )
diff --git a/tensorflow/contrib/batching/BUILD b/tensorflow/contrib/batching/BUILD
index ff6854426c5cec2c09e99cf9da0c8e7da9843a1c..b441906cd4dacfbbd930e05c021c87577b07aaab 100644
--- a/tensorflow/contrib/batching/BUILD
+++ b/tensorflow/contrib/batching/BUILD
@@ -9,23 +9,20 @@ licenses(["notice"])  # Apache 2.0
 
 exports_files(["LICENSE"])
 
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
+cc_library(
+    name = "batch_scheduler_hdrs",
+    hdrs = ["batch_scheduler.h"],
+    deps = [
+        "//tensorflow/core:framework_headers_lib",
+    ],
 )
 
 cc_library(
     name = "batch_scheduler",
     hdrs = ["batch_scheduler.h"],
     deps = [
+        ":batch_scheduler_hdrs",
         "//tensorflow/core:lib",
-        "//tensorflow/core:tensorflow",
     ],
 )
 
@@ -41,14 +38,26 @@ cc_test(
     ],
 )
 
+cc_library(
+    name = "shared_batch_scheduler_hdrs",
+    hdrs = ["shared_batch_scheduler.h"],
+    deps = [
+        ":batch_scheduler_hdrs",
+        "//tensorflow/contrib/batching/util:periodic_function_dynamic",
+        "//tensorflow/core:framework_headers_lib",
+    ],
+)
+
 cc_library(
     name = "shared_batch_scheduler",
     hdrs = ["shared_batch_scheduler.h"],
     deps = [
         ":batch_scheduler",
+        ":shared_batch_scheduler_hdrs",
         "//tensorflow/contrib/batching/util:periodic_function",
         "//tensorflow/core:lib",
     ],
+    alwayslink = 1,
 )
 
 cc_test(
@@ -102,3 +111,95 @@ cc_test(
         "//tensorflow/core:test",
     ],
 )
+
+load(
+    "//tensorflow:tensorflow.bzl",
+    "py_test",
+    "tf_custom_op_library",
+    "tf_gen_op_libs",
+    "tf_gen_op_wrapper_py",
+    "tf_kernel_library",
+)
+load("//tensorflow:tensorflow.bzl", "tf_custom_op_py_library")
+
+tf_custom_op_library(
+    name = "python/ops/_batch_ops.so",
+    srcs = [
+        "ops/batch_ops.cc",
+    ],
+    deps = [
+        "//tensorflow/contrib/batching/kernels:batch_kernels",
+    ],
+)
+
+tf_gen_op_libs(
+    op_lib_names = ["batch_ops"],
+)
+
+tf_gen_op_wrapper_py(
+    name = "batch_ops",
+    deps = [":batch_ops_op_lib"],
+)
+
+tf_kernel_library(
+    name = "batch_ops_kernels",
+    deps = [
+        "//tensorflow/contrib/batching/kernels:batch_kernels",
+        "//tensorflow/contrib/batching/util:periodic_function",
+        "//tensorflow/core/kernels:concat_lib",
+        "//tensorflow/core/kernels:ops_util",
+        "//tensorflow/core/kernels:split_lib",
+    ],
+    alwayslink = 1,
+)
+
+tf_custom_op_py_library(
+    name = "batch_py",
+    srcs = glob(["python/ops/*.py"]) + ["__init__.py"],
+    dso = [":python/ops/_batch_ops.so"],
+    kernels = [
+        ":batch_ops_kernels",
+        ":batch_ops_op_lib",
+    ],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":batch_ops",
+        "//tensorflow/contrib/util:util_py",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:errors",
+        "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:platform",
+        "//tensorflow/python:state_ops",
+        "//tensorflow/python:variable_scope",
+        "//tensorflow/python:variables",
+    ],
+)
+
+py_test(
+    name = "batch_ops_test",
+    size = "small",
+    srcs = ["python/ops/batch_ops_test.py"],
+    srcs_version = "PY2AND3",
+    tags = ["nomac"],
+    deps = [
+        ":batch_py",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:gradients",
+        "//tensorflow/python:platform_test",
+        "//tensorflow/python:script_ops",
+        "//tensorflow/python:state_ops",
+        "//tensorflow/python:variables",
+    ],
+)
+
+filegroup(
+    name = "all_files",
+    srcs = glob(
+        ["**/*"],
+        exclude = [
+            "**/METADATA",
+            "**/OWNERS",
+        ],
+    ),
+)
diff --git a/tensorflow/contrib/batching/__init__.py b/tensorflow/contrib/batching/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..44fa5f42a73bfb1bf008f6f4eafd14913c88dcfa
--- /dev/null
+++ b/tensorflow/contrib/batching/__init__.py
@@ -0,0 +1,26 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Ops and modules related to batch.
+
+@@batch_function
+"""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.contrib.batching.python.ops.batch_ops import batch_function
+
+from tensorflow.python.util.all_util import remove_undocumented
+remove_undocumented(__name__)
diff --git a/tensorflow/contrib/batching/kernels/BUILD b/tensorflow/contrib/batching/kernels/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..6e53dd9a5fc0201c5ed91d1eaf07f940e341fb5e
--- /dev/null
+++ b/tensorflow/contrib/batching/kernels/BUILD
@@ -0,0 +1,34 @@
+# Description:
+#   Contains kernels for the batching ops.
+
+package(default_visibility = ["//tensorflow:__subpackages__"])
+
+licenses(["notice"])  # Apache 2.0
+
+exports_files(["LICENSE"])
+
+cc_library(
+    name = "batch_kernels",
+    srcs = ["batch_kernels.cc"],
+    deps = [
+        "//tensorflow/contrib/batching:shared_batch_scheduler_hdrs",
+        "//tensorflow/contrib/batching/util:periodic_function_dynamic",
+        "//tensorflow/core:framework_headers_lib",
+        "//tensorflow/core:protos_all_cc",
+        "//tensorflow/core/kernels:concat_lib_hdrs",
+        "//tensorflow/core/kernels:ops_util_hdrs",
+        "//tensorflow/core/kernels:split_lib_hdrs",
+    ],
+    alwayslink = 1,
+)
+
+filegroup(
+    name = "all_files",
+    srcs = glob(
+        ["**/*"],
+        exclude = [
+            "**/METADATA",
+            "**/OWNERS",
+        ],
+    ),
+)
diff --git a/tensorflow/contrib/batching/kernels/batch_kernels.cc b/tensorflow/contrib/batching/kernels/batch_kernels.cc
new file mode 100644
index 0000000000000000000000000000000000000000..1e0957298badf9842f9617f1db1ead24d26b26ba
--- /dev/null
+++ b/tensorflow/contrib/batching/kernels/batch_kernels.cc
@@ -0,0 +1,996 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/contrib/batching/shared_batch_scheduler.h"
+#include "tensorflow/contrib/batching/util/periodic_function.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/register_types.h"
+#include "tensorflow/core/framework/resource_mgr.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/framework/tensor_util.h"
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/kernels/concat_lib.h"
+#include "tensorflow/core/kernels/ops_util.h"
+#include "tensorflow/core/kernels/split_lib.h"
+#include "tensorflow/core/lib/random/random.h"
+#include "tensorflow/core/platform/macros.h"
+
+namespace tensorflow {
+
+typedef Eigen::ThreadPoolDevice CPUDevice;
+typedef Eigen::GpuDevice GPUDevice;
+#ifdef TENSORFLOW_USE_SYCL
+typedef Eigen::SyclDevice SYCLDevice;
+#endif  // TENSORFLOW_USE_SYCL
+
+// Concatenates 'inputs' into a single tensor along the zeroth dimension.
+// Requires that all elements of 'inputs' have element type T. Writes to the
+// op's output at position 'output_index', using 'context' for the allocation to
+// ensure proper device placement.
+template <typename T>
+Status Concat(OpKernelContext* context, const gtl::ArraySlice<Tensor>& inputs,
+              int output_index) {
+  const int input_dims = inputs[0].dims();
+  const TensorShape& input_shape = inputs[0].shape();
+
+  // Note that we reduce the concat of k-dimensional tensors into a two
+  // dimensional concat. Assuming the dimensions of any input tensor are
+  // {y0, y1,...,ym-1}, we flatten it to {1, y}, where y = Prod_i(yi).
+  std::vector<std::unique_ptr<typename TTypes<T, 2>::ConstMatrix>> inputs_flat;
+  inputs_flat.reserve(inputs.size());
+  int64 output_dim0 = 0;
+  for (int i = 0; i < inputs.size(); ++i) {
+    const Tensor& input = inputs[i];
+    if (input.dims() != input_dims) {
+      return errors::InvalidArgument(
+          "Ranks of all input tensors should match: shape[0] = ",
+          input_shape.DebugString(), " vs. shape[", i,
+          "] = ", input.shape().DebugString());
+    }
+    for (int j = 1; j < input_dims; ++j) {
+      if (input.dim_size(j) != input_shape.dim_size(j)) {
+        return errors::InvalidArgument(
+            "Dimensions of inputs should match: shape[0] = ",
+            input_shape.DebugString(), " vs. shape[", i,
+            "] = ", input.shape().DebugString());
+      }
+    }
+    if (input.NumElements() > 0) {
+      inputs_flat.emplace_back(new typename TTypes<T, 2>::ConstMatrix(
+          input.shaped<T, 2>({1, input.NumElements()})));
+    }
+    output_dim0 += input.dim_size(0);
+  }
+
+  TensorShape output_shape(input_shape);
+  output_shape.set_dim(0, output_dim0);
+  Tensor* output = nullptr;
+  TF_RETURN_IF_ERROR(
+      context->allocate_output(output_index, output_shape, &output));
+  if (output->NumElements() > 0) {
+    auto output_flat = output->shaped<T, 2>({1, output->NumElements()});
+#if GOOGLE_CUDA
+    if (std::is_same<Device, GPUDevice>::value) {
+      ConcatGPU<T>(context, inputs_flat, output, &output_flat);
+      return Status::OK();
+    }
+#endif  // GOOGLE_CUDA
+    ConcatCPU<T>(context->device(), inputs_flat, &output_flat);
+  }
+
+  return Status::OK();
+}
+
+// The Split*() functions split 'input' with element type T into 'sizes.size()'
+// tensors along the zeroth dimension, with the ith split having zeroth-
+// dimension size 'sizes[i]'. They allocate the output tensors using 'context',
+// for proper device placement.
+
+// Handles special cases that are cheap. Sets 'done==true' iff it found an
+// applicable special case and wrote to the outputs. Otherwise acts as a no-op.
+template <typename T>
+Status SplitEasyCases(OpKernelContext* context, const Tensor& input,
+                      const gtl::ArraySlice<int64>& sizes,
+                      std::vector<Tensor>* outputs, bool* done) {
+  *done = false;
+
+  int64 total_size = 0;
+  for (const int64 size : sizes) {
+    total_size += size;
+  }
+  if (total_size > input.shape().dim_size(0)) {
+    return errors::InvalidArgument(
+        "Sum of split sizes must not exceed dim0-size of input tensor");
+  }
+
+  // Special case 0: trivial 1-way split.
+  if (sizes.size() == 1 && sizes.at(0) == input.shape().dim_size(0)) {
+    outputs->push_back(input);
+    *done = true;
+    return Status::OK();
+  }
+
+  // Special case 1: input is aligned.
+  if (IsInnerDimsSizeAligned<T>(input.shape())) {
+    int64 position = 0;
+    for (const int64 size : sizes) {
+      outputs->emplace_back(input.Slice(position, position + size));
+      position += size;
+    }
+    *done = true;
+    return Status::OK();
+  }
+
+  return Status::OK();
+}
+
+// Handles the general case, on CPU.
+template <typename T>
+Status SplitCPU(OpKernelContext* context, const Tensor& input,
+                const gtl::ArraySlice<int64>& sizes,
+                std::vector<Tensor>* outputs) {
+  int64 suffix_dim_size = 1;
+  for (int i = 1; i < input.shape().dims(); ++i) {
+    suffix_dim_size *= input.shape().dim_size(i);
+  }
+  auto input_reshaped =
+      input.shaped<T, 3>({1, input.shape().dim_size(0), suffix_dim_size});
+
+  int64 position = 0;
+  for (const int64 size : sizes) {
+    TensorShape output_shape = input.shape();
+    output_shape.set_dim(0, size);
+    Tensor output;
+    TF_RETURN_IF_ERROR(
+        context->allocate_temp(input.dtype(), output_shape, &output));
+    auto output_shaped = output.shaped<T, 3>({1, size, suffix_dim_size});
+
+    Eigen::DSizes<Eigen::DenseIndex, 3> slice_indices{0, position, 0};
+    Eigen::DSizes<Eigen::DenseIndex, 3> slice_sizes{1, size, suffix_dim_size};
+    functor::Split<CPUDevice, T>()(context->eigen_device<CPUDevice>(),
+                                   output_shaped, input_reshaped, slice_indices,
+                                   slice_sizes);
+
+    outputs->emplace_back(output);
+
+    position += size;
+  }
+
+  return Status::OK();
+}
+
+#if GOOGLE_CUDA
+
+// Handles the general case, on GPU.
+template <typename T>
+Status SplitGPU(OpKernelContext* context, const Tensor& input,
+                const gtl::ArraySlice<int64>& sizes,
+                std::vector<Tensor>* outputs) {
+  // TODO(olston, apassos): Implement this.
+  LOG(FATAL) << "Not yet implemented";  // Crash ok
+}
+
+#endif  // GOOGLE_CUDA
+
+// The outer function that dispatches to the various Split*() functions above.
+template <typename T>
+Status Split(OpKernelContext* context, const Tensor& input,
+             const gtl::ArraySlice<int64>& sizes,
+             std::vector<Tensor>* outputs) {
+  bool easy_cases_done;
+  TF_RETURN_IF_ERROR(
+      SplitEasyCases<T>(context, input, sizes, outputs, &easy_cases_done));
+  if (easy_cases_done) {
+    return Status::OK();
+  }
+
+#if GOOGLE_CUDA
+// TODO(olston, apassos): Handle non-CPU cases.
+// return SplitGPU<T>(context, input, sizes, outputs);
+#endif  // GOOGLE_CUDA
+  return SplitCPU<T>(context, input, sizes, outputs);
+}
+
+// A class encapsulating the state and logic for batching tensors.
+class BatchResource : public ResourceBase {
+ public:
+  static Status Create(int32 num_batch_threads, int32 max_batch_size,
+                       int32 batch_timeout_micros,
+                       const std::vector<int32>& allowed_batch_sizes,
+                       std::unique_ptr<BatchResource>* resource) {
+    std::unique_ptr<BatchResource> new_resource(new BatchResource);
+
+    Batcher::Options batcher_options;
+    batcher_options.num_batch_threads = num_batch_threads;
+    TF_RETURN_IF_ERROR(
+        Batcher::Create(batcher_options, &new_resource->batcher_));
+
+    new_resource->batcher_queue_options_.max_batch_size = max_batch_size;
+    new_resource->batcher_queue_options_.batch_timeout_micros =
+        batch_timeout_micros;
+
+    new_resource->allowed_batch_sizes_ = allowed_batch_sizes;
+
+    *resource = std::move(new_resource);
+    return Status::OK();
+  }
+
+  string DebugString() final { return "BatchResource"; }
+
+  // Ingests data from one invocation of the batch op. The data is enqueued to
+  // be combined with others into a batch, asynchronously.
+  Status RegisterInput(int64 guid, OpKernelContext* context,
+                       const string& batcher_queue_name,
+                       AsyncOpKernel::DoneCallback done_callback) {
+    std::unique_ptr<BatchTask> batch_components(new BatchTask);
+    batch_components->guid = guid;
+    OpInputList tensors;
+    TF_RETURN_IF_ERROR(context->input_list("in_tensors", &tensors));
+    for (int i = 0; i < tensors.size(); ++i) {
+      const Tensor& tensor = tensors[i];
+      if (tensor.shape().dims() == 0) {
+        return errors::InvalidArgument(
+            "Batching input tensors must have at least one dimension");
+      }
+      if (tensors.size() >= 2 &&
+          tensor.shape().dim_size(0) != tensors[0].shape().dim_size(0)) {
+        return errors::InvalidArgument(
+            "Batching input tensors supplied in a given op invocation must "
+            "have equal 0th-dimension size");
+      }
+      batch_components->inputs.push_back(tensor);
+    }
+    batch_components->context = context;
+    batch_components->done_callback = std::move(done_callback);
+
+    BatcherQueue* batcher_queue;
+    TF_RETURN_IF_ERROR(
+        LookupOrCreateBatcherQueue(batcher_queue_name, &batcher_queue));
+    return batcher_queue->Schedule(&batch_components);
+  }
+
+ private:
+  BatchResource() = default;
+
+  // One input to be batched. Corresponds to one invocation of the batch op.
+  struct BatchTask : public serving::BatchTask {
+    // A unique ID to identify this invocation of Batch.
+    int64 guid;
+
+    std::vector<Tensor> inputs;
+    OpKernelContext* context;
+    AsyncOpKernel::DoneCallback done_callback;
+
+    size_t size() const override { return inputs[0].shape().dim_size(0); }
+  };
+
+  using Batcher = serving::SharedBatchScheduler<BatchTask>;
+  using BatcherQueue = serving::BatchScheduler<BatchTask>;
+  using Batch = serving::Batch<BatchTask>;
+
+  // Validates that it's legal to combine the tasks in 'batch' into a batch.
+  // Assumes the batch is non-empty.
+  static Status ValidateBatch(const Batch& batch) {
+    for (int task_idx = 0; task_idx < batch.num_tasks(); ++task_idx) {
+      const BatchTask& task = batch.task(task_idx);
+
+      if (task.inputs.size() != batch.task(0).inputs.size()) {
+        return errors::InvalidArgument(
+            "Batching inputs must have equal number of edges");
+      }
+    }
+
+    return Status::OK();
+  }
+
+  // Returns the smallest entry in 'allowed_batch_sizes_' that is greater than
+  // or equal to 'batch_size'. If 'allowed_batch_sizes_' is empty, simply
+  // returns 'batch_size'.
+  int RoundToLowestAllowedBatchSize(int batch_size) const {
+    if (allowed_batch_sizes_.empty()) {
+      return batch_size;
+    }
+    for (int allowed_size : allowed_batch_sizes_) {
+      if (allowed_size >= batch_size) {
+        return allowed_size;
+      }
+    }
+    LOG(ERROR) << "Maximum batch size greater than largest allowed size; "
+                  "ignoring allowed sizes constraint";
+    return batch_size;
+  }
+
+  // Processes a batch of one or more BatchTask entries.
+  void ProcessBatch(std::unique_ptr<Batch> batch) const {
+    if (batch->empty()) {
+      return;
+    }
+    const int padded_batch_size = RoundToLowestAllowedBatchSize(batch->size());
+    const int padding_amount = padded_batch_size - batch->size();
+
+    OpKernelContext* last_task_context =
+        batch->task(batch->num_tasks() - 1).context;
+    AsyncOpKernel::DoneCallback last_task_callback =
+        batch->task(batch->num_tasks() - 1).done_callback;
+
+    OP_REQUIRES_OK_ASYNC(last_task_context, ValidateBatch(*batch),
+                         last_task_callback);
+
+    // All tasks should have the same number of input edges.
+    const int num_input_edges = batch->task(0).inputs.size();
+
+    // Process each input edge one at a time (the typical case has just one).
+    for (int i = 0; i < num_input_edges; ++i) {
+      // Emit batch->num_tasks() - 1 empty output tensors.
+      for (int task_idx = 0; task_idx < batch->num_tasks() - 1; ++task_idx) {
+        const BatchTask& task = batch->task(task_idx);
+        TensorShape output_shape(task.inputs.at(i).shape());
+        output_shape.set_dim(0, 0);
+        Tensor* output = nullptr;
+        OP_REQUIRES_OK_ASYNC(
+            task.context,
+            task.context->allocate_output(i, output_shape, &output),
+            task.done_callback);
+      }
+
+      // Concatenate the tasks ith input tensors into a big output tensor.
+      std::vector<Tensor> to_concatenate;
+      for (int task_idx = 0; task_idx < batch->num_tasks(); ++task_idx) {
+        to_concatenate.push_back(batch->task(task_idx).inputs.at(i));
+      }
+
+      // Add padding as needed. Use the first row of the first task's tensor as
+      // the data for padding.
+      if (padding_amount > 0) {
+        const Tensor& padding_source = batch->task(0).inputs.at(i);
+        Tensor padding;
+        if (padding_source.shape().dim_size(0) == 1) {
+          padding = padding_source;
+        } else {
+          const std::vector<int64> slice_sizes = {1};
+          const DataType type = padding_source.dtype();
+          Status slice_status;
+          std::vector<Tensor> slices;
+          switch (type) {
+#define CASE(type)                                                   \
+  case DataTypeToEnum<type>::value:                                  \
+    slice_status = SplitCPU<type>(last_task_context, padding_source, \
+                                  slice_sizes, &slices);             \
+    break;
+            TF_CALL_ALL_TYPES(CASE);
+#undef CASE
+            default:
+              slice_status =
+                  errors::InvalidArgument("Unsupported data type: ", type);
+              break;
+          }
+          OP_REQUIRES_OK_ASYNC(last_task_context, slice_status,
+                               last_task_callback);
+          padding = slices.at(0);
+        }
+        for (int i = 0; i < padding_amount; ++i) {
+          to_concatenate.push_back(padding);
+        }
+      }
+
+      const DataType type = to_concatenate[0].dtype();
+      Status concat_status;
+      switch (type) {
+#define CASE(type)                                                      \
+  case DataTypeToEnum<type>::value:                                     \
+    concat_status = Concat<type>(last_task_context, to_concatenate, i); \
+    break;
+        TF_CALL_ALL_TYPES(CASE);
+#undef CASE
+        default:
+          concat_status =
+              errors::InvalidArgument("Unsupported data type: ", type);
+          break;
+      }
+      OP_REQUIRES_OK_ASYNC(last_task_context, concat_status,
+                           last_task_callback);
+    }
+
+    // Emit batch->num_tasks() - 1 empty index tensors.
+    for (int task_idx = 0; task_idx < batch->num_tasks() - 1; ++task_idx) {
+      const BatchTask& task = batch->task(task_idx);
+      TensorShape index_shape({0, 3});
+      Tensor* output = nullptr;
+      OP_REQUIRES_OK_ASYNC(
+          task.context,
+          task.context->allocate_output(num_input_edges, index_shape, &output),
+          task.done_callback);
+    }
+    // Emit all ID tensors.
+    for (int task_idx = 0; task_idx < batch->num_tasks(); ++task_idx) {
+      const BatchTask& task = batch->task(task_idx);
+      Tensor* id;
+      OP_REQUIRES_OK_ASYNC(task.context,
+                           task.context->allocate_output(num_input_edges + 1,
+                                                         TensorShape({}), &id),
+                           task.done_callback);
+      id->scalar<int64>()() = task.guid;
+    }
+    OP_REQUIRES_OK_ASYNC(
+        last_task_context,
+        EmitIndexTensor(last_task_context, *batch, num_input_edges),
+        last_task_callback);
+
+    // Signal done for each element of the batch. (At this point, the contexts
+    // are no longer guaranteed to remain live.)
+    for (int task_idx = 0; task_idx < batch->num_tasks(); ++task_idx) {
+      batch->mutable_task(task_idx)->done_callback();
+    }
+  }
+
+  // Emits an index tensor, which the Unbatch op will use to un-concatenate
+  // the tensor and attribute the pieces to the right batch keys. The index
+  // tensor contains, for each input: [batch_key, start_offset, end_offset]
+  // where start_offset and end_offset represent the range of entries in the
+  // concatenated tensors that belong to that input.
+  //
+  // Emits the result to the output at 'output_index' using 'context'.
+  static Status EmitIndexTensor(OpKernelContext* context, const Batch& batch,
+                                int output_index) {
+    const TensorShape index_shape({batch.num_tasks(), 3});
+    Tensor* index = nullptr;
+    TF_RETURN_IF_ERROR(
+        context->allocate_output(output_index, index_shape, &index));
+    auto index_flat = index->shaped<int64, 2>({batch.num_tasks(), 3});
+    size_t offset = 0;
+    for (int task_idx = 0; task_idx < batch.num_tasks(); ++task_idx) {
+      const BatchTask& task = batch.task(task_idx);
+      index_flat(task_idx, 0) = task.guid;
+      index_flat(task_idx, 1) = offset;
+      index_flat(task_idx, 2) = offset + task.size();
+      offset += task.size();
+    }
+    return Status::OK();
+  }
+
+  // Looks up the batcher queue for 'queue_name'. If it did't previously exist,
+  // creates it.
+  Status LookupOrCreateBatcherQueue(const string& queue_name,
+                                    BatcherQueue** queue) {
+    mutex_lock l(batcher_queues_mu_);
+
+    auto it = batcher_queues_.find(queue_name);
+    if (it != batcher_queues_.end()) {
+      *queue = it->second.get();
+      return Status::OK();
+    }
+
+    std::unique_ptr<BatcherQueue> new_queue;
+    auto process_batch_callback = [this](std::unique_ptr<Batch> batch) {
+      ProcessBatch(std::move(batch));
+    };
+    TF_RETURN_IF_ERROR(batcher_->AddQueue(batcher_queue_options_,
+                                          process_batch_callback, &new_queue));
+    *queue = new_queue.get();
+    batcher_queues_[queue_name] = std::move(new_queue);
+    return Status::OK();
+  }
+
+  // A batch scheduler, and options for creating queues.
+  std::shared_ptr<Batcher> batcher_;
+  Batcher::QueueOptions batcher_queue_options_;
+
+  // A collection of batcher queues, keyed on queue name.
+  // TODO(olston): Garbage-collect unused queues (perhaps simply remove empty
+  // ones (with a time delay?); it's okay if they get recreated later).
+  mutable mutex batcher_queues_mu_;
+  std::map<string, std::unique_ptr<BatcherQueue>> batcher_queues_
+      GUARDED_BY(batcher_queues_mu_);
+
+  std::vector<int32> allowed_batch_sizes_;
+};
+
+class BatchKernel : public AsyncOpKernel {
+ public:
+  explicit BatchKernel(OpKernelConstruction* c) : AsyncOpKernel(c) {
+    OP_REQUIRES_OK(c, c->GetAttr("container", &container_));
+    OP_REQUIRES_OK(c, c->GetAttr("shared_name", &shared_name_));
+    // If shared_name is not supplied, use name instead (prevent collisions by
+    // default).
+    if (shared_name_.empty()) {
+      shared_name_ = name();
+    }
+    OP_REQUIRES_OK(c, c->GetAttr("batching_queue", &batcher_queue_));
+    OP_REQUIRES_OK(c, c->GetAttr("num_batch_threads", &num_batch_threads_));
+    OP_REQUIRES_OK(c, c->GetAttr("max_batch_size", &max_batch_size_));
+    OP_REQUIRES_OK(c,
+                   c->GetAttr("batch_timeout_micros", &batch_timeout_micros_));
+    OP_REQUIRES_OK(c, c->GetAttr("allowed_batch_sizes", &allowed_batch_sizes_));
+    OP_REQUIRES_OK(c, ValidateAllowedBatchSizes());
+  }
+
+  void ComputeAsync(OpKernelContext* c, DoneCallback done) final {
+    BatchResource* br;
+    std::function<Status(BatchResource * *r)> creator =
+        [this](BatchResource** r) {
+          std::unique_ptr<BatchResource> new_resource;
+          TF_RETURN_IF_ERROR(BatchResource::Create(
+              num_batch_threads_, max_batch_size_, batch_timeout_micros_,
+              allowed_batch_sizes_, &new_resource));
+          *r = new_resource.release();
+          return Status::OK();
+        };
+    OP_REQUIRES_OK_ASYNC(c,
+                         c->resource_manager()->LookupOrCreate(
+                             container_, shared_name_, &br, creator),
+                         done);
+    const Status status =
+        br->RegisterInput(random::New64(), c, batcher_queue_, done);
+    br->Unref();
+    if (!status.ok()) {
+      OP_REQUIRES_OK_ASYNC(c, status, done);
+    }
+    // Assume br calls done, so nothing to do here.
+  }
+
+  // Validates 'allowed_batch_sizes_'. The entries must increase monotonically,
+  // and the last one must equal 'max_batch_size_'.
+  Status ValidateAllowedBatchSizes() const {
+    if (allowed_batch_sizes_.empty()) {
+      return Status::OK();
+    }
+    int32 last_size = 0;
+    for (int i = 0; i < allowed_batch_sizes_.size(); ++i) {
+      const int32 size = allowed_batch_sizes_.at(i);
+      if (i > 0 && size <= last_size) {
+        return errors::InvalidArgument(
+            "allowed_batch_sizes entries must be monotonically increasing");
+      }
+      if (i == allowed_batch_sizes_.size() - 1 && size != max_batch_size_) {
+        return errors::InvalidArgument(
+            "final entry in allowed_batch_sizes must equal max_batch_size");
+      }
+      last_size = size;
+    }
+    return Status::OK();
+  }
+
+ private:
+  string container_;
+  string shared_name_;
+  string batcher_queue_;
+  int32 num_batch_threads_;
+  int32 max_batch_size_;
+  int32 batch_timeout_micros_;
+  std::vector<int32> allowed_batch_sizes_;
+};
+
+REGISTER_KERNEL_BUILDER(Name("Batch").Device(DEVICE_CPU), BatchKernel);
+
+// A class encapsulating the state and logic for unbatching tensors.
+//
+// UnbatchResource keeps two data structures indexed by batch-key: one which has
+// the continuations for all concurrent kernels which are waiting for tensors
+// and another which has tensors which are waiting for their corresponding
+// kernels to run. Whenever a kernel runs, we either grab its tensor if it's
+// waiting already, or we insert it in the queue and then look at its tensor to
+// see if it can be used to dispatch any stored continuations.
+class UnbatchResource : public ResourceBase {
+ public:
+  explicit UnbatchResource(int32 timeout_micros)
+      : timeout_micros_(timeout_micros),
+        timeout_enforcer_(new serving::PeriodicFunction(
+            [this] { EnforceTimeout(); }, 1000 /* 1 ms */)) {}
+
+  ~UnbatchResource() override {
+    // Tear down 'timeout_enforcer_' first, since it accesses other state in
+    // this class.
+    timeout_enforcer_ = nullptr;
+  }
+
+  string DebugString() final { return "UnbatchResource"; }
+
+  Status Compute(OpKernelContext* context, AsyncOpKernel::DoneCallback done) {
+    const Tensor& data_t = context->input(0);
+    const Tensor& batch_index_t = context->input(1);
+
+    if (batch_index_t.shape().dim_size(0) > data_t.shape().dim_size(0)) {
+      return errors::InvalidArgument(
+          "Wrong shape for index tensor. Expected 0th dimension size to be no "
+          "greater than ",
+          data_t.shape().dim_size(0),
+          "; Got: ", batch_index_t.shape().dim_size(0), ".");
+    }
+    if (batch_index_t.shape().dim_size(1) != 3) {
+      return errors::InvalidArgument(
+          "Wrong shape for index tensor. Expected 1st dimension size to be 3 ; "
+          "Got: ",
+          batch_index_t.shape().dim_size(1), ".");
+    }
+
+    const int64 batch_key = context->input(2).scalar<int64>()();
+    const bool nonempty_input = batch_index_t.dim_size(0) > 0;
+
+    // If we have a non-empty tensor, slice it up.
+    // (It is important to do this outside of the critical section below.)
+    // The following variables are populated iff 'nonempty_input==true'.
+    std::vector<int64> sizes;
+    std::vector<int64> batch_keys;
+    std::vector<Tensor> split_inputs;
+    if (nonempty_input) {
+      auto batch_indices =
+          batch_index_t.shaped<int64, 2>({batch_index_t.dim_size(0), 3});
+      for (int i = 0; i < batch_index_t.dim_size(0); ++i) {
+        sizes.push_back(batch_indices(i, 2) - batch_indices(i, 1));
+        batch_keys.push_back(batch_indices(i, 0));
+      }
+
+      const DataType type = data_t.dtype();
+      switch (type) {
+#define CASE(type)                                                          \
+  case DataTypeToEnum<type>::value:                                         \
+    TF_RETURN_IF_ERROR(Split<type>(context, data_t, sizes, &split_inputs)); \
+    break;
+        TF_CALL_ALL_TYPES(CASE);
+#undef CASE
+        default:
+          return errors::InvalidArgument("Unsupported data type: ", type);
+      }
+    }
+
+    // Critical section.
+    std::vector<AsyncOpKernel::DoneCallback> done_callbacks_to_call;
+    Status status = [&]() -> Status {
+      mutex_lock ml(mu_);
+
+      // Check to see whether the tensor we want is already ready.
+      auto tensor_it = waiting_tensors_.find(batch_key);
+      if (tensor_it != waiting_tensors_.end()) {
+        context->set_output(0, tensor_it->second.tensor);
+        waiting_tensors_.erase(tensor_it);
+        done_callbacks_to_call.push_back(done);
+        return Status::OK();
+      }
+
+      const uint64 deadline_micros =
+          Env::Default()->NowMicros() + timeout_micros_;
+
+      // Add ourselves to the waitlist for tensors.
+      if (!waiting_callbacks_
+               .emplace(batch_key,
+                        WaitingCallback{deadline_micros, context, done})
+               .second) {
+        return errors::AlreadyExists(
+            "Multiple session runs with the same batch key.");
+      }
+
+      // If we have a non-empty tensor, finish the waitlisted runs,
+      // and store any remaining pieces.
+      if (nonempty_input) {
+        for (int i = 0; i < batch_keys.size(); ++i) {
+          auto runs_it = waiting_callbacks_.find(batch_keys[i]);
+          if (runs_it != waiting_callbacks_.end()) {
+            runs_it->second.context->set_output(0, split_inputs[i]);
+            done_callbacks_to_call.push_back(runs_it->second.done);
+            waiting_callbacks_.erase(runs_it);
+          } else {
+            // Note: the deadline here is in case we are arriving late and the
+            // kernel that should rendezvous with this tensor has already waited
+            // and timed out.
+            if (!waiting_tensors_
+                     .emplace(batch_keys[i],
+                              WaitingTensor{deadline_micros, split_inputs[i]})
+                     .second) {
+              return errors::AlreadyExists(
+                  "Multiple tensors returned for same batch key.");
+            }
+          }
+        }
+      }
+
+      return Status::OK();
+    }();
+
+    for (const AsyncOpKernel::DoneCallback& done_callback :
+         done_callbacks_to_call) {
+      done_callback();
+    }
+
+    return status;
+  }
+
+ private:
+  // Evicts waiting tensors and callbacks that have exceeded their deadline.
+  void EnforceTimeout() {
+    const uint64 now = Env::Default()->NowMicros();
+    std::vector<WaitingCallback> evicted_callbacks;
+
+    {
+      mutex_lock ml(mu_);
+
+      for (auto it = waiting_tensors_.begin(); it != waiting_tensors_.end();) {
+        const WaitingTensor& waiting_tensor = it->second;
+        if (waiting_tensor.deadline_micros < now) {
+          it = waiting_tensors_.erase(it);
+        } else {
+          ++it;
+        }
+      }
+
+      for (auto it = waiting_callbacks_.begin();
+           it != waiting_callbacks_.end();) {
+        const WaitingCallback& waiting_callback = it->second;
+        if (waiting_callback.deadline_micros < now) {
+          evicted_callbacks.push_back(waiting_callback);
+          it = waiting_callbacks_.erase(it);
+        } else {
+          ++it;
+        }
+      }
+    }
+
+    for (const WaitingCallback& evicted_callback : evicted_callbacks) {
+      evicted_callback.context->CtxFailureWithWarning(errors::DeadlineExceeded(
+          "Batched data did not arrive within timeout window."));
+      evicted_callback.done();
+    }
+  }
+
+  struct WaitingTensor {
+    uint64 deadline_micros;
+    Tensor tensor;
+  };
+
+  struct WaitingCallback {
+    uint64 deadline_micros;
+    OpKernelContext* context;
+    AsyncOpKernel::DoneCallback done;
+  };
+
+  const int32 timeout_micros_;
+
+  mutex mu_;
+
+  // Maps keyed by BatchKey of tensors waiting for callbacks and callbacks
+  // waiting for tensors.
+  std::unordered_map<int64, WaitingTensor> waiting_tensors_ GUARDED_BY(mu_);
+  std::unordered_map<int64, WaitingCallback> waiting_callbacks_ GUARDED_BY(mu_);
+
+  // A thread that evicts waiting tensors and callbacks that have exceeded their
+  // deadline.
+  std::unique_ptr<serving::PeriodicFunction> timeout_enforcer_;
+};
+
+class UnbatchKernel : public AsyncOpKernel {
+ public:
+  explicit UnbatchKernel(OpKernelConstruction* c) : AsyncOpKernel(c) {
+    OP_REQUIRES_OK(c, c->GetAttr("container", &container_));
+    OP_REQUIRES_OK(c, c->GetAttr("shared_name", &shared_name_));
+    // If shared_name is not supplied, use name instead (prevent collisions by
+    // default).
+    if (shared_name_.empty()) {
+      shared_name_ = name();
+    }
+    OP_REQUIRES_OK(c, c->GetAttr("timeout_micros", &timeout_micros_));
+  }
+
+  void ComputeAsync(OpKernelContext* c, DoneCallback done) final {
+    UnbatchResource* ubr;
+    std::function<Status(UnbatchResource * *r)> creator =
+        [this](UnbatchResource** r) {
+          *r = new UnbatchResource(timeout_micros_);
+          return Status::OK();
+        };
+    OP_REQUIRES_OK_ASYNC(c,
+                         c->resource_manager()->LookupOrCreate(
+                             container_, shared_name_, &ubr, creator),
+                         done);
+    auto status = ubr->Compute(c, done);
+    ubr->Unref();
+    if (!status.ok()) {
+      OP_REQUIRES_OK_ASYNC(c, status, done);
+    }
+    // Assume ubr calls done, so nothing to do here.
+  }
+
+ private:
+  string container_;
+  string shared_name_;
+  int32 timeout_micros_;
+};
+REGISTER_KERNEL_BUILDER(Name("Unbatch").Device(DEVICE_CPU), UnbatchKernel);
+
+// A class encapsulating the state and logic for batching tensors
+// deterministically for the gradient of unbatch.
+class UnbatchGradResource : public ResourceBase {
+ public:
+  UnbatchGradResource() {}
+
+  string DebugString() final { return "UnbatchGradResource"; }
+
+  // Flushes the information for one batch, given its context and done
+  // callback. Clears all information about it from the available_tensors_.
+  Status OutputBatch(OpKernelContext* context,
+                     const AsyncOpKernel::DoneCallback& done)
+      EXCLUSIVE_LOCKS_REQUIRED(mu_) {
+    const Tensor& batch_index_t = context->input(1);
+    auto batch_index =
+        batch_index_t.shaped<int64, 2>({batch_index_t.dim_size(0), 3});
+    std::vector<Tensor> tensors;
+    for (int i = 0; i < batch_index_t.dim_size(0); ++i) {
+      auto available_it = available_tensors_.find(batch_index(i, 0));
+      if (available_it == available_tensors_.end()) {
+        return errors::Internal("bad bookkeeping of available tensors.");
+      }
+      tensors.push_back(available_it->second);
+      available_tensors_.erase(available_it);
+    }
+
+    const DataType type = tensors[0].dtype();
+    switch (type) {
+#define CASE(type)                                         \
+  case DataTypeToEnum<type>::value:                        \
+    TF_RETURN_IF_ERROR(Concat<type>(context, tensors, 0)); \
+    break;
+      TF_CALL_ALL_TYPES(CASE);
+#undef CASE
+      default:
+        return errors::InvalidArgument("Unsupported data type: ", type);
+    }
+    done();
+    return Status::OK();
+  }
+
+  // Ingests data from one invocation of the op.
+  Status Compute(OpKernelContext* context,
+                 const AsyncOpKernel::DoneCallback& done) {
+    const Tensor& data_t = context->input(0);
+    const Tensor& batch_index_t = context->input(1);
+    const Tensor& grad_t = context->input(2);
+
+    mutex_lock ml(mu_);
+
+    const int64 batch_key = context->input(3).scalar<int64>()();
+    // Mark our tensor as available.
+    if (!available_tensors_.emplace(batch_key, grad_t).second) {
+      return errors::InvalidArgument("Two runs with the same batch key.");
+    }
+
+    // Check whether we have a valid input tensor and, if so, create its
+    // dispatch logic.
+    if (data_t.NumElements() > 0) {
+      if (batch_index_t.NumElements() == 0) {
+        return errors::InvalidArgument(
+            "batch_index is empty while the tensor isn't.");
+      }
+      std::unordered_set<int64> missing_tensors;
+      const auto batch_index =
+          batch_index_t.shaped<int64, 2>({batch_index_t.dim_size(0), 3});
+      for (int i = 0; i < batch_index_t.dim_size(0); ++i) {
+        const int64 batch_key = batch_index(i, 0);
+        if (available_tensors_.find(batch_key) == available_tensors_.end()) {
+          missing_tensors.emplace(batch_key);
+        }
+      }
+      if (missing_tensors.empty()) {
+        return OutputBatch(context, done);
+      }
+      if (!available_batches_
+               .emplace(batch_key, Batch{missing_tensors, context, done})
+               .second) {
+        return errors::InvalidArgument(
+            "Batch key with valid batch used twice.");
+      }
+      for (const int64 i : missing_tensors) {
+        if (!desired_tensor_to_batch_map_.emplace(i, batch_key).second) {
+          return errors::InvalidArgument(
+              "Missing tensor wanted by more than one batch.");
+        }
+      }
+    } else {
+      // If we don't have a valid input tensor we can output an empty tensor and
+      // call our done closure.
+      TensorShape output_shape(grad_t.shape());
+      output_shape.set_dim(0, 0);
+      Tensor* output = nullptr;
+      TF_RETURN_IF_ERROR(context->allocate_output(0, output_shape, &output));
+      done();
+    }
+
+    // Search to see whether our tensor is desired by any existing batch.
+    auto desire_it = desired_tensor_to_batch_map_.find(batch_key);
+    if (desire_it != desired_tensor_to_batch_map_.end()) {
+      // Mark our tensor as no longer missing.
+      auto batch_it = available_batches_.find(desire_it->second);
+      desired_tensor_to_batch_map_.erase(desire_it);
+      if (batch_it == available_batches_.end()) {
+        return errors::InvalidArgument("Batch no longer exists.");
+      }
+      batch_it->second.missing_tensors.erase(batch_key);
+      // If all tensors are available we should concatenate them and dispatch
+      // the batch.
+      if (batch_it->second.missing_tensors.empty()) {
+        TF_RETURN_IF_ERROR(
+            OutputBatch(batch_it->second.context, batch_it->second.done));
+        available_batches_.erase(batch_it);
+      }
+    }
+    return Status::OK();
+  }
+
+ private:
+  mutex mu_;
+
+  // Represents a still-incomplete batch of tensors. When all tensors become
+  // available they will be concatenated in the right order and sent through the
+  // context.
+  struct Batch {
+    // Batch keys for tensors which are still missing from this batch. When this
+    // is empty the Tensors can be concatenated and forwarded.
+    std::unordered_set<int64> missing_tensors;
+
+    // Context and callback for the session responsible for finishing this
+    // batch.
+    OpKernelContext* context;
+    AsyncOpKernel::DoneCallback done;
+  };
+
+  // Map from batch key of the session which will output the batched gradients
+  // to still-incomplete batches.
+  std::unordered_map<int64, Batch> available_batches_;
+
+  // Map from batch key to tensors which are waiting for their batches to be
+  // available.
+  std::unordered_map<int64, Tensor> available_tensors_;
+
+  // Map from batch key of a tensor which is not yet available to the batch key
+  // of the batch to which it belongs.
+  std::unordered_map<int64, int64> desired_tensor_to_batch_map_;
+};
+
+class UnbatchGradKernel : public AsyncOpKernel {
+ public:
+  explicit UnbatchGradKernel(OpKernelConstruction* c) : AsyncOpKernel(c) {
+    OP_REQUIRES_OK(c, c->GetAttr("container", &container_));
+    OP_REQUIRES_OK(c, c->GetAttr("shared_name", &shared_name_));
+    // If shared_name is not supplied, use name instead (prevent collisions by
+    // default).
+    if (shared_name_.empty()) {
+      shared_name_ = name();
+    }
+  }
+
+  void ComputeAsync(OpKernelContext* c, DoneCallback done) final {
+    UnbatchGradResource* ubr;
+    std::function<Status(UnbatchGradResource * *r)> creator =
+        [this](UnbatchGradResource** r) {
+          *r = new UnbatchGradResource();
+          return Status::OK();
+        };
+    OP_REQUIRES_OK_ASYNC(c,
+                         c->resource_manager()->LookupOrCreate(
+                             container_, shared_name_, &ubr, creator),
+                         done);
+    Status status = ubr->Compute(c, done);
+    ubr->Unref();
+    if (!status.ok()) {
+      OP_REQUIRES_OK_ASYNC(c, status, done);
+    }
+    // Assume ubr calls done, so nothing to do here.
+  }
+
+ private:
+  string container_;
+  string shared_name_;
+};
+REGISTER_KERNEL_BUILDER(Name("UnbatchGrad").Device(DEVICE_CPU),
+                        UnbatchGradKernel);
+
+}  // namespace tensorflow
diff --git a/tensorflow/contrib/batching/ops/batch_ops.cc b/tensorflow/contrib/batching/ops/batch_ops.cc
new file mode 100644
index 0000000000000000000000000000000000000000..85e0ccba4aa372bdc21fb194263569b8b787bb6c
--- /dev/null
+++ b/tensorflow/contrib/batching/ops/batch_ops.cc
@@ -0,0 +1,164 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/framework/common_shape_fns.h"
+#include "tensorflow/core/framework/op.h"
+#include "tensorflow/core/framework/shape_inference.h"
+
+namespace tensorflow {
+
+REGISTER_OP("Batch")
+    .Input("in_tensors: T")
+    .Output("batched_tensors: T")
+    .Output("batch_index: int64")
+    .Output("id: int64")
+    .Attr("num_batch_threads: int")
+    .Attr("max_batch_size: int")
+    .Attr("batch_timeout_micros: int")
+    .Attr("allowed_batch_sizes: list(int) = []")
+    .Attr("grad_timeout_micros: int")
+    .Attr("container: string = ''")
+    .Attr("shared_name: string = ''")
+    .Attr("batching_queue: string = ''")
+    .Attr("T: list(type)")
+    .SetShapeFn([](shape_inference::InferenceContext* c) {
+      std::vector<shape_inference::ShapeHandle> in_shapes;
+      TF_RETURN_IF_ERROR(c->input("in_tensors", &in_shapes));
+      std::vector<shape_inference::ShapeHandle> out_shapes(in_shapes.size());
+      for (int i = 0; i < in_shapes.size(); ++i) {
+        TF_RETURN_IF_ERROR(
+            c->ReplaceDim(in_shapes[i], 0, c->UnknownDim(), &out_shapes[i]));
+      }
+      TF_RETURN_IF_ERROR(c->set_output("batched_tensors", out_shapes));
+      TF_RETURN_IF_ERROR(c->set_output("id", {c->Scalar()}));
+      TF_RETURN_IF_ERROR(c->set_output(
+          "batch_index",
+          {c->MakeShape({shape_inference::DimensionOrConstant(c->UnknownDim()),
+                         shape_inference::DimensionOrConstant(3)})}));
+      return Status::OK();
+    })
+    .Doc(R"doc(
+Batches all input tensors nondeterministically.
+
+When many instances of this Op are being run concurrently with the same
+container/shared_name in the same device, some will output zero-shaped Tensors
+and others will output Tensors of size up to max_batch_size.
+
+All Tensors in in_tensors are batched together (so, for example, labels and
+features should be batched with a single instance of this operation.
+
+Each invocation of batch emits an `id` scalar which will be used to identify
+this particular invocation when doing unbatch or its gradient.
+
+Each op which emits a non-empty batch will also emit a non-empty batch_index
+Tensor, which, is a [K, 3] matrix where each row contains the invocation's id,
+start, and length of elements of each set of Tensors present in batched_tensors.
+
+Batched tensors are concatenated along the first dimension, and all tensors in
+in_tensors must have the first dimension of the same size.
+
+in_tensors: The tensors to be batched.
+num_batch_threads: Number of scheduling threads for processing batches of work.
+ Determines the number of batches processed in parallel.
+max_batch_size: Batch sizes will never be bigger than this.
+batch_timeout_micros: Maximum number of microseconds to wait before outputting
+ an incomplete batch.
+allowed_batch_sizes: Optional list of allowed batch sizes. If left empty, does
+ nothing. Otherwise, supplies a list of batch sizes, causing the op to pad
+ batches up to one of those sizes. The entries must increase monotonically, and
+ the final entry must equal max_batch_size.
+grad_timeout_micros: The timeout to use for the gradient. See Unbatch.
+batched_tensors: Either empty tensors or a batch of concatenated Tensors.
+batch_index: If out_tensors is non-empty, has information to invert it.
+container: Controls the scope of sharing of this batch.
+id: always contains a scalar with a unique ID for this invocation of Batch.
+shared_name: Concurrently running instances of batch in the same device with the
+ same container and shared_name will batch their elements together. If left
+ empty, the op name will be used as the shared name.
+T: the types of tensors to be batched.
+)doc");
+
+REGISTER_OP("Unbatch")
+    .Input("batched_tensor: T")
+    .Input("batch_index: int64")
+    .Input("id: int64")
+    .Output("unbatched_tensor: T")
+    .Attr("timeout_micros: int")
+    .Attr("container: string = ''")
+    .Attr("shared_name: string = ''")
+    .Attr("T: type")
+    .SetShapeFn([](shape_inference::InferenceContext* c) {
+      shape_inference::ShapeHandle out_shape;
+      TF_RETURN_IF_ERROR(
+          c->ReplaceDim(c->input(0), 0, c->UnknownDim(), &out_shape));
+      c->set_output(0, out_shape);
+      return Status::OK();
+    })
+    .Doc(R"doc(
+Reverses the operation of Batch for a single output Tensor.
+
+An instance of Unbatch either receives an empty batched_tensor, in which case it
+asynchronously waits until the values become available from a concurrently
+running instance of Unbatch with the same container and shared_name, or receives
+a non-empty batched_tensor in which case it finalizes all other concurrently
+running instances and outputs its own element from the batch.
+
+batched_tensor: The possibly transformed output of Batch. The size of the first
+ dimension should remain unchanged by the transformations for the operation to
+ work.
+batch_index: The matching batch_index obtained from Batch.
+id: The id scalar emitted by Batch.
+unbatched_tensor: The Tensor corresponding to this execution.
+timeout_micros: Maximum amount of time (in microseconds) to wait to receive the
+ batched input tensor associated with a given invocation of the op.
+container: Container to control resource sharing.
+shared_name: Instances of Unbatch with the same container and shared_name are
+ assumed to possibly belong to the same batch. If left empty, the op name will
+ be used as the shared name.
+)doc");
+
+REGISTER_OP("UnbatchGrad")
+    .Input("original_input: T")
+    .Input("batch_index: int64")
+    .Input("grad: T")
+    .Input("id: int64")
+    .Output("batched_grad: T")
+    .Attr("container: string = ''")
+    .Attr("shared_name: string = ''")
+    .Attr("T: type")
+    .SetShapeFn([](shape_inference::InferenceContext* c) {
+      c->set_output(0, c->UnknownShapeOfRank(c->Rank(c->input(2))));
+      return Status::OK();
+    })
+    .Doc(R"doc(
+Gradient of Unbatch.
+
+Acts like Batch but using the given batch_index index of batching things as they
+become available. This ensures that the gradients are propagated back in the
+same session which did the forward pass.
+
+original_input: The input to the Unbatch operation this is the gradient of.
+batch_index: The batch_index given to the Unbatch operation this is the gradient
+of.
+grad: The downstream gradient.
+id: The id scalar emitted by Batch.
+batched_grad: The return value, either an empty tensor or the batched gradient.
+container: Container to control resource sharing.
+shared_name: Instances of UnbatchGrad with the same container and shared_name
+ are assumed to possibly belong to the same batch. If left empty, the op name
+ will be used as the shared name.
+  )doc");
+
+}  // namespace tensorflow
diff --git a/tensorflow/contrib/batching/python/ops/batch_ops.py b/tensorflow/contrib/batching/python/ops/batch_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..867ee6dfbc8ecad5f0a057ec8b9ac7a3656a23a8
--- /dev/null
+++ b/tensorflow/contrib/batching/python/ops/batch_ops.py
@@ -0,0 +1,138 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+"""Operations for automatic batching and unbatching."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.contrib.batching.ops import gen_batch_ops
+# go/tf-wildcard-import
+# pylint: disable=wildcard-import
+from tensorflow.contrib.batching.ops.gen_batch_ops import *
+# pylint: enable=wildcard-import
+from tensorflow.contrib.util import loader
+from tensorflow.python.framework import ops
+from tensorflow.python.platform import resource_loader
+
+
+_batch_ops = loader.load_op_library(
+    resource_loader.get_path_to_datafile("_batch_ops.so"))
+
+
+@ops.RegisterGradient("Batch")
+def _BatchGrad(op, *out_grads):  # pylint: disable=invalid-name
+  """Gradient for batch op."""
+  gradients = []
+  for i in range(len(op.inputs)):
+    gradients.append(
+        gen_batch_ops.unbatch(
+            out_grads[i],
+            op.outputs[-2],
+            op.outputs[-1],
+            timeout_micros=op.get_attr("grad_timeout_micros"),
+            shared_name="batch_gradient_{}_{}".format(op.name, i)))
+  return gradients
+
+
+@ops.RegisterGradient("Unbatch")
+def _UnbatchGrad(op, grad):   # pylint: disable=invalid-name
+  return [
+      gen_batch_ops.unbatch_grad(
+          op.inputs[0],
+          op.inputs[1],
+          grad,
+          op.inputs[2],
+          shared_name="unbatch_gradient_{}".format(op.name)), None, None
+  ]
+
+
+def batch_function(num_batch_threads, max_batch_size, batch_timeout_micros,
+                   allowed_batch_sizes=None,
+                   grad_timeout_micros=60 * 1000 * 1000,
+                   unbatch_timeout_micros=60 * 1000 * 1000):
+  """Batches the computation done by the decorated function.
+
+  So, for example, in the following code
+
+  ```
+  @batch_function(1, 2, 3)
+  def layer(a):
+    return tf.matmul(a, a)
+
+  b = layer(w)
+  ```
+
+  if more than one session.run call is simultaneously trying to compute `b`
+  the values of `w` will be gathered, non-deterministically concatenated
+  along the first axis, and only one thread will run the computation. See the
+  documentation of the `Batch` op for more details.
+
+  Assumes that all arguments of the decorated function are Tensors which will
+  be batched along their first dimension.
+
+  SparseTensor is not supported. The return value of the decorated function
+  must be a Tensor or a list/tuple of Tensors.
+
+  Args:
+    num_batch_threads: Number of scheduling threads for processing batches
+     of work. Determines the number of batches processed in parallel.
+    max_batch_size: Batch sizes will never be bigger than this.
+    batch_timeout_micros: Maximum number of microseconds to wait before
+     outputting an incomplete batch.
+    allowed_batch_sizes: Optional list of allowed batch sizes. If left empty,
+     does nothing. Otherwise, supplies a list of batch sizes, causing the op
+     to pad batches up to one of those sizes. The entries must increase
+     monotonically, and the final entry must equal max_batch_size.
+    grad_timeout_micros: The timeout to use for the gradient. See the
+     documentation of the unbatch op for more details. Defaults to 60s.
+    unbatch_timeout_micros: The timeout to use for unbatching. See the
+     documentation of the unbatch op for more details. Defaults to 60s.
+
+  Returns:
+    The decorated function will return the unbatched computation output Tensors.
+  """
+  def decorator(f):  # pylint: disable=missing-docstring
+    def decorated(*args):
+      with ops.name_scope("batch") as name:
+        for a in args:
+          if not isinstance(a, ops.Tensor):
+            raise ValueError("All arguments to functions decorated with "
+                             "`batch_function`  are supposed to be Tensors; "
+                             "found %s" % repr(a))
+        batched_tensors, batch_index, id_t = gen_batch_ops.batch(
+            args,
+            num_batch_threads=num_batch_threads,
+            max_batch_size=max_batch_size,
+            batch_timeout_micros=batch_timeout_micros,
+            allowed_batch_sizes=allowed_batch_sizes,
+            grad_timeout_micros=grad_timeout_micros,
+            shared_name=name)
+        outputs = f(*batched_tensors)
+        if isinstance(outputs, ops.Tensor):
+          outputs_list = [outputs]
+        else:
+          outputs_list = outputs
+        with ops.name_scope("unbatch") as unbatch_name:
+          unbatched = [
+              gen_batch_ops.unbatch(t, batch_index, id_t,
+                                    timeout_micros=unbatch_timeout_micros,
+                                    shared_name=unbatch_name)
+              for t in outputs_list]
+        if isinstance(outputs, ops.Tensor):
+          return unbatched[0]
+        return unbatched
+    return decorated
+  return decorator
diff --git a/tensorflow/contrib/batching/python/ops/batch_ops_test.py b/tensorflow/contrib/batching/python/ops/batch_ops_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..fac7aff29f79fa18fa5f7e596db8afedabaa8993
--- /dev/null
+++ b/tensorflow/contrib/batching/python/ops/batch_ops_test.py
@@ -0,0 +1,276 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+"""Tests for the currently experimental in-graph batch ops."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import threading
+import time
+
+from tensorflow.contrib.batching.python.ops import batch_ops
+from tensorflow.python.framework import dtypes
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import gradients_impl
+from tensorflow.python.ops import script_ops
+from tensorflow.python.platform import test
+
+
+def delayed_plus1(x):
+  """Sleeps for 100ms then returns x+1."""
+  time.sleep(0.1)
+  return x + 1
+
+
+class BatchOpsTest(test.TestCase):
+  """Tests for batch_ops.{un,}batch."""
+
+  def testBasicBatch(self):
+    """Tests that a single batched tensor executes together and only once."""
+    with self.test_session() as sess:
+      inp = array_ops.placeholder(dtype=dtypes.int32, shape=[1])
+      batched, index, _ = batch_ops.batch(
+          [inp], num_batch_threads=1, max_batch_size=2,
+          batch_timeout_micros=36000000, grad_timeout_micros=0,
+          batching_queue="")
+      thread_results = []
+
+      def worker():
+        thread_results.extend(
+            sess.run([batched, index], feed_dict={inp: [1]}))
+
+      worker_thread = threading.Thread(target=worker)
+      worker_thread.start()
+      main_results = sess.run([batched, index], feed_dict={inp: [2]})
+      worker_thread.join()
+
+      # At this point either the thread or the main did the batch and the other
+      # should have empty results.
+      if list(thread_results[0][0]):
+        batch_t = thread_results[0][0]
+        index_t = thread_results[1]
+        empty_b = main_results[0][0]
+        empty_m = main_results[1]
+      else:
+        batch_t = main_results[0][0]
+        index_t = main_results[1]
+        empty_b = thread_results[0][0]
+        empty_m = thread_results[1]
+
+      # Check that both the inputs made it out exactly once.
+      self.assertAllEqual(sorted(batch_t), (1, 2))
+      # Check that we get 2 rows in the index tensor.
+      self.assertEqual(len(index_t), 2)
+      # Check that the other ones are empty.
+      self.assertEqual(len(empty_b), 0)
+      self.assertEqual(len(empty_m), 0)
+
+  def testBatchWithPadding(self):
+    """Test that batching with padding up to an allowed batch size works."""
+    with self.test_session() as sess:
+      inp = array_ops.placeholder(dtype=dtypes.int32, shape=[2])
+      batched, index, _ = batch_ops.batch(
+          [inp], num_batch_threads=1, max_batch_size=10,
+          batch_timeout_micros=100000,  # 100ms
+          allowed_batch_sizes=[5, 10],
+          grad_timeout_micros=0, batching_queue="")
+      thread_results = []
+
+      def worker():
+        thread_results.extend(
+            sess.run([batched, index], feed_dict={inp: [1, 3]}))
+
+      worker_thread = threading.Thread(target=worker)
+      worker_thread.start()
+      main_results = sess.run([batched, index], feed_dict={inp: [2, 4]})
+      worker_thread.join()
+
+      # At this point either the thread or the main did the batch and the other
+      # should have empty results.
+      if list(thread_results[0][0]):
+        batch_t = thread_results[0][0]
+      else:
+        batch_t = main_results[0][0]
+
+      # Check that the batch tensor incorporates the padding.
+      self.assertEqual(len(batch_t), 5)
+
+  def testMultipleBatch(self):
+    """Tests that multiple batched tensors execute together."""
+    with self.test_session() as sess:
+      inp0 = array_ops.placeholder(dtype=dtypes.int32, shape=[1])
+      inp1 = array_ops.placeholder(dtype=dtypes.int32, shape=[1])
+      batched, _, _ = batch_ops.batch(
+          [inp0, inp1],
+          num_batch_threads=1,
+          max_batch_size=2,
+          batch_timeout_micros=36000000,
+          grad_timeout_micros=0,
+          batching_queue="")
+      thread_results = []
+
+      def worker():
+        thread_results.extend(
+            sess.run([batched], feed_dict={inp0: [1],
+                                           inp1: [2]}))
+
+      worker_thread = threading.Thread(target=worker)
+      worker_thread.start()
+      main_results = sess.run([batched], feed_dict={inp0: [2], inp1: [3]})
+      worker_thread.join()
+
+      # At this point either the thread or the main did the batch and the other
+      # should have empty results.
+      if list(thread_results[0][0]):
+        batch_t = thread_results[0]
+        empty_t = main_results[0]
+      else:
+        batch_t = main_results[0]
+        empty_t = thread_results[0]
+
+      # Assert that the tensors were batched together.
+      self.assertAllEqual(sorted(batch_t[0]), [1, 2])
+      self.assertAllEqual(sorted(batch_t[1]), [2, 3])
+      self.assertAllEqual(empty_t[0], [])
+      self.assertAllEqual(empty_t[1], [])
+
+  def testIllegalBatchDifferentDim0Sizes(self):
+    """Tests illegally feeding tensors with different dim0 sizes."""
+    with self.test_session() as sess:
+      inp0 = array_ops.placeholder(dtype=dtypes.int32, shape=[1])
+      inp1 = array_ops.placeholder(dtype=dtypes.int32, shape=[2])
+      batched, index, _ = batch_ops.batch(
+          [inp0, inp1], num_batch_threads=1, max_batch_size=2,
+          batch_timeout_micros=0, grad_timeout_micros=0, batching_queue="")
+      with self.assertRaises(Exception) as raised:
+        _ = sess.run([batched, index], feed_dict={inp0: [0], inp1: [1, 2]})
+      self.assertGreater(
+          raised.exception.message.find("must have equal 0th-dimension size"),
+          0)
+
+  def testBasicUnbatch(self):
+    """Tests that batch and unbatch work together."""
+    with self.test_session() as sess:
+      inp = array_ops.placeholder(dtype=dtypes.int32, shape=[1])
+      batched, index, id_t = batch_ops.batch(
+          [inp], num_batch_threads=1, max_batch_size=10,
+          batch_timeout_micros=100000,  # 100ms
+          allowed_batch_sizes=[3, 10],
+          grad_timeout_micros=0, batching_queue="")
+      computation = batched[0] + 1
+      result = batch_ops.unbatch(computation, index, id_t,
+                                 timeout_micros=1000000, shared_name="unbatch")
+      thread_results = []
+
+      def worker():
+        thread_results.extend(sess.run([result], feed_dict={inp: [1]}))
+
+      worker_thread = threading.Thread(target=worker)
+      worker_thread.start()
+      main_results = sess.run([result], feed_dict={inp: [2]})
+      worker_thread.join()
+      self.assertEqual(thread_results[0], [2])
+      self.assertEqual(main_results[0], [3])
+
+  def testBasicUnbatchDecorated(self):
+    """Tests that the batch_function decorator works."""
+    with self.test_session() as sess:
+      @batch_ops.batch_function(1, 10, 100000)
+      def computation(in_t):
+        return in_t + 1
+      inp = array_ops.placeholder(dtype=dtypes.int32, shape=[1])
+      result = computation(inp)
+      thread_results = []
+
+      def worker():
+        thread_results.extend(sess.run([result], feed_dict={inp: [1]}))
+
+      worker_thread = threading.Thread(target=worker)
+      worker_thread.start()
+      main_results = sess.run([result], feed_dict={inp: [2]})
+      worker_thread.join()
+      self.assertEqual(thread_results[0], [2])
+      self.assertEqual(main_results[0], [3])
+
+  def testUnbatchTimeout(self):
+    """Tests that the unbatch timeout works."""
+    with self.test_session() as sess:
+      inp = array_ops.placeholder(dtype=dtypes.int32, shape=[1])
+      batched, index, id_t = batch_ops.batch(
+          [inp], num_batch_threads=1, max_batch_size=2,
+          batch_timeout_micros=36000000, grad_timeout_micros=0,
+          batching_queue="")
+      computation = batched[0] + 1
+      timeout_micros = 10
+      result = batch_ops.unbatch(computation, index, id_t, timeout_micros,
+                                 shared_name="shared_unbatch")
+      # Set up a parallel pipeline that delays the computation, but uses the
+      # same unbatch resource object as the non-delayed pipeline.
+      computation_delayed = script_ops.py_func(delayed_plus1,
+                                               [batched[0]],
+                                               dtypes.int32)
+      result_delayed = batch_ops.unbatch(computation_delayed,
+                                         index,
+                                         id_t,
+                                         timeout_micros,
+                                         shared_name="shared_unbatch")
+
+      thread_results = []
+      def worker():
+        # A first call using the non-delayed pipeline. The batcher will send an
+        # empty tensor along the non-delayed pipeline.
+        thread_results.extend(sess.run([result], feed_dict={inp: [1]}))
+      worker_thread = threading.Thread(target=worker)
+      worker_thread.start()
+      time.sleep(0.1)  # Ensure the thread's call starts first.
+      # A second call using the delayed pipeline.  The batcher will send the
+      # batched tensor along the delayed pipeline, thus delaying the arrival of
+      # the batched tensor at the unbatch op, relative to the empty tensor.
+      #
+      # TODO(olston, apassos): Avoid relying on the order in which the batch op
+      # emits the empty tensor versus the batched one.
+      _ = sess.run([result_delayed], feed_dict={inp: [2]})
+      worker_thread.join()
+      # The thread's call should hit the timeout, and thus get 0 results.
+      self.assertEqual(len(thread_results), 0)
+
+  def testUnbatchGrad(self):
+    """Tests that batch and unbatch are differentiable."""
+    with self.test_session() as sess:
+      inp = array_ops.placeholder(dtype=dtypes.int32, shape=[1])
+      batched, index, id_t = batch_ops.batch(
+          [inp], num_batch_threads=1, max_batch_size=2,
+          batch_timeout_micros=36000000, grad_timeout_micros=1000000,
+          batching_queue="")
+      computation = batched[0] * batched[0]
+      result = batch_ops.unbatch(computation, index, id_t,
+                                 timeout_micros=1000000, shared_name="unbatch")
+      grad = gradients_impl.gradients(result, inp)
+      thread_results = []
+
+      def worker():
+        thread_results.extend(sess.run([grad], feed_dict={inp: [1]}))
+
+      worker_thread = threading.Thread(target=worker)
+      worker_thread.start()
+      main_results = sess.run([grad], feed_dict={inp: [2]})
+      worker_thread.join()
+      self.assertEqual(thread_results[0], [2])
+      self.assertEqual(main_results[0], [4])
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/contrib/batching/util/BUILD b/tensorflow/contrib/batching/util/BUILD
index fe1660ffd1ced7bd74249c43fac61ea65949094d..e69d6ecd8fa1eddb557472c317fce206f7c490aa 100644
--- a/tensorflow/contrib/batching/util/BUILD
+++ b/tensorflow/contrib/batching/util/BUILD
@@ -22,11 +22,21 @@ filegroup(
 )
 
 cc_library(
-    name = "periodic_function",
+    name = "periodic_function_dynamic",
     srcs = ["periodic_function.cc"],
     hdrs = ["periodic_function.h"],
     visibility = ["//visibility:public"],
     deps = [
+        "//tensorflow/core:framework_headers_lib",
+        "//tensorflow/core:protos_all_cc",
+    ],
+)
+
+cc_library(
+    name = "periodic_function",
+    visibility = ["//visibility:public"],
+    deps = [
+        ":periodic_function_dynamic",
         "//tensorflow/core:lib",
     ],
 )
diff --git a/tensorflow/contrib/bayesflow/python/kernel_tests/entropy_test.py b/tensorflow/contrib/bayesflow/python/kernel_tests/entropy_test.py
index d98d4e737c32dd1d172fc9fef92786b717924d60..6cdaa3187054daa278dc7342626b089f9655457b 100644
--- a/tensorflow/contrib/bayesflow/python/kernel_tests/entropy_test.py
+++ b/tensorflow/contrib/bayesflow/python/kernel_tests/entropy_test.py
@@ -95,7 +95,7 @@ class ElboRatioTest(test.TestCase):
           n=n_samples,
           form=entropy.ELBOForms.sample,
           seed=42)
-      actual_kl = distributions.kl(q, p)
+      actual_kl = distributions.kl_divergence(q, p)
 
       # Relative tolerance (rtol) chosen 2 times as large as minimim needed to
       # pass.
@@ -123,7 +123,7 @@ class ElboRatioTest(test.TestCase):
           n=n_samples,
           form=entropy.ELBOForms.analytic_entropy,
           seed=42)
-      actual_kl = distributions.kl(q, p)
+      actual_kl = distributions.kl_divergence(q, p)
 
       # Relative tolerance (rtol) chosen 2 times as large as minimim needed to
       # pass.
diff --git a/tensorflow/contrib/bayesflow/python/kernel_tests/stochastic_tensor_test.py b/tensorflow/contrib/bayesflow/python/kernel_tests/stochastic_tensor_test.py
index 859f0e4b17cf09a9168118a9c897e6cc33c4455a..6d0cff4678972719cb5c565bc409041e298beadb 100644
--- a/tensorflow/contrib/bayesflow/python/kernel_tests/stochastic_tensor_test.py
+++ b/tensorflow/contrib/bayesflow/python/kernel_tests/stochastic_tensor_test.py
@@ -22,11 +22,11 @@ import numpy as np
 
 from tensorflow.contrib.bayesflow.python.ops import stochastic_gradient_estimators
 from tensorflow.contrib.bayesflow.python.ops import stochastic_tensor_impl
-from tensorflow.contrib.distributions.python.ops import normal
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
+from tensorflow.python.ops.distributions import normal
 from tensorflow.python.platform import test
 
 sge = stochastic_gradient_estimators
diff --git a/tensorflow/contrib/bayesflow/python/kernel_tests/variational_inference_test.py b/tensorflow/contrib/bayesflow/python/kernel_tests/variational_inference_test.py
index a46d755897e61ec77bd0af1d94c8504d200c49e3..fff6b74b2efed27abd7b25cbe0e8e8b3904767e1 100644
--- a/tensorflow/contrib/bayesflow/python/kernel_tests/variational_inference_test.py
+++ b/tensorflow/contrib/bayesflow/python/kernel_tests/variational_inference_test.py
@@ -22,12 +22,12 @@ from tensorflow.contrib import distributions as distributions_lib
 from tensorflow.contrib import layers
 from tensorflow.contrib.bayesflow.python.ops import stochastic_tensor
 from tensorflow.contrib.bayesflow.python.ops import variational_inference_impl
-from tensorflow.contrib.distributions.python.ops import kullback_leibler
-from tensorflow.contrib.distributions.python.ops import normal
 from tensorflow.python.framework import constant_op
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import variables
+from tensorflow.python.ops.distributions import kullback_leibler
+from tensorflow.python.ops.distributions import normal
 from tensorflow.python.platform import test
 
 st = stochastic_tensor
@@ -68,7 +68,7 @@ class VariationalInferenceTest(test.TestCase):
   def testDefaultVariationalAndPrior(self):
     _, prior, variational, _, log_likelihood = mini_vae()
     elbo = vi.elbo(log_likelihood)
-    expected_elbo = log_likelihood - kullback_leibler.kl(
+    expected_elbo = log_likelihood - kullback_leibler.kl_divergence(
         variational.distribution, prior)
     with self.test_session() as sess:
       sess.run(variables.global_variables_initializer())
@@ -80,7 +80,7 @@ class VariationalInferenceTest(test.TestCase):
       prior = normal.Normal(loc=3., scale=2.)
       elbo = vi.elbo(
           log_likelihood, variational_with_prior={variational: prior})
-      expected_elbo = log_likelihood - kullback_leibler.kl(
+      expected_elbo = log_likelihood - kullback_leibler.kl_divergence(
           variational.distribution, prior)
       sess.run(variables.global_variables_initializer())
       self.assertAllEqual(*sess.run([expected_elbo, elbo]))
@@ -121,7 +121,7 @@ class VariationalInferenceTest(test.TestCase):
 
     # No analytic KL available between prior and variational distributions.
     with self.assertRaisesRegexp(NotImplementedError, "No KL"):
-      distributions.kl(variational.distribution, prior)
+      distributions.kl_divergence(variational.distribution, prior)
 
     elbo = vi.elbo(
         variational_with_prior={variational: prior},
diff --git a/tensorflow/contrib/bayesflow/python/ops/entropy_impl.py b/tensorflow/contrib/bayesflow/python/ops/entropy_impl.py
index ef9fb730258ac52ae6b36554939f3490421ce0c5..f155de5032be8fc4477e0c71ca634a32c0d922d1 100644
--- a/tensorflow/contrib/bayesflow/python/ops/entropy_impl.py
+++ b/tensorflow/contrib/bayesflow/python/ops/entropy_impl.py
@@ -84,8 +84,9 @@ def elbo_ratio(log_p,
   KL[q || p] = E[ Log[q(Z)] - Log[p(Z)] ]
   ```
 
-  Note that if `p` is a `Distribution`, then `distributions.kl(q, p)` may be
-  defined and available as an exact result.
+  Note that if `p` is a `Distribution`, then
+  `distributions.kl_divergence(q, p)` may be defined and available as an
+  exact result.
 
   #### ELBO
 
diff --git a/tensorflow/contrib/bayesflow/python/ops/stochastic_tensor_impl.py b/tensorflow/contrib/bayesflow/python/ops/stochastic_tensor_impl.py
index b810ad3093e6b8fc19496dab37c15da280f2fe62..ce5fdd98c69ca6b3482bfafa8859accdf8a78749 100644
--- a/tensorflow/contrib/bayesflow/python/ops/stochastic_tensor_impl.py
+++ b/tensorflow/contrib/bayesflow/python/ops/stochastic_tensor_impl.py
@@ -48,9 +48,9 @@ import threading
 import six
 
 from tensorflow.contrib.bayesflow.python.ops import stochastic_gradient_estimators as sge
-from tensorflow.contrib.distributions.python.ops import distribution
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
+from tensorflow.python.ops.distributions import distribution
 
 STOCHASTIC_TENSOR_COLLECTION = "_stochastic_tensor_collection_"
 
diff --git a/tensorflow/contrib/bayesflow/python/ops/variational_inference_impl.py b/tensorflow/contrib/bayesflow/python/ops/variational_inference_impl.py
index 6a8577234f2a47022cb2aa0fb2f44870c5c6f6db..8d932a7c340e21da012d4ab93883735b13e01175 100644
--- a/tensorflow/contrib/bayesflow/python/ops/variational_inference_impl.py
+++ b/tensorflow/contrib/bayesflow/python/ops/variational_inference_impl.py
@@ -28,10 +28,10 @@ from __future__ import print_function
 
 from tensorflow.contrib.bayesflow.python.ops import stochastic_graph_impl as sg
 from tensorflow.contrib.bayesflow.python.ops import stochastic_tensor_impl as st
-from tensorflow.contrib.distributions.python.ops import distribution
-from tensorflow.contrib.distributions.python.ops import kullback_leibler
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import math_ops
+from tensorflow.python.ops.distributions import distribution
+from tensorflow.python.ops.distributions import kullback_leibler
 from tensorflow.python.platform import tf_logging as logging
 
 VI_PRIORS = "__vi_priors__"
@@ -259,7 +259,7 @@ def _elbo(form, log_likelihood, log_joint, variational_with_prior,
     kl = None
     if log_joint is None and form in {ELBOForms.default, ELBOForms.analytic_kl}:
       try:
-        kl = kullback_leibler.kl(q, p)
+        kl = kullback_leibler.kl_divergence(q, p)
         logging.info("Using analytic KL between q:%s, p:%s", q, p)
       except NotImplementedError as e:
         if form == ELBOForms.analytic_kl:
diff --git a/tensorflow/contrib/boosted_trees/lib/quantiles/weighted_quantiles_stream.h b/tensorflow/contrib/boosted_trees/lib/quantiles/weighted_quantiles_stream.h
index daf0e480003c14d713e068fae0faf44b7b58fb87..fd577ad712f228fa8016a48942511a3263aae5da 100644
--- a/tensorflow/contrib/boosted_trees/lib/quantiles/weighted_quantiles_stream.h
+++ b/tensorflow/contrib/boosted_trees/lib/quantiles/weighted_quantiles_stream.h
@@ -17,6 +17,7 @@
 
 #include <memory>
 #include <vector>
+#include <cmath>
 
 #include "tensorflow/contrib/boosted_trees/lib/quantiles/weighted_quantiles_buffer.h"
 #include "tensorflow/contrib/boosted_trees/lib/quantiles/weighted_quantiles_summary.h"
diff --git a/tensorflow/contrib/boosted_trees/lib/quantiles/weighted_quantiles_summary_test.cc b/tensorflow/contrib/boosted_trees/lib/quantiles/weighted_quantiles_summary_test.cc
index 0bdfb4066411cde56ed5462f6d3458461e1ea904..8de154483e6dc7df8dd4402c1d596f93c9509c16 100644
--- a/tensorflow/contrib/boosted_trees/lib/quantiles/weighted_quantiles_summary_test.cc
+++ b/tensorflow/contrib/boosted_trees/lib/quantiles/weighted_quantiles_summary_test.cc
@@ -75,7 +75,7 @@ TEST_F(WeightedQuantilesSummaryTest, BuildFromBuffer) {
   Summary summary;
   summary.BuildFromBufferEntries(buffer1_->GenerateEntryList());
 
-  // We expect no approximation error because no compress operation occured.
+  // We expect no approximation error because no compress operation occurred.
   EXPECT_EQ(summary.ApproximationError(), 0);
 
   // Check first and last elements in the summary.
diff --git a/tensorflow/contrib/boosted_trees/proto/tree_config.proto b/tensorflow/contrib/boosted_trees/proto/tree_config.proto
index 62856b6829d44157ca7c254fcad6132a51e16b1e..a78996605378dd726927f12d9f59e629f84a9f0e 100644
--- a/tensorflow/contrib/boosted_trees/proto/tree_config.proto
+++ b/tensorflow/contrib/boosted_trees/proto/tree_config.proto
@@ -117,9 +117,21 @@ message DecisionTreeMetadata {
   bool is_finalized = 3;
 }
 
+message GrowingMetadata {
+  // Number of trees that we have attempted to build. After pruning, these
+  // trees might have been removed.
+  int64 num_trees_attempted = 1;
+  // Number of layers that we have attempted to build. After pruning, these
+  // layers might have been removed.
+  int64 num_layers_attempted = 2;
+}
+
 // DecisionTreeEnsembleConfig describes an ensemble of decision trees.
 message DecisionTreeEnsembleConfig {
   repeated DecisionTreeConfig trees = 1;
   repeated float tree_weights = 2;
   repeated DecisionTreeMetadata tree_metadata = 3;
+
+  // Metadata that is used during the training.
+  GrowingMetadata growing_metadata = 4;
 }
diff --git a/tensorflow/contrib/cmake/CMakeLists.txt b/tensorflow/contrib/cmake/CMakeLists.txt
index af7b4fb3868008f971233dfa950c58c0bba77872..10ac57701841b468d66d00ca82951dfc0893f15c 100644
--- a/tensorflow/contrib/cmake/CMakeLists.txt
+++ b/tensorflow/contrib/cmake/CMakeLists.txt
@@ -61,15 +61,18 @@ add_definitions(-DEIGEN_AVOID_STL_ARRAY)
 if(WIN32)
   add_definitions(-DNOMINMAX -D_WIN32_WINNT=0x0A00 -DLANG_CXX11 -DCOMPILER_MSVC)
   add_definitions(-DWIN32 -DOS_WIN -D_MBCS -DWIN64 -DWIN32_LEAN_AND_MEAN -DNOGDI -DPLATFORM_WINDOWS)
-  add_definitions(-DTENSORFLOW_USE_EIGEN_THREADPOOL -DEIGEN_HAS_C99_MATH -D_ITERATOR_DEBUG_LEVEL=0)
+  add_definitions(-DTENSORFLOW_USE_EIGEN_THREADPOOL -DEIGEN_HAS_C99_MATH)
   add_definitions(-DTF_COMPILE_LIBRARY)
-  add_definitions(-DNDEBUG /O2)  # Equivalent of -c opt in Bazel.
   add_definitions(/bigobj /nologo /EHsc /GF /FC /MP /Gm-)
   # Suppress warnings to reduce build log size.
   add_definitions(/wd4267 /wd4244 /wd4800 /wd4503 /wd4554 /wd4996 /wd4348 /wd4018)
   add_definitions(/wd4099 /wd4146 /wd4267 /wd4305 /wd4307)
   add_definitions(/wd4715 /wd4722 /wd4723 /wd4838 /wd4309 /wd4334)
   set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /MP")
+  set(CMAKE_CXX_FLAGS_DEBUG "/D_DEBUG /MDd /Ob0")
+  set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /D_ITERATOR_DEBUG_LEVEL=0")
+  set(CMAKE_CXX_FLAGS_MINSIZEREL "${CMAKE_CXX_FLAGS_MINSIZEREL} /D_ITERATOR_DEBUG_LEVEL=0")
+  set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "${CMAKE_CXX_FLAGS_RELWITHDEBINFO} /D_ITERATOR_DEBUG_LEVEL=0")
 endif()
 
 if ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "GNU")
@@ -229,6 +232,12 @@ if (tensorflow_ENABLE_GPU)
   endif()
 endif()
 
+# Find python executable
+include(FindPythonInterp)
+if(NOT ${PYTHONINTERP_FOUND})
+    message(FATAL_ERROR "CMake was unable to find a python interpreter.")
+endif()
+
 # Let's get to work!
 include(tf_core_framework.cmake)
 # NOTE: Disabled until issue #3996 is fixed.
diff --git a/tensorflow/contrib/cmake/external/googletest.cmake b/tensorflow/contrib/cmake/external/googletest.cmake
index c370f46d2a03c5aba12c58670a2f699f21abe351..d09bb02890f25a0312e62c876c1729e57a059e82 100644
--- a/tensorflow/contrib/cmake/external/googletest.cmake
+++ b/tensorflow/contrib/cmake/external/googletest.cmake
@@ -21,7 +21,7 @@ set(googletest_TAG ec44c6c1675c25b9827aacd08c02433cccde7780)
 
 if(WIN32)
   set(googletest_STATIC_LIBRARIES
-      ${CMAKE_CURRENT_BINARY_DIR}/googletest/src/googletest/googletest/${CMAKE_BUILD_TYPE}/gtest.lib)
+      ${CMAKE_CURRENT_BINARY_DIR}/googletest/src/googletest/googletest/$(Configuration)/gtest.lib)
 else()
   set(googletest_STATIC_LIBRARIES
       ${CMAKE_CURRENT_BINARY_DIR}/googletest/src/googletest/googletest/${CMAKE_BUILD_TYPE}/gtest.a)
diff --git a/tensorflow/contrib/cmake/external/grpc.cmake b/tensorflow/contrib/cmake/external/grpc.cmake
index 46ea519f09de2e8c0dd69f97efaab18183370f19..d7201680ceb9984598bc45df01d6195e4e1ca897 100644
--- a/tensorflow/contrib/cmake/external/grpc.cmake
+++ b/tensorflow/contrib/cmake/external/grpc.cmake
@@ -21,9 +21,9 @@ set(GRPC_TAG 3bc78cd0b5bd784a235c01612d634b1ec5f8fb97)
 
 if(WIN32)
   set(grpc_STATIC_LIBRARIES
-      ${CMAKE_CURRENT_BINARY_DIR}/grpc/src/grpc/${CMAKE_BUILD_TYPE}/grpc++_unsecure.lib
-      ${CMAKE_CURRENT_BINARY_DIR}/grpc/src/grpc/${CMAKE_BUILD_TYPE}/grpc_unsecure.lib
-      ${CMAKE_CURRENT_BINARY_DIR}/grpc/src/grpc/${CMAKE_BUILD_TYPE}/gpr.lib)
+      ${CMAKE_CURRENT_BINARY_DIR}/grpc/src/grpc/$(Configuration)/grpc++_unsecure.lib
+      ${CMAKE_CURRENT_BINARY_DIR}/grpc/src/grpc/$(Configuration)/grpc_unsecure.lib
+      ${CMAKE_CURRENT_BINARY_DIR}/grpc/src/grpc/$(Configuration)/gpr.lib)
 else()
   set(grpc_STATIC_LIBRARIES
       ${CMAKE_CURRENT_BINARY_DIR}/grpc/src/grpc/libgrpc++_unsecure.a
diff --git a/tensorflow/contrib/cmake/external/jsoncpp.cmake b/tensorflow/contrib/cmake/external/jsoncpp.cmake
index 1ef49681a66ae7276ba8a257cf81b4d3d6969c1d..5127d7e8f79abdda4516eb9f006e243b7438bc65 100644
--- a/tensorflow/contrib/cmake/external/jsoncpp.cmake
+++ b/tensorflow/contrib/cmake/external/jsoncpp.cmake
@@ -23,7 +23,7 @@ set(jsoncpp_LIBRARIES ${jsoncpp_BUILD}/obj/so/libjsoncpp.so)
 set(jsoncpp_INCLUDES ${jsoncpp_BUILD})
 
 if(WIN32)
-  set(jsoncpp_STATIC_LIBRARIES ${jsoncpp_BUILD}/${CMAKE_BUILD_TYPE}/jsoncpp.lib)
+  set(jsoncpp_STATIC_LIBRARIES ${jsoncpp_BUILD}/$(Configuration)/jsoncpp.lib)
 else()
   set(jsoncpp_STATIC_LIBRARIES ${jsoncpp_BUILD}/libjsoncpp.a)
 endif()
diff --git a/tensorflow/contrib/cmake/external/png.cmake b/tensorflow/contrib/cmake/external/png.cmake
index 05e9688d1f010714242c5c6086b017e802d8eecf..2b2bd47d1c95ca886469c525191c27f22d416c29 100644
--- a/tensorflow/contrib/cmake/external/png.cmake
+++ b/tensorflow/contrib/cmake/external/png.cmake
@@ -21,7 +21,9 @@ set(png_BUILD ${CMAKE_BINARY_DIR}/png/src/png)
 set(png_INSTALL ${CMAKE_BINARY_DIR}/png/install)
 
 if(WIN32)
-  set(png_STATIC_LIBRARIES ${CMAKE_BINARY_DIR}/png/install/lib/libpng12_static.lib)
+  set(png_STATIC_LIBRARIES 
+    debug ${CMAKE_BINARY_DIR}/png/install/lib/libpng12_staticd.lib
+    optimized ${CMAKE_BINARY_DIR}/png/install/lib/libpng12_static.lib)
 else()
   set(png_STATIC_LIBRARIES ${CMAKE_BINARY_DIR}/png/install/lib/libpng12.a)
 endif()
diff --git a/tensorflow/contrib/cmake/external/protobuf.cmake b/tensorflow/contrib/cmake/external/protobuf.cmake
index 6cd9c11750df4e37145a314fbcf841f880b9c8a4..d600d8c3c0d30ec517d0abc4bac94c588b5268d4 100644
--- a/tensorflow/contrib/cmake/external/protobuf.cmake
+++ b/tensorflow/contrib/cmake/external/protobuf.cmake
@@ -19,8 +19,10 @@ set(PROTOBUF_URL https://github.com/mrry/protobuf.git)  # Includes MSVC fix.
 set(PROTOBUF_TAG 1d2c7b6c7376f396c8c7dd9b6afd2d4f83f3cb05)
 
 if(WIN32)
-  set(protobuf_STATIC_LIBRARIES ${CMAKE_CURRENT_BINARY_DIR}/protobuf/src/protobuf/${CMAKE_BUILD_TYPE}/libprotobuf.lib)
-  set(PROTOBUF_PROTOC_EXECUTABLE ${CMAKE_CURRENT_BINARY_DIR}/protobuf/src/protobuf/${CMAKE_BUILD_TYPE}/protoc.exe)
+  set(protobuf_STATIC_LIBRARIES 
+    debug ${CMAKE_CURRENT_BINARY_DIR}/protobuf/src/protobuf/$(Configuration)/libprotobufd.lib
+    optimized ${CMAKE_CURRENT_BINARY_DIR}/protobuf/src/protobuf/$(Configuration)/libprotobuf.lib)
+  set(PROTOBUF_PROTOC_EXECUTABLE ${CMAKE_CURRENT_BINARY_DIR}/protobuf/src/protobuf/$(Configuration)/protoc.exe)
   set(PROTOBUF_ADDITIONAL_CMAKE_OPTIONS	-Dprotobuf_MSVC_STATIC_RUNTIME:BOOL=OFF -A x64)
 else()
   set(protobuf_STATIC_LIBRARIES ${CMAKE_CURRENT_BINARY_DIR}/protobuf/src/protobuf/libprotobuf.a)
diff --git a/tensorflow/contrib/cmake/external/zlib.cmake b/tensorflow/contrib/cmake/external/zlib.cmake
index 8cfde90438c6a3459d7ead44bea305fdc041bf2c..c8af611e1eaefdf135551940a66985a4d50b26ed 100644
--- a/tensorflow/contrib/cmake/external/zlib.cmake
+++ b/tensorflow/contrib/cmake/external/zlib.cmake
@@ -22,7 +22,8 @@ set(ZLIB_TAG 50893291621658f355bc5b4d450a8d06a563053d)
 
 if(WIN32)
   set(zlib_STATIC_LIBRARIES
-      ${CMAKE_CURRENT_BINARY_DIR}/zlib/install/lib/zlibstatic.lib)
+      debug ${CMAKE_CURRENT_BINARY_DIR}/zlib/install/lib/zlibstaticd.lib
+      optimized ${CMAKE_CURRENT_BINARY_DIR}/zlib/install/lib/zlibstatic.lib)
 else()
   set(zlib_STATIC_LIBRARIES
       ${CMAKE_CURRENT_BINARY_DIR}/zlib/install/lib/libz.a)
diff --git a/tensorflow/contrib/cmake/tf_core_framework.cmake b/tensorflow/contrib/cmake/tf_core_framework.cmake
index 6fd1ae08149f4c2ff40f391e4c1d452df09ac514..a048194a1973188dfe3bba88b2dd8b65a7a55b55 100644
--- a/tensorflow/contrib/cmake/tf_core_framework.cmake
+++ b/tensorflow/contrib/cmake/tf_core_framework.cmake
@@ -118,8 +118,10 @@ set(tf_proto_text_srcs
     "tensorflow/core/framework/types.proto"
     "tensorflow/core/framework/versions.proto"
     "tensorflow/core/lib/core/error_codes.proto"
+    "tensorflow/core/protobuf/cluster.proto"
     "tensorflow/core/protobuf/config.proto"
     "tensorflow/core/protobuf/debug.proto"
+    "tensorflow/core/protobuf/device_properties.proto"
     "tensorflow/core/protobuf/rewriter_config.proto"
     "tensorflow/core/protobuf/tensor_bundle.proto"
     "tensorflow/core/protobuf/saver.proto"
diff --git a/tensorflow/contrib/cmake/tf_core_kernels.cmake b/tensorflow/contrib/cmake/tf_core_kernels.cmake
index 0c420a02534fbcf65a1ab4153e97b9229512d0e9..a71d9c5869b20375c3237a6b5770d5b4fe1e77c7 100644
--- a/tensorflow/contrib/cmake/tf_core_kernels.cmake
+++ b/tensorflow/contrib/cmake/tf_core_kernels.cmake
@@ -47,9 +47,7 @@ if(tensorflow_BUILD_CONTRIB_KERNELS)
       "${tensorflow_source_dir}/tensorflow/contrib/factorization/ops/factorization_ops.cc"
       #"${tensorflow_source_dir}/tensorflow/contrib/ffmpeg/decode_audio_op.cc"
       #"${tensorflow_source_dir}/tensorflow/contrib/ffmpeg/encode_audio_op.cc"
-      "${tensorflow_source_dir}/tensorflow/contrib/layers/kernels/bucketization_kernel.cc"
       "${tensorflow_source_dir}/tensorflow/contrib/layers/kernels/sparse_feature_cross_kernel.cc"
-      "${tensorflow_source_dir}/tensorflow/contrib/layers/ops/bucketization_op.cc"
       "${tensorflow_source_dir}/tensorflow/contrib/layers/ops/sparse_feature_cross_op.cc"
       "${tensorflow_source_dir}/tensorflow/contrib/nccl/kernels/nccl_manager.cc"
       "${tensorflow_source_dir}/tensorflow/contrib/nccl/kernels/nccl_ops.cc"
diff --git a/tensorflow/contrib/cmake/tf_core_ops.cmake b/tensorflow/contrib/cmake/tf_core_ops.cmake
index 2beb264a54eb24381a46e999e7efe90f226eca47..eae00ab8756cfb87d1b80a51947fbc894fa9e064 100644
--- a/tensorflow/contrib/cmake/tf_core_ops.cmake
+++ b/tensorflow/contrib/cmake/tf_core_ops.cmake
@@ -22,6 +22,7 @@ set(tf_op_lib_names
     "image_ops"
     "io_ops"
     "linalg_ops"
+    "lookup_ops"
     "logging_ops"
     "math_ops"
     "nn_ops"
@@ -70,7 +71,6 @@ GENERATE_CONTRIB_OP_LIBRARY(factorization_factorization "${tensorflow_source_dir
 GENERATE_CONTRIB_OP_LIBRARY(framework_variable "${tensorflow_source_dir}/tensorflow/contrib/framework/ops/variable_ops.cc")
 GENERATE_CONTRIB_OP_LIBRARY(input_pipeline "${tensorflow_source_dir}/tensorflow/contrib/input_pipeline/ops/input_pipeline_ops.cc")
 GENERATE_CONTRIB_OP_LIBRARY(image "${tensorflow_source_dir}/tensorflow/contrib/image/ops/image_ops.cc")
-GENERATE_CONTRIB_OP_LIBRARY(layers_bucketization "${tensorflow_source_dir}/tensorflow/contrib/layers/ops/bucketization_op.cc")
 GENERATE_CONTRIB_OP_LIBRARY(layers_sparse_feature_cross "${tensorflow_source_dir}/tensorflow/contrib/layers/ops/sparse_feature_cross_op.cc")
 GENERATE_CONTRIB_OP_LIBRARY(memory_stats "${tensorflow_source_dir}/tensorflow/contrib/memory_stats/ops/memory_stats_ops.cc")
 GENERATE_CONTRIB_OP_LIBRARY(nccl "${tensorflow_source_dir}/tensorflow/contrib/nccl/ops/nccl_ops.cc")
diff --git a/tensorflow/contrib/cmake/tf_python.cmake b/tensorflow/contrib/cmake/tf_python.cmake
index eaa6a9a6ab343fbe7a7c27f416195e7bbe116764..3670e4678bdd0f2f01bb37b1e13f325d8c11a184 100755
--- a/tensorflow/contrib/cmake/tf_python.cmake
+++ b/tensorflow/contrib/cmake/tf_python.cmake
@@ -27,7 +27,6 @@
 
 # 1. Resolve the installed version of Python (for Python.h and python).
 # TODO(mrry): Parameterize the build script to enable Python 3 building.
-include(FindPythonInterp)
 if(NOT PYTHON_INCLUDE_DIR)
   set(PYTHON_NOT_FOUND false)
   exec_program("${PYTHON_EXECUTABLE}"
@@ -203,14 +202,17 @@ add_python_module("tensorflow/python/estimator")
 add_python_module("tensorflow/python/estimator/export")
 add_python_module("tensorflow/python/estimator/inputs")
 add_python_module("tensorflow/python/estimator/inputs/queues")
+add_python_module("tensorflow/python/feature_column")
 add_python_module("tensorflow/python/framework")
 add_python_module("tensorflow/python/grappler")
 add_python_module("tensorflow/python/kernel_tests")
+add_python_module("tensorflow/python/kernel_tests/distributions")
 add_python_module("tensorflow/python/layers")
 add_python_module("tensorflow/python/lib")
 add_python_module("tensorflow/python/lib/core")
 add_python_module("tensorflow/python/lib/io")
 add_python_module("tensorflow/python/ops")
+add_python_module("tensorflow/python/ops/distributions")
 add_python_module("tensorflow/python/ops/losses")
 add_python_module("tensorflow/python/platform")
 add_python_module("tensorflow/python/platform/default")
@@ -594,6 +596,7 @@ GENERATE_PYTHON_OP_LIB("image_ops")
 GENERATE_PYTHON_OP_LIB("io_ops")
 GENERATE_PYTHON_OP_LIB("linalg_ops")
 GENERATE_PYTHON_OP_LIB("logging_ops")
+GENERATE_PYTHON_OP_LIB("lookup_ops")
 GENERATE_PYTHON_OP_LIB("nn_ops")
 GENERATE_PYTHON_OP_LIB("parsing_ops")
 GENERATE_PYTHON_OP_LIB("random_ops")
@@ -621,8 +624,6 @@ GENERATE_PYTHON_OP_LIB("contrib_input_pipeline_ops"
   DESTINATION ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/contrib/input_pipeline/ops/gen_input_pipeline_ops.py)
 GENERATE_PYTHON_OP_LIB("contrib_image_ops"
   DESTINATION ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/contrib/image/ops/gen_image_ops.py)
-GENERATE_PYTHON_OP_LIB("contrib_layers_bucketization_ops"
-  DESTINATION ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/contrib/layers/ops/gen_bucketization_op.py)
 GENERATE_PYTHON_OP_LIB("contrib_layers_sparse_feature_cross_ops"
   DESTINATION ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/contrib/layers/ops/gen_sparse_feature_cross_op.py)
 GENERATE_PYTHON_OP_LIB("contrib_memory_stats_ops"
@@ -862,9 +863,9 @@ add_custom_command(TARGET tf_python_build_pip_package POST_BUILD
                                    ${CMAKE_CURRENT_BINARY_DIR}/tf_python/)
 if(WIN32)
   add_custom_command(TARGET tf_python_build_pip_package POST_BUILD
-    COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_BINARY_DIR}/${CMAKE_BUILD_TYPE}/pywrap_tensorflow_internal.dll
+    COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_BINARY_DIR}/$(Configuration)/pywrap_tensorflow_internal.dll
                                      ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/python/_pywrap_tensorflow_internal.pyd
-    COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_BINARY_DIR}/${CMAKE_BUILD_TYPE}/pywrap_tensorflow_internal.lib
+    COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_BINARY_DIR}/$(Configuration)/pywrap_tensorflow_internal.lib
                                      ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/python/)
 else()
   add_custom_command(TARGET tf_python_build_pip_package POST_BUILD
diff --git a/tensorflow/contrib/cmake/tf_shared_lib.cmake b/tensorflow/contrib/cmake/tf_shared_lib.cmake
index 47289fd9d240e24776f7c6799a6bf25d5161f538..9385ac52e903e1f0f2436066f573af5359c46770 100644
--- a/tensorflow/contrib/cmake/tf_shared_lib.cmake
+++ b/tensorflow/contrib/cmake/tf_shared_lib.cmake
@@ -82,6 +82,13 @@ target_link_libraries(tensorflow PRIVATE
     tf_protos_cc
 )
 
+# There is a bug in GCC 5 resulting in undefined reference to a __cpu_model function when
+# linking to the tensorflow library. Adding the following libraries fixes it.
+# See issue on github: https://github.com/tensorflow/tensorflow/issues/9593
+if(CMAKE_COMPILER_IS_GNUCC AND CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 5.0)
+    target_link_libraries(tensorflow PRIVATE gcc_s gcc)
+endif()
+
 if(WIN32)
   add_dependencies(tensorflow tensorflow_static)
 endif(WIN32)
diff --git a/tensorflow/contrib/cmake/tools/create_def_file.py b/tensorflow/contrib/cmake/tools/create_def_file.py
index 9bd287d0d79b68278bc32453f528def6ce1ee829..b16a5eadb05da79f2d5a325fea4986e6728fe021 100644
--- a/tensorflow/contrib/cmake/tools/create_def_file.py
+++ b/tensorflow/contrib/cmake/tools/create_def_file.py
@@ -44,7 +44,7 @@ UNDNAME = "undname.exe"
 DUMPBIN = "dumpbin.exe"
 
 # Exclude if matched
-EXCLUDE_RE = re.compile(r"deleting destructor|::internal::")
+EXCLUDE_RE = re.compile(r"RTTI|deleting destructor|::internal::")
 
 # Include if matched before exclude
 INCLUDEPRE_RE = re.compile(r"google::protobuf::internal::ExplicitlyConstructed|"
@@ -141,17 +141,17 @@ def main():
           continue
         if not INCLUDE_RE.search(line):
           continue
-          
+
       if "deleting destructor" in line:
         # Some of the symbols convered by INCLUDEPRE_RE export deleting
         # destructor symbols, which is a bad idea.
         # So we filter out such symbols here.
         continue
-          
+
       if DATA_EXCLUDE_RE.search(line):
         def_fp.write("\t" + decorated + "\n")
       else:
-        def_fp.write("\t" + decorated + " DATA\n")      
+        def_fp.write("\t" + decorated + " DATA\n")
       taken.add(decorated)
   exit_code = proc.wait()
   if exit_code != 0:
diff --git a/tensorflow/contrib/cudnn_rnn/BUILD b/tensorflow/contrib/cudnn_rnn/BUILD
index 9ebf94315b017d01176530e877a4d35168ec7c8e..60c0b42a796df7c05b67751dfe3f9f76ba12c9a3 100644
--- a/tensorflow/contrib/cudnn_rnn/BUILD
+++ b/tensorflow/contrib/cudnn_rnn/BUILD
@@ -82,7 +82,7 @@ tf_custom_op_py_library(
 
 cuda_py_test(
     name = "cudnn_rnn_ops_test",
-    size = "small",
+    size = "medium",
     srcs = ["python/kernel_tests/cudnn_rnn_ops_test.py"],
     additional_deps = [
         ":cudnn_rnn_py",
diff --git a/tensorflow/contrib/cudnn_rnn/kernels/cudnn_rnn_ops.cc b/tensorflow/contrib/cudnn_rnn/kernels/cudnn_rnn_ops.cc
index 6049d2afdab3e631ae91783e04472607ebc42683..86faf0cc854e94d808375b80d6e29d98711f506f 100644
--- a/tensorflow/contrib/cudnn_rnn/kernels/cudnn_rnn_ops.cc
+++ b/tensorflow/contrib/cudnn_rnn/kernels/cudnn_rnn_ops.cc
@@ -40,6 +40,7 @@ limitations under the License.
 #include "tensorflow/core/lib/strings/stringprintf.h"
 #include "tensorflow/core/platform/mutex.h"
 #include "tensorflow/core/platform/types.h"
+#include "tensorflow/core/util/env_var.h"
 
 #if GOOGLE_CUDA
 #include "tensorflow/core/platform/stream_executor.h"
@@ -67,7 +68,7 @@ limitations under the License.
  * TensorFlow is responsible for making sure the memory is alive long enough
  * and recycles afterwards.
  *
-*/
+ */
 namespace tensorflow {
 
 using CPUDevice = Eigen::ThreadPoolDevice;
@@ -106,6 +107,7 @@ using perftools::gputools::DeviceMemory;
 using perftools::gputools::DeviceMemoryBase;
 using perftools::gputools::ScratchAllocator;
 using perftools::gputools::port::StatusOr;
+using strings::Printf;
 
 Status ParseRNNMode(const string& str, RnnMode* rnn_mode) {
   if (str == "rnn_relu") {
@@ -203,9 +205,10 @@ DeviceMemoryBase SliceDeviceMemory(const DeviceMemoryBase& device_memory,
 }
 
 inline Status FromExecutorStatus(const perftools::gputools::port::Status& s) {
-  return s.ok() ? Status::OK() : Status(static_cast<tensorflow::error::Code>(
-                                            static_cast<int>(s.code())),
-                                        s.error_message());
+  return s.ok() ? Status::OK()
+                : Status(static_cast<tensorflow::error::Code>(
+                             static_cast<int>(s.code())),
+                         s.error_message());
 }
 
 template <typename T>
@@ -244,8 +247,7 @@ class CudnnRNNWorkspaceAllocator : public ScratchAllocator {
     // allocator.
     allocated_tensors_.push_back(temporary_memory);
     total_byte_size_ += byte_size;
-    return perftools::gputools::port::StatusOr<
-        perftools::gputools::DeviceMemory<uint8>>(
+    return StatusOr<DeviceMemory<uint8>>(
         AsDeviceMemory<uint8>(&temporary_memory));
   }
   int64 TotalByteSize() { return total_byte_size_; }
@@ -296,6 +298,43 @@ class CudnnRNNReserveSpaceAllocator : public ScratchAllocator {
   int output_index_;
 };
 
+// A helper to allocate persistent memory for Cudnn RNN models, which is
+// expected to live between kernel invocations.
+// This class is not thread-safe.
+class CudnnRNNPersistentSpaceAllocator : public ScratchAllocator {
+ public:
+  CudnnRNNPersistentSpaceAllocator(OpKernelContext* context)
+      : context_(context) {}
+
+  virtual ~CudnnRNNPersistentSpaceAllocator() {}
+
+  int64 GetMemoryLimitInBytes(perftools::gputools::Stream* stream) override {
+    return std::numeric_limits<int64>::max();
+  }
+
+  StatusOr<DeviceMemory<uint8>> AllocateBytes(
+      perftools::gputools::Stream* stream, int64 byte_size) override {
+    if (total_byte_size_ != 0) {
+      return Status(error::FAILED_PRECONDITION,
+                    "Persistent space allocator can only be called once");
+    }
+
+    Status allocation_status = context_->allocate_persistent(
+        DT_UINT8, TensorShape({byte_size}), &handle_, nullptr);
+    if (!allocation_status.ok()) {
+      return ToExecutorStatus(allocation_status);
+    }
+    total_byte_size_ += byte_size;
+    return AsDeviceMemory<uint8>(handle_.AccessTensor(context_));
+  }
+  int64 TotalByteSize() { return total_byte_size_; }
+
+ private:
+  int64 total_byte_size_ = 0;
+  PersistentTensor handle_;
+  OpKernelContext* context_;  // not owned
+};
+
 struct CudnnModelTypes {
   RnnMode rnn_mode;
   TFRNNInputMode rnn_input_mode;
@@ -317,6 +356,16 @@ struct CudnnModelShapes {
   TensorShape input_shape;
   TensorShape output_shape;
   TensorShape hidden_state_shape;
+  // At present only fields related to cached RnnDescriptor are concerned.
+  bool IsCompatibleWith(const CudnnModelShapes& rhs) const {
+    return num_layers == rhs.num_layers && input_size == rhs.input_size &&
+           num_units == rhs.num_units && dir_count == rhs.dir_count;
+  }
+  string RnnDescDebugString() {
+    return strings::Printf(
+        "[num_layers, input_size, num_units, dir_count]: [%d, %d, %d, %d]",
+        num_layers, input_size, num_units, dir_count);
+  }
 };
 
 // Extract and checks the forward input tensors, parameters, and shapes from the
@@ -399,11 +448,23 @@ void RestoreParams(const OpInputList params_input,
 
 }  // namespace
 
+// Note: all following kernels depend on a RnnDescriptor instance, which
+// according to Cudnn official doc should be kept around and reused across all
+// Cudnn kernels in the same model.
+// In Tensorflow, we don't pass the reference across different OpKernels,
+// rather, recreate it separately in each OpKernel, which does no cause issue:
+// CudnnDropoutDescriptor keeps a reference to a memory for
+// random number generator state. During recreation, this state is lost.
+// However, only forward-pass Cudnn APIs make use of the state.
+
 // A common base class for RNN kernels. It extracts common attributes and
 // shape validations.
 class CudnnRNNKernelCommon : public OpKernel {
  protected:
   CudnnRNNKernelCommon(OpKernelConstruction* context) : OpKernel(context) {
+    OP_REQUIRES_OK(context, context->GetAttr("dropout", &dropout_));
+    OP_REQUIRES_OK(context, context->GetAttr("seed", &seed_));
+    OP_REQUIRES_OK(context, context->GetAttr("seed2", &seed2_));
     string str;
     OP_REQUIRES_OK(context, context->GetAttr("rnn_mode", &str));
     OP_REQUIRES_OK(context, ParseRNNMode(str, &model_types_.rnn_mode));
@@ -413,6 +474,10 @@ class CudnnRNNKernelCommon : public OpKernel {
     OP_REQUIRES_OK(context, context->GetAttr("direction", &str));
     OP_REQUIRES_OK(
         context, ParseRNNDirectionMode(str, &model_types_.rnn_direction_mode));
+    // Reset CudnnRnnDescriptor and related random number generate states in
+    // every Compute() call.
+    OP_REQUIRES_OK(context, ReadBoolFromEnvVar("TF_CUDNN_RESET_RND_GEN_STATE",
+                                               false, &reset_rnd_gen_state_));
   }
 
   bool HasInputC() const { return model_types_.HasInputC(); }
@@ -422,6 +487,9 @@ class CudnnRNNKernelCommon : public OpKernel {
     return model_types_.rnn_direction_mode;
   }
   CudnnModelTypes model_types() const { return model_types_; }
+  float dropout() const { return dropout_; }
+  uint64 seed() { return (static_cast<uint64>(seed_) << 32) | seed2_; }
+  bool ResetRndGenState() { return reset_rnd_gen_state_; }
 
   template <typename T>
   Status ExtractCudnnRNNParamsInfo(OpKernelContext* context,
@@ -448,11 +516,14 @@ class CudnnRNNKernelCommon : public OpKernel {
     RnnInputMode input_mode;
     TF_RETURN_IF_ERROR(
         ToRNNInputMode(rnn_input_mode(), num_units, input_size, &input_mode));
+
     auto* stream = context->op_device_context()->stream();
+    // ExtracCudnnRNNParamsInfo is only called by op_kernels that do not require
+    // random number generator, therefore set state_allocator to nullptr.
     auto rnn_desc_s = stream->parent()->createRnnDescriptor(
         num_layers, num_units, input_size, input_mode, rnn_direction_mode(),
-        rnn_mode(), ToDataType<T>::value, 0.f /*dropout*/, 0 /*seed*/,
-        nullptr /*state_allocator*/);
+        rnn_mode(), ToDataType<T>::value, dropout(), seed(),
+        nullptr /* state_allocator */);
     if (!rnn_desc_s.ok()) {
       return FromExecutorStatus(rnn_desc_s);
     }
@@ -461,6 +532,11 @@ class CudnnRNNKernelCommon : public OpKernel {
   }
 
  private:
+  int seed_;
+  int seed2_;
+  float dropout_;
+  bool reset_rnd_gen_state_;
+
   CudnnModelTypes model_types_;
 };
 
@@ -560,9 +636,8 @@ class CudnnRNNParamsToCanonical<GPUDevice, T> : public CudnnRNNKernelCommon {
         context->set_output(i, input.Slice(start, end));
       } else {
         Tensor* output = nullptr;
-        OP_REQUIRES_OK(
-            context,
-            context->allocate_output(i, TensorShape({width, height}), &output));
+        OP_REQUIRES_OK(context, context->allocate_output(
+                                    i, TensorShape({width, height}), &output));
         DeviceMemoryBase data_src_ptr = SliceDeviceMemory(
             input_ptr, rnn_desc->ParamsWeightRegions()[i].offset,
             size_in_bytes);
@@ -571,14 +646,17 @@ class CudnnRNNParamsToCanonical<GPUDevice, T> : public CudnnRNNKernelCommon {
       }
     }
 
-    CHECK(num_params_ == rnn_desc->ParamsBiasRegions().size())
-        << "Number of params mismatch. Expected " << num_params_ << ", got "
-        << rnn_desc->ParamsBiasRegions().size();
+    OP_REQUIRES(context, num_params_ == rnn_desc->ParamsBiasRegions().size(),
+                errors::InvalidArgument("Number of params mismatch. Expected ",
+                                        num_params_, ", got ",
+                                        rnn_desc->ParamsBiasRegions().size()));
     for (int i = 0; i < rnn_desc->ParamsBiasRegions().size(); i++) {
       int64 size_in_bytes = rnn_desc->ParamsBiasRegions()[i].size;
       int64 size = size_in_bytes / sizeof(T);
-      CHECK(size == num_units) << "Params size mismatch. Expected " << num_units
-                               << ", got " << size;
+      OP_REQUIRES(context, size == num_units,
+                  errors::InvalidArgument("Params size mismatch. Expected ",
+                                          num_units, ", got ", size));
+
       // If data is aligned, use slice view to avoid expensive memcpy.
       bool start_aligned =
           rnn_desc->ParamsBiasRegions()[i].offset % EIGEN_MAX_ALIGN_BYTES == 0;
@@ -698,16 +776,32 @@ class CudnnRNNForwardOp<GPUDevice, T> : public CudnnRNNKernelCommon {
     OP_REQUIRES_OK(context,
                    ToRNNInputMode(rnn_input_mode(), model_shapes.num_units,
                                   model_shapes.input_size, &input_mode));
-    // TODO(zhengxq): add dropout support.
     // TODO(zhengxq): cache the descriptor so we don't have to create them all
     // the time.
     auto data_type = ToDataType<T>::value;
-    auto rnn_desc_s = executor->createRnnDescriptor(
-        model_shapes.num_layers, model_shapes.num_units,
-        model_shapes.input_size, input_mode, rnn_direction_mode(), rnn_mode(),
-        data_type, 0.f /*dropout*/, 0 /*seed*/, nullptr /*state_allocator*/);
-    OP_REQUIRES_OK(context, FromExecutorStatus(rnn_desc_s));
-    auto rnn_desc = rnn_desc_s.ConsumeValueOrDie();
+    {
+      mutex_lock l(mu_);
+      if (model_shapes_ == nullptr) {
+        model_shapes_.reset(new CudnnModelShapes(model_shapes));
+      } else {
+        OP_REQUIRES(context, model_shapes_->IsCompatibleWith(model_shapes),
+                    errors::InvalidArgument(
+                        "Incompatible rnn model shapes inferred: expecting ",
+                        model_shapes_->RnnDescDebugString(), ", getting ",
+                        model_shapes.RnnDescDebugString(), "."));
+      }
+      if (rnn_desc_ == nullptr || ResetRndGenState()) {
+        dropout_state_allocator_.reset(
+            new CudnnRNNPersistentSpaceAllocator(context));
+        auto rnn_desc_s = executor->createRnnDescriptor(
+            model_shapes_->num_layers, model_shapes_->num_units,
+            model_shapes_->input_size, input_mode, rnn_direction_mode(),
+            rnn_mode(), data_type, dropout(), seed(),
+            dropout_state_allocator_.get());
+        OP_REQUIRES_OK(context, FromExecutorStatus(rnn_desc_s));
+        rnn_desc_ = std::move(rnn_desc_s.ConsumeValueOrDie());
+      }
+    }
 
     auto input_desc_s = executor->createRnnSequenceTensorDescriptor(
         input_shape.dim_size(0), input_shape.dim_size(1),
@@ -753,21 +847,30 @@ class CudnnRNNForwardOp<GPUDevice, T> : public CudnnRNNKernelCommon {
     // Creates a memory callback for the workspace. The memory lives to the end
     // of this kernel calls.
     CudnnRNNWorkspaceAllocator workspace_allocator(context);
-    bool launch_status =
-        stream
-            ->ThenRnnForward(
-                *rnn_desc, *input_desc, input_data, *hidden_state_desc,
-                input_h_data, *hidden_state_desc, input_c_data, params_data,
-                *output_desc, &output_data, *hidden_state_desc, &output_h_data,
-                *hidden_state_desc, &output_c_data, is_training_,
-                &reserve_space_allocator, &workspace_allocator)
-            .ok();
+    bool launch_status = false;
+    {
+      mutex_lock l(mu_);
+      launch_status =
+          stream
+              ->ThenRnnForward(
+                  *rnn_desc_, *input_desc, input_data, *hidden_state_desc,
+                  input_h_data, *hidden_state_desc, input_c_data, params_data,
+                  *output_desc, &output_data, *hidden_state_desc,
+                  &output_h_data, *hidden_state_desc, &output_c_data,
+                  is_training_, &reserve_space_allocator, &workspace_allocator)
+              .ok();
+    }
     OP_REQUIRES(context, launch_status,
                 errors::Internal("Failed to call ThenRnnForward"));
   }
 
  private:
+  mutex mu_;
   bool is_training_;
+  std::unique_ptr<CudnnModelShapes> model_shapes_ GUARDED_BY(mu_);
+  std::unique_ptr<RnnDescriptor> rnn_desc_ GUARDED_BY(mu_);
+  std::unique_ptr<CudnnRNNPersistentSpaceAllocator> dropout_state_allocator_
+      GUARDED_BY(mu_);
 };
 
 REGISTER_KERNEL_BUILDER(
@@ -808,9 +911,9 @@ class CudnnRNNBackwardOp<GPUDevice, T> : public CudnnRNNKernelCommon {
     const Tensor* output_h = nullptr;
     OP_REQUIRES_OK(context, context->input("output_h", &output_h));
     OP_REQUIRES(context, output_h->shape() == hidden_state_shape,
-                errors::InvalidArgument("Invalid output_h shape: ",
-                                        output_h->shape().DebugString(), " ",
-                                        hidden_state_shape.DebugString()));
+                errors::InvalidArgument(
+                    "Invalid output_h shape: ", output_h->shape().DebugString(),
+                    " ", hidden_state_shape.DebugString()));
     const Tensor* output_c = nullptr;
     if (HasInputC()) {
       // Only LSTM uses input_c and output_c. So for all other models, we only
@@ -881,15 +984,32 @@ class CudnnRNNBackwardOp<GPUDevice, T> : public CudnnRNNKernelCommon {
     OP_REQUIRES_OK(context,
                    ToRNNInputMode(rnn_input_mode(), model_shapes.num_units,
                                   model_shapes.input_size, &input_mode));
-    // TODO(zhengxq): add dropout support.
     // TODO(zhengxq): cache the descriptor so we don't have to create them all
     // the time.
-    auto rnn_desc_s = executor->createRnnDescriptor(
-        model_shapes.num_layers, model_shapes.num_units,
-        model_shapes.input_size, input_mode, rnn_direction_mode(), rnn_mode(),
-        data_type, 0.f /*dropout*/, 0 /*seed*/, nullptr /*state_allocator*/);
-    OP_REQUIRES_OK(context, FromExecutorStatus(rnn_desc_s));
-    auto rnn_desc = rnn_desc_s.ConsumeValueOrDie();
+    {
+      mutex_lock l(mu_);
+      if (model_shapes_ == nullptr) {
+        model_shapes_.reset(new CudnnModelShapes(model_shapes));
+      } else {
+        OP_REQUIRES(context, model_shapes_->IsCompatibleWith(model_shapes),
+                    errors::InvalidArgument(
+                        "Incompatible rnn model shapes inferred: expecting ",
+                        model_shapes_->RnnDescDebugString(), ", getting ",
+                        model_shapes.RnnDescDebugString(), "."));
+      }
+
+      if (rnn_desc_ == nullptr || ResetRndGenState()) {
+        dropout_state_allocator_.reset(
+            new CudnnRNNPersistentSpaceAllocator(context));
+        auto rnn_desc_s = executor->createRnnDescriptor(
+            model_shapes.num_layers, model_shapes.num_units,
+            model_shapes.input_size, input_mode, rnn_direction_mode(),
+            rnn_mode(), data_type, dropout(), seed(),
+            dropout_state_allocator_.get());
+        OP_REQUIRES_OK(context, FromExecutorStatus(rnn_desc_s));
+        rnn_desc_ = std::move(rnn_desc_s.ConsumeValueOrDie());
+      }
+    }
 
     auto input_desc_s = executor->createRnnSequenceTensorDescriptor(
         input_shape.dim_size(0), input_shape.dim_size(1),
@@ -939,21 +1059,32 @@ class CudnnRNNBackwardOp<GPUDevice, T> : public CudnnRNNKernelCommon {
     // Creates a memory callback for the workspace. The memory lives to the end
     // of this kernel calls.
     CudnnRNNWorkspaceAllocator workspace_allocator(context);
-    bool launch_status =
-        stream
-            ->ThenRnnBackward(
-                *rnn_desc, *input_desc, input_data, *hidden_state_desc,
-                input_h_data, *hidden_state_desc, input_c_data, params_data,
-                *output_desc, output_data, *hidden_state_desc, output_h_data,
-                *hidden_state_desc, output_c_data, output_backprop_data,
-                output_h_backprop_data, output_c_backprop_data,
-                &input_backprop_data, &input_h_backprop_data,
-                &input_c_backprop_data, &params_backprop_data,
-                &reserve_space_uint8, &workspace_allocator)
-            .ok();
+    bool launch_status = false;
+    {
+      mutex_lock l(mu_);
+      launch_status =
+          stream
+              ->ThenRnnBackward(
+                  *rnn_desc_, *input_desc, input_data, *hidden_state_desc,
+                  input_h_data, *hidden_state_desc, input_c_data, params_data,
+                  *output_desc, output_data, *hidden_state_desc, output_h_data,
+                  *hidden_state_desc, output_c_data, output_backprop_data,
+                  output_h_backprop_data, output_c_backprop_data,
+                  &input_backprop_data, &input_h_backprop_data,
+                  &input_c_backprop_data, &params_backprop_data,
+                  &reserve_space_uint8, &workspace_allocator)
+              .ok();
+    }
     OP_REQUIRES(context, launch_status,
                 errors::Internal("Failed to call ThenRnnBackward"));
   }
+
+ private:
+  mutex mu_;
+  std::unique_ptr<CudnnModelShapes> model_shapes_ GUARDED_BY(mu_);
+  std::unique_ptr<RnnDescriptor> rnn_desc_ GUARDED_BY(mu_);
+  std::unique_ptr<CudnnRNNPersistentSpaceAllocator> dropout_state_allocator_
+      GUARDED_BY(mu_);
 };
 
 REGISTER_KERNEL_BUILDER(
diff --git a/tensorflow/contrib/cudnn_rnn/ops/cudnn_rnn_ops.cc b/tensorflow/contrib/cudnn_rnn/ops/cudnn_rnn_ops.cc
index 58025f7b1a5d592f54ec63f5ce36c3c7a7611c0d..2c631b064b559e19d767297e8ba5bfda06ab0880 100644
--- a/tensorflow/contrib/cudnn_rnn/ops/cudnn_rnn_ops.cc
+++ b/tensorflow/contrib/cudnn_rnn/ops/cudnn_rnn_ops.cc
@@ -35,6 +35,9 @@ input_mode: Indicate whether there is a linear projection between the input and
     input_size == num_units; otherwise, it implies 'linear_input'.
 direction: Indicates whether a bidirectional model will be used.
     dir = (direction == bidirectional) ? 2 : 1
+dropout: dropout probability. When set to 0., dropout is disabled.
+seed: the 1st part of a seed to initialize dropout.
+seed2: the 2nd part of a seed to initialize dropout.
 )doc";
 
 constexpr auto kCudnnRNNParamsBuffer = R"doc(
@@ -77,6 +80,9 @@ REGISTER_OP("CudnnRNNParamsSize")
     .Attr(kRNNModeAttrs)
     .Attr(kRNNInputModeAttrs)
     .Attr(kRNNDirectionAttrs)
+    .Attr("dropout: float = 0.0")
+    .Attr("seed: int = 0")
+    .Attr("seed2: int = 0")
     .Output("params_size: S")
     .SetShapeFn([](InferenceContext* c) {
       c->set_output(0, c->Vector(1));
@@ -119,6 +125,7 @@ REGISTER_OP("CudnnRNN")
     .Input("input_h: T")
     .Input("input_c: T")
     .Input("params: T")
+    .SetIsStateful()
     .Output("output: T")
     .Output("output_h: T")
     .Output("output_c: T")
@@ -127,7 +134,7 @@ REGISTER_OP("CudnnRNN")
     .Attr(kRNNModeAttrs)
     .Attr(kRNNInputModeAttrs)
     .Attr(kRNNDirectionAttrs)
-    .Attr("dropout: float")
+    .Attr("dropout: float = 0.0")
     .Attr("seed: int = 0")
     .Attr("seed2: int = 0")
     .Attr("is_training: bool = true")
@@ -158,7 +165,8 @@ REGISTER_OP("CudnnRNN")
 Computes the RNN from the input and initial states, with respect to the params
 buffer.
 )doc",
-                         kCudnnRNNCommonAttrs, CudnnRNNForwardTensors(), R"doc(
+                         kCudnnRNNCommonAttrs, CudnnRNNForwardTensors(),
+                         R"doc(
 is_training: Indicates whether this operation is used for inferenece or
     training.
 reserve_space: an opaque tensor that can be used in backprop calculation. It
@@ -177,6 +185,7 @@ REGISTER_OP("CudnnRNNBackprop")
     .Input("output_h_backprop: T")
     .Input("output_c_backprop: T")
     .Input("reserve_space: T")
+    .SetIsStateful()
     .Output("input_backprop: T")
     .Output("input_h_backprop: T")
     .Output("input_c_backprop: T")
@@ -185,6 +194,9 @@ REGISTER_OP("CudnnRNNBackprop")
     .Attr(kRNNModeAttrs)
     .Attr(kRNNInputModeAttrs)
     .Attr(kRNNDirectionAttrs)
+    .Attr("dropout: float = 0.0")
+    .Attr("seed: int = 0")
+    .Attr("seed2: int = 0")
     .SetShapeFn([](InferenceContext* c) {
       auto input_shape = c->input(0);
       auto input_h_shape = c->input(1);
@@ -199,7 +211,8 @@ REGISTER_OP("CudnnRNNBackprop")
     .Doc(strings::StrCat(R"doc(
 Compute the backprop of both data and weights in a RNN.
 )doc",
-                         kCudnnRNNCommonAttrs, CudnnRNNForwardTensors(), R"doc(
+                         kCudnnRNNCommonAttrs, CudnnRNNForwardTensors(),
+                         R"doc(
 output_backprop: A 3-D tensor with the same shape as output in the forward pass.
 output_h_backprop: A 3-D tensor with the same shape as output_h in the forward
     pass.
@@ -228,6 +241,9 @@ REGISTER_OP("CudnnRNNParamsToCanonical")
     .Attr(kRNNModeAttrs)
     .Attr(kRNNInputModeAttrs)
     .Attr(kRNNDirectionAttrs)
+    .Attr("dropout: float = 0.0")
+    .Attr("seed: int = 0")
+    .Attr("seed2: int = 0")
     .SetShapeFn([](InferenceContext* c) {
       ShapeHandle unused;
       TF_RETURN_IF_ERROR(c->WithRank(c->input(3), 1, &unused));
@@ -268,6 +284,9 @@ REGISTER_OP("CudnnRNNCanonicalToParams")
     .Attr(kRNNModeAttrs)
     .Attr(kRNNInputModeAttrs)
     .Attr(kRNNDirectionAttrs)
+    .Attr("dropout: float = 0.0")
+    .Attr("seed: int = 0")
+    .Attr("seed2: int = 0")
     .SetShapeFn([](InferenceContext* c) {
       c->set_output(0, c->Vector(InferenceContext::kUnknownDim));
       return Status::OK();
@@ -281,7 +300,6 @@ upcoming training or inferences.
 num_params: number of parameter sets for all layers.
     Each layer may contain multiple parameter sets, with each set consisting of
     a weight matrix and a bias vector.
-)doc",
-                         kCudnnRNNCommonAttrs));
+)doc", kCudnnRNNCommonAttrs));
 
 }  // namespace tensorflow
diff --git a/tensorflow/contrib/cudnn_rnn/python/kernel_tests/cudnn_rnn_ops_test.py b/tensorflow/contrib/cudnn_rnn/python/kernel_tests/cudnn_rnn_ops_test.py
index 945791578ac48bf24db721182e191a78a7643c6c..08ec3076e49696602f729772e8dc3686c281cbaa 100644
--- a/tensorflow/contrib/cudnn_rnn/python/kernel_tests/cudnn_rnn_ops_test.py
+++ b/tensorflow/contrib/cudnn_rnn/python/kernel_tests/cudnn_rnn_ops_test.py
@@ -38,15 +38,25 @@ from tensorflow.python.training import saver as saver_lib
 
 class CudnnRNNTest(TensorFlowTestCase):
 
-  def _CreateModel(self, rnn_mode, num_layers, num_units, input_size):
+  def _CreateModel(self,
+                   rnn_mode,
+                   num_layers,
+                   num_units,
+                   input_size,
+                   input_mode="linear_input",
+                   dropout=0.):
     if rnn_mode == "lstm":
-      model = cudnn_rnn_ops.CudnnLSTM(num_layers, num_units, input_size)
+      model = cudnn_rnn_ops.CudnnLSTM(
+          num_layers, num_units, input_size, dropout=dropout)
     elif rnn_mode == "gru":
-      model = cudnn_rnn_ops.CudnnGRU(num_layers, num_units, input_size)
+      model = cudnn_rnn_ops.CudnnGRU(
+          num_layers, num_units, input_size, dropout=dropout)
     elif rnn_mode == "rnn_tanh":
-      model = cudnn_rnn_ops.CudnnRNNTanh(num_layers, num_units, input_size)
+      model = cudnn_rnn_ops.CudnnRNNTanh(
+          num_layers, num_units, input_size, dropout=dropout)
     elif rnn_mode == "rnn_relu":
-      model = cudnn_rnn_ops.CudnnRNNRelu(num_layers, num_units, input_size)
+      model = cudnn_rnn_ops.CudnnRNNRelu(
+          num_layers, num_units, input_size, dropout=dropout)
     else:
       raise ValueError("Invalid rnn_mode: %s" % rnn_mode)
     return model
@@ -58,9 +68,8 @@ class CudnnRNNTest(TensorFlowTestCase):
       params: a Variable for weight and bias parameters.
       model: a CudnnRNN model.
     """
-    params_saveable = cudnn_rnn_ops.RNNParamsSaveable(model.params_to_canonical,
-                                                      model.canonical_to_params,
-                                                      params)
+    params_saveable = cudnn_rnn_ops.RNNParamsSaveable(
+        model.params_to_canonical, model.canonical_to_params, [params])
     ops.add_to_collection(ops.GraphKeys.SAVEABLE_OBJECTS, params_saveable)
 
   def _testSaveRestoreVariable(self, rnn_mode):
@@ -175,9 +184,12 @@ class CudnnRNNTest(TensorFlowTestCase):
         self._testOneLSTMParamsSize(num_layers, num_units, input_size)
 
   def _testOneSimpleInference(self, rnn_mode, num_layers, num_units, input_size,
-                              batch_size, seq_length, dir_count, expected,
-                              tolerance):
-    model = self._CreateModel(rnn_mode, num_layers, num_units, input_size)
+                              batch_size, seq_length, dir_count, dropout,
+                              expected, tolerance):
+    random_seed.set_random_seed(5678)
+    model = self._CreateModel(rnn_mode, num_layers, num_units, input_size,
+                              input_mode="auto_select",
+                              dropout=dropout)
     has_input_c = (rnn_mode == "lstm")
     params_size_t = model.params_size()
     input_data = array_ops.ones([seq_length, batch_size, input_size])
@@ -207,18 +219,24 @@ class CudnnRNNTest(TensorFlowTestCase):
     with self.test_session(use_gpu=True) as sess:
       sess.run(variables.global_variables_initializer())
       total_sum_v = sess.run([total_sum])
+
       self.assertAllClose(
           total_sum_v[0], expected, atol=tolerance, rtol=tolerance)
 
   @unittest.skipUnless(test.is_built_with_cuda(),
                        "Test only applicable when running on GPUs")
   def testSimpleInference(self):
+    # Cudnn scales result for dropout during training, therefore dropout has no
+    # impact for inference results.
+    # (lstm, gru, rnn_tanh are saturated in the test. rnn_relu case is most
+    # demonstrative of the dropout-invariant nature of CudnnRnn.)
     test_configs = [
-        [
-            "lstm",
-            231833.22,
-            1e-2,
-            {
+        {
+            "rnn_mode": "lstm",
+            "dropout": [0., 0.5, 1.],
+            "expected": 231833.22,
+            "tolerance": 1e-2,
+            "shape": {
                 "num_layers": 4,
                 "num_units": 200,
                 "input_size": 200,
@@ -226,12 +244,13 @@ class CudnnRNNTest(TensorFlowTestCase):
                 "seq_length": 10,
                 "dir_count": 1,
             },
-        ],
-        [
-            "gru",
-            56000,
-            1e-2,
-            {
+        },
+        {
+            "rnn_mode": "gru",
+            "dropout": [0., 0.5, 1.],
+            "expected": 56000,
+            "tolerance": 1e-2,
+            "shape": {
                 "num_layers": 4,
                 "num_units": 200,
                 "input_size": 200,
@@ -239,12 +258,13 @@ class CudnnRNNTest(TensorFlowTestCase):
                 "seq_length": 10,
                 "dir_count": 1,
             },
-        ],
-        [
-            "rnn_tanh",
-            56000,
-            1e-2,
-            {
+        },
+        {
+            "rnn_mode": "rnn_tanh",
+            "dropout": [0., 0.5, 1.],
+            "expected": 56000,
+            "tolerance": 1e-2,
+            "shape": {
                 "num_layers": 4,
                 "num_units": 200,
                 "input_size": 200,
@@ -252,12 +272,13 @@ class CudnnRNNTest(TensorFlowTestCase):
                 "seq_length": 10,
                 "dir_count": 1,
             },
-        ],
-        [
-            "rnn_relu",
-            130688,
-            1e-2,
-            {
+        },
+        {
+            "rnn_mode": "rnn_relu",
+            "dropout": [0., 0.5, 1.],
+            "expected": 130688,
+            "tolerance": 1e-2,
+            "shape": {
                 "num_layers": 2,
                 "num_units": 8,
                 "input_size": 4,
@@ -265,24 +286,32 @@ class CudnnRNNTest(TensorFlowTestCase):
                 "seq_length": 2,
                 "dir_count": 1,
             },
-        ],
+        },
     ]
     with ops.Graph().as_default():
       for config in test_configs:
-        rnn_mode = config[0]
-        expected = config[1]
-        tolerance = config[2]
-        shapes = config[3]
-        self._testOneSimpleInference(rnn_mode, shapes["num_layers"],
-                                     shapes["num_units"], shapes["input_size"],
-                                     shapes["batch_size"], shapes["seq_length"],
-                                     shapes["dir_count"], expected, tolerance)
+        rnn_mode = config["rnn_mode"]
+        dropout_list = config.get("dropout", [0.])
+        expected = config["expected"]
+        tolerance = config["tolerance"]
+        shape = config["shape"]
+        for dropout in dropout_list:
+          self._testOneSimpleInference(
+              rnn_mode, shape["num_layers"], shape["num_units"],
+              shape["input_size"], shape["batch_size"], shape["seq_length"],
+              shape["dir_count"], dropout, expected, tolerance)
 
   def _testOneSimpleTraining(self, rnn_mode, num_layers, num_units, input_size,
-                             batch_size, seq_length, dir_count, tolerance):
+                             batch_size, seq_length, dir_count, dropout,
+                             tolerance):
+    # Gradient checking runs two forward ops with almost the same input. Need to
+    # make sure the drop patterns across the two runs are the same.
+    old_env_state = os.environ.get("TF_CUDNN_RESET_RND_GEN_STATE", str(False))
+    os.environ["TF_CUDNN_RESET_RND_GEN_STATE"] = str(True)
     has_input_c = (rnn_mode == "lstm")
     random_seed.set_random_seed(1234)
-    model = self._CreateModel(rnn_mode, num_layers, num_units, input_size)
+    model = self._CreateModel(rnn_mode, num_layers, num_units, input_size,
+                              dropout=dropout)
     params_size_t = model.params_size()
     input_data = variables.Variable(
         random_ops.random_uniform([seq_length, batch_size, input_size]))
@@ -295,6 +324,7 @@ class CudnnRNNTest(TensorFlowTestCase):
       input_c = variables.Variable(
           random_ops.random_uniform(
               [num_layers * dir_count, batch_size, num_units]))
+
       output, output_h, output_c = model(
           input_data=input_data,
           input_h=input_h,
@@ -323,18 +353,22 @@ class CudnnRNNTest(TensorFlowTestCase):
       sess.run(variables.global_variables_initializer())
       all_inputs = [entry[0] for entry in inputs_and_shapes]
       all_shapes = [entry[1] for entry in inputs_and_shapes]
+
       err = gradient_checker.compute_gradient_error(all_inputs, all_shapes,
                                                     total_sum, [1])
+
       self.assertLess(err, tolerance)
+      os.environ["TF_CUDNN_RESET_RND_GEN_STATE"] = old_env_state
 
   @unittest.skipUnless(test.is_built_with_cuda(),
                        "Test only applicable when running on GPUs")
   def testSimpleTraining(self):
     test_configs = [
-        [
-            "lstm",
-            1e-2,
-            {
+        {
+            "rnn_mode": "lstm",
+            "dropout": [0., 0.5, 1.],
+            "tolerance": 1e-2,
+            "shape": {
                 "num_layers": 2,
                 "num_units": 3,
                 "input_size": 4,
@@ -342,11 +376,12 @@ class CudnnRNNTest(TensorFlowTestCase):
                 "seq_length": 4,
                 "dir_count": 1,
             },
-        ],
-        [
-            "gru",
-            4e-3,
-            {
+        },
+        {
+            "rnn_mode": "gru",
+            "dropout": [0., 0.5, 1.],
+            "tolerance": 4e-3,
+            "shape": {
                 "num_layers": 2,
                 "num_units": 3,
                 "input_size": 4,
@@ -354,11 +389,12 @@ class CudnnRNNTest(TensorFlowTestCase):
                 "seq_length": 4,
                 "dir_count": 1,
             },
-        ],
-        [
-            "rnn_tanh",
-            5e-3,
-            {
+        },
+        {
+            "rnn_mode": "rnn_tanh",
+            "dropout": [0., 0.5, 1.],
+            "tolerance": 5e-3,
+            "shape": {
                 "num_layers": 2,
                 "num_units": 3,
                 "input_size": 4,
@@ -366,11 +402,12 @@ class CudnnRNNTest(TensorFlowTestCase):
                 "seq_length": 4,
                 "dir_count": 1,
             },
-        ],
-        [
-            "rnn_relu",
-            3e-1,
-            {
+        },
+        {
+            "rnn_mode": "rnn_relu",
+            "dropout": [0., 0.5, 1.],
+            "tolerance": 4e-1,
+            "shape": {
                 "num_layers": 2,
                 "num_units": 3,
                 "input_size": 4,
@@ -378,17 +415,19 @@ class CudnnRNNTest(TensorFlowTestCase):
                 "seq_length": 4,
                 "dir_count": 1,
             },
-        ],
+        },
     ]
     with ops.Graph().as_default():
       for config in test_configs:
-        rnn_mode = config[0]
-        tolerance = config[1]
-        shape = config[2]
-        self._testOneSimpleTraining(rnn_mode, shape["num_layers"],
-                                    shape["num_units"], shape["input_size"],
-                                    shape["batch_size"], shape["seq_length"],
-                                    shape["dir_count"], tolerance)
+        rnn_mode = config["rnn_mode"]
+        dropout_list = config.get("dropout", [0.])
+        tolerance = config["tolerance"]
+        shape = config["shape"]
+        for dropout in dropout_list:
+          self._testOneSimpleTraining(rnn_mode, shape["num_layers"],
+                                      shape["num_units"], shape["input_size"],
+                                      shape["batch_size"], shape["seq_length"],
+                                      shape["dir_count"], dropout, tolerance)
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/contrib/cudnn_rnn/python/ops/cudnn_rnn_ops.py b/tensorflow/contrib/cudnn_rnn/python/ops/cudnn_rnn_ops.py
index 9ab337df15c09639dd70215666b6d1ed97cabbc0..4f70b275e8f6fc32df16b7dffcf16cfa551a63dd 100644
--- a/tensorflow/contrib/cudnn_rnn/python/ops/cudnn_rnn_ops.py
+++ b/tensorflow/contrib/cudnn_rnn/python/ops/cudnn_rnn_ops.py
@@ -23,13 +23,13 @@ from tensorflow.contrib.util import loader
 from tensorflow.python.framework import common_shapes
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import random_seed
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import state_ops
 from tensorflow.python.platform import resource_loader
 from tensorflow.python.training import saver
 
-
 _cudnn_rnn_ops_so = loader.load_op_library(
     resource_loader.get_path_to_datafile("_cudnn_rnn_ops.so"))
 
@@ -48,8 +48,8 @@ class RNNParamsSaveable(saver.BaseSaverBuilder.SaveableObject):
   def __init__(self,
                params_to_canonical,
                canonical_to_params,
-               name="params_canonical",
-               *param_variables):
+               param_variables,
+               name="params_canonical"):
     """Creates a RNNParamsSaveable object.
 
        RNNParamsSaveable is saveable/restorable in a checkpoint file and is used
@@ -83,11 +83,11 @@ class RNNParamsSaveable(saver.BaseSaverBuilder.SaveableObject):
           must return a scalar (e.g. in the case of cuDNN) or a tuple. This
           function could be _CudnnRNN.canonical_to_params() or a
           user-defined function.
-      name: the name of the RNNParamsSaveable object.
-      *param_variables: a list of Variables for parameters in a specific form.
+      param_variables: a list of Variables for parameters in a specific form.
           For cuDNN RNN ops, this is a single merged variable for both weights
           and biases; for other RNN ops, this might be multiple unmerged or
           partially merged variables respectively for weights and biases.
+      name: the name of the RNNParamsSaveable object.
     """
     # There is only a single merged parameter variable for cuDNN when saving.
     weights, biases = params_to_canonical(param_variables[0])
@@ -110,12 +110,12 @@ class RNNParamsSaveable(saver.BaseSaverBuilder.SaveableObject):
     if not isinstance(params, tuple):
       params = (params,)
     assign_ops = [
-        state_ops.assign(
-            variable, param, validate_shape=False)
+        state_ops.assign(variable, param, validate_shape=False)
         for variable, param in zip(self._variables, params)
     ]
     return control_flow_ops.group(*assign_ops)
 
+
 _cudnn_rnn_common_doc_string = """
   Cudnn RNN has an opaque parameter buffer that can be used for inference and
   training. But it is possible that the layout of the parameter buffers
@@ -160,11 +160,10 @@ class _CudnnRNN(object):
                num_layers,
                num_units,
                input_size,
-               input_mode="auto_select",
+               input_mode="linear_input",
                direction="unidirectional",
                dropout=0.,
-               seed=0,
-               seed2=0):
+               seed=0):
     """Creates a CudnnRNN model from model spec.
 
     Args:
@@ -175,16 +174,18 @@ class _CudnnRNN(object):
       input_size: the size of the input, it could be different from the
           num_units.
       input_mode: indicate whether there is a linear projection between the
-          input and The actual computation before the first layer. It could be
-          'skip_input', 'linear_input' or 'auto_select'.
+          input and the actual computation before the first layer. It could be
+          'linear_input', 'skip_input' or 'auto_select'.
+          'linear_input' (default) always applies a linear projection of input
+          onto RNN hidden state. (standard RNN behavior).
           'skip_input' is only allowed when input_size == num_units;
           'auto_select' implies 'skip_input' when input_size == num_units;
           otherwise, it implies 'linear_input'.
       direction: the direction model that the model operates. Could be either
           'unidirectional' or 'bidirectional'
       dropout: whether to enable dropout. With it is 0, dropout is disabled.
-      seed: the first part of a seed that is used to initialize dropout.
-      seed2: the second part of a seed that is used to initialize dropout.
+      seed: the op seed used for initializing dropout. See @{tf.set_random_seed}
+          for behavior.
     """
     self._num_layers = num_layers
     self._num_units = num_units
@@ -193,8 +194,10 @@ class _CudnnRNN(object):
     self._input_mode = input_mode
     self._direction = direction
     self._dropout = dropout
-    self._seed = seed
-    self._seed2 = seed2
+    # get graph and op seed.
+    self._seed, self._seed2 = random_seed.get_seed(seed)
+    if self._seed is None and self._seed2 is None:
+      self._seed, self._seed2 = 0, 0
 
   def params_size(self):
     """Calculates the size of the opaque parameter buffer needed for this model.
@@ -208,6 +211,9 @@ class _CudnnRNN(object):
         input_size=self._input_size,
         T=dtypes.float32,
         S=dtypes.int32,
+        dropout=self._dropout,
+        seed=self._seed,
+        seed2=self._seed2,
         rnn_mode=self._rnn_mode,
         input_mode=self._input_mode,
         direction=self._direction)[0]
@@ -258,6 +264,9 @@ class _CudnnRNN(object):
         num_units=self._num_units,
         input_size=self._input_size,
         params=params,
+        dropout=self._dropout,
+        seed=self._seed,
+        seed2=self._seed2,
         num_params=self._num_layers * self._NUM_PARAMS_PER_LAYER,
         rnn_mode=self._rnn_mode,
         input_mode=self._input_mode,
@@ -280,6 +289,9 @@ class _CudnnRNN(object):
         input_size=self._input_size,
         weights=weights,
         biases=biases,
+        dropout=self._dropout,
+        seed=self._seed,
+        seed2=self._seed2,
         rnn_mode=self._rnn_mode,
         input_mode=self._input_mode,
         direction=self._direction)
@@ -299,8 +311,7 @@ class CudnnLSTM(_CudnnRNN):
                input_mode="auto_select",
                direction="unidirectional",
                dropout=0.,
-               seed=0,
-               seed2=0):
+               seed=0):
     """Creates a Cudnn LSTM model from model spec.
 
     Args:
@@ -317,8 +328,7 @@ class CudnnLSTM(_CudnnRNN):
       direction: the direction model that the model operates. Could be either
           'unidirectional' or 'bidirectional'
       dropout: whether to enable dropout. With it is 0, dropout is disabled.
-      seed: the first part of a seed that is used to initialize dropout.
-      seed2: the second part of a seed that is used to initialize dropout.
+      seed: the seed used for initializing dropout.
     """
     super(CudnnLSTM, self).__init__(
         "lstm",
@@ -328,8 +338,7 @@ class CudnnLSTM(_CudnnRNN):
         input_mode=input_mode,
         direction=direction,
         dropout=dropout,
-        seed=seed,
-        seed2=seed2)
+        seed=seed)
 
   def __call__(self, input_data, input_h, input_c, params, is_training=True):
     """Runs the forward step for the Cudnn LSTM model.
@@ -346,11 +355,8 @@ class CudnnLSTM(_CudnnRNN):
       output_h: the final state for h.
       output_c: the final state for c.
     """
-    output, output_h, output_c = super(CudnnLSTM, self).__call__(input_data,
-                                                                 input_h,
-                                                                 input_c,
-                                                                 params,
-                                                                 is_training)
+    output, output_h, output_c = super(CudnnLSTM, self).__call__(
+        input_data, input_h, input_c, params, is_training=is_training)
     return (output, output_h, output_c)
 
 
@@ -365,8 +371,7 @@ class _CudnnRNNNoInputC(_CudnnRNN):
                input_mode="auto_select",
                direction="unidirectional",
                dropout=0.,
-               seed=0,
-               seed2=0):
+               seed=0):
     """Creates a Cudnn RNN model from model without hidden-state C.
 
     Args:
@@ -383,8 +388,7 @@ class _CudnnRNNNoInputC(_CudnnRNN):
       direction: the direction model that the model operates. Could be either
           'unidirectional' or 'bidirectional'
       dropout: whether to enable dropout. With it is 0, dropout is disabled.
-      seed: the first part of a seed that is used to initialize dropout.
-      seed2: the second part of a seed that is used to initialize dropout.
+      seed: the seed used for initializing dropout.
     """
     super(_CudnnRNNNoInputC, self).__init__(
         self._rnn_mode,
@@ -394,8 +398,7 @@ class _CudnnRNNNoInputC(_CudnnRNN):
         input_mode=input_mode,
         direction=direction,
         dropout=dropout,
-        seed=seed,
-        seed2=seed2)
+        seed=seed)
 
   def __call__(self, input_data, input_h, params, is_training=True):
     """Runs the forward step for the Cudnn LSTM model.
@@ -411,7 +414,7 @@ class _CudnnRNNNoInputC(_CudnnRNN):
       output_h: the final state for h.
     """
     output, output_h, _ = super(_CudnnRNNNoInputC, self).__call__(
-        input_data, input_h, None, params, is_training=True)
+        input_data, input_h, None, params, is_training=is_training)
     return (output, output_h)
 
 
@@ -459,6 +462,9 @@ def _cudnn_rnn_backward(op, *grad):
       output_h_backprop=grad[1],
       output_c_backprop=grad[2],
       reserve_space=op.outputs[3],
+      dropout=op.get_attr("dropout"),
+      seed=op.get_attr("seed"),
+      seed2=op.get_attr("seed2"),
       rnn_mode=op.get_attr("rnn_mode"),
       input_mode=op.get_attr("input_mode"),
       direction=op.get_attr("direction"))
diff --git a/tensorflow/contrib/distributions/BUILD b/tensorflow/contrib/distributions/BUILD
index f55859261b3677cba29e8c5a020341a18f94bdac..8dea2763f2946bea9a4b7ef00353b10560fc700c 100644
--- a/tensorflow/contrib/distributions/BUILD
+++ b/tensorflow/contrib/distributions/BUILD
@@ -29,6 +29,7 @@ py_library(
         "//tensorflow/python:nn_ops",
         "//tensorflow/python:random_ops",
         "//tensorflow/python:special_math_ops",
+        "//tensorflow/python/ops/distributions",
         "//third_party/py/numpy",
         "@six_archive//:six",
     ],
@@ -54,6 +55,7 @@ py_library(
         "//tensorflow/python:nn_ops",
         "//tensorflow/python:random_ops",
         "//tensorflow/python:special_math_ops",
+        "//tensorflow/python/ops/distributions",
         "//third_party/py/numpy",
         "@six_archive//:six",
     ],
@@ -191,38 +193,6 @@ cuda_py_test(
     tags = ["notap"],  # http://b/30441813
 )
 
-cuda_py_test(
-    name = "bernoulli_test",
-    size = "small",
-    srcs = ["python/kernel_tests/bernoulli_test.py"],
-    additional_deps = [
-        ":distributions_py",
-        "//third_party/py/numpy",
-        "//tensorflow/python:array_ops",
-        "//tensorflow/python:client_testlib",
-        "//tensorflow/python:framework_for_generated_wrappers",
-        "//tensorflow/python:math_ops",
-        "//tensorflow/python:platform_test",
-    ],
-)
-
-cuda_py_test(
-    name = "beta_test",
-    size = "small",
-    srcs = ["python/kernel_tests/beta_test.py"],
-    additional_deps = [
-        ":distributions_py",
-        "//third_party/py/numpy",
-        "//tensorflow/python:client",
-        "//tensorflow/python:client_testlib",
-        "//tensorflow/python:framework",
-        "//tensorflow/python:framework_for_generated_wrappers",
-        "//tensorflow/python:math_ops",
-        "//tensorflow/python:nn_ops",
-        "//tensorflow/python:platform_test",
-    ],
-)
-
 cuda_py_test(
     name = "binomial_test",
     size = "small",
@@ -236,24 +206,6 @@ cuda_py_test(
     ],
 )
 
-cuda_py_test(
-    name = "categorical_test",
-    size = "small",
-    srcs = ["python/kernel_tests/categorical_test.py"],
-    additional_deps = [
-        ":distributions_py",
-        "//third_party/py/numpy",
-        "//tensorflow/python:array_ops",
-        "//tensorflow/python:client_testlib",
-        "//tensorflow/python:framework",
-        "//tensorflow/python:framework_for_generated_wrappers",
-        "//tensorflow/python:framework_test_lib",
-        "//tensorflow/python:math_ops",
-        "//tensorflow/python:platform_test",
-        "//tensorflow/python:random_ops",
-    ],
-)
-
 cuda_py_test(
     name = "chi2_test",
     srcs = ["python/kernel_tests/chi2_test.py"],
@@ -285,66 +237,6 @@ cuda_py_test(
     ],
 )
 
-cuda_py_test(
-    name = "dirichlet_test",
-    size = "small",
-    srcs = ["python/kernel_tests/dirichlet_test.py"],
-    additional_deps = [
-        ":distributions_py",
-        "//third_party/py/numpy",
-        "//tensorflow/python:client_testlib",
-        "//tensorflow/python:framework_for_generated_wrappers",
-        "//tensorflow/python:framework_test_lib",
-        "//tensorflow/python:platform_test",
-    ],
-)
-
-cuda_py_test(
-    name = "dirichlet_multinomial_test",
-    size = "medium",
-    srcs = ["python/kernel_tests/dirichlet_multinomial_test.py"],
-    additional_deps = [
-        ":distributions_py",
-        "//third_party/py/numpy",
-        "//tensorflow/python:array_ops",
-        "//tensorflow/python:client_testlib",
-        "//tensorflow/python:framework_for_generated_wrappers",
-        "//tensorflow/python:framework_test_lib",
-        "//tensorflow/python:math_ops",
-        "//tensorflow/python:platform_test",
-    ],
-)
-
-cuda_py_test(
-    name = "exponential_test",
-    srcs = ["python/kernel_tests/exponential_test.py"],
-    additional_deps = [
-        ":distributions_py",
-        "//third_party/py/numpy",
-        "//tensorflow/python:client",
-        "//tensorflow/python:client_testlib",
-        "//tensorflow/python:framework_for_generated_wrappers",
-        "//tensorflow/python:framework_test_lib",
-        "//tensorflow/python:nn_ops",
-        "//tensorflow/python:platform_test",
-    ],
-)
-
-cuda_py_test(
-    name = "gamma_test",
-    srcs = ["python/kernel_tests/gamma_test.py"],
-    additional_deps = [
-        ":distributions_py",
-        "//third_party/py/numpy",
-        "//tensorflow/python:client",
-        "//tensorflow/python:client_testlib",
-        "//tensorflow/python:framework_for_generated_wrappers",
-        "//tensorflow/python:framework_test_lib",
-        "//tensorflow/python:nn_ops",
-        "//tensorflow/python:platform_test",
-    ],
-)
-
 cuda_py_test(
     name = "geometric_test",
     size = "small",
@@ -377,23 +269,9 @@ cuda_py_test(
 )
 
 cuda_py_test(
-    name = "laplace_test",
-    srcs = ["python/kernel_tests/laplace_test.py"],
-    additional_deps = [
-        ":distributions_py",
-        "//third_party/py/numpy",
-        "//tensorflow/python:client",
-        "//tensorflow/python:client_testlib",
-        "//tensorflow/python:framework_for_generated_wrappers",
-        "//tensorflow/python:framework_test_lib",
-        "//tensorflow/python:nn_ops",
-        "//tensorflow/python:platform_test",
-    ],
-)
-
-cuda_py_test(
-    name = "multinomial_test",
-    srcs = ["python/kernel_tests/multinomial_test.py"],
+    name = "mvn_diag_test",
+    size = "small",
+    srcs = ["python/kernel_tests/mvn_diag_test.py"],
     additional_deps = [
         ":distributions_py",
         "//third_party/py/numpy",
@@ -402,14 +280,15 @@ cuda_py_test(
         "//tensorflow/python:framework_for_generated_wrappers",
         "//tensorflow/python:framework_test_lib",
         "//tensorflow/python:math_ops",
+        "//tensorflow/python:nn_ops",
         "//tensorflow/python:platform_test",
     ],
 )
 
 cuda_py_test(
-    name = "mvn_diag_test",
-    size = "small",
-    srcs = ["python/kernel_tests/mvn_diag_test.py"],
+    name = "mvn_diag_plus_low_rank_test",
+    size = "medium",
+    srcs = ["python/kernel_tests/mvn_diag_plus_low_rank_test.py"],
     additional_deps = [
         ":distributions_py",
         "//third_party/py/numpy",
@@ -424,9 +303,9 @@ cuda_py_test(
 )
 
 cuda_py_test(
-    name = "mvn_diag_plus_low_rank_test",
-    size = "medium",
-    srcs = ["python/kernel_tests/mvn_diag_plus_low_rank_test.py"],
+    name = "mvn_full_covariance_test",
+    size = "small",
+    srcs = ["python/kernel_tests/mvn_full_covariance_test.py"],
     additional_deps = [
         ":distributions_py",
         "//third_party/py/numpy",
@@ -494,24 +373,6 @@ cuda_py_test(
     ],
 )
 
-cuda_py_test(
-    name = "normal_test",
-    size = "medium",
-    srcs = ["python/kernel_tests/normal_test.py"],
-    additional_deps = [
-        ":distributions_py",
-        "//third_party/py/numpy",
-        "//tensorflow/python:array_ops",
-        "//tensorflow/python:client_testlib",
-        "//tensorflow/python:framework_for_generated_wrappers",
-        "//tensorflow/python:framework_test_lib",
-        "//tensorflow/python:gradients",
-        "//tensorflow/python:nn_ops",
-        "//tensorflow/python:platform_test",
-        "//tensorflow/python:variables",
-    ],
-)
-
 cuda_py_test(
     name = "poisson_test",
     size = "small",
@@ -545,21 +406,19 @@ cuda_py_test(
 )
 
 cuda_py_test(
-    name = "student_t_test",
-    size = "small",
-    srcs = ["python/kernel_tests/student_t_test.py"],
+    name = "vector_laplace_diag_test",
+    size = "medium",
+    srcs = ["python/kernel_tests/vector_laplace_diag_test.py"],
     additional_deps = [
         ":distributions_py",
         "//third_party/py/numpy",
+        "//tensorflow/python:array_ops",
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:framework",
         "//tensorflow/python:framework_for_generated_wrappers",
         "//tensorflow/python:framework_test_lib",
-        "//tensorflow/python:math_ops",
-        "//tensorflow/python:nn_ops",
         "//tensorflow/python:platform_test",
     ],
-    tags = ["nomsan"],  # disable to avoid false positives from scipy.
 )
 
 cuda_py_test(
@@ -578,22 +437,6 @@ cuda_py_test(
     ],
 )
 
-cuda_py_test(
-    name = "uniform_test",
-    size = "small",
-    srcs = ["python/kernel_tests/uniform_test.py"],
-    additional_deps = [
-        ":distributions_py",
-        "//third_party/py/numpy",
-        "//tensorflow/python:array_ops",
-        "//tensorflow/python:client_testlib",
-        "//tensorflow/python:errors",
-        "//tensorflow/python:framework_for_generated_wrappers",
-        "//tensorflow/python:framework_test_lib",
-        "//tensorflow/python:math_ops",
-    ],
-)
-
 cuda_py_test(
     name = "wishart_test",
     size = "small",
@@ -612,18 +455,6 @@ cuda_py_test(
     ],
 )
 
-cuda_py_test(
-    name = "kullback_leibler_test",
-    size = "small",
-    srcs = ["python/kernel_tests/kullback_leibler_test.py"],
-    additional_deps = [
-        ":distributions_py",
-        "//tensorflow/python:array_ops",
-        "//tensorflow/python:client_testlib",
-        "//tensorflow/python:platform_test",
-    ],
-)
-
 cuda_py_test(
     name = "normal_conjugate_posteriors_test",
     size = "small",
@@ -749,22 +580,6 @@ cuda_py_test(
     tags = ["no_pip"],
 )
 
-cuda_py_test(
-    name = "special_math_test",
-    size = "medium",
-    srcs = ["python/kernel_tests/special_math_test.py"],
-    additional_deps = [
-        ":distributions_py",
-        "//third_party/py/numpy",
-        "//tensorflow/python:client_testlib",
-        "//tensorflow/python:framework_for_generated_wrappers",
-        "//tensorflow/python:framework_test_lib",
-        "//tensorflow/python:gradients",
-        "//tensorflow/python:platform_test",
-        "//tensorflow/python:variables",
-    ],
-)
-
 cuda_py_test(
     name = "distribution_util_test",
     size = "small",
@@ -813,25 +628,6 @@ filegroup(
 
 # === Bijector Tests ==========================================================
 
-cuda_py_test(
-    name = "bijector_test",
-    size = "small",
-    srcs = ["python/kernel_tests/bijectors/bijector_test.py"],
-    additional_deps = [
-        ":bijectors_py",
-        ":distributions_py",
-        "//third_party/py/numpy",
-        "@six_archive//:six",
-        "//tensorflow/contrib/linalg:linalg_py",
-        "//tensorflow/python:array_ops",
-        "//tensorflow/python:client_testlib",
-        "//tensorflow/python:framework_for_generated_wrappers",
-        "//tensorflow/python:framework_test_lib",
-        "//tensorflow/python:math_ops",
-        "//tensorflow/python:platform_test",
-    ],
-)
-
 cuda_py_test(
     name = "conditional_bijector_test",
     size = "small",
@@ -947,25 +743,6 @@ cuda_py_test(
     ],
 )
 
-cuda_py_test(
-    name = "identity_test",
-    size = "small",
-    srcs = ["python/kernel_tests/bijectors/identity_test.py"],
-    additional_deps = [
-        ":bijectors_py",
-        ":distributions_py",
-        "//third_party/py/numpy",
-        "@six_archive//:six",
-        "//tensorflow/contrib/linalg:linalg_py",
-        "//tensorflow/python:array_ops",
-        "//tensorflow/python:client_testlib",
-        "//tensorflow/python:framework_for_generated_wrappers",
-        "//tensorflow/python:framework_test_lib",
-        "//tensorflow/python:math_ops",
-        "//tensorflow/python:platform_test",
-    ],
-)
-
 cuda_py_test(
     name = "inline_test",
     size = "small",
diff --git a/tensorflow/contrib/distributions/__init__.py b/tensorflow/contrib/distributions/__init__.py
index 257aefa8572351f00ddcb77d2a49c4d79d660826..1fddad53689a0d74d00c1f210d81b83975fb1d37 100644
--- a/tensorflow/contrib/distributions/__init__.py
+++ b/tensorflow/contrib/distributions/__init__.py
@@ -15,74 +15,6 @@
 """Classes representing statistical distributions and ops for working with them.
 
 See the @{$python/contrib.distributions} guide.
-
-## Distribution Object
-@@ReparameterizationType
-@@Distribution
-
-## Individual Distributions
-@@Binomial
-@@Bernoulli
-@@BernoulliWithSigmoidProbs
-@@Beta
-@@BetaWithSoftplusConcentration
-@@Categorical
-@@Chi2
-@@Chi2WithAbsDf
-@@Deterministic
-@@VectorDeterministic
-@@Exponential
-@@ExponentialWithSoftplusRate
-@@Gamma
-@@GammaWithSoftplusConcentrationRate
-@@Geometric
-@@InverseGamma
-@@InverseGammaWithSoftplusConcentrationRate
-@@Laplace
-@@LaplaceWithSoftplusScale
-@@Logistic
-@@NegativeBinomial
-@@Normal
-@@NormalWithSoftplusScale
-@@Poisson
-@@StudentT
-@@StudentTWithAbsDfSoftplusScale
-@@Uniform
-
-@@MultivariateNormalDiag
-@@MultivariateNormalTriL
-@@MultivariateNormalDiagPlusLowRank
-@@MultivariateNormalDiagWithSoftplusScale
-
-@@Dirichlet
-@@DirichletMultinomial
-@@Multinomial
-@@WishartCholesky
-@@WishartFull
-
-@@TransformedDistribution
-@@QuantizedDistribution
-
-@@Mixture
-
-@@ExpRelaxedOneHotCategorical
-@@OneHotCategorical
-@@RelaxedBernoulli
-@@RelaxedOneHotCategorical
-
-## Kullback-Leibler Divergence
-@@kl
-@@RegisterKL
-
-## Helper Functions
-@@matrix_diag_transform
-@@normal_conjugates_known_scale_posterior
-@@normal_conjugates_known_scale_predictive
-@@softplus_inverse
-
-## Functions for statistics of samples
-@@percentile
-
 """
 from __future__ import absolute_import
 from __future__ import division
@@ -91,33 +23,22 @@ from __future__ import print_function
 # pylint: disable=unused-import,wildcard-import,line-too-long,g-importing-member
 
 from tensorflow.contrib.distributions.python.ops import bijectors
-from tensorflow.contrib.distributions.python.ops.bernoulli import *
-from tensorflow.contrib.distributions.python.ops.beta import *
 from tensorflow.contrib.distributions.python.ops.binomial import *
-from tensorflow.contrib.distributions.python.ops.categorical import *
 from tensorflow.contrib.distributions.python.ops.chi2 import *
 from tensorflow.contrib.distributions.python.ops.conditional_distribution import *
 from tensorflow.contrib.distributions.python.ops.conditional_transformed_distribution import *
 from tensorflow.contrib.distributions.python.ops.deterministic import *
-from tensorflow.contrib.distributions.python.ops.dirichlet import *
-from tensorflow.contrib.distributions.python.ops.dirichlet_multinomial import *
-from tensorflow.contrib.distributions.python.ops.distribution import *
 from tensorflow.contrib.distributions.python.ops.distribution_util import matrix_diag_transform
 from tensorflow.contrib.distributions.python.ops.distribution_util import softplus_inverse
-from tensorflow.contrib.distributions.python.ops.exponential import *
-from tensorflow.contrib.distributions.python.ops.gamma import *
 from tensorflow.contrib.distributions.python.ops.geometric import *
 from tensorflow.contrib.distributions.python.ops.inverse_gamma import *
-from tensorflow.contrib.distributions.python.ops.kullback_leibler import *
-from tensorflow.contrib.distributions.python.ops.laplace import *
 from tensorflow.contrib.distributions.python.ops.logistic import *
 from tensorflow.contrib.distributions.python.ops.mixture import *
-from tensorflow.contrib.distributions.python.ops.multinomial import *
 from tensorflow.contrib.distributions.python.ops.mvn_diag import *
 from tensorflow.contrib.distributions.python.ops.mvn_diag_plus_low_rank import *
+from tensorflow.contrib.distributions.python.ops.mvn_full_covariance import *
 from tensorflow.contrib.distributions.python.ops.mvn_tril import *
 from tensorflow.contrib.distributions.python.ops.negative_binomial import *
-from tensorflow.contrib.distributions.python.ops.normal import *
 from tensorflow.contrib.distributions.python.ops.normal_conjugate_posteriors import *
 from tensorflow.contrib.distributions.python.ops.onehot_categorical import *
 from tensorflow.contrib.distributions.python.ops.poisson import *
@@ -125,10 +46,23 @@ from tensorflow.contrib.distributions.python.ops.quantized_distribution import *
 from tensorflow.contrib.distributions.python.ops.relaxed_bernoulli import *
 from tensorflow.contrib.distributions.python.ops.relaxed_onehot_categorical import *
 from tensorflow.contrib.distributions.python.ops.sample_stats import *
-from tensorflow.contrib.distributions.python.ops.student_t import *
-from tensorflow.contrib.distributions.python.ops.transformed_distribution import *
-from tensorflow.contrib.distributions.python.ops.uniform import *
+from tensorflow.contrib.distributions.python.ops.vector_laplace_diag import *
 from tensorflow.contrib.distributions.python.ops.wishart import *
+from tensorflow.python.ops.distributions.bernoulli import *
+from tensorflow.python.ops.distributions.beta import *
+from tensorflow.python.ops.distributions.categorical import *
+from tensorflow.python.ops.distributions.dirichlet import *
+from tensorflow.python.ops.distributions.dirichlet_multinomial import *
+from tensorflow.python.ops.distributions.distribution import *
+from tensorflow.python.ops.distributions.exponential import *
+from tensorflow.python.ops.distributions.gamma import *
+from tensorflow.python.ops.distributions.kullback_leibler import *
+from tensorflow.python.ops.distributions.laplace import *
+from tensorflow.python.ops.distributions.multinomial import *
+from tensorflow.python.ops.distributions.normal import *
+from tensorflow.python.ops.distributions.student_t import *
+from tensorflow.python.ops.distributions.transformed_distribution import *
+from tensorflow.python.ops.distributions.uniform import *
 
 # pylint: enable=unused-import,wildcard-import,line-too-long,g-importing-member
 
@@ -140,6 +74,73 @@ _allowed_symbols = [
     'ConditionalTransformedDistribution',
     'FULLY_REPARAMETERIZED',
     'NOT_REPARAMETERIZED',
+    'Affine',
+    'AffineLinearOperator',
+    'Bijector',
+    'Chain',
+    'CholeskyOuterProduct',
+    'Exp',
+    'Identity',
+    'Inline',
+    'Invert',
+    'PowerTransform',
+    'SigmoidCentered',
+    'SoftmaxCentered',
+    'Softplus',
+    'ReparameterizationType',
+    'Distribution',
+    'Binomial',
+    'Bernoulli',
+    'BernoulliWithSigmoidProbs',
+    'Beta',
+    'BetaWithSoftplusConcentration',
+    'Categorical',
+    'Chi2',
+    'Chi2WithAbsDf',
+    'Deterministic',
+    'VectorDeterministic',
+    'Exponential',
+    'ExponentialWithSoftplusRate',
+    'Gamma',
+    'GammaWithSoftplusConcentrationRate',
+    'Geometric',
+    'InverseGamma',
+    'InverseGammaWithSoftplusConcentrationRate',
+    'Laplace',
+    'LaplaceWithSoftplusScale',
+    'Logistic',
+    'NegativeBinomial',
+    'Normal',
+    'NormalWithSoftplusScale',
+    'Poisson',
+    'StudentT',
+    'StudentTWithAbsDfSoftplusScale',
+    'Uniform',
+    'MultivariateNormalDiag',
+    'MultivariateNormalFullCovariance',
+    'MultivariateNormalTriL',
+    'MultivariateNormalDiagPlusLowRank',
+    'MultivariateNormalDiagWithSoftplusScale',
+    'Dirichlet',
+    'DirichletMultinomial',
+    'Multinomial',
+    'VectorLaplaceDiag',
+    'WishartCholesky',
+    'WishartFull',
+    'TransformedDistribution',
+    'QuantizedDistribution',
+    'Mixture',
+    'ExpRelaxedOneHotCategorical',
+    'OneHotCategorical',
+    'RelaxedBernoulli',
+    'RelaxedOneHotCategorical',
+    'kl_divergence',
+    'RegisterKL',
+    'matrix_diag_transform',
+    'normal_conjugates_known_scale_posterior',
+    'normal_conjugates_known_scale_predictive',
+    'softplus_inverse',
+    'percentile'
 ]
 
 remove_undocumented(__name__, _allowed_symbols)
diff --git a/tensorflow/contrib/distributions/python/kernel_tests/bijectors/affine_test.py b/tensorflow/contrib/distributions/python/kernel_tests/bijectors/affine_test.py
index 13554f7664279f2c8a208b5098f515ec252c2ab6..e8fd6aa2f73fa3457333483111379f0d987801ff 100644
--- a/tensorflow/contrib/distributions/python/kernel_tests/bijectors/affine_test.py
+++ b/tensorflow/contrib/distributions/python/kernel_tests/bijectors/affine_test.py
@@ -23,9 +23,9 @@ import itertools
 import numpy as np
 
 from tensorflow.contrib.distributions.python.ops.bijectors.affine import Affine
-from tensorflow.contrib.distributions.python.ops.bijectors.bijector_test_util import assert_scalar_congruency
 from tensorflow.python.framework import dtypes
 from tensorflow.python.ops import array_ops
+from tensorflow.python.ops.distributions.bijector_test_util import assert_scalar_congruency
 from tensorflow.python.platform import test
 
 
diff --git a/tensorflow/contrib/distributions/python/kernel_tests/bijectors/chain_test.py b/tensorflow/contrib/distributions/python/kernel_tests/bijectors/chain_test.py
index 994e21dd487468269b2deae7632c56fed97664db..20e754308449af3f0399101f4ea1bb47b3356424 100644
--- a/tensorflow/contrib/distributions/python/kernel_tests/bijectors/chain_test.py
+++ b/tensorflow/contrib/distributions/python/kernel_tests/bijectors/chain_test.py
@@ -20,12 +20,12 @@ from __future__ import print_function
 
 import numpy as np
 
-from tensorflow.contrib.distributions.python.ops.bijectors.bijector_test_util import assert_scalar_congruency
 from tensorflow.contrib.distributions.python.ops.bijectors.chain import Chain
 from tensorflow.contrib.distributions.python.ops.bijectors.exp import Exp
 from tensorflow.contrib.distributions.python.ops.bijectors.softmax_centered import SoftmaxCentered
 from tensorflow.contrib.distributions.python.ops.bijectors.softplus import Softplus
 from tensorflow.python.framework import tensor_shape
+from tensorflow.python.ops.distributions.bijector_test_util import assert_scalar_congruency
 from tensorflow.python.platform import test
 
 
diff --git a/tensorflow/contrib/distributions/python/kernel_tests/bijectors/cholesky_outer_product_test.py b/tensorflow/contrib/distributions/python/kernel_tests/bijectors/cholesky_outer_product_test.py
index 267e4ad3509d6760e02ec420eef7f075b24a6390..0ff35304283fce9ce3f9e5d31b1258394e384d7b 100644
--- a/tensorflow/contrib/distributions/python/kernel_tests/bijectors/cholesky_outer_product_test.py
+++ b/tensorflow/contrib/distributions/python/kernel_tests/bijectors/cholesky_outer_product_test.py
@@ -19,11 +19,11 @@ from __future__ import division
 from __future__ import print_function
 
 from tensorflow.contrib.distributions.python.ops import bijectors
-from tensorflow.contrib.distributions.python.ops import gamma as gamma_lib
-from tensorflow.contrib.distributions.python.ops import transformed_distribution as transformed_distribution_lib
-from tensorflow.contrib.distributions.python.ops.bijectors.bijector_test_util import assert_scalar_congruency
 from tensorflow.python.framework import tensor_shape
 from tensorflow.python.ops import array_ops
+from tensorflow.python.ops.distributions import gamma as gamma_lib
+from tensorflow.python.ops.distributions import transformed_distribution as transformed_distribution_lib
+from tensorflow.python.ops.distributions.bijector_test_util import assert_scalar_congruency
 from tensorflow.python.platform import test
 
 
diff --git a/tensorflow/contrib/distributions/python/kernel_tests/bijectors/exp_test.py b/tensorflow/contrib/distributions/python/kernel_tests/bijectors/exp_test.py
index c30ce60cacc53321fd4c765afacfd6232b4c904f..9970c0b4d86afda188d9401ebaf3c98d3fffbfdf 100644
--- a/tensorflow/contrib/distributions/python/kernel_tests/bijectors/exp_test.py
+++ b/tensorflow/contrib/distributions/python/kernel_tests/bijectors/exp_test.py
@@ -20,9 +20,9 @@ from __future__ import print_function
 
 import numpy as np
 
-from tensorflow.contrib.distributions.python.ops.bijectors.bijector_test_util import assert_bijective_and_finite
-from tensorflow.contrib.distributions.python.ops.bijectors.bijector_test_util import assert_scalar_congruency
 from tensorflow.contrib.distributions.python.ops.bijectors.exp import Exp
+from tensorflow.python.ops.distributions.bijector_test_util import assert_bijective_and_finite
+from tensorflow.python.ops.distributions.bijector_test_util import assert_scalar_congruency
 from tensorflow.python.platform import test
 
 
diff --git a/tensorflow/contrib/distributions/python/kernel_tests/bijectors/invert_test.py b/tensorflow/contrib/distributions/python/kernel_tests/bijectors/invert_test.py
index 267e4ad3509d6760e02ec420eef7f075b24a6390..0ff35304283fce9ce3f9e5d31b1258394e384d7b 100644
--- a/tensorflow/contrib/distributions/python/kernel_tests/bijectors/invert_test.py
+++ b/tensorflow/contrib/distributions/python/kernel_tests/bijectors/invert_test.py
@@ -19,11 +19,11 @@ from __future__ import division
 from __future__ import print_function
 
 from tensorflow.contrib.distributions.python.ops import bijectors
-from tensorflow.contrib.distributions.python.ops import gamma as gamma_lib
-from tensorflow.contrib.distributions.python.ops import transformed_distribution as transformed_distribution_lib
-from tensorflow.contrib.distributions.python.ops.bijectors.bijector_test_util import assert_scalar_congruency
 from tensorflow.python.framework import tensor_shape
 from tensorflow.python.ops import array_ops
+from tensorflow.python.ops.distributions import gamma as gamma_lib
+from tensorflow.python.ops.distributions import transformed_distribution as transformed_distribution_lib
+from tensorflow.python.ops.distributions.bijector_test_util import assert_scalar_congruency
 from tensorflow.python.platform import test
 
 
diff --git a/tensorflow/contrib/distributions/python/kernel_tests/bijectors/power_transform_test.py b/tensorflow/contrib/distributions/python/kernel_tests/bijectors/power_transform_test.py
index b30a3b599bba5ab1207ff3d16178e49a80209cee..de1659aa9f4d0f7d19ec2e8185715573b78eaf2b 100644
--- a/tensorflow/contrib/distributions/python/kernel_tests/bijectors/power_transform_test.py
+++ b/tensorflow/contrib/distributions/python/kernel_tests/bijectors/power_transform_test.py
@@ -20,9 +20,9 @@ from __future__ import print_function
 
 import numpy as np
 
-from tensorflow.contrib.distributions.python.ops.bijectors.bijector_test_util import assert_bijective_and_finite
-from tensorflow.contrib.distributions.python.ops.bijectors.bijector_test_util import assert_scalar_congruency
 from tensorflow.contrib.distributions.python.ops.bijectors.power_transform import PowerTransform
+from tensorflow.python.ops.distributions.bijector_test_util import assert_bijective_and_finite
+from tensorflow.python.ops.distributions.bijector_test_util import assert_scalar_congruency
 from tensorflow.python.platform import test
 
 
diff --git a/tensorflow/contrib/distributions/python/kernel_tests/bijectors/sigmoid_test.py b/tensorflow/contrib/distributions/python/kernel_tests/bijectors/sigmoid_test.py
index 6f1a6b1cf4b2a8608dda50137ae21d5a6deae606..e4f9d72785c301284812a48c0a67614ca439ffae 100644
--- a/tensorflow/contrib/distributions/python/kernel_tests/bijectors/sigmoid_test.py
+++ b/tensorflow/contrib/distributions/python/kernel_tests/bijectors/sigmoid_test.py
@@ -21,9 +21,9 @@ from __future__ import print_function
 import numpy as np
 from scipy import special
 
-from tensorflow.contrib.distributions.python.ops.bijectors.bijector_test_util import assert_bijective_and_finite
-from tensorflow.contrib.distributions.python.ops.bijectors.bijector_test_util import assert_scalar_congruency
 from tensorflow.contrib.distributions.python.ops.bijectors.sigmoid import Sigmoid
+from tensorflow.python.ops.distributions.bijector_test_util import assert_bijective_and_finite
+from tensorflow.python.ops.distributions.bijector_test_util import assert_scalar_congruency
 from tensorflow.python.platform import test
 
 
diff --git a/tensorflow/contrib/distributions/python/kernel_tests/bijectors/softmax_centered_test.py b/tensorflow/contrib/distributions/python/kernel_tests/bijectors/softmax_centered_test.py
index 173d52686d62c0e777804fb1f87515918bf16111..62e3869db090e9c9327bc552d10234ff76ba28fd 100644
--- a/tensorflow/contrib/distributions/python/kernel_tests/bijectors/softmax_centered_test.py
+++ b/tensorflow/contrib/distributions/python/kernel_tests/bijectors/softmax_centered_test.py
@@ -20,9 +20,9 @@ from __future__ import print_function
 
 import numpy as np
 
-from tensorflow.contrib.distributions.python.ops.bijectors.bijector_test_util import assert_bijective_and_finite
 from tensorflow.contrib.distributions.python.ops.bijectors.softmax_centered import SoftmaxCentered
 from tensorflow.python.framework import tensor_shape
+from tensorflow.python.ops.distributions.bijector_test_util import assert_bijective_and_finite
 from tensorflow.python.platform import test
 
 
diff --git a/tensorflow/contrib/distributions/python/kernel_tests/bijectors/softplus_test.py b/tensorflow/contrib/distributions/python/kernel_tests/bijectors/softplus_test.py
index 2c58519fdae8cc22a460caf6ca64b15dddada05e..d9af9aec50d3d69bb10f69f2ffd6ca3a24c316f8 100644
--- a/tensorflow/contrib/distributions/python/kernel_tests/bijectors/softplus_test.py
+++ b/tensorflow/contrib/distributions/python/kernel_tests/bijectors/softplus_test.py
@@ -20,9 +20,9 @@ from __future__ import print_function
 
 import numpy as np
 
-from tensorflow.contrib.distributions.python.ops.bijectors.bijector_test_util import assert_bijective_and_finite
-from tensorflow.contrib.distributions.python.ops.bijectors.bijector_test_util import assert_scalar_congruency
 from tensorflow.contrib.distributions.python.ops.bijectors.softplus import Softplus
+from tensorflow.python.ops.distributions.bijector_test_util import assert_bijective_and_finite
+from tensorflow.python.ops.distributions.bijector_test_util import assert_scalar_congruency
 from tensorflow.python.platform import test
 
 rng = np.random.RandomState(42)
@@ -41,6 +41,12 @@ class SoftplusBijectorTest(test.TestCase):
     """Inverse log det jacobian, before being reduced."""
     return -np.log(1 - np.exp(-y))
 
+  def testHingeSoftnessZeroRaises(self):
+    with self.test_session():
+      bijector = Softplus(event_ndims=0, hinge_softness=0., validate_args=True)
+      with self.assertRaisesOpError("must be non-zero"):
+        bijector.forward([1., 1.]).eval()
+
   def testBijectorForwardInverseEventDimsZero(self):
     with self.test_session():
       bijector = Softplus(event_ndims=0)
@@ -51,6 +57,15 @@ class SoftplusBijectorTest(test.TestCase):
       self.assertAllClose(y, bijector.forward(x).eval())
       self.assertAllClose(x, bijector.inverse(y).eval())
 
+  def testBijectorForwardInverseWithHingeSoftnessEventDimsZero(self):
+    with self.test_session():
+      bijector = Softplus(event_ndims=0, hinge_softness=1.5)
+      x = 2 * rng.randn(2, 10)
+      y = 1.5 * self._softplus(x / 1.5)
+
+      self.assertAllClose(y, bijector.forward(x).eval())
+      self.assertAllClose(x, bijector.inverse(y).eval())
+
   def testBijectorLogDetJacobianEventDimsZero(self):
     with self.test_session():
       bijector = Softplus(event_ndims=0)
@@ -85,6 +100,18 @@ class SoftplusBijectorTest(test.TestCase):
       assert_scalar_congruency(
           bijector, lower_x=-2., upper_x=2.)
 
+  def testScalarCongruencyWithPositiveHingeSoftness(self):
+    with self.test_session():
+      bijector = Softplus(event_ndims=0, hinge_softness=1.3)
+      assert_scalar_congruency(
+          bijector, lower_x=-2., upper_x=2.)
+
+  def testScalarCongruencyWithNegativeHingeSoftness(self):
+    with self.test_session():
+      bijector = Softplus(event_ndims=0, hinge_softness=-1.3)
+      assert_scalar_congruency(
+          bijector, lower_x=-2., upper_x=2.)
+
   def testBijectiveAndFinite32bit(self):
     with self.test_session():
       bijector = Softplus(event_ndims=0)
@@ -93,6 +120,22 @@ class SoftplusBijectorTest(test.TestCase):
       assert_bijective_and_finite(
           bijector, x, y, rtol=1e-2, atol=1e-2)
 
+  def testBijectiveAndFiniteWithPositiveHingeSoftness32Bit(self):
+    with self.test_session():
+      bijector = Softplus(event_ndims=0, hinge_softness=1.23)
+      x = np.linspace(-20., 20., 100).astype(np.float32)
+      y = np.logspace(-10, 10, 100).astype(np.float32)
+      assert_bijective_and_finite(
+          bijector, x, y, rtol=1e-2, atol=1e-2)
+
+  def testBijectiveAndFiniteWithNegativeHingeSoftness32Bit(self):
+    with self.test_session():
+      bijector = Softplus(event_ndims=0, hinge_softness=-0.7)
+      x = np.linspace(-20., 20., 100).astype(np.float32)
+      y = -np.logspace(-10, 10, 100).astype(np.float32)
+      assert_bijective_and_finite(
+          bijector, x, y, rtol=1e-2, atol=1e-2)
+
   def testBijectiveAndFinite16bit(self):
     with self.test_session():
       bijector = Softplus(event_ndims=0)
diff --git a/tensorflow/contrib/distributions/python/kernel_tests/distribution_util_test.py b/tensorflow/contrib/distributions/python/kernel_tests/distribution_util_test.py
index 2b28392d35d25617a91d36035330e4778dab8ab0..58368d92c4efc14c7573afcda112f7065a8da8fc 100644
--- a/tensorflow/contrib/distributions/python/kernel_tests/distribution_util_test.py
+++ b/tensorflow/contrib/distributions/python/kernel_tests/distribution_util_test.py
@@ -24,9 +24,11 @@ import numpy as np
 from scipy import special
 
 from tensorflow.contrib.distributions.python.ops import distribution_util
+from tensorflow.contrib.linalg.python.ops import linear_operator_diag
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import tensor_shape
 from tensorflow.python.framework import tensor_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import gradient_checker
@@ -128,6 +130,80 @@ class AssertCloseTest(test.TestCase):
           array_ops.identity(w).eval(feed_dict=feed_dict)
 
 
+class ShapesFromLocAndScaleTest(test.TestCase):
+
+  def test_static_loc_static_scale_non_matching_event_size_raises(self):
+    loc = constant_op.constant(np.zeros((2, 4)))
+    scale = linear_operator_diag.LinearOperatorDiag(np.ones((5, 1, 3)))
+    with self.assertRaisesRegexp(ValueError, "could not be broadcast"):
+      distribution_util.shapes_from_loc_and_scale(loc, scale)
+
+  def test_static_loc_static_scale(self):
+    loc = constant_op.constant(np.zeros((2, 3)))
+    scale = linear_operator_diag.LinearOperatorDiag(np.ones((5, 1, 3)))
+    batch_shape, event_shape = distribution_util.shapes_from_loc_and_scale(
+        loc, scale)
+
+    self.assertEqual(tensor_shape.TensorShape([5, 2]), batch_shape)
+    self.assertEqual(tensor_shape.TensorShape([3]), event_shape)
+
+  def test_static_loc_dynamic_scale(self):
+    loc = constant_op.constant(np.zeros((2, 3)))
+    diag = array_ops.placeholder(dtypes.float64)
+    scale = linear_operator_diag.LinearOperatorDiag(diag)
+    with self.test_session() as sess:
+      batch_shape, event_shape = sess.run(
+          distribution_util.shapes_from_loc_and_scale(loc, scale),
+          feed_dict={diag: np.ones((5, 1, 3))})
+      self.assertAllEqual([5, 2], batch_shape)
+      self.assertAllEqual([3], event_shape)
+
+  def test_dynamic_loc_static_scale(self):
+    loc = array_ops.placeholder(dtypes.float64)
+    diag = constant_op.constant(np.ones((5, 2, 3)))
+    scale = linear_operator_diag.LinearOperatorDiag(diag)
+    with self.test_session():
+      batch_shape, event_shape = distribution_util.shapes_from_loc_and_scale(
+          loc, scale)
+      # batch_shape depends on both args, and so is dynamic.  Since loc did not
+      # have static shape, we infered event shape entirely from scale, and this
+      # is available statically.
+      self.assertAllEqual(
+          [5, 2], batch_shape.eval(feed_dict={loc: np.zeros((2, 3))}))
+      self.assertAllEqual([3], event_shape)
+
+  def test_dynamic_loc_dynamic_scale(self):
+    loc = array_ops.placeholder(dtypes.float64)
+    diag = array_ops.placeholder(dtypes.float64)
+    scale = linear_operator_diag.LinearOperatorDiag(diag)
+    with self.test_session() as sess:
+      batch_shape, event_shape = sess.run(
+          distribution_util.shapes_from_loc_and_scale(loc, scale),
+          feed_dict={diag: np.ones((5, 2, 3)), loc: np.zeros((2, 3))})
+      self.assertAllEqual([5, 2], batch_shape)
+      self.assertAllEqual([3], event_shape)
+
+  def test_none_loc_static_scale(self):
+    loc = None
+    scale = linear_operator_diag.LinearOperatorDiag(np.ones((5, 1, 3)))
+    batch_shape, event_shape = distribution_util.shapes_from_loc_and_scale(
+        loc, scale)
+
+    self.assertEqual(tensor_shape.TensorShape([5, 1]), batch_shape)
+    self.assertEqual(tensor_shape.TensorShape([3]), event_shape)
+
+  def test_none_loc_dynamic_scale(self):
+    loc = None
+    diag = array_ops.placeholder(dtypes.float64)
+    scale = linear_operator_diag.LinearOperatorDiag(diag)
+    with self.test_session() as sess:
+      batch_shape, event_shape = sess.run(
+          distribution_util.shapes_from_loc_and_scale(loc, scale),
+          feed_dict={diag: np.ones((5, 1, 3))})
+      self.assertAllEqual([5, 1], batch_shape)
+      self.assertAllEqual([3], event_shape)
+
+
 class GetLogitsAndProbsTest(test.TestCase):
 
   def testGetLogitsAndProbsImproperArguments(self):
diff --git a/tensorflow/contrib/distributions/python/kernel_tests/logistic_test.py b/tensorflow/contrib/distributions/python/kernel_tests/logistic_test.py
index 7d7560c3f5954bf585e6fcb0f7df9a95c0d60f7a..eb9028e5df0af5e3f6a2adb719fc0200dc65f01c 100644
--- a/tensorflow/contrib/distributions/python/kernel_tests/logistic_test.py
+++ b/tensorflow/contrib/distributions/python/kernel_tests/logistic_test.py
@@ -20,10 +20,10 @@ from __future__ import print_function
 
 import numpy as np
 from scipy import stats
-from tensorflow.contrib.distributions.python.ops import distribution
 from tensorflow.contrib.distributions.python.ops import logistic
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
+from tensorflow.python.ops.distributions import distribution
 from tensorflow.python.platform import test
 
 
diff --git a/tensorflow/contrib/distributions/python/kernel_tests/mvn_diag_plus_low_rank_test.py b/tensorflow/contrib/distributions/python/kernel_tests/mvn_diag_plus_low_rank_test.py
index 29cb3eb9b0d7636e16d39493d896f35af455d872..a924d2e383419702471609e14e49f7e52ea34ad9 100644
--- a/tensorflow/contrib/distributions/python/kernel_tests/mvn_diag_plus_low_rank_test.py
+++ b/tensorflow/contrib/distributions/python/kernel_tests/mvn_diag_plus_low_rank_test.py
@@ -145,8 +145,6 @@ class MultivariateNormalDiagPlusLowRankTest(test.TestCase):
     true_covariance = np.matmul(true_scale, true_scale.T)
     true_variance = np.diag(true_covariance)
     true_stddev = np.sqrt(true_variance)
-    true_det_covariance = np.linalg.det(true_covariance)
-    true_log_det_covariance = np.log(true_det_covariance)
 
     with self.test_session() as sess:
       dist = ds.MultivariateNormalDiagPlusLowRank(
@@ -185,19 +183,19 @@ class MultivariateNormalDiagPlusLowRankTest(test.TestCase):
 
       sample_kl_identity = math_ops.reduce_mean(
           dist.log_prob(samps) - mvn_identity.log_prob(samps), 0)
-      analytical_kl_identity = ds.kl(dist, mvn_identity)
+      analytical_kl_identity = ds.kl_divergence(dist, mvn_identity)
 
       sample_kl_scaled = math_ops.reduce_mean(
           dist.log_prob(samps) - mvn_scaled.log_prob(samps), 0)
-      analytical_kl_scaled = ds.kl(dist, mvn_scaled)
+      analytical_kl_scaled = ds.kl_divergence(dist, mvn_scaled)
 
       sample_kl_diag = math_ops.reduce_mean(
           dist.log_prob(samps) - mvn_diag.log_prob(samps), 0)
-      analytical_kl_diag = ds.kl(dist, mvn_diag)
+      analytical_kl_diag = ds.kl_divergence(dist, mvn_diag)
 
       sample_kl_chol = math_ops.reduce_mean(
           dist.log_prob(samps) - mvn_chol.log_prob(samps), 0)
-      analytical_kl_chol = ds.kl(dist, mvn_chol)
+      analytical_kl_chol = ds.kl_divergence(dist, mvn_chol)
 
       n = int(10e3)
       baseline = ds.MultivariateNormalDiag(
@@ -208,19 +206,21 @@ class MultivariateNormalDiagPlusLowRankTest(test.TestCase):
 
       sample_kl_identity_diag_baseline = math_ops.reduce_mean(
           baseline.log_prob(samps) - mvn_identity.log_prob(samps), 0)
-      analytical_kl_identity_diag_baseline = ds.kl(baseline, mvn_identity)
+      analytical_kl_identity_diag_baseline = ds.kl_divergence(
+          baseline, mvn_identity)
 
       sample_kl_scaled_diag_baseline = math_ops.reduce_mean(
           baseline.log_prob(samps) - mvn_scaled.log_prob(samps), 0)
-      analytical_kl_scaled_diag_baseline = ds.kl(baseline, mvn_scaled)
+      analytical_kl_scaled_diag_baseline = ds.kl_divergence(
+          baseline, mvn_scaled)
 
       sample_kl_diag_diag_baseline = math_ops.reduce_mean(
           baseline.log_prob(samps) - mvn_diag.log_prob(samps), 0)
-      analytical_kl_diag_diag_baseline = ds.kl(baseline, mvn_diag)
+      analytical_kl_diag_diag_baseline = ds.kl_divergence(baseline, mvn_diag)
 
       sample_kl_chol_diag_baseline = math_ops.reduce_mean(
           baseline.log_prob(samps) - mvn_chol.log_prob(samps), 0)
-      analytical_kl_chol_diag_baseline = ds.kl(baseline, mvn_chol)
+      analytical_kl_chol_diag_baseline = ds.kl_divergence(baseline, mvn_chol)
 
       [
           sample_mean_,
@@ -229,8 +229,6 @@ class MultivariateNormalDiagPlusLowRankTest(test.TestCase):
           analytical_covariance_,
           analytical_variance_,
           analytical_stddev_,
-          analytical_log_det_covariance_,
-          analytical_det_covariance_,
           scale_,
           sample_kl_identity_, analytical_kl_identity_,
           sample_kl_scaled_, analytical_kl_scaled_,
@@ -248,8 +246,6 @@ class MultivariateNormalDiagPlusLowRankTest(test.TestCase):
           dist.covariance(),
           dist.variance(),
           dist.stddev(),
-          dist.log_det_covariance(),
-          dist.det_covariance(),
           scale,
           sample_kl_identity, analytical_kl_identity,
           sample_kl_scaled, analytical_kl_scaled,
@@ -264,8 +260,6 @@ class MultivariateNormalDiagPlusLowRankTest(test.TestCase):
 
       sample_variance_ = np.diag(sample_covariance_)
       sample_stddev_ = np.sqrt(sample_variance_)
-      sample_det_covariance_ = np.linalg.det(sample_covariance_)
-      sample_log_det_covariance_ = np.log(sample_det_covariance_)
 
       logging.vlog(2, "true_mean:\n{}  ".format(true_mean))
       logging.vlog(2, "sample_mean:\n{}".format(sample_mean_))
@@ -284,20 +278,6 @@ class MultivariateNormalDiagPlusLowRankTest(test.TestCase):
       logging.vlog(2, "sample_stddev:\n{}".format(sample_stddev_))
       logging.vlog(2, "analytical_stddev:\n{}".format(analytical_stddev_))
 
-      logging.vlog(2, "true_log_det_covariance:\n{}".format(
-          true_log_det_covariance))
-      logging.vlog(2, "sample_log_det_covariance:\n{}".format(
-          sample_log_det_covariance_))
-      logging.vlog(2, "analytical_log_det_covariance:\n{}".format(
-          analytical_log_det_covariance_))
-
-      logging.vlog(2, "true_det_covariance:\n{}".format(
-          true_det_covariance))
-      logging.vlog(2, "sample_det_covariance:\n{}".format(
-          sample_det_covariance_))
-      logging.vlog(2, "analytical_det_covariance:\n{}".format(
-          analytical_det_covariance_))
-
       logging.vlog(2, "true_scale:\n{}".format(true_scale))
       logging.vlog(2, "scale:\n{}".format(scale_))
 
@@ -351,17 +331,6 @@ class MultivariateNormalDiagPlusLowRankTest(test.TestCase):
       self.assertAllClose(true_stddev, analytical_stddev_,
                           atol=0., rtol=1e-6)
 
-      self.assertAllClose(true_log_det_covariance, sample_log_det_covariance_,
-                          atol=0., rtol=0.02)
-      self.assertAllClose(true_log_det_covariance,
-                          analytical_log_det_covariance_,
-                          atol=0., rtol=1e-6)
-
-      self.assertAllClose(true_det_covariance, sample_det_covariance_,
-                          atol=0., rtol=0.02)
-      self.assertAllClose(true_det_covariance, analytical_det_covariance_,
-                          atol=0., rtol=1e-5)
-
       self.assertAllClose(true_scale, scale_,
                           atol=0., rtol=1e-6)
 
diff --git a/tensorflow/contrib/distributions/python/kernel_tests/mvn_diag_test.py b/tensorflow/contrib/distributions/python/kernel_tests/mvn_diag_test.py
index 406cd4ebbea3d74996b3c0c2316adcc363d3102e..3f4582eb7ee1319684a9209465046bb241337f9d 100644
--- a/tensorflow/contrib/distributions/python/kernel_tests/mvn_diag_test.py
+++ b/tensorflow/contrib/distributions/python/kernel_tests/mvn_diag_test.py
@@ -103,6 +103,14 @@ class MultivariateNormalDiagTest(test.TestCase):
       self.assertAllClose(cov_mat, np.cov(samps.T),
                           atol=0.05, rtol=0.05)
 
+  def testSingularScaleRaises(self):
+    mu = [-1., 1]
+    diag = [1., 0]
+    with self.test_session():
+      dist = ds.MultivariateNormalDiag(mu, diag, validate_args=True)
+      with self.assertRaisesOpError("Singular"):
+        dist.sample().eval()
+
   def testSampleWithBroadcastScale(self):
     # mu corresponds to a 2-batch of 3-variate normals
     mu = np.zeros([2, 3])
diff --git a/tensorflow/contrib/distributions/python/kernel_tests/mvn_full_covariance_test.py b/tensorflow/contrib/distributions/python/kernel_tests/mvn_full_covariance_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..6dfab46ebe47b7ed6bccda59fd2f3f9cfd438479
--- /dev/null
+++ b/tensorflow/contrib/distributions/python/kernel_tests/mvn_full_covariance_test.py
@@ -0,0 +1,169 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for MultivariateNormalFullCovariance."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+from scipy import stats
+from tensorflow.contrib import distributions
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import nn_ops
+from tensorflow.python.platform import test
+
+
+ds = distributions
+rng = np.random.RandomState(42)
+
+
+class MultivariateNormalFullCovarianceTest(test.TestCase):
+
+  def _random_pd_matrix(self, *shape):
+    mat = rng.rand(*shape)
+    chol = ds.matrix_diag_transform(mat, transform=nn_ops.softplus)
+    chol = array_ops.matrix_band_part(chol, -1, 0)
+    return math_ops.matmul(chol, chol, adjoint_b=True).eval()
+
+  def testRaisesIfInitializedWithNonSymmetricMatrix(self):
+    with self.test_session():
+      mu = [1., 2.]
+      sigma = [[1., 0.], [1., 1.]]  # Nonsingular, but not symmetric
+      mvn = ds.MultivariateNormalFullCovariance(mu, sigma, validate_args=True)
+      with self.assertRaisesOpError("not symmetric"):
+        mvn.covariance().eval()
+
+  def testDoesNotRaiseIfInitializedWithSymmetricMatrix(self):
+    with self.test_session():
+      mu = rng.rand(10)
+      sigma = self._random_pd_matrix(10, 10)
+      mvn = ds.MultivariateNormalFullCovariance(mu, sigma, validate_args=True)
+      # Should not raise
+      mvn.covariance().eval()
+
+  def testLogPDFScalarBatch(self):
+    with self.test_session():
+      mu = rng.rand(2)
+      sigma = self._random_pd_matrix(2, 2)
+      mvn = ds.MultivariateNormalFullCovariance(mu, sigma, validate_args=True)
+      x = rng.rand(2)
+
+      log_pdf = mvn.log_prob(x)
+      pdf = mvn.prob(x)
+
+      scipy_mvn = stats.multivariate_normal(mean=mu, cov=sigma)
+
+      expected_log_pdf = scipy_mvn.logpdf(x)
+      expected_pdf = scipy_mvn.pdf(x)
+      self.assertEqual((), log_pdf.get_shape())
+      self.assertEqual((), pdf.get_shape())
+      self.assertAllClose(expected_log_pdf, log_pdf.eval())
+      self.assertAllClose(expected_pdf, pdf.eval())
+
+  def testLogPDFScalarBatchCovarianceNotProvided(self):
+    with self.test_session():
+      mu = rng.rand(2)
+      mvn = ds.MultivariateNormalFullCovariance(
+          mu, covariance_matrix=None, validate_args=True)
+      x = rng.rand(2)
+
+      log_pdf = mvn.log_prob(x)
+      pdf = mvn.prob(x)
+
+      # Initialize a scipy_mvn with the default covariance.
+      scipy_mvn = stats.multivariate_normal(mean=mu, cov=np.eye(2))
+
+      expected_log_pdf = scipy_mvn.logpdf(x)
+      expected_pdf = scipy_mvn.pdf(x)
+      self.assertEqual((), log_pdf.get_shape())
+      self.assertEqual((), pdf.get_shape())
+      self.assertAllClose(expected_log_pdf, log_pdf.eval())
+      self.assertAllClose(expected_pdf, pdf.eval())
+
+  def testShapes(self):
+    with self.test_session():
+      mu = rng.rand(3, 5, 2)
+      covariance = self._random_pd_matrix(3, 5, 2, 2)
+
+      mvn = ds.MultivariateNormalFullCovariance(
+          mu, covariance, validate_args=True)
+
+      # Shapes known at graph construction time.
+      self.assertEqual((2,), tuple(mvn.event_shape.as_list()))
+      self.assertEqual((3, 5), tuple(mvn.batch_shape.as_list()))
+
+      # Shapes known at runtime.
+      self.assertEqual((2,), tuple(mvn.event_shape_tensor().eval()))
+      self.assertEqual((3, 5), tuple(mvn.batch_shape_tensor().eval()))
+
+  def _random_mu_and_sigma(self, batch_shape, event_shape):
+    # This ensures sigma is positive def.
+    mat_shape = batch_shape + event_shape + event_shape
+    mat = rng.randn(*mat_shape)
+    perm = np.arange(mat.ndim)
+    perm[-2:] = [perm[-1], perm[-2]]
+    sigma = np.matmul(mat, np.transpose(mat, perm))
+
+    mu_shape = batch_shape + event_shape
+    mu = rng.randn(*mu_shape)
+
+    return mu, sigma
+
+  def testKLBatch(self):
+    batch_shape = (2,)
+    event_shape = (3,)
+    with self.test_session():
+      mu_a, sigma_a = self._random_mu_and_sigma(batch_shape, event_shape)
+      mu_b, sigma_b = self._random_mu_and_sigma(batch_shape, event_shape)
+      mvn_a = ds.MultivariateNormalFullCovariance(
+          loc=mu_a,
+          covariance_matrix=sigma_a,
+          validate_args=True)
+      mvn_b = ds.MultivariateNormalFullCovariance(
+          loc=mu_b,
+          covariance_matrix=sigma_b,
+          validate_args=True)
+
+      kl = ds.kl_divergence(mvn_a, mvn_b)
+      self.assertEqual(batch_shape, kl.get_shape())
+
+      kl_v = kl.eval()
+      expected_kl_0 = _compute_non_batch_kl(mu_a[0, :], sigma_a[0, :, :],
+                                            mu_b[0, :], sigma_b[0, :])
+      expected_kl_1 = _compute_non_batch_kl(mu_a[1, :], sigma_a[1, :, :],
+                                            mu_b[1, :], sigma_b[1, :])
+      self.assertAllClose(expected_kl_0, kl_v[0])
+      self.assertAllClose(expected_kl_1, kl_v[1])
+
+
+def _compute_non_batch_kl(mu_a, sigma_a, mu_b, sigma_b):
+  """Non-batch KL for N(mu_a, sigma_a), N(mu_b, sigma_b)."""
+  # Check using numpy operations
+  # This mostly repeats the tensorflow code _kl_mvn_mvn(), but in numpy.
+  # So it is important to also check that KL(mvn, mvn) = 0.
+  sigma_b_inv = np.linalg.inv(sigma_b)
+
+  t = np.trace(sigma_b_inv.dot(sigma_a))
+  q = (mu_b - mu_a).dot(sigma_b_inv).dot(mu_b - mu_a)
+  k = mu_a.shape[0]
+  l = np.log(np.linalg.det(sigma_b) / np.linalg.det(sigma_a))
+
+  return 0.5 * (t + q - k + l)
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/contrib/distributions/python/kernel_tests/mvn_tril_test.py b/tensorflow/contrib/distributions/python/kernel_tests/mvn_tril_test.py
index dd7283bb57ff41f580a0f400cc225722da574e3f..685f32883dae5b8513badeb05e1508cd611d6e93 100644
--- a/tensorflow/contrib/distributions/python/kernel_tests/mvn_tril_test.py
+++ b/tensorflow/contrib/distributions/python/kernel_tests/mvn_tril_test.py
@@ -151,6 +151,14 @@ class MultivariateNormalTriLTest(test.TestCase):
       self.assertAllClose(sample_values.mean(axis=0), mu, atol=1e-2)
       self.assertAllClose(np.cov(sample_values, rowvar=0), sigma, atol=0.06)
 
+  def testSingularScaleRaises(self):
+    with self.test_session():
+      mu = None
+      chol = [[1., 0.], [0., 0.]]
+      mvn = ds.MultivariateNormalTriL(mu, chol, validate_args=True)
+      with self.assertRaisesOpError("Singular operator"):
+        mvn.sample().eval()
+
   def testSampleWithSampleShape(self):
     with self.test_session():
       mu = self._rng.rand(3, 5, 2)
@@ -241,7 +249,7 @@ class MultivariateNormalTriLTest(test.TestCase):
           scale_tril=np.linalg.cholesky(sigma_b),
           validate_args=True)
 
-      kl = ds.kl(mvn_a, mvn_b)
+      kl = ds.kl_divergence(mvn_a, mvn_b)
       self.assertEqual(batch_shape, kl.get_shape())
 
       kl_v = kl.eval()
@@ -263,7 +271,7 @@ class MultivariateNormalTriLTest(test.TestCase):
           scale_tril=np.linalg.cholesky(sigma_b),
           validate_args=True)
 
-      kl = ds.kl(mvn_a, mvn_b)
+      kl = ds.kl_divergence(mvn_a, mvn_b)
       self.assertEqual(batch_shape, kl.get_shape())
 
       kl_v = kl.eval()
@@ -285,7 +293,7 @@ class MultivariateNormalTriLTest(test.TestCase):
           validate_args=True)
 
       # Should be zero since KL(p || p) = =.
-      kl = ds.kl(mvn_a, mvn_a)
+      kl = ds.kl_divergence(mvn_a, mvn_a)
       self.assertEqual(batch_shape, kl.get_shape())
 
       kl_v = kl.eval()
@@ -300,8 +308,6 @@ class MultivariateNormalTriLTest(test.TestCase):
     true_covariance = np.matmul(true_scale, true_scale.T)
     true_variance = np.diag(true_covariance)
     true_stddev = np.sqrt(true_variance)
-    true_det_covariance = np.linalg.det(true_covariance)
-    true_log_det_covariance = np.log(true_det_covariance)
 
     with self.test_session() as sess:
       dist = ds.MultivariateNormalTriL(
@@ -323,7 +329,7 @@ class MultivariateNormalTriLTest(test.TestCase):
 
       sample_kl_chol = math_ops.reduce_mean(
           dist.log_prob(samps) - mvn_chol.log_prob(samps), 0)
-      analytical_kl_chol = ds.kl(dist, mvn_chol)
+      analytical_kl_chol = ds.kl_divergence(dist, mvn_chol)
 
       scale = dist.scale.to_dense()
 
@@ -334,8 +340,6 @@ class MultivariateNormalTriLTest(test.TestCase):
           analytical_covariance_,
           analytical_variance_,
           analytical_stddev_,
-          analytical_log_det_covariance_,
-          analytical_det_covariance_,
           sample_kl_chol_, analytical_kl_chol_,
           scale_,
       ] = sess.run([
@@ -345,16 +349,12 @@ class MultivariateNormalTriLTest(test.TestCase):
           dist.covariance(),
           dist.variance(),
           dist.stddev(),
-          dist.log_det_covariance(),
-          dist.det_covariance(),
           sample_kl_chol, analytical_kl_chol,
           scale,
       ])
 
       sample_variance_ = np.diag(sample_covariance_)
       sample_stddev_ = np.sqrt(sample_variance_)
-      sample_det_covariance_ = np.linalg.det(sample_covariance_)
-      sample_log_det_covariance_ = np.log(sample_det_covariance_)
 
       logging.vlog(2, "true_mean:\n{}  ".format(true_mean))
       logging.vlog(2, "sample_mean:\n{}".format(sample_mean_))
@@ -373,21 +373,6 @@ class MultivariateNormalTriLTest(test.TestCase):
       logging.vlog(2, "sample_stddev:\n{}".format(sample_stddev_))
       logging.vlog(2, "analytical_stddev:\n{}".format(analytical_stddev_))
 
-      logging.vlog(
-          2, "true_log_det_covariance:\n{}".format(true_log_det_covariance))
-      logging.vlog(
-          2, "sample_log_det_covariance:\n{}".format(
-              sample_log_det_covariance_))
-      logging.vlog(2, "analytical_log_det_covariance:\n{}".format(
-          analytical_log_det_covariance_))
-
-      logging.vlog(2, "true_det_covariance:\n{}".format(true_det_covariance))
-      logging.vlog(
-          2, "sample_det_covariance:\n{}".format(sample_det_covariance_))
-      logging.vlog(
-          2, "analytical_det_covariance:\n{}".format(
-              analytical_det_covariance_))
-
       logging.vlog(2, "true_scale:\n{}".format(true_scale))
       logging.vlog(2, "scale:\n{}".format(scale_))
 
@@ -414,17 +399,6 @@ class MultivariateNormalTriLTest(test.TestCase):
       self.assertAllClose(true_stddev, analytical_stddev_,
                           atol=0., rtol=1e-6)
 
-      self.assertAllClose(true_log_det_covariance, sample_log_det_covariance_,
-                          atol=0., rtol=0.04)
-      self.assertAllClose(true_log_det_covariance,
-                          analytical_log_det_covariance_,
-                          atol=0., rtol=1e-6)
-
-      self.assertAllClose(true_det_covariance, sample_det_covariance_,
-                          atol=0., rtol=0.03)
-      self.assertAllClose(true_det_covariance, analytical_det_covariance_,
-                          atol=0., rtol=1e-6)
-
       self.assertAllClose(true_scale, scale_,
                           atol=0., rtol=1e-6)
 
diff --git a/tensorflow/contrib/distributions/python/kernel_tests/onehot_categorical_test.py b/tensorflow/contrib/distributions/python/kernel_tests/onehot_categorical_test.py
index 56ad4a081bc2306e6d2e147caf4feda80292d988..111f88eeb50fa9ef134dbe30d4a0be0eec7a0d26 100644
--- a/tensorflow/contrib/distributions/python/kernel_tests/onehot_categorical_test.py
+++ b/tensorflow/contrib/distributions/python/kernel_tests/onehot_categorical_test.py
@@ -19,7 +19,6 @@ from __future__ import division
 from __future__ import print_function
 
 import numpy as np
-from tensorflow.contrib.distributions.python.ops import kullback_leibler
 from tensorflow.contrib.distributions.python.ops import onehot_categorical
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
@@ -27,6 +26,7 @@ from tensorflow.python.framework import tensor_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import random_ops
+from tensorflow.python.ops.distributions import kullback_leibler
 from tensorflow.python.platform import test
 
 
@@ -178,8 +178,8 @@ class OneHotCategoricalTest(test.TestCase):
           kl_expected = np.sum(
               prob_p * (np.log(prob_p) - np.log(prob_q)), axis=-1)
 
-          kl_actual = kullback_leibler.kl(p, q)
-          kl_same = kullback_leibler.kl(p, p)
+          kl_actual = kullback_leibler.kl_divergence(p, q)
+          kl_same = kullback_leibler.kl_divergence(p, p)
           x = p.sample(int(2e4), seed=0)
           x = math_ops.cast(x, dtype=dtypes.float32)
           # Compute empirical KL(p||q).
diff --git a/tensorflow/contrib/distributions/python/kernel_tests/operator_pd_cholesky_test.py b/tensorflow/contrib/distributions/python/kernel_tests/operator_pd_cholesky_test.py
index 49ece78b0d2e169ce5d73260e9220e0277a305fb..6549992633dcc384f26950f4c80ade60f337b78d 100644
--- a/tensorflow/contrib/distributions/python/kernel_tests/operator_pd_cholesky_test.py
+++ b/tensorflow/contrib/distributions/python/kernel_tests/operator_pd_cholesky_test.py
@@ -19,16 +19,14 @@ from __future__ import print_function
 
 import numpy as np
 
-from tensorflow.contrib import distributions as distributions_lib
-from tensorflow.contrib.distributions.python.ops import distribution_util
+from tensorflow.contrib import distributions
 from tensorflow.contrib.distributions.python.ops import operator_pd_cholesky
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import nn_ops
+from tensorflow.python.ops.distributions import util as distribution_util
 from tensorflow.python.platform import test
 
-distributions = distributions_lib
-
 
 def softplus(x):
   return np.log(1 + np.exp(x))
diff --git a/tensorflow/contrib/distributions/python/kernel_tests/vector_laplace_diag_test.py b/tensorflow/contrib/distributions/python/kernel_tests/vector_laplace_diag_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..c355adeedbfff1072281a81de726ddb0ece07882
--- /dev/null
+++ b/tensorflow/contrib/distributions/python/kernel_tests/vector_laplace_diag_test.py
@@ -0,0 +1,215 @@
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for VectorLaplaceLinearOperator."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+from tensorflow.contrib import distributions
+from tensorflow.contrib.distributions.python.ops import bijectors
+from tensorflow.python.framework import dtypes
+from tensorflow.python.ops import array_ops
+from tensorflow.python.platform import test
+
+
+ds = distributions
+
+
+class VectorLaplaceDiagTest(test.TestCase):
+  """Well tested because this is a simple override of the base class."""
+
+  def setUp(self):
+    self._rng = np.random.RandomState(42)
+
+  def testScalarParams(self):
+    mu = -1.
+    diag = -5.
+    with self.test_session():
+      with self.assertRaisesRegexp(ValueError, "at least 1 dimension"):
+        ds.VectorLaplaceDiag(mu, diag)
+
+  def testVectorParams(self):
+    mu = [-1.]
+    diag = [-5.]
+    with self.test_session():
+      dist = ds.VectorLaplaceDiag(mu, diag, validate_args=True)
+      self.assertAllEqual([3, 1], dist.sample(3).get_shape())
+
+  def testDistWithBatchShapeOneThenTransformedThroughSoftplus(self):
+    # This complex combination of events resulted in a loss of static shape
+    # information when tensor_util.constant_value(self._needs_rotation) was
+    # being used incorrectly (resulting in always rotating).
+    # Batch shape = [1], event shape = [3]
+    mu = array_ops.zeros((1, 3))
+    diag = array_ops.ones((1, 3))
+    with self.test_session():
+      base_dist = ds.VectorLaplaceDiag(mu, diag, validate_args=True)
+      dist = ds.TransformedDistribution(
+          base_dist,
+          validate_args=True,
+          bijector=bijectors.Softplus(event_ndims=1))
+      samps = dist.sample(5)  # Shape [5, 1, 3].
+      self.assertAllEqual([5, 1], dist.log_prob(samps).get_shape())
+
+  def testMean(self):
+    mu = [-1., 1]
+    diag = [1., -5]
+    with self.test_session():
+      dist = ds.VectorLaplaceDiag(mu, diag, validate_args=True)
+      self.assertAllEqual(mu, dist.mean().eval())
+
+  def testMeanWithBroadcastLoc(self):
+    mu = [-1.]
+    diag = [1., -5]
+    with self.test_session():
+      dist = ds.VectorLaplaceDiag(mu, diag, validate_args=True)
+      self.assertAllEqual([-1., -1.], dist.mean().eval())
+
+  def testSample(self):
+    mu = [-1., 1]
+    diag = [1., -2]
+    with self.test_session():
+      dist = ds.VectorLaplaceDiag(mu, diag, validate_args=True)
+      samps = dist.sample(int(1e4), seed=0).eval()
+      cov_mat = 2. * array_ops.matrix_diag(diag).eval()**2
+
+      self.assertAllClose(mu, samps.mean(axis=0),
+                          atol=0., rtol=0.05)
+      self.assertAllClose(cov_mat, np.cov(samps.T),
+                          atol=0.05, rtol=0.05)
+
+  def testSingularScaleRaises(self):
+    mu = [-1., 1]
+    diag = [1., 0]
+    with self.test_session():
+      dist = ds.VectorLaplaceDiag(mu, diag, validate_args=True)
+      with self.assertRaisesOpError("Singular"):
+        dist.sample().eval()
+
+  def testSampleWithBroadcastScale(self):
+    # mu corresponds to a 2-batch of 3-variate normals
+    mu = np.zeros([2, 3])
+
+    # diag corresponds to no batches of 3-variate normals
+    diag = np.ones([3])
+
+    with self.test_session():
+      dist = ds.VectorLaplaceDiag(mu, diag, validate_args=True)
+
+      mean = dist.mean()
+      self.assertAllEqual([2, 3], mean.get_shape())
+      self.assertAllClose(mu, mean.eval())
+
+      n = int(1e4)
+      samps = dist.sample(n, seed=0).eval()
+      cov_mat = 2. * array_ops.matrix_diag(diag).eval()**2
+      sample_cov = np.matmul(samps.transpose([1, 2, 0]),
+                             samps.transpose([1, 0, 2])) / n
+
+      self.assertAllClose(mu, samps.mean(axis=0),
+                          atol=0.10, rtol=0.05)
+      self.assertAllClose([cov_mat, cov_mat], sample_cov,
+                          atol=0.10, rtol=0.05)
+
+  def testCovariance(self):
+    with self.test_session():
+      vla = ds.VectorLaplaceDiag(
+          loc=array_ops.zeros([2, 3], dtype=dtypes.float32))
+      self.assertAllClose(
+          2. * np.diag(np.ones([3], dtype=np.float32)),
+          vla.covariance().eval())
+
+      vla = ds.VectorLaplaceDiag(
+          loc=array_ops.zeros([3], dtype=dtypes.float32),
+          scale_identity_multiplier=[3., 2.])
+      self.assertAllEqual([2], vla.batch_shape)
+      self.assertAllEqual([3], vla.event_shape)
+      self.assertAllClose(
+          2. * np.array([[[3., 0, 0],
+                          [0, 3, 0],
+                          [0, 0, 3]],
+                         [[2, 0, 0],
+                          [0, 2, 0],
+                          [0, 0, 2]]])**2.,
+          vla.covariance().eval())
+
+      vla = ds.VectorLaplaceDiag(
+          loc=array_ops.zeros([3], dtype=dtypes.float32),
+          scale_diag=[[3., 2, 1], [4, 5, 6]])
+      self.assertAllEqual([2], vla.batch_shape)
+      self.assertAllEqual([3], vla.event_shape)
+      self.assertAllClose(
+          2. * np.array([[[3., 0, 0],
+                          [0, 2, 0],
+                          [0, 0, 1]],
+                         [[4, 0, 0],
+                          [0, 5, 0],
+                          [0, 0, 6]]])**2.,
+          vla.covariance().eval())
+
+  def testVariance(self):
+    with self.test_session():
+      vla = ds.VectorLaplaceDiag(
+          loc=array_ops.zeros([2, 3], dtype=dtypes.float32))
+      self.assertAllClose(
+          2. * np.ones([3], dtype=np.float32),
+          vla.variance().eval())
+
+      vla = ds.VectorLaplaceDiag(
+          loc=array_ops.zeros([3], dtype=dtypes.float32),
+          scale_identity_multiplier=[3., 2.])
+      self.assertAllClose(
+          2. * np.array([[3., 3, 3],
+                         [2, 2, 2]])**2.,
+          vla.variance().eval())
+
+      vla = ds.VectorLaplaceDiag(
+          loc=array_ops.zeros([3], dtype=dtypes.float32),
+          scale_diag=[[3., 2, 1],
+                      [4, 5, 6]])
+      self.assertAllClose(
+          2. * np.array([[3., 2, 1],
+                         [4, 5, 6]])**2.,
+          vla.variance().eval())
+
+  def testStddev(self):
+    with self.test_session():
+      vla = ds.VectorLaplaceDiag(
+          loc=array_ops.zeros([2, 3], dtype=dtypes.float32))
+      self.assertAllClose(
+          np.sqrt(2) * np.ones([3], dtype=np.float32),
+          vla.stddev().eval())
+
+      vla = ds.VectorLaplaceDiag(
+          loc=array_ops.zeros([3], dtype=dtypes.float32),
+          scale_identity_multiplier=[3., 2.])
+      self.assertAllClose(
+          np.sqrt(2) * np.array([[3., 3, 3],
+                                 [2, 2, 2]]),
+          vla.stddev().eval())
+
+      vla = ds.VectorLaplaceDiag(
+          loc=array_ops.zeros([3], dtype=dtypes.float32),
+          scale_diag=[[3., 2, 1], [4, 5, 6]])
+      self.assertAllClose(
+          np.sqrt(2) * np.array([[3., 2, 1],
+                                 [4, 5, 6]]),
+          vla.stddev().eval())
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/contrib/distributions/python/ops/bijectors/__init__.py b/tensorflow/contrib/distributions/python/ops/bijectors/__init__.py
index db0370560823d81a0cf431a662a0708c33602c76..1684a5fffe13fa8a074ae7ede0182a9d145300c7 100644
--- a/tensorflow/contrib/distributions/python/ops/bijectors/__init__.py
+++ b/tensorflow/contrib/distributions/python/ops/bijectors/__init__.py
@@ -39,12 +39,10 @@ from __future__ import print_function
 
 from tensorflow.contrib.distributions.python.ops.bijectors.affine import *
 from tensorflow.contrib.distributions.python.ops.bijectors.affine_linear_operator import *
-from tensorflow.contrib.distributions.python.ops.bijectors.bijector import *
 from tensorflow.contrib.distributions.python.ops.bijectors.chain import *
 from tensorflow.contrib.distributions.python.ops.bijectors.cholesky_outer_product import *
 from tensorflow.contrib.distributions.python.ops.bijectors.conditional_bijector import *
 from tensorflow.contrib.distributions.python.ops.bijectors.exp import *
-from tensorflow.contrib.distributions.python.ops.bijectors.identity import *
 from tensorflow.contrib.distributions.python.ops.bijectors.inline import *
 from tensorflow.contrib.distributions.python.ops.bijectors.invert import *
 from tensorflow.contrib.distributions.python.ops.bijectors.power_transform import *
@@ -52,6 +50,8 @@ from tensorflow.contrib.distributions.python.ops.bijectors.sigmoid import *
 from tensorflow.contrib.distributions.python.ops.bijectors.sigmoid_centered import *
 from tensorflow.contrib.distributions.python.ops.bijectors.softmax_centered import *
 from tensorflow.contrib.distributions.python.ops.bijectors.softplus import *
+from tensorflow.python.ops.distributions.bijector import *
+from tensorflow.python.ops.distributions.identity_bijector import Identity
 
 # pylint: enable=unused-import,wildcard-import,line-too-long,g-importing-member
 
diff --git a/tensorflow/contrib/distributions/python/ops/bijectors/affine_impl.py b/tensorflow/contrib/distributions/python/ops/bijectors/affine_impl.py
index 429f7eac4d56d5d6f74b564dfd554322e7f6e008..d44e258bd280b10b694211b4b536a98b13a7f431 100644
--- a/tensorflow/contrib/distributions/python/ops/bijectors/affine_impl.py
+++ b/tensorflow/contrib/distributions/python/ops/bijectors/affine_impl.py
@@ -22,7 +22,6 @@ from tensorflow.contrib.distributions.python.ops import operator_pd_cholesky
 from tensorflow.contrib.distributions.python.ops import operator_pd_diag
 from tensorflow.contrib.distributions.python.ops import operator_pd_identity
 from tensorflow.contrib.distributions.python.ops import operator_pd_vdvt_update
-from tensorflow.contrib.distributions.python.ops.bijectors import bijector
 from tensorflow.contrib.distributions.python.ops.shape import _DistributionShape
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
@@ -32,6 +31,7 @@ from tensorflow.python.ops import check_ops
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import linalg_ops
 from tensorflow.python.ops import math_ops
+from tensorflow.python.ops.distributions import bijector
 
 
 __all__ = [
diff --git a/tensorflow/contrib/distributions/python/ops/bijectors/affine_linear_operator_impl.py b/tensorflow/contrib/distributions/python/ops/bijectors/affine_linear_operator_impl.py
index a8e93bd2c761c430c5ce0415d1ba1240940c4182..ae380b5cb2bc39e06aa1e187c134d7e92f6cd92f 100644
--- a/tensorflow/contrib/distributions/python/ops/bijectors/affine_linear_operator_impl.py
+++ b/tensorflow/contrib/distributions/python/ops/bijectors/affine_linear_operator_impl.py
@@ -18,7 +18,6 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.contrib.distributions.python.ops.bijectors import bijector
 from tensorflow.contrib.distributions.python.ops.shape import _DistributionShape
 from tensorflow.contrib.linalg.python.ops import linear_operator
 from tensorflow.python.framework import constant_op
@@ -27,6 +26,7 @@ from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_util
 from tensorflow.python.ops import check_ops
 from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops.distributions import bijector
 
 
 __all__ = [
@@ -193,7 +193,7 @@ class AffineLinearOperator(bijector.Bijector):
           y, expand_batch_dim=False)
       with ops.control_dependencies(self._maybe_collect_assertions() if
                                     self.validate_args else []):
-        y = self.scale.apply(y)
+        y = self.scale.matmul(y)
       y = self._shaper.undo_make_batch_of_event_sample_matrices(
           y, sample_shape, expand_batch_dim=False)
     if self.shift is not None:
diff --git a/tensorflow/contrib/distributions/python/ops/bijectors/chain_impl.py b/tensorflow/contrib/distributions/python/ops/bijectors/chain_impl.py
index 0b72c5aadfe26b0c856b302864730f131505a947..defa36a14048d35c6264c7227840ed70dcc77cbb 100644
--- a/tensorflow/contrib/distributions/python/ops/bijectors/chain_impl.py
+++ b/tensorflow/contrib/distributions/python/ops/bijectors/chain_impl.py
@@ -20,8 +20,8 @@ from __future__ import print_function
 
 import itertools
 
-from tensorflow.contrib.distributions.python.ops.bijectors import bijector
 from tensorflow.python.framework import constant_op
+from tensorflow.python.ops.distributions import bijector
 
 
 __all__ = [
diff --git a/tensorflow/contrib/distributions/python/ops/bijectors/cholesky_outer_product_impl.py b/tensorflow/contrib/distributions/python/ops/bijectors/cholesky_outer_product_impl.py
index e605aec9c3590fe06ccd54e99c1de59f8ec5f7eb..dc05b2f611a52dc29717c69df77a1576aa6b5693 100644
--- a/tensorflow/contrib/distributions/python/ops/bijectors/cholesky_outer_product_impl.py
+++ b/tensorflow/contrib/distributions/python/ops/bijectors/cholesky_outer_product_impl.py
@@ -20,8 +20,6 @@ from __future__ import print_function
 
 import numpy as np
 
-from tensorflow.contrib.distributions.python.ops import distribution_util
-from tensorflow.contrib.distributions.python.ops.bijectors import bijector
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_util
 from tensorflow.python.ops import array_ops
@@ -29,6 +27,8 @@ from tensorflow.python.ops import check_ops
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import linalg_ops
 from tensorflow.python.ops import math_ops
+from tensorflow.python.ops.distributions import bijector
+from tensorflow.python.ops.distributions import util as distribution_util
 
 
 __all__ = [
diff --git a/tensorflow/contrib/distributions/python/ops/bijectors/conditional_bijector_impl.py b/tensorflow/contrib/distributions/python/ops/bijectors/conditional_bijector_impl.py
index a1b2aef830927124b26ab195f8afca7c1258fef2..ccb1f029277bc07011df7be047a075274f2b3a27 100644
--- a/tensorflow/contrib/distributions/python/ops/bijectors/conditional_bijector_impl.py
+++ b/tensorflow/contrib/distributions/python/ops/bijectors/conditional_bijector_impl.py
@@ -18,8 +18,8 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.contrib.distributions.python.ops import distribution_util
-from tensorflow.contrib.distributions.python.ops.bijectors import bijector
+from tensorflow.python.ops.distributions import bijector
+from tensorflow.python.ops.distributions import util as distribution_util
 
 
 __all__ = ["ConditionalBijector"]
diff --git a/tensorflow/contrib/distributions/python/ops/bijectors/inline_impl.py b/tensorflow/contrib/distributions/python/ops/bijectors/inline_impl.py
index 1f9ec0b1718a4db9f8f4062e0bd406251f600682..fab1b22fbf92e7b92a5ec86ec62d66bec71a8c94 100644
--- a/tensorflow/contrib/distributions/python/ops/bijectors/inline_impl.py
+++ b/tensorflow/contrib/distributions/python/ops/bijectors/inline_impl.py
@@ -18,7 +18,7 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.contrib.distributions.python.ops.bijectors import bijector
+from tensorflow.python.ops.distributions import bijector
 
 
 __all__ = [
diff --git a/tensorflow/contrib/distributions/python/ops/bijectors/invert_impl.py b/tensorflow/contrib/distributions/python/ops/bijectors/invert_impl.py
index 73d2162ac3c93a1b1142718ab11abebe256353f8..7f28a298572642e9ced7c0b88f9601a0d1751141 100644
--- a/tensorflow/contrib/distributions/python/ops/bijectors/invert_impl.py
+++ b/tensorflow/contrib/distributions/python/ops/bijectors/invert_impl.py
@@ -18,7 +18,7 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.contrib.distributions.python.ops.bijectors import bijector as bijector_lib
+from tensorflow.python.ops.distributions import bijector as bijector_lib
 
 __all__ = [
     "Invert",
diff --git a/tensorflow/contrib/distributions/python/ops/bijectors/power_transform_impl.py b/tensorflow/contrib/distributions/python/ops/bijectors/power_transform_impl.py
index 9963d8a7fe1786c44c50de707516b171f64d1066..c37db61720d10949f294ff7b2e9778ba6efa57f0 100644
--- a/tensorflow/contrib/distributions/python/ops/bijectors/power_transform_impl.py
+++ b/tensorflow/contrib/distributions/python/ops/bijectors/power_transform_impl.py
@@ -18,12 +18,12 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.contrib.distributions.python.ops.bijectors import bijector
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_util
 from tensorflow.python.ops import check_ops
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import math_ops
+from tensorflow.python.ops.distributions import bijector
 
 
 __all__ = [
diff --git a/tensorflow/contrib/distributions/python/ops/bijectors/sigmoid_impl.py b/tensorflow/contrib/distributions/python/ops/bijectors/sigmoid_impl.py
index b8d8152ce3817a28bd1db3952a54aac9afbd27e4..a640dfe7dfbcce96261589c7fc49107deaefdd54 100644
--- a/tensorflow/contrib/distributions/python/ops/bijectors/sigmoid_impl.py
+++ b/tensorflow/contrib/distributions/python/ops/bijectors/sigmoid_impl.py
@@ -18,9 +18,9 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.contrib.distributions.python.ops.bijectors import bijector
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import nn_ops
+from tensorflow.python.ops.distributions import bijector
 
 
 __all__ = [
diff --git a/tensorflow/contrib/distributions/python/ops/bijectors/softmax_centered_impl.py b/tensorflow/contrib/distributions/python/ops/bijectors/softmax_centered_impl.py
index 87a5aca1d97d7a8d0a8991883d8e7987b842a947..8645cc1b6b04be75a419342591272f07a4a1711c 100644
--- a/tensorflow/contrib/distributions/python/ops/bijectors/softmax_centered_impl.py
+++ b/tensorflow/contrib/distributions/python/ops/bijectors/softmax_centered_impl.py
@@ -20,7 +20,6 @@ from __future__ import print_function
 
 import numpy as np
 
-from tensorflow.contrib.distributions.python.ops.bijectors import bijector
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_shape
@@ -30,6 +29,7 @@ from tensorflow.python.ops import check_ops
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import nn_ops
+from tensorflow.python.ops.distributions import bijector
 
 
 __all__ = [
diff --git a/tensorflow/contrib/distributions/python/ops/bijectors/softplus_impl.py b/tensorflow/contrib/distributions/python/ops/bijectors/softplus_impl.py
index a1e72ef7f61f899b78dd63e8a0daa35a8c3c9b13..81957fcf78922fa15fd20a25d144071f431161ae 100644
--- a/tensorflow/contrib/distributions/python/ops/bijectors/softplus_impl.py
+++ b/tensorflow/contrib/distributions/python/ops/bijectors/softplus_impl.py
@@ -18,10 +18,13 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.contrib.distributions.python.ops import distribution_util
-from tensorflow.contrib.distributions.python.ops.bijectors import bijector
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import check_ops
+from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import nn_ops
+from tensorflow.python.ops.distributions import bijector
+from tensorflow.python.ops.distributions import util as distribution_util
 
 
 __all__ = [
@@ -38,6 +41,22 @@ class Softplus(bijector.Bijector):
   * `softplus(x) approx x`, for large `x`, so it does not overflow as easily as
     the `Exp` `Bijector`.
 
+  The optional nonzero `hinge_softness` parameter changes the transition at
+  zero.  With `hinge_softness = c`, the bijector is:
+
+    ```f_c(x) := c * g(x / c) = c * Log[1 + exp(x / c)].```
+
+  For large `x >> 1`, `c * Log[1 + exp(x / c)] approx c * Log[exp(x / c)] = x`,
+  so the behavior for large `x` is the same as the standard softplus.
+
+  As `c > 0` approaches 0 from the right, `f_c(x)` becomes less and less soft,
+  approaching `max(0, x)`.
+
+  * `c = 1` is the default.
+  * `c > 0` but small means `f(x) approx ReLu(x) = max(0, x)`.
+  * `c < 0` flips sign and reflects around the `y-axis`: `f_{-c}(x) = -f_c(-x)`.
+  * `c = 0` results in a non-bijective transformation and triggers an exception.
+
     Example Use:
 
     ```python
@@ -45,9 +64,9 @@ class Softplus(bijector.Bijector):
     # batch ndim and 2 event ndims (i.e., vector of matrices).
     softplus = Softplus(event_ndims=2)
     x = [[[1., 2],
-           [3, 4]],
-          [[5, 6],
-           [7, 8]]]
+          [3, 4]],
+         [[5, 6],
+          [7, 8]]]
     log(1 + exp(x)) == softplus.forward(x)
     log(exp(x) - 1) == softplus.inverse(x)
     ```
@@ -56,20 +75,48 @@ class Softplus(bijector.Bijector):
     reduction over the event space.
   """
 
+  @distribution_util.AppendDocstring(
+      kwargs_dict={
+          "hinge_softness": (
+              "Nonzero floating point `Tensor`.  Controls the softness of what "
+              "would otherwise be a kink at the origin.  Default is 1.0")})
   def __init__(self,
                event_ndims=0,
+               hinge_softness=None,
                validate_args=False,
                name="softplus"):
+    with ops.name_scope(name, values=[hinge_softness]):
+      if hinge_softness is not None:
+        self._hinge_softness = ops.convert_to_tensor(
+            hinge_softness, name="hinge_softness")
+      else:
+        self._hinge_softness = None
+      if validate_args:
+        nonzero_check = check_ops.assert_none_equal(
+            ops.convert_to_tensor(
+                0, dtype=self.hinge_softness.dtype),
+            self.hinge_softness,
+            message="hinge_softness must be non-zero")
+        self._hinge_softness = control_flow_ops.with_dependencies(
+            [nonzero_check], self.hinge_softness)
+
     super(Softplus, self).__init__(
         event_ndims=event_ndims,
         validate_args=validate_args,
         name=name)
 
   def _forward(self, x):
-    return nn_ops.softplus(x)
+    if self.hinge_softness is None:
+      return nn_ops.softplus(x)
+    hinge_softness = math_ops.cast(self.hinge_softness, x.dtype)
+    return hinge_softness * nn_ops.softplus(x / hinge_softness)
 
   def _inverse(self, y):
-    return distribution_util.softplus_inverse(y)
+    if self.hinge_softness is None:
+      return distribution_util.softplus_inverse(y)
+    hinge_softness = math_ops.cast(self.hinge_softness, y.dtype)
+    return hinge_softness * distribution_util.softplus_inverse(
+        y / hinge_softness)
 
   def _inverse_log_det_jacobian(self, y):
     # Could also do:
@@ -81,9 +128,17 @@ class Softplus(bijector.Bijector):
     #           = 1 / (1 - exp{-Y}),
     # which is the most stable for large Y > 0. For small Y, we use
     # 1 - exp{-Y} approx Y.
+    if self.hinge_softness is not None:
+      y /= math_ops.cast(self.hinge_softness, y.dtype)
     return -math_ops.reduce_sum(math_ops.log(-math_ops.expm1(-y)),
                                 axis=self._event_dims_tensor(y))
 
   def _forward_log_det_jacobian(self, x):
+    if self.hinge_softness is not None:
+      x /= math_ops.cast(self.hinge_softness, x.dtype)
     return -math_ops.reduce_sum(nn_ops.softplus(-x),
                                 axis=self._event_dims_tensor(x))
+
+  @property
+  def hinge_softness(self):
+    return self._hinge_softness
diff --git a/tensorflow/contrib/distributions/python/ops/binomial.py b/tensorflow/contrib/distributions/python/ops/binomial.py
index 78eb44acb2c361f947005d71c6388469bf94c083..ecf6a611565ab69a31c40060b46ee96af541e18c 100644
--- a/tensorflow/contrib/distributions/python/ops/binomial.py
+++ b/tensorflow/contrib/distributions/python/ops/binomial.py
@@ -17,8 +17,6 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.contrib.distributions.python.ops import distribution
-from tensorflow.contrib.distributions.python.ops import distribution_util
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
@@ -27,6 +25,8 @@ from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import check_ops
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import math_ops
+from tensorflow.python.ops.distributions import distribution
+from tensorflow.python.ops.distributions import util as distribution_util
 
 
 _binomial_sample_note = """
diff --git a/tensorflow/contrib/distributions/python/ops/chi2.py b/tensorflow/contrib/distributions/python/ops/chi2.py
index 45d3accdd6c0c77b698d38eb1e992f8fcce05741..bdd5571c966a74e58e4f9f8eed2628f131a1b92e 100644
--- a/tensorflow/contrib/distributions/python/ops/chi2.py
+++ b/tensorflow/contrib/distributions/python/ops/chi2.py
@@ -18,11 +18,11 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.contrib.distributions.python.ops import gamma
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import math_ops
+from tensorflow.python.ops.distributions import gamma
 
 
 __all__ = [
diff --git a/tensorflow/contrib/distributions/python/ops/conditional_distribution.py b/tensorflow/contrib/distributions/python/ops/conditional_distribution.py
index e3ca5c5468e64234b283876f369661ee0ffac8ee..ef25d4aedd6a2cd9a342bb5911f4f35fec7b3d74 100644
--- a/tensorflow/contrib/distributions/python/ops/conditional_distribution.py
+++ b/tensorflow/contrib/distributions/python/ops/conditional_distribution.py
@@ -18,8 +18,8 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.contrib.distributions.python.ops import distribution
-from tensorflow.contrib.distributions.python.ops import distribution_util
+from tensorflow.python.ops.distributions import distribution
+from tensorflow.python.ops.distributions import util as distribution_util
 
 
 class ConditionalDistribution(distribution.Distribution):
diff --git a/tensorflow/contrib/distributions/python/ops/conditional_transformed_distribution.py b/tensorflow/contrib/distributions/python/ops/conditional_transformed_distribution.py
index 06d0549c35329c5c3255c8cac65b3ed659657be6..2e1e68cf0587b69f055d8d747672d99383f75ed6 100644
--- a/tensorflow/contrib/distributions/python/ops/conditional_transformed_distribution.py
+++ b/tensorflow/contrib/distributions/python/ops/conditional_transformed_distribution.py
@@ -18,9 +18,9 @@ from __future__ import division
 from __future__ import print_function
 
 from tensorflow.contrib.distributions.python.ops import conditional_distribution
-from tensorflow.contrib.distributions.python.ops import distribution_util
-from tensorflow.contrib.distributions.python.ops import transformed_distribution
 from tensorflow.python.ops import math_ops
+from tensorflow.python.ops.distributions import transformed_distribution
+from tensorflow.python.ops.distributions import util as distribution_util
 
 
 # pylint: disable=protected-access
diff --git a/tensorflow/contrib/distributions/python/ops/deterministic.py b/tensorflow/contrib/distributions/python/ops/deterministic.py
index 6faa2728426d202a97ccd66273b406ad53f7e24c..850d08d1bd69ebc7661557d648e2bffe77e6a908 100644
--- a/tensorflow/contrib/distributions/python/ops/deterministic.py
+++ b/tensorflow/contrib/distributions/python/ops/deterministic.py
@@ -22,7 +22,6 @@ import abc
 
 import six
 
-from tensorflow.contrib.distributions.python.ops import distribution
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
@@ -32,6 +31,7 @@ from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import check_ops
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import math_ops
+from tensorflow.python.ops.distributions import distribution
 
 __all__ = [
     "Deterministic",
diff --git a/tensorflow/contrib/distributions/python/ops/distribution_util.py b/tensorflow/contrib/distributions/python/ops/distribution_util.py
index a0872677a96db7abe158e794f2beed9bbca37156..5e3b42dd2aa5e85fab23820fc63a69be77c3ac27 100644
--- a/tensorflow/contrib/distributions/python/ops/distribution_util.py
+++ b/tensorflow/contrib/distributions/python/ops/distribution_util.py
@@ -18,619 +18,15 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import functools
-import hashlib
-import math
-import numpy as np
-
 from tensorflow.contrib import linalg
-from tensorflow.python.framework import constant_op
-from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_shape
 from tensorflow.python.framework import tensor_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import check_ops
 from tensorflow.python.ops import control_flow_ops
-from tensorflow.python.ops import math_ops
-from tensorflow.python.ops import nn
-
-
-def assert_close(
-    x, y, data=None, summarize=None, message=None, name="assert_close"):
-  """Assert that that x and y are within machine epsilon of each other.
-
-  Args:
-    x: Floating-point `Tensor`
-    y: Floating-point `Tensor`
-    data: The tensors to print out if the condition is `False`. Defaults to
-      error message and first few entries of `x` and `y`.
-    summarize: Print this many entries of each tensor.
-    message: A string to prefix to the default message.
-    name: A name for this operation (optional).
-
-  Returns:
-    Op raising `InvalidArgumentError` if |x - y| > machine epsilon.
-  """
-  message = message or ""
-  x = ops.convert_to_tensor(x, name="x")
-  y = ops.convert_to_tensor(y, name="y")
-
-  if data is None:
-    data = [
-        message,
-        "Condition x ~= y did not hold element-wise: x = ", x.name, x, "y = ",
-        y.name, y
-    ]
-
-  if x.dtype.is_integer:
-    return check_ops.assert_equal(
-        x, y, data=data, summarize=summarize, message=message, name=name)
-
-  with ops.name_scope(name, "assert_close", [x, y, data]):
-    tol = np.finfo(x.dtype.as_numpy_dtype).eps
-    condition = math_ops.reduce_all(math_ops.less_equal(math_ops.abs(x-y), tol))
-    return control_flow_ops.Assert(
-        condition, data, summarize=summarize)
-
-
-def assert_integer_form(
-    x, data=None, summarize=None, message=None, name="assert_integer_form"):
-  """Assert that x has integer components (or floats equal to integers).
-
-  Args:
-    x: Floating-point `Tensor`
-    data: The tensors to print out if the condition is `False`. Defaults to
-      error message and first few entries of `x` and `y`.
-    summarize: Print this many entries of each tensor.
-    message: A string to prefix to the default message.
-    name: A name for this operation (optional).
-
-  Returns:
-    Op raising `InvalidArgumentError` if round(x) != x.
-  """
-
-  message = message or "x has non-integer components"
-  x = ops.convert_to_tensor(x, name="x")
-  casted_x = math_ops.to_int64(x)
-  return check_ops.assert_equal(
-      x, math_ops.cast(math_ops.round(casted_x), x.dtype),
-      data=data, summarize=summarize, message=message, name=name)
-
-
-def assert_symmetric(matrix):
-  matrix_t = array_ops.matrix_transpose(matrix)
-  return control_flow_ops.with_dependencies(
-      [check_ops.assert_equal(matrix, matrix_t)], matrix)
-
-
-def embed_check_nonnegative_discrete(x, check_integer=True):
-  """Assert x is a non-negative tensor, and optionally of integers."""
-  assertions = [check_ops.assert_non_negative(
-      x, message="x must be non-negative.")]
-  if check_integer:
-    assertions += [assert_integer_form(
-        x, message="x cannot contain fractional components.")]
-  return control_flow_ops.with_dependencies(assertions, x)
-
-
-def same_dynamic_shape(a, b):
-  """Returns whether a and b have the same dynamic shape.
-
-  Args:
-    a: `Tensor`
-    b: `Tensor`
-
-  Returns:
-    `bool` `Tensor` representing if both tensors have the same shape.
-  """
-  a = ops.convert_to_tensor(a, name="a")
-  b = ops.convert_to_tensor(b, name="b")
-
-  # Here we can't just do math_ops.equal(a.shape, b.shape), since
-  # static shape inference may break the equality comparison between
-  # shape(a) and shape(b) in math_ops.equal.
-  def all_shapes_equal():
-    return math_ops.reduce_all(math_ops.equal(
-        array_ops.concat([array_ops.shape(a), array_ops.shape(b)], 0),
-        array_ops.concat([array_ops.shape(b), array_ops.shape(a)], 0)))
-
-  # One of the shapes isn't fully defined, so we need to use the dynamic
-  # shape.
-  return control_flow_ops.cond(
-      math_ops.equal(array_ops.rank(a), array_ops.rank(b)),
-      all_shapes_equal,
-      lambda: constant_op.constant(False))
-
-
-def get_logits_and_probs(logits=None,
-                         probs=None,
-                         multidimensional=False,
-                         validate_args=False,
-                         name="get_logits_and_probs"):
-  """Converts logit to probabilities (or vice-versa), and returns both.
-
-  Args:
-    logits: Floating-point `Tensor` representing log-odds.
-    probs: Floating-point `Tensor` representing probabilities.
-    multidimensional: Python `bool`, default `False`.
-      If `True`, represents whether the last dimension of `logits` or `probs`,
-      a `[N1, N2, ...  k]` dimensional tensor, representing the
-      logit or probability of `shape[-1]` classes.
-    validate_args: Python `bool`, default `False`. When `True`, either assert
-      `0 <= probs <= 1` (if not `multidimensional`) or that the last dimension
-      of `probs` sums to one.
-    name: A name for this operation (optional).
-
-  Returns:
-    logits, probs: Tuple of `Tensor`s. If `probs` has an entry that is `0` or
-      `1`, then the corresponding entry in the returned logit will be `-Inf` and
-      `Inf` respectively.
-
-  Raises:
-    ValueError: if neither `probs` nor `logits` were passed in, or both were.
-  """
-  with ops.name_scope(name, values=[probs, logits]):
-    if (probs is None) == (logits is None):
-      raise ValueError("Must pass probs or logits, but not both.")
-
-    if probs is None:
-      logits = ops.convert_to_tensor(logits, name="logits")
-      if multidimensional:
-        return logits, nn.softmax(logits, name="probs")
-      return logits, math_ops.sigmoid(logits, name="probs")
-
-    probs = ops.convert_to_tensor(probs, name="probs")
-    if validate_args:
-      with ops.name_scope("validate_probs"):
-        one = constant_op.constant(1., probs.dtype)
-        dependencies = [check_ops.assert_non_negative(probs)]
-        if multidimensional:
-          dependencies += [assert_close(math_ops.reduce_sum(probs, -1), one,
-                                        message="probs does not sum to 1.")]
-        else:
-          dependencies += [check_ops.assert_less_equal(
-              probs, one, message="probs has components greater than 1.")]
-        probs = control_flow_ops.with_dependencies(dependencies, probs)
-
-    with ops.name_scope("logits"):
-      if multidimensional:
-        # Here we don't compute the multidimensional case, in a manner
-        # consistent with respect to the unidimensional case. We do so
-        # following the TF convention. Typically, you might expect to see
-        # logits = log(probs) - log(probs[pivot]). A side-effect of
-        # being consistent with the TF approach is that the unidimensional case
-        # implicitly handles the second dimension but the multidimensional case
-        # explicitly keeps the pivot dimension.
-        return math_ops.log(probs), probs
-      return math_ops.log(probs) - math_ops.log1p(-1. * probs), probs
-
-
-def log_combinations(n, counts, name="log_combinations"):
-  """Multinomial coefficient.
-
-  Given `n` and `counts`, where `counts` has last dimension `k`, we compute
-  the multinomial coefficient as:
-
-  ```n! / sum_i n_i!```
-
-  where `i` runs over all `k` classes.
-
-  Args:
-    n: Floating-point `Tensor` broadcastable with `counts`. This represents `n`
-      outcomes.
-    counts: Floating-point `Tensor` broadcastable with `n`. This represents
-      counts in `k` classes, where `k` is the last dimension of the tensor.
-    name: A name for this operation (optional).
-
-  Returns:
-    `Tensor` representing the multinomial coefficient between `n` and `counts`.
-  """
-  # First a bit about the number of ways counts could have come in:
-  # E.g. if counts = [1, 2], then this is 3 choose 2.
-  # In general, this is (sum counts)! / sum(counts!)
-  # The sum should be along the last dimension of counts. This is the
-  # "distribution" dimension. Here n a priori represents the sum of counts.
-  with ops.name_scope(name, values=[n, counts]):
-    n = ops.convert_to_tensor(n, name="n")
-    counts = ops.convert_to_tensor(counts, name="counts")
-    total_permutations = math_ops.lgamma(n + 1)
-    counts_factorial = math_ops.lgamma(counts + 1)
-    redundant_permutations = math_ops.reduce_sum(counts_factorial, axis=[-1])
-    return total_permutations - redundant_permutations
-
-
-def matrix_diag_transform(matrix, transform=None, name=None):
-  """Transform diagonal of [batch-]matrix, leave rest of matrix unchanged.
-
-  Create a trainable covariance defined by a Cholesky factor:
-
-  ```python
-  # Transform network layer into 2 x 2 array.
-  matrix_values = tf.contrib.layers.fully_connected(activations, 4)
-  matrix = tf.reshape(matrix_values, (batch_size, 2, 2))
-
-  # Make the diagonal positive. If the upper triangle was zero, this would be a
-  # valid Cholesky factor.
-  chol = matrix_diag_transform(matrix, transform=tf.nn.softplus)
-
-  # OperatorPDCholesky ignores the upper triangle.
-  operator = OperatorPDCholesky(chol)
-  ```
-
-  Example of heteroskedastic 2-D linear regression.
-
-  ```python
-  # Get a trainable Cholesky factor.
-  matrix_values = tf.contrib.layers.fully_connected(activations, 4)
-  matrix = tf.reshape(matrix_values, (batch_size, 2, 2))
-  chol = matrix_diag_transform(matrix, transform=tf.nn.softplus)
-
-  # Get a trainable mean.
-  mu = tf.contrib.layers.fully_connected(activations, 2)
-
-  # This is a fully trainable multivariate normal!
-  dist = tf.contrib.distributions.MVNCholesky(mu, chol)
-
-  # Standard log loss. Minimizing this will "train" mu and chol, and then dist
-  # will be a distribution predicting labels as multivariate Gaussians.
-  loss = -1 * tf.reduce_mean(dist.log_prob(labels))
-  ```
-
-  Args:
-    matrix:  Rank `R` `Tensor`, `R >= 2`, where the last two dimensions are
-      equal.
-    transform:  Element-wise function mapping `Tensors` to `Tensors`. To
-      be applied to the diagonal of `matrix`. If `None`, `matrix` is returned
-      unchanged. Defaults to `None`.
-    name:  A name to give created ops.
-      Defaults to "matrix_diag_transform".
-
-  Returns:
-    A `Tensor` with same shape and `dtype` as `matrix`.
-  """
-  with ops.name_scope(name, "matrix_diag_transform", [matrix]):
-    matrix = ops.convert_to_tensor(matrix, name="matrix")
-    if transform is None:
-      return matrix
-    # Replace the diag with transformed diag.
-    diag = array_ops.matrix_diag_part(matrix)
-    transformed_diag = transform(diag)
-    transformed_mat = array_ops.matrix_set_diag(matrix, transformed_diag)
-
-  return transformed_mat
-
-
-def rotate_transpose(x, shift, name="rotate_transpose"):
-  """Circularly moves dims left or right.
-
-  Effectively identical to:
-
-  ```python
-  numpy.transpose(x, numpy.roll(numpy.arange(len(x.shape)), shift))
-  ```
-
-  When `validate_args=False` additional graph-runtime checks are
-  performed. These checks entail moving data from to GPU to CPU.
-
-  Example:
-
-    ```python
-    x = ...  # Tensor of shape [1, 2, 3, 4].
-    rotate_transpose(x, -1)  # result shape: [2, 3, 4, 1]
-    rotate_transpose(x, -2)  # result shape: [3, 4, 1, 2]
-    rotate_transpose(x,  1)  # result shape: [4, 1, 2, 3]
-    rotate_transpose(x,  2)  # result shape: [3, 4, 1, 2]
-    rotate_transpose(x, 7) == rotate_transpose(x, 3)
-    rotate_transpose(x, -7) == rotate_transpose(x, -3)
-    ```
-
-  Args:
-    x: `Tensor`.
-    shift: `Tensor`. Number of dimensions to transpose left (shift<0) or
-      transpose right (shift>0).
-    name: Python `str`. The name to give this op.
-
-  Returns:
-    rotated_x: Input `Tensor` with dimensions circularly rotated by shift.
-
-  Raises:
-    TypeError: if shift is not integer type.
-  """
-  with ops.name_scope(name, values=[x, shift]):
-    x = ops.convert_to_tensor(x, name="x")
-    shift = ops.convert_to_tensor(shift, name="shift")
-    # We do not assign back to preserve constant-ness.
-    check_ops.assert_integer(shift)
-    shift_value_static = tensor_util.constant_value(shift)
-    ndims = x.get_shape().ndims
-    if ndims is not None and shift_value_static is not None:
-      if ndims < 2: return x
-      shift_value_static = np.sign(shift_value_static) * (
-          abs(shift_value_static) % ndims)
-      if shift_value_static == 0: return x
-      perm = np.roll(np.arange(ndims), shift_value_static)
-      return array_ops.transpose(x, perm=perm)
-    else:
-      # Consider if we always had a positive shift, and some specified
-      # direction.
-      # When shifting left we want the new array:
-      #   last(x, n-shift) + first(x, shift)
-      # and if shifting right then we want:
-      #   last(x, shift) + first(x, n-shift)
-      # Observe that last(a) == slice(a, n) and first(a) == slice(0, a).
-      # Also, we can encode direction and shift as one: direction * shift.
-      # Combining these facts, we have:
-      #   a = cond(shift<0, -shift, n-shift)
-      #   last(x, n-a) + first(x, a) == x[a:n] + x[0:a]
-      # Finally, we transform shift by modulo length so it can be specified
-      # independently from the array upon which it operates (like python).
-      ndims = array_ops.rank(x)
-      shift = array_ops.where(math_ops.less(shift, 0),
-                              math_ops.mod(-shift, ndims),
-                              ndims - math_ops.mod(shift, ndims))
-      first = math_ops.range(0, shift)
-      last = math_ops.range(shift, ndims)
-      perm = array_ops.concat([last, first], 0)
-      return array_ops.transpose(x, perm=perm)
-
-
-def pick_vector(cond,
-                true_vector,
-                false_vector,
-                name="pick_vector"):
-  """Picks possibly different length row `Tensor`s based on condition.
-
-  Value `Tensor`s should have exactly one dimension.
-
-  If `cond` is a python Boolean or `tf.constant` then either `true_vector` or
-  `false_vector` is immediately returned. I.e., no graph nodes are created and
-  no validation happens.
-
-  Args:
-    cond: `Tensor`. Must have `dtype=tf.bool` and be scalar.
-    true_vector: `Tensor` of one dimension. Returned when cond is `True`.
-    false_vector: `Tensor` of one dimension. Returned when cond is `False`.
-    name: Python `str`. The name to give this op.
-
-  Example:
-
-  ```python
-  pick_vector(tf.less(0, 5), tf.range(10, 12), tf.range(15, 18))
-  # result is tensor: [10, 11].
-  pick_vector(tf.less(5, 0), tf.range(10, 12), tf.range(15, 18))
-  # result is tensor: [15, 16, 17].
-  ```
-
-  Returns:
-    true_or_false_vector: `Tensor`.
-
-  Raises:
-    TypeError: if `cond.dtype != tf.bool`
-    TypeError: if `cond` is not a constant and
-      `true_vector.dtype != false_vector.dtype`
-  """
-  with ops.name_scope(name, values=(cond, true_vector, false_vector)):
-    cond = ops.convert_to_tensor(cond, name="cond")
-    if cond.dtype != dtypes.bool:
-      raise TypeError("%s.dtype=%s which is not %s" %
-                      (cond.name, cond.dtype, dtypes.bool))
-    cond_value_static = tensor_util.constant_value(cond)
-    if cond_value_static is not None:
-      return true_vector if cond_value_static else false_vector
-    true_vector = ops.convert_to_tensor(true_vector, name="true_vector")
-    false_vector = ops.convert_to_tensor(false_vector, name="false_vector")
-    if true_vector.dtype != false_vector.dtype:
-      raise TypeError(
-          "%s.dtype=%s does not match %s.dtype=%s"
-          % (true_vector.name, true_vector.dtype,
-             false_vector.name, false_vector.dtype))
-    n = array_ops.shape(true_vector)[0]
-    return array_ops.slice(
-        array_ops.concat([true_vector, false_vector], 0),
-        [array_ops.where(cond, 0, n)], [array_ops.where(cond, n, -1)])
-
-
-def gen_new_seed(seed, salt):
-  """Generate a new seed, from the given seed and salt."""
-  if seed is None:
-    return None
-  string = (str(seed) + salt).encode("utf-8")
-  return int(hashlib.md5(string).hexdigest()[:8], 16) & 0x7FFFFFFF
-
-
-def fill_lower_triangular(x, validate_args=False, name="fill_lower_triangular"):
-  """Creates a (batch of) lower triangular matrix from a vector of inputs.
-
-  If `x.get_shape()` is `[b1, b2, ..., bK, d]` then the output shape is `[b1,
-  b2, ..., bK, n, n]` where `n` is such that `d = n(n+1)/2`, i.e.,
-  `n = int(0.5 * (math.sqrt(1. + 8. * d) - 1.))`.
-
-  Although the non-batch complexity is O(n**2), large constants and sub-optimal
-  vectorization means the complexity of this function is 5x slower than zeroing
-  out the upper triangular, i.e., `tf.matrix_band_part(X, -1, 0)`. This
-  function becomes competitive only when several matmul/cholesky/etc ops can be
-  ellided in constructing the input. Example: wiring a fully connected layer as
-  a covariance matrix; this function reduces the final layer by 2x and possibly
-  reduces the network arch complexity considerably. In most cases it is better
-  to simply build a full matrix and zero out the upper triangular elements,
-  e.g., `tril = tf.matrix_band_part(full, -1, 0)`, rather than directly
-  construct a lower triangular.
-
-  Example:
-
-  ```python
-  fill_lower_triangular([1, 2, 3, 4, 5, 6])
-  # Returns: [[1, 0, 0],
-  #           [2, 3, 0],
-  #           [4, 5, 6]]
-  ```
-
-  For comparison, a pure numpy version of this function can be found in
-  `distribution_util_test.py`, function `_fill_lower_triangular`.
-
-  Args:
-    x: `Tensor` representing lower triangular elements.
-    validate_args: Python `bool`, default `False`. Whether to ensure the shape
-      of `x` can be mapped to a lower triangular matrix (controls non-static
-      checks only).
-    name: Python `str`. The name to give this op.
-
-  Returns:
-    tril: `Tensor` with lower triangular elements filled from `x`.
-
-  Raises:
-    ValueError: if shape if `x` has static shape which cannot be mapped to a
-      lower triangular matrix.
-  """
-  # TODO(jvdillon): Replace this code with dedicated op when it exists.
-  with ops.name_scope(name, values=[x]):
-    x = ops.convert_to_tensor(x, name="x")
-    if (x.get_shape().ndims is not None and
-        x.get_shape()[-1].value is not None):
-      d = x.get_shape()[-1].value
-      # d = n(n+1)/2 implies n is:
-      n = int(0.5 * (math.sqrt(1. + 8. * d) - 1.))
-      d_inferred = n * (n + 1) /2
-      if d != d_inferred:
-        raise ValueError("Input cannot be mapped to a lower triangular; "
-                         "n*(n+1)/2 = %d != %d" % (d_inferred, d))
-      final_shape = x.get_shape()[:-1].concatenate(
-          tensor_shape.TensorShape([n, n]))
-    else:
-      d = math_ops.cast(array_ops.shape(x)[-1], dtype=dtypes.float32)
-      # d = n(n+1)/2 implies n is:
-      n = math_ops.cast(0.5 * (dtypes.sqrt(1. + 8. * d) - 1.),
-                        dtype=dtypes.int32)
-      if validate_args:
-        is_valid_input_shape = check_ops.assert_equal(
-            n * (n + 1) / 2, d,
-            message="Input cannot be mapped to a lower triangular.")
-        n = control_flow_ops.with_dependencies([is_valid_input_shape], n)
-      final_shape = x.get_shape()[:-1].concatenate(
-          tensor_shape.TensorShape([None, None]))
-
-    def tril_ids(n):
-      """Internal helper to create vector of linear indices into y."""
-      # Build the ids statically; chose 512 because it implies 1MiB.
-      if not tensor_util.is_tensor(n) and n <= 512:
-        ids = np.arange(n**2, dtype=np.int32)
-        rows = (ids / n).astype(np.int32)  # Implicit floor.
-        # We need to stop incrementing the index when we encounter
-        # upper-triangular elements. The idea here is to compute the
-        # lower-right number of zeros then by "symmetry" subtract this from the
-        # total number of zeros, n(n-1)/2.
-        # Then we note that: n(n-1)/2 - (n-r)*(n-r-1)/2 = r(2n-r-1)/2
-        offset = (rows * (2 * n - rows - 1) / 2).astype(np.int32)
-        # We could also zero out when (rows < cols) == (rows < ids-n*rows).
-        # mask = (ids <= (n + 1) * rows).astype(np.int32)
-      else:
-        ids = math_ops.range(n**2)
-        rows = math_ops.cast(ids / n, dtype=dtypes.int32)
-        offset = math_ops.cast(rows * (2 * n - rows - 1) / 2,
-                               dtype=dtypes.int32)
-      return ids - offset
-
-    # Special-case non-batch case.
-    if x.get_shape().ndims == 1:
-      y = array_ops.gather(x, array_ops.reshape(tril_ids(n), [n, n]))
-      y = array_ops.matrix_band_part(y, -1, 0)
-      y.set_shape(y.get_shape().merge_with(final_shape))
-      return y
-
-    # Make ids for each batch dim.
-    if (x.get_shape().ndims is not None and
-        x.get_shape()[:-1].is_fully_defined()):
-      batch_shape = np.asarray(x.get_shape()[:-1].as_list(), dtype=np.int32)
-      m = np.prod(batch_shape).astype(np.int32)
-    else:
-      batch_shape = array_ops.shape(x)[:-1]
-      m = array_ops.reduce_prod(array_ops.shape(x)[:-1])
-    batch_ids = math_ops.range(m)
-
-    # Assemble the tril_ids into batch,tril_id pairs.
-    idx = array_ops.stack([
-        array_ops.tile(array_ops.expand_dims(batch_ids, 1), [1, n * n]),
-        array_ops.tile(array_ops.expand_dims(tril_ids(n), 0), [m, 1])
-    ])
-    idx = array_ops.transpose(idx, [1, 2, 0])
-
-    # Gather up, reshape, and return.
-    y = array_ops.reshape(x, [-1, d])
-    y = array_ops.gather_nd(y, idx)
-    y = array_ops.reshape(y, array_ops.concat([batch_shape, [n, n]], 0))
-    y = array_ops.matrix_band_part(y, -1, 0)
-    y.set_shape(y.get_shape().merge_with(final_shape))
-    return y
-
-
-# TODO(jvdillon): Merge this test back into:
-# tensorflow/python/ops/softplus_op_test.py
-# once TF core is accepting new ops.
-def softplus_inverse(x, name=None):
-  """Computes the inverse softplus, i.e., x = softplus_inverse(softplus(x)).
-
-  Mathematically this op is equivalent to:
-
-  ```none
-  softplus_inverse = log(exp(x) - 1.)
-  ```
-
-  Args:
-    x: `Tensor`. Non-negative (not enforced), floating-point.
-    name: A name for the operation (optional).
-
-  Returns:
-    `Tensor`. Has the same type/shape as input `x`.
-  """
-  with ops.name_scope(name, "softplus_inverse", values=[x]):
-    x = ops.convert_to_tensor(x, name="x")
-    # We begin by deriving a more numerically stable softplus_inverse:
-    # x = softplus(y) = Log[1 + exp{y}], (which means x > 0).
-    # ==> exp{x} = 1 + exp{y}                                (1)
-    # ==> y = Log[exp{x} - 1]                                (2)
-    #       = Log[(exp{x} - 1) / exp{x}] + Log[exp{x}]
-    #       = Log[(1 - exp{-x}) / 1] + Log[exp{x}]
-    #       = Log[1 - exp{-x}] + x                           (3)
-    # (2) is the "obvious" inverse, but (3) is more stable than (2) for large x.
-    # For small x (e.g. x = 1e-10), (3) will become -inf since 1 - exp{-x} will
-    # be zero. To fix this, we use 1 - exp{-x} approx x for small x > 0.
-    #
-    # In addition to the numerically stable derivation above, we clamp
-    # small/large values to be congruent with the logic in:
-    # tensorflow/core/kernels/softplus_op.h
-    #
-    # Finally, we set the input to one whenever the input is too large or too
-    # small. This ensures that no unchosen codepath is +/- inf. This is
-    # necessary to ensure the gradient doesn't get NaNs. Recall that the
-    # gradient of `where` behaves like `pred*pred_true + (1-pred)*pred_false`
-    # thus an `inf` in an unselected path results in `0*inf=nan`. We are careful
-    # to overwrite `x` with ones only when we will never actually use this
-    # value. Note that we use ones and not zeros since `log(expm1(0.)) = -inf`.
-    threshold = np.log(np.finfo(x.dtype.as_numpy_dtype).eps) + 2.
-    is_too_small = math_ops.less(x, np.exp(threshold))
-    is_too_large = math_ops.greater(x, -threshold)
-    too_small_value = math_ops.log(x)
-    too_large_value = x
-    # This `where` will ultimately be a NOP because we won't select this
-    # codepath whenever we used the surrogate `ones_like`.
-    x = array_ops.where(math_ops.logical_or(is_too_small, is_too_large),
-                        array_ops.ones_like(x), x)
-    y = x + math_ops.log(-math_ops.expm1(-x))  # == log(expm1(x))
-    return array_ops.where(is_too_small, too_small_value,
-                           array_ops.where(is_too_large, too_large_value, y))
-
-
-# TODO(b/35290280): Add unit-tests.
-def dimension_size(x, axis):
-  """Returns the size of a specific dimension."""
-  # Since tf.gather isn't "constant-in, constant-out", we must first check the
-  # static shape or fallback to dynamic shape.
-  num_rows = (None if x.get_shape().ndims is None
-              else x.get_shape()[axis].value)
-  if num_rows is not None:
-    return num_rows
-  return array_ops.shape(x)[axis]
+from tensorflow.python.ops.distributions import util
+from tensorflow.python.ops.distributions.util import *  # pylint: disable=wildcard-import
 
 
 # TODO(b/35290280): Add unit-tests.
@@ -648,13 +44,11 @@ def make_diag_scale(loc, scale_diag, scale_identity_multiplier,
           check_ops.assert_positive(
               x, message="diagonal part must be positive"),
       ], x)
-    # TODO(b/35157376): Use `assert_none_equal` once it exists.
     return control_flow_ops.with_dependencies([
-        check_ops.assert_greater(
-            math_ops.abs(x),
+        check_ops.assert_none_equal(
+            x,
             array_ops.zeros([], x.dtype),
-            message="diagonal part must be non-zero"),
-    ], x)
+            message="diagonal part must be non-zero")], x)
 
   with ops.name_scope(name, "make_diag_scale",
                       values=[loc, scale_diag, scale_identity_multiplier]):
@@ -678,7 +72,7 @@ def make_diag_scale(loc, scale_diag, scale_identity_multiplier,
       raise ValueError(
           "Cannot infer `event_shape` unless `loc` is specified.")
 
-    num_rows = dimension_size(loc, -1)
+    num_rows = util.dimension_size(loc, -1)
 
     if scale_identity_multiplier is None:
       return linalg.LinearOperatorIdentity(
@@ -697,62 +91,108 @@ def make_diag_scale(loc, scale_diag, scale_identity_multiplier,
         assert_proper_shapes=validate_args)
 
 
-class AppendDocstring(object):
-  """Helper class to promote private subclass docstring to public counterpart.
+def shapes_from_loc_and_scale(loc, scale, name="shapes_from_loc_and_scale"):
+  """Infer distribution batch and event shapes from a location and scale.
 
-  Example:
+  Location and scale family distributions determine their batch/event shape by
+  broadcasting the `loc` and `scale` args.  This helper does that broadcast,
+  statically if possible.
 
-  ```python
-  class TransformedDistribution(Distribution):
-    @distribution_util.AppendDocstring(
-      additional_note="A special note!",
-      kwargs_dict={"foo": "An extra arg."})
-    def _prob(self, y, foo=None):
-      pass
-  ```
+  Batch shape broadcasts as per the normal rules.
+  We allow the `loc` event shape to broadcast up to that of `scale`.  We do not
+  allow `scale`'s event shape to change.  Therefore, the last dimension of `loc`
+  must either be size `1`, or the same as `scale.range_dimension`.
 
-  In this case, the `AppendDocstring` decorator appends the `additional_note` to
-  the docstring of `prob` (not `_prob`) and adds a new `kwargs`
-  section with each dictionary item as a bullet-point.
+  See `MultivariateNormalLinearOperator` for a usage example.
 
-  For a more detailed example, see `TransformedDistribution`.
-  """
+  Args:
+    loc:  `N-D` `Tensor` with `N >= 1` (already converted to tensor) or `None`.
+      If `None`, both batch and event shape are determined by `scale`.
+    scale:  A `LinearOperator` instance.
+    name:  A string name to prepend to created ops.
 
-  def __init__(self, additional_note="", kwargs_dict=None):
-    """Initializes the AppendDocstring object.
+  Returns:
+    batch_shape:  `TensorShape` (if broadcast is done statically), or `Tensor`.
+    event_shape:  `TensorShape` (if broadcast is done statically), or `Tensor`.
 
-    Args:
-      additional_note: Python string added as additional docstring to public
-        version of function.
-      kwargs_dict: Python string/string dictionary representing
-        specific kwargs expanded from the **kwargs input.
+  Raises:
+    ValueError:  If the last dimension of `loc` is determined statically to be
+      different than the range of `scale`.
+  """
+  with ops.name_scope(name, values=[loc] + scale.graph_parents):
+    # Get event shape.
+    event_size = scale.range_dimension_tensor()
+    event_size_const = tensor_util.constant_value(event_size)
+    if event_size_const is not None:
+      event_shape = event_size_const.reshape([1])
+    else:
+      event_shape = event_size[array_ops.newaxis]
 
-    Raises:
-      ValueError: if kwargs_dict.key contains whitespace.
-      ValueError: if kwargs_dict.value contains newlines.
-    """
-    self._additional_note = additional_note
-    if kwargs_dict:
-      bullets = []
-      for key in sorted(kwargs_dict.keys()):
-        value = kwargs_dict[key]
-        if any(x.isspace() for x in key):
+    # Static check that event shapes match.
+    if loc is not None:
+      loc_event_size = loc.get_shape()[-1].value
+      if loc_event_size is not None and event_size_const is not None:
+        if loc_event_size != 1 and loc_event_size != event_size_const:
           raise ValueError(
-              "Parameter name \"%s\" contains whitespace." % key)
-        value = value.lstrip()
-        if "\n" in value:
-          raise ValueError(
-              "Parameter description for \"%s\" contains newlines." % key)
-        bullets.append("*  `%s`: %s" % (key, value))
-      self._additional_note += ("\n\n##### `kwargs`:\n\n" +
-                                "\n".join(bullets))
+              "Event size of 'scale' (%d) could not be broadcast up to that of "
+              "'loc' (%d)." % (loc_event_size, event_size_const))
 
-  def __call__(self, fn):
-    @functools.wraps(fn)
-    def _fn(*args, **kwargs):
-      return fn(*args, **kwargs)
-    if _fn.__doc__ is None:
-      _fn.__doc__ = self._additional_note
+    # Get batch shape.
+    batch_shape = scale.batch_shape_tensor()
+    if loc is None:
+      batch_shape_const = tensor_util.constant_value(batch_shape)
+      batch_shape = (
+          batch_shape_const if batch_shape_const is not None else batch_shape)
     else:
-      _fn.__doc__ += "\n%s" % self._additional_note
-    return _fn
+      loc_batch_shape = loc.get_shape().with_rank_at_least(1)[:-1]
+      if (loc.get_shape().ndims is None or
+          not loc_batch_shape.is_fully_defined()):
+        loc_batch_shape = array_ops.shape(loc)[:-1]
+      else:
+        loc_batch_shape = ops.convert_to_tensor(loc_batch_shape,
+                                                name="loc_batch_shape")
+      batch_shape = prefer_static_broadcast_shape(batch_shape, loc_batch_shape)
+
+  return batch_shape, event_shape
+
+
+def prefer_static_broadcast_shape(
+    shape1, shape2, name="prefer_static_broadcast_shape"):
+  """Convenience function which statically broadcasts shape when possible.
+
+  Args:
+    shape1:  `1-D` integer `Tensor`.  Already converted to tensor!
+    shape2:  `1-D` integer `Tensor`.  Already converted to tensor!
+    name:  A string name to prepend to created ops.
+
+  Returns:
+    The broadcast shape, either as `TensorShape` (if broadcast can be done
+      statically), or as a `Tensor`.
+  """
+  with ops.name_scope(name, values=[shape1, shape2]):
+    if (tensor_util.constant_value(shape1) is not None and
+        tensor_util.constant_value(shape2) is not None):
+      return array_ops.broadcast_static_shape(
+          tensor_shape.TensorShape(tensor_util.constant_value(shape1)),
+          tensor_shape.TensorShape(tensor_util.constant_value(shape2)))
+    return array_ops.broadcast_dynamic_shape(shape1, shape2)
+
+
+def is_diagonal_scale(scale):
+  """Returns `True` if `scale` is a `LinearOperator` that is known to be diag.
+
+  Args:
+    scale:  `LinearOperator` instance.
+
+  Returns:
+    Python `bool`.
+
+  Raises:
+    TypeError:  If `scale` is not a `LinearOperator`.
+  """
+  if not isinstance(scale, linalg.LinearOperator):
+    raise TypeError("Expected argument 'scale' to be instance of LinearOperator"
+                    ". Found: %s" % scale)
+  return (isinstance(scale, linalg.LinearOperatorIdentity) or
+          isinstance(scale, linalg.LinearOperatorScaledIdentity) or
+          isinstance(scale, linalg.LinearOperatorDiag))
diff --git a/tensorflow/contrib/distributions/python/ops/geometric.py b/tensorflow/contrib/distributions/python/ops/geometric.py
index aa7aef6681f16eb8220ebecccd4208ff4328ca4a..918200830c35536e110b9a2ce4fdf35e55caac18 100644
--- a/tensorflow/contrib/distributions/python/ops/geometric.py
+++ b/tensorflow/contrib/distributions/python/ops/geometric.py
@@ -19,8 +19,6 @@ from __future__ import print_function
 
 import numpy as np
 
-from tensorflow.contrib.distributions.python.ops import distribution
-from tensorflow.contrib.distributions.python.ops import distribution_util
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
@@ -31,6 +29,8 @@ from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import nn
 from tensorflow.python.ops import random_ops
+from tensorflow.python.ops.distributions import distribution
+from tensorflow.python.ops.distributions import util as distribution_util
 
 
 class Geometric(distribution.Distribution):
diff --git a/tensorflow/contrib/distributions/python/ops/gumbel.py b/tensorflow/contrib/distributions/python/ops/gumbel.py
index f99a6674e573ef07a8303d5ae474006da9e4d182..ba8d3c639b397422f0f6210ba9f48650f0da1e3e 100644
--- a/tensorflow/contrib/distributions/python/ops/gumbel.py
+++ b/tensorflow/contrib/distributions/python/ops/gumbel.py
@@ -20,7 +20,6 @@ from __future__ import print_function
 
 import math
 import numpy as np
-from tensorflow.contrib.distributions.python.ops import distribution
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
@@ -29,6 +28,7 @@ from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import check_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import random_ops
+from tensorflow.python.ops.distributions import distribution
 
 
 class _Gumbel(distribution.Distribution):
diff --git a/tensorflow/contrib/distributions/python/ops/inverse_gamma.py b/tensorflow/contrib/distributions/python/ops/inverse_gamma.py
index 7b91b00c2cddf13ebee1be6cf6beac68b9df09f4..956dee38a378813434656a28a69c89b6ec1e8b72 100644
--- a/tensorflow/contrib/distributions/python/ops/inverse_gamma.py
+++ b/tensorflow/contrib/distributions/python/ops/inverse_gamma.py
@@ -20,8 +20,6 @@ from __future__ import print_function
 
 import numpy as np
 
-from tensorflow.contrib.distributions.python.ops import distribution
-from tensorflow.contrib.distributions.python.ops import distribution_util
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
@@ -32,6 +30,8 @@ from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import nn
 from tensorflow.python.ops import random_ops
+from tensorflow.python.ops.distributions import distribution
+from tensorflow.python.ops.distributions import util as distribution_util
 
 
 __all__ = [
diff --git a/tensorflow/contrib/distributions/python/ops/logistic.py b/tensorflow/contrib/distributions/python/ops/logistic.py
index 6af16041c08b242c69841742c0f2b4dcae908a00..ce1a459cae9f409d4f7aeed1508eefe547863fae 100644
--- a/tensorflow/contrib/distributions/python/ops/logistic.py
+++ b/tensorflow/contrib/distributions/python/ops/logistic.py
@@ -21,7 +21,6 @@ from __future__ import print_function
 import math
 import numpy as np
 
-from tensorflow.contrib.distributions.python.ops import distribution
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
@@ -31,6 +30,7 @@ from tensorflow.python.ops import check_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import nn_ops
 from tensorflow.python.ops import random_ops
+from tensorflow.python.ops.distributions import distribution
 
 
 class Logistic(distribution.Distribution):
diff --git a/tensorflow/contrib/distributions/python/ops/mixture.py b/tensorflow/contrib/distributions/python/ops/mixture.py
index 3e29ce75003d818e28a0552750e0660bc848e127..f3b09f60f3e906daf073eacb90834920f506bb96 100644
--- a/tensorflow/contrib/distributions/python/ops/mixture.py
+++ b/tensorflow/contrib/distributions/python/ops/mixture.py
@@ -20,9 +20,6 @@ from __future__ import print_function
 
 import numpy as np
 
-from tensorflow.contrib.distributions.python.ops import categorical
-from tensorflow.contrib.distributions.python.ops import distribution
-from tensorflow.contrib.distributions.python.ops import distribution_util
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_shape
 from tensorflow.python.framework import tensor_util
@@ -31,6 +28,9 @@ from tensorflow.python.ops import check_ops
 from tensorflow.python.ops import data_flow_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import nn_ops
+from tensorflow.python.ops.distributions import categorical
+from tensorflow.python.ops.distributions import distribution
+from tensorflow.python.ops.distributions import util as distribution_util
 
 
 class Mixture(distribution.Distribution):
diff --git a/tensorflow/contrib/distributions/python/ops/mvn_diag.py b/tensorflow/contrib/distributions/python/ops/mvn_diag.py
index 946c1dc16b5fd69d514e8435a5d3efa94ead8189..163cf75d990d5fe7ec1e3aaf0040fc71f61774a7 100644
--- a/tensorflow/contrib/distributions/python/ops/mvn_diag.py
+++ b/tensorflow/contrib/distributions/python/ops/mvn_diag.py
@@ -146,8 +146,8 @@ class MultivariateNormalDiag(
     The `batch_shape` is the broadcast shape between `loc` and `scale`
     arguments.
 
-    The `event_shape` is given by the last dimension of `loc` or the last
-    dimension of the matrix implied by `scale`.
+    The `event_shape` is given by last dimension of the matrix implied by
+    `scale`. The last dimension of `loc` (if provided) must broadcast with this.
 
     Recall that `covariance = scale @ scale.T`. A (non-batch) `scale` matrix is:
 
@@ -197,11 +197,14 @@ class MultivariateNormalDiag(
     with ops.name_scope(name):
       with ops.name_scope("init", values=[
           loc, scale_diag, scale_identity_multiplier]):
+        # No need to validate_args while making diag_scale.  The returned
+        # LinearOperatorDiag has an assert_non_singular method that is called by
+        # the Bijector.
         scale = distribution_util.make_diag_scale(
             loc=loc,
             scale_diag=scale_diag,
             scale_identity_multiplier=scale_identity_multiplier,
-            validate_args=validate_args,
+            validate_args=False,
             assert_positive=False)
     super(MultivariateNormalDiag, self).__init__(
         loc=loc,
diff --git a/tensorflow/contrib/distributions/python/ops/mvn_diag_plus_low_rank.py b/tensorflow/contrib/distributions/python/ops/mvn_diag_plus_low_rank.py
index 1d4132cfd4ba58ae486fcc785244f70544985425..ee3e02e0203a3338b7e6a40b7e3ff30c0a0940f0 100644
--- a/tensorflow/contrib/distributions/python/ops/mvn_diag_plus_low_rank.py
+++ b/tensorflow/contrib/distributions/python/ops/mvn_diag_plus_low_rank.py
@@ -155,8 +155,8 @@ class MultivariateNormalDiagPlusLowRank(
     The `batch_shape` is the broadcast shape between `loc` and `scale`
     arguments.
 
-    The `event_shape` is given by the last dimension of `loc` or the last
-    dimension of the matrix implied by `scale`.
+    The `event_shape` is given by last dimension of the matrix implied by
+    `scale`. The last dimension of `loc` (if provided) must broadcast with this.
 
     Recall that `covariance = scale @ scale.T`. A (non-batch) `scale` matrix is:
 
diff --git a/tensorflow/contrib/distributions/python/ops/mvn_full_covariance.py b/tensorflow/contrib/distributions/python/ops/mvn_full_covariance.py
new file mode 100644
index 0000000000000000000000000000000000000000..84809d8dc45dcafbdfa5e8771355d712812706e7
--- /dev/null
+++ b/tensorflow/contrib/distributions/python/ops/mvn_full_covariance.py
@@ -0,0 +1,187 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Multivariate Normal distribution class initialized with a full covariance."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.contrib.distributions.python.ops import mvn_tril
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import check_ops
+from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import linalg_ops
+
+
+__all__ = [
+    "MultivariateNormalFullCovariance",
+]
+
+
+class MultivariateNormalFullCovariance(mvn_tril.MultivariateNormalTriL):
+  """The multivariate normal distribution on `R^k`.
+
+  The Multivariate Normal distribution is defined over `R^k` and parameterized
+  by a (batch of) length-`k` `loc` vector (aka "mu") and a (batch of) `k x k`
+  `covariance_matrix` matrices that are the covariance.
+  This is different than the other multivariate normals, which are parameterized
+  by a matrix more akin to the standard deviation.
+
+  #### Mathematical Details
+
+  The probability density function (pdf) is, with `@` as matrix multiplication,
+
+  ```none
+  pdf(x; loc, covariance_matrix) = exp(-0.5 ||y||**2) / Z,
+  y = (x - loc)^T @ inv(covariance_matrix) @ (x - loc)
+  Z = (2 pi)**(0.5 k) |det(covariance_matrix)|**(0.5).
+  ```
+
+  where:
+
+  * `loc` is a vector in `R^k`,
+  * `covariance_matrix` is an `R^{k x k}` symmetric positive definite matrix,
+  * `Z` denotes the normalization constant, and,
+  * `||y||**2` denotes the squared Euclidean norm of `y`.
+
+  Additional leading dimensions (if any) in `loc` and `covariance_matrix` allow
+  for batch dimensions.
+
+  The MultivariateNormal distribution is a member of the [location-scale
+  family](https://en.wikipedia.org/wiki/Location-scale_family), i.e., it can be
+  constructed e.g. as,
+
+  ```none
+  X ~ MultivariateNormal(loc=0, scale=1)   # Identity scale, zero shift.
+  scale = Cholesky(covariance_matrix)
+  Y = scale @ X + loc
+  ```
+
+  #### Examples
+
+  ```python
+  ds = tf.contrib.distributions
+
+  # Initialize a single 3-variate Gaussian.
+  mu = [1., 2, 3]
+  cov = [[ 0.36,  0.12,  0.06],
+         [ 0.12,  0.29, -0.13],
+         [ 0.06, -0.13,  0.26]]
+  mvn = ds.MultivariateNormalFullCovariance(
+      loc=mu,
+      covariance_matrix=cov)
+
+  mvn.mean().eval()
+  # ==> [1., 2, 3]
+
+  # Covariance agrees with covariance_matrix.
+  mvn.covariance().eval()
+  # ==> [[ 0.36,  0.12,  0.06],
+  #      [ 0.12,  0.29, -0.13],
+  #      [ 0.06, -0.13,  0.26]]
+
+  # Compute the pdf of an observation in `R^3` ; return a scalar.
+  mvn.prob([-1., 0, 1]).eval()  # shape: []
+
+  # Initialize a 2-batch of 3-variate Gaussians.
+  mu = [[1., 2, 3],
+        [11, 22, 33]]              # shape: [2, 3]
+  covariance_matrix = ...  # shape: [2, 3, 3], symmetric, positive definite.
+  mvn = ds.MultivariateNormalFullCovariance(
+      loc=mu,
+      covariance=covariance_matrix)
+
+  # Compute the pdf of two `R^3` observations; return a length-2 vector.
+  x = [[-0.9, 0, 0.1],
+       [-10, 0, 9]]     # shape: [2, 3]
+  mvn.prob(x).eval()    # shape: [2]
+
+  ```
+
+  """
+
+  def __init__(self,
+               loc=None,
+               covariance_matrix=None,
+               validate_args=False,
+               allow_nan_stats=True,
+               name="MultivariateNormalFullCovariance"):
+    """Construct Multivariate Normal distribution on `R^k`.
+
+    The `batch_shape` is the broadcast shape between `loc` and
+    `covariance_matrix` arguments.
+
+    The `event_shape` is given by last dimension of the matrix implied by
+    `covariance_matrix`. The last dimension of `loc` (if provided) must
+    broadcast with this.
+
+    A non-batch `covariance_matrix` matrix is a `k x k` symmetric positive
+    definite matrix.  In other words it is (real) symmetric with all eigenvalues
+    strictly positive.
+
+    Additional leading dimensions (if any) will index batches.
+
+    Args:
+      loc: Floating-point `Tensor`. If this is set to `None`, `loc` is
+        implicitly `0`. When specified, may have shape `[B1, ..., Bb, k]` where
+        `b >= 0` and `k` is the event size.
+      covariance_matrix: Floating-point, symmetric positive definite `Tensor` of
+        same `dtype` as `loc`.  The strict upper triangle of `covariance_matrix`
+        is ignored, so if `covariance_matrix` is not symmetric no error will be
+        raised (unless `validate_args is True`).  `covariance_matrix` has shape
+        `[B1, ..., Bb, k, k]` where `b >= 0` and `k` is the event size.
+      validate_args: Python `bool`, default `False`. When `True` distribution
+        parameters are checked for validity despite possibly degrading runtime
+        performance. When `False` invalid inputs may silently render incorrect
+        outputs.
+      allow_nan_stats: Python `bool`, default `True`. When `True`,
+        statistics (e.g., mean, mode, variance) use the value "`NaN`" to
+        indicate the result is undefined. When `False`, an exception is raised
+        if one or more of the statistic's batch members are undefined.
+      name: Python `str` name prefixed to Ops created by this class.
+
+    Raises:
+      ValueError: if neither `loc` nor `covariance_matrix` are specified.
+    """
+    parameters = locals()
+
+    # Convert the covariance_matrix up to a scale_tril and call MVNTriL.
+    with ops.name_scope(name):
+      with ops.name_scope("init", values=[loc, covariance_matrix]):
+        if covariance_matrix is None:
+          scale_tril = None
+        else:
+          covariance_matrix = ops.convert_to_tensor(
+              covariance_matrix, name="covariance_matrix")
+          if validate_args:
+            assert_symmetric = check_ops.assert_equal(
+                covariance_matrix,
+                array_ops.matrix_transpose(covariance_matrix),
+                message="Matrix was not symmetric.")
+            covariance_matrix = control_flow_ops.with_dependencies(
+                [assert_symmetric], covariance_matrix)
+          # No need to validate that covariance_matrix is non-singular.
+          # LinearOperatorTriL has an assert_non_singular method that is called
+          # by the Bijector.
+          # However, cholesky() ignores the upper triangular part, so we do need
+          # to separately assert symmetric.
+          scale_tril = linalg_ops.cholesky(covariance_matrix)
+        super(MultivariateNormalFullCovariance, self).__init__(
+            loc=loc,
+            scale_tril=scale_tril,
+            validate_args=validate_args,
+            allow_nan_stats=allow_nan_stats)
+    self._parameters = parameters
diff --git a/tensorflow/contrib/distributions/python/ops/mvn_linear_operator.py b/tensorflow/contrib/distributions/python/ops/mvn_linear_operator.py
index 3bb6bb4af2a8c919dba6f02a6f37998db9115a85..b25250d3671ff68a8362c7f2eaa8f586900f27e2 100644
--- a/tensorflow/contrib/distributions/python/ops/mvn_linear_operator.py
+++ b/tensorflow/contrib/distributions/python/ops/mvn_linear_operator.py
@@ -21,15 +21,13 @@ from __future__ import print_function
 from tensorflow.contrib import linalg
 from tensorflow.contrib.distributions.python.ops import bijectors
 from tensorflow.contrib.distributions.python.ops import distribution_util
-from tensorflow.contrib.distributions.python.ops import kullback_leibler
-from tensorflow.contrib.distributions.python.ops import normal
-from tensorflow.contrib.distributions.python.ops import transformed_distribution
 from tensorflow.python.framework import ops
-from tensorflow.python.framework import tensor_shape
-from tensorflow.python.framework import tensor_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import linalg_ops
 from tensorflow.python.ops import math_ops
+from tensorflow.python.ops.distributions import kullback_leibler
+from tensorflow.python.ops.distributions import normal
+from tensorflow.python.ops.distributions import transformed_distribution
 
 
 __all__ = [
@@ -54,16 +52,6 @@ or
 """
 
 
-def _broadcast_shape(shape1, shape2):
-  """Convenience function which statically broadcasts shape when possible."""
-  if (tensor_util.constant_value(shape1) is not None and
-      tensor_util.constant_value(shape2) is not None):
-    return array_ops.broadcast_static_shape(
-        tensor_shape.TensorShape(tensor_util.constant_value(shape1)),
-        tensor_shape.TensorShape(tensor_util.constant_value(shape2)))
-  return array_ops.broadcast_dynamic_shape(shape1, shape2)
-
-
 # TODO(b/35290280): Import in `../../__init__.py` after adding unit-tests.
 class MultivariateNormalLinearOperator(
     transformed_distribution.TransformedDistribution):
@@ -71,7 +59,7 @@ class MultivariateNormalLinearOperator(
 
   The Multivariate Normal distribution is defined over `R^k` and parameterized
   by a (batch of) length-`k` `loc` vector (aka "mu") and a (batch of) `k x k`
-  `scale` matrix; `covariance = scale @ scale.T` where `@` denotes
+  `scale` matrix; `covariance = scale @ scale.T`, where `@` denotes
   matrix-multiplication.
 
   #### Mathematical Details
@@ -158,8 +146,8 @@ class MultivariateNormalLinearOperator(
     The `batch_shape` is the broadcast shape between `loc` and `scale`
     arguments.
 
-    The `event_shape` is given by the last dimension of `loc` or the last
-    dimension of the matrix implied by `scale`.
+    The `event_shape` is given by last dimension of the matrix implied by
+    `scale`. The last dimension of `loc` (if provided) must broadcast with this.
 
     Recall that `covariance = scale @ scale.T`.
 
@@ -193,22 +181,9 @@ class MultivariateNormalLinearOperator(
     with ops.name_scope(name, values=[loc] + scale.graph_parents):
       # Since expand_dims doesn't preserve constant-ness, we obtain the
       # non-dynamic value if possible.
-      event_shape = scale.range_dimension_tensor()
-      if tensor_util.constant_value(event_shape) is not None:
-        event_shape = tensor_util.constant_value(event_shape).reshape([1])
-      else:
-        event_shape = event_shape[array_ops.newaxis]
-      batch_shape = scale.batch_shape_tensor()
-      if loc is not None:
-        loc = ops.convert_to_tensor(loc, name="loc")
-        loc_batch_shape = loc.get_shape().with_rank_at_least(1)[:-1]
-        if (loc.get_shape().ndims is None or
-            not loc_batch_shape.is_fully_defined()):
-          loc_batch_shape = array_ops.shape(loc)[:-1]
-        else:
-          loc_batch_shape = ops.convert_to_tensor(loc_batch_shape,
-                                                  name="loc_batch_shape")
-        batch_shape = _broadcast_shape(batch_shape, loc_batch_shape)
+      loc = ops.convert_to_tensor(loc, name="loc") if loc is not None else loc
+      batch_shape, event_shape = distribution_util.shapes_from_loc_and_scale(
+          loc, scale)
 
     super(MultivariateNormalLinearOperator, self).__init__(
         distribution=normal.Normal(
@@ -232,18 +207,6 @@ class MultivariateNormalLinearOperator(
     """The `scale` `LinearOperator` in `Y = scale @ X + loc`."""
     return self.bijector.scale
 
-  def log_det_covariance(self, name="log_det_covariance"):
-    """Log of determinant of covariance matrix."""
-    with ops.name_scope(self.name):
-      with ops.name_scope(name, values=self.scale.graph_parents):
-        return 2. * self.scale.log_abs_determinant()
-
-  def det_covariance(self, name="det_covariance"):
-    """Determinant of covariance matrix."""
-    with ops.name_scope(self.name):
-      with ops.name_scope(name, values=self.scale.graph_parents):
-        return math_ops.exp(2.* self.scale.log_abs_determinant())
-
   @distribution_util.AppendDocstring(_mvn_sample_note)
   def _log_prob(self, x):
     return super(MultivariateNormalLinearOperator, self)._log_prob(x)
@@ -272,41 +235,32 @@ class MultivariateNormalLinearOperator(
     return array_ops.identity(self.loc) + array_ops.zeros(shape, self.dtype)
 
   def _covariance(self):
-    if (isinstance(self.scale, linalg.LinearOperatorIdentity) or
-        isinstance(self.scale, linalg.LinearOperatorScaledIdentity) or
-        isinstance(self.scale, linalg.LinearOperatorDiag)):
+    if distribution_util.is_diagonal_scale(self.scale):
       return array_ops.matrix_diag(math_ops.square(self.scale.diag_part()))
     else:
-      # TODO(b/35040238): Remove transpose once LinOp supports `transpose`.
-      return self.scale.apply(array_ops.matrix_transpose(self.scale.to_dense()))
+      return self.scale.matmul(self.scale.to_dense(), adjoint_arg=True)
 
   def _variance(self):
-    if (isinstance(self.scale, linalg.LinearOperatorIdentity) or
-        isinstance(self.scale, linalg.LinearOperatorScaledIdentity) or
-        isinstance(self.scale, linalg.LinearOperatorDiag)):
+    if distribution_util.is_diagonal_scale(self.scale):
       return math_ops.square(self.scale.diag_part())
     elif (isinstance(self.scale, linalg.LinearOperatorUDVHUpdate)
           and self.scale.is_self_adjoint):
       return array_ops.matrix_diag_part(
-          self.scale.apply(self.scale.to_dense()))
+          self.scale.matmul(self.scale.to_dense()))
     else:
-      # TODO(b/35040238): Remove transpose once LinOp supports `transpose`.
       return array_ops.matrix_diag_part(
-          self.scale.apply(array_ops.matrix_transpose(self.scale.to_dense())))
+          self.scale.matmul(self.scale.to_dense(), adjoint_arg=True))
 
   def _stddev(self):
-    if (isinstance(self.scale, linalg.LinearOperatorIdentity) or
-        isinstance(self.scale, linalg.LinearOperatorScaledIdentity) or
-        isinstance(self.scale, linalg.LinearOperatorDiag)):
+    if distribution_util.is_diagonal_scale(self.scale):
       return math_ops.abs(self.scale.diag_part())
     elif (isinstance(self.scale, linalg.LinearOperatorUDVHUpdate)
           and self.scale.is_self_adjoint):
       return math_ops.sqrt(array_ops.matrix_diag_part(
-          self.scale.apply(self.scale.to_dense())))
+          self.scale.matmul(self.scale.to_dense())))
     else:
-      # TODO(b/35040238): Remove transpose once LinOp supports `transpose`.
       return math_ops.sqrt(array_ops.matrix_diag_part(
-          self.scale.apply(array_ops.matrix_transpose(self.scale.to_dense()))))
+          self.scale.matmul(self.scale.to_dense(), adjoint_arg=True)))
 
   def _mode(self):
     return self._mean()
diff --git a/tensorflow/contrib/distributions/python/ops/mvn_tril.py b/tensorflow/contrib/distributions/python/ops/mvn_tril.py
index e2a31c862494208477005b2be67dce1f9e95658d..d662b25e1e1dc1dc1053c22aef9fe6b7a440cdc0 100644
--- a/tensorflow/contrib/distributions/python/ops/mvn_tril.py
+++ b/tensorflow/contrib/distributions/python/ops/mvn_tril.py
@@ -19,13 +19,9 @@ from __future__ import division
 from __future__ import print_function
 
 from tensorflow.contrib import linalg
-from tensorflow.contrib.distributions.python.ops import distribution_util
 from tensorflow.contrib.distributions.python.ops import mvn_linear_operator as mvn_linop
 from tensorflow.python.framework import ops
-from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import check_ops
-from tensorflow.python.ops import control_flow_ops
-from tensorflow.python.ops import math_ops
+from tensorflow.python.ops.distributions import util as distribution_util
 
 
 __all__ = [
@@ -55,7 +51,7 @@ class MultivariateNormalTriL(
   where:
 
   * `loc` is a vector in `R^k`,
-  * `scale` is a linear operator in `R^{k x k}`, `cov = scale @ scale.T`,
+  * `scale` is a matrix in `R^{k x k}`, `covariance = scale @ scale.T`,
   * `Z` denotes the normalization constant, and,
   * `||y||**2` denotes the squared Euclidean norm of `y`.
 
@@ -140,8 +136,8 @@ class MultivariateNormalTriL(
     The `batch_shape` is the broadcast shape between `loc` and `scale`
     arguments.
 
-    The `event_shape` is given by the last dimension of `loc` or the last
-    dimension of the matrix implied by `scale`.
+    The `event_shape` is given by last dimension of the matrix implied by
+    `scale`. The last dimension of `loc` (if provided) must broadcast with this.
 
     Recall that `covariance = scale @ scale.T`. A (non-batch) `scale` matrix is:
 
@@ -191,14 +187,9 @@ class MultivariateNormalTriL(
               is_positive_definite=True,
               assert_proper_shapes=validate_args)
         else:
-          if validate_args:
-            scale_tril = control_flow_ops.with_dependencies([
-                # TODO(b/35157376): Use `assert_none_equal` once it exists.
-                check_ops.assert_greater(
-                    math_ops.abs(array_ops.matrix_diag_part(scale_tril)),
-                    array_ops.zeros([], scale_tril.dtype),
-                    message="`scale_tril` must have non-zero diagonal"),
-            ], scale_tril)
+          # No need to validate that scale_tril is non-singular.
+          # LinearOperatorTriL has an assert_non_singular method that is called
+          # by the Bijector.
           scale = linalg.LinearOperatorTriL(
               scale_tril,
               is_non_singular=True,
diff --git a/tensorflow/contrib/distributions/python/ops/negative_binomial.py b/tensorflow/contrib/distributions/python/ops/negative_binomial.py
index 294b733c3c0187e51be3704a50d63cae12cc1d6a..8895fd8b465bf1f1e6f6b818cfbfc1aaa86a522e 100644
--- a/tensorflow/contrib/distributions/python/ops/negative_binomial.py
+++ b/tensorflow/contrib/distributions/python/ops/negative_binomial.py
@@ -18,8 +18,6 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.contrib.distributions.python.ops import distribution
-from tensorflow.contrib.distributions.python.ops import distribution_util
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_shape
@@ -27,6 +25,8 @@ from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import check_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import random_ops
+from tensorflow.python.ops.distributions import distribution
+from tensorflow.python.ops.distributions import util as distribution_util
 
 
 class NegativeBinomial(distribution.Distribution):
diff --git a/tensorflow/contrib/distributions/python/ops/normal_conjugate_posteriors.py b/tensorflow/contrib/distributions/python/ops/normal_conjugate_posteriors.py
index 10e934326a1f0983325f8c99eaa7b03a0e3589be..4025285780b63560181b912635325ce7ebdc3ec2 100644
--- a/tensorflow/contrib/distributions/python/ops/normal_conjugate_posteriors.py
+++ b/tensorflow/contrib/distributions/python/ops/normal_conjugate_posteriors.py
@@ -18,9 +18,8 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.contrib.distributions.python.ops.normal import Normal  # pylint: disable=line-too-long
-
 from tensorflow.python.ops import math_ops
+from tensorflow.python.ops.distributions import normal
 
 
 def normal_conjugates_known_scale_posterior(prior, scale, s, n):
@@ -65,7 +64,7 @@ def normal_conjugates_known_scale_posterior(prior, scale, s, n):
     TypeError: if dtype of `s` does not match `dtype`, or `prior` is not a
       Normal object.
   """
-  if not isinstance(prior, Normal):
+  if not isinstance(prior, normal.Normal):
     raise TypeError("Expected prior to be an instance of type Normal")
 
   if s.dtype != prior.dtype:
@@ -77,7 +76,7 @@ def normal_conjugates_known_scale_posterior(prior, scale, s, n):
   scale0_2 = math_ops.square(prior.scale)
   scale_2 = math_ops.square(scale)
   scalep_2 = 1.0/(1/scale0_2 + n/scale_2)
-  return Normal(
+  return normal.Normal(
       loc=(prior.loc/scale0_2 + s/scale_2) * scalep_2,
       scale=math_ops.sqrt(scalep_2))
 
@@ -131,7 +130,7 @@ def normal_conjugates_known_scale_predictive(prior, scale, s, n):
     TypeError: if dtype of `s` does not match `dtype`, or `prior` is not a
       Normal object.
   """
-  if not isinstance(prior, Normal):
+  if not isinstance(prior, normal.Normal):
     raise TypeError("Expected prior to be an instance of type Normal")
 
   if s.dtype != prior.dtype:
@@ -143,6 +142,6 @@ def normal_conjugates_known_scale_predictive(prior, scale, s, n):
   scale0_2 = math_ops.square(prior.scale)
   scale_2 = math_ops.square(scale)
   scalep_2 = 1.0/(1/scale0_2 + n/scale_2)
-  return Normal(
+  return normal.Normal(
       loc=(prior.loc/scale0_2 + s/scale_2) * scalep_2,
       scale=math_ops.sqrt(scalep_2 + scale_2))
diff --git a/tensorflow/contrib/distributions/python/ops/onehot_categorical.py b/tensorflow/contrib/distributions/python/ops/onehot_categorical.py
index 1679a797e131ca7f24b010bceb600569560f456d..b76cebf79fad09ebec68f2459c6fe80794ea81c0 100644
--- a/tensorflow/contrib/distributions/python/ops/onehot_categorical.py
+++ b/tensorflow/contrib/distributions/python/ops/onehot_categorical.py
@@ -18,9 +18,6 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.contrib.distributions.python.ops import distribution
-from tensorflow.contrib.distributions.python.ops import distribution_util
-from tensorflow.contrib.distributions.python.ops import kullback_leibler
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
@@ -29,6 +26,9 @@ from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import nn_ops
 from tensorflow.python.ops import random_ops
+from tensorflow.python.ops.distributions import distribution
+from tensorflow.python.ops.distributions import kullback_leibler
+from tensorflow.python.ops.distributions import util as distribution_util
 
 
 class OneHotCategorical(distribution.Distribution):
diff --git a/tensorflow/contrib/distributions/python/ops/operator_pd_full.py b/tensorflow/contrib/distributions/python/ops/operator_pd_full.py
index 548374bcde67fec955247e1bb3ab63ae99d19014..3ca341bb830b0baafa75765abe7f695021bfed1e 100644
--- a/tensorflow/contrib/distributions/python/ops/operator_pd_full.py
+++ b/tensorflow/contrib/distributions/python/ops/operator_pd_full.py
@@ -18,10 +18,10 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.contrib.distributions.python.ops import distribution_util
 from tensorflow.contrib.distributions.python.ops import operator_pd_cholesky
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import linalg_ops
+from tensorflow.python.ops.distributions import util as distribution_util
 
 
 __all__ = [
diff --git a/tensorflow/contrib/distributions/python/ops/operator_pd_identity.py b/tensorflow/contrib/distributions/python/ops/operator_pd_identity.py
index e6947bf60935528ed8fc6f2ba76e0be0980c76f3..4cee2997909dbd105fd045be9ea1238a343a2c27 100644
--- a/tensorflow/contrib/distributions/python/ops/operator_pd_identity.py
+++ b/tensorflow/contrib/distributions/python/ops/operator_pd_identity.py
@@ -115,7 +115,7 @@ class OperatorPDIdentity(operator_pd.OperatorPDBase):
     """Static check that the argument `x` is proper `shape`, `dtype`."""
     # x is a typical argument e.g. to matmul or solve.  In both cases, x should
     # have the same type/shape since this is a square matrix.  These checks are
-    # ususally not needed since we ususally have some tensor backing this
+    # usually not needed since we usually have some tensor backing this
     # distribution, and the calls to tf.matmul do a shape/type check.
     #
     # Static checks only for efficiency, the identity should be fast.
diff --git a/tensorflow/contrib/distributions/python/ops/poisson.py b/tensorflow/contrib/distributions/python/ops/poisson.py
index 3c4f1cc1d895c47bf629f985467053d57c144edb..d9929183c1a85f2ed16f289c795c4c7bf46caec0 100644
--- a/tensorflow/contrib/distributions/python/ops/poisson.py
+++ b/tensorflow/contrib/distributions/python/ops/poisson.py
@@ -18,8 +18,6 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.contrib.distributions.python.ops import distribution
-from tensorflow.contrib.distributions.python.ops import distribution_util
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
@@ -28,6 +26,8 @@ from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import check_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import random_ops
+from tensorflow.python.ops.distributions import distribution
+from tensorflow.python.ops.distributions import util as distribution_util
 
 __all__ = [
     "Poisson",
diff --git a/tensorflow/contrib/distributions/python/ops/quantized_distribution.py b/tensorflow/contrib/distributions/python/ops/quantized_distribution.py
index 6241dbfcf021bf50668a1ac17d31c8bf34777f49..8aebb79b9138cce1373e6472d17cf9072d2bc285 100644
--- a/tensorflow/contrib/distributions/python/ops/quantized_distribution.py
+++ b/tensorflow/contrib/distributions/python/ops/quantized_distribution.py
@@ -20,13 +20,13 @@ from __future__ import print_function
 
 import numpy as np
 
-from tensorflow.contrib.distributions.python.ops import distribution as distributions
-from tensorflow.contrib.distributions.python.ops import distribution_util
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import check_ops
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import math_ops
+from tensorflow.python.ops.distributions import distribution as distributions
+from tensorflow.python.ops.distributions import util as distribution_util
 
 __all__ = ["QuantizedDistribution"]
 
@@ -232,7 +232,7 @@ class QuantizedDistribution(distributions.Distribution):
       graph_parents = self._dist._graph_parents  # pylint: disable=protected-access
 
       checks = []
-      if low is not None and high is not None:
+      if validate_args and low is not None and high is not None:
         message = "low must be strictly less than high."
         checks.append(
             check_ops.assert_less(
diff --git a/tensorflow/contrib/distributions/python/ops/relaxed_bernoulli.py b/tensorflow/contrib/distributions/python/ops/relaxed_bernoulli.py
index 2c7db7214b52dc8cca38aec9db1abefada0c9869..5b57a95c55eca7f3d6301c1e87a6cf52f040ab26 100644
--- a/tensorflow/contrib/distributions/python/ops/relaxed_bernoulli.py
+++ b/tensorflow/contrib/distributions/python/ops/relaxed_bernoulli.py
@@ -18,9 +18,7 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.contrib.distributions.python.ops import distribution_util
 from tensorflow.contrib.distributions.python.ops import logistic
-from tensorflow.contrib.distributions.python.ops import transformed_distribution
 # Bijectors must be directly imported because `remove_undocumented` prevents
 # individual file imports.
 from tensorflow.contrib.distributions.python.ops.bijectors.sigmoid import Sigmoid
@@ -28,6 +26,8 @@ from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import check_ops
+from tensorflow.python.ops.distributions import transformed_distribution
+from tensorflow.python.ops.distributions import util as distribution_util
 
 
 class RelaxedBernoulli(transformed_distribution.TransformedDistribution):
diff --git a/tensorflow/contrib/distributions/python/ops/relaxed_onehot_categorical.py b/tensorflow/contrib/distributions/python/ops/relaxed_onehot_categorical.py
index e81c5935a11d1a53f058b3de7563bc8b4c0137b4..da1cd72a6f13f7c585a60d0be122c212671fe5e8 100644
--- a/tensorflow/contrib/distributions/python/ops/relaxed_onehot_categorical.py
+++ b/tensorflow/contrib/distributions/python/ops/relaxed_onehot_categorical.py
@@ -20,9 +20,6 @@ from __future__ import print_function
 
 import numpy as np
 from tensorflow.contrib.distributions.python.ops import bijectors
-from tensorflow.contrib.distributions.python.ops import distribution
-from tensorflow.contrib.distributions.python.ops import distribution_util
-from tensorflow.contrib.distributions.python.ops import transformed_distribution
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
@@ -31,6 +28,9 @@ from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import nn_ops
 from tensorflow.python.ops import random_ops
+from tensorflow.python.ops.distributions import distribution
+from tensorflow.python.ops.distributions import transformed_distribution
+from tensorflow.python.ops.distributions import util as distribution_util
 
 
 class ExpRelaxedOneHotCategorical(distribution.Distribution):
diff --git a/tensorflow/contrib/distributions/python/ops/shape.py b/tensorflow/contrib/distributions/python/ops/shape.py
index b1271d61c0ad45f48733fd61c541417e58f0c618..516d7b60fecbffec197a40ae361204a9b620988a 100644
--- a/tensorflow/contrib/distributions/python/ops/shape.py
+++ b/tensorflow/contrib/distributions/python/ops/shape.py
@@ -19,7 +19,6 @@ from __future__ import print_function
 
 import contextlib
 
-from tensorflow.contrib.distributions.python.ops import distribution_util
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_util
@@ -27,6 +26,7 @@ from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import check_ops
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import math_ops
+from tensorflow.python.ops.distributions import util as distribution_util
 
 
 class _DistributionShape(object):
diff --git a/tensorflow/contrib/distributions/python/ops/vector_laplace_diag.py b/tensorflow/contrib/distributions/python/ops/vector_laplace_diag.py
new file mode 100644
index 0000000000000000000000000000000000000000..0e3867809a820f49cfa7f5282c47f786626481a6
--- /dev/null
+++ b/tensorflow/contrib/distributions/python/ops/vector_laplace_diag.py
@@ -0,0 +1,232 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Distribution of a vectorized Laplace, with uncorrelated components."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.contrib.distributions.python.ops import distribution_util
+from tensorflow.contrib.distributions.python.ops import vector_laplace_linear_operator as vector_laplace_linop
+from tensorflow.python.framework import ops
+
+
+__all__ = [
+    "VectorLaplaceDiag",
+]
+
+
+class VectorLaplaceDiag(
+    vector_laplace_linop.VectorLaplaceLinearOperator):
+  """The vectorization of the Laplace distribution on `R^k`.
+
+  The vector laplace distribution is defined over `R^k`, and parameterized by
+  a (batch of) length-`k` `loc` vector (the means) and a (batch of) `k x k`
+  `scale` matrix:  `covariance = 2 * scale @ scale.T`, where `@` denotes
+  matrix-multiplication.
+
+  #### Mathematical Details
+
+  The probability density function (pdf) is,
+
+  ```none
+  pdf(x; loc, scale) = exp(-||y||_1) / Z,
+  y = inv(scale) @ (x - loc),
+  Z = 2**k |det(scale)|,
+  ```
+
+  where:
+
+  * `loc` is a vector in `R^k`,
+  * `scale` is a linear operator in `R^{k x k}`, `cov = scale @ scale.T`,
+  * `Z` denotes the normalization constant, and,
+  * `||y||_1` denotes the `l1` norm of `y`, `sum_i |y_i|.
+
+  A (non-batch) `scale` matrix is:
+
+  ```none
+  scale = diag(scale_diag + scale_identity_multiplier * ones(k))
+  ```
+
+  where:
+
+  * `scale_diag.shape = [k]`, and,
+  * `scale_identity_multiplier.shape = []`.
+
+  Additional leading dimensions (if any) will index batches.
+
+  If both `scale_diag` and `scale_identity_multiplier` are `None`, then
+  `scale` is the Identity matrix.
+
+  The VectorLaplace distribution is a member of the [location-scale
+  family](https://en.wikipedia.org/wiki/Location-scale_family), i.e., it can be
+  constructed as,
+
+  ```none
+  X = (X_1, ..., X_k), each X_i ~ Laplace(loc=0, scale=1)
+  Y = (Y_1, ...,Y_k) = scale @ X + loc
+  ```
+
+  #### About `VectorLaplace` and `Vector` distributions in TensorFlow.
+
+  The `VectorLaplace` is a non-standard distribution that has useful properties.
+
+  The marginals `Y_1, ..., Y_k` are *not* Laplace random variables, due to
+  the fact that the sum of Laplace random variables is not Laplace.
+
+  Instead, `Y` is a vector whose components are linear combinations of Laplace
+  random variables.  Thus, `Y` lives in the vector space generated by `vectors`
+  of Laplace distributions.  This allows the user to decide the mean and
+  covariance (by setting `loc` and `scale`), while preserving some properties of
+  the Laplace distribution.  In particular, the tails of `Y_i` will be (up to
+  polynomial factors) exponentially decaying.
+
+  To see this last statement, note that the pdf of `Y_i` is the convolution of
+  the pdf of `k` independent Laplace random variables.  One can then show by
+  induction that distributions with exponential (up to polynomial factors) tails
+  are closed under convolution.
+
+  #### Examples
+
+  ```python
+  ds = tf.contrib.distributions
+
+  # Initialize a single 2-variate VectorLaplace.
+  vla = ds.VectorLaplaceDiag(
+      loc=[1., -1],
+      scale_diag=[1, 2.])
+
+  vla.mean().eval()
+  # ==> [1., -1]
+
+  vla.stddev().eval()
+  # ==> [1., 2] * sqrt(2)
+
+  # Evaluate this on an observation in `R^2`, returning a scalar.
+  vla.prob([-1., 0]).eval()  # shape: []
+
+  # Initialize a 3-batch, 2-variate scaled-identity VectorLaplace.
+  vla = ds.VectorLaplaceDiag(
+      loc=[1., -1],
+      scale_identity_multiplier=[1, 2., 3])
+
+  vla.mean().eval()  # shape: [3, 2]
+  # ==> [[1., -1]
+  #      [1, -1],
+  #      [1, -1]]
+
+  vla.stddev().eval()  # shape: [3, 2]
+  # ==> sqrt(2) * [[1., 1],
+  #                [2, 2],
+  #                [3, 3]]
+
+  # Evaluate this on an observation in `R^2`, returning a length-3 vector.
+  vla.prob([-1., 0]).eval()  # shape: [3]
+
+  # Initialize a 2-batch of 3-variate VectorLaplace's.
+  vla = ds.VectorLaplaceDiag(
+      loc=[[1., 2, 3],
+           [11, 22, 33]]           # shape: [2, 3]
+      scale_diag=[[1., 2, 3],
+                  [0.5, 1, 1.5]])  # shape: [2, 3]
+
+  # Evaluate this on a two observations, each in `R^3`, returning a length-2
+  # vector.
+  x = [[-1., 0, 1],
+       [-11, 0, 11.]]   # shape: [2, 3].
+  vla.prob(x).eval()    # shape: [2]
+  ```
+
+  """
+
+  def __init__(self,
+               loc=None,
+               scale_diag=None,
+               scale_identity_multiplier=None,
+               validate_args=False,
+               allow_nan_stats=True,
+               name="VectorLaplaceDiag"):
+    """Construct Vector Laplace distribution on `R^k`.
+
+    The `batch_shape` is the broadcast shape between `loc` and `scale`
+    arguments.
+
+    The `event_shape` is given by last dimension of the matrix implied by
+    `scale`. The last dimension of `loc` (if provided) must broadcast with this.
+
+    Recall that `covariance = 2 * scale @ scale.T`.
+
+    ```none
+    scale = diag(scale_diag + scale_identity_multiplier * ones(k))
+    ```
+
+    where:
+
+    * `scale_diag.shape = [k]`, and,
+    * `scale_identity_multiplier.shape = []`.
+
+    Additional leading dimensions (if any) will index batches.
+
+    If both `scale_diag` and `scale_identity_multiplier` are `None`, then
+    `scale` is the Identity matrix.
+
+    Args:
+      loc: Floating-point `Tensor`. If this is set to `None`, `loc` is
+        implicitly `0`. When specified, may have shape `[B1, ..., Bb, k]` where
+        `b >= 0` and `k` is the event size.
+      scale_diag: Non-zero, floating-point `Tensor` representing a diagonal
+        matrix added to `scale`. May have shape `[B1, ..., Bb, k]`, `b >= 0`,
+        and characterizes `b`-batches of `k x k` diagonal matrices added to
+        `scale`. When both `scale_identity_multiplier` and `scale_diag` are
+        `None` then `scale` is the `Identity`.
+      scale_identity_multiplier: Non-zero, floating-point `Tensor` representing
+        a scaled-identity-matrix added to `scale`. May have shape
+        `[B1, ..., Bb]`, `b >= 0`, and characterizes `b`-batches of scaled
+        `k x k` identity matrices added to `scale`. When both
+        `scale_identity_multiplier` and `scale_diag` are `None` then `scale` is
+        the `Identity`.
+      validate_args: Python `bool`, default `False`. When `True` distribution
+        parameters are checked for validity despite possibly degrading runtime
+        performance. When `False` invalid inputs may silently render incorrect
+        outputs.
+      allow_nan_stats: Python `bool`, default `True`. When `True`,
+        statistics (e.g., mean, mode, variance) use the value "`NaN`" to
+        indicate the result is undefined. When `False`, an exception is raised
+        if one or more of the statistic's batch members are undefined.
+      name: Python `str` name prefixed to Ops created by this class.
+
+    Raises:
+      ValueError: if at most `scale_identity_multiplier` is specified.
+    """
+    parameters = locals()
+    with ops.name_scope(name):
+      with ops.name_scope("init", values=[
+          loc, scale_diag, scale_identity_multiplier]):
+        # No need to validate_args while making diag_scale.  The returned
+        # LinearOperatorDiag has an assert_non_singular method that is called by
+        # the Bijector.
+        scale = distribution_util.make_diag_scale(
+            loc=loc,
+            scale_diag=scale_diag,
+            scale_identity_multiplier=scale_identity_multiplier,
+            validate_args=False,
+            assert_positive=False)
+    super(VectorLaplaceDiag, self).__init__(
+        loc=loc,
+        scale=scale,
+        validate_args=validate_args,
+        allow_nan_stats=allow_nan_stats,
+        name=name)
+    self._parameters = parameters
diff --git a/tensorflow/contrib/distributions/python/ops/vector_laplace_linear_operator.py b/tensorflow/contrib/distributions/python/ops/vector_laplace_linear_operator.py
new file mode 100644
index 0000000000000000000000000000000000000000..fd2c46d94de9c031768be1410990b180b30497d2
--- /dev/null
+++ b/tensorflow/contrib/distributions/python/ops/vector_laplace_linear_operator.py
@@ -0,0 +1,294 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Vectorized Laplace distribution class, directly using LinearOpeartor."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.contrib import linalg
+from tensorflow.contrib.distributions.python.ops import bijectors
+from tensorflow.contrib.distributions.python.ops import distribution_util
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops.distributions import laplace
+from tensorflow.python.ops.distributions import transformed_distribution
+
+
+__all__ = [
+    "VectorLaplaceLinearOperator"
+]
+
+_mvn_sample_note = """
+`value` is a batch vector with compatible shape if `value` is a `Tensor` whose
+shape can be broadcast up to either:
+
+```python
+self.batch_shape + self.event_shape
+```
+
+or
+
+```python
+[M1, ..., Mm] + self.batch_shape + self.event_shape
+```
+
+"""
+
+
+class VectorLaplaceLinearOperator(
+    transformed_distribution.TransformedDistribution):
+  """The vectorization of the Laplace distribution on `R^k`.
+
+  The vector laplace distribution is defined over `R^k`, and parameterized by
+  a (batch of) length-`k` `loc` vector (the means) and a (batch of) `k x k`
+  `scale` matrix:  `covariance = 2 * scale @ scale.T`, where `@` denotes
+  matrix-multiplication.
+
+  #### Mathematical Details
+
+  The probability density function (pdf) is,
+
+  ```none
+  pdf(x; loc, scale) = exp(-||y||_1) / Z,
+  y = inv(scale) @ (x - loc),
+  Z = 2**k |det(scale)|,
+  ```
+
+  where:
+
+  * `loc` is a vector in `R^k`,
+  * `scale` is a linear operator in `R^{k x k}`, `cov = scale @ scale.T`,
+  * `Z` denotes the normalization constant, and,
+  * `||y||_1` denotes the `l1` norm of `y`, `sum_i |y_i|.
+
+  The VectorLaplace distribution is a member of the [location-scale
+  family](https://en.wikipedia.org/wiki/Location-scale_family), i.e., it can be
+  constructed as,
+
+  ```none
+  X = (X_1, ..., X_k), each X_i ~ Laplace(loc=0, scale=1)
+  Y = (Y_1, ...,Y_k) = scale @ X + loc
+  ```
+
+  #### About `VectorLaplace` and `Vector` distributions in TensorFlow.
+
+  The `VectorLaplace` is a non-standard distribution that has useful properties.
+
+  The marginals `Y_1, ..., Y_k` are *not* Laplace random variables, due to
+  the fact that the sum of Laplace random variables is not Laplace.
+
+  Instead, `Y` is a vector whose components are linear combinations of Laplace
+  random variables.  Thus, `Y` lives in the vector space generated by `vectors`
+  of Laplace distributions.  This allows the user to decide the mean and
+  covariance (by setting `loc` and `scale`), while preserving some properties of
+  the Laplace distribution.  In particular, the tails of `Y_i` will be (up to
+  polynomial factors) exponentially decaying.
+
+  To see this last statement, note that the pdf of `Y_i` is the convolution of
+  the pdf of `k` independent Laplace random variables.  One can then show by
+  induction that distributions with exponential (up to polynomial factors) tails
+  are closed under convolution.
+
+
+  #### Examples
+
+  ```python
+  ds = tf.contrib.distributions
+  la = tf.contrib.linalg
+
+  # Initialize a single 3-variate VectorLaplace with some desired covariance.
+  mu = [1., 2, 3]
+  cov = [[ 0.36,  0.12,  0.06],
+         [ 0.12,  0.29, -0.13],
+         [ 0.06, -0.13,  0.26]]
+
+  scale = tf.cholesky(cov)
+  # ==> [[ 0.6,  0. ,  0. ],
+  #      [ 0.2,  0.5,  0. ],
+  #      [ 0.1, -0.3,  0.4]])
+
+  # Divide scale by sqrt(2) so that the final covariance will be what we want.
+  vla = ds.VectorLaplaceLinearOperator(
+      loc=mu,
+      scale=la.LinearOperatorTriL(scale / tf.sqrt(2)))
+
+  # Covariance agrees with cholesky(cov) parameterization.
+  vla.covariance().eval()
+  # ==> [[ 0.36,  0.12,  0.06],
+  #      [ 0.12,  0.29, -0.13],
+  #      [ 0.06, -0.13,  0.26]]
+
+  # Compute the pdf of an`R^3` observation; return a scalar.
+  vla.prob([-1., 0, 1]).eval()  # shape: []
+
+  # Initialize a 2-batch of 3-variate Vector Laplace's.
+  mu = [[1., 2, 3],
+        [11, 22, 33]]              # shape: [2, 3]
+  scale_diag = [[1., 2, 3],
+                [0.5, 1, 1.5]]     # shape: [2, 3]
+
+  vla = ds.VectorLaplaceLinearOperator(
+      loc=mu,
+      scale=la.LinearOperatorDiag(scale_diag))
+
+  # Compute the pdf of two `R^3` observations; return a length-2 vector.
+  x = [[-0.9, 0, 0.1],
+       [-10, 0, 9]]     # shape: [2, 3]
+  vla.prob(x).eval()    # shape: [2]
+  ```
+
+  """
+
+  def __init__(self,
+               loc=None,
+               scale=None,
+               validate_args=False,
+               allow_nan_stats=True,
+               name="VectorLaplaceLinearOperator"):
+    """Construct Vector Laplace distribution on `R^k`.
+
+    The `batch_shape` is the broadcast shape between `loc` and `scale`
+    arguments.
+
+    The `event_shape` is given by last dimension of the matrix implied by
+    `scale`. The last dimension of `loc` (if provided) must broadcast with this.
+
+    Recall that `covariance = 2 * scale @ scale.T`.
+
+    Additional leading dimensions (if any) will index batches.
+
+    Args:
+      loc: Floating-point `Tensor`. If this is set to `None`, `loc` is
+        implicitly `0`. When specified, may have shape `[B1, ..., Bb, k]` where
+        `b >= 0` and `k` is the event size.
+      scale: Instance of `LinearOperator` with same `dtype` as `loc` and shape
+        `[B1, ..., Bb, k, k]`.
+      validate_args: Python `bool`, default `False`. Whether to validate input
+        with asserts. If `validate_args` is `False`, and the inputs are
+        invalid, correct behavior is not guaranteed.
+      allow_nan_stats: Python `bool`, default `True`. If `False`, raise an
+        exception if a statistic (e.g. mean/mode/etc...) is undefined for any
+        batch member If `True`, batch members with valid parameters leading to
+        undefined statistics will return NaN for this statistic.
+      name: The name to give Ops created by the initializer.
+
+    Raises:
+      ValueError: if `scale` is unspecified.
+      TypeError: if not `scale.dtype.is_floating`
+    """
+    parameters = locals()
+    if scale is None:
+      raise ValueError("Missing required `scale` parameter.")
+    if not scale.dtype.is_floating:
+      raise TypeError("`scale` parameter must have floating-point dtype.")
+
+    with ops.name_scope(name, values=[loc] + scale.graph_parents):
+      # Since expand_dims doesn't preserve constant-ness, we obtain the
+      # non-dynamic value if possible.
+      loc = ops.convert_to_tensor(loc, name="loc") if loc is not None else loc
+      batch_shape, event_shape = distribution_util.shapes_from_loc_and_scale(
+          loc, scale)
+
+      super(VectorLaplaceLinearOperator, self).__init__(
+          distribution=laplace.Laplace(
+              loc=array_ops.zeros([], dtype=scale.dtype),
+              scale=array_ops.ones([], dtype=scale.dtype)),
+          bijector=bijectors.AffineLinearOperator(
+              shift=loc, scale=scale, validate_args=validate_args),
+          batch_shape=batch_shape,
+          event_shape=event_shape,
+          validate_args=validate_args,
+          name=name)
+      self._parameters = parameters
+
+  @property
+  def loc(self):
+    """The `loc` `Tensor` in `Y = scale @ X + loc`."""
+    return self.bijector.shift
+
+  @property
+  def scale(self):
+    """The `scale` `LinearOperator` in `Y = scale @ X + loc`."""
+    return self.bijector.scale
+
+  @distribution_util.AppendDocstring(_mvn_sample_note)
+  def _log_prob(self, x):
+    return super(VectorLaplaceLinearOperator, self)._log_prob(x)
+
+  @distribution_util.AppendDocstring(_mvn_sample_note)
+  def _prob(self, x):
+    return super(VectorLaplaceLinearOperator, self)._prob(x)
+
+  def _mean(self):
+    shape = self.batch_shape.concatenate(self.event_shape)
+    has_static_shape = shape.is_fully_defined()
+    if not has_static_shape:
+      shape = array_ops.concat([
+          self.batch_shape_tensor(),
+          self.event_shape_tensor(),
+      ], 0)
+
+    if self.loc is None:
+      return array_ops.zeros(shape, self.dtype)
+
+    if has_static_shape and shape == self.loc.get_shape():
+      return array_ops.identity(self.loc)
+
+    # Add dummy tensor of zeros to broadcast.  This is only necessary if shape
+    # != self.loc.shape, but we could not determine if this is the case.
+    return array_ops.identity(self.loc) + array_ops.zeros(shape, self.dtype)
+
+  def _covariance(self):
+    # Let
+    #   W = (w1,...,wk), with wj ~ iid Laplace(0, 1).
+    # Then this distribution is
+    #   X = loc + LW,
+    # and since E[X] = loc,
+    #   Cov(X) = E[LW W^T L^T] = L E[W W^T] L^T.
+    # Since E[wi wj] = 0 if i != j, and 2 if i == j, we have
+    #   Cov(X) = 2 LL^T
+    if distribution_util.is_diagonal_scale(self.scale):
+      return 2. * array_ops.matrix_diag(math_ops.square(self.scale.diag_part()))
+    else:
+      return 2. * self.scale.matmul(self.scale.to_dense(), adjoint_arg=True)
+
+  def _variance(self):
+    if distribution_util.is_diagonal_scale(self.scale):
+      return 2. * math_ops.square(self.scale.diag_part())
+    elif (isinstance(self.scale, linalg.LinearOperatorUDVHUpdate)
+          and self.scale.is_self_adjoint):
+      return array_ops.matrix_diag_part(
+          2. * self.scale.matmul(self.scale.to_dense()))
+    else:
+      return 2. * array_ops.matrix_diag_part(
+          self.scale.matmul(self.scale.to_dense(), adjoint_arg=True))
+
+  def _stddev(self):
+    if distribution_util.is_diagonal_scale(self.scale):
+      return np.sqrt(2) * math_ops.abs(self.scale.diag_part())
+    elif (isinstance(self.scale, linalg.LinearOperatorUDVHUpdate)
+          and self.scale.is_self_adjoint):
+      return np.sqrt(2) * math_ops.sqrt(array_ops.matrix_diag_part(
+          self.scale.matmul(self.scale.to_dense())))
+    else:
+      return np.sqrt(2) * math_ops.sqrt(array_ops.matrix_diag_part(
+          self.scale.matmul(self.scale.to_dense(), adjoint_arg=True)))
+
+  def _mode(self):
+    return self._mean()
diff --git a/tensorflow/contrib/distributions/python/ops/vector_student_t.py b/tensorflow/contrib/distributions/python/ops/vector_student_t.py
index 8d680ab27a37a205a76288eb5963f80bb6e640e5..ae804b61727b820b2af3c32f05818324bfbccf93 100644
--- a/tensorflow/contrib/distributions/python/ops/vector_student_t.py
+++ b/tensorflow/contrib/distributions/python/ops/vector_student_t.py
@@ -19,14 +19,14 @@ from __future__ import division
 from __future__ import print_function
 
 from tensorflow.contrib.distributions.python.ops import bijectors
-from tensorflow.contrib.distributions.python.ops import distribution_util
-from tensorflow.contrib.distributions.python.ops import student_t
-from tensorflow.contrib.distributions.python.ops import transformed_distribution
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_shape
 from tensorflow.python.ops import array_ops
+from tensorflow.python.ops.distributions import student_t
+from tensorflow.python.ops.distributions import transformed_distribution
+from tensorflow.python.ops.distributions import util as distribution_util
 
 
 # TODO(jvdillon): Add unittests for this once we know where will put this code
diff --git a/tensorflow/contrib/distributions/python/ops/wishart.py b/tensorflow/contrib/distributions/python/ops/wishart.py
index 6ed2c4dfb09353ff7b023dc87b00b28df293ed43..e162a796100ae877c92932c0a805787526eb7ce0 100644
--- a/tensorflow/contrib/distributions/python/ops/wishart.py
+++ b/tensorflow/contrib/distributions/python/ops/wishart.py
@@ -21,8 +21,6 @@ from __future__ import print_function
 import math
 import numpy as np
 
-from tensorflow.contrib.distributions.python.ops import distribution
-from tensorflow.contrib.distributions.python.ops import distribution_util
 from tensorflow.contrib.distributions.python.ops import operator_pd_cholesky
 from tensorflow.contrib.distributions.python.ops import operator_pd_full
 from tensorflow.python.framework import constant_op
@@ -35,6 +33,8 @@ from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import linalg_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import random_ops
+from tensorflow.python.ops.distributions import distribution
+from tensorflow.python.ops.distributions import util as distribution_util
 
 
 __all__ = [
diff --git a/tensorflow/contrib/factorization/BUILD b/tensorflow/contrib/factorization/BUILD
index 8dc78da6ba3b2d0e5512c5856b7b4266015f85e9..60e7c8f160a8e07e7c8d59ce731395586c7ab474 100644
--- a/tensorflow/contrib/factorization/BUILD
+++ b/tensorflow/contrib/factorization/BUILD
@@ -138,6 +138,7 @@ tf_py_test(
         "//tensorflow/python:platform_test",
     ],
     tags = [
+        "no_pip",  # b/38283730
         "notsan",  # Flaky: b/30756419
     ],
 )
diff --git a/tensorflow/contrib/factorization/python/ops/factorization_ops_test_utils.py b/tensorflow/contrib/factorization/python/ops/factorization_ops_test_utils.py
index 69572119732dba65aa4d2bdab696595afa046d6e..ead9474805c9ee2ac52cb757660a87d3cfcbb76e 100644
--- a/tensorflow/contrib/factorization/python/ops/factorization_ops_test_utils.py
+++ b/tensorflow/contrib/factorization/python/ops/factorization_ops_test_utils.py
@@ -37,6 +37,26 @@ INPUT_MATRIX = np.array(
      [0.0, 4.1, 0.0, 0.0, 4.4, 0.0, 4.6]]).astype(np.float32)
 
 
+def remove_empty_rows_columns(np_matrix):
+  """Simple util to remove empty rows and columns of a matrix.
+
+  Args:
+    np_matrix: A numpy array.
+  Returns:
+    A tuple consisting of:
+    mat: A numpy matrix obtained by removing empty rows and columns from
+      np_matrix.
+    nz_row_ids: A numpy array of the ids of non-empty rows, such that
+      nz_row_ids[i] is the old row index corresponding to new index i.
+    nz_col_ids: A numpy array of the ids of non-empty columns, such that
+      nz_col_ids[j] is the old column index corresponding to new index j.
+  """
+  nz_row_ids = np.where(np.sum(np_matrix, axis=1) != 0)[0]
+  nz_col_ids = np.where(np.sum(np_matrix, axis=0) != 0)[0]
+  mat = np_matrix[np.ix_(nz_row_ids, nz_col_ids)]
+  return mat, nz_row_ids, nz_col_ids
+
+
 def np_matrix_to_tf_sparse(np_matrix,
                            row_slices=None,
                            col_slices=None,
diff --git a/tensorflow/contrib/factorization/python/ops/gmm_ops.py b/tensorflow/contrib/factorization/python/ops/gmm_ops.py
index fc5270078cd9a6a77761bb65c393926931fc0776..b092eab316664705a455b88a524a77917f141b37 100644
--- a/tensorflow/contrib/factorization/python/ops/gmm_ops.py
+++ b/tensorflow/contrib/factorization/python/ops/gmm_ops.py
@@ -85,7 +85,7 @@ def _init_clusters_random(data, num_clusters, random_seed):
         maxval=math_ops.cast(num_data, dtypes.int64),
         seed=random_seed,
         dtype=dtypes.int64)
-  indices = indices % math_ops.cast(num_data, dtypes.int64)
+  indices %= math_ops.cast(num_data, dtypes.int64)
   clusters_init = embedding_lookup(data, indices, partition_strategy='div')
   return clusters_init
 
diff --git a/tensorflow/contrib/factorization/python/ops/wals.py b/tensorflow/contrib/factorization/python/ops/wals.py
index 9636625b106c234dc9788ebbf38e6766dc2bd5f2..2f7bf480415ddccab33b89210e2245c00f413093 100644
--- a/tensorflow/contrib/factorization/python/ops/wals.py
+++ b/tensorflow/contrib/factorization/python/ops/wals.py
@@ -70,6 +70,7 @@ class _SweepHook(session_run_hook.SessionRunHook):
         order. These are typically local initialization ops (such as cache
         initialization).
     """
+    # TODO(walidk): Provide a counter for the number of completed sweeps.
     self._num_rows = num_rows
     self._num_cols = num_cols
     self._row_prep_ops = row_prep_ops
@@ -367,10 +368,14 @@ class WALSMatrixFactorization(estimator.Estimator):
   The current implementation assumes that the training is run on a single
   machine, and will fail if config.num_worker_replicas is not equal to one.
   Training is done by calling self.fit(input_fn=input_fn), where input_fn
-  provides two queues: one for rows of the input matrix, and one for rows of the
-  transposed input matrix (i.e. columns of the original matrix). Note that
+  provides two tensors: one for rows of the input matrix, and one for rows of
+  the transposed input matrix (i.e. columns of the original matrix). Note that
   during a row sweep, only row batches are processed (ignoring column batches)
   and vice-versa.
+  Also note that every row (respectively every column) of the input matrix
+  must be processed at least once for the sweep to be considered complete. In
+  particular, training will not make progress if input_fn does not generate some
+  rows.
 
   For prediction, given a new set of input rows A' (e.g. new rows of the A
   matrix), we compute a corresponding set of row factors U', such that U' * V^T
@@ -473,10 +478,10 @@ class WALSMatrixFactorization(estimator.Estimator):
       ValueError: If config.num_worker_replicas is strictly greater than one.
         The current implementation only supports running on a single worker.
     """
-    # TODO(walidk): Support distributed training.
     # TODO(walidk): Support power-law based weight computation.
     # TODO(walidk): Add factor lookup by indices, with caching.
     # TODO(walidk): Support caching during prediction.
+    # TODO(walidk): Provide input pipelines that handle missing rows.
 
     params = {
         "num_rows": num_rows,
diff --git a/tensorflow/contrib/factorization/python/ops/wals_test.py b/tensorflow/contrib/factorization/python/ops/wals_test.py
index 3f5787ea8719fd43babfe90830a4624b04f769b4..323b89a5cd7f1e6c4697aaec6cfae7020e516540 100644
--- a/tensorflow/contrib/factorization/python/ops/wals_test.py
+++ b/tensorflow/contrib/factorization/python/ops/wals_test.py
@@ -25,6 +25,7 @@ import numpy as np
 from tensorflow.contrib.factorization.python.ops import factorization_ops_test_utils
 from tensorflow.contrib.factorization.python.ops import wals as wals_lib
 from tensorflow.contrib.learn.python.learn import run_config
+from tensorflow.contrib.learn.python.learn.estimators import model_fn
 from tensorflow.contrib.learn.python.learn.estimators import run_config as run_config_lib
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
@@ -77,10 +78,53 @@ class WALSMatrixFactorizationTest(test.TestCase):
     return sparse_tensor.SparseTensor(
         indices=new_indices, values=sp_x.values, dense_shape=shape)
 
-  # TODO(walidk): Add an option to randomize inputs.
-  def input_fn(self, np_matrix, batch_size, project_row=None,
-               projection_weights=None, col_ids=None):
-    """Returns an input_fn that selects row and col batches from np_matrix."""
+  # TODO(walidk): Add an option to shuffle inputs.
+  def input_fn(self, np_matrix, batch_size, mode,
+               project_row=None, projection_weights=None,
+               remove_empty_rows_columns=False):
+    """Returns an input_fn that selects row and col batches from np_matrix.
+
+    This simple utility creates an input function from a numpy_array. The
+    following transformations are performed:
+    * The empty rows and columns in np_matrix are removed (if
+      remove_empty_rows_columns is true)
+    * np_matrix is converted to a SparseTensor.
+    * The rows of the sparse matrix (and the rows of its transpose) are batched.
+    * A features dictionary is created, which contains the row / column batches.
+
+    In TRAIN mode, one only needs to specify the np_matrix and the batch_size.
+    In INFER and EVAL modes, one must also provide project_row, a boolean which
+    specifies whether we are projecting rows or columns.
+
+    Args:
+      np_matrix: A numpy array. The input matrix to use.
+      batch_size: Integer.
+      mode: Can be one of model_fn.ModeKeys.{TRAIN, INFER, EVAL}.
+      project_row: A boolean. Used in INFER and EVAL modes. Specifies whether
+        to project rows or columns.
+      projection_weights: A float numpy array. Used in INFER mode. Specifies
+        the weights to use in the projection (the weights are optional, and
+        default to 1.).
+      remove_empty_rows_columns: A boolean. When true, this will remove empty
+        rows and columns in the np_matrix. Note that this will result in
+        modifying the indices of the input matrix. The mapping from new indices
+        to old indices is returned in the form of two numpy arrays.
+
+    Returns:
+      A tuple consisting of:
+      _fn: A callable. Calling _fn returns a features dict.
+      nz_row_ids: A numpy array of the ids of non-empty rows, such that
+        nz_row_ids[i] is the old row index corresponding to new index i.
+      nz_col_ids: A numpy array of the ids of non-empty columns, such that
+        nz_col_ids[j] is the old column index corresponding to new index j.
+    """
+    if remove_empty_rows_columns:
+      np_matrix, nz_row_ids, nz_col_ids = (
+          factorization_ops_test_utils.remove_empty_rows_columns(np_matrix))
+    else:
+      nz_row_ids = np.arange(np.shape(np_matrix)[0])
+      nz_col_ids = np.arange(np.shape(np_matrix)[1])
+
     def extract_features(row_batch, col_batch, shape):
       row_ids = row_batch[0]
       col_ids = col_batch[0]
@@ -111,7 +155,15 @@ class WALSMatrixFactorizationTest(test.TestCase):
           enqueue_many=True)
 
       features = extract_features(row_batch, col_batch, sp_mat.dense_shape)
-      if projection_weights is not None:
+
+      if mode == model_fn.ModeKeys.INFER or mode == model_fn.ModeKeys.EVAL:
+        self.assertTrue(
+            project_row is not None,
+            msg='project_row must be specified in INFER or EVAL mode.')
+        features[wals_lib.WALSMatrixFactorization.PROJECT_ROW] = (
+            constant_op.constant(project_row))
+
+      if mode == model_fn.ModeKeys.INFER and projection_weights is not None:
         weights_batch = input_lib.batch(
             projection_weights,
             batch_size=batch_size,
@@ -119,14 +171,15 @@ class WALSMatrixFactorizationTest(test.TestCase):
             enqueue_many=True)
         features[wals_lib.WALSMatrixFactorization.PROJECTION_WEIGHTS] = (
             weights_batch)
-      if project_row is not None:
-        features[wals_lib.WALSMatrixFactorization.PROJECT_ROW] = (
-            constant_op.constant(project_row))
 
       labels = None
       return features, labels
 
-    return _fn
+    return _fn, nz_row_ids, nz_col_ids
+
+  @property
+  def input_matrix(self):
+    return self.INPUT_MATRIX
 
   @property
   def row_steps(self):
@@ -197,8 +250,10 @@ class WALSMatrixFactorizationTest(test.TestCase):
 
   def test_fit(self):
     # Row sweep.
-    input_fn = self.input_fn(np_matrix=self.INPUT_MATRIX,
-                             batch_size=self.batch_size)
+    input_fn = self.input_fn(np_matrix=self.input_matrix,
+                             batch_size=self.batch_size,
+                             mode=model_fn.ModeKeys.TRAIN,
+                             remove_empty_rows_columns=True)[0]
     self._model.fit(input_fn=input_fn, steps=self.row_steps)
     row_factors = self._model.get_row_factors()
     self.assertAllClose(row_factors[0], self._row_factors_0, atol=1e-3)
@@ -206,8 +261,10 @@ class WALSMatrixFactorizationTest(test.TestCase):
 
     # Col sweep.
     # Running fit a second time will resume training from the checkpoint.
-    input_fn = self.input_fn(np_matrix=self.INPUT_MATRIX,
-                             batch_size=self.batch_size)
+    input_fn = self.input_fn(np_matrix=self.input_matrix,
+                             batch_size=self.batch_size,
+                             mode=model_fn.ModeKeys.TRAIN,
+                             remove_empty_rows_columns=True)[0]
     self._model.fit(input_fn=input_fn, steps=self.col_steps)
     col_factors = self._model.get_col_factors()
     self.assertAllClose(col_factors[0], self._col_factors_0, atol=1e-3)
@@ -215,14 +272,18 @@ class WALSMatrixFactorizationTest(test.TestCase):
     self.assertAllClose(col_factors[2], self._col_factors_2, atol=1e-3)
 
   def test_predict(self):
-    input_fn = self.input_fn(np_matrix=self.INPUT_MATRIX,
-                             batch_size=self.batch_size)
+    input_fn = self.input_fn(np_matrix=self.input_matrix,
+                             batch_size=self.batch_size,
+                             mode=model_fn.ModeKeys.TRAIN,
+                             remove_empty_rows_columns=True,
+                            )[0]
     # Project rows 1 and 4 from the input matrix.
     proj_input_fn = self.input_fn(
         np_matrix=self.INPUT_MATRIX[[1, 4], :],
         batch_size=2,
+        mode=model_fn.ModeKeys.INFER,
         project_row=True,
-        projection_weights=[[0.2, 0.5]])
+        projection_weights=[[0.2, 0.5]])[0]
 
     self._model.fit(input_fn=input_fn, steps=self.row_steps)
     projections = self._model.get_projections(proj_input_fn)
@@ -237,8 +298,9 @@ class WALSMatrixFactorizationTest(test.TestCase):
     proj_input_fn = self.input_fn(
         np_matrix=self.INPUT_MATRIX[:, [5, 3, 1]],
         batch_size=3,
+        mode=model_fn.ModeKeys.INFER,
         project_row=False,
-        projection_weights=[[0.6, 0.4, 0.2]])
+        projection_weights=[[0.6, 0.4, 0.2]])[0]
 
     self._model.fit(input_fn=input_fn, steps=self.col_steps)
     projections = self._model.get_projections(proj_input_fn)
@@ -253,11 +315,17 @@ class WALSMatrixFactorizationTest(test.TestCase):
     # Do a row sweep then evaluate the model on row inputs.
     # The evaluate function returns the loss of the projected rows, but since
     # projection is idempotent, the eval loss must match the model loss.
-    input_fn = self.input_fn(np_matrix=self.INPUT_MATRIX,
-                             batch_size=self.batch_size)
+    input_fn = self.input_fn(np_matrix=self.input_matrix,
+                             batch_size=self.batch_size,
+                             mode=model_fn.ModeKeys.TRAIN,
+                             remove_empty_rows_columns=True,
+                            )[0]
     self._model.fit(input_fn=input_fn, steps=self.row_steps)
-    eval_input_fn_row = self.input_fn(np_matrix=self.INPUT_MATRIX, batch_size=1,
-                                      project_row=True)
+    eval_input_fn_row = self.input_fn(np_matrix=self.input_matrix,
+                                      batch_size=1,
+                                      mode=model_fn.ModeKeys.EVAL,
+                                      project_row=True,
+                                      remove_empty_rows_columns=True)[0]
     loss = self._model.evaluate(
         input_fn=eval_input_fn_row, steps=self._num_rows)['loss']
 
@@ -271,8 +339,11 @@ class WALSMatrixFactorizationTest(test.TestCase):
 
     # Do a col sweep then evaluate the model on col inputs.
     self._model.fit(input_fn=input_fn, steps=self.col_steps)
-    eval_input_fn_col = self.input_fn(np_matrix=self.INPUT_MATRIX, batch_size=1,
-                                      project_row=False)
+    eval_input_fn_col = self.input_fn(np_matrix=self.input_matrix,
+                                      batch_size=1,
+                                      mode=model_fn.ModeKeys.EVAL,
+                                      project_row=False,
+                                      remove_empty_rows_columns=True)[0]
     loss = self._model.evaluate(
         input_fn=eval_input_fn_col, steps=self._num_cols)['loss']
 
@@ -299,6 +370,16 @@ class WALSMatrixFactorizationTestFullBatch(WALSMatrixFactorizationTest):
     return 100
 
 
+class WALSMatrixFactorizaiontTestPaddedInput(WALSMatrixFactorizationTest):
+  PADDED_INPUT_MATRIX = np.pad(
+      WALSMatrixFactorizationTest.INPUT_MATRIX,
+      [(1, 0), (1, 0)], mode='constant')
+
+  @property
+  def input_matrix(self):
+    return self.PADDED_INPUT_MATRIX
+
+
 class WALSMatrixFactorizationUnsupportedTest(test.TestCase):
 
   def setUp(self):
diff --git a/tensorflow/contrib/ffmpeg/default/ffmpeg_lib.cc b/tensorflow/contrib/ffmpeg/default/ffmpeg_lib.cc
index e520139e659dbf39a6525ce15472d3c7bce53388..a4dd3a642fdfec1aeca7b82d30ccb7b291d4bc39 100644
--- a/tensorflow/contrib/ffmpeg/default/ffmpeg_lib.cc
+++ b/tensorflow/contrib/ffmpeg/default/ffmpeg_lib.cc
@@ -70,8 +70,7 @@ bool IsBinaryInstalled(const string& binary_name) {
     const string binary_path = io::JoinPath(dir, binary_name);
     char absolute_path[PATH_MAX + 1];
     if (::realpath(binary_path.c_str(), absolute_path) == NULL) {
-      LOG(ERROR) << "Invalid binary path: " << binary_path;
-      return false;
+      continue;
     }
     struct stat statinfo;
     int result = ::stat(absolute_path, &statinfo);
diff --git a/tensorflow/contrib/framework/__init__.py b/tensorflow/contrib/framework/__init__.py
index 40234f5b48021a6aea46573cdf3b21a112c4a878..d14fc5d6856376db9baa6d8c6599ff437fc0db54 100644
--- a/tensorflow/contrib/framework/__init__.py
+++ b/tensorflow/contrib/framework/__init__.py
@@ -40,6 +40,9 @@ See the @{$python/contrib.framework} guide.
 @@has_arg_scope
 @@arg_scoped_arguments
 
+@@prepend_name_scope
+@@strip_name_scope
+
 @@add_model_variable
 @@assert_global_step
 @@assert_or_get_global_step
@@ -82,6 +85,9 @@ from tensorflow.contrib.framework.python.framework import *
 from tensorflow.contrib.framework.python.ops import *
 # pylint: enable=unused-import,wildcard-import
 
+from tensorflow.python.framework.ops import prepend_name_scope
+from tensorflow.python.framework.ops import strip_name_scope
+
 from tensorflow.python.util.all_util import remove_undocumented
 
 
diff --git a/tensorflow/contrib/grid_rnn/python/kernel_tests/grid_rnn_test.py b/tensorflow/contrib/grid_rnn/python/kernel_tests/grid_rnn_test.py
index 6455e01894861fc43033733b58991141303a2f35..280271a42dc7fc007c4c0c06b64e4532472a728c 100644
--- a/tensorflow/contrib/grid_rnn/python/kernel_tests/grid_rnn_test.py
+++ b/tensorflow/contrib/grid_rnn/python/kernel_tests/grid_rnn_test.py
@@ -35,8 +35,8 @@ class GridRNNCellTest(test.TestCase):
 
   def testGrid2BasicLSTMCell(self):
     with self.test_session(use_gpu=False) as sess:
-      with variable_scope.variable_scope('root',
-          initializer=init_ops.constant_initializer(0.2)) as root_scope:
+      with variable_scope.variable_scope(
+          'root', initializer=init_ops.constant_initializer(0.2)) as root_scope:
         x = array_ops.zeros([1, 3])
         m = ((array_ops.zeros([1, 2]), array_ops.zeros([1, 2])),
              (array_ops.zeros([1, 2]), array_ops.zeros([1, 2])))
@@ -51,21 +51,22 @@ class GridRNNCellTest(test.TestCase):
         self.assertEqual(s[1].h.get_shape(), (1, 2))
 
         sess.run([variables.global_variables_initializer()])
-        res_g, res_s = sess.run(
-            [g, s], {x: np.array([[1., 1., 1.]]),
-                     m: ((np.array([[0.1, 0.2]]), np.array([[0.3, 0.4]])),
-                         (np.array([[0.5, 0.6]]), np.array([[0.7, 0.8]])))})
+        res_g, res_s = sess.run([g, s], {
+            x:
+                np.array([[1., 1., 1.]]),
+            m: ((np.array([[0.1, 0.2]]), np.array([[0.3, 0.4]])),
+                (np.array([[0.5, 0.6]]), np.array([[0.7, 0.8]])))
+        })
         self.assertEqual(res_g[0].shape, (1, 2))
         self.assertEqual(res_s[0].c.shape, (1, 2))
         self.assertEqual(res_s[0].h.shape, (1, 2))
         self.assertEqual(res_s[1].c.shape, (1, 2))
         self.assertEqual(res_s[1].h.shape, (1, 2))
 
-        self.assertAllClose(res_g, ([[0.36617181, 0.36617181]], ))
-        self.assertAllClose(res_s, (([[0.71053141, 0.71053141]],
-                                     [[0.36617181, 0.36617181]]),
-                                    ([[0.72320831, 0.80555487]],
-                                     [[0.39102408, 0.42150158]])))
+        self.assertAllClose(res_g, ([[0.36617181, 0.36617181]],))
+        self.assertAllClose(
+            res_s, (([[0.71053141, 0.71053141]], [[0.36617181, 0.36617181]]),
+                    ([[0.72320831, 0.80555487]], [[0.39102408, 0.42150158]])))
 
         # emulate a loop through the input sequence,
         # where we call cell() multiple times
@@ -78,22 +79,22 @@ class GridRNNCellTest(test.TestCase):
         self.assertEqual(s2[1].h.get_shape(), (1, 2))
 
         res_g2, res_s2 = sess.run([g2, s2],
-                                  {x: np.array([[2., 2., 2.]]), m: res_s})
+                                  {x: np.array([[2., 2., 2.]]),
+                                   m: res_s})
         self.assertEqual(res_g2[0].shape, (1, 2))
         self.assertEqual(res_s2[0].c.shape, (1, 2))
         self.assertEqual(res_s2[0].h.shape, (1, 2))
         self.assertEqual(res_s2[1].c.shape, (1, 2))
         self.assertEqual(res_s2[1].h.shape, (1, 2))
         self.assertAllClose(res_g2[0], [[0.58847463, 0.58847463]])
-        self.assertAllClose(res_s2, (([[1.40469193, 1.40469193]],
-                                      [[0.58847463, 0.58847463]]),
-                                     ([[0.97726452, 1.04626071]],
-                                      [[0.4927212, 0.51137757]])))
+        self.assertAllClose(
+            res_s2, (([[1.40469193, 1.40469193]], [[0.58847463, 0.58847463]]),
+                     ([[0.97726452, 1.04626071]], [[0.4927212, 0.51137757]])))
 
   def testGrid2BasicLSTMCellTied(self):
     with self.test_session(use_gpu=False) as sess:
       with variable_scope.variable_scope(
-              'root', initializer=init_ops.constant_initializer(0.2)):
+          'root', initializer=init_ops.constant_initializer(0.2)):
         x = array_ops.zeros([1, 3])
         m = ((array_ops.zeros([1, 2]), array_ops.zeros([1, 2])),
              (array_ops.zeros([1, 2]), array_ops.zeros([1, 2])))
@@ -108,10 +109,12 @@ class GridRNNCellTest(test.TestCase):
         self.assertEqual(s[1].h.get_shape(), (1, 2))
 
         sess.run([variables.global_variables_initializer()])
-        res_g, res_s = sess.run(
-            [g, s], {x: np.array([[1., 1., 1.]]),
-                     m: ((np.array([[0.1, 0.2]]), np.array([[0.3, 0.4]])),
-                         (np.array([[0.5, 0.6]]), np.array([[0.7, 0.8]])))})
+        res_g, res_s = sess.run([g, s], {
+            x:
+                np.array([[1., 1., 1.]]),
+            m: ((np.array([[0.1, 0.2]]), np.array([[0.3, 0.4]])),
+                (np.array([[0.5, 0.6]]), np.array([[0.7, 0.8]])))
+        })
         self.assertEqual(res_g[0].shape, (1, 2))
         self.assertEqual(res_s[0].c.shape, (1, 2))
         self.assertEqual(res_s[0].h.shape, (1, 2))
@@ -119,29 +122,27 @@ class GridRNNCellTest(test.TestCase):
         self.assertEqual(res_s[1].h.shape, (1, 2))
 
         self.assertAllClose(res_g[0], [[0.36617181, 0.36617181]])
-        self.assertAllClose(res_s, (([[0.71053141, 0.71053141]],
-                                     [[0.36617181, 0.36617181]]),
-                                    ([[0.72320831, 0.80555487]],
-                                     [[0.39102408, 0.42150158]])))
+        self.assertAllClose(
+            res_s, (([[0.71053141, 0.71053141]], [[0.36617181, 0.36617181]]),
+                    ([[0.72320831, 0.80555487]], [[0.39102408, 0.42150158]])))
 
         res_g, res_s = sess.run([g, s], {x: np.array([[1., 1., 1.]]), m: res_s})
         self.assertEqual(res_g[0].shape, (1, 2))
 
         self.assertAllClose(res_g[0], [[0.36703536, 0.36703536]])
-        self.assertAllClose(res_s, (([[0.71200621, 0.71200621]],
-                                     [[0.36703536, 0.36703536]]),
-                                    ([[0.80941606, 0.87550586]],
-                                     [[0.40108523, 0.42199609]])))
+        self.assertAllClose(
+            res_s, (([[0.71200621, 0.71200621]], [[0.36703536, 0.36703536]]),
+                    ([[0.80941606, 0.87550586]], [[0.40108523, 0.42199609]])))
 
   def testGrid2BasicLSTMCellWithRelu(self):
     with self.test_session(use_gpu=False) as sess:
-      with variable_scope.variable_scope('root',
-          initializer=init_ops.constant_initializer(0.2)):
+      with variable_scope.variable_scope(
+          'root', initializer=init_ops.constant_initializer(0.2)):
         x = array_ops.zeros([1, 3])
         m = ((array_ops.zeros([1, 2]), array_ops.zeros([1, 2])),)
         cell = grid_rnn_cell.Grid2BasicLSTMCell(
             2, tied=False, non_recurrent_fn=nn_ops.relu)
-        self.assertEqual(cell.state_size, ((2, 2), ))
+        self.assertEqual(cell.state_size, ((2, 2),))
 
         g, s = cell(x, m)
         self.assertEqual(g[0].get_shape(), (1, 2))
@@ -149,21 +150,22 @@ class GridRNNCellTest(test.TestCase):
         self.assertEqual(s[0].h.get_shape(), (1, 2))
 
         sess.run([variables.global_variables_initializer()])
-        res_g, res_s = sess.run(
-          [g, s], {x: np.array([[1., 1., 1.]]),
-                   m: ((np.array([[0.1, 0.2]]), np.array([[0.3, 0.4]])), )})
+        res_g, res_s = sess.run([g, s], {
+            x: np.array([[1., 1., 1.]]),
+            m: ((np.array([[0.1, 0.2]]), np.array([[0.3, 0.4]])),)
+        })
         self.assertEqual(res_g[0].shape, (1, 2))
         self.assertAllClose(res_g[0], [[0.31667367, 0.31667367]])
         self.assertAllClose(res_s, (([[0.29530135, 0.37520045]],
-                                     [[0.17044567, 0.21292259]]), ))
+                                     [[0.17044567, 0.21292259]]),))
 
   """LSTMCell
   """
 
   def testGrid2LSTMCell(self):
     with self.test_session(use_gpu=False) as sess:
-      with variable_scope.variable_scope('root',
-          initializer=init_ops.constant_initializer(0.5)):
+      with variable_scope.variable_scope(
+          'root', initializer=init_ops.constant_initializer(0.5)):
         x = array_ops.zeros([1, 3])
         m = ((array_ops.zeros([1, 2]), array_ops.zeros([1, 2])),
              (array_ops.zeros([1, 2]), array_ops.zeros([1, 2])))
@@ -178,10 +180,12 @@ class GridRNNCellTest(test.TestCase):
         self.assertEqual(s[1].h.get_shape(), (1, 2))
 
         sess.run([variables.global_variables_initializer()])
-        res_g, res_s = sess.run(
-            [g, s], {x: np.array([[1., 1., 1.]]),
-                     m: ((np.array([[0.1, 0.2]]), np.array([[0.3, 0.4]])),
-                         (np.array([[0.5, 0.6]]), np.array([[0.7, 0.8]])))})
+        res_g, res_s = sess.run([g, s], {
+            x:
+                np.array([[1., 1., 1.]]),
+            m: ((np.array([[0.1, 0.2]]), np.array([[0.3, 0.4]])),
+                (np.array([[0.5, 0.6]]), np.array([[0.7, 0.8]])))
+        })
         self.assertEqual(res_g[0].shape, (1, 2))
         self.assertEqual(res_s[0].c.shape, (1, 2))
         self.assertEqual(res_s[0].h.shape, (1, 2))
@@ -189,15 +193,14 @@ class GridRNNCellTest(test.TestCase):
         self.assertEqual(res_s[1].h.shape, (1, 2))
 
         self.assertAllClose(res_g[0], [[0.95686918, 0.95686918]])
-        self.assertAllClose(res_s, (([[2.41515064, 2.41515064]],
-                                     [[0.95686918, 0.95686918]]),
-                                    ([[1.38917875, 1.49043763]],
-                                     [[0.83884692, 0.86036491]])))
+        self.assertAllClose(
+            res_s, (([[2.41515064, 2.41515064]], [[0.95686918, 0.95686918]]),
+                    ([[1.38917875, 1.49043763]], [[0.83884692, 0.86036491]])))
 
   def testGrid2LSTMCellTied(self):
     with self.test_session(use_gpu=False) as sess:
-      with variable_scope.variable_scope('root',
-          initializer=init_ops.constant_initializer(0.5)):
+      with variable_scope.variable_scope(
+          'root', initializer=init_ops.constant_initializer(0.5)):
         x = array_ops.zeros([1, 3])
         m = ((array_ops.zeros([1, 2]), array_ops.zeros([1, 2])),
              (array_ops.zeros([1, 2]), array_ops.zeros([1, 2])))
@@ -212,10 +215,12 @@ class GridRNNCellTest(test.TestCase):
         self.assertEqual(s[1].h.get_shape(), (1, 2))
 
         sess.run([variables.global_variables_initializer()])
-        res_g, res_s = sess.run(
-            [g, s], {x: np.array([[1., 1., 1.]]),
-                     m: ((np.array([[0.1, 0.2]]), np.array([[0.3, 0.4]])),
-                         (np.array([[0.5, 0.6]]), np.array([[0.7, 0.8]])))})
+        res_g, res_s = sess.run([g, s], {
+            x:
+                np.array([[1., 1., 1.]]),
+            m: ((np.array([[0.1, 0.2]]), np.array([[0.3, 0.4]])),
+                (np.array([[0.5, 0.6]]), np.array([[0.7, 0.8]])))
+        })
         self.assertEqual(res_g[0].shape, (1, 2))
         self.assertEqual(res_s[0].c.shape, (1, 2))
         self.assertEqual(res_s[0].h.shape, (1, 2))
@@ -223,15 +228,14 @@ class GridRNNCellTest(test.TestCase):
         self.assertEqual(res_s[1].h.shape, (1, 2))
 
         self.assertAllClose(res_g[0], [[0.95686918, 0.95686918]])
-        self.assertAllClose(res_s, (([[2.41515064, 2.41515064]],
-                                     [[0.95686918, 0.95686918]]),
-                                    ([[1.38917875, 1.49043763]],
-                                     [[0.83884692, 0.86036491]])))
+        self.assertAllClose(
+            res_s, (([[2.41515064, 2.41515064]], [[0.95686918, 0.95686918]]),
+                    ([[1.38917875, 1.49043763]], [[0.83884692, 0.86036491]])))
 
   def testGrid2LSTMCellWithRelu(self):
     with self.test_session() as sess:
-      with variable_scope.variable_scope('root',
-          initializer=init_ops.constant_initializer(0.5)):
+      with variable_scope.variable_scope(
+          'root', initializer=init_ops.constant_initializer(0.5)):
         x = array_ops.zeros([1, 3])
         m = ((array_ops.zeros([1, 2]), array_ops.zeros([1, 2])),)
         cell = grid_rnn_cell.Grid2LSTMCell(
@@ -244,21 +248,22 @@ class GridRNNCellTest(test.TestCase):
         self.assertEqual(s[0].h.get_shape(), (1, 2))
 
         sess.run([variables.global_variables_initializer()])
-        res_g, res_s = sess.run(
-          [g, s], {x: np.array([[1., 1., 1.]]),
-                   m: ((np.array([[0.1, 0.2]]), np.array([[0.3, 0.4]])), )})
+        res_g, res_s = sess.run([g, s], {
+            x: np.array([[1., 1., 1.]]),
+            m: ((np.array([[0.1, 0.2]]), np.array([[0.3, 0.4]])),)
+        })
         self.assertEqual(res_g[0].shape, (1, 2))
         self.assertAllClose(res_g[0], [[2.1831727, 2.1831727]])
         self.assertAllClose(res_s, (([[0.92270052, 1.02325559]],
-                                     [[0.66159075, 0.70475441]]), ))
+                                     [[0.66159075, 0.70475441]]),))
 
   """RNNCell
   """
 
   def testGrid2BasicRNNCell(self):
     with self.test_session() as sess:
-      with variable_scope.variable_scope('root',
-          initializer=init_ops.constant_initializer(0.5)):
+      with variable_scope.variable_scope(
+          'root', initializer=init_ops.constant_initializer(0.5)):
         x = array_ops.zeros([2, 2])
         m = (array_ops.zeros([2, 2]), array_ops.zeros([2, 2]))
         cell = grid_rnn_cell.Grid2BasicRNNCell(2)
@@ -270,26 +275,26 @@ class GridRNNCellTest(test.TestCase):
         self.assertEqual(s[1].get_shape(), (2, 2))
 
         sess.run([variables.global_variables_initializer()])
-        res_g, res_s = sess.run(
-            [g, s], {x: np.array([[1., 1.], [2., 2.]]),
-                     m: (np.array([[0.1, 0.1], [0.2, 0.2]]),
-                         np.array([[0.1, 0.1], [0.2, 0.2]]))})
+        res_g, res_s = sess.run([g, s], {
+            x:
+                np.array([[1., 1.], [2., 2.]]),
+            m: (np.array([[0.1, 0.1], [0.2, 0.2]]), np.array([[0.1, 0.1],
+                                                              [0.2, 0.2]]))
+        })
         self.assertEqual(res_g[0].shape, (2, 2))
         self.assertEqual(res_s[0].shape, (2, 2))
         self.assertEqual(res_s[1].shape, (2, 2))
 
         self.assertAllClose(res_g, ([[0.94685763, 0.94685763],
-                                    [0.99480951, 0.99480951]], ))
-        self.assertAllClose(res_s,
-                            ([[0.94685763, 0.94685763],
-                              [0.99480951, 0.99480951]],
-                             [[0.80049908, 0.80049908],
-                              [0.97574311, 0.97574311]]))
+                                     [0.99480951, 0.99480951]],))
+        self.assertAllClose(
+            res_s, ([[0.94685763, 0.94685763], [0.99480951, 0.99480951]],
+                    [[0.80049908, 0.80049908], [0.97574311, 0.97574311]]))
 
   def testGrid2BasicRNNCellTied(self):
     with self.test_session() as sess:
-      with variable_scope.variable_scope('root',
-          initializer=init_ops.constant_initializer(0.5)):
+      with variable_scope.variable_scope(
+          'root', initializer=init_ops.constant_initializer(0.5)):
         x = array_ops.zeros([2, 2])
         m = (array_ops.zeros([2, 2]), array_ops.zeros([2, 2]))
         cell = grid_rnn_cell.Grid2BasicRNNCell(2, tied=True)
@@ -301,55 +306,55 @@ class GridRNNCellTest(test.TestCase):
         self.assertEqual(s[1].get_shape(), (2, 2))
 
         sess.run([variables.global_variables_initializer()])
-        res_g, res_s = sess.run(
-            [g, s], {x: np.array([[1., 1.], [2., 2.]]),
-                     m: (np.array([[0.1, 0.1], [0.2, 0.2]]),
-                         np.array([[0.1, 0.1], [0.2, 0.2]]))})
+        res_g, res_s = sess.run([g, s], {
+            x:
+                np.array([[1., 1.], [2., 2.]]),
+            m: (np.array([[0.1, 0.1], [0.2, 0.2]]), np.array([[0.1, 0.1],
+                                                              [0.2, 0.2]]))
+        })
         self.assertEqual(res_g[0].shape, (2, 2))
         self.assertEqual(res_s[0].shape, (2, 2))
         self.assertEqual(res_s[1].shape, (2, 2))
 
         self.assertAllClose(res_g, ([[0.94685763, 0.94685763],
-                                     [0.99480951, 0.99480951]], ))
-        self.assertAllClose(res_s,
-                            ([[0.94685763, 0.94685763],
-                              [0.99480951, 0.99480951]],
-                             [[0.80049908, 0.80049908],
-                              [0.97574311, 0.97574311]]))
+                                     [0.99480951, 0.99480951]],))
+        self.assertAllClose(
+            res_s, ([[0.94685763, 0.94685763], [0.99480951, 0.99480951]],
+                    [[0.80049908, 0.80049908], [0.97574311, 0.97574311]]))
 
   def testGrid2BasicRNNCellWithRelu(self):
     with self.test_session() as sess:
-      with variable_scope.variable_scope('root',
-          initializer=init_ops.constant_initializer(0.5)):
+      with variable_scope.variable_scope(
+          'root', initializer=init_ops.constant_initializer(0.5)):
         x = array_ops.zeros([1, 2])
-        m = (array_ops.zeros([1, 2]), )
-        cell = grid_rnn_cell.Grid2BasicRNNCell(
-            2, non_recurrent_fn=nn_ops.relu)
-        self.assertEqual(cell.state_size, (2, ))
+        m = (array_ops.zeros([1, 2]),)
+        cell = grid_rnn_cell.Grid2BasicRNNCell(2, non_recurrent_fn=nn_ops.relu)
+        self.assertEqual(cell.state_size, (2,))
 
         g, s = cell(x, m)
         self.assertEqual(g[0].get_shape(), (1, 2))
         self.assertEqual(s[0].get_shape(), (1, 2))
 
         sess.run([variables.global_variables_initializer()])
-        res_g, res_s = sess.run([g, s], {x: np.array([[1., 1.]]),
-                                         m: np.array([[0.1, 0.1]])})
+        res_g, res_s = sess.run(
+            [g, s], {x: np.array([[1., 1.]]),
+                     m: np.array([[0.1, 0.1]])})
         self.assertEqual(res_g[0].shape, (1, 2))
         self.assertEqual(res_s[0].shape, (1, 2))
-        self.assertAllClose(res_g, ([[1.80049896, 1.80049896]], ))
-        self.assertAllClose(res_s, ([[0.80049896, 0.80049896]], ))
+        self.assertAllClose(res_g, ([[1.80049896, 1.80049896]],))
+        self.assertAllClose(res_s, ([[0.80049896, 0.80049896]],))
 
   """1-LSTM
   """
 
   def testGrid1LSTMCell(self):
     with self.test_session() as sess:
-      with variable_scope.variable_scope('root',
-          initializer=init_ops.constant_initializer(0.5)) as root_scope:
+      with variable_scope.variable_scope(
+          'root', initializer=init_ops.constant_initializer(0.5)) as root_scope:
         x = array_ops.zeros([1, 3])
-        m = ((array_ops.zeros([1, 2]), array_ops.zeros([1, 2])), )
+        m = ((array_ops.zeros([1, 2]), array_ops.zeros([1, 2])),)
         cell = grid_rnn_cell.Grid1LSTMCell(2, use_peepholes=True)
-        self.assertEqual(cell.state_size, ((2, 2), ))
+        self.assertEqual(cell.state_size, ((2, 2),))
 
         g, s = cell(x, m)
         self.assertEqual(g[0].get_shape(), (1, 2))
@@ -357,17 +362,17 @@ class GridRNNCellTest(test.TestCase):
         self.assertEqual(s[0].h.get_shape(), (1, 2))
 
         sess.run([variables.global_variables_initializer()])
-        res_g, res_s = sess.run(
-          [g, s], {x: np.array([[1., 1., 1.]]),
-                   m: ((np.array([[0.1, 0.2]]), np.array([[0.3, 0.4]])), )})
+        res_g, res_s = sess.run([g, s], {
+            x: np.array([[1., 1., 1.]]),
+            m: ((np.array([[0.1, 0.2]]), np.array([[0.3, 0.4]])),)
+        })
         self.assertEqual(res_g[0].shape, (1, 2))
         self.assertEqual(res_s[0].c.shape, (1, 2))
         self.assertEqual(res_s[0].h.shape, (1, 2))
 
-        self.assertAllClose(res_g, ([[0.91287315, 0.91287315]], ))
-        self.assertAllClose(res_s,
-                            (([[2.26285243, 2.26285243]],
-                              [[0.91287315, 0.91287315]]), ))
+        self.assertAllClose(res_g, ([[0.91287315, 0.91287315]],))
+        self.assertAllClose(res_s, (([[2.26285243, 2.26285243]],
+                                     [[0.91287315, 0.91287315]]),))
 
         root_scope.reuse_variables()
 
@@ -383,10 +388,9 @@ class GridRNNCellTest(test.TestCase):
         self.assertEqual(res_s2[0].c.shape, (1, 2))
         self.assertEqual(res_s2[0].h.shape, (1, 2))
 
-        self.assertAllClose(res_g2, ([[0.9032144, 0.9032144]], ))
-        self.assertAllClose(res_s2,
-                            (([[2.79966092, 2.79966092]],
-                              [[0.9032144, 0.9032144]]), ))
+        self.assertAllClose(res_g2, ([[0.9032144, 0.9032144]],))
+        self.assertAllClose(res_s2, (([[2.79966092, 2.79966092]],
+                                      [[0.9032144, 0.9032144]]),))
 
         g3, s3 = cell(x2, m)
         self.assertEqual(g3[0].get_shape(), (1, 2))
@@ -398,18 +402,17 @@ class GridRNNCellTest(test.TestCase):
         self.assertEqual(res_g3[0].shape, (1, 2))
         self.assertEqual(res_s3[0].c.shape, (1, 2))
         self.assertEqual(res_s3[0].h.shape, (1, 2))
-        self.assertAllClose(res_g3, ([[0.92727238, 0.92727238]], ))
-        self.assertAllClose(res_s3,
-                            (([[3.3529923, 3.3529923]],
-                              [[0.92727238, 0.92727238]]), ))
+        self.assertAllClose(res_g3, ([[0.92727238, 0.92727238]],))
+        self.assertAllClose(res_s3, (([[3.3529923, 3.3529923]],
+                                      [[0.92727238, 0.92727238]]),))
 
   """3-LSTM
   """
 
   def testGrid3LSTMCell(self):
     with self.test_session() as sess:
-      with variable_scope.variable_scope('root',
-          initializer=init_ops.constant_initializer(0.5)):
+      with variable_scope.variable_scope(
+          'root', initializer=init_ops.constant_initializer(0.5)):
         x = array_ops.zeros([1, 3])
         m = ((array_ops.zeros([1, 2]), array_ops.zeros([1, 2])),
              (array_ops.zeros([1, 2]), array_ops.zeros([1, 2])),
@@ -427,11 +430,13 @@ class GridRNNCellTest(test.TestCase):
         self.assertEqual(s[2].h.get_shape(), (1, 2))
 
         sess.run([variables.global_variables_initializer()])
-        res_g, res_s = sess.run(
-          [g, s], {x: np.array([[1., 1., 1.]]),
-                   m: ((np.array([[0.1, 0.2]]), np.array([[0.3, 0.4]])),
-                       (np.array([[0.5, 0.6]]), np.array([[0.7, 0.8]])),
-                       (np.array([[-0.1, -0.2]]), np.array([[-0.3, -0.4]])))})
+        res_g, res_s = sess.run([g, s], {
+            x:
+                np.array([[1., 1., 1.]]),
+            m: ((np.array([[0.1, 0.2]]), np.array([[0.3, 0.4]])),
+                (np.array([[0.5, 0.6]]), np.array([[0.7, 0.8]])), (np.array(
+                    [[-0.1, -0.2]]), np.array([[-0.3, -0.4]])))
+        })
         self.assertEqual(res_g[0].shape, (1, 2))
         self.assertEqual(res_s[0].c.shape, (1, 2))
         self.assertEqual(res_s[0].h.shape, (1, 2))
@@ -440,21 +445,19 @@ class GridRNNCellTest(test.TestCase):
         self.assertEqual(res_s[2].c.shape, (1, 2))
         self.assertEqual(res_s[2].h.shape, (1, 2))
 
-        self.assertAllClose(res_g, ([[0.96892911, 0.96892911]], ))
-        self.assertAllClose(res_s, (([[2.45227885, 2.45227885]],
-                                     [[0.96892911, 0.96892911]]),
-                                    ([[1.33592629, 1.4373529]],
-                                     [[0.80867189, 0.83247656]]),
-                                    ([[0.7317788, 0.63205892]],
-                                     [[0.56548983, 0.50446129]])))
+        self.assertAllClose(res_g, ([[0.96892911, 0.96892911]],))
+        self.assertAllClose(
+            res_s, (([[2.45227885, 2.45227885]], [[0.96892911, 0.96892911]]),
+                    ([[1.33592629, 1.4373529]], [[0.80867189, 0.83247656]]),
+                    ([[0.7317788, 0.63205892]], [[0.56548983, 0.50446129]])))
 
   """Edge cases
   """
 
   def testGridRNNEdgeCasesLikeRelu(self):
     with self.test_session() as sess:
-      with variable_scope.variable_scope('root',
-          initializer=init_ops.constant_initializer(0.5)):
+      with variable_scope.variable_scope(
+          'root', initializer=init_ops.constant_initializer(0.5)):
         x = array_ops.zeros([3, 2])
         m = ()
 
@@ -471,18 +474,18 @@ class GridRNNCellTest(test.TestCase):
         self.assertEqual(s, ())
 
         sess.run([variables.global_variables_initializer()])
-        res_g, res_s = sess.run(
-          [g, s], {x: np.array([[1., -1.], [-2, 1], [2, -1]])})
+        res_g, res_s = sess.run([g, s],
+                                {x: np.array([[1., -1.], [-2, 1], [2, -1]])})
         self.assertEqual(res_g[0].shape, (3, 2))
         self.assertEqual(res_s, ())
-        self.assertAllClose(res_g, ([[0, 0], [0, 0], [0.5, 0.5]], ))
+        self.assertAllClose(res_g, ([[0, 0], [0, 0], [0.5, 0.5]],))
 
   def testGridRNNEdgeCasesNoOutput(self):
     with self.test_session() as sess:
-      with variable_scope.variable_scope('root',
-          initializer=init_ops.constant_initializer(0.5)):
+      with variable_scope.variable_scope(
+          'root', initializer=init_ops.constant_initializer(0.5)):
         x = array_ops.zeros([1, 2])
-        m = ((array_ops.zeros([1, 2]), array_ops.zeros([1, 2])), )
+        m = ((array_ops.zeros([1, 2]), array_ops.zeros([1, 2])),)
 
         # This cell produces no output
         cell = grid_rnn_cell.GridRNNCell(
@@ -498,9 +501,10 @@ class GridRNNCellTest(test.TestCase):
         self.assertEqual(s[0].h.get_shape(), (1, 2))
 
         sess.run([variables.global_variables_initializer()])
-        res_g, res_s = sess.run(
-          [g, s], {x: np.array([[1., 1.]]),
-                   m: ((np.array([[0.1, 0.1]]), np.array([[0.1, 0.1]])), )})
+        res_g, res_s = sess.run([g, s], {
+            x: np.array([[1., 1.]]),
+            m: ((np.array([[0.1, 0.1]]), np.array([[0.1, 0.1]])),)
+        })
         self.assertEqual(res_g, ())
         self.assertEqual(res_s[0].c.shape, (1, 2))
         self.assertEqual(res_s[0].h.shape, (1, 2))
@@ -561,8 +565,9 @@ class GridRNNCellTest(test.TestCase):
       cell = grid_rnn_cell.Grid2LSTMCell(
           num_units=num_units, non_recurrent_fn=nn_ops.relu)
 
-      inputs = max_length * [array_ops.placeholder(
-        dtypes.float32, shape=(batch_size, input_size))]
+      inputs = max_length * [
+          array_ops.placeholder(dtypes.float32, shape=(batch_size, input_size))
+      ]
 
       outputs, state = core_rnn.static_rnn(cell, inputs, dtype=dtypes.float32)
 
@@ -600,8 +605,9 @@ class GridRNNCellTest(test.TestCase):
       cell = grid_rnn_cell.Grid3LSTMCell(
           num_units=num_units, non_recurrent_fn=nn_ops.relu)
 
-      inputs = max_length * [array_ops.placeholder(
-        dtypes.float32, shape=(batch_size, input_size))]
+      inputs = max_length * [
+          array_ops.placeholder(dtypes.float32, shape=(batch_size, input_size))
+      ]
 
       outputs, state = core_rnn.static_rnn(cell, inputs, dtype=dtypes.float32)
 
@@ -671,19 +677,17 @@ class GridRNNCellTest(test.TestCase):
             self.assertTrue(np.all(np.isfinite(v)))
 
   def testGrid2LSTMCellWithRNNAndDynamicBatchSize(self):
-    """Test for #4296
-    """
+    """Test for #4296."""
     input_size = 5
     max_length = 6  # unrolled up to this length
     num_units = 2
 
-    with variable_scope.variable_scope('root',
-                           initializer=init_ops.constant_initializer(0.5)):
+    with variable_scope.variable_scope(
+        'root', initializer=init_ops.constant_initializer(0.5)):
       cell = grid_rnn_cell.Grid2LSTMCell(num_units=num_units)
 
       inputs = max_length * [
-        array_ops.placeholder(
-          dtypes.float32, shape=(None, input_size))
+          array_ops.placeholder(dtypes.float32, shape=(None, input_size))
       ]
 
       outputs, state = core_rnn.static_rnn(cell, inputs, dtype=dtypes.float32)
@@ -700,8 +704,7 @@ class GridRNNCellTest(test.TestCase):
       sess.run(variables.global_variables_initializer())
 
       input_value = np.ones((3, input_size))
-      values = sess.run(outputs + [state],
-                        feed_dict={inputs[0]: input_value})
+      values = sess.run(outputs + [state], feed_dict={inputs[0]: input_value})
       for tp in values[:-1]:
         for v in tp:
           self.assertTrue(np.all(np.isfinite(v)))
@@ -710,18 +713,15 @@ class GridRNNCellTest(test.TestCase):
           for v in st:
             self.assertTrue(np.all(np.isfinite(v)))
 
-
   def testGrid2LSTMCellLegacy(self):
-    """Test for legacy case (when state_is_tuple=False)
-    """
+    """Test for legacy case (when state_is_tuple=False)."""
     with self.test_session() as sess:
-      with variable_scope.variable_scope('root',
-          initializer=init_ops.constant_initializer(0.5)):
+      with variable_scope.variable_scope(
+          'root', initializer=init_ops.constant_initializer(0.5)):
         x = array_ops.zeros([1, 3])
         m = array_ops.zeros([1, 8])
-        cell = grid_rnn_cell.Grid2LSTMCell(2, use_peepholes=True,
-                                           state_is_tuple=False,
-                                           output_is_tuple=False)
+        cell = grid_rnn_cell.Grid2LSTMCell(
+            2, use_peepholes=True, state_is_tuple=False, output_is_tuple=False)
         self.assertEqual(cell.state_size, 8)
 
         g, s = cell(x, m)
@@ -729,15 +729,17 @@ class GridRNNCellTest(test.TestCase):
         self.assertEqual(s.get_shape(), (1, 8))
 
         sess.run([variables.global_variables_initializer()])
-        res = sess.run(
-            [g, s], {x: np.array([[1., 1., 1.]]),
-                     m: np.array([[0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8]])})
+        res = sess.run([g, s], {
+            x: np.array([[1., 1., 1.]]),
+            m: np.array([[0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8]])
+        })
         self.assertEqual(res[0].shape, (1, 2))
         self.assertEqual(res[1].shape, (1, 8))
         self.assertAllClose(res[0], [[0.95686918, 0.95686918]])
-        self.assertAllClose(res[1], [[2.41515064, 2.41515064, 0.95686918,
-                                      0.95686918, 1.38917875, 1.49043763,
-                                      0.83884692, 0.86036491]])
+        self.assertAllClose(res[1], [[
+            2.41515064, 2.41515064, 0.95686918, 0.95686918, 1.38917875,
+            1.49043763, 0.83884692, 0.86036491
+        ]])
 
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/contrib/grid_rnn/python/ops/grid_rnn_cell.py b/tensorflow/contrib/grid_rnn/python/ops/grid_rnn_cell.py
index 269ad0a384990f3edfd668111df57084350b67d3..252788140f8c1906718c150574b963385b6ecfa1 100644
--- a/tensorflow/contrib/grid_rnn/python/ops/grid_rnn_cell.py
+++ b/tensorflow/contrib/grid_rnn/python/ops/grid_rnn_cell.py
@@ -102,16 +102,16 @@ class GridRNNCell(rnn.RNNCell):
       output_is_tuple: If True, the output is a tuple of the outputs of the
         recurrent dimensions. If False, they are concatenated along the
         column axis. The later behavior will soon be deprecated.
-        
+
     Raises:
       TypeError: if cell_fn does not return an RNNCell instance.
     """
     if not state_is_tuple:
-      logging.warning("%s: Using a concatenated state is slower and will "
-                      "soon be deprecated.  Use state_is_tuple=True.", self)
+      logging.warning('%s: Using a concatenated state is slower and will '
+                      'soon be deprecated.  Use state_is_tuple=True.', self)
     if not output_is_tuple:
-      logging.warning("%s: Using a concatenated output is slower and will"
-                      "soon be deprecated.  Use output_is_tuple=True.", self)
+      logging.warning('%s: Using a concatenated output is slower and will'
+                      'soon be deprecated.  Use output_is_tuple=True.', self)
 
     if num_dims < 1:
       raise ValueError('dims must be >= 1: {}'.format(num_dims))
@@ -126,9 +126,7 @@ class GridRNNCell(rnn.RNNCell):
 
     if cell_fn is None:
       my_cell_fn = functools.partial(
-        rnn.LSTMCell,
-        num_units=num_units,
-        state_is_tuple=state_is_tuple)
+          rnn.LSTMCell, num_units=num_units, state_is_tuple=state_is_tuple)
     else:
       my_cell_fn = lambda: cell_fn(num_units)
     if tied:
@@ -136,9 +134,8 @@ class GridRNNCell(rnn.RNNCell):
     else:
       self._cells = [my_cell_fn() for _ in range(num_dims)]
     if not isinstance(self._cells[0], rnn.RNNCell):
-      raise TypeError(
-        'cell_fn must return an RNNCell instance, saw: %s'
-        % type(self._cells[0]))
+      raise TypeError('cell_fn must return an RNNCell instance, saw: %s' %
+                      type(self._cells[0]))
 
     if self._output_is_tuple:
       self._output_size = tuple(self._cells[0].output_size
@@ -201,26 +198,36 @@ class GridRNNCell(rnn.RNNCell):
       if self._output_is_tuple:
         output = tuple(output_tensors)
       else:
-        if len(output_tensors) == 0:
-          output = array_ops.zeros([0, 0], dtype)
-        else:
+        if output_tensors:
           output = array_ops.concat(output_tensors, 1)
+        else:
+          output = array_ops.zeros([0, 0], dtype)
 
       if self._state_is_tuple:
         states = tuple(new_state[i] for i in self._config.recurrents)
       else:
         # concat each state first, then flatten the whole thing
-        state_tensors = [x for i in self._config.recurrents
-                         for x in new_state[i]]
-        if len(state_tensors) == 0:
-          states = array_ops.zeros([0, 0], dtype)
-        else:
+        state_tensors = [
+            x for i in self._config.recurrents for x in new_state[i]
+        ]
+        if state_tensors:
           states = array_ops.concat(state_tensors, 1)
+        else:
+          states = array_ops.zeros([0, 0], dtype)
 
     return output, states
 
   def _extract_states(self, state):
-    """Extract the cell and previous output tensors from the given state
+    """Extract the cell and previous output tensors from the given state.
+
+    Args:
+      state: The RNN state.
+
+    Returns:
+      Tuple of the cell value, previous output, and cell_output_size.
+
+    Raises:
+      ValueError: If len(self._config.recurrents) != len(state).
     """
     conf = self._config
 
@@ -238,8 +245,8 @@ class GridRNNCell(rnn.RNNCell):
 
     if self._state_is_tuple:
       if len(conf.recurrents) != len(state):
-        raise ValueError("Expected state as a tuple of {} "
-                         "element".format(len(conf.recurrents)))
+        raise ValueError('Expected state as a tuple of {} '
+                         'element'.format(len(conf.recurrents)))
 
       for recurrent_dim, recurrent_state in zip(conf.recurrents, state):
         if cell_output_size > 0:
@@ -247,49 +254,62 @@ class GridRNNCell(rnn.RNNCell):
         else:
           m_prev[recurrent_dim] = recurrent_state
     else:
-      for recurrent_dim, start_idx in zip(conf.recurrents, range(
-          0, self.state_size, total_cell_state_size)):
+      for recurrent_dim, start_idx in zip(conf.recurrents,
+                                          range(0, self.state_size,
+                                                total_cell_state_size)):
         if cell_output_size > 0:
           c_prev[recurrent_dim] = array_ops.slice(state, [0, start_idx],
                                                   [-1, conf.num_units])
           m_prev[recurrent_dim] = array_ops.slice(
-            state, [0, start_idx + conf.num_units], [-1, cell_output_size])
+              state, [0, start_idx + conf.num_units], [-1, cell_output_size])
         else:
           m_prev[recurrent_dim] = array_ops.slice(state, [0, start_idx],
                                                   [-1, conf.num_units])
     return c_prev, m_prev, cell_output_size
 
   def _project_input(self, inputs, c_prev, m_prev, with_c):
-    """Fills in c_prev and m_prev with projected input, for input dimensions
+    """Fills in c_prev and m_prev with projected input, for input dimensions.
+
+    Args:
+      inputs: inputs tensor
+      c_prev: cell value
+      m_prev: previous output
+      with_c: boolean; whether to include project_c.
+
+    Raises:
+      ValueError: if len(self._config.input) != len(inputs)
     """
     conf = self._config
 
-    if (inputs is not None and inputs.get_shape().with_rank(2)[1].value > 0
-        and len(conf.inputs) > 0):
+    if (inputs is not None and inputs.get_shape().with_rank(2)[1].value > 0 and
+        conf.inputs):
       if isinstance(inputs, tuple):
         if len(conf.inputs) != len(inputs):
-          raise ValueError("Expect inputs as a tuple of {} "
-                           "tensors".format(len(conf.inputs)))
+          raise ValueError('Expect inputs as a tuple of {} '
+                           'tensors'.format(len(conf.inputs)))
         input_splits = inputs
       else:
         input_splits = array_ops.split(
-          value=inputs, num_or_size_splits=len(conf.inputs), axis=1)
+            value=inputs, num_or_size_splits=len(conf.inputs), axis=1)
       input_sz = input_splits[0].get_shape().with_rank(2)[1].value
 
       for i, j in enumerate(conf.inputs):
         input_project_m = vs.get_variable(
-          'project_m_{}'.format(j), [input_sz, conf.num_units],
-          dtype=inputs.dtype)
+            'project_m_{}'.format(j), [input_sz, conf.num_units],
+            dtype=inputs.dtype)
         m_prev[j] = math_ops.matmul(input_splits[i], input_project_m)
 
         if with_c:
           input_project_c = vs.get_variable(
-            'project_c_{}'.format(j), [input_sz, conf.num_units],
-            dtype=inputs.dtype)
+              'project_c_{}'.format(j), [input_sz, conf.num_units],
+              dtype=inputs.dtype)
           c_prev[j] = math_ops.matmul(input_splits[i], input_project_c)
 
   def _cell_state_size(self):
-    """Total size of the state of the inner cell used in this grid
+    """Total size of the state of the inner cell used in this grid.
+
+    Returns:
+      Total size of the state of the inner cell.
     """
     state_sizes = self._cells[0].state_size
     if isinstance(state_sizes, tuple):
@@ -306,10 +326,15 @@ class Grid1BasicRNNCell(GridRNNCell):
 
   def __init__(self, num_units, state_is_tuple=True, output_is_tuple=True):
     super(Grid1BasicRNNCell, self).__init__(
-      num_units=num_units, num_dims=1,
-      input_dims=0, output_dims=0, priority_dims=0, tied=False,
-      cell_fn=lambda n: rnn.BasicRNNCell(num_units=n),
-      state_is_tuple=state_is_tuple, output_is_tuple=output_is_tuple)
+        num_units=num_units,
+        num_dims=1,
+        input_dims=0,
+        output_dims=0,
+        priority_dims=0,
+        tied=False,
+        cell_fn=lambda n: rnn.BasicRNNCell(num_units=n),
+        state_is_tuple=state_is_tuple,
+        output_is_tuple=output_is_tuple)
 
 
 class Grid2BasicRNNCell(GridRNNCell):
@@ -322,38 +347,56 @@ class Grid2BasicRNNCell(GridRNNCell):
   specified.
   """
 
-  def __init__(self, num_units, tied=False, non_recurrent_fn=None,
-               state_is_tuple=True, output_is_tuple=True):
+  def __init__(self,
+               num_units,
+               tied=False,
+               non_recurrent_fn=None,
+               state_is_tuple=True,
+               output_is_tuple=True):
     super(Grid2BasicRNNCell, self).__init__(
-      num_units=num_units, num_dims=2,
-      input_dims=0, output_dims=0, priority_dims=0, tied=tied,
-      non_recurrent_dims=None if non_recurrent_fn is None else 0,
-      cell_fn=lambda n: rnn.BasicRNNCell(num_units=n),
-      non_recurrent_fn=non_recurrent_fn,
-      state_is_tuple=state_is_tuple, output_is_tuple=output_is_tuple)
+        num_units=num_units,
+        num_dims=2,
+        input_dims=0,
+        output_dims=0,
+        priority_dims=0,
+        tied=tied,
+        non_recurrent_dims=None if non_recurrent_fn is None else 0,
+        cell_fn=lambda n: rnn.BasicRNNCell(num_units=n),
+        non_recurrent_fn=non_recurrent_fn,
+        state_is_tuple=state_is_tuple,
+        output_is_tuple=output_is_tuple)
 
 
 class Grid1BasicLSTMCell(GridRNNCell):
-  """1D BasicLSTM cell"""
+  """1D BasicLSTM cell."""
 
-  def __init__(self, num_units, forget_bias=1,
-               state_is_tuple=True, output_is_tuple=True):
+  def __init__(self,
+               num_units,
+               forget_bias=1,
+               state_is_tuple=True,
+               output_is_tuple=True):
+    def cell_fn(n):
+      return rnn.BasicLSTMCell(num_units=n, forget_bias=forget_bias)
     super(Grid1BasicLSTMCell, self).__init__(
-      num_units=num_units, num_dims=1,
-      input_dims=0, output_dims=0, priority_dims=0, tied=False,
-      cell_fn=lambda n: rnn.BasicLSTMCell(
-        num_units=n, forget_bias=forget_bias),
-      state_is_tuple=state_is_tuple, output_is_tuple=output_is_tuple)
+        num_units=num_units,
+        num_dims=1,
+        input_dims=0,
+        output_dims=0,
+        priority_dims=0,
+        tied=False,
+        cell_fn=cell_fn,
+        state_is_tuple=state_is_tuple,
+        output_is_tuple=output_is_tuple)
 
 
 class Grid2BasicLSTMCell(GridRNNCell):
-  """2D BasicLSTM cell
+  """2D BasicLSTM cell.
 
-    This creates a 2D cell which receives input and gives output in the first
-    dimension.
+  This creates a 2D cell which receives input and gives output in the first
+  dimension.
 
-    The first dimension can optionally be non-recurrent if `non_recurrent_fn` is
-    specified.
+  The first dimension can optionally be non-recurrent if `non_recurrent_fn` is
+  specified.
   """
 
   def __init__(self,
@@ -363,36 +406,53 @@ class Grid2BasicLSTMCell(GridRNNCell):
                forget_bias=1,
                state_is_tuple=True,
                output_is_tuple=True):
+    def cell_fn(n):
+      return rnn.BasicLSTMCell(num_units=n, forget_bias=forget_bias)
     super(Grid2BasicLSTMCell, self).__init__(
-      num_units=num_units, num_dims=2,
-      input_dims=0, output_dims=0, priority_dims=0, tied=tied,
-      non_recurrent_dims=None if non_recurrent_fn is None else 0,
-      cell_fn=lambda n: rnn.BasicLSTMCell(
-        num_units=n, forget_bias=forget_bias),
-      non_recurrent_fn=non_recurrent_fn,
-      state_is_tuple=state_is_tuple, output_is_tuple=output_is_tuple)
+        num_units=num_units,
+        num_dims=2,
+        input_dims=0,
+        output_dims=0,
+        priority_dims=0,
+        tied=tied,
+        non_recurrent_dims=None if non_recurrent_fn is None else 0,
+        cell_fn=cell_fn,
+        non_recurrent_fn=non_recurrent_fn,
+        state_is_tuple=state_is_tuple,
+        output_is_tuple=output_is_tuple)
 
 
 class Grid1LSTMCell(GridRNNCell):
-  """1D LSTM cell
+  """1D LSTM cell.
 
-    This is different from Grid1BasicLSTMCell because it gives options to
-    specify the forget bias and enabling peepholes
+  This is different from Grid1BasicLSTMCell because it gives options to
+  specify the forget bias and enabling peepholes.
   """
 
-  def __init__(self, num_units, use_peepholes=False, forget_bias=1.0,
-               state_is_tuple=True, output_is_tuple=True):
+  def __init__(self,
+               num_units,
+               use_peepholes=False,
+               forget_bias=1.0,
+               state_is_tuple=True,
+               output_is_tuple=True):
+
+    def cell_fn(n):
+      return rnn.LSTMCell(
+          num_units=n, forget_bias=forget_bias, use_peepholes=use_peepholes)
+
     super(Grid1LSTMCell, self).__init__(
-      num_units=num_units, num_dims=1,
-      input_dims=0, output_dims=0, priority_dims=0,
-      cell_fn=lambda n: rnn.LSTMCell(
-        num_units=n, use_peepholes=use_peepholes,
-        forget_bias=forget_bias),
-      state_is_tuple=state_is_tuple, output_is_tuple=output_is_tuple)
+        num_units=num_units,
+        num_dims=1,
+        input_dims=0,
+        output_dims=0,
+        priority_dims=0,
+        cell_fn=cell_fn,
+        state_is_tuple=state_is_tuple,
+        output_is_tuple=output_is_tuple)
 
 
 class Grid2LSTMCell(GridRNNCell):
-  """2D LSTM cell
+  """2D LSTM cell.
 
     This creates a 2D cell which receives input and gives output in the first
     dimension.
@@ -408,19 +468,27 @@ class Grid2LSTMCell(GridRNNCell):
                forget_bias=1.0,
                state_is_tuple=True,
                output_is_tuple=True):
+
+    def cell_fn(n):
+      return rnn.LSTMCell(
+          num_units=n, forget_bias=forget_bias, use_peepholes=use_peepholes)
+
     super(Grid2LSTMCell, self).__init__(
-      num_units=num_units, num_dims=2,
-      input_dims=0, output_dims=0, priority_dims=0, tied=tied,
-      non_recurrent_dims=None if non_recurrent_fn is None else 0,
-      cell_fn=lambda n: rnn.LSTMCell(
-        num_units=n, forget_bias=forget_bias,
-        use_peepholes=use_peepholes),
-      non_recurrent_fn=non_recurrent_fn,
-      state_is_tuple=state_is_tuple, output_is_tuple=output_is_tuple)
+        num_units=num_units,
+        num_dims=2,
+        input_dims=0,
+        output_dims=0,
+        priority_dims=0,
+        tied=tied,
+        non_recurrent_dims=None if non_recurrent_fn is None else 0,
+        cell_fn=cell_fn,
+        non_recurrent_fn=non_recurrent_fn,
+        state_is_tuple=state_is_tuple,
+        output_is_tuple=output_is_tuple)
 
 
 class Grid3LSTMCell(GridRNNCell):
-  """3D BasicLSTM cell
+  """3D BasicLSTM cell.
 
     This creates a 2D cell which receives input and gives output in the first
     dimension.
@@ -437,19 +505,27 @@ class Grid3LSTMCell(GridRNNCell):
                forget_bias=1.0,
                state_is_tuple=True,
                output_is_tuple=True):
+
+    def cell_fn(n):
+      return rnn.LSTMCell(
+          num_units=n, forget_bias=forget_bias, use_peepholes=use_peepholes)
+
     super(Grid3LSTMCell, self).__init__(
-      num_units=num_units, num_dims=3,
-      input_dims=0, output_dims=0, priority_dims=0, tied=tied,
-      non_recurrent_dims=None if non_recurrent_fn is None else 0,
-      cell_fn=lambda n: rnn.LSTMCell(
-        num_units=n, forget_bias=forget_bias,
-        use_peepholes=use_peepholes),
-      non_recurrent_fn=non_recurrent_fn,
-      state_is_tuple=state_is_tuple, output_is_tuple=output_is_tuple)
+        num_units=num_units,
+        num_dims=3,
+        input_dims=0,
+        output_dims=0,
+        priority_dims=0,
+        tied=tied,
+        non_recurrent_dims=None if non_recurrent_fn is None else 0,
+        cell_fn=cell_fn,
+        non_recurrent_fn=non_recurrent_fn,
+        state_is_tuple=state_is_tuple,
+        output_is_tuple=output_is_tuple)
 
 
 class Grid2GRUCell(GridRNNCell):
-  """2D LSTM cell
+  """2D LSTM cell.
 
     This creates a 2D cell which receives input and gives output in the first
     dimension.
@@ -457,23 +533,31 @@ class Grid2GRUCell(GridRNNCell):
     specified.
   """
 
-  def __init__(self, num_units, tied=False, non_recurrent_fn=None,
-               state_is_tuple=True, output_is_tuple=True):
+  def __init__(self,
+               num_units,
+               tied=False,
+               non_recurrent_fn=None,
+               state_is_tuple=True,
+               output_is_tuple=True):
     super(Grid2GRUCell, self).__init__(
-      num_units=num_units, num_dims=2,
-      input_dims=0, output_dims=0, priority_dims=0, tied=tied,
-      non_recurrent_dims=None if non_recurrent_fn is None else 0,
-      cell_fn=lambda n: rnn.GRUCell(num_units=n),
-      non_recurrent_fn=non_recurrent_fn,
-      state_is_tuple=state_is_tuple, output_is_tuple=output_is_tuple)
+        num_units=num_units,
+        num_dims=2,
+        input_dims=0,
+        output_dims=0,
+        priority_dims=0,
+        tied=tied,
+        non_recurrent_dims=None if non_recurrent_fn is None else 0,
+        cell_fn=lambda n: rnn.GRUCell(num_units=n),
+        non_recurrent_fn=non_recurrent_fn,
+        state_is_tuple=state_is_tuple,
+        output_is_tuple=output_is_tuple)
 
 
-"""Helpers
-"""
+# Helpers
 
-_GridRNNDimension = namedtuple(
-  '_GridRNNDimension',
-  ['idx', 'is_input', 'is_output', 'is_priority', 'non_recurrent_fn'])
+_GridRNNDimension = namedtuple('_GridRNNDimension', [
+    'idx', 'is_input', 'is_output', 'is_priority', 'non_recurrent_fn'
+])
 
 _GridRNNConfig = namedtuple('_GridRNNConfig',
                             ['num_dims', 'dims', 'inputs', 'outputs',
@@ -502,23 +586,23 @@ def _parse_rnn_config(num_dims, ls_input_dims, ls_output_dims, ls_priority_dims,
   rnn_dims = []
   for i in range(num_dims):
     rnn_dims.append(
-      _GridRNNDimension(
-        idx=i,
-        is_input=(i in input_dims),
-        is_output=(i in output_dims),
-        is_priority=(i in priority_dims),
-        non_recurrent_fn=non_recurrent_fn if i in non_recurrent_dims else
-        None))
+        _GridRNNDimension(
+            idx=i,
+            is_input=(i in input_dims),
+            is_output=(i in output_dims),
+            is_priority=(i in priority_dims),
+            non_recurrent_fn=non_recurrent_fn
+            if i in non_recurrent_dims else None))
   return _GridRNNConfig(
-    num_dims=num_dims,
-    dims=rnn_dims,
-    inputs=input_dims,
-    outputs=output_dims,
-    recurrents=[x for x in range(num_dims) if x not in non_recurrent_dims],
-    priority=priority_dims,
-    non_priority=[x for x in range(num_dims) if x not in priority_dims],
-    tied=tied,
-    num_units=num_units)
+      num_dims=num_dims,
+      dims=rnn_dims,
+      inputs=input_dims,
+      outputs=output_dims,
+      recurrents=[x for x in range(num_dims) if x not in non_recurrent_dims],
+      priority=priority_dims,
+      non_priority=[x for x in range(num_dims) if x not in priority_dims],
+      tied=tied,
+      num_units=num_units)
 
 
 def _propagate(dim_indices, conf, cells, c_prev, m_prev, new_output, new_state,
@@ -544,8 +628,8 @@ def _propagate(dim_indices, conf, cells, c_prev, m_prev, new_output, new_state,
     cell_inputs = array_ops.zeros([m_prev[0].get_shape().as_list()[0], 0],
                                   m_prev[0].dtype)
 
-  last_dim_output = (new_output[-1] if new_output[-1] is not None
-                     else m_prev[-1])
+  last_dim_output = (new_output[-1]
+                     if new_output[-1] is not None else m_prev[-1])
 
   for i in dim_indices:
     d = conf.dims[i]
@@ -560,12 +644,12 @@ def _propagate(dim_indices, conf, cells, c_prev, m_prev, new_output, new_state,
           vs.get_variable_scope().reuse_variables()
 
         new_output[d.idx] = layers.fully_connected(
-          linear_args,
-          num_outputs=conf.num_units,
-          activation_fn=d.non_recurrent_fn,
-          weights_initializer=vs.get_variable_scope().initializer or
-                              layers.initializers.xavier_initializer,
-          weights_regularizer=vs.get_variable_scope().regularizer)
+            linear_args,
+            num_outputs=conf.num_units,
+            activation_fn=d.non_recurrent_fn,
+            weights_initializer=(vs.get_variable_scope().initializer or
+                                 layers.initializers.xavier_initializer),
+            weights_regularizer=vs.get_variable_scope().regularizer)
     else:
       if c_prev[i] is not None:
         cell_state = (c_prev[i], last_dim_output)
diff --git a/tensorflow/contrib/hvx/hvx_ops_support_checker/hvx_ops_support_checker_main.cc b/tensorflow/contrib/hvx/hvx_ops_support_checker/hvx_ops_support_checker_main.cc
index e905d4b32249b4a1542092ee550cc72e7f0e7050..3a219bb3e6f185e66e15bba729780ecf53cf62ba 100644
--- a/tensorflow/contrib/hvx/hvx_ops_support_checker/hvx_ops_support_checker_main.cc
+++ b/tensorflow/contrib/hvx/hvx_ops_support_checker/hvx_ops_support_checker_main.cc
@@ -25,6 +25,7 @@ limitations under the License.
 #include "tensorflow/core/kernels/hexagon/hexagon_ops_definitions.h"
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/lib/strings/str_util.h"
+#include "tensorflow/core/platform/init_main.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/util/command_line_flags.h"
 #include "tensorflow/tools/graph_transforms/transform_utils.h"
@@ -36,6 +37,9 @@ static int ParseFlags(int argc, char* argv[], string* in_graph) {
       Flag("in_graph", in_graph, "input graph file name"),
   };
   CHECK(Flags::Parse(&argc, argv, flag_list));
+  // We need to call this to set up global state for TensorFlow.
+  port::InitMain(argv[0], &argc, &argv);
+
   string usage = Flags::Usage(argv[0], flag_list);
   CHECK(!in_graph->empty()) << "in_graph graph can't be empty.\n" << usage;
 
@@ -50,7 +54,8 @@ static void CheckOpsSupport(const GraphDef& graph_def) {
   std::unordered_set<string> unsupported_ops;
   bool all_supported = true;
   for (const NodeDef& node : graph_def.node()) {
-    const int op_id = ops_definition.GetOpIdFor(node.op());
+    // TODO(satok): Set correct data type if it's given.
+    const int op_id = ops_definition.GetOpIdFor(node.op(), {});
     if (op_id == IGraphTransferOpsDefinitions::INVALID_OP_ID) {
       all_supported = false;
       LOG(ERROR) << "OP type: " << node.op() << " is not supported on hvx. "
diff --git a/tensorflow/contrib/image/BUILD b/tensorflow/contrib/image/BUILD
index 7599406acaf6d909cea55a9088031976e1e2e560..a095f0e048a9b5831222d153a6c84cfa8ccd0dce 100755
--- a/tensorflow/contrib/image/BUILD
+++ b/tensorflow/contrib/image/BUILD
@@ -20,6 +20,7 @@ load("//tensorflow:tensorflow.bzl", "tf_custom_op_py_library")
 tf_custom_op_library(
     name = "python/ops/_image_ops.so",
     srcs = [
+        "kernels/bipartite_match_op.cc",
         "kernels/image_ops.cc",
         "kernels/image_ops.h",
         "ops/image_ops.cc",
@@ -33,6 +34,7 @@ tf_custom_op_library(
 tf_kernel_library(
     name = "image_ops_kernels",
     srcs = [
+        "kernels/bipartite_match_op.cc",
         "kernels/image_ops.cc",
         "kernels/image_ops.h",
     ],
diff --git a/tensorflow/contrib/image/__init__.py b/tensorflow/contrib/image/__init__.py
index aa70d42339d67c767ec459698c80e79ee795db42..fee1a6c2bc951214cb64a9abf80b584ee839c5b0 100755
--- a/tensorflow/contrib/image/__init__.py
+++ b/tensorflow/contrib/image/__init__.py
@@ -25,6 +25,7 @@ projective transforms (including rotation) are supported.
 @@compose_transforms
 @@rotate
 @@transform
+@@bipartite_match
 @@single_image_random_dot_stereograms
 """
 from __future__ import absolute_import
diff --git a/tensorflow/contrib/image/kernels/bipartite_match_op.cc b/tensorflow/contrib/image/kernels/bipartite_match_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..7d207c388b159c4ad0f25032811e97b153fd50d6
--- /dev/null
+++ b/tensorflow/contrib/image/kernels/bipartite_match_op.cc
@@ -0,0 +1,134 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <queue>
+#include <vector>
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/framework/types.h"
+
+namespace {
+
+struct DistancePair {
+  DistancePair(int i1, int i2, double d) : index1(i1), index2(i2), dist(d) {}
+
+  bool operator<(const DistancePair& b1) const { return b1.dist < dist; }
+
+  int index1, index2;
+  float dist;
+};
+
+}  // namespace
+
+namespace tensorflow {
+
+class BipartiteMatchOp : public OpKernel {
+ public:
+  explicit BipartiteMatchOp(OpKernelConstruction* context) : OpKernel(context) {
+    OP_REQUIRES_OK(context, context->GetAttr("top_k", &top_k_));
+  }
+
+  void Compute(OpKernelContext* context) override {
+    const Tensor& input_distance_mat = context->input(0);
+    OP_REQUIRES(context, input_distance_mat.dims() == 2,
+                errors::InvalidArgument(
+                    "distance_mat should be 2-dimensional, but got ",
+                    input_distance_mat.shape().DebugString()));
+    const int num_input_rows = input_distance_mat.dim_size(0);
+    const int num_input_columns = input_distance_mat.dim_size(1);
+
+    const Tensor& input_num_valid_rows = context->input(1);
+    OP_REQUIRES(
+        context, input_num_valid_rows.NumElements() == 1,
+        errors::InvalidArgument(
+            "num_valid_rows argument should be a tensor with 1 element, "
+            "but got ",
+            input_num_valid_rows.NumElements()));
+
+    const float num_valid_rows_f = input_num_valid_rows.flat<float>()(0);
+    int num_valid_rows = num_input_rows;
+    // If num_valid_rows_f is non-negative, use it to set num_valid_rows.
+    if (num_valid_rows_f >= 0) {
+      num_valid_rows = static_cast<int>(num_valid_rows_f + 0.1);
+    }
+    OP_REQUIRES(
+        context, num_input_rows >= num_valid_rows,
+        errors::InvalidArgument("There should be at least ", num_valid_rows,
+                                " rows in distance_mat, but only got ",
+                                num_input_rows, " rows."));
+
+    // If negative or zero then set it to the maximum possible matches.
+    auto valid_top_k = top_k_;
+
+    if (valid_top_k <= 0) {
+      valid_top_k = num_valid_rows * num_input_columns;
+    }
+
+    // Create output tensors.
+    Tensor* row_to_column_match_indices = nullptr;
+    OP_REQUIRES_OK(context,
+                   context->allocate_output(0, TensorShape({num_input_rows}),
+                                            &row_to_column_match_indices));
+    Tensor* column_to_row_match_indices = nullptr;
+    OP_REQUIRES_OK(context,
+                   context->allocate_output(1, TensorShape({num_input_columns}),
+                                            &column_to_row_match_indices));
+
+    typename TTypes<float, 2>::ConstTensor distance_mat =
+        input_distance_mat.shaped<float, 2>(
+            {num_input_rows, num_input_columns});
+
+    // Greedy bi-partite matching.
+    std::priority_queue<DistancePair> match_queue;
+
+    for (int index1 = 0; index1 < num_valid_rows; index1++) {
+      for (int index2 = 0; index2 < num_input_columns; index2++) {
+        match_queue.push(
+            DistancePair(index1, index2, distance_mat(index1, index2)));
+      }
+    }
+
+    std::vector<int> row_to_col_match_vec(num_input_rows, -1);
+    std::vector<int> col_to_row_match_vec(num_input_columns, -1);
+    int index = 0;
+    while (!match_queue.empty()) {
+      const auto& match = match_queue.top();
+      if (row_to_col_match_vec[match.index1] == -1 &&
+          col_to_row_match_vec[match.index2] == -1) {
+        row_to_col_match_vec[match.index1] = match.index2;
+        col_to_row_match_vec[match.index2] = match.index1;
+
+        index++;
+        if (index >= valid_top_k) {
+          break;
+        }
+      }
+      match_queue.pop();
+    }
+
+    // Set the output tensors.
+    row_to_column_match_indices->vec<int>() =
+        TTypes<int>::Vec(row_to_col_match_vec.data(), num_input_rows);
+    column_to_row_match_indices->vec<int>() =
+        TTypes<int>::Vec(col_to_row_match_vec.data(), num_input_columns);
+  }
+
+ private:
+  int top_k_;
+};
+
+REGISTER_KERNEL_BUILDER(Name("BipartiteMatch").Device(DEVICE_CPU),
+                        BipartiteMatchOp);
+
+}  // namespace tensorflow
diff --git a/tensorflow/contrib/image/kernels/image_ops.cc b/tensorflow/contrib/image/kernels/image_ops.cc
index 8d50541771b55c5958674c489329202f3da207b3..8a97f07732c4be43192f6ea8f6934118b49875f8 100644
--- a/tensorflow/contrib/image/kernels/image_ops.cc
+++ b/tensorflow/contrib/image/kernels/image_ops.cc
@@ -43,13 +43,29 @@ template class FillProjectiveTransform<CPUDevice, double>;
 typedef Eigen::ThreadPoolDevice CPUDevice;
 
 using functor::FillProjectiveTransform;
+using generator::INTERPOLATION_BILINEAR;
+using generator::INTERPOLATION_NEAREST;
+using generator::Interpolation;
 using generator::ProjectiveGenerator;
 
 template <typename Device, typename T>
 class ImageProjectiveTransform : public OpKernel {
+ private:
+  Interpolation interpolation_;
+
  public:
-  explicit ImageProjectiveTransform(OpKernelConstruction* ctx)
-      : OpKernel(ctx) {}
+  explicit ImageProjectiveTransform(OpKernelConstruction* ctx) : OpKernel(ctx) {
+    string interpolation_str;
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("interpolation", &interpolation_str));
+    if (interpolation_str == "NEAREST") {
+      interpolation_ = INTERPOLATION_NEAREST;
+    } else if (interpolation_str == "BILINEAR") {
+      interpolation_ = INTERPOLATION_BILINEAR;
+    } else {
+      LOG(FATAL) << "Invalid interpolation " << interpolation_str
+                 << ". Supported types: NEAREST, BILINEAR";
+    }
+  }
 
   void Compute(OpKernelContext* ctx) override {
     const Tensor& images_t = ctx->input(0);
@@ -68,8 +84,8 @@ class ImageProjectiveTransform : public OpKernel {
     Tensor* output_t;
     OP_REQUIRES_OK(ctx, ctx->allocate_output(0, images_t.shape(), &output_t));
     auto output = output_t->tensor<T, 4>();
-    const FillProjectiveTransform<Device, T> functor;
-    functor(ctx->eigen_device<Device>(), &output, images, transform);
+    (FillProjectiveTransform<Device, T>(interpolation_))(
+        ctx->eigen_device<Device>(), &output, images, transform);
   }
 };
 
diff --git a/tensorflow/contrib/image/kernels/image_ops.h b/tensorflow/contrib/image/kernels/image_ops.h
index 92b908a1c68ef7db173410243a5f900110e81f4d..692e33fcf30b5b3b323ef26fab0c88fbfaab0f20 100644
--- a/tensorflow/contrib/image/kernels/image_ops.h
+++ b/tensorflow/contrib/image/kernels/image_ops.h
@@ -28,6 +28,8 @@ namespace tensorflow {
 
 namespace generator {
 
+enum Interpolation { INTERPOLATION_NEAREST, INTERPOLATION_BILINEAR };
+
 using Eigen::array;
 using Eigen::DenseIndex;
 
@@ -36,20 +38,19 @@ class ProjectiveGenerator {
  private:
   typename TTypes<T, 4>::ConstTensor input_;
   typename TTypes<float>::ConstMatrix transforms_;
+  const Interpolation interpolation_;
 
  public:
   static const int kNumParameters = 8;
 
   EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
   ProjectiveGenerator(typename TTypes<T, 4>::ConstTensor input,
-                      typename TTypes<float>::ConstMatrix transforms)
-      : input_(input), transforms_(transforms) {}
+                      typename TTypes<float>::ConstMatrix transforms,
+                      const Interpolation interpolation)
+      : input_(input), transforms_(transforms), interpolation_(interpolation) {}
 
   EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE T
   operator()(const array<DenseIndex, 4>& coords) const {
-    array<DenseIndex, 4> input_coords;
-    input_coords[0] = coords[0];
-
     const int64 output_y = coords[1];
     const int64 output_x = coords[2];
     const float* transform =
@@ -57,24 +58,73 @@ class ProjectiveGenerator {
             ? transforms_.data()
             : &transforms_.data()[transforms_.dimension(1) * coords[0]];
     float projection = transform[6] * output_x + transform[7] * output_y + 1.f;
-    const int64 input_x = std::round(
+    const float input_x =
         (transform[0] * output_x + transform[1] * output_y + transform[2]) /
-        projection);
-    const int64 input_y = std::round(
+        projection;
+    const float input_y =
         (transform[3] * output_x + transform[4] * output_y + transform[5]) /
-        projection);
-
-    if (!(0 <= input_y && input_y < input_.dimension(1) && 0 <= input_x &&
-          input_x < input_.dimension(2))) {
-      // TODO(ringwalt): Add a fill value input.
-      return T(0);
+        projection;
+
+    // TODO(ringwalt): Add a fill value input.
+    static const T fill_value = T(0);
+    switch (interpolation_) {
+      case INTERPOLATION_NEAREST:
+        // Switch the order of x and y again for indexing into the image.
+        return nearest_interpolation(coords[0], input_y, input_x, coords[3],
+                                     fill_value);
+      case INTERPOLATION_BILINEAR:
+        return bilinear_interpolation(coords[0], input_y, input_x, coords[3],
+                                      fill_value);
     }
-    input_coords[1] = input_y;
-    input_coords[2] = input_x;
+  }
+
+ private:
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE T
+  nearest_interpolation(const DenseIndex batch, const float y, const float x,
+                        const DenseIndex channel, const T fill_value) const {
+    return read_with_fill_value(batch, DenseIndex(std::round(y)),
+                                DenseIndex(std::round(x)), channel, fill_value);
+  }
 
-    input_coords[3] = coords[3];
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE T
+  bilinear_interpolation(const DenseIndex batch, const float y, const float x,
+                         const DenseIndex channel, const T fill_value) const {
+    const float y_floor = std::floor(y);
+    const float x_floor = std::floor(x);
+    const float y_ceil = y_floor + 1;
+    const float x_ceil = x_floor + 1;
+    // f(x, y_floor) = (x_ceil - x) / (x_ceil - x_floor) * f(x_floor, y_floor)
+    //               + (x - x_floor) / (x_ceil - x_floor) * f(x_ceil, y_floor)
+    const float value_yfloor =
+        (x_ceil - x) * read_with_fill_value(batch, DenseIndex(y_floor),
+                                            DenseIndex(x_floor), channel,
+                                            fill_value) +
+        (x - x_floor) * read_with_fill_value(batch, DenseIndex(y_floor),
+                                             DenseIndex(x_ceil), channel,
+                                             fill_value);
+    // f(x, y_ceil) = (x_ceil - x) / (x_ceil - x_floor) * f(x_floor, y_ceil)
+    //              + (x - x_floor) / (x_ceil - x_floor) * f(x_ceil, y_ceil)
+    const float value_yceil =
+        (x_ceil - x) * read_with_fill_value(batch, DenseIndex(y_ceil),
+                                            DenseIndex(x_floor), channel,
+                                            fill_value) +
+        (x - x_floor) * read_with_fill_value(batch, DenseIndex(y_ceil),
+                                             DenseIndex(x_ceil), channel,
+                                             fill_value);
+    // f(x, y) = (y_ceil - y) / (y_ceil - y_floor) * f(x, y_floor)
+    //         + (y - y_floor) / (y_ceil - y_floor) * f(x, y_ceil)
+    return T((y_ceil - y) * value_yfloor + (y - y_floor) * value_yceil);
+  }
 
-    return input_(input_coords);
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE T read_with_fill_value(
+      const DenseIndex batch, const DenseIndex y, const DenseIndex x,
+      const DenseIndex channel, const T fill_value) const {
+    // batch and channel must be correct, because they are passed unchanged from
+    // the input.
+    return (0 <= y && y < input_.dimension(1) && 0 <= x &&
+            x < input_.dimension(2))
+               ? input_(array<DenseIndex, 4>{batch, y, x, channel})
+               : fill_value;
   }
 };
 
@@ -85,6 +135,7 @@ class ProjectiveGenerator {
 // some Eigen device code.
 namespace functor {
 
+using generator::Interpolation;
 using generator::ProjectiveGenerator;
 
 template <typename Device, typename T>
@@ -92,15 +143,17 @@ struct FillProjectiveTransform {
   typedef typename TTypes<T, 4>::Tensor OutputType;
   typedef typename TTypes<T, 4>::ConstTensor InputType;
   typedef typename TTypes<float, 2>::ConstTensor TransformsType;
+  const Interpolation interpolation_;
 
-  FillProjectiveTransform() {}
+  FillProjectiveTransform(Interpolation interpolation)
+      : interpolation_(interpolation) {}
 
   EIGEN_ALWAYS_INLINE
   void operator()(const Device& device, OutputType* output,
                   const InputType& images,
                   const TransformsType& transform) const {
-    ProjectiveGenerator<Device, T> generator(images, transform);
-    output->device(device) = images.generate(generator);
+    output->device(device) = images.generate(
+        ProjectiveGenerator<Device, T>(images, transform, interpolation_));
   }
 };
 
diff --git a/tensorflow/contrib/image/ops/image_ops.cc b/tensorflow/contrib/image/ops/image_ops.cc
index 18c16cf1bb62fff8920109344cf6052296000a4c..6b24eaf2a5ea71906c9a4d4244493e4326a90d92 100644
--- a/tensorflow/contrib/image/ops/image_ops.cc
+++ b/tensorflow/contrib/image/ops/image_ops.cc
@@ -20,16 +20,17 @@ limitations under the License.
 namespace tensorflow {
 
 using shape_inference::InferenceContext;
+using shape_inference::ShapeHandle;
 
 // TODO(ringwalt): Add a "fill_mode" argument with "constant", "mirror", etc.
 // TODO(ringwalt): Add a "fill_constant" argument for constant mode (default 0).
-// TODO(ringwalt): Add an "interpolation" argument with "none", "bilinear", etc.
 // TODO(ringwalt): Add an "output_shape" argument. This is sufficient to
 // implement "same" and "valid" modes in the Python function.
 REGISTER_OP("ImageProjectiveTransform")
     .Input("images: dtype")
     .Input("transforms: float32")
     .Attr("dtype: {uint8, int32, int64, float32, float64}")
+    .Attr("interpolation: string")
     .Output("transformed_images: dtype")
     .SetShapeFn([](InferenceContext* c) {
       c->set_output(0, c->input(0));
@@ -59,4 +60,44 @@ transformed_images: 4D `Tensor`, image(s) in NHWC format, generated by applying
 the `transforms` to the `images`. Satisfies the description above.
 )doc");
 
+REGISTER_OP("BipartiteMatch")
+    .Input("distance_mat: float")
+    .Input("num_valid_rows: float")
+    .Attr("top_k: int = -1")
+    .Output("row_to_col_match_indices: int32")
+    .Output("col_to_row_match_indices: int32")
+    .SetIsStateful()
+    .SetShapeFn([](InferenceContext* c) {
+      ShapeHandle input;
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 2, &input));
+      c->set_output(0, c->MakeShape({c->Dim(input, 0)}));
+      c->set_output(1, c->MakeShape({c->Dim(input, 1)}));
+      return Status::OK();
+    })
+    .Doc(R"doc(
+Find bipartite matching based on a given distance matrix.
+
+A greedy bi-partite matching alogrithm is used to obtain the matching with the
+(greedy) minimum distance.
+
+distance_mat: A 2-D float tensor of shape `[num_rows, num_columns]`. It is a
+  pair-wise distance matrix between the entities represented by each row and
+  each column. It is an asymmetric matrix. The smaller the distance is, the more
+  similar the pairs are. The bipartite matching is to minimize the distances.
+num_valid_rows: A scalar or a 1-D tensor with one element describing the
+  number of valid rows of distance_mat to consider for the bipartite matching.
+  If set to be negative, then all rows from `distance_mat` are used.
+top_k: A scalar that specifies the number of top-k matches to retrieve.
+  If set to be negative, then is set according to the maximum number of
+  matches from `distance_mat`.
+row_to_col_match_indices: A vector of length num_rows, which is the number of
+  rows of the input `distance_matrix`.
+  If `row_to_col_match_indices[i]` is not -1, row i is matched to column
+  `row_to_col_match_indices[i]`.
+col_to_row_match_indices: A vector of length num_columns, which is the number
+  of columns of the input ditance matrix.
+  If `col_to_row_match_indices[j]` is not -1, column j is matched to row
+  `col_to_row_match_indices[j]`.
+)doc");
+
 }  // namespace tensorflow
diff --git a/tensorflow/contrib/image/python/kernel_tests/image_ops_test.py b/tensorflow/contrib/image/python/kernel_tests/image_ops_test.py
index 4ce33de24a017e70a377a0262fe5f1405e9ad045..b8a0706b61449ebebeb2f1dc98b438f9dd620aa3 100644
--- a/tensorflow/contrib/image/python/kernel_tests/image_ops_test.py
+++ b/tensorflow/contrib/image/python/kernel_tests/image_ops_test.py
@@ -25,6 +25,7 @@ from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import gradient_checker
 from tensorflow.python.ops import math_ops
 from tensorflow.python.platform import googletest
 
@@ -110,6 +111,139 @@ class ImageOpsTest(test_util.TensorFlowTestCase):
                              [0, 1, 0, 1],
                              [0, 1, 1, 1]])
 
+  def test_bilinear(self):
+    with self.test_session():
+      image = constant_op.constant(
+          [[0, 0, 0, 0, 0],
+           [0, 1, 1, 1, 0],
+           [0, 1, 0, 1, 0],
+           [0, 1, 1, 1, 0],
+           [0, 0, 0, 0, 0]],
+          dtypes.float32)
+      # The following result matches:
+      # >>> scipy.ndimage.rotate(image, 45, order=1, reshape=False)
+      # which uses spline interpolation of order 1, equivalent to bilinear
+      # interpolation.
+      self.assertAllClose(
+          image_ops.rotate(image, np.pi / 4.0, interpolation="BILINEAR").eval(),
+          [[0.000, 0.000, 0.343, 0.000, 0.000],
+           [0.000, 0.586, 0.914, 0.586, 0.000],
+           [0.343, 0.914, 0.000, 0.914, 0.343],
+           [0.000, 0.586, 0.914, 0.586, 0.000],
+           [0.000, 0.000, 0.343, 0.000, 0.000]],
+          atol=0.001)
+      self.assertAllClose(
+          image_ops.rotate(image, np.pi / 4.0, interpolation="NEAREST").eval(),
+          [[0, 0, 1, 0, 0],
+           [0, 1, 1, 1, 0],
+           [1, 1, 0, 1, 1],
+           [0, 1, 1, 1, 0],
+           [0, 0, 1, 0, 0]])
+
+  def test_bilinear_uint8(self):
+    with self.test_session():
+      image = constant_op.constant(
+          np.asarray(
+              [[0.0, 0.0, 0.0, 0.0, 0.0],
+               [0.0, 255, 255, 255, 0.0],
+               [0.0, 255, 0.0, 255, 0.0],
+               [0.0, 255, 255, 255, 0.0],
+               [0.0, 0.0, 0.0, 0.0, 0.0]],
+              np.uint8),
+          dtypes.uint8)
+      # == np.rint((expected image above) * 255)
+      self.assertAllEqual(
+          image_ops.rotate(image, np.pi / 4.0, interpolation="BILINEAR").eval(),
+          [[0.0, 0.0, 87., 0.0, 0.0],
+           [0.0, 149, 233, 149, 0.0],
+           [87., 233, 0.0, 233, 87.],
+           [0.0, 149, 233, 149, 0.0],
+           [0.0, 0.0, 87., 0.0, 0.0]])
+
+  def _test_grad(self, shape_to_test):
+    with self.test_session():
+      test_image_shape = shape_to_test
+      test_image = np.random.randn(*test_image_shape)
+      test_image_tensor = constant_op.constant(
+          test_image, shape=test_image_shape)
+      test_transform = image_ops.angles_to_projective_transforms(
+          np.pi / 2, 4, 4)
+
+      output_shape = test_image_shape
+      output = image_ops.transform(test_image_tensor, test_transform)
+      left_err = gradient_checker.compute_gradient_error(
+          test_image_tensor,
+          test_image_shape,
+          output,
+          output_shape,
+          x_init_value=test_image)
+      self.assertLess(left_err, 1e-10)
+
+  def test_grad(self):
+    self._test_grad([16, 16])
+    self._test_grad([4, 12, 12])
+    self._test_grad([3, 4, 12, 12])
+
+
+class BipartiteMatchTest(test_util.TensorFlowTestCase):
+
+  def _BipartiteMatchTest(self, distance_mat, distance_mat_shape,
+                          num_valid_rows,
+                          expected_row_to_col_match,
+                          expected_col_to_row_match):
+    distance_mat_np = np.array(distance_mat, dtype=np.float32).reshape(
+        distance_mat_shape)
+    expected_row_to_col_match_np = np.array(expected_row_to_col_match,
+                                            dtype=np.int32)
+    expected_col_to_row_match_np = np.array(expected_col_to_row_match,
+                                            dtype=np.int32)
+
+    with self.test_session():
+      distance_mat_tf = constant_op.constant(distance_mat_np,
+                                             shape=distance_mat_shape)
+      location_to_prior, prior_to_location = image_ops.bipartite_match(
+          distance_mat_tf, num_valid_rows)
+      location_to_prior_np = location_to_prior.eval()
+      prior_to_location_np = prior_to_location.eval()
+      self.assertAllEqual(location_to_prior_np, expected_row_to_col_match_np)
+      self.assertAllEqual(prior_to_location_np, expected_col_to_row_match_np)
+
+  def testBipartiteMatch(self):
+    distance_mat = [0.5, 0.8, 0.1,
+                    0.3, 0.2, 0.15]
+    num_valid_rows = 2
+    expected_row_to_col_match = [2, 1]
+    expected_col_to_row_match = [-1, 1, 0]
+    self._BipartiteMatchTest(distance_mat, [2, 3], num_valid_rows,
+                             expected_row_to_col_match,
+                             expected_col_to_row_match)
+
+    # The case of num_valid_rows less than num-of-rows-in-distance-mat.
+    num_valid_rows = 1
+    expected_row_to_col_match = [2, -1]
+    expected_col_to_row_match = [-1, -1, 0]
+    self._BipartiteMatchTest(distance_mat, [2, 3], num_valid_rows,
+                             expected_row_to_col_match,
+                             expected_col_to_row_match)
+
+    # The case of num_valid_rows being 0.
+    num_valid_rows = 0
+    expected_row_to_col_match = [-1, -1]
+    expected_col_to_row_match = [-1, -1, -1]
+    self._BipartiteMatchTest(distance_mat, [2, 3], num_valid_rows,
+                             expected_row_to_col_match,
+                             expected_col_to_row_match)
+
+    # The case of num_valid_rows less being -1.
+    num_valid_rows = -1
+    # The expected results are the same as num_valid_rows being 2.
+    expected_row_to_col_match = [2, 1]
+    expected_col_to_row_match = [-1, 1, 0]
+    self._BipartiteMatchTest(distance_mat, [2, 3], num_valid_rows,
+                             expected_row_to_col_match,
+                             expected_col_to_row_match)
+
 
 if __name__ == "__main__":
   googletest.main()
+
diff --git a/tensorflow/contrib/image/python/ops/image_ops.py b/tensorflow/contrib/image/python/ops/image_ops.py
index 889f361b19ee16e3492bee668919c046cef3a9d0..da374f8cef5ba8f5c5a3f2e2b30cc860e75d9919 100644
--- a/tensorflow/contrib/image/python/ops/image_ops.py
+++ b/tensorflow/contrib/image/python/ops/image_ops.py
@@ -24,6 +24,7 @@ from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import linalg_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.platform import resource_loader
 
@@ -36,7 +37,7 @@ _IMAGE_DTYPES = set(
 ops.RegisterShape("ImageProjectiveTransform")(common_shapes.call_cpp_shape_fn)
 
 
-def rotate(images, angles):
+def rotate(images, angles, interpolation="NEAREST"):
   """Rotate image(s) by the passed angle(s) in radians.
 
   Args:
@@ -45,6 +46,7 @@ def rotate(images, angles):
        (num_rows, num_columns) (HW).
     angles: A scalar angle to rotate all images by, or (if images has rank 4)
        a vector of length num_images, with an angle for each image in the batch.
+    interpolation: Interpolation mode. Supported values: "NEAREST", "BILINEAR".
 
   Returns:
     Image(s) with the same type and shape as `images`, rotated by the given
@@ -69,7 +71,8 @@ def rotate(images, angles):
   image_width = math_ops.cast(array_ops.shape(images)[2], dtypes.float32)[None]
   output = transform(
       images,
-      angles_to_projective_transforms(angles, image_width, image_height))
+      angles_to_projective_transforms(angles, image_height, image_width),
+      interpolation=interpolation)
   if len(image_or_images.get_shape()) == 2:
     return output[0, :, :, 0]
   elif len(image_or_images.get_shape()) == 3:
@@ -119,7 +122,7 @@ def angles_to_projective_transforms(angles, image_height, image_width):
       axis=1)
 
 
-def transform(images, transforms):
+def transform(images, transforms, interpolation="NEAREST"):
   """Applies the given transform(s) to the image(s).
 
   Args:
@@ -133,6 +136,7 @@ def transform(images, transforms):
        `(x', y') = ((a0 x + a1 y + a2) / k, (b0 x + b1 y + b2) / k)`,
        where `k = c0 x + c1 y + 1`. The transforms are *inverted* compared to
        the transform mapping input points to output points.
+     interpolation: Interpolation mode. Supported values: "NEAREST", "BILINEAR".
 
   Returns:
     Image(s) with the same type and shape as `images`, with the given
@@ -162,8 +166,8 @@ def transform(images, transforms):
     transforms = transform_or_transforms
   else:
     raise TypeError("Transforms should have rank 1 or 2.")
-  # pylint: disable=protected-access
-  output = gen_image_ops.image_projective_transform(images, transforms)
+  output = gen_image_ops.image_projective_transform(
+      images, transforms, interpolation=interpolation.upper())
   if len(image_or_images.get_shape()) == 2:
     return output[0, :, :, 0]
   elif len(image_or_images.get_shape()) == 3:
@@ -214,4 +218,82 @@ def _transform_matrices_to_flat(transform_matrices):
   return transforms[:, :8]
 
 
-ops.NotDifferentiable("ImageProjectiveTransform")
+@ops.RegisterGradient("ImageProjectiveTransform")
+def _image_projective_transform_grad(op, grad):
+  """Computes the gradient for ImageProjectiveTransform."""
+  images = op.inputs[0]
+  transforms = op.inputs[1]
+  interpolation = op.get_attr("interpolation")
+
+  image_or_images = ops.convert_to_tensor(images, name="images")
+  transform_or_transforms = ops.convert_to_tensor(
+      transforms, name="transforms", dtype=dtypes.float32)
+
+  if image_or_images.dtype.base_dtype not in _IMAGE_DTYPES:
+    raise TypeError("Invalid dtype %s." % image_or_images.dtype)
+  if len(image_or_images.get_shape()) == 2:
+    images = image_or_images[None, :, :, None]
+  elif len(image_or_images.get_shape()) == 3:
+    images = image_or_images[None, :, :, :]
+  elif len(image_or_images.get_shape()) == 4:
+    images = image_or_images
+  else:
+    raise TypeError("Images should have rank between 2 and 4")
+  if len(transform_or_transforms.get_shape()) == 1:
+    transforms = transform_or_transforms[None]
+  elif len(transform_or_transforms.get_shape()) == 2:
+    transforms = transform_or_transforms
+  else:
+    raise TypeError("Transforms should have rank 1 or 2.")
+
+  # Invert transformations
+  transforms = _flat_transforms_to_matrices(transforms=transforms)
+  inverse = linalg_ops.matrix_inverse(transforms)
+  transforms = _transform_matrices_to_flat(inverse)
+  output = gen_image_ops.image_projective_transform(
+      grad, transforms, interpolation=interpolation)
+  if len(image_or_images.get_shape()) == 2:
+    return [output[0, :, :, 0], None]
+  elif len(image_or_images.get_shape()) == 3:
+    return [output[0, :, :, :], None]
+  else:
+    return [output, None]
+
+
+def bipartite_match(
+    distance_mat,
+    num_valid_rows,
+    top_k=-1):
+  """Find bipartite matching based on a given distance matrix.
+
+  A greedy bi-partite matching alogrithm is used to obtain the matching with
+  the (greedy) minimum distance.
+
+  Args:
+    distance_mat: A 2-D float tensor of shape `[num_rows, num_columns]`. It is a
+      pair-wise distance matrix between the entities represented by each row and
+      each column. It is an asymmetric matrix. The smaller the distance is, the
+      more similar the pairs are. The bipartite matching is to minimize the
+      distances.
+    num_valid_rows: A scalar or a 1-D tensor with one element describing the
+      number of valid rows of distance_mat to consider for the bipartite
+      matching. If set to be negative, then all rows from `distance_mat` are
+      used.
+    top_k: A scalar that specifies the number of top-k matches to retrieve.
+      If set to be negative, then is set according to the maximum number of
+      matches from `distance_mat`.
+
+  Returns:
+    row_to_col_match_indices: A vector of length num_rows, which is the number
+      of rows of the input `distance_matrix`. If `row_to_col_match_indices[i]`
+      is not -1, row i is matched to column `row_to_col_match_indices[i]`.
+    col_to_row_match_indices: A vector of length num_columns, which is the
+      number of columns of the input ditance matrix.
+      If `col_to_row_match_indices[j]` is not -1, column j is matched to row
+      `col_to_row_match_indices[j]`.
+  """
+  result = gen_image_ops.bipartite_match(distance_mat, num_valid_rows, top_k)
+  return result
+
+
+ops.NotDifferentiable("BipartiteMatch")
diff --git a/tensorflow/contrib/keras/BUILD b/tensorflow/contrib/keras/BUILD
index 5166ba37a3503be281b27019208f9d652d081952..b1b8fc49b64de8305b8cb2bcd01e2884ca9e9805 100644
--- a/tensorflow/contrib/keras/BUILD
+++ b/tensorflow/contrib/keras/BUILD
@@ -119,6 +119,7 @@ py_library(
         "//tensorflow/python:gradients",
         "//tensorflow/python:image_ops",
         "//tensorflow/python:init_ops",
+        "//tensorflow/python:layers",
         "//tensorflow/python:logging_ops",
         "//tensorflow/python:math_ops",
         "//tensorflow/python:nn",
diff --git a/tensorflow/contrib/keras/api/keras/callbacks/__init__.py b/tensorflow/contrib/keras/api/keras/callbacks/__init__.py
index 2f579f2d2812c6e9ac54a0900363eb5f64871941..36db34f592d839619112a1945c31fbcdbd2cfaf4 100644
--- a/tensorflow/contrib/keras/api/keras/callbacks/__init__.py
+++ b/tensorflow/contrib/keras/api/keras/callbacks/__init__.py
@@ -29,6 +29,7 @@ from tensorflow.contrib.keras.python.keras.callbacks import ModelCheckpoint
 from tensorflow.contrib.keras.python.keras.callbacks import ProgbarLogger
 from tensorflow.contrib.keras.python.keras.callbacks import ReduceLROnPlateau
 from tensorflow.contrib.keras.python.keras.callbacks import RemoteMonitor
+from tensorflow.contrib.keras.python.keras.callbacks import TensorBoard
 
 del absolute_import
 del division
diff --git a/tensorflow/contrib/keras/python/keras/backend.py b/tensorflow/contrib/keras/python/keras/backend.py
index e52b23843a25f36ab86c94b1857e850f0664256f..905ef13e143ac5dcefbd4433f57501d9d1faa060 100644
--- a/tensorflow/contrib/keras/python/keras/backend.py
+++ b/tensorflow/contrib/keras/python/keras/backend.py
@@ -21,7 +21,6 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from collections import defaultdict
 import json
 import os
 import warnings
@@ -245,17 +244,40 @@ def set_image_data_format(data_format):
 
 
 def get_uid(prefix=''):
-  global _GRAPH_UID_DICTS  # pylint: disable=global-variable-not-assigned
-  graph = ops.get_default_graph()
-  if graph not in _GRAPH_UID_DICTS:
-    _GRAPH_UID_DICTS[graph] = defaultdict(int)
-  _GRAPH_UID_DICTS[graph][prefix] += 1
-  return _GRAPH_UID_DICTS[graph][prefix]
+  """Associates a string prefix with an integer counter in a TensorFlow graph.
+
+  Arguments:
+    prefix: String prefix to index.
+
+  Returns:
+    Unique integer ID.
+
+  Example:
+
+  ```
+    >>> get_uid('dense')
+    1
+    >>> get_uid('dense')
+    2
+  ```
+  """
+  layer_name_uids_collection = ops.get_collection('LAYER_NAME_UIDS')
+  if not layer_name_uids_collection:
+    layer_name_uids = {}
+    ops.add_to_collection('LAYER_NAME_UIDS', layer_name_uids)
+  else:
+    layer_name_uids = layer_name_uids_collection[0]
+  if prefix not in layer_name_uids:
+    layer_name_uids[prefix] = 1
+  else:
+    layer_name_uids[prefix] += 1
+  return layer_name_uids[prefix]
 
 
 def reset_uids():
-  global _GRAPH_UID_DICTS
-  _GRAPH_UID_DICTS = {}
+  layer_name_uids_collection = ops.get_collection_ref('LAYER_NAME_UIDS')
+  if layer_name_uids_collection:
+    layer_name_uids_collection.pop()
 
 
 def clear_session():
diff --git a/tensorflow/contrib/keras/python/keras/datasets/imdb.py b/tensorflow/contrib/keras/python/keras/datasets/imdb.py
index bafd92aca69af1c266342b3ed61cfbec90656be7..5c087fe63f5f03c8741336ef285239f39470997e 100644
--- a/tensorflow/contrib/keras/python/keras/datasets/imdb.py
+++ b/tensorflow/contrib/keras/python/keras/datasets/imdb.py
@@ -41,7 +41,7 @@ def load_data(path='imdb.npz',
       num_words: max number of words to include. Words are ranked
           by how often they occur (in the training set) and only
           the most frequent words are kept
-      skip_top: skip the top N most frequently occuring words
+      skip_top: skip the top N most frequently occurring words
           (which may not be informative).
       maxlen: truncate sequences after this length.
       seed: random seed for sample shuffling.
diff --git a/tensorflow/contrib/keras/python/keras/datasets/reuters.py b/tensorflow/contrib/keras/python/keras/datasets/reuters.py
index 81e940a8463a2fad1b7006e62e1a801578095a3a..b1c22fee63d23d10d7f6b6ddac2245915688ad4a 100644
--- a/tensorflow/contrib/keras/python/keras/datasets/reuters.py
+++ b/tensorflow/contrib/keras/python/keras/datasets/reuters.py
@@ -43,7 +43,7 @@ def load_data(path='reuters.npz',
       num_words: max number of words to include. Words are ranked
           by how often they occur (in the training set) and only
           the most frequent words are kept
-      skip_top: skip the top N most frequently occuring words
+      skip_top: skip the top N most frequently occurring words
           (which may not be informative).
       maxlen: truncate sequences after this length.
       test_split: Fraction of the dataset to be used as test data.
diff --git a/tensorflow/contrib/keras/python/keras/engine/topology.py b/tensorflow/contrib/keras/python/keras/engine/topology.py
index 7848e5982ddf1a8ad215f6c77bdb41304045a0b1..3d9ed51a1c0b4522ec15dd19909f411d78357a90 100644
--- a/tensorflow/contrib/keras/python/keras/engine/topology.py
+++ b/tensorflow/contrib/keras/python/keras/engine/topology.py
@@ -29,11 +29,12 @@ import numpy as np
 from six.moves import zip  # pylint: disable=redefined-builtin
 
 from tensorflow.contrib.keras.python.keras import backend as K
-from tensorflow.contrib.keras.python.keras import initializers
 from tensorflow.contrib.keras.python.keras.utils import conv_utils
 from tensorflow.contrib.keras.python.keras.utils.io_utils import ask_to_proceed_with_overwrite
 from tensorflow.contrib.keras.python.keras.utils.layer_utils import print_summary as print_layer_summary
+from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_shape
+from tensorflow.python.layers import base as tf_base_layers
 from tensorflow.python.util import tf_inspect
 
 
@@ -207,7 +208,7 @@ class Node(object):
     }
 
 
-class Layer(object):
+class Layer(tf_base_layers.Layer):
   """Abstract base layer class.
 
   # Properties
@@ -276,24 +277,6 @@ class Layer(object):
   """
 
   def __init__(self, **kwargs):
-    self.input_spec = None
-    self.supports_masking = False
-
-    # These properties will be set upon call of self.build()
-    self._trainable_weights = []
-    self._non_trainable_weights = []
-    self._constraints = {}  # dict {tensor: constraint instance}
-    self._losses = []
-    self._updates = []
-    self._per_input_losses = {}
-    self._per_input_updates = {}
-    self._built = False
-
-    # These lists will be filled via successive calls
-    # to self._add_inbound_node().
-    self.inbound_nodes = []
-    self.outbound_nodes = []
-
     # These properties should be set by the user via keyword arguments.
     # note that 'dtype', 'input_shape' and 'batch_input_shape'
     # are only applicable to input layers: do not pass these keywords
@@ -306,18 +289,38 @@ class Layer(object):
         'name',
         'trainable',
         'weights',
-        'input_dtype',  # legacy
     }
+    # Validate optional keyword arguments.
     for kwarg in kwargs:
       if kwarg not in allowed_kwargs:
         raise TypeError('Keyword argument not understood:', kwarg)
+
+    # Get layer name.
     name = kwargs.get('name')
-    if not name:
-      prefix = self.__class__.__name__
-      name = _to_snake_case(prefix) + '_' + str(K.get_uid(prefix))
-    self.name = name
 
-    self.trainable = kwargs.get('trainable', True)
+    # Get `trainable` status.
+    trainable = kwargs.get('trainable', True)
+
+    # Get `dtype`.
+    dtype = kwargs.get('dtype')
+    if dtype is None:
+      dtype = K.floatx()
+
+    # Call super, which will set all properties common to Keras layers
+    # and core TF layers.
+    super(Layer, self).__init__(name=name, dtype=dtype, trainable=trainable)
+
+    # Add properties that are Keras-only for now.
+    self.input_spec = None
+    self.supports_masking = False
+    self._constraints = {}  # dict {tensor: constraint instance}
+
+    # These lists will be filled via successive calls
+    # to self._add_inbound_node().
+    self.inbound_nodes = []
+    self.outbound_nodes = []
+
+    # Manage input shape information if passed.
     if 'input_shape' in kwargs or 'batch_input_shape' in kwargs:
       # In this case we will later create an input layer
       # to insert before the current layer
@@ -331,35 +334,12 @@ class Layer(object):
         batch_input_shape = (batch_size,) + tuple(kwargs['input_shape'])
       self.batch_input_shape = batch_input_shape
 
-      # Set dtype.
-      dtype = kwargs.get('dtype')
-      if dtype is None:
-        dtype = kwargs.get('input_dtype')
-      if dtype is None:
-        dtype = K.floatx()
-      self.dtype = dtype
-
+    # Manage initial weight values if passed.
     if 'weights' in kwargs:
       self._initial_weights = kwargs['weights']
     else:
       self._initial_weights = None
 
-  @property
-  def losses(self):
-    return self._losses
-
-  @property
-  def updates(self):
-    return self._updates
-
-  @property
-  def built(self):
-    return self._built
-
-  @built.setter
-  def built(self, value):
-    self._built = value
-
   @property
   def constraints(self):
     return self._constraints
@@ -368,63 +348,37 @@ class Layer(object):
   def constraints(self, constraints):
     self._constraints = constraints
 
-  @property
-  def trainable_weights(self):
-    trainable = getattr(self, 'trainable', True)
-    if trainable:
-      return self._trainable_weights
-    else:
-      return []
-
-  @trainable_weights.setter
-  def trainable_weights(self, weights):
-    self._trainable_weights = weights
-
-  @property
-  def non_trainable_weights(self):
-    trainable = getattr(self, 'trainable', True)
-    if not trainable:
-      return self._trainable_weights + self._non_trainable_weights
-    else:
-      return self._non_trainable_weights
-
-  @non_trainable_weights.setter
-  def non_trainable_weights(self, weights):
-    self._non_trainable_weights = weights
-
   def add_weight(self,
+                 name,
                  shape,
-                 initializer,
-                 name=None,
-                 trainable=True,
+                 dtype=None,
+                 initializer=None,
                  regularizer=None,
+                 trainable=True,
                  constraint=None):
     """Adds a weight variable to the layer.
 
     Arguments:
+        name: String, the name for the weight variable.
         shape: The shape tuple of the weight.
+        dtype: The dtype of the weight.
         initializer: An Initializer instance (callable).
-        name: String, the name for the weight variable.
+        regularizer: An optional Regularizer instance.
         trainable: A boolean, whether the weight should
             be trained via backprop or not (assuming
             that the layer itself is also trainable).
-        regularizer: An optional Regularizer instance.
         constraint: An optional Constraint instance.
 
     Returns:
         The created weight variable.
     """
-    shape = tuple(tensor_shape.TensorShape(shape).as_list())
-    initializer = initializers.get(initializer)
-    weight = K.variable(initializer(shape), dtype=K.floatx(), name=name)
-    if regularizer is not None:
-      self.add_loss(regularizer(weight))
+    if dtype is None:
+      dtype = K.floatx()
+    weight = self.add_variable(
+        name, shape, dtype=dtype,
+        initializer=initializer, regularizer=regularizer, trainable=trainable)
     if constraint is not None:
       self.constraints[weight] = constraint
-    if trainable:
-      self._trainable_weights.append(weight)
-    else:
-      self._non_trainable_weights.append(weight)
     return weight
 
   def assert_input_compatibility(self, inputs):
@@ -554,66 +508,46 @@ class Layer(object):
     """
     if isinstance(inputs, list):
       inputs = inputs[:]
+
+    # Raise exceptions in case the input is not compatible
+    # with the input_spec set at build time.
+    # TODO(fchollet): call after the layer is built, too.
+    self.assert_input_compatibility(inputs)
+
+    # Handle mask propagation.
+    previous_mask = _collect_previous_mask(inputs)
+    user_kwargs = copy.copy(kwargs)
+    if not _is_all_none(previous_mask):
+      # The previous layer generated a mask.
+      if 'mask' in tf_inspect.getargspec(self.call).args:
+        if 'mask' not in kwargs:
+          # If mask is explicitly passed to __call__,
+          # we should override the default mask.
+          kwargs['mask'] = previous_mask
+
+    # Actually call the layer (optionally building it).
+    output = super(Layer, self).__call__(inputs, **kwargs)
+
+    # Handle mask computation.
     with K.name_scope(self.name):
-      # Handle laying building (weight creating, input spec locking).
-      if not self.built:
-        # Raise exceptions in case the input is not compatible
-        # with the input_spec specified in the layer constructor.
-        self.assert_input_compatibility(inputs)
-
-        # Collect input shapes to build layer.
-        input_shapes = []
-        for x_elem in _to_list(inputs):
-          input_shapes.append(K.int_shape(x_elem))
-        if len(input_shapes) == 1:
-          self.build(input_shapes[0])
-        else:
-          self.build(input_shapes)
-        self.built = True
-
-        # Load weights that were specified at layer instantiation.
-        if self._initial_weights is not None:
-          self.set_weights(self._initial_weights)
-
-      # Raise exceptions in case the input is not compatible
-      # with the input_spec set at build time.
-      self.assert_input_compatibility(inputs)
-
-      # Handle mask propagation.
-      previous_mask = _collect_previous_mask(inputs)
-      user_kwargs = copy.copy(kwargs)
-      if not _is_all_none(previous_mask):
-        # The previous layer generated a mask.
-        if 'mask' in tf_inspect.getargspec(self.call).args:
-          if 'mask' not in kwargs:
-            # If mask is explicitly passed to __call__,
-            # we should override the default mask.
-            kwargs['mask'] = previous_mask
-
-      # Actually call the layer, collecting output(s), mask(s), and shape(s).
-      output = self.call(inputs, **kwargs)
       output_mask = self.compute_mask(inputs, previous_mask)
 
-      # Add an inbound node to the layer, so that it keeps track
-      # of the call and of all new variables created during the call.
-      # This also updates the layer history of the output tensor(s).
-      # If the input tensor(s) had not previous Keras history,
-      # this does nothing.
-      self._add_inbound_node(
-          input_tensors=inputs,
-          output_tensors=output,
-          input_masks=previous_mask,
-          output_masks=output_mask,
-          arguments=user_kwargs)
-
-      # Apply activity regularizer if any:
-      if hasattr(
-          self,
-          'activity_regularizer') and self.activity_regularizer is not None:
-        regularization_losses = [
-            self.activity_regularizer(x) for x in _to_list(output)
-        ]
-        self.add_loss(regularization_losses, _to_list(inputs))
+    # Add an inbound node to the layer, so that it keeps track
+    # of the call and of all new variables created during the call.
+    # This also updates the layer history of the output tensor(s).
+    # If the input tensor(s) had not previous Keras history,
+    # this does nothing.
+    self._add_inbound_node(
+        input_tensors=inputs,
+        output_tensors=output,
+        input_masks=previous_mask,
+        output_masks=output_mask,
+        arguments=user_kwargs)
+
+    # Optionally load weight values that were specified at layer instantiation.
+    if hasattr(self, '_initial_weights') and self._initial_weights is not None:
+      self.set_weights(self._initial_weights)
+      del self._initial_weights
     return output
 
   def _add_inbound_node(self,
@@ -715,7 +649,7 @@ class Layer(object):
                           'but was passed an input_mask: ' + str(mask))
       # masking not explicitly supported: return None as mask
       return None
-    # if masking is explictly supported, by default
+    # if masking is explicitly supported, by default
     # carry over the input mask
     return mask
 
@@ -959,14 +893,14 @@ class Layer(object):
 
   @property
   def input_shape(self):
-    """Retrieves the input shape tuple(s) of a layer.
+    """Retrieves the input shape(s) of a layer.
 
     Only applicable if the layer has exactly one inbound node,
     i.e. if it is connected to one incoming layer.
 
     Returns:
-        Input shape tuple
-        (or list of input shape tuples, one tuple per input tensor).
+        Input shape, as `TensorShape`
+        (or list of `TensorShape`, one tuple per input tensor).
 
     Raises:
         AttributeError: if the layer is connected to
@@ -997,14 +931,14 @@ class Layer(object):
 
   @property
   def output_shape(self):
-    """Retrieves the output shape tuple(s) of a layer.
+    """Retrieves the output shape(s) of a layer.
 
     Only applicable if the layer has one inbound node,
     or if all inbound nodes have the same output shape.
 
     Returns:
-        Output shape tuple
-        (or list of input shape tuples, one tuple per output tensor).
+        Output shape, as `TensorShape`
+        (or list of `TensorShape`, one tuple per output tensor).
 
     Raises:
         AttributeError: if the layer is connected to
@@ -1033,94 +967,6 @@ class Layer(object):
                            'Use `get_output_shape_at(node_index)` '
                            'instead.')
 
-  def add_loss(self, losses, inputs=None):
-    """Add losses to the layer.
-
-    The loss may potentially be conditional on some inputs tensors,
-    for instance activity losses are conditional on the layer's inputs.
-
-    Arguments:
-        losses: loss tensor or list of loss tensors
-            to add to the layer.
-        inputs: input tensor or list of inputs tensors to mark
-            the losses as conditional on these inputs.
-            If None is passed, the loss is assumed unconditional
-            (e.g. L2 weight regularization, which only depends
-            on the layer's weights variables, not on any inputs tensors).
-    """
-    if losses is None or losses == []:  # pylint: disable=g-explicit-bool-comparison
-      return
-    # Update self.losses
-    losses = _to_list(losses)
-    if hasattr(self, '_losses'):
-      self._losses += losses
-    # Update self._per_input_updates
-    if inputs == []:  # pylint: disable=g-explicit-bool-comparison
-      inputs = None
-    if inputs is not None:
-      inputs_hash = _object_list_uid(inputs)
-    else:
-      # Updates indexed by None are unconditional
-      # rather than input-dependent
-      inputs_hash = None
-    if inputs_hash not in self._per_input_losses:
-      self._per_input_losses[inputs_hash] = []
-    self._per_input_losses[inputs_hash] += losses
-
-  def add_update(self, updates, inputs=None):
-    """Add updates to the layer.
-
-    The updates may potentially be conditional on some inputs tensors,
-    for instance batch norm updates are conditional on the layer's inputs.
-
-    Arguments:
-        updates: update op or list of update ops
-            to add to the layer.
-        inputs: input tensor or list of inputs tensors to mark
-            the updates as conditional on these inputs.
-            If None is passed, the updates are assumed unconditional.
-    """
-    if updates is None or updates == []:  # pylint: disable=g-explicit-bool-comparison
-      return
-    # Update self.updates
-    updates = _to_list(updates)
-    if hasattr(self, '_updates'):
-      self._updates += updates
-    # Update self._per_input_updates
-    if inputs == []:  # pylint: disable=g-explicit-bool-comparison
-      inputs = None
-    if inputs is not None:
-      inputs_hash = _object_list_uid(inputs)
-    else:
-      # Updates indexed by None are unconditional
-      # rather than input-dependent
-      inputs_hash = None
-    if inputs_hash not in self._per_input_updates:
-      self._per_input_updates[inputs_hash] = []
-    self._per_input_updates[inputs_hash] += updates
-
-  def get_updates_for(self, inputs):
-    if inputs is not None:
-      inputs_hash = _object_list_uid(inputs)
-    else:
-      inputs_hash = None
-    if inputs_hash in self._per_input_updates:
-      return self._per_input_updates[inputs_hash]
-    return []
-
-  def get_losses_for(self, inputs):
-    if inputs is not None:
-      inputs_hash = _object_list_uid(inputs)
-    else:
-      inputs_hash = None
-    if inputs_hash in self._per_input_losses:
-      return self._per_input_losses[inputs_hash]
-    return []
-
-  @property
-  def weights(self):
-    return self.trainable_weights + self.non_trainable_weights
-
   def set_weights(self, weights):
     """Sets the weights of the layer, from Numpy arrays.
 
@@ -1254,9 +1100,12 @@ class InputLayer(Layer):
     if not name:
       prefix = 'input'
       name = prefix + '_' + str(K.get_uid(prefix))
+    if not dtype:
+      if input_tensor is None:
+        dtype = K.floatx()
+      else:
+        dtype = K.dtype(input_tensor)
     super(InputLayer, self).__init__(dtype=dtype, name=name)
-
-    self.trainable = False
     self.built = True
     self.sparse = sparse
 
@@ -1284,15 +1133,7 @@ class InputLayer(Layer):
         batch_input_shape = (batch_size,) + tuple(input_shape)
     else:
       batch_input_shape = tuple(batch_input_shape)
-
-    if not dtype:
-      if input_tensor is None:
-        dtype = K.floatx()
-      else:
-        dtype = K.dtype(input_tensor)
-
     self.batch_input_shape = batch_input_shape
-    self.dtype = dtype
 
     if input_tensor is None:
       self.is_placeholder = True
@@ -1446,12 +1287,19 @@ class Container(Layer):
       prefix = self.__class__.__name__.lower()
       name = prefix + '_' + str(K.get_uid(prefix))
     self.name = name
-
     self.supports_masking = False
     self.trainable = True
     self._per_input_losses = {}
     self._per_input_updates = {}
 
+    # The following properties are not actually used by Keras;
+    # they exist for compatibility with TF.
+    self._updates = []
+    self._scope = None
+    self._reuse = None
+    self._base_name = name
+    self._graph = ops.get_default_graph()
+
     # Container-specific properties.
     if isinstance(inputs, (list, tuple)):
       self.inputs = list(inputs)  # Tensor or list of tensors.
diff --git a/tensorflow/contrib/keras/python/keras/engine/topology_test.py b/tensorflow/contrib/keras/python/keras/engine/topology_test.py
index eb095b14a9742995249de21c8d4613409896c988..531ed4be3e3672eb45f982ff6d9bb471bf47d7cc 100644
--- a/tensorflow/contrib/keras/python/keras/engine/topology_test.py
+++ b/tensorflow/contrib/keras/python/keras/engine/topology_test.py
@@ -490,8 +490,8 @@ class TopologyConstructionTest(test.TestCase):
     m, n = model([j, k])
     tf_model = keras.models.Model([j, k], [m, n])
 
-    j_tf = array_ops.placeholder(dtype=dtypes.float32)
-    k_tf = array_ops.placeholder(dtype=dtypes.float32)
+    j_tf = array_ops.placeholder(dtype=dtypes.float32, shape=(None, 32))
+    k_tf = array_ops.placeholder(dtype=dtypes.float32, shape=(None, 32))
     m_tf, n_tf = tf_model([j_tf, k_tf])
     self.assertListEqual(m_tf.get_shape().as_list(), [None, 64])
     self.assertListEqual(n_tf.get_shape().as_list(), [None, 5])
diff --git a/tensorflow/contrib/keras/python/keras/engine/training.py b/tensorflow/contrib/keras/python/keras/engine/training.py
index ba6201713eda100d01174d25f688f6386cef5741..96d1c2f262259a0cd7030736997d9501468e2075 100644
--- a/tensorflow/contrib/keras/python/keras/engine/training.py
+++ b/tensorflow/contrib/keras/python/keras/engine/training.py
@@ -245,7 +245,7 @@ def _check_array_lengths(inputs, targets, weights):
 
 
 def _check_loss_and_target_compatibility(targets, loss_fns, output_shapes):
-  """Does validation on the compatiblity of targets and loss functions.
+  """Does validation on the compatibility of targets and loss functions.
 
   This helps prevent users from using loss functions incorrectly.
 
diff --git a/tensorflow/contrib/keras/python/keras/layers/advanced_activations.py b/tensorflow/contrib/keras/python/keras/layers/advanced_activations.py
index b3abfc29d250514c5d2544df04a6d6c343a82755..2c957ece4466660cd5e62aa1fcc9dc9f9052091d 100644
--- a/tensorflow/contrib/keras/python/keras/layers/advanced_activations.py
+++ b/tensorflow/contrib/keras/python/keras/layers/advanced_activations.py
@@ -120,7 +120,7 @@ class PReLU(Layer):
         param_shape[i - 1] = 1
         self.param_broadcast[i - 1] = True
     self.alpha = self.add_weight(
-        param_shape,
+        shape=param_shape,
         name='alpha',
         initializer=self.alpha_initializer,
         regularizer=self.alpha_regularizer,
diff --git a/tensorflow/contrib/keras/python/keras/layers/convolutional.py b/tensorflow/contrib/keras/python/keras/layers/convolutional.py
index 38b8fe66a34f3e91d241800a3f4cd3d379c2a5dd..16f49c3390521b759a8e2f7bb399f935d34d1a1c 100644
--- a/tensorflow/contrib/keras/python/keras/layers/convolutional.py
+++ b/tensorflow/contrib/keras/python/keras/layers/convolutional.py
@@ -140,14 +140,14 @@ class _Conv(Layer):
     kernel_shape = self.kernel_size + (input_dim, self.filters)
 
     self.kernel = self.add_weight(
-        kernel_shape,
+        shape=kernel_shape,
         initializer=self.kernel_initializer,
         name='kernel',
         regularizer=self.kernel_regularizer,
         constraint=self.kernel_constraint)
     if self.use_bias:
       self.bias = self.add_weight(
-          (self.filters,),
+          shape=(self.filters,),
           initializer=self.bias_initializer,
           name='bias',
           regularizer=self.bias_regularizer,
@@ -734,14 +734,14 @@ class Conv2DTranspose(Conv2D):
     kernel_shape = self.kernel_size + (self.filters, input_dim)
 
     self.kernel = self.add_weight(
-        kernel_shape,
+        shape=kernel_shape,
         initializer=self.kernel_initializer,
         name='kernel',
         regularizer=self.kernel_regularizer,
         constraint=self.kernel_constraint)
     if self.use_bias:
       self.bias = self.add_weight(
-          (self.filters,),
+          shape=(self.filters,),
           initializer=self.bias_initializer,
           name='bias',
           regularizer=self.bias_regularizer,
@@ -949,13 +949,13 @@ class SeparableConv2D(Conv2D):
                               self.filters)
 
     self.depthwise_kernel = self.add_weight(
-        depthwise_kernel_shape,
+        shape=depthwise_kernel_shape,
         initializer=self.depthwise_initializer,
         name='depthwise_kernel',
         regularizer=self.depthwise_regularizer,
         constraint=self.depthwise_constraint)
     self.pointwise_kernel = self.add_weight(
-        pointwise_kernel_shape,
+        shape=pointwise_kernel_shape,
         initializer=self.pointwise_initializer,
         name='pointwise_kernel',
         regularizer=self.pointwise_regularizer,
@@ -963,7 +963,7 @@ class SeparableConv2D(Conv2D):
 
     if self.use_bias:
       self.bias = self.add_weight(
-          (self.filters,),
+          shape=(self.filters,),
           initializer=self.bias_initializer,
           name='bias',
           regularizer=self.bias_regularizer,
diff --git a/tensorflow/contrib/keras/python/keras/layers/convolutional_recurrent.py b/tensorflow/contrib/keras/python/keras/layers/convolutional_recurrent.py
index 4d8ef44da7bbd2baec10f0f8424fe21309ddd281..30325b7148ee0a425cb5c47135ab4a6b8495868e 100644
--- a/tensorflow/contrib/keras/python/keras/layers/convolutional_recurrent.py
+++ b/tensorflow/contrib/keras/python/keras/layers/convolutional_recurrent.py
@@ -369,20 +369,20 @@ class ConvLSTM2D(ConvRecurrent2D):
     recurrent_kernel_shape = self.kernel_size + (self.filters, self.filters * 4)
 
     self.kernel = self.add_weight(
-        kernel_shape,
+        shape=kernel_shape,
         initializer=self.kernel_initializer,
         name='kernel',
         regularizer=self.kernel_regularizer,
         constraint=self.kernel_constraint)
     self.recurrent_kernel = self.add_weight(
-        recurrent_kernel_shape,
+        shape=recurrent_kernel_shape,
         initializer=self.recurrent_initializer,
         name='recurrent_kernel',
         regularizer=self.recurrent_regularizer,
         constraint=self.recurrent_constraint)
     if self.use_bias:
       self.bias = self.add_weight(
-          (self.filters * 4,),
+          shape=(self.filters * 4,),
           initializer=self.bias_initializer,
           name='bias',
           regularizer=self.bias_regularizer,
diff --git a/tensorflow/contrib/keras/python/keras/layers/core.py b/tensorflow/contrib/keras/python/keras/layers/core.py
index 32ada176a4f239b4c07992d66037a0b5a64fd71d..0b6cdc65a4f61e69cb3395441a25b927cb3b78ea 100644
--- a/tensorflow/contrib/keras/python/keras/layers/core.py
+++ b/tensorflow/contrib/keras/python/keras/layers/core.py
@@ -34,6 +34,7 @@ from tensorflow.contrib.keras.python.keras.utils.generic_utils import deserializ
 from tensorflow.contrib.keras.python.keras.utils.generic_utils import func_dump
 from tensorflow.contrib.keras.python.keras.utils.generic_utils import func_load
 from tensorflow.python.framework import tensor_shape
+from tensorflow.python.layers import core as tf_core_layers
 from tensorflow.python.util import tf_inspect
 
 
@@ -643,7 +644,7 @@ class Lambda(Layer):
     return cls(**config)
 
 
-class Dense(Layer):
+class Dense(tf_core_layers.Dense, Layer):
   """Just your regular densely-connected NN layer.
 
   `Dense` implements the operation:
@@ -712,15 +713,20 @@ class Dense(Layer):
                **kwargs):
     if 'input_shape' not in kwargs and 'input_dim' in kwargs:
       kwargs['input_shape'] = (kwargs.pop('input_dim'),)
-    super(Dense, self).__init__(**kwargs)
-    self.units = units
-    self.activation = activations.get(activation)
-    self.use_bias = use_bias
-    self.kernel_initializer = initializers.get(kernel_initializer)
-    self.bias_initializer = initializers.get(bias_initializer)
-    self.kernel_regularizer = regularizers.get(kernel_regularizer)
-    self.bias_regularizer = regularizers.get(bias_regularizer)
-    self.activity_regularizer = regularizers.get(activity_regularizer)
+
+    # Inheritance call order:
+    # 1) tf.layers.Dense, 2) keras.layers.Layer, 3) tf.layers.Layer
+    super(Dense, self).__init__(
+        units,
+        activation=activations.get(activation),
+        use_bias=use_bias,
+        kernel_initializer=initializers.get(kernel_initializer),
+        bias_initializer=initializers.get(bias_initializer),
+        kernel_regularizer=regularizers.get(kernel_regularizer),
+        bias_regularizer=regularizers.get(bias_regularizer),
+        activity_regularizer=regularizers.get(activity_regularizer),
+        **kwargs)
+
     self.kernel_constraint = constraints.get(kernel_constraint)
     self.bias_constraint = constraints.get(bias_constraint)
     self.input_spec = InputSpec(min_ndim=2)
@@ -729,41 +735,14 @@ class Dense(Layer):
   def build(self, input_shape):
     assert len(input_shape) >= 2
     input_dim = input_shape[-1]
-
-    self.kernel = self.add_weight(
-        (input_dim, self.units),
-        initializer=self.kernel_initializer,
-        name='kernel',
-        regularizer=self.kernel_regularizer,
-        constraint=self.kernel_constraint)
-    if self.use_bias:
-      self.bias = self.add_weight(
-          (self.units,),
-          initializer=self.bias_initializer,
-          name='bias',
-          regularizer=self.bias_regularizer,
-          constraint=self.bias_constraint)
-    else:
-      self.bias = None
+    super(Dense, self).build(input_shape)
     self.input_spec = InputSpec(min_ndim=2, axes={-1: input_dim})
+    if self.kernel_constraint:
+      self.constraints[self.kernel] = self.kernel_constraint
+    if self.use_bias and self.bias_constraint:
+      self.constraints[self.bias] = self.bias_constraint
     self.built = True
 
-  def call(self, inputs):
-    output = K.dot(inputs, self.kernel)
-    if self.use_bias:
-      output = K.bias_add(output, self.bias)
-    if self.activation is not None:
-      output = self.activation(output)
-    return output
-
-  def _compute_output_shape(self, input_shape):
-    input_shape = tensor_shape.TensorShape(input_shape).as_list()
-    assert input_shape and len(input_shape) >= 2
-    assert input_shape[-1]
-    output_shape = list(input_shape)
-    output_shape[-1] = self.units
-    return tensor_shape.TensorShape(output_shape)
-
   def get_config(self):
     config = {
         'units':
diff --git a/tensorflow/contrib/keras/python/keras/layers/core_test.py b/tensorflow/contrib/keras/python/keras/layers/core_test.py
index d7aa8413bbe6fd3d22f7a6aa39bea007915206a3..7066af0ef6ce80919fa8b0ab77f9729d7d4c778c 100644
--- a/tensorflow/contrib/keras/python/keras/layers/core_test.py
+++ b/tensorflow/contrib/keras/python/keras/layers/core_test.py
@@ -165,24 +165,23 @@ class CoreLayersTest(test.TestCase):
           3,
           kernel_regularizer=keras.regularizers.l1(0.01),
           bias_regularizer='l1',
-          activity_regularizer='l2')
-      layer.build((None, 4))
-      assert len(layer.losses) == 2
+          activity_regularizer='l2',
+          name='dense_reg')
       layer(keras.backend.variable(np.ones((2, 4))))
-      assert len(layer.losses) == 3
+      self.assertEqual(3, len(layer.losses))
 
     # Test constraints
     with self.test_session():
       layer = keras.layers.Dense(
           3, kernel_constraint='max_norm', bias_constraint='max_norm')
-      layer.build((None, 4))
-      assert len(layer.constraints) == 2
+      layer(keras.backend.variable(np.ones((2, 4))))
+      self.assertEqual(2, len(layer.constraints))
 
   def test_activity_regularization(self):
     with self.test_session():
       layer = keras.layers.ActivityRegularization(l1=0.1)
       layer(keras.backend.variable(np.ones((2, 4))))
-      assert len(layer.losses) == 1
+      self.assertEqual(1, len(layer.losses))
 
 
 if __name__ == '__main__':
diff --git a/tensorflow/contrib/keras/python/keras/layers/embeddings.py b/tensorflow/contrib/keras/python/keras/layers/embeddings.py
index 12a2ce39ebdc8bc900e9c373e50562803cfa6524..bc0bae67d05275346e40791e2a6d58a6b89bdf30 100644
--- a/tensorflow/contrib/keras/python/keras/layers/embeddings.py
+++ b/tensorflow/contrib/keras/python/keras/layers/embeddings.py
@@ -116,7 +116,7 @@ class Embedding(Layer):
   def build(self, input_shape):
     input_shape = tensor_shape.TensorShape(input_shape).as_list()
     self.embeddings = self.add_weight(
-        (self.input_dim, self.output_dim),
+        shape=(self.input_dim, self.output_dim),
         initializer=self.embeddings_initializer,
         name='embeddings',
         regularizer=self.embeddings_regularizer,
diff --git a/tensorflow/contrib/keras/python/keras/layers/local.py b/tensorflow/contrib/keras/python/keras/layers/local.py
index d96ccc4a63d3d97d382b16dcf2548e8a2e24440f..863674c1cbd95f2a93e24297278fc9e60800bc14 100644
--- a/tensorflow/contrib/keras/python/keras/layers/local.py
+++ b/tensorflow/contrib/keras/python/keras/layers/local.py
@@ -130,14 +130,14 @@ class LocallyConnected1D(Layer):
     self.kernel_shape = (output_length, self.kernel_size[0] * input_dim,
                          self.filters)
     self.kernel = self.add_weight(
-        self.kernel_shape,
+        shape=self.kernel_shape,
         initializer=self.kernel_initializer,
         name='kernel',
         regularizer=self.kernel_regularizer,
         constraint=self.kernel_constraint)
     if self.use_bias:
       self.bias = self.add_weight(
-          (output_length, self.filters),
+          shape=(output_length, self.filters),
           initializer=self.bias_initializer,
           name='bias',
           regularizer=self.bias_regularizer,
@@ -340,14 +340,14 @@ class LocallyConnected2D(Layer):
         output_row * output_col,
         self.kernel_size[0] * self.kernel_size[1] * input_filter, self.filters)
     self.kernel = self.add_weight(
-        self.kernel_shape,
+        shape=self.kernel_shape,
         initializer=self.kernel_initializer,
         name='kernel',
         regularizer=self.kernel_regularizer,
         constraint=self.kernel_constraint)
     if self.use_bias:
       self.bias = self.add_weight(
-          (output_row, output_col, self.filters),
+          shape=(output_row, output_col, self.filters),
           initializer=self.bias_initializer,
           name='bias',
           regularizer=self.bias_regularizer,
diff --git a/tensorflow/contrib/keras/python/keras/layers/merge.py b/tensorflow/contrib/keras/python/keras/layers/merge.py
index 7c6482d0de15200125b06c78f59ac28a645916ed..b4bb9935fdef01f28896056a745d66c91a31d745 100644
--- a/tensorflow/contrib/keras/python/keras/layers/merge.py
+++ b/tensorflow/contrib/keras/python/keras/layers/merge.py
@@ -87,6 +87,7 @@ class _Merge(Layer):
       raise ValueError('A merge layer should be called '
                        'on a list of at least 2 inputs. '
                        'Got ' + str(len(input_shape)) + ' inputs.')
+    input_shape = [tensor_shape.TensorShape(s).as_list() for s in input_shape]
     batch_sizes = [s[0] for s in input_shape if s is not None]
     batch_sizes = set(batch_sizes)
     batch_sizes -= set([None])
@@ -110,6 +111,7 @@ class _Merge(Layer):
       self._reshape_required = False
     else:
       self._reshape_required = True
+    self.built = True
 
   def call(self, inputs):
     if self._reshape_required:
@@ -301,6 +303,7 @@ class Concatenate(_Merge):
                        'inputs with matching shapes '
                        'except for the concat axis. '
                        'Got inputs shapes: %s' % (input_shape))
+    self.built = True
 
   def call(self, inputs):
     if not isinstance(inputs, list):
@@ -413,6 +416,7 @@ class Dot(_Merge):
       raise ValueError('Dimension incompatibility '
                        '%s != %s. ' % (shape1[axes[0]], shape2[axes[1]]) +
                        'Layer shapes: %s, %s' % (shape1, shape2))
+    self.built = True
 
   def call(self, inputs):
     x1 = inputs[0]
diff --git a/tensorflow/contrib/keras/python/keras/layers/normalization.py b/tensorflow/contrib/keras/python/keras/layers/normalization.py
index 9a0340aeafb72ede4fe7edef554c56e195eda62f..ea229fdce1fd32f81d13e588c14554589e934cbf 100644
--- a/tensorflow/contrib/keras/python/keras/layers/normalization.py
+++ b/tensorflow/contrib/keras/python/keras/layers/normalization.py
@@ -116,7 +116,7 @@ class BatchNormalization(Layer):
 
     if self.scale:
       self.gamma = self.add_weight(
-          shape,
+          shape=shape,
           name='gamma',
           initializer=self.gamma_initializer,
           regularizer=self.gamma_regularizer,
@@ -125,7 +125,7 @@ class BatchNormalization(Layer):
       self.gamma = None
     if self.center:
       self.beta = self.add_weight(
-          shape,
+          shape=shape,
           name='beta',
           initializer=self.beta_initializer,
           regularizer=self.beta_regularizer,
@@ -133,12 +133,12 @@ class BatchNormalization(Layer):
     else:
       self.beta = None
     self.moving_mean = self.add_weight(
-        shape,
+        shape=shape,
         name='moving_mean',
         initializer=self.moving_mean_initializer,
         trainable=False)
     self.moving_variance = self.add_weight(
-        shape,
+        shape=shape,
         name='moving_variance',
         initializer=self.moving_variance_initializer,
         trainable=False)
@@ -169,7 +169,7 @@ class BatchNormalization(Layer):
 
       def normalize_inference():
         if needs_broadcasting:
-          # In this case we must explictly broadcast all parameters.
+          # In this case we must explicitly broadcast all parameters.
           broadcast_moving_mean = K.reshape(self.moving_mean, broadcast_shape)
           broadcast_moving_variance = K.reshape(self.moving_variance,
                                                 broadcast_shape)
diff --git a/tensorflow/contrib/keras/python/keras/layers/recurrent.py b/tensorflow/contrib/keras/python/keras/layers/recurrent.py
index 1ea1cb22d94bde74bafcd9e0110fc1713c3373a7..e608921adda74316b20647c3315e7275090324d0 100644
--- a/tensorflow/contrib/keras/python/keras/layers/recurrent.py
+++ b/tensorflow/contrib/keras/python/keras/layers/recurrent.py
@@ -493,20 +493,20 @@ class SimpleRNN(Recurrent):
       self.reset_states()
 
     self.kernel = self.add_weight(
-        (self.input_dim, self.units),
+        shape=(self.input_dim, self.units),
         name='kernel',
         initializer=self.kernel_initializer,
         regularizer=self.kernel_regularizer,
         constraint=self.kernel_constraint)
     self.recurrent_kernel = self.add_weight(
-        (self.units, self.units),
+        shape=(self.units, self.units),
         name='recurrent_kernel',
         initializer=self.recurrent_initializer,
         regularizer=self.recurrent_regularizer,
         constraint=self.recurrent_constraint)
     if self.use_bias:
       self.bias = self.add_weight(
-          (self.units,),
+          shape=(self.units,),
           name='bias',
           initializer=self.bias_initializer,
           regularizer=self.bias_regularizer,
@@ -723,13 +723,13 @@ class GRU(Recurrent):
       self.reset_states()
 
     self.kernel = self.add_weight(
-        (self.input_dim, self.units * 3),
+        shape=(self.input_dim, self.units * 3),
         name='kernel',
         initializer=self.kernel_initializer,
         regularizer=self.kernel_regularizer,
         constraint=self.kernel_constraint)
     self.recurrent_kernel = self.add_weight(
-        (self.units, self.units * 3),
+        shape=(self.units, self.units * 3),
         name='recurrent_kernel',
         initializer=self.recurrent_initializer,
         regularizer=self.recurrent_regularizer,
@@ -737,9 +737,9 @@ class GRU(Recurrent):
 
     if self.use_bias:
       self.bias = self.add_weight(
-          (self.units * 3,),
+          shape=(self.units * 3,),
           name='bias',
-          initializer='zero',
+          initializer=self.bias_initializer,
           regularizer=self.bias_regularizer,
           constraint=self.bias_constraint)
     else:
@@ -1039,13 +1039,13 @@ class LSTM(Recurrent):
       self.reset_states()
 
     self.kernel = self.add_weight(
-        (self.input_dim, self.units * 4),
+        shape=(self.input_dim, self.units * 4),
         name='kernel',
         initializer=self.kernel_initializer,
         regularizer=self.kernel_regularizer,
         constraint=self.kernel_constraint)
     self.recurrent_kernel = self.add_weight(
-        (self.units, self.units * 4),
+        shape=(self.units, self.units * 4),
         name='recurrent_kernel',
         initializer=self.recurrent_initializer,
         regularizer=self.recurrent_regularizer,
@@ -1053,7 +1053,7 @@ class LSTM(Recurrent):
 
     if self.use_bias:
       self.bias = self.add_weight(
-          (self.units * 4,),
+          shape=(self.units * 4,),
           name='bias',
           initializer=self.bias_initializer,
           regularizer=self.bias_regularizer,
diff --git a/tensorflow/contrib/keras/python/keras/layers/wrappers.py b/tensorflow/contrib/keras/python/keras/layers/wrappers.py
index ce6458fd0c876dfa5e5a29080a273feeeca43a5f..092501cb1149d30f987c34934871c1bae55eccdf 100644
--- a/tensorflow/contrib/keras/python/keras/layers/wrappers.py
+++ b/tensorflow/contrib/keras/python/keras/layers/wrappers.py
@@ -166,6 +166,7 @@ class TimeDistributed(Wrapper):
       self.layer.build(child_input_shape)
       self.layer.built = True
     super(TimeDistributed, self).build()
+    self.built = True
 
   def _compute_output_shape(self, input_shape):
     input_shape = tensor_shape.TensorShape(input_shape).as_list()
diff --git a/tensorflow/contrib/keras/python/keras/models.py b/tensorflow/contrib/keras/python/keras/models.py
index eb0996fa12956d0b30b331f850a384a9289cabfc..1c041091fc1619e90080234f0158a28602194d5e 100644
--- a/tensorflow/contrib/keras/python/keras/models.py
+++ b/tensorflow/contrib/keras/python/keras/models.py
@@ -35,6 +35,7 @@ from tensorflow.contrib.keras.python.keras.engine.topology import Input
 from tensorflow.contrib.keras.python.keras.engine.topology import Layer
 from tensorflow.contrib.keras.python.keras.engine.training import Model
 from tensorflow.contrib.keras.python.keras.utils.io_utils import ask_to_proceed_with_overwrite
+from tensorflow.python.framework import ops
 
 
 # pylint: disable=g-import-not-at-top
@@ -220,7 +221,7 @@ def load_model(filepath, custom_objects=None):
         obj: object, dict, or list.
 
     Returns:
-        The same structure, where occurences
+        The same structure, where occurrences
             of a custom object name have been replaced
             with the custom object.
     """
@@ -420,6 +421,14 @@ class Sequential(Model):
       name = prefix + str(K.get_uid(prefix))
     self.name = name
 
+    # The following properties are not actually used by Keras;
+    # they exist for compatibility with TF's variable scoping mechanism.
+    self._updates = []
+    self._scope = None
+    self._reuse = None
+    self._base_name = name
+    self._graph = ops.get_default_graph()
+
     # Add to the model any layers passed to the constructor.
     if layers:
       for layer in layers:
diff --git a/tensorflow/contrib/keras/python/keras/preprocessing/sequence.py b/tensorflow/contrib/keras/python/keras/preprocessing/sequence.py
index 5a24a63b0140620b9ffc7ef9621e00b903d986c3..692a359ead371b858ad447a1c71dc781c40a1bba 100644
--- a/tensorflow/contrib/keras/python/keras/preprocessing/sequence.py
+++ b/tensorflow/contrib/keras/python/keras/preprocessing/sequence.py
@@ -156,7 +156,7 @@ def skipgrams(sequence,
           of word indices (integers). If using a `sampling_table`,
           word indices are expected to match the rank
           of the words in a reference dataset (e.g. 10 would encode
-          the 10-th most frequently occuring token).
+          the 10-th most frequently occurring token).
           Note that index 0 is expected to be a non-word and will be skipped.
       vocabulary_size: int. maximum possible word index + 1
       window_size: int. actually half-window.
diff --git a/tensorflow/contrib/kernel_methods/README.md b/tensorflow/contrib/kernel_methods/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..44ed9670a09ece8fb11e79a3e58725e2a54e513b
--- /dev/null
+++ b/tensorflow/contrib/kernel_methods/README.md
@@ -0,0 +1,53 @@
+# TensorFlow contrib kernel_methods.
+
+This module contains operations and estimators that enable the use of primal
+(explicit) kernel methods in TensorFlow. See also the [tutorial](https://www.tensorflow.org/code/tensorflow/contrib/kernel_methods/g3doc/tutorial.md) on how to use this module to improve the quality of
+classification or regression tasks.
+
+## Kernel Mappers
+Implement explicit kernel mapping Ops over tensors. Kernel mappers add
+Tensor-In-Tensor-Out (TITO) Ops to the TensorFlow graph. They can be used in
+conjunction with other layers or ML models.
+
+Sample usage:
+
+```python
+kernel_mapper = tf.contrib.kernel_methods.SomeKernelMapper(...)
+out_tensor = kernel_mapper.map(in_tensor)
+...  # code that consumes out_tensor.
+```
+
+Currently, there is a [RandomFourierFeatureMapper](https://www.tensorflow.org/code/tensorflow/contrib/kernel_methods/python/mappers/random_fourier_features.py) implemented that maps dense input to dense
+output. More mappers are on the way.
+
+## Kernel-based Estimators
+These are estimators inheriting from the @{tf.contrib.learn.Estimator} class and
+use kernel mappers internally to discover non-linearities in the data. These
+canned estimators map their input features using kernel mapper Ops and then
+apply linear models to the mapped features. Combining kernel mappers with linear
+models and different loss functions leads to a variety of models: linear and
+non-linear SVMs, linear regression (with and without kernels) and (multinomial)
+logistic regression (with and without kernels).
+
+Currently there is a [KernelLinearClassifier](https://www.tensorflow.org/code/tensorflow/contrib/kernel_methods/python/kernel_estimators.py) implemented but more pre-packaged estimators
+are on the way.
+
+Sample usage:
+
+```python
+real_column_a = tf.contrib.layers.real_valued_column(name='real_column_a',...)
+sparse_column_b = tf.contrib.layers.sparse_column_with_hash_bucket(...)
+kernel_mappers = {real_column_a : [tf.contrib.kernel_methods.SomeKernelMapper(...)]}
+optimizer = ...
+
+kernel_classifier = tf.contrib.kernel_methods.KernelLinearClassifier(
+    feature_columns=[real_column_a, sparse_column_b],
+    model_dir=...,
+    optimizer=optimizer,
+    kernel_mappers=kernel_mappers)
+
+# Construct input_fns
+kernel_classifier.fit(...)
+kernel_classifier.evaluate(...)
+```
+
diff --git a/tensorflow/contrib/kernel_methods/g3doc/acc-vs-trn_time.png b/tensorflow/contrib/kernel_methods/g3doc/acc-vs-trn_time.png
new file mode 100644
index 0000000000000000000000000000000000000000..1028bb390179cd21dba0b41b53a0b1eff4e345a4
Binary files /dev/null and b/tensorflow/contrib/kernel_methods/g3doc/acc-vs-trn_time.png differ
diff --git a/tensorflow/contrib/kernel_methods/g3doc/acc_vs_outdim.png b/tensorflow/contrib/kernel_methods/g3doc/acc_vs_outdim.png
new file mode 100644
index 0000000000000000000000000000000000000000..b3384e053b24d0225576a8610147b042fb7b1708
Binary files /dev/null and b/tensorflow/contrib/kernel_methods/g3doc/acc_vs_outdim.png differ
diff --git a/tensorflow/contrib/kernel_methods/g3doc/kernel_mapping.png b/tensorflow/contrib/kernel_methods/g3doc/kernel_mapping.png
new file mode 100644
index 0000000000000000000000000000000000000000..e63303dab45d671acbeb839b22726ac0877dffae
Binary files /dev/null and b/tensorflow/contrib/kernel_methods/g3doc/kernel_mapping.png differ
diff --git a/tensorflow/contrib/kernel_methods/g3doc/tutorial.md b/tensorflow/contrib/kernel_methods/g3doc/tutorial.md
new file mode 100644
index 0000000000000000000000000000000000000000..64c2adf9f39f8b49ca2135d5084a526789c1f4fd
--- /dev/null
+++ b/tensorflow/contrib/kernel_methods/g3doc/tutorial.md
@@ -0,0 +1,279 @@
+# Improving Linear Models Using Explicit Kernel Methods
+
+In this tutorial, we demonstrate how combining (explicit) kernel methods with
+linear models can drastically increase the latters' quality of predictions
+without significantly increasing training and inference times. Unlike dual
+kernel methods, explicit (primal) kernel methods scale well with the size of the
+training dataset both in terms of training/inference times and in terms of
+memory requirements.
+
+Currently, explicit kernel mappings are supported for dense features. Support
+for sparse features is in the works.
+
+We will use [tf.contrib.learn](https://www.tensorflow.org/code/tensorflow/contrib/learn/python/learn) (TensorFlow's high-level Machine Learning API) Estimators for our ML models. The
+tf.contrib.learn API reduces the boilerplate code one needs to write for
+configuring, training and evaluating models and will let us focus on the core
+ideas. If you are not familiar with this API, [tf.contrib.learn Quickstart](https://www.tensorflow.org/get_started/tflearn) is a good place to start. We
+will use MNIST, a widely-used dataset containing images of handwritten digits
+(between 0 and 9). The tutorial consists of the following steps:
+
+* Load and prepare MNIST data for classification.
+* Construct a simple linear model, train it and evaluate it on the eval data.
+* Replace the linear model with a kernelized linear model, re-train and
+re-evaluate.
+
+## Load and prepare MNIST data for classification
+The first step is to prepare the data to be fed to the ML models. The following
+utility command from tf.contrib.learn loads the MNIST dataset:
+
+```python
+data = tf.contrib.learn.datasets.mnist.load_mnist()
+```
+This loads the entire MNIST dataset (containing 70K samples) and splits it into
+train, validation and test data with 55K, 5K and 10K samples respectively. Each
+split contains one numpy array for images (with shape [sample_size, 784]) and
+one for labels (with shape [sample_size, 1]). In this tutorial, we only use the
+train and validation splits (to train and evaluate our models respectively).
+
+In order to feed data to a tf.contrib.learn Estimator, it is helpful to convert
+it to Tensors. For this, we will use an `input function` which adds Ops to the
+TensorFlow graph that, when executed, create mini-batches of Tensors to be used
+downstream. For more background on input functions, check
+[Building Input Functions with tf.contrib.learn](https://www.tensorflow.org/get_started/input_fn).
+In this example, we will use the `tf.train.shuffle_batch` Op which, besides
+converting numpy arrays to Tensors, allows us to specify the batch_size and
+whether to randomize the input every time the input_fn Ops are executed
+(randomization typically expedites convergence during training). The full code
+for loading and preparing the data is shown in the snippet below. In this
+example, we use mini-batches of size 256 for training and the entire sample (5K
+entries) for evaluation. Feel free to experiment with different batch sizes.
+
+```python
+import numpy as np
+import tensorflow as tf
+
+def get_input_fn(dataset_split, batch_size, capacity=10000, min_after_dequeue=3000):
+
+  def _input_fn():
+    images_batch, labels_batch = tf.train.shuffle_batch(
+        tensors=[dataset_split.images, dataset_split.labels.astype(np.int32)],
+        batch_size=batch_size,
+        capacity=capacity,
+        min_after_dequeue=min_after_dequeue,
+        enqueue_many=True,
+        num_threads=4)
+    features_map = {'images': images_batch}
+    return features_map, labels_batch
+
+  return _input_fn
+
+data = tf.contrib.learn.datasets.mnist.load_mnist()
+
+train_input_fn = get_input_fn(data.train, batch_size=256)
+eval_input_fn = get_input_fn(data.validation, batch_size=5000)
+
+```
+
+## Training a simple linear model
+We can now train a linear model over the MNIST dataset. We will use the
+[tf.contrib.learn.LinearClassifier](https://www.tensorflow.org/code/tensorflow/contrib/learn/python/learn/estimators/linear.py) estimator with 10 classes (representing the 10 digits).
+The input features form a 784-dimensional (dense) vector which can be specified
+as follows:
+
+```python
+image_column = tf.contrib.layers.real_valued_column('images', dimension=784)
+```
+
+The full code for constructing, training and evaluating a LinearClassifier
+estimator is shown below.
+
+```python
+import time
+
+# Specify the feature(s) to be used by the estimator.
+image_column = tf.contrib.layers.real_valued_column('images', dimension=784)
+estimator = tf.contrib.learn.LinearClassifier(feature_columns=[image_column], n_classes=10)
+
+# Train.
+start = time.time()
+estimator.fit(input_fn=train_input_fn, steps=2000)
+end = time.time()
+print('Elapsed time: {} seconds'.format(end - start))
+
+# Evaluate and report metrics.
+eval_metrics = estimator.evaluate(input_fn=eval_input_fn, steps=1)
+print(eval_metrics)
+```
+On eval data, the loss (i.e., the value of the objective function being
+minimized during training) lies between **0.25** and **0.30** (depending on the
+parameters used) while the accuracy of the classifier is approximately **92.5%**
+(training is randomized so the exact loss and accuracy will vary). Also, the
+training time is around 25 seconds (this will also vary based on the machine you
+run the code on).
+
+In addition to experimenting with the (training) batch size and the number of
+training steps, there are a couple other parameters that can be tuned as well.
+For instance, you can change the optimization method used to minimize the loss
+by explicitly selecting another optimizer from the collection of
+[available optimizers](https://www.tensorflow.org/code/tensorflow/python/training).
+As an example, the following code constructs a LinearClassifier estimator that
+uses the Follow-The-Regularized-Leader (FTRL) optimization strategy with a
+specific learning rate and L2-regularization.
+
+
+```python
+optimizer = tf.train.FtrlOptimizer(learning_rate=5.0, l2_regularization_strength=1.0)
+estimator = tf.contrib.learn.LinearClassifier(
+    feature_columns=[image_column], n_classes=10, optimizer=optimizer)
+```
+
+Regardless of the values of the parameters, the max accuracy a linear model can
+achieve on this dataset caps at around **93%**.
+
+## Using explicit kernel mappings with the linear model.
+The relatively high error (~7%) of the linear model over MNIST indicates that
+the input data is not linearly separable. We will use explicit kernel mappings
+to reduce the classification error.
+
+**Intuition:** The high-level idea is to use a non-linear map to transform the
+input space to another feature space (of possibly higher dimension) where the
+(transformed) features are (almost) linearly separable and then apply a linear
+model on the mapped features. This is shown in the following figure:
+
+![image](./kernel_mapping.png)
+
+**Technical details overview:** In this example we will use **Random Fourier
+Features** (introduced in the
+["Random Features for Large-Scale Kernel Machines"](https://people.eecs.berkeley.edu/~brecht/papers/07.rah.rec.nips.pdf) paper by
+Rahimi and Recht) to map the input data. Random Fourier Features map a vector
+\\(\mathbf{x} \in \mathbb{R}^d\\) to \\(\mathbf{x'} \in \mathbb{R}^D\\) via the
+following mapping:
+
+$$
+RFFM(\cdot): \mathbb{R}^d \to \mathbb{R}^D, \quad
+RFFM(\mathbf{x}) =  \cos(\mathbf{\Omega} \cdot \mathbf{x}+ \mathbf{b})
+$$
+
+where \\(\mathbf{\Omega} \in \mathbb{R}^{D \times d}\\),
+\\(\mathbf{x} \in \mathbb{R}^d,\\) \\(\mathbf{b} \in \mathbb{R}^D\\) and the
+cosine is applied element-wise.
+
+In this example, the entries of \\(\mathbf{\Omega}\\) and \\(\mathbf{b}\\) are
+sampled from distributions such that the mapping satisfies the following
+property:
+
+$$
+RFFM(\mathbf{x})^T \cdot RFFM(\mathbf{y}) \approx
+e^{-\frac{\|\mathbf{x} - \mathbf{y}\|^2}{2 \sigma^2}}
+$$
+
+The right-hand-side quantity of the expression above is known as the RBF (or
+Gaussian) kernel function. This function is one of the most-widely used kernel
+functions in Machine Learning and measures (implicitly) similarity in a
+different (much higher dimensional) space than the original one. See
+[Radial basis function kernel](https://en.wikipedia.org/wiki/Radial_basis_function_kernel)
+for more details.
+
+**Kernel Classifier:** `tf.contrib.kernel_methods.KernelLinearClassifier` is a
+pre-packaged `tf.contrib.learn` estimator that combines the power of explicit
+kernel mappings with linear models. Its API is very similar to that of the
+LinearClassifier with the additional ability to specify a list of explicit
+kernel mappings to be applied to each feature used by the classifier. The
+following code snippet demonstrates how to replace LinearClassifier with
+KernelLinearClassifier.
+
+
+```python
+# Specify the feature(s) to be used by the estimator. This is identical to the
+# code used for the LinearClassifier.
+image_column = tf.contrib.layers.real_valued_column('images', dimension=784)
+optimizer = tf.train.FtrlOptimizer(
+   learning_rate=50.0, l2_regularization_strength=0.001)
+
+
+kernel_mapper = tf.contrib.kernel_methods.RandomFourierFeatureMapper(
+  input_dim=784, output_dim=2000, stddev=5.0, name='rffm')
+kernel_mappers = {image_column: [kernel_mapper]}
+estimator = tf.contrib.kernel_methods.KernelLinearClassifier(
+   n_classes=10, optimizer=optimizer, kernel_mappers=kernel_mappers)
+
+# Train.
+start = time.time()
+estimator.fit(input_fn=train_input_fn, steps=2000)
+end = time.time()
+print('Elapsed time: {} seconds'.format(end - start))
+
+# Evaluate and report metrics.
+eval_metrics = estimator.evaluate(input_fn=eval_input_fn, steps=1)
+print(eval_metrics)
+```
+The only additional parameter passed to `KernelLinearClassifier` is a dictionary
+from feature_columns to a list of kernel mappings to be applied to the
+corresponding feature column. In this example, the lines
+
+```python
+kernel_mapper = tf.contrib.kernel_methods.RandomFourierFeatureMapper(
+  input_dim=784, output_dim=2000, stddev=5.0, name='rffm')
+kernel_mappers = {image_column: [kernel_mapper]}
+estimator = tf.contrib.kernel_methods.KernelLinearClassifier(
+   n_classes=10, optimizer=optimizer, kernel_mappers=kernel_mappers)
+```
+instruct the classifier to first map the initial 784-dimensional images to
+2000-dimensional vectors using random Fourier features and then learn a linear
+model on the transformed vectors. Note that, besides the output dimension, there
+is one more parameter (stddev) involved. This parameter is the standard
+deviation (\\(\sigma\\)) of the approximated RBF kernel and controls the
+similarity measure used in classification. This parameter is typically
+determined via hyperparameter tuning.
+
+Running the code above yields a loss of approximately **0.10** while the
+accuracy is increased to approximately **97%** on eval data (an increase of 4%
+over the plain linear model). The training time hovers around 35 seconds. We can
+increase the accuracy even more, by increasing the output dimension of the
+mapping and tuning the standard deviation even more.
+
+**On the role of stddev:** The classification quality is very sensitive to the
+value of the stddev parameter used to define the similarity measure between the
+pairs of input features. The following table shows the accuracy of the
+classifier on the eval data for different values of stddev (for all experiments
+the output dimension was fixed to 3000). The optimal value is stddev=5.0. Notice
+how too small or too high stddev values can dramatically decrease the accuracy
+of the classification.
+
+stddev | eval accuracy
+:----- | :------------
+1.0    | 0.1362
+2.0    | 0.4764
+4.0    | 0.9654
+5.0    | 0.9766
+8.0    | 0.9714
+16.0   | 0.8878
+
+**On the role of the output dimension:** Intuitively, the larger the output
+dimension of the mapping, the closer the inner product of two mapped vectors
+approximates the kernel which typically translates to better classification
+accuracy. Another way to think about this is that the output dimension equals
+the number of weights of the linear model (the larger this dimension, the larger
+the "degrees of freedom" of the model). However, after a certain threshold,
+higher output dimensions increase the accuracy by very little (while still
+increasing the training time). This is shown in the following 2 Figures which
+depict the eval accuracy as a function of the output dimension and the training
+time respectively.
+
+![image](./acc_vs_outdim.png)  ![image](./acc-vs-trn_time.png)
+
+
+## Explicit kernel mappings: summary and practical tips
+* Explicit kernel mappings combine the predictive power of non-linear models
+with the scalability of linear models.
+* Unlike traditional dual kernel methods, they can scale to millions or hundreds
+of millions of samples.
+* Random Fourier Features can be particularly effective for datasets with dense
+features.
+* The parameters of the kernel mapping are often data-dependent. Model quality
+can be very sensitive to these parameters. Use hyperparameter tuning to find the
+optimal values.
+* If you have multiple numerical features, concatinate them into a single
+multi-dimensional feature and apply the kernel mapping to the concatenated
+vector.
+
diff --git a/tensorflow/contrib/labeled_tensor/python/ops/core.py b/tensorflow/contrib/labeled_tensor/python/ops/core.py
index 393c7f93f367e3fea9b91ebce773bd1596a77cf4..e6aded92ca5597985b185d8e63c48947bb1692ca 100644
--- a/tensorflow/contrib/labeled_tensor/python/ops/core.py
+++ b/tensorflow/contrib/labeled_tensor/python/ops/core.py
@@ -810,7 +810,7 @@ def axis_order_scope(axis_order=None):
   Example usage:
 
     with lt.axis_order_scope(['x', 'y', 'z']):
-      # result is guranteed to have the correct axis order
+      # result is guaranteed to have the correct axis order
       result = w + b
 
   You can nest scopes, in which case only the inner-most scope applies, e.g.,
diff --git a/tensorflow/contrib/labeled_tensor/python/ops/ops.py b/tensorflow/contrib/labeled_tensor/python/ops/ops.py
index 98842494face96158d24bd89caaf5532024a39ef..c957b41a49b292225e547ce17b0c5a247810325a 100644
--- a/tensorflow/contrib/labeled_tensor/python/ops/ops.py
+++ b/tensorflow/contrib/labeled_tensor/python/ops/ops.py
@@ -51,8 +51,7 @@ def _gather_1d_on_axis(labeled_tensor, indexer, axis, name=None):
 @tc.returns(core.LabeledTensor)
 @tc.accepts(core.LabeledTensorLike,
             tc.Mapping(string_types,
-                       tc.Union(slice, collections.Hashable,
-                                collections.Sequence)),
+                       tc.Union(slice, collections.Hashable, list)),
             tc.Optional(string_types))
 def select(labeled_tensor, selection, name=None):
   """Slice out a subset of the tensor.
@@ -110,23 +109,22 @@ def select(labeled_tensor, selection, name=None):
 
         slices[axis_name] = slice(start, stop)
 
-      else:
-        # We're allowing anything NumPy treats as a scalar or 1D array.
-        value = np.asarray(value)
-        if value.ndim == 0:
-          slices[axis_name] = axis.index(value.item())
-        elif value.ndim == 1:
-          if indexers:
-            raise NotImplementedError(
-                'select does not yet support more than one list selection at '
-                'the same time')
-          indexer = [axis.index(v) for v in value.tolist()]
-          indexers[axis_name] = ops.convert_to_tensor(
-              indexer, dtype=dtypes.int64)
-        else:
+      # Needs to be after checking for slices, since slice objects claim to be
+      # instances of collections.Hashable but hash() on them fails.
+      elif isinstance(value, collections.Hashable):
+        slices[axis_name] = axis.index(value)
+
+      elif isinstance(value, list):
+        if indexers:
           raise NotImplementedError(
-              'select does not yet support selections with more than one '
-              'dimension: %s on axis %r' % (value, axis_name))
+              'select does not yet support more than one list selection at '
+              'the same time')
+        indexer = [axis.index(v) for v in value]
+        indexers[axis_name] = ops.convert_to_tensor(indexer, dtype=dtypes.int64)
+
+      else:
+        # If type checking is working properly, this shouldn't be possible.
+        raise TypeError('cannot handle arbitrary types')
 
     if indexers and slices:
       raise NotImplementedError(
diff --git a/tensorflow/contrib/labeled_tensor/python/ops/ops_test.py b/tensorflow/contrib/labeled_tensor/python/ops/ops_test.py
index ea5e008752391053cbe77b88f927642be07a125a..0727f4cf88728dc3d919e662d65c93a658ac730b 100644
--- a/tensorflow/contrib/labeled_tensor/python/ops/ops_test.py
+++ b/tensorflow/contrib/labeled_tensor/python/ops/ops_test.py
@@ -121,6 +121,13 @@ class SelectTest(Base):
     golden_lt = core.LabeledTensor(self.tensor[1, 1, :, :], [self.a2, self.a3])
     self.assertLabeledTensorsEqual(select_lt, golden_lt)
 
+  def test_tuple(self):
+    original_lt = core.LabeledTensor(constant_op.constant([5, 6]),
+                                     [('x', [(1, 2), (3, 4)])])
+    select_lt = ops.select(original_lt, {'x': (1, 2)})
+    golden_lt = core.LabeledTensor(constant_op.constant(5), [])
+    self.assertLabeledTensorsEqual(select_lt, golden_lt)
+
   def test_invalid_input(self):
     with self.assertRaises(ValueError):
       ops.select(self.original_lt, {'foo': 1})
diff --git a/tensorflow/contrib/layers/BUILD b/tensorflow/contrib/layers/BUILD
index fc33e4b49e0147f10343354e1b1ddfa9bfd947c7..03af3771495fb3919062f1cafd3b757f87c2344d 100644
--- a/tensorflow/contrib/layers/BUILD
+++ b/tensorflow/contrib/layers/BUILD
@@ -18,35 +18,6 @@ load("//tensorflow:tensorflow.bzl", "tf_gen_op_libs")
 load("//tensorflow:tensorflow.bzl", "tf_gen_op_wrapper_py")
 load("//tensorflow:tensorflow.bzl", "tf_kernel_library")
 
-tf_custom_op_library(
-    # TODO(sibyl-Mooth6ku,ptucker): Understand why 'python/ops/_' is needed and fix it.
-    name = "python/ops/_bucketization_op.so",
-    srcs = [
-        "ops/bucketization_op.cc",
-    ],
-    deps = [
-        "//tensorflow/contrib/layers/kernels:bucketization_kernel",
-    ],
-)
-
-tf_gen_op_libs(
-    op_lib_names = ["bucketization_op"],
-)
-
-tf_gen_op_wrapper_py(
-    name = "bucketization_op",
-    deps = [":bucketization_op_op_lib"],
-)
-
-tf_kernel_library(
-    name = "bucketization_op_kernel",
-    deps = [
-        "//tensorflow/contrib/layers/kernels:bucketization_kernel",
-        "//tensorflow/core:framework",
-    ],
-    alwayslink = 1,
-)
-
 tf_custom_op_library(
     # TODO(sibyl-Mooth6ku,ptucker): Understand why 'python/ops/_' is needed and fix it.
     name = "python/ops/_sparse_feature_cross_op.so",
@@ -97,18 +68,14 @@ tf_custom_op_py_library(
         "python/ops/sparse_ops.py",
     ],
     dso = [
-        ":python/ops/_bucketization_op.so",
         ":python/ops/_sparse_feature_cross_op.so",
     ],
     kernels = [
-        ":bucketization_op_kernel",
         ":sparse_feature_cross_op_kernel",
-        ":bucketization_op_op_lib",
         ":sparse_feature_cross_op_op_lib",
     ],
     srcs_version = "PY2AND3",
     deps = [
-        ":bucketization_op",
         ":sparse_feature_cross_op",
         "//tensorflow/contrib/framework:framework_py",
         "//tensorflow/contrib/lookup:lookup_py",
@@ -141,6 +108,7 @@ tf_custom_op_py_library(
         "//tensorflow/python:util",
         "//tensorflow/python:variable_scope",
         "//tensorflow/python:variables",
+        "//tensorflow/python/feature_column",
         "@six_archive//:six",
     ],
 )
@@ -315,22 +283,6 @@ py_test(
     ],
 )
 
-py_test(
-    name = "bucketization_op_test",
-    size = "small",
-    srcs = ["python/kernel_tests/bucketization_op_test.py"],
-    srcs_version = "PY2AND3",
-    deps = [
-        ":layers_py",
-        "//tensorflow/python:client_testlib",
-        "//tensorflow/python:errors",
-        "//tensorflow/python:framework_for_generated_wrappers",
-        "//tensorflow/python:framework_test_lib",
-        "//tensorflow/python:platform_test",
-        "//third_party/py/numpy",
-    ],
-)
-
 py_test(
     name = "sparse_feature_cross_op_test",
     size = "medium",
@@ -352,6 +304,7 @@ py_test(
 py_test(
     name = "embedding_ops_test",
     size = "small",
+    timeout = "moderate",
     srcs = ["python/layers/embedding_ops_test.py"],
     srcs_version = "PY2AND3",
     deps = [
diff --git a/tensorflow/contrib/layers/README.md b/tensorflow/contrib/layers/README.md
index 7b374172f5c036fc99e37b05401c305b08814105..9310b194dff99529be6f1d5875bf242e810ef041 100644
--- a/tensorflow/contrib/layers/README.md
+++ b/tensorflow/contrib/layers/README.md
@@ -18,17 +18,14 @@ these arguments.
      …,
      weight_init=<DEFAULT>,
      bias_init=<DEFAULT>,
-     weight_collections=(tf.GraphKeys.WEIGHTS,),
-     bias_collections=(tf.GraphKeys.BIASES,),
-     output_collections=(tf.GraphKeys.ACTIVATIONS,),
      weight_regularizer=None,
      bias_regularizer=None,
      name=None) : Tensor`
 
 `x` is the input tensor.
 
-Weights, biases, and activations (i.e., outputs) are, by default, added to the specified collections. Weights and biases are also added to
-`tf.GraphKeys.GLOBAL_VARIABLES` and `tf.GraphKeys.TRAINABLE_VARIABLES`.
+Weights and biases are added to `tf.GraphKeys.GLOBAL_VARIABLES` and
+`tf.GraphKeys.TRAINABLE_VARIABLES`.
 
 ## optimizers.py
 
diff --git a/tensorflow/contrib/layers/kernels/BUILD b/tensorflow/contrib/layers/kernels/BUILD
index 7a2d6d8c4f714891e875db89b1cc770aa0b6b4db..15b984f93893b9da3a202129b7532c37338fb4d4 100644
--- a/tensorflow/contrib/layers/kernels/BUILD
+++ b/tensorflow/contrib/layers/kernels/BUILD
@@ -7,17 +7,6 @@ exports_files(["LICENSE"])
 
 package(default_visibility = ["//tensorflow:__subpackages__"])
 
-cc_library(
-    name = "bucketization_kernel",
-    srcs = ["bucketization_kernel.cc"],
-    deps = [
-        "//tensorflow/core:framework_headers_lib",
-        "//third_party/eigen3",
-        "@protobuf//:protobuf_headers",
-    ],
-    alwayslink = 1,
-)
-
 cc_library(
     name = "sparse_feature_cross_kernel",
     srcs = ["sparse_feature_cross_kernel.cc"],
diff --git a/tensorflow/contrib/layers/ops/bucketization_op.cc b/tensorflow/contrib/layers/ops/bucketization_op.cc
deleted file mode 100644
index d90d47a1eb4ecc2c0ba49506ead955eaf40fa81c..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/layers/ops/bucketization_op.cc
+++ /dev/null
@@ -1,46 +0,0 @@
-/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#include "tensorflow/core/framework/common_shape_fns.h"
-#include "tensorflow/core/framework/op.h"
-
-namespace tensorflow {
-
-REGISTER_OP("Bucketize")
-    .Input("input: T")
-    .Output("output: int32")
-    .Attr("T: {int32, int64, float, double}")
-    .Attr("boundaries: list(float)")
-    .SetShapeFn(shape_inference::UnchangedShape)
-    .Doc(R"doc(
-Bucketizes 'input' based on 'boundaries'.
-
-For example, if the inputs are
-    boundaries = [0, 10, 100]
-    input = [[-5, 10000]
-             [150,   10]
-             [5,    100]]
-
-then the output will be
-    output = [[0, 3]
-              [3, 2]
-              [1, 3]]
-
-input: Any shape of Tensor contains with int or float type.
-boundaries: A sorted list of floats gives the boundary of the buckets.
-output: Same shape with 'input', each value of input replaced with bucket index.
-
-)doc");
-}  // namespace tensorflow
diff --git a/tensorflow/contrib/layers/python/layers/embedding_ops.py b/tensorflow/contrib/layers/python/layers/embedding_ops.py
index f231ee38597561a6db19b8d1a23b38aaafb43ceb..f8f4122d1db4470701cd1d9599add842349943f4 100644
--- a/tensorflow/contrib/layers/python/layers/embedding_ops.py
+++ b/tensorflow/contrib/layers/python/layers/embedding_ops.py
@@ -100,7 +100,13 @@ def safe_embedding_lookup_sparse(embedding_weights,
     logging.warn("The default value of combiner will change from \"mean\" "
                  "to \"sqrtn\" after 2016/11/01.")
     combiner = "mean"
-  if embedding_weights is None or len(embedding_weights) < 1:
+  if embedding_weights is None:
+    raise ValueError("Missing embedding_weights %s." % embedding_weights)
+  if isinstance(embedding_weights, variables.PartitionedVariable):
+    embedding_weights = list(embedding_weights)  # get underlying Variables.
+  if not isinstance(embedding_weights, list):
+    embedding_weights = [embedding_weights]
+  if len(embedding_weights) < 1:
     raise ValueError("Missing embedding_weights %s." % embedding_weights)
 
   dtype = sparse_weights.dtype if sparse_weights is not None else None
diff --git a/tensorflow/contrib/layers/python/layers/encoders.py b/tensorflow/contrib/layers/python/layers/encoders.py
index 3afdbb18273ff1ac9b4cafab0d7e1b49b4bc15f1..89c9d37bd09cb6c43eebb91f3a16600eae9cb490 100644
--- a/tensorflow/contrib/layers/python/layers/encoders.py
+++ b/tensorflow/contrib/layers/python/layers/encoders.py
@@ -121,7 +121,7 @@ def embed_sequence(ids,
     `Tensor` of `[batch_size, doc_length, embed_dim]` with embedded sequences.
 
   Raises:
-    ValueError: if `embed_dim` or `vocab_size` are not specified when 
+    ValueError: if `embed_dim` or `vocab_size` are not specified when
       `reuse` is `None` or `False`.
   """
   if not (reuse or (vocab_size and embed_dim)):
diff --git a/tensorflow/contrib/layers/python/layers/feature_column.py b/tensorflow/contrib/layers/python/layers/feature_column.py
index d6d5bf2294f5f3517c1665da81c68c7064df41fd..e1a27335abe34041fbad3d59f0f336e088a10319 100644
--- a/tensorflow/contrib/layers/python/layers/feature_column.py
+++ b/tensorflow/contrib/layers/python/layers/feature_column.py
@@ -131,19 +131,27 @@ import math
 import six
 
 from tensorflow.contrib import lookup
+from tensorflow.contrib.framework.python.framework import checkpoint_utils
 from tensorflow.contrib.framework.python.framework import experimental
+from tensorflow.contrib.framework.python.ops import variables as contrib_variables
+from tensorflow.contrib.layers.python.layers import embedding_ops
 from tensorflow.contrib.layers.python.layers import layers
 from tensorflow.contrib.layers.python.ops import bucketization_op
 from tensorflow.contrib.layers.python.ops import sparse_feature_cross_op
 from tensorflow.contrib.layers.python.ops import sparse_ops as contrib_sparse_ops
+from tensorflow.python.feature_column import feature_column as fc_core
 from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
 from tensorflow.python.framework import sparse_tensor as sparse_tensor_py
+from tensorflow.python.framework import tensor_shape
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import init_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import parsing_ops
+from tensorflow.python.ops import resource_variable_ops
 from tensorflow.python.ops import sparse_ops
 from tensorflow.python.ops import string_ops
+from tensorflow.python.ops import variables
 from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.util import deprecation
 
@@ -289,11 +297,13 @@ class _FeatureColumn(object):
 
 
 # TODO(b/30410315): Support warm starting in all feature columns.
-class _SparseColumn(_FeatureColumn,
-                    collections.namedtuple("_SparseColumn",
-                                           ["column_name", "is_integerized",
-                                            "bucket_size", "lookup_config",
-                                            "combiner", "dtype"])):
+class _SparseColumn(
+    _FeatureColumn,
+    fc_core._CategoricalColumn,  # pylint: disable=protected-access
+    collections.namedtuple("_SparseColumn", [
+        "column_name", "is_integerized", "bucket_size", "lookup_config",
+        "combiner", "dtype"
+    ])):
   """Represents a sparse feature column also known as categorical features.
 
   Instances of this class are immutable. A sparse column means features are
@@ -424,9 +434,8 @@ class _SparseColumn(_FeatureColumn,
         initializer=init_ops.zeros_initializer(),
         combiner=self.combiner)
 
-  def _get_input_sparse_tensor(self, columns_to_tensors):
-    """Looks up the input tensor for transformation and sparsify it if dense."""
-    input_tensor = columns_to_tensors[self.name]
+  def _get_input_sparse_tensor(self, input_tensor):
+    """sparsify input_tensor if dense."""
     if not isinstance(input_tensor, sparse_tensor_py.SparseTensor):
       # To avoid making any assumptions about which values are to be ignored,
       # we set ignore_value to -1 for numeric tensors to avoid excluding valid
@@ -442,7 +451,7 @@ class _SparseColumn(_FeatureColumn,
     return input_tensor
 
   def is_compatible(self, other_column):
-    """Check compatability of two sparse columns."""
+    """Check compatibility of two sparse columns."""
     if self.lookup_config and other_column.lookup_config:
       return self.lookup_config == other_column.lookup_config
     compatible = (self.length == other_column.length and
@@ -453,18 +462,44 @@ class _SparseColumn(_FeatureColumn,
                    format(self.name, other_column.name))
     return compatible
 
-
-class _SparseColumnIntegerized(_SparseColumn):
-  """See `sparse_column_with_integerized_feature`."""
+  @abc.abstractmethod
+  def _do_transform(self, input_tensor):
+    pass
 
   def insert_transformed_feature(self, columns_to_tensors):
     """Handles sparse column to id conversion."""
-    input_tensor = self._get_input_sparse_tensor(columns_to_tensors)
+    input_tensor = self._get_input_sparse_tensor(columns_to_tensors[self.name])
+    columns_to_tensors[self] = self._do_transform(input_tensor)
+
+  def _transform_feature(self, inputs):
+    input_tensor = self._get_input_sparse_tensor(inputs.get(self.name))
+    return self._do_transform(input_tensor)
+
+  @property
+  def _parse_example_config(self):
+    return self.config
+
+  @property
+  def _num_buckets(self):
+    return self.length
+
+  def _get_sparse_tensors(self, inputs, weight_collections=None,
+                          trainable=None):
+    del weight_collections
+    del trainable
+    input_tensor = inputs.get(self)
+    return fc_core._CategoricalColumn.IdWeightPair(  # pylint: disable=protected-access
+        self.id_tensor(input_tensor), self.weight_tensor(input_tensor))
 
+
+class _SparseColumnIntegerized(_SparseColumn):
+  """See `sparse_column_with_integerized_feature`."""
+
+  def _do_transform(self, input_tensor):
     sparse_id_values = math_ops.mod(input_tensor.values, self.bucket_size,
                                     name="mod")
-    columns_to_tensors[self] = sparse_tensor_py.SparseTensor(
-        input_tensor.indices, sparse_id_values, input_tensor.dense_shape)
+    return sparse_tensor_py.SparseTensor(input_tensor.indices, sparse_id_values,
+                                         input_tensor.dense_shape)
 
 
 def sparse_column_with_integerized_feature(column_name,
@@ -515,10 +550,7 @@ def sparse_column_with_integerized_feature(column_name,
 class _SparseColumnHashed(_SparseColumn):
   """See `sparse_column_with_hash_bucket`."""
 
-  def insert_transformed_feature(self, columns_to_tensors):
-    """Handles sparse column to id conversion."""
-    input_tensor = self._get_input_sparse_tensor(columns_to_tensors)
-
+  def _do_transform(self, input_tensor):
     if self.dtype.is_integer:
       sparse_values = string_ops.as_string(input_tensor.values)
     else:
@@ -526,8 +558,8 @@ class _SparseColumnHashed(_SparseColumn):
 
     sparse_id_values = string_ops.string_to_hash_bucket_fast(
         sparse_values, self.bucket_size, name="lookup")
-    columns_to_tensors[self] = sparse_tensor_py.SparseTensor(
-        input_tensor.indices, sparse_id_values, input_tensor.dense_shape)
+    return sparse_tensor_py.SparseTensor(input_tensor.indices, sparse_id_values,
+                                         input_tensor.dense_shape)
 
 
 def sparse_column_with_hash_bucket(column_name,
@@ -570,16 +602,13 @@ def sparse_column_with_hash_bucket(column_name,
 class _SparseColumnKeys(_SparseColumn):
   """See `sparse_column_with_keys`."""
 
-  def insert_transformed_feature(self, columns_to_tensors):
-    """Handles sparse column to id conversion."""
-    input_tensor = self._get_input_sparse_tensor(columns_to_tensors)
-
+  def _do_transform(self, input_tensor):
     table = lookup.index_table_from_tensor(
         mapping=tuple(self.lookup_config.keys),
         default_value=self.lookup_config.default_value,
         dtype=self.dtype,
         name="lookup")
-    columns_to_tensors[self] = table.lookup(input_tensor)
+    return table.lookup(input_tensor)
 
 
 def sparse_column_with_keys(
@@ -619,9 +648,7 @@ def sparse_column_with_keys(
 class _SparseColumnVocabulary(_SparseColumn):
   """See `sparse_column_with_vocabulary_file`."""
 
-  def insert_transformed_feature(self, columns_to_tensors):
-    """Handles sparse column to id conversion."""
-    st = self._get_input_sparse_tensor(columns_to_tensors)
+  def _do_transform(self, st):
     if self.dtype.is_integer:
       sparse_string_values = string_ops.as_string(st.values)
       sparse_string_tensor = sparse_tensor_py.SparseTensor(st.indices,
@@ -636,7 +663,7 @@ class _SparseColumnVocabulary(_SparseColumn):
         vocab_size=self.lookup_config.vocab_size,
         default_value=self.lookup_config.default_value,
         name=self.name + "_lookup")
-    columns_to_tensors[self] = table.lookup(sparse_string_tensor)
+    return table.lookup(sparse_string_tensor)
 
 
 def sparse_column_with_vocabulary_file(column_name,
@@ -692,9 +719,12 @@ def sparse_column_with_vocabulary_file(column_name,
       dtype=dtype)
 
 
-class _WeightedSparseColumn(_FeatureColumn, collections.namedtuple(
-    "_WeightedSparseColumn",
-    ["sparse_id_column", "weight_column_name", "dtype"])):
+class _WeightedSparseColumn(
+    _FeatureColumn,
+    fc_core._CategoricalColumn,  # pylint: disable=protected-access
+    collections.namedtuple("_WeightedSparseColumn",
+                           ["sparse_id_column", "weight_column_name",
+                            "dtype"])):
   """See `weighted_sparse_column`."""
 
   def __new__(cls, sparse_id_column, weight_column_name, dtype):
@@ -723,22 +753,6 @@ class _WeightedSparseColumn(_FeatureColumn, collections.namedtuple(
     """Returns a string which will be used as a key when we do sorting."""
     return "{}".format(self)
 
-  def insert_transformed_feature(self, columns_to_tensors):
-    """Inserts a tuple with the id and weight tensors."""
-    if self.sparse_id_column not in columns_to_tensors:
-      self.sparse_id_column.insert_transformed_feature(columns_to_tensors)
-
-    weight_tensor = columns_to_tensors[self.weight_column_name]
-    if not isinstance(weight_tensor, sparse_tensor_py.SparseTensor):
-      # The weight tensor can be a regular Tensor. In such case, sparsify it.
-      weight_tensor = contrib_sparse_ops.dense_to_sparse_tensor(weight_tensor)
-    if not self.dtype.is_floating:
-      weight_tensor = math_ops.to_float(weight_tensor)
-    columns_to_tensors[self] = tuple([
-        columns_to_tensors[self.sparse_id_column],
-        weight_tensor
-    ])
-
   def id_tensor(self, input_tensor):
     """Returns the id tensor from the given transformed input_tensor."""
     return input_tensor[0]
@@ -766,6 +780,43 @@ class _WeightedSparseColumn(_FeatureColumn, collections.namedtuple(
         initializer=init_ops.zeros_initializer(),
         combiner=self.sparse_id_column.combiner)
 
+  def _do_transform(self, id_tensor, weight_tensor):
+    if not isinstance(weight_tensor, sparse_tensor_py.SparseTensor):
+      # The weight tensor can be a regular Tensor. In such case, sparsify it.
+      weight_tensor = contrib_sparse_ops.dense_to_sparse_tensor(weight_tensor)
+    if not self.dtype.is_floating:
+      weight_tensor = math_ops.to_float(weight_tensor)
+    return tuple([id_tensor, weight_tensor])
+
+  def insert_transformed_feature(self, columns_to_tensors):
+    """Inserts a tuple with the id and weight tensors."""
+    if self.sparse_id_column not in columns_to_tensors:
+      self.sparse_id_column.insert_transformed_feature(columns_to_tensors)
+
+    weight_tensor = columns_to_tensors[self.weight_column_name]
+    columns_to_tensors[self] = self._do_transform(
+        columns_to_tensors[self.sparse_id_column], weight_tensor)
+
+  def _transform_feature(self, inputs):
+    return self._do_transform(
+        inputs.get(self.sparse_id_column), inputs.get(self.weight_column_name))
+
+  @property
+  def _parse_example_config(self):
+    return self.config
+
+  @property
+  def _num_buckets(self):
+    return self.length
+
+  def _get_sparse_tensors(self, inputs, weight_collections=None,
+                          trainable=None):
+    del weight_collections
+    del trainable
+    input_tensor = inputs.get(self)
+    return fc_core._CategoricalColumn.IdWeightPair(  # pylint: disable=protected-access
+        self.id_tensor(input_tensor), self.weight_tensor(input_tensor))
+
 
 def weighted_sparse_column(sparse_id_column,
                            weight_column_name,
@@ -813,9 +864,10 @@ def weighted_sparse_column(sparse_id_column,
   return _WeightedSparseColumn(sparse_id_column, weight_column_name, dtype)
 
 
-class _OneHotColumn(_FeatureColumn,
-                    collections.namedtuple("_OneHotColumn",
-                                           ["sparse_id_column"])):
+class _OneHotColumn(
+    _FeatureColumn,
+    fc_core._DenseColumn,  # pylint: disable=protected-access
+    collections.namedtuple("_OneHotColumn", ["sparse_id_column"])):
   """Represents a one-hot column for use in deep networks.
 
   Args:
@@ -895,12 +947,31 @@ class _OneHotColumn(_FeatureColumn,
     return math_ops.reduce_sum(
         one_hot_id_tensor, reduction_indices=[output_rank - 1])
 
+  @property
+  def _variable_shape(self):
+    return tensor_shape.TensorShape([self.length])
+
+  def _get_dense_tensor(self, inputs, weight_collections=None, trainable=None):
+    del weight_collections
+    del trainable
+    return inputs.get(self)
 
-class _EmbeddingColumn(_FeatureColumn, collections.namedtuple(
-    "_EmbeddingColumn",
-    ["sparse_id_column", "dimension", "combiner", "initializer",
-     "ckpt_to_load_from", "tensor_name_in_ckpt", "shared_embedding_name",
-     "shared_vocab_size", "max_norm", "trainable"])):
+  def _transform_feature(self, inputs):
+    return self._to_dnn_input_layer(inputs.get(self.sparse_id_column))
+
+  @property
+  def _parse_example_config(self):
+    return self.config
+
+
+class _EmbeddingColumn(
+    _FeatureColumn,
+    fc_core._DenseColumn,  # pylint: disable=protected-access
+    collections.namedtuple("_EmbeddingColumn", [
+        "sparse_id_column", "dimension", "combiner", "initializer",
+        "ckpt_to_load_from", "tensor_name_in_ckpt", "shared_embedding_name",
+        "shared_vocab_size", "max_norm", "trainable"
+    ])):
   """Represents an embedding column.
 
   Args:
@@ -1025,6 +1096,139 @@ class _EmbeddingColumn(_FeatureColumn, collections.namedtuple(
     raise ValueError("Column {} is not supported in linear models. "
                      "Please use sparse_column.".format(self))
 
+  @property
+  def _variable_shape(self):
+    return tensor_shape.TensorShape([self.dimension])
+
+  def _get_dense_tensor(self, inputs, weight_collections=None, trainable=None):
+    return _embeddings_from_arguments(
+        self,
+        self._deep_embedding_lookup_arguments(inputs.get(self)),
+        weight_collections, trainable)
+
+  def _transform_feature(self, inputs):
+    return inputs.get(self.sparse_id_column)
+
+  @property
+  def _parse_example_config(self):
+    return self.config
+
+
+def _is_variable(v):
+  """Returns true if `v` is a variable."""
+  return isinstance(v, (variables.Variable,
+                        resource_variable_ops.ResourceVariable))
+
+
+def _embeddings_from_arguments(column,
+                               args,
+                               weight_collections,
+                               trainable,
+                               output_rank=2):
+  """Returns embeddings for a column based on the computed arguments.
+
+  Args:
+   column: the column name.
+   args: the _DeepEmbeddingLookupArguments for this column.
+   weight_collections: collections to store weights in.
+   trainable: whether these embeddings should be trainable.
+   output_rank: the desired rank of the returned `Tensor`. Inner dimensions will
+     be combined to produce the desired rank.
+
+  Returns:
+   the embeddings.
+
+  Raises:
+   ValueError: if not possible to create.
+  """
+  # pylint: disable=protected-access
+  input_tensor = layers._inner_flatten(args.input_tensor, output_rank)
+  weight_tensor = None
+  if args.weight_tensor is not None:
+    weight_tensor = layers._inner_flatten(args.weight_tensor, output_rank)
+  # pylint: enable=protected-access
+
+  # This option is only enabled for scattered_embedding_column.
+  if args.hash_key:
+    embeddings = contrib_variables.model_variable(
+        name="weights",
+        shape=[args.vocab_size],
+        dtype=dtypes.float32,
+        initializer=args.initializer,
+        trainable=(trainable and args.trainable),
+        collections=weight_collections)
+
+    return embedding_ops.scattered_embedding_lookup_sparse(
+        embeddings,
+        input_tensor,
+        args.dimension,
+        hash_key=args.hash_key,
+        combiner=args.combiner,
+        name="lookup")
+
+  if args.shared_embedding_name is not None:
+    shared_embedding_collection_name = (
+        "SHARED_EMBEDDING_COLLECTION_" + args.shared_embedding_name.upper())
+    graph = ops.get_default_graph()
+    shared_embedding_collection = (
+        graph.get_collection_ref(shared_embedding_collection_name))
+    shape = [args.vocab_size, args.dimension]
+    if shared_embedding_collection:
+      if len(shared_embedding_collection) > 1:
+        raise ValueError(
+            "Collection %s can only contain one "
+            "(partitioned) variable." % shared_embedding_collection_name)
+      else:
+        embeddings = shared_embedding_collection[0]
+        if embeddings.get_shape() != shape:
+          raise ValueError(
+              "The embedding variable with name {} already "
+              "exists, but its shape does not match required "
+              "embedding shape  here. Please make sure to use "
+              "different shared_embedding_name for different "
+              "shared embeddings.".format(args.shared_embedding_name))
+    else:
+      embeddings = contrib_variables.model_variable(
+          name=args.shared_embedding_name,
+          shape=shape,
+          dtype=dtypes.float32,
+          initializer=args.initializer,
+          trainable=(trainable and args.trainable),
+          collections=weight_collections)
+      graph.add_to_collection(shared_embedding_collection_name, embeddings)
+  else:
+    embeddings = contrib_variables.model_variable(
+        name="weights",
+        shape=[args.vocab_size, args.dimension],
+        dtype=dtypes.float32,
+        initializer=args.initializer,
+        trainable=(trainable and args.trainable),
+        collections=weight_collections)
+
+  if _is_variable(embeddings):
+    embeddings = [embeddings]
+  else:
+    embeddings = embeddings._get_variable_list()  # pylint: disable=protected-access
+  # pylint: disable=protected-access
+  _maybe_restore_from_checkpoint(column._checkpoint_path(), embeddings)
+  return embedding_ops.safe_embedding_lookup_sparse(
+      embeddings,
+      input_tensor,
+      sparse_weights=weight_tensor,
+      combiner=args.combiner,
+      name=column.name + "weights",
+      max_norm=args.max_norm)
+
+
+def _maybe_restore_from_checkpoint(checkpoint_path, variable):
+  if checkpoint_path is not None:
+    path, tensor_name = checkpoint_path
+    weights_to_restore = variable
+    if len(variable) == 1:
+      weights_to_restore = variable[0]
+    checkpoint_utils.init_from_checkpoint(path,
+                                          {tensor_name: weights_to_restore})
+
 
 def one_hot_column(sparse_id_column):
   """Creates an `_OneHotColumn` for a one-hot or multi-hot repr in a DNN.
@@ -1192,10 +1396,11 @@ def shared_embedding_columns(sparse_id_columns,
 
 class _ScatteredEmbeddingColumn(
     _FeatureColumn,
-    collections.namedtuple(
-        "_ScatteredEmbeddingColumn",
-        ["column_name", "size", "dimension", "hash_key", "combiner",
-         "initializer"])):
+    fc_core._DenseColumn,  # pylint: disable=protected-access
+    collections.namedtuple("_ScatteredEmbeddingColumn", [
+        "column_name", "size", "dimension", "hash_key", "combiner",
+        "initializer"
+    ])):
   """See `scattered_embedding_column`."""
 
   def __new__(cls,
@@ -1248,6 +1453,23 @@ class _ScatteredEmbeddingColumn(
         max_norm=None,
         trainable=True)
 
+  @property
+  def _variable_shape(self):
+    return tensor_shape.TensorShape([self.dimension])
+
+  def _get_dense_tensor(self, inputs, weight_collections=None, trainable=None):
+    return _embeddings_from_arguments(
+        self,
+        self._deep_embedding_lookup_arguments(inputs.get(self)),
+        weight_collections, trainable)
+
+  def _transform_feature(self, inputs):
+    return inputs.get(self.column_name)
+
+  @property
+  def _parse_example_config(self):
+    return self.config
+
 
 def scattered_embedding_column(column_name,
                                size,
@@ -1427,6 +1649,15 @@ class _RealValuedVarLenColumn(_FeatureColumn, collections.namedtuple(
     input_tensor = self._normalized_input_tensor(columns_to_tensors[self.name])
     columns_to_tensors[self] = math_ops.to_float(input_tensor)
 
+  # pylint: disable=unused-argument
+  def _to_dnn_input_layer(self,
+                          input_tensor,
+                          weight_collections=None,
+                          trainable=True,
+                          output_rank=2):
+    return _reshape_real_valued_tensor(
+        self._to_dense_tensor(input_tensor), output_rank, self.name)
+
   def _to_dense_tensor(self, input_tensor):
     if not self.is_sparse:
       return input_tensor
@@ -1497,9 +1728,12 @@ def _real_valued_var_len_column(column_name,
                                  is_sparse)
 
 
-class _RealValuedColumn(_FeatureColumn, collections.namedtuple(
-    "_RealValuedColumn",
-    ["column_name", "dimension", "default_value", "dtype", "normalizer"])):
+class _RealValuedColumn(
+    _FeatureColumn,
+    fc_core._DenseColumn,  # pylint: disable=protected-access
+    collections.namedtuple(
+        "_RealValuedColumn",
+        ["column_name", "dimension", "default_value", "dtype", "normalizer"])):
   """Represents a real valued feature column also known as continuous features.
 
   Instances of this class are immutable. The dictionary returned by InputBuilder
@@ -1569,6 +1803,23 @@ class _RealValuedColumn(_FeatureColumn, collections.namedtuple(
   def _to_dense_tensor(self, input_tensor):
     return input_tensor
 
+  @property
+  def _variable_shape(self):
+    return tensor_shape.TensorShape([self.dimension])
+
+  def _get_dense_tensor(self, inputs, weight_collections=None, trainable=None):
+    del weight_collections
+    del trainable
+    return inputs.get(self)
+
+  def _transform_feature(self, inputs):
+    return math_ops.to_float(
+        self._normalized_input_tensor(inputs.get(self.name)))
+
+  @property
+  def _parse_example_config(self):
+    return self.config
+
 
 def real_valued_column(column_name,
                        dimension=1,
@@ -1684,8 +1935,12 @@ def real_valued_column(column_name,
                       default_value, dtype, column_name))
 
 
-class _BucketizedColumn(_FeatureColumn, collections.namedtuple(
-    "_BucketizedColumn", ["source_column", "boundaries"])):
+class _BucketizedColumn(
+    _FeatureColumn,
+    fc_core._CategoricalColumn,  # pylint: disable=protected-access
+    fc_core._DenseColumn,  # pylint: disable=protected-access
+    collections.namedtuple("_BucketizedColumn", ["source_column",
+                                                 "boundaries"])):
   """Represents a bucketization transformation also known as binning.
 
   Instances of this class are immutable. Values in `source_column` will be
@@ -1755,15 +2010,6 @@ class _BucketizedColumn(_FeatureColumn, collections.namedtuple(
     """Returns a string which will be used as a key when we do sorting."""
     return "{}".format(self)
 
-  def insert_transformed_feature(self, columns_to_tensors):
-    # Bucketize the source column.
-    if self.source_column not in columns_to_tensors:
-      self.source_column.insert_transformed_feature(columns_to_tensors)
-    columns_to_tensors[self] = bucketization_op.bucketize(
-        columns_to_tensors[self.source_column],
-        boundaries=list(self.boundaries),
-        name="bucketize")
-
   # pylint: disable=unused-argument
   def _to_dnn_input_layer(self,
                           input_tensor,
@@ -1821,6 +2067,43 @@ class _BucketizedColumn(_FeatureColumn, collections.namedtuple(
         initializer=init_ops.zeros_initializer(),
         combiner="sum")
 
+  def _transform_feature(self, inputs):
+    """Handles cross transformation."""
+    # Bucketize the source column.
+    return bucketization_op.bucketize(
+        inputs.get(self.source_column),
+        boundaries=list(self.boundaries),
+        name="bucketize")
+
+  def insert_transformed_feature(self, columns_to_tensors):
+    """Handles sparse column to id conversion."""
+    columns_to_tensors[self] = self._transform_feature(
+        _LazyBuilderByColumnsToTensor(columns_to_tensors))
+
+  @property
+  def _parse_example_config(self):
+    return self.config
+
+  @property
+  def _num_buckets(self):
+    return self.length * self.source_column.dimension
+
+  def _get_sparse_tensors(self, inputs, weight_collections=None,
+                          trainable=None):
+    del weight_collections
+    del trainable
+    return fc_core._CategoricalColumn.IdWeightPair(  # pylint: disable=protected-access
+        self.to_sparse_tensor(inputs.get(self)), None)
+
+  @property
+  def _variable_shape(self):
+    return tensor_shape.TensorShape(
+        [self.length * self.source_column.dimension])
+
+  def _get_dense_tensor(self, inputs, weight_collections=None, trainable=None):
+    return self._to_dnn_input_layer(
+        inputs.get(self), weight_collections, trainable)
+
 
 def bucketized_column(source_column, boundaries):
   """Creates a _BucketizedColumn for discretizing dense input.
@@ -1839,13 +2122,14 @@ def bucketized_column(source_column, boundaries):
   return _BucketizedColumn(source_column, boundaries)
 
 
-class _CrossedColumn(_FeatureColumn,
-                     collections.namedtuple("_CrossedColumn",
-                                            ["columns", "hash_bucket_size",
-                                             "hash_key",
-                                             "combiner", "ckpt_to_load_from",
-                                             "tensor_name_in_ckpt"])):
-  """Represents a cross transformation also known as conjuction or combination.
+class _CrossedColumn(
+    _FeatureColumn,
+    fc_core._CategoricalColumn,  # pylint: disable=protected-access
+    collections.namedtuple("_CrossedColumn", [
+        "columns", "hash_bucket_size", "hash_key", "combiner",
+        "ckpt_to_load_from", "tensor_name_in_ckpt"
+    ])):
+  """Represents a cross transformation also known as conjunction or combination.
 
   Instances of this class are immutable. It crosses given `columns`. Crossed
   column output will be hashed to hash_bucket_size.
@@ -1962,12 +2246,37 @@ class _CrossedColumn(_FeatureColumn,
     """Returns the id tensor from the given transformed input_tensor."""
     return input_tensor
 
-  # pylint: disable=unused-argument
   def weight_tensor(self, input_tensor):
     """Returns the weight tensor from the given transformed input_tensor."""
+    del input_tensor
     return None
 
-  def insert_transformed_feature(self, columns_to_tensors):
+  def _to_dnn_input_layer(self,
+                          input_tensor,
+                          weight_collections=None,
+                          trainable=True,
+                          output_rank=2):
+    del input_tensor
+    del weight_collections
+    del trainable
+    del output_rank
+    raise ValueError("CrossedColumn is not supported in DNN. "
+                     "Please use embedding_column. column: {}".format(self))
+
+  def _checkpoint_path(self):
+    if self.ckpt_to_load_from is not None:
+      return self.ckpt_to_load_from, self.tensor_name_in_ckpt
+    return None
+
+  def _wide_embedding_lookup_arguments(self, input_tensor):
+    return _LinearEmbeddingLookupArguments(
+        input_tensor=input_tensor,
+        weight_tensor=None,
+        vocab_size=self.length,
+        initializer=init_ops.zeros_initializer(),
+        combiner=self.combiner)
+
+  def _transform_feature(self, inputs):
     """Handles cross transformation."""
 
     def _collect_leaf_level_columns(cross):
@@ -1983,42 +2292,57 @@ class _CrossedColumn(_FeatureColumn,
     feature_tensors = []
     for c in _collect_leaf_level_columns(self):
       if isinstance(c, _SparseColumn):
-        feature_tensors.append(columns_to_tensors[c.name])
+        feature_tensors.append(inputs.get(c.name))
       else:
-        if c not in columns_to_tensors:
-          c.insert_transformed_feature(columns_to_tensors)
         if isinstance(c, _BucketizedColumn):
-          feature_tensors.append(c.to_sparse_tensor(columns_to_tensors[c]))
+          feature_tensors.append(c.to_sparse_tensor(inputs.get(c)))
         else:
-          feature_tensors.append(columns_to_tensors[c])
-    columns_to_tensors[self] = sparse_feature_cross_op.sparse_feature_cross(
+          feature_tensors.append(inputs.get(c))
+    return sparse_feature_cross_op.sparse_feature_cross(
         feature_tensors,
         hashed_output=True,
         num_buckets=self.hash_bucket_size,
         hash_key=self.hash_key,
         name="cross")
 
-  # pylint: disable=unused-argument
-  def _to_dnn_input_layer(self,
-                          input_tensor,
-                          weight_collections=None,
-                          trainable=True,
-                          output_rank=2):
-    raise ValueError("CrossedColumn is not supported in DNN. "
-                     "Please use embedding_column. column: {}".format(self))
+  def insert_transformed_feature(self, columns_to_tensors):
+    """Handles sparse column to id conversion."""
+    columns_to_tensors[self] = self._transform_feature(
+        _LazyBuilderByColumnsToTensor(columns_to_tensors))
 
-  def _checkpoint_path(self):
-    if self.ckpt_to_load_from is not None:
-      return self.ckpt_to_load_from, self.tensor_name_in_ckpt
-    return None
+  @property
+  def _parse_example_config(self):
+    return self.config
 
-  def _wide_embedding_lookup_arguments(self, input_tensor):
-    return _LinearEmbeddingLookupArguments(
-        input_tensor=input_tensor,
-        weight_tensor=None,
-        vocab_size=self.length,
-        initializer=init_ops.zeros_initializer(),
-        combiner=self.combiner)
+  @property
+  def _num_buckets(self):
+    return self.length
+
+  def _get_sparse_tensors(self, inputs, weight_collections=None,
+                          trainable=None):
+    del weight_collections
+    del trainable
+    return fc_core._CategoricalColumn.IdWeightPair(inputs.get(self), None)  # pylint: disable=protected-access
+
+
+class _LazyBuilderByColumnsToTensor(object):
+
+  def __init__(self, columns_to_tensors):
+    self._columns_to_tensors = columns_to_tensors
+
+  def get(self, key):
+    """Gets the transformed feature column."""
+    if key in self._columns_to_tensors:
+      return self._columns_to_tensors[key]
+    if isinstance(key, str):
+      raise ValueError(
+          "features dictionary doesn't contain key ({})".format(key))
+    if not isinstance(key, _FeatureColumn):
+      raise TypeError('"key" must be either a "str" or "_FeatureColumn". '
+                      "Provided: {}".format(key))
+
+    key.insert_transformed_feature(self._columns_to_tensors)
+    return self._columns_to_tensors[key]
 
 
 def crossed_column(columns, hash_bucket_size, combiner="sum",
diff --git a/tensorflow/contrib/layers/python/layers/feature_column_ops.py b/tensorflow/contrib/layers/python/layers/feature_column_ops.py
index 31aca87002a3b79d69947ae7fc02e83331476dee..fa0047f05d893f6543ddb1680824a32469e13293 100644
--- a/tensorflow/contrib/layers/python/layers/feature_column_ops.py
+++ b/tensorflow/contrib/layers/python/layers/feature_column_ops.py
@@ -20,7 +20,6 @@ from __future__ import print_function
 
 import functools
 
-from tensorflow.contrib.framework.python.framework import checkpoint_utils
 from tensorflow.contrib.framework.python.framework import experimental
 from tensorflow.contrib.framework.python.ops import variables as contrib_variables
 from tensorflow.contrib.layers.python.layers import embedding_ops
@@ -34,118 +33,12 @@ from tensorflow.python.ops import init_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import nn_ops
 from tensorflow.python.ops import parsing_ops
-from tensorflow.python.ops import resource_variable_ops
 from tensorflow.python.ops import sparse_ops
 from tensorflow.python.ops import variable_scope
-from tensorflow.python.ops import variables
 from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.util import nest
 
 
-def _is_variable(v):
-  """Returns true if `v` is a variable."""
-  return isinstance(v, (variables.Variable,
-                        resource_variable_ops.ResourceVariable))
-
-
-def _embeddings_from_arguments(column,
-                               args,
-                               weight_collections,
-                               trainable,
-                               output_rank=2):
-  """Returns embeddings for a column based on the computed arguments.
-
-  Args:
-   column: the column name.
-   args: the _DeepEmbeddingLookupArguments for this column.
-   weight_collections: collections to store weights in.
-   trainable: whether these embeddings should be trainable.
-   output_rank: the desired rank of the returned `Tensor`. Inner dimensions will
-     be combined to produce the desired rank.
-
-  Returns:
-   the embeddings.
-
-  Raises:
-   ValueError: if not possible to create.
-  """
-  # pylint: disable=protected-access
-  input_tensor = layers._inner_flatten(args.input_tensor, output_rank)
-  weight_tensor = None
-  if args.weight_tensor is not None:
-    weight_tensor = layers._inner_flatten(args.weight_tensor, output_rank)
-  # pylint: enable=protected-access
-
-  # This option is only enabled for scattered_embedding_column.
-  if args.hash_key:
-    embeddings = contrib_variables.model_variable(
-        name='weights',
-        shape=[args.vocab_size],
-        dtype=dtypes.float32,
-        initializer=args.initializer,
-        trainable=(trainable and args.trainable),
-        collections=weight_collections)
-
-    return embedding_ops.scattered_embedding_lookup_sparse(
-        embeddings, input_tensor, args.dimension,
-        hash_key=args.hash_key,
-        combiner=args.combiner, name='lookup')
-
-  if args.shared_embedding_name is not None:
-    shared_embedding_collection_name = (
-        'SHARED_EMBEDDING_COLLECTION_' + args.shared_embedding_name.upper())
-    graph = ops.get_default_graph()
-    shared_embedding_collection = (
-        graph.get_collection_ref(shared_embedding_collection_name))
-    shape = [args.vocab_size, args.dimension]
-    if shared_embedding_collection:
-      if len(shared_embedding_collection) > 1:
-        raise ValueError('Collection %s can only contain one '
-                         '(partitioned) variable.'
-                         % shared_embedding_collection_name)
-      else:
-        embeddings = shared_embedding_collection[0]
-        if embeddings.get_shape() != shape:
-          raise ValueError('The embedding variable with name {} already '
-                           'exists, but its shape does not match required '
-                           'embedding shape  here. Please make sure to use '
-                           'different shared_embedding_name for different '
-                           'shared embeddings.'.format(
-                               args.shared_embedding_name))
-    else:
-      embeddings = contrib_variables.model_variable(
-          name=args.shared_embedding_name,
-          shape=shape,
-          dtype=dtypes.float32,
-          initializer=args.initializer,
-          trainable=(trainable and args.trainable),
-          collections=weight_collections)
-      graph.add_to_collection(shared_embedding_collection_name, embeddings)
-  else:
-    embeddings = contrib_variables.model_variable(
-        name='weights',
-        shape=[args.vocab_size, args.dimension],
-        dtype=dtypes.float32,
-        initializer=args.initializer,
-        trainable=(trainable and args.trainable),
-        collections=weight_collections)
-
-  if _is_variable(embeddings):
-    embeddings = [embeddings]
-  else:
-    embeddings = embeddings._get_variable_list()  # pylint: disable=protected-access
-  # pylint: disable=protected-access
-  _maybe_restore_from_checkpoint(
-      column._checkpoint_path(), embeddings)
-  return embedding_ops.safe_embedding_lookup_sparse(
-      embeddings,
-      input_tensor,
-      sparse_weights=weight_tensor,
-      combiner=args.combiner,
-      name=column.name + 'weights',
-      max_norm=args.max_norm)
-
-
 def _maybe_reshape_input_tensor(tensor, column_name, output_rank):
   """Reshape the input tensor by the following rule.
 
@@ -232,12 +125,13 @@ def _input_from_feature_columns(columns_to_tensors,
           # pylint: disable=protected-access
           arguments = column._deep_embedding_lookup_arguments(
               transformed_tensor)
-          output_tensors.append(_embeddings_from_arguments(
-              column,
-              arguments,
-              weight_collections,
-              trainable,
-              output_rank=output_rank))
+          output_tensors.append(
+              fc._embeddings_from_arguments(  # pylint: disable=protected-access
+                  column,
+                  arguments,
+                  weight_collections,
+                  trainable,
+                  output_rank=output_rank))
 
         except NotImplementedError as ee:
           try:
@@ -393,7 +287,7 @@ def _create_embedding_lookup(column,
         initializer=embedding_lookup_arguments.initializer,
         trainable=trainable,
         collections=weight_collections)
-    if _is_variable(variable):
+    if fc._is_variable(variable):  # pylint: disable=protected-access
       variable = [variable]
     else:
       variable = variable._get_variable_list()  # pylint: disable=protected-access
@@ -406,16 +300,6 @@ def _create_embedding_lookup(column,
     return variable, predictions
 
 
-def _maybe_restore_from_checkpoint(checkpoint_path, variable):
-  if checkpoint_path is not None:
-    path, tensor_name = checkpoint_path
-    weights_to_restore = variable
-    if len(variable) == 1:
-      weights_to_restore = variable[0]
-    checkpoint_utils.init_from_checkpoint(path,
-                                          {tensor_name: weights_to_restore})
-
-
 def _create_joint_embedding_lookup(columns_to_tensors,
                                    embedding_lookup_arguments,
                                    num_outputs,
@@ -451,7 +335,7 @@ def _create_joint_embedding_lookup(columns_to_tensors,
         initializer=init_ops.zeros_initializer(),
         trainable=trainable,
         collections=weight_collections)
-    if _is_variable(variable):
+    if fc._is_variable(variable):  # pylint: disable=protected-access
       variable = [variable]
     else:
       variable = variable._get_variable_list()  # pylint: disable=protected-access
@@ -634,7 +518,7 @@ def weighted_sum_from_feature_columns(columns_to_tensors,
           predictions, shape=(-1, num_outputs)))
       column_to_variable[column] = variable
       _log_variable(variable)
-      _maybe_restore_from_checkpoint(column._checkpoint_path(), variable)
+      fc._maybe_restore_from_checkpoint(column._checkpoint_path(), variable)  # pylint: disable=protected-access
     # pylint: enable=protected-access
     predictions_no_bias = math_ops.add_n(output_tensors)
     bias = contrib_variables.model_variable(
@@ -827,10 +711,10 @@ def parse_feature_columns_from_sequence_examples(
 def _log_variable(variable):
   if isinstance(variable, list):
     for var in variable:
-      if _is_variable(variable):
+      if fc._is_variable(variable):  # pylint: disable=protected-access
         logging.info('Created variable %s, with device=%s', var.name,
                      var.device)
-  elif _is_variable(variable):
+  elif fc._is_variable(variable):  # pylint: disable=protected-access
     logging.info('Created variable %s, with device=%s', variable.name,
                  variable.device)
 
@@ -972,7 +856,8 @@ def _add_variable_collection(weight_collections):
 # pylint: disable=protected-access
 _SUPPORTED_SEQUENCE_COLUMNS = (fc._OneHotColumn,
                                fc._EmbeddingColumn,
-                               fc._RealValuedColumn)
+                               fc._RealValuedColumn,
+                               fc._RealValuedVarLenColumn)
 
 _FORBIDDEN_SEQUENCE_COLUMNS = (fc._ScatteredEmbeddingColumn,
                                fc._BucketizedColumn,
diff --git a/tensorflow/contrib/layers/python/layers/feature_column_ops_test.py b/tensorflow/contrib/layers/python/layers/feature_column_ops_test.py
index 632836fee440b53e78b2728abce6e971a9273925..797a7c11dbf9f3d044687e93ba9f9fe4df3a8357 100644
--- a/tensorflow/contrib/layers/python/layers/feature_column_ops_test.py
+++ b/tensorflow/contrib/layers/python/layers/feature_column_ops_test.py
@@ -27,14 +27,15 @@ from tensorflow.contrib.layers.python.layers import feature_column
 from tensorflow.contrib.layers.python.layers import feature_column_ops
 from tensorflow.core.example import example_pb2
 from tensorflow.core.example import feature_pb2
+from tensorflow.python.feature_column import feature_column as fc_core
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import sparse_tensor
 from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import data_flow_ops
 from tensorflow.python.ops import gradients_impl
 from tensorflow.python.ops import init_ops
+from tensorflow.python.ops import lookup_ops
 from tensorflow.python.ops import partitioned_variables
 from tensorflow.python.ops import random_ops
 from tensorflow.python.ops import variable_scope
@@ -223,7 +224,7 @@ class TransformerTest(test.TestCase):
     self.assertEqual(len(output), 1)
     self.assertIn(keys_sparse, output)
     with self.test_session():
-      data_flow_ops.tables_initializer().run()
+      lookup_ops.tables_initializer().run()
       self.assertEqual(output[keys_sparse].values.dtype, dtypes.int64)
       self.assertAllEqual(output[keys_sparse].values.eval(), [1, 2, 0])
       self.assertAllEqual(output[keys_sparse].indices.eval(),
@@ -241,7 +242,7 @@ class TransformerTest(test.TestCase):
     output = feature_column_ops._Transformer(features).transform(keys_sparse)
 
     with self.test_session():
-      data_flow_ops.tables_initializer().run()
+      lookup_ops.tables_initializer().run()
       # While the input is a dense Tensor, the output should be a SparseTensor.
       self.assertIsInstance(output, sparse_tensor.SparseTensor)
       self.assertEqual(output.dtype, dtypes.int64)
@@ -310,7 +311,7 @@ class TransformerTest(test.TestCase):
     self.assertIn(weighted_ids, output)
 
     with self.test_session():
-      data_flow_ops.tables_initializer().run()
+      lookup_ops.tables_initializer().run()
       self.assertAllEqual(output[weighted_ids][0].dense_shape.eval(),
                           ids_tensor.dense_shape.eval())
       self.assertAllEqual(output[weighted_ids][0].indices.eval(),
@@ -340,7 +341,7 @@ class TransformerTest(test.TestCase):
     self.assertEqual(len(output), 1)
     self.assertIn(vocab_sparse, output)
     with self.test_session():
-      data_flow_ops.tables_initializer().run()
+      lookup_ops.tables_initializer().run()
       self.assertEqual(output[vocab_sparse].values.dtype, dtypes.int64)
       self.assertAllEqual(output[vocab_sparse].values.eval(), [1, 2, 0])
       self.assertAllEqual(output[vocab_sparse].indices.eval(),
@@ -362,7 +363,7 @@ class TransformerTest(test.TestCase):
     self.assertEqual(len(output), 1)
     self.assertIn(vocab_sparse, output)
     with self.test_session():
-      data_flow_ops.tables_initializer().run()
+      lookup_ops.tables_initializer().run()
       self.assertEqual(output[vocab_sparse].values.dtype, dtypes.int64)
       self.assertAllEqual(output[vocab_sparse].values.eval(), [1, 2, 0, 1])
       self.assertAllEqual(output[vocab_sparse].indices.eval(),
@@ -386,7 +387,7 @@ class TransformerTest(test.TestCase):
     self.assertEqual(len(output), 1)
     self.assertIn(vocab_sparse, output)
     with self.test_session():
-      data_flow_ops.tables_initializer().run()
+      lookup_ops.tables_initializer().run()
       self.assertEqual(output[vocab_sparse].values.dtype, dtypes.int64)
       self.assertAllEqual(output[vocab_sparse].values.eval(), [1, 2, 0])
       self.assertAllEqual(output[vocab_sparse].indices.eval(),
@@ -408,7 +409,7 @@ class TransformerTest(test.TestCase):
     self.assertEqual(len(output), 1)
     self.assertIn(vocab_sparse, output)
     with self.test_session():
-      data_flow_ops.tables_initializer().run()
+      lookup_ops.tables_initializer().run()
       self.assertEqual(output[vocab_sparse].values.dtype, dtypes.int64)
       self.assertAllEqual(output[vocab_sparse].values.eval(), [1, 2, 0, 1])
       self.assertAllEqual(output[vocab_sparse].indices.eval(),
@@ -596,12 +597,15 @@ class CreateInputLayersForDNNsTest(test.TestCase):
         "income":
             constant_op.constant([[20.3, 10], [110.3, 0.4], [-3.0, 30.4]]),
     }
-    output = feature_column_ops.input_from_feature_columns(features, [
-        one_hot_column, embedding_column, real_valued_column])
+    columns = [one_hot_column, embedding_column, real_valued_column]
+    output = feature_column_ops.input_from_feature_columns(features, columns)
+    output_core = fc_core.input_layer(features, columns)
     with self.test_session():
       variables_lib.global_variables_initializer().run()
-      data_flow_ops.tables_initializer().run()
+      lookup_ops.tables_initializer().run()
       self.assertAllEqual(output.eval().shape, [3, 2 + 4 + 10])
+      # Verify cross compatibility: Core builder output should equal to contrib.
+      self.assertAllEqual(output.eval().shape, output_core.eval().shape)
 
   def testRealValuedColumn(self):
     real_valued = feature_column.real_valued_column("price")
@@ -610,6 +614,9 @@ class CreateInputLayersForDNNsTest(test.TestCase):
                                                            [real_valued])
     with self.test_session():
       self.assertAllClose(output.eval(), features["price"].eval())
+      # Verify cross compatibility: Core builder output should equal to contrib.
+      self.assertAllClose(output.eval(),
+                          fc_core.input_layer(features, [real_valued]).eval())
 
   def testRealValuedColumnWithMultiDimensions(self):
     real_valued = feature_column.real_valued_column("price", 2)
@@ -620,17 +627,31 @@ class CreateInputLayersForDNNsTest(test.TestCase):
                                                            [real_valued])
     with self.test_session():
       self.assertAllClose(output.eval(), features["price"].eval())
+      # Verify cross compatibility: Core builder output should equal to contrib.
+      self.assertAllClose(output.eval(),
+                          fc_core.input_layer(features, [real_valued]).eval())
 
-  def testRealValuedColumnSparse(self):
-    sparse_real_valued = feature_column._real_valued_var_len_column(
+  def testRealValuedColumnDense(self):
+    var_len_real_valued = feature_column._real_valued_var_len_column(
         "rating", default_value=-1)
-    rating = [[2.0], [-1.0], [5.0]]
+    rating = np.array([[0., 1., 2., -1.],
+                       [3., 4., 5., 6.]])
     features = {"rating": constant_op.constant(rating)}
-    with self.assertRaisesRegexp(
-        ValueError,
-        "Error creating input layer for column: rating.*"):
-      feature_column_ops.input_from_feature_columns(features,
-                                                    [sparse_real_valued])
+    with self.test_session() as sess:
+      output = sess.run(feature_column_ops.input_from_feature_columns(
+          features, [var_len_real_valued]))
+    self.assertAllClose(rating, output)
+
+  def testRealValuedColumnTypeConversion(self):
+    var_len_real_valued = feature_column._real_valued_var_len_column(
+        "rating", default_value=-1)
+    rating = np.array([[0, 1, 2, -1],
+                       [3, 4, 5, 6]])
+    features = {"rating": constant_op.constant(rating, dtype=dtypes.int64)}
+    with self.test_session() as sess:
+      output = sess.run(feature_column_ops.input_from_feature_columns(
+          features, [var_len_real_valued]))
+    self.assertAllClose(rating.astype(np.float32), output)
 
   def testRealValuedColumnWithNormalizer(self):
     real_valued = feature_column.real_valued_column(
@@ -640,6 +661,9 @@ class CreateInputLayersForDNNsTest(test.TestCase):
                                                            [real_valued])
     with self.test_session():
       self.assertAllClose(output.eval(), features["price"].eval() - 2)
+      # Verify cross compatibility: Core builder output should equal to contrib.
+      self.assertAllClose(output.eval(),
+                          fc_core.input_layer(features, [real_valued]).eval())
 
   def testRealValuedColumnWithMultiDimensionsAndNormalizer(self):
     real_valued = feature_column.real_valued_column(
@@ -651,6 +675,9 @@ class CreateInputLayersForDNNsTest(test.TestCase):
                                                            [real_valued])
     with self.test_session():
       self.assertAllClose(output.eval(), features["price"].eval() - 2)
+      # Verify cross compatibility: Core builder output should equal to contrib.
+      self.assertAllClose(output.eval(),
+                          fc_core.input_layer(features, [real_valued]).eval())
 
   def testBucketizedColumnWithNormalizerSucceedsForDNN(self):
     bucket = feature_column.bucketized_column(
@@ -663,6 +690,8 @@ class CreateInputLayersForDNNsTest(test.TestCase):
     expected = [[0, 1, 0, 0], [0, 0, 1, 0], [1, 0, 0, 0]]
     with self.test_session():
       self.assertAllClose(output.eval(), expected)
+      self.assertAllClose(output.eval(),
+                          fc_core.input_layer(features, [bucket]).eval())
 
   def testBucketizedColumnWithMultiDimensionsSucceedsForDNN(self):
     bucket = feature_column.bucketized_column(
@@ -677,6 +706,8 @@ class CreateInputLayersForDNNsTest(test.TestCase):
                 [1, 0, 0, 0, 1, 0, 0, 0]]
     with self.test_session():
       self.assertAllClose(output.eval(), expected)
+      self.assertAllClose(output.eval(),
+                          fc_core.input_layer(features, [bucket]).eval())
 
   def testOneHotColumnFromWeightedSparseColumnSucceedsForDNN(self):
     ids_column = feature_column.sparse_column_with_keys(
@@ -695,11 +726,14 @@ class CreateInputLayersForDNNsTest(test.TestCase):
     one_hot_column = feature_column.one_hot_column(weighted_ids_column)
     output = feature_column_ops.input_from_feature_columns(features,
                                                            [one_hot_column])
+    output_core = fc_core.input_layer(features, [one_hot_column])
     with self.test_session():
       variables_lib.global_variables_initializer().run()
-      data_flow_ops.tables_initializer().run()
+      lookup_ops.tables_initializer().run()
       self.assertAllEqual([[0, 0, 10., 0], [0, 20., 0, 0], [30., 0, 40., 0]],
                           output.eval())
+      # Verify cross compatibility: Core builder output should equal to contrib.
+      self.assertAllEqual(output.eval(), output_core.eval())
 
   def testOneHotColumnFromSparseColumnWithKeysSucceedsForDNN(self):
     ids_column = feature_column.sparse_column_with_keys(
@@ -712,12 +746,15 @@ class CreateInputLayersForDNNsTest(test.TestCase):
     features = {"ids": ids_tensor}
     output = feature_column_ops.input_from_feature_columns(features,
                                                            [one_hot_sparse])
+    output_core = fc_core.input_layer(features, [one_hot_sparse])
 
     with self.test_session():
       variables_lib.global_variables_initializer().run()
-      data_flow_ops.tables_initializer().run()
+      lookup_ops.tables_initializer().run()
       self.assertAllEqual([[0, 0, 1, 0], [0, 1, 0, 0], [1, 0, 0, 0]],
                           output.eval())
+      # Verify cross compatibility: Core builder output should equal to contrib.
+      self.assertAllEqual(output.eval(), output_core.eval())
 
   def testOneHotColumnFromMultivalentSparseColumnWithKeysSucceedsForDNN(self):
     ids_column = feature_column.sparse_column_with_keys(
@@ -730,12 +767,15 @@ class CreateInputLayersForDNNsTest(test.TestCase):
     features = {"ids": ids_tensor}
     output = feature_column_ops.input_from_feature_columns(features,
                                                            [one_hot_sparse])
+    output_core = fc_core.input_layer(features, [one_hot_sparse])
 
     with self.test_session():
       variables_lib.global_variables_initializer().run()
-      data_flow_ops.tables_initializer().run()
+      lookup_ops.tables_initializer().run()
       self.assertAllEqual([[0, 0, 1, 0], [0, 1, 0, 0], [1, 0, 1, 0]],
                           output.eval())
+      # Verify cross compatibility: Core builder output should equal to contrib.
+      self.assertAllEqual(output.eval(), output_core.eval())
 
   def testOneHotColumnFromSparseColumnWithIntegerizedFeaturePassesForDNN(self):
     ids_column = feature_column.sparse_column_with_integerized_feature(
@@ -750,10 +790,13 @@ class CreateInputLayersForDNNsTest(test.TestCase):
     }
     output = feature_column_ops.input_from_feature_columns(features,
                                                            [one_hot_sparse])
+    output_core = fc_core.input_layer(features, [one_hot_sparse])
     with self.test_session():
       variables_lib.global_variables_initializer().run()
       self.assertAllEqual([[0, 0, 1, 0], [0, 1, 0, 0], [1, 0, 1, 0]],
                           output.eval())
+      # Verify cross compatibility: Core builder output should equal to contrib.
+      self.assertAllEqual(output.eval(), output_core.eval())
 
   def testOneHotColumnFromSparseColumnWithHashBucketSucceedsForDNN(self):
     hashed_sparse = feature_column.sparse_column_with_hash_bucket("feat", 10)
@@ -765,10 +808,13 @@ class CreateInputLayersForDNNsTest(test.TestCase):
     one_hot_sparse = feature_column.one_hot_column(hashed_sparse)
     output = feature_column_ops.input_from_feature_columns(features,
                                                            [one_hot_sparse])
+    output_core = fc_core.input_layer(features, [one_hot_sparse])
     with self.test_session():
       variables_lib.global_variables_initializer().run()
-      data_flow_ops.tables_initializer().run()
+      lookup_ops.tables_initializer().run()
       self.assertAllEqual([3, 10], output.eval().shape)
+      # Verify cross compatibility: Core builder output should equal to contrib.
+      self.assertAllEqual(output.eval(), output_core.eval())
 
   def testEmbeddingColumnSucceedsForDNN(self):
     hashed_sparse = feature_column.sparse_column_with_hash_bucket("wire", 10)
@@ -780,9 +826,12 @@ class CreateInputLayersForDNNsTest(test.TestCase):
     embeded_sparse = feature_column.embedding_column(hashed_sparse, 10)
     output = feature_column_ops.input_from_feature_columns(features,
                                                            [embeded_sparse])
+    output_core = fc_core.input_layer(features, [embeded_sparse])
     with self.test_session():
       variables_lib.global_variables_initializer().run()
       self.assertAllEqual(output.eval().shape, [4, 10])
+      # Verify cross compatibility: Core builder output should equal to contrib.
+      self.assertAllEqual(output.eval().shape, output_core.eval().shape)
 
   def testScatteredEmbeddingColumnSucceedsForDNN(self):
     wire_tensor = sparse_tensor.SparseTensor(
@@ -798,14 +847,24 @@ class CreateInputLayersForDNNsTest(test.TestCase):
         features, [embedded_sparse], weight_collections=["my_collection"])
     weights = ops.get_collection("my_collection")
     grad = gradients_impl.gradients(output, weights)
+    # Calcuates the tensors calculated by FC core libs. Later, the values will
+    # be compared with the contrib version.
+    output_core = fc_core.input_layer(
+        features, [embedded_sparse], weight_collections=["my_collection_core"])
+    weights_core = ops.get_collection("my_collection_core")
+    grad_core = gradients_impl.gradients(output_core, weights_core)
     with self.test_session():
       variables_lib.global_variables_initializer().run()
       gradient_values = []
+      gradient_values_core = []
       # Collect the gradient from the different partitions (one in this test)
       for p in range(len(grad)):
         gradient_values.extend(grad[p].values.eval())
+        gradient_values_core.extend(grad_core[p].values.eval())
       gradient_values.sort()
+      gradient_values_core.sort()
       self.assertAllEqual(gradient_values, [0.5] * 6 + [2] * 3)
+      self.assertAllEqual(gradient_values, gradient_values_core)
 
   def testEmbeddingColumnWithInitializerSucceedsForDNN(self):
     hashed_sparse = feature_column.sparse_column_with_hash_bucket("wire", 10)
@@ -821,12 +880,15 @@ class CreateInputLayersForDNNsTest(test.TestCase):
         initializer=init_ops.constant_initializer(init_value))
     output = feature_column_ops.input_from_feature_columns(features,
                                                            [embeded_sparse])
+    output_core = fc_core.input_layer(features, [embeded_sparse])
 
     with self.test_session():
       variables_lib.global_variables_initializer().run()
       output_eval = output.eval()
       self.assertAllEqual(output_eval.shape, [2, 10])
       self.assertAllClose(output_eval, np.tile(init_value, [2, 10]))
+      # Verify cross compatibility: Core builder output should equal to contrib.
+      self.assertAllEqual(output.eval(), output_core.eval())
 
   def testEmbeddingColumnWithMultipleInitializersFails(self):
     hashed_sparse = feature_column.sparse_column_with_hash_bucket("wire", 10)
@@ -872,10 +934,14 @@ class CreateInputLayersForDNNsTest(test.TestCase):
     embeded_sparse = feature_column.embedding_column(weighted_ids, 10)
     output = feature_column_ops.input_from_feature_columns(features,
                                                            [embeded_sparse])
+    output_core = fc_core.input_layer(features, [embeded_sparse])
+
     with self.test_session():
       variables_lib.global_variables_initializer().run()
-      data_flow_ops.tables_initializer().run()
+      lookup_ops.tables_initializer().run()
       self.assertAllEqual(output.eval().shape, [2, 10])
+      # Verify cross compatibility: Core builder output should equal to contrib.
+      self.assertAllEqual(output.eval().shape, output_core.eval().shape)
 
   def testEmbeddingColumnWithIntegerWeightedSparseColumnSucceedsForDNN(self):
     """Same as the previous test, but with integer weights."""
@@ -897,7 +963,7 @@ class CreateInputLayersForDNNsTest(test.TestCase):
                                                            [embeded_sparse])
     with self.test_session():
       variables_lib.global_variables_initializer().run()
-      data_flow_ops.tables_initializer().run()
+      lookup_ops.tables_initializer().run()
       self.assertAllEqual(output.eval().shape, [2, 10])
 
   def testEmbeddingColumnWithCrossedColumnSucceedsForDNN(self):
@@ -948,7 +1014,7 @@ class CreateInputLayersForDNNsTest(test.TestCase):
       with self.assertRaisesRegexp(
           ValueError,
           "Error creating input layer for column: ids_weighted_by_weights"):
-        data_flow_ops.tables_initializer().run()
+        lookup_ops.tables_initializer().run()
         feature_column_ops.input_from_feature_columns(features, [weighted_ids])
 
   def testCrossedColumnFailsForDNN(self):
@@ -1055,7 +1121,7 @@ class CreateInputLayersForDNNsTest(test.TestCase):
                                                            [embeded_sparse])
     with self.test_session():
       variables_lib.global_variables_initializer().run()
-      data_flow_ops.tables_initializer().run()
+      lookup_ops.tables_initializer().run()
       # score: (sum of weights)
       self.assertAllEqual(output.eval(), [[10.], [50.], [0.]])
 
@@ -1208,6 +1274,19 @@ class SequenceInputFromFeatureColumnTest(test.TestCase):
       model_inputs = sess.run(model_input_tensor)
     self.assertAllClose(measurement_input, model_inputs)
 
+  def testRealValuedVarLenColumn(self):
+    var_len_real_valued = feature_column._real_valued_var_len_column(
+        "rating", default_value=-1)
+    rating = np.array([[0., 1., 2., -1.],
+                       [3., 4., 5., 6.]])
+    features = {"rating": constant_op.constant(rating)}
+    with self.test_session() as sess:
+      output = sess.run(
+          feature_column_ops.sequence_input_from_feature_columns(
+              features, [var_len_real_valued]))
+    reshaped_rating = np.reshape(rating, [2, 4, 1])
+    self.assertAllClose(reshaped_rating, output)
+
   def testRealValuedColumnWithExtraDimensions(self):
     batch_size = 4
     sequence_length = 8
@@ -1293,7 +1372,7 @@ class SequenceInputFromFeatureColumnTest(test.TestCase):
 
     with self.test_session() as sess:
       variables_lib.global_variables_initializer().run()
-      data_flow_ops.tables_initializer().run()
+      lookup_ops.tables_initializer().run()
       model_input = sess.run(model_input_tensor)
 
     expected_input_shape = np.array([4, 3, 4])
@@ -1327,7 +1406,7 @@ class SequenceInputFromFeatureColumnTest(test.TestCase):
 
     with self.test_session() as sess:
       variables_lib.global_variables_initializer().run()
-      data_flow_ops.tables_initializer().run()
+      lookup_ops.tables_initializer().run()
       model_input = sess.run(model_input_tensor)
 
     expected_input_shape = np.array([4, 3, hash_buckets])
@@ -1357,7 +1436,7 @@ class SequenceInputFromFeatureColumnTest(test.TestCase):
 
     with self.test_session() as sess:
       variables_lib.global_variables_initializer().run()
-      data_flow_ops.tables_initializer().run()
+      lookup_ops.tables_initializer().run()
       model_input = sess.run(model_input_tensor)
 
     self.assertAllEqual(expected_input_shape, model_input.shape)
@@ -1386,7 +1465,7 @@ class SequenceInputFromFeatureColumnTest(test.TestCase):
 
     with self.test_session() as sess:
       variables_lib.global_variables_initializer().run()
-      data_flow_ops.tables_initializer().run()
+      lookup_ops.tables_initializer().run()
       model_input = sess.run(model_input_tensor)
 
     self.assertAllEqual(expected_input_shape, model_input.shape)
@@ -1416,14 +1495,14 @@ class SequenceInputFromFeatureColumnTest(test.TestCase):
                                                embedding_weights)
     with self.test_session() as sess:
       variables_lib.global_variables_initializer().run()
-      data_flow_ops.tables_initializer().run()
+      lookup_ops.tables_initializer().run()
       model_input, gradients = sess.run([model_input_tensor, gradient_tensor])
 
     expected_input_shape = [4, 3, embedding_dimension]
     self.assertAllEqual(expected_input_shape, model_input.shape)
 
-    # `ids_tensor` consists of 7 instances of <empty>, 3 occurences of "b",
-    # 2 occurences of "c" and 1 instance of "a".
+    # `ids_tensor` consists of 7 instances of <empty>, 3 occurrences of "b",
+    # 2 occurrences of "c" and 1 instance of "a".
     expected_gradient_values = sorted([0., 3., 2., 1.] * embedding_dimension)
     actual_gradient_values = np.sort(gradients[0].values, axis=None)
     self.assertAllClose(expected_gradient_values, actual_gradient_values)
@@ -1483,7 +1562,7 @@ class SequenceInputFromFeatureColumnTest(test.TestCase):
 
     with self.test_session() as sess:
       variables_lib.global_variables_initializer().run()
-      data_flow_ops.tables_initializer().run()
+      lookup_ops.tables_initializer().run()
       model_input = sess.run(model_input_tensor)
 
     expected_input_shape = [
@@ -1517,9 +1596,12 @@ class WeightedSumTest(test.TestCase):
     features = {"wire": wire_tensor}
     logits, _, _ = feature_column_ops.weighted_sum_from_feature_columns(
         features, [hashed_sparse], num_outputs=5)
+    logits_core = fc_core.linear_model(features, [hashed_sparse], units=5)
     with self.test_session():
       variables_lib.global_variables_initializer().run()
       self.assertAllEqual(logits.eval().shape, [2, 5])
+      # Verify cross compatibility: Core builder output should equal to contrib.
+      self.assertAllEqual(logits.eval(), logits_core.eval())
 
   def testSparseIntColumn(self):
     """Tests a sparse column with int values."""
@@ -1532,9 +1614,12 @@ class WeightedSumTest(test.TestCase):
     features = {"wire": wire_tensor}
     logits, _, _ = feature_column_ops.weighted_sum_from_feature_columns(
         features, [hashed_sparse], num_outputs=5)
+    logits_core = fc_core.linear_model(features, [hashed_sparse], units=5)
     with self.test_session():
       variables_lib.global_variables_initializer().run()
       self.assertAllEqual(logits.eval().shape, [2, 5])
+      # Verify cross compatibility: Core builder output should equal to contrib.
+      self.assertAllEqual(logits.eval(), logits_core.eval())
 
   def testSparseColumnWithDenseInputTensor(self):
     hashed_sparse = feature_column.sparse_column_with_hash_bucket("wire", 10)
@@ -1543,9 +1628,12 @@ class WeightedSumTest(test.TestCase):
     features = {"wire": wire_tensor}
     logits, _, _ = feature_column_ops.weighted_sum_from_feature_columns(
         features, [hashed_sparse], num_outputs=5)
+    logits_core = fc_core.linear_model(features, [hashed_sparse], units=5)
     with self.test_session():
       variables_lib.global_variables_initializer().run()
       self.assertAllEqual(logits.eval().shape, [2, 5])
+      # Verify cross compatibility: Core builder output should equal to contrib.
+      self.assertAllEqual(logits.eval(), logits_core.eval())
 
   def testWeightedSparseColumn(self):
     ids = feature_column.sparse_column_with_keys("ids",
@@ -1562,10 +1650,13 @@ class WeightedSumTest(test.TestCase):
     features = {"ids": ids_tensor, "weights": weights_tensor}
     logits, _, _ = feature_column_ops.weighted_sum_from_feature_columns(
         features, [weighted_ids], num_outputs=5)
+    logits_core = fc_core.linear_model(features, [weighted_ids], units=5)
     with self.test_session():
       variables_lib.global_variables_initializer().run()
-      data_flow_ops.tables_initializer().run()
+      lookup_ops.tables_initializer().run()
       self.assertAllEqual(logits.eval().shape, [2, 5])
+      # Verify cross compatibility: Core builder output should equal to contrib.
+      self.assertAllEqual(logits.eval(), logits_core.eval())
 
   def testWeightedSparseColumnWithDenseInputTensor(self):
     ids = feature_column.sparse_column_with_keys(
@@ -1577,11 +1668,14 @@ class WeightedSumTest(test.TestCase):
     features = {"ids": ids_tensor, "weights": weights_tensor}
     logits, _, _ = feature_column_ops.weighted_sum_from_feature_columns(
         features, [weighted_ids], num_outputs=5)
+    logits_core = fc_core.linear_model(features, [weighted_ids], units=5)
 
     with self.test_session():
       variables_lib.global_variables_initializer().run()
-      data_flow_ops.tables_initializer().run()
+      lookup_ops.tables_initializer().run()
       self.assertAllEqual(logits.eval().shape, [2, 5])
+      # Verify cross compatibility: Core builder output should equal to contrib.
+      self.assertAllEqual(logits.eval(), logits_core.eval())
 
   def testCrossedColumn(self):
     a = feature_column.sparse_column_with_hash_bucket(
@@ -1596,9 +1690,12 @@ class WeightedSumTest(test.TestCase):
     features = {"aaa": wire_tensor, "bbb": wire_tensor}
     logits, _, _ = feature_column_ops.weighted_sum_from_feature_columns(
         features, [crossed], num_outputs=5)
+    logits_core = fc_core.linear_model(features, [crossed], units=5)
     with self.test_session():
       variables_lib.global_variables_initializer().run()
       self.assertAllEqual(logits.eval().shape, [2, 5])
+      # Verify cross compatibility: Core builder output should equal to contrib.
+      self.assertAllEqual(logits.eval(), logits_core.eval())
 
   def testEmbeddingColumn(self):
     hashed_sparse = feature_column.sparse_column_with_hash_bucket("wire", 10)
@@ -1632,9 +1729,11 @@ class WeightedSumTest(test.TestCase):
       output, column_to_variable, _ = (
           feature_column_ops.weighted_sum_from_feature_columns(
               features, [movies], num_outputs=1))
+      logits_core = fc_core.linear_model(features, [movies])
+
       with self.test_session() as sess:
         variables_lib.initialize_all_variables().run()
-        data_flow_ops.tables_initializer().run()
+        lookup_ops.tables_initializer().run()
 
         weights = column_to_variable[movies][0]
         self.assertEqual(weights.get_shape(), (3, 1))
@@ -1642,6 +1741,8 @@ class WeightedSumTest(test.TestCase):
         # score for first example = 0.3 (matrix) + 0.1 (head-on) = 0.4
         # score for second example = 0.5 (winter sleep)
         self.assertAllClose(output.eval(), [[0.4], [0.5]])
+        # Cross compatibility: Core builder output should equal to contrib.
+        self.assertAllEqual(output.eval().shape, logits_core.eval().shape)
 
   def testRealValuedColumnWithMultiDimensions(self):
     real_valued = feature_column.real_valued_column("price", 2)
@@ -1686,9 +1787,13 @@ class WeightedSumTest(test.TestCase):
     }
     output, _, _ = feature_column_ops.weighted_sum_from_feature_columns(
         features, [real_valued, bucket, hashed_sparse, crossed], num_outputs=5)
+    output_core = fc_core.linear_model(
+        features, [real_valued, bucket, hashed_sparse, crossed], units=5)
     with self.test_session():
       variables_lib.global_variables_initializer().run()
       self.assertAllEqual(output.eval().shape, [3, 5])
+      # Verify cross compatibility: Core builder output should equal to contrib.
+      self.assertAllEqual(output.eval(), output_core.eval())
 
   def testPredictions(self):
     language = feature_column.sparse_column_with_keys(
@@ -1709,7 +1814,7 @@ class WeightedSumTest(test.TestCase):
               features, [age, language], num_outputs=1))
       with self.test_session() as sess:
         variables_lib.global_variables_initializer().run()
-        data_flow_ops.tables_initializer().run()
+        lookup_ops.tables_initializer().run()
 
         self.assertAllClose(output.eval(), [[0.], [0.]])
 
@@ -1749,7 +1854,7 @@ class WeightedSumTest(test.TestCase):
       self.assertEqual(len(variables), 1)
       with self.test_session() as sess:
         variables_lib.global_variables_initializer().run()
-        data_flow_ops.tables_initializer().run()
+        lookup_ops.tables_initializer().run()
 
         self.assertAllClose(output.eval(), [[0.], [0.]])
 
@@ -1813,7 +1918,7 @@ class WeightedSumTest(test.TestCase):
               features, [weighted_language], num_outputs=1))
       with self.test_session() as sess:
         variables_lib.global_variables_initializer().run()
-        data_flow_ops.tables_initializer().run()
+        lookup_ops.tables_initializer().run()
 
         self.assertAllClose(output.eval(), [[0.], [0.]])
 
@@ -1841,7 +1946,7 @@ class WeightedSumTest(test.TestCase):
               features, [language], num_outputs=1))
       with self.test_session() as sess:
         variables_lib.global_variables_initializer().run()
-        data_flow_ops.tables_initializer().run()
+        lookup_ops.tables_initializer().run()
 
         # score: 0.1 + language_weight['hindi'] + language_weight['english']
         sess.run(bias.assign([0.1]))
@@ -1864,7 +1969,7 @@ class WeightedSumTest(test.TestCase):
               features, [movies], num_outputs=1))
       with self.test_session() as sess:
         variables_lib.global_variables_initializer().run()
-        data_flow_ops.tables_initializer().run()
+        lookup_ops.tables_initializer().run()
 
         weights = column_to_variable[movies][0]
         self.assertEqual(weights.get_shape(), (15, 1))
@@ -1898,7 +2003,7 @@ class WeightedSumTest(test.TestCase):
               features, [country_language], num_outputs=1))
       with self.test_session() as sess:
         variables_lib.global_variables_initializer().run()
-        data_flow_ops.tables_initializer().run()
+        lookup_ops.tables_initializer().run()
 
         weights = column_to_variable[country_language][0]
         sess.run(weights.assign(weights + 0.4))
@@ -1922,7 +2027,7 @@ class WeightedSumTest(test.TestCase):
               features, [language_language], num_outputs=1))
       with self.test_session() as sess:
         variables_lib.global_variables_initializer().run()
-        data_flow_ops.tables_initializer().run()
+        lookup_ops.tables_initializer().run()
 
         weights = column_to_variable[language_language][0]
         sess.run(weights.assign(weights + 0.4))
@@ -1955,7 +2060,7 @@ class WeightedSumTest(test.TestCase):
               features, [country_language], num_outputs=1))
       with self.test_session() as sess:
         variables_lib.global_variables_initializer().run()
-        data_flow_ops.tables_initializer().run()
+        lookup_ops.tables_initializer().run()
 
         weights = column_to_variable[country_language][0]
         sess.run(weights.assign(weights + 0.4))
@@ -1996,7 +2101,7 @@ class WeightedSumTest(test.TestCase):
                 scope=scope))
       with self.test_session() as sess:
         variables_lib.global_variables_initializer().run()
-        data_flow_ops.tables_initializer().run()
+        lookup_ops.tables_initializer().run()
 
         self.assertEqual(2, len(column_to_variable[country]))
         self.assertEqual(3, len(column_to_variable[language]))
@@ -2033,7 +2138,7 @@ class WeightedSumTest(test.TestCase):
               features, [country, age, incomes], num_outputs=1))
       with self.test_session() as sess:
         variables_lib.global_variables_initializer().run()
-        data_flow_ops.tables_initializer().run()
+        lookup_ops.tables_initializer().run()
 
         incomes_weights = column_to_variable[incomes][0]
         sess.run(incomes_weights.assign([[0.1], [0.2], [0.3]]))
@@ -2069,7 +2174,7 @@ class WeightedSumTest(test.TestCase):
               features, [country, age, height, incomes], num_outputs=5))
       with self.test_session() as sess:
         variables_lib.global_variables_initializer().run()
-        data_flow_ops.tables_initializer().run()
+        lookup_ops.tables_initializer().run()
 
         height_weights = column_to_variable[height][0]
         sess.run(
@@ -2097,9 +2202,12 @@ class WeightedSumTest(test.TestCase):
       output, column_to_variable, _ = (
           feature_column_ops.weighted_sum_from_feature_columns(
               features, [bucket], num_outputs=1))
+      output_core = fc_core.linear_model(features, [bucket])
       with self.test_session() as sess:
         variables_lib.global_variables_initializer().run()
-        data_flow_ops.tables_initializer().run()
+        lookup_ops.tables_initializer().run()
+        # Cross compatibility: Core builder output should equal to contrib.
+        self.assertAllEqual(output.eval(), output_core.eval())
 
         sess.run(column_to_variable[bucket][0].assign([[0.1], [0.2], [0.3],
                                                        [0.4]]))
@@ -2125,9 +2233,12 @@ class WeightedSumTest(test.TestCase):
       output, column_to_variable, _ = (
           feature_column_ops.weighted_sum_from_feature_columns(
               features, [bucket, country], num_outputs=1))
+      output_core = fc_core.linear_model(features, [bucket, country])
       with self.test_session() as sess:
         variables_lib.global_variables_initializer().run()
-        data_flow_ops.tables_initializer().run()
+        lookup_ops.tables_initializer().run()
+        # Cross compatibility: Core builder output should equal to contrib.
+        self.assertAllEqual(output.eval(), output_core.eval())
 
         # dimension = 2, bucket_size = 4, num_classes = 1
         sess.run(column_to_variable[bucket][0].assign(
@@ -2156,7 +2267,7 @@ class WeightedSumTest(test.TestCase):
               features, [bucket, country], num_outputs=5))
       with self.test_session() as sess:
         variables_lib.global_variables_initializer().run()
-        data_flow_ops.tables_initializer().run()
+        lookup_ops.tables_initializer().run()
 
         # dimension = 2, bucket_size = 4, num_classes = 5
         sess.run(column_to_variable[bucket][0].assign(
@@ -2192,7 +2303,7 @@ class WeightedSumTest(test.TestCase):
               features, [country_price], num_outputs=1))
       with self.test_session() as sess:
         variables_lib.global_variables_initializer().run()
-        data_flow_ops.tables_initializer().run()
+        lookup_ops.tables_initializer().run()
 
         weights = column_to_variable[country_price][0]
         sess.run(weights.assign(weights + 0.4))
@@ -2231,7 +2342,7 @@ class WeightedSumTest(test.TestCase):
               features, [country_language_price], num_outputs=1))
       with self.test_session() as sess:
         variables_lib.global_variables_initializer().run()
-        data_flow_ops.tables_initializer().run()
+        lookup_ops.tables_initializer().run()
 
         weights = column_to_variable[country_language_price][0]
         sess.run(weights.assign(weights + 0.4))
@@ -2255,7 +2366,7 @@ class WeightedSumTest(test.TestCase):
               features, [product], num_outputs=1))
       with self.test_session() as sess:
         variables_lib.global_variables_initializer().run()
-        data_flow_ops.tables_initializer().run()
+        lookup_ops.tables_initializer().run()
         product_weights = column_to_variable[product][0]
         sess.run(product_weights.assign([[0.1], [0.2], [0.3], [0.4], [0.5]]))
         self.assertAllClose(output.eval(), [[0.1], [0.5], [0.3]])
@@ -2270,7 +2381,7 @@ class WeightedSumTest(test.TestCase):
               features, [product], num_outputs=1))
       with self.test_session() as sess:
         variables_lib.global_variables_initializer().run()
-        data_flow_ops.tables_initializer().run()
+        lookup_ops.tables_initializer().run()
         product_weights = column_to_variable[product][0]
         sess.run(product_weights.assign([[0.1], [0.2], [0.3], [0.4], [0.5]]))
         self.assertAllClose(output.eval(), [[0.1], [0.5], [0.3]])
@@ -2285,7 +2396,7 @@ class WeightedSumTest(test.TestCase):
               features, [product], num_outputs=1))
       with self.test_session() as sess:
         variables_lib.global_variables_initializer().run()
-        data_flow_ops.tables_initializer().run()
+        lookup_ops.tables_initializer().run()
         product_weights = column_to_variable[product][0]
         sess.run(product_weights.assign([[0.1], [0.2], [0.3], [0.4], [0.5]]))
         self.assertAllClose(output.eval(), [[0.6], [0.7]])
@@ -2306,7 +2417,7 @@ class WeightedSumTest(test.TestCase):
               features, [product], num_outputs=1))
       with self.test_session() as sess:
         variables_lib.global_variables_initializer().run()
-        data_flow_ops.tables_initializer().run()
+        lookup_ops.tables_initializer().run()
         product_weights = column_to_variable[product][0]
         sess.run(product_weights.assign([[0.1], [0.2], [0.3], [0.4], [0.5]]))
         self.assertAllClose(output.eval(), [[0.1], [0.5], [0.3]])
@@ -2318,7 +2429,7 @@ class WeightedSumTest(test.TestCase):
           features, [feature_column.real_valued_column("age")], num_outputs=3)
       with self.test_session() as sess:
         variables_lib.global_variables_initializer().run()
-        data_flow_ops.tables_initializer().run()
+        lookup_ops.tables_initializer().run()
         sess.run(bias.assign([0.1, 0.2, 0.3]))
         self.assertAllClose(output.eval(), [[0.1, 0.2, 0.3], [0.1, 0.2, 0.3],
                                             [0.1, 0.2, 0.3], [0.1, 0.2, 0.3]])
@@ -2332,7 +2443,7 @@ class WeightedSumTest(test.TestCase):
               features, [column], num_outputs=3))
       with self.test_session() as sess:
         variables_lib.global_variables_initializer().run()
-        data_flow_ops.tables_initializer().run()
+        lookup_ops.tables_initializer().run()
         weights = column_to_variable[column][0]
         self.assertEqual(weights.get_shape(), (1, 3))
         sess.run(weights.assign([[0.01, 0.03, 0.05]]))
@@ -2356,7 +2467,7 @@ class WeightedSumTest(test.TestCase):
               features, [column], num_outputs=3))
       with self.test_session() as sess:
         variables_lib.global_variables_initializer().run()
-        data_flow_ops.tables_initializer().run()
+        lookup_ops.tables_initializer().run()
         weights = column_to_variable[column][0]
         self.assertEqual(weights.get_shape(), (5, 3))
         sess.run(
@@ -2382,7 +2493,7 @@ class WeightedSumTest(test.TestCase):
               features, [column], num_outputs=3))
       with self.test_session() as sess:
         variables_lib.global_variables_initializer().run()
-        data_flow_ops.tables_initializer().run()
+        lookup_ops.tables_initializer().run()
 
         weights = column_to_variable[column][0]
         self.assertEqual(weights.get_shape(), (5, 3))
@@ -2422,7 +2533,7 @@ class WeightedSumTest(test.TestCase):
               features, [column], num_outputs=3))
       with self.test_session() as sess:
         variables_lib.global_variables_initializer().run()
-        data_flow_ops.tables_initializer().run()
+        lookup_ops.tables_initializer().run()
 
         weights = column_to_variable[column][0]
         self.assertEqual(weights.get_shape(), (5, 3))
@@ -2451,7 +2562,7 @@ class WeightedSumTest(test.TestCase):
               features, [column], num_outputs=3))
       with self.test_session() as sess:
         variables_lib.global_variables_initializer().run()
-        data_flow_ops.tables_initializer().run()
+        lookup_ops.tables_initializer().run()
 
         weights = column_to_variable[column][0]
         self.assertEqual(weights.get_shape(), (5, 3))
@@ -2516,7 +2627,7 @@ class ParseExampleTest(test.TestCase):
     self.assertIn(bucket, output)
     self.assertIn(wire_cast, output)
     with self.test_session():
-      data_flow_ops.tables_initializer().run()
+      lookup_ops.tables_initializer().run()
       self.assertAllEqual(output[bucket].eval(), [[2, 3, 0]])
       self.assertAllEqual(output[wire_cast].indices.eval(), [[0, 0], [0, 1]])
       self.assertAllEqual(output[wire_cast].values.eval(), [2, 0])
diff --git a/tensorflow/contrib/layers/python/layers/feature_column_test.py b/tensorflow/contrib/layers/python/layers/feature_column_test.py
index aa3912a4088e4991eac9bcc84d70162136a823f8..b6a8b6bdda390bc685352021b6a881457cd5740c 100644
--- a/tensorflow/contrib/layers/python/layers/feature_column_test.py
+++ b/tensorflow/contrib/layers/python/layers/feature_column_test.py
@@ -173,7 +173,7 @@ class FeatureColumnTest(test.TestCase):
     for i in range(len(b1_value)):
       self.assertAllClose(b1_value[i], b2_value[i])
 
-    # Test the case when a shared_embedding_name is explictly specified.
+    # Test the case when a shared_embedding_name is explicitly specified.
     d = fc.shared_embedding_columns(
         [a1, a2],
         dimension=4,
@@ -441,7 +441,7 @@ class FeatureColumnTest(test.TestCase):
     sparse_tensor = sparse_tensor_lib.SparseTensor(
         values=[2.0, 5.0], indices=[[0, 0], [2, 0]], dense_shape=[3, 1])
     with self.assertRaisesRegexp(
-        ValueError, "Calling an abstract method."):
+        ValueError, "Set is_sparse to False"):
       real_valued_column._to_dnn_input_layer(sparse_tensor)
 
   def testRealValuedColumnDeepCopy(self):
diff --git a/tensorflow/contrib/layers/python/layers/initializers.py b/tensorflow/contrib/layers/python/layers/initializers.py
index 4359d0c63e3b743f926f4e6cf231e5b9c69becc2..271b3c01ffc86aeb031ec2737c96b926e6d16697 100644
--- a/tensorflow/contrib/layers/python/layers/initializers.py
+++ b/tensorflow/contrib/layers/python/layers/initializers.py
@@ -34,9 +34,10 @@ def xavier_initializer(uniform=True, seed=None, dtype=dtypes.float32):
   This function implements the weight initialization from:
 
   Xavier Glorot and Yoshua Bengio (2010):
-           Understanding the difficulty of training deep feedforward neural
+           [Understanding the difficulty of training deep feedforward neural
            networks. International conference on artificial intelligence and
-           statistics.
+           statistics.](
+           http://www.jmlr.org/proceedings/papers/v9/glorot10a/glorot10a.pdf)
 
   This initializer is designed to keep the scale of the gradients roughly the
   same in all layers. In uniform distribution this ends up being the range:
@@ -46,8 +47,7 @@ def xavier_initializer(uniform=True, seed=None, dtype=dtypes.float32):
   Args:
     uniform: Whether to use uniform or normal distributed random initialization.
     seed: A Python integer. Used to create random seeds. See
-      [`set_random_seed`](../../api_docs/python/constant_op.md#set_random_seed)
-      for behavior.
+          @{tf.set_random_seed} for behavior.
     dtype: The data type. Only floating point types are supported.
 
   Returns:
@@ -97,8 +97,7 @@ def variance_scaling_initializer(factor=2.0, mode='FAN_IN', uniform=False,
     mode: String.  'FAN_IN', 'FAN_OUT', 'FAN_AVG'.
     uniform: Whether to use uniform or normal distributed random initialization.
     seed: A Python integer. Used to create random seeds. See
-      [`set_random_seed`](../../api_docs/python/constant_op.md#set_random_seed)
-      for behavior.
+          @{tf.set_random_seed} for behavior.
     dtype: The data type. Only floating point types are supported.
 
   Returns:
diff --git a/tensorflow/contrib/layers/python/layers/layers.py b/tensorflow/contrib/layers/python/layers/layers.py
index c920764803d53255dcef21552d8724407fb55159..7a429f75bbf2abe3eeb6bc3b5ac53d2be7e845e4 100644
--- a/tensorflow/contrib/layers/python/layers/layers.py
+++ b/tensorflow/contrib/layers/python/layers/layers.py
@@ -278,7 +278,7 @@ def _fused_batch_norm(
         trainable=trainable_gamma)
 
     # Create moving_mean and moving_variance variables and add them to the
-    # appropiate collections.
+    # appropriate collections.
     moving_mean_collections = utils.get_variable_collections(
         variables_collections, 'moving_mean')
     moving_mean_initializer = param_initializers.get(
@@ -632,7 +632,7 @@ def batch_norm(inputs,
                                        trainable=trainable)
 
     # Create moving_mean and moving_variance variables and add them to the
-    # appropiate collections. We disable variable partitioning while creating
+    # appropriate collections. We disable variable partitioning while creating
     # them, because assign_moving_average is not yet supported for partitioned
     # variables.
     partitioner = variable_scope.get_variable_scope().partitioner
@@ -844,7 +844,7 @@ def convolution(inputs,
   variable would be created and added the activations. Finally, if
   `activation_fn` is not `None`, it is applied to the activations as well.
 
-  Performs a'trous convolution with input stride/dilation rate equal to `rate`
+  Performs atrous convolution with input stride/dilation rate equal to `rate`
   if a value > 1 for any dimension of `rate` is specified.  In this case
   `stride` values != 1 are not supported.
 
@@ -870,7 +870,7 @@ def convolution(inputs,
       "NCW".  For N=2, the valid values are "NHWC" (default) and "NCHW".
       For N=3, the valid values are "NDHWC" (default) and "NCDHW".
     rate: A sequence of N positive integers specifying the dilation rate to use
-      for a'trous convolution.  Can be a single integer to specify the same
+      for atrous convolution.  Can be a single integer to specify the same
       value for all spatial dimensions.  Specifying any `rate` value != 1 is
       incompatible with specifying any `stride` value != 1.
     activation_fn: Activation function. The default value is a ReLU function.
@@ -1087,7 +1087,7 @@ def convolution2d_transpose(
   """Adds a convolution2d_transpose with an optional batch normalization layer.
 
   The function creates a variable called `weights`, representing the
-  kernel, that is convolved with the input. If `batch_norm_params` is `None`, a
+  kernel, that is convolved with the input. If `normalizer_fn` is `None`, a
   second variable called 'biases' is added to the result of the operation.
 
   Args:
@@ -1847,9 +1847,9 @@ def separable_convolution2d(
   This op first performs a depthwise convolution that acts separately on
   channels, creating a variable called `depthwise_weights`. If `num_outputs`
   is not None, it adds a pointwise convolution that mixes channels, creating a
-  variable called `pointwise_weights`. Then, if `batch_norm_params` is None,
-  it adds bias to the result, creating a variable called 'biases', otherwise
-  it adds a batch normalization layer. It finally applies an activation function
+  variable called `pointwise_weights`. Then, if `normalizer_fn` is None,
+  it adds bias to the result, creating a variable called 'biases', otherwise,
+  the `normalizer_fn` is applied. It finally applies an activation function
   to produce the end result.
 
   Args:
@@ -1865,7 +1865,7 @@ def separable_convolution2d(
       depthwise convolution stride. Can be an int if both strides are the same.
     padding: One of 'VALID' or 'SAME'.
     rate: A list of length 2: [rate_height, rate_width], specifying the dilation
-      rates for a'trous convolution. Can be an int if both rates are the same.
+      rates for atrous convolution. Can be an int if both rates are the same.
       If any value is larger than one, then both stride values need to be one.
     activation_fn: Activation function. The default value is a ReLU function.
       Explicitly set it to None to skip it and maintain a linear activation.
diff --git a/tensorflow/contrib/layers/python/layers/layers_test.py b/tensorflow/contrib/layers/python/layers/layers_test.py
index b48ad09e14011e5fe77764ac365aec276d5bb7c4..60700c5a657f51afe06232ebb49b86eebd97424f 100644
--- a/tensorflow/contrib/layers/python/layers/layers_test.py
+++ b/tensorflow/contrib/layers/python/layers/layers_test.py
@@ -247,7 +247,7 @@ class ConvolutionTest(test.TestCase):
   def testCreateConv(self):
     height, width = 7, 9
     with self.test_session():
-      images = np.random.uniform(size=(5, height, width, 4))
+      images = np.random.uniform(size=(5, height, width, 4)).astype(np.float32)
       output = layers_lib.convolution2d(images, 32, [3, 3])
       self.assertEqual(output.op.name, 'Conv/Relu')
       self.assertListEqual(output.get_shape().as_list(), [5, height, width, 32])
@@ -259,7 +259,7 @@ class ConvolutionTest(test.TestCase):
   def testCreateConvNCHW(self):
     height, width = 7, 9
     with self.test_session():
-      images = np.random.uniform(size=(5, 4, height, width))
+      images = np.random.uniform(size=(5, 4, height, width)).astype(np.float32)
       output = layers_lib.convolution2d(images, 32, [3, 3], data_format='NCHW')
       self.assertEqual(output.op.name, 'Conv/Relu')
       self.assertListEqual(output.get_shape().as_list(), [5, 32, height, width])
@@ -2780,7 +2780,7 @@ class RepeatTests(test.TestCase):
   def testRepeat(self):
     height, width = 3, 3
     with self.test_session():
-      images = np.random.uniform(size=(5, height, width, 3))
+      images = np.random.uniform(size=(5, height, width, 3)).astype(np.float32)
       output = _layers.repeat(images, 3, layers_lib.conv2d, 32, [3, 3])
       self.assertEqual(output.op.name, 'Repeat/convolution_3/Relu')
       self.assertListEqual(output.get_shape().as_list(), [5, 3, 3, 32])
@@ -2815,15 +2815,6 @@ class SeparableConv2dTest(test.TestCase):
       self.assertEqual(output.op.name, 'SeparableConv2d/Relu')
       self.assertListEqual(output.get_shape().as_list(), [5, height, width, 32])
 
-  def testCreateConvFloat64(self):
-    height, width = 3, 3
-    with self.test_session():
-      images = random_ops.random_uniform(
-          (5, height, width, 3), seed=1, dtype=dtypes.float64)
-      output = layers_lib.separable_conv2d(images, 32, [3, 3], 2)
-      self.assertEqual(output.op.name, 'SeparableConv2d/Relu')
-      self.assertListEqual(output.get_shape().as_list(), [5, height, width, 32])
-
   def testCreateDepthwiseConv(self):
     height, width = 3, 3
     with self.test_session():
diff --git a/tensorflow/contrib/layers/python/ops/bucketization_op.py b/tensorflow/contrib/layers/python/ops/bucketization_op.py
index b941a9b82227e40fb14400daa8bd7b34bed982b4..f498352855f656666e66a889d8db274cec755028 100644
--- a/tensorflow/contrib/layers/python/ops/bucketization_op.py
+++ b/tensorflow/contrib/layers/python/ops/bucketization_op.py
@@ -17,13 +17,7 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.contrib.layers.ops import gen_bucketization_op
-from tensorflow.contrib.util import loader
-from tensorflow.python.framework import ops
-from tensorflow.python.platform import resource_loader
-
-_bucketization_op = loader.load_op_library(
-    resource_loader.get_path_to_datafile("_bucketization_op.so"))
+from tensorflow.python.ops import math_ops
 
 
 def bucketize(input_tensor, boundaries, name=None):
@@ -43,10 +37,5 @@ def bucketize(input_tensor, boundaries, name=None):
   Raises:
     TypeError: If boundaries is not a list.
   """
-  if not isinstance(boundaries, list):
-    raise TypeError("boundaries must be a list")
-
-  return gen_bucketization_op.bucketize(input_tensor, boundaries, name=name)
-
-
-ops.NotDifferentiable("Bucketize")
+  return math_ops._bucketize(  # pylint: disable=protected-access
+      input_tensor, boundaries=boundaries, name=name)
diff --git a/tensorflow/contrib/learn/BUILD b/tensorflow/contrib/learn/BUILD
index 148f2708c8527929f06c6769aa0595979b166450..f313c461439db490830a8a5e52e73f9339865a97 100644
--- a/tensorflow/contrib/learn/BUILD
+++ b/tensorflow/contrib/learn/BUILD
@@ -68,6 +68,7 @@ py_library(
         "//tensorflow/python:variables",
         "//tensorflow/python/estimator:estimator_py",
         "//tensorflow/python/estimator:inputs",
+        "//tensorflow/python/feature_column:feature_column_py",
         "//tensorflow/python/saved_model:builder",
         "//tensorflow/python/saved_model:loader",
         "//tensorflow/python/saved_model:signature_constants",
@@ -1031,10 +1032,12 @@ py_test(
 py_test(
     name = "export_test",
     size = "small",
+    timeout = "moderate",
     srcs = ["python/learn/utils/export_test.py"],
     srcs_version = "PY2AND3",
     tags = [
         "manual",  # http://b/31032996
+        "notap",  # TODO(b/37950026): Test is flaky
     ],
     deps = [
         ":learn",
diff --git a/tensorflow/contrib/learn/__init__.py b/tensorflow/contrib/learn/__init__.py
index 05c4024d0b917d098675cd96e0400594b3817e89..aec4911e293a79136095dfccd0851a1a99da6046 100644
--- a/tensorflow/contrib/learn/__init__.py
+++ b/tensorflow/contrib/learn/__init__.py
@@ -88,9 +88,11 @@ from __future__ import print_function
 from tensorflow.contrib.learn.python.learn import *
 # pylint: enable=wildcard-import
 
+from tensorflow.contrib.learn.python.learn import learn_runner
+
 from tensorflow.python.util.all_util import remove_undocumented
 
-_allowed_symbols = ['datasets', 'head', 'io', 'models',
+_allowed_symbols = ['datasets', 'head', 'io', 'learn_runner', 'models',
                     'monitors', 'NotFittedError', 'ops', 'preprocessing',
                     'utils', 'graph_actions']
 
diff --git a/tensorflow/contrib/learn/python/learn/README.md b/tensorflow/contrib/learn/python/learn/README.md
deleted file mode 100644
index d0145f5439395d3f4c704e14f2bad67502e01300..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/learn/python/learn/README.md
+++ /dev/null
@@ -1,247 +0,0 @@
-# TF Learn
-
-TF Learn is a simplified interface for TensorFlow, to get people started on predictive analytics and data mining. The library covers a variety of needs: from linear models to *Deep Learning* applications like text and image understanding.
-
-### Why *TensorFlow*?
-
-* TensorFlow provides a good backbone for building different shapes of machine learning applications.
-* It will continue to evolve both in the distributed direction and as general pipelining machinery.
-
-### Why *TensorFlow Learn*?
-
-- To smooth the transition from the [scikit-learn](http://scikit-learn.org/stable/) world of one-liner machine learning into the more open world of building different shapes of ML models. You can start by using [fit](https://www.tensorflow.org/api_docs/python/tf/contrib/learn/Estimator#fit)/[predict](https://www.tensorflow.org/api_docs/python/tf/contrib/learn/Estimator#predict) and slide into TensorFlow APIs as you are getting comfortable.
-- To provide a set of reference models that will be easy to integrate with existing code.
-
-## Installation
-
-[Install TensorFlow](https://www.tensorflow.org/install/), and then simply import `learn` via `from tensorflow.contrib.learn` or use `tf.contrib.learn`.
-
-Optionally you can install [scikit-learn](http://scikit-learn.org/stable/) and [pandas](http://pandas.pydata.org/) for additional functionality.
-
-### Tutorials
-
--   [TF Learn Quickstart](https://www.tensorflow.org/get_started/tflearn). Build,
-    train, and evaluate a neural network with just a few lines of code.
--   [Input Functions](https://www.tensorflow.org/get_started/input_fn). Learn how
-    to create input functions to feed data into your models.
--   [Linear Model](https://www.tensorflow.org/tutorials/wide). Learn the basics
-    of building linear models.
--   [Wide and Deep Learning](https://www.tensorflow.org/tutorials/wide_and_deep).
-    Jointly train a linear model and a deep neural network.
--   [Logging and Monitoring](https://www.tensorflow.org/get_started/monitors).
-    Use the Monitor API to audit training of a neural network.
--   [Custom Estimators](https://www.tensorflow.org/extend/estimators). Learn
-    how to create a custom estimator.
--   More coming soon.
-
-### Community
-
-- Twitter [#tensorflow](https://twitter.com/search?q=tensorflow&src=typd).
-- StackOverflow with [tensorflow tag](http://stackoverflow.com/questions/tagged/tensorflow) for questions and struggles.
-- GitHub [issues](https://github.com/tensorflow/tensorflow/issues) for technical discussions and feature requests.
-
-### Existing Estimator Implementations
-
--   [`LinearClassifier`](https://www.tensorflow.org/code/tensorflow/contrib/learn/python/learn/estimators/linear.py)
-    ([docs](https://www.tensorflow.org/api_docs/python/tf/contrib/learn/LinearClassifier))
--   [`LinearRegressor`](https://www.tensorflow.org/code/tensorflow/contrib/learn/python/learn/estimators/linear.py)
-    ([docs](https://www.tensorflow.org/api_docs/python/tf/contrib/learn/LinearRegressor))
--   [`DNNClassifier`](https://www.tensorflow.org/code/tensorflow/contrib/learn/python/learn/estimators/dnn.py)
-    ([docs](https://www.tensorflow.org/api_docs/python/tf/contrib/learn/DNNClassifier))
--   [`DNNRegressor`](https://www.tensorflow.org/code/tensorflow/contrib/learn/python/learn/estimators/dnn.py)
-    ([docs](https://www.tensorflow.org/api_docs/python/tf/contrib/learn/DNNRegressor))
--   [`DNNLinearCombinedClassifier`](https://www.tensorflow.org/code/tensorflow/contrib/learn/python/learn/estimators/dnn_linear_combined.py)
-    ([docs](https://www.tensorflow.org/api_docs/python/tf/contrib/learn/DNNLinearCombinedClassifier))
--   [`DNNLinearCombinedRegressor`](https://www.tensorflow.org/code/tensorflow/contrib/learn/python/learn/estimators/dnn_linear_combined.py)
-    ([docs](https://www.tensorflow.org/api_docs/python/tf/contrib/learn/DNNLinearCombinedRegressor))
--   [`SVM`](https://www.tensorflow.org/code/tensorflow/contrib/learn/python/learn/estimators/svm.py)
-    ([docs](https://www.tensorflow.org/code/tensorflow/contrib/learn/python/learn/estimators/g3doc/svm.md))
--   [`GMM`](https://www.tensorflow.org/code/tensorflow/contrib/factorization/python/ops/gmm.py)
-    ([docs](https://www.tensorflow.org/code/tensorflow/contrib/factorization/g3doc/gmm.md))
--   [`KMeansClustering`](https://www.tensorflow.org/code/tensorflow/contrib/learn/python/learn/estimators/kmeans.py)
-    ([docs](https://www.tensorflow.org/code/tensorflow/contrib/factorization/g3doc/kmeans.md))
-
-### Usage Examples
-
-Below are a few simple examples of the API. For more examples, please see [examples](https://www.tensorflow.org/code/tensorflow/examples/learn).
-
-General tips:
-
--  It's useful to rescale a dataset to 0 mean and unit standard deviation before passing it to an [`Estimator`](https://www.tensorflow.org/api_docs/python/tf/contrib/learn/Estimator). [Stochastic Gradient Descent](https://en.wikipedia.org/wiki/Stochastic_gradient_descent) doesn't always do the right thing when variable are at very different scales.
-
--  Categorical variables should be managed before passing input to the estimator.
-
-## Linear Classifier
-
-Simple linear classification:
-
-```python
-import tensorflow.contrib.learn.python.learn as learn
-from sklearn import datasets, metrics
-
-iris = datasets.load_iris()
-feature_columns = learn.infer_real_valued_columns_from_input(iris.data)
-classifier = learn.LinearClassifier(n_classes=3, feature_columns=feature_columns)
-classifier.fit(iris.data, iris.target, steps=200, batch_size=32)
-iris_predictions = list(classifier.predict(iris.data, as_iterable=True))
-score = metrics.accuracy_score(iris.target, iris_predictions)
-print("Accuracy: %f" % score)
-```
-
-## Linear Regressor
-
-Simple linear regression:
-
-```python
-import tensorflow.contrib.learn.python.learn as learn
-from sklearn import datasets, metrics, preprocessing
-
-boston = datasets.load_boston()
-x = preprocessing.StandardScaler().fit_transform(boston.data)
-feature_columns = learn.infer_real_valued_columns_from_input(x)
-regressor = learn.LinearRegressor(feature_columns=feature_columns)
-regressor.fit(x, boston.target, steps=200, batch_size=32)
-boston_predictions = list(regressor.predict(x, as_iterable=True))
-score = metrics.mean_squared_error(boston_predictions, boston.target)
-print ("MSE: %f" % score)
-```
-
-## Deep Neural Network
-
-Example of 3 layer network with 10, 20 and 10 hidden units respectively:
-
-```python
-import tensorflow.contrib.learn.python.learn as learn
-from sklearn import datasets, metrics
-
-iris = datasets.load_iris()
-feature_columns = learn.infer_real_valued_columns_from_input(iris.data)
-classifier = learn.DNNClassifier(hidden_units=[10, 20, 10], n_classes=3, feature_columns=feature_columns)
-classifier.fit(iris.data, iris.target, steps=200, batch_size=32)
-iris_predictions = list(classifier.predict(iris.data, as_iterable=True))
-score = metrics.accuracy_score(iris.target, iris_predictions)
-print("Accuracy: %f" % score)
-```
-
-## Custom model
-
-Example of how to pass a custom model to the Estimator:
-
-```python
-from sklearn import datasets
-from sklearn import metrics
-import tensorflow as tf
-import tensorflow.contrib.layers.python.layers as layers
-import tensorflow.contrib.learn.python.learn as learn
-
-iris = datasets.load_iris()
-
-def my_model(features, labels):
-  """DNN with three hidden layers."""
-  # Convert the labels to a one-hot tensor of shape (length of features, 3) and
-  # with a on-value of 1 for each one-hot vector of length 3.
-  labels = tf.one_hot(labels, 3, 1, 0)
-
-  # Create three fully connected layers respectively of size 10, 20, and 10.
-  features = layers.stack(features, layers.fully_connected, [10, 20, 10])
-
-  # Create two tensors respectively for prediction and loss.
-  prediction, loss = (
-      tf.contrib.learn.models.logistic_regression(features, labels)
-  )
-
-  # Create a tensor for training op.
-  train_op = tf.contrib.layers.optimize_loss(
-      loss, tf.contrib.framework.get_global_step(), optimizer='Adagrad',
-      learning_rate=0.1)
-
-  return {'class': tf.argmax(prediction, 1), 'prob': prediction}, loss, train_op
-
-classifier = learn.Estimator(model_fn=my_model)
-classifier.fit(iris.data, iris.target, steps=1000)
-
-y_predicted = [
-  p['class'] for p in classifier.predict(iris.data, as_iterable=True)]
-score = metrics.accuracy_score(iris.target, y_predicted)
-print('Accuracy: {0:f}'.format(score))
-```
-
-## Saving / Restoring models
-
-Each estimator supports a `model_dir` argument, which takes a folder path where all model information will be saved:
-
-```python
-classifier = learn.DNNClassifier(..., model_dir="/tmp/my_model")
-```
-
-If you run multiple `fit` operations on the same `Estimator`, training will resume where the last operation left off, e.g.:
-
-<pre><strong>classifier = learn.DNNClassifier(..., model_dir="/tmp/my_model")
-classifier.fit(..., steps=300)</strong>
-INFO:tensorflow:Create CheckpointSaverHook
-INFO:tensorflow:loss = 2.40115, step = 1
-INFO:tensorflow:Saving checkpoints for 1 into /tmp/leftoff/model.ckpt.
-INFO:tensorflow:loss = 0.338706, step = 101
-INFO:tensorflow:loss = 0.159414, step = 201
-INFO:tensorflow:Saving checkpoints for 300 into /tmp/leftoff/model.ckpt.
-INFO:tensorflow:Loss for final step: 0.0953846.
-
-<strong>classifier.fit(..., steps=300)</strong>
-INFO:tensorflow:Create CheckpointSaverHook
-INFO:tensorflow:loss = 0.113173, step = 301
-INFO:tensorflow:Saving checkpoints for 301 into /tmp/leftoff/model.ckpt.
-INFO:tensorflow:loss = 0.175782, step = 401
-INFO:tensorflow:loss = 0.119735, step = 501
-INFO:tensorflow:Saving checkpoints for 600 into /tmp/leftoff/model.ckpt.
-INFO:tensorflow:Loss for final step: 0.0518137.</pre>
-
-To restore checkpoints to a new `Estimator`, just pass it the same `model_dir` argument, e.g.:
-
-<pre><strong>classifier = learn.DNNClassifier(..., model_dir="/tmp/my_model")
-classifier.fit(..., steps=300)</strong>
-INFO:tensorflow:Create CheckpointSaverHook
-INFO:tensorflow:loss = 1.16335, step = 1
-INFO:tensorflow:Saving checkpoints for 1 into /tmp/leftoff/model.ckpt.
-INFO:tensorflow:loss = 0.176995, step = 101
-INFO:tensorflow:loss = 0.184573, step = 201
-INFO:tensorflow:Saving checkpoints for 300 into /tmp/leftoff/model.ckpt.
-INFO:tensorflow:Loss for final step: 0.0512496.
-
-<strong>classifier2 = learn.DNNClassifier(..., model_dir="/tmp/my_model")
-classifier2.fit(..., steps=300)</strong>
-INFO:tensorflow:Create CheckpointSaverHook
-INFO:tensorflow:loss = 0.0543797, step = 301
-INFO:tensorflow:Saving checkpoints for 301 into /tmp/leftoff/model.ckpt.
-INFO:tensorflow:loss = 0.101036, step = 401
-INFO:tensorflow:loss = 0.137956, step = 501
-INFO:tensorflow:Saving checkpoints for 600 into /tmp/leftoff/model.ckpt.
-INFO:tensorflow:Loss for final step: 0.0162506.</pre>
-
-## Summaries
-
-If you supply a `model_dir` argument to your `Estimator`s, TensorFlow will write summaries for ``loss`` and histograms for variables in this directory. (You can also add custom summaries in your custom model function by calling [Summary](https://www.tensorflow.org/api_guides/python/summary) operations.)
-
-To view the summaries in TensorBoard, run the following command, where `logdir` is the `model_dir` for your `Estimator`:
-
-```shell
-tensorboard --logdir=/tmp/tf_examples/my_model_1
-```
-
-and then load the reported URL.
-
-**Graph visualization**
-
-![Text classification RNN Graph](https://raw.githubusercontent.com/tensorflow/skflow/master/g3doc/images/text_classification_rnn_graph.png)
-
-**Loss visualization**
-
-![Text classification RNN Loss](https://raw.githubusercontent.com/tensorflow/skflow/master/g3doc/images/text_classification_rnn_loss.png)
-
-## More examples
-
-See the [examples folder](https://www.tensorflow.org/code/tensorflow/examples/learn) for:
-
--  An easy way to handle [categorical variables](https://www.tensorflow.org/code/tensorflow/examples/learn/text_classification.py) (words are just an example of a categorical variable)
--  Text Classification: see examples for [RNN](https://www.tensorflow.org/code/tensorflow/examples/learn/text_classification_character_rnn.py) and [CNN](https://www.tensorflow.org/code/tensorflow/examples/learn/text_classification_character_cnn.py) on characters
--  [Digit recognition using a CNN](https://www.tensorflow.org/code/tensorflow/examples/learn/mnist.py)
--  And much more!
diff --git a/tensorflow/contrib/learn/python/learn/estimators/__init__.py b/tensorflow/contrib/learn/python/learn/estimators/__init__.py
index 118e974c6add1351f33a74943f6c534cd45ab59b..a40cbc0449071d86bd879c330677ec649605523d 100644
--- a/tensorflow/contrib/learn/python/learn/estimators/__init__.py
+++ b/tensorflow/contrib/learn/python/learn/estimators/__init__.py
@@ -323,6 +323,7 @@ from tensorflow.contrib.learn.python.learn.estimators.metric_key import MetricKe
 from tensorflow.contrib.learn.python.learn.estimators.model_fn import ModeKeys
 from tensorflow.contrib.learn.python.learn.estimators.model_fn import ModelFnOps
 from tensorflow.contrib.learn.python.learn.estimators.prediction_key import PredictionKey
+from tensorflow.contrib.learn.python.learn.estimators.rnn_common import PredictionType
 from tensorflow.contrib.learn.python.learn.estimators.run_config import ClusterConfig
 from tensorflow.contrib.learn.python.learn.estimators.run_config import Environment
 from tensorflow.contrib.learn.python.learn.estimators.run_config import RunConfig
diff --git a/tensorflow/contrib/learn/python/learn/estimators/constants.py b/tensorflow/contrib/learn/python/learn/estimators/constants.py
index a62f1815b21d9cd85bc3cf057bf3be3c1958a797..fc69e810244a182b864be856e6720f8584f7aa65 100644
--- a/tensorflow/contrib/learn/python/learn/estimators/constants.py
+++ b/tensorflow/contrib/learn/python/learn/estimators/constants.py
@@ -38,3 +38,8 @@ class ProblemType(object):
   CLASSIFICATION = 1
   LINEAR_REGRESSION = 2
   LOGISTIC_REGRESSION = 3
+
+
+# CollectionDef key for the input feature keys.
+# TODO(b/34388557): This is a stopgap; please follow the bug to learn of changes
+COLLECTION_DEF_KEY_FOR_INPUT_FEATURE_KEYS = "input_feature_keys"
diff --git a/tensorflow/contrib/learn/python/learn/estimators/dnn.py b/tensorflow/contrib/learn/python/learn/estimators/dnn.py
index 285ed4c186eca552c8e54cb0963eec6da9aa19a7..5e6288af99e1cdcd7b7f7bd7f51bddd41cb31c37 100644
--- a/tensorflow/contrib/learn/python/learn/estimators/dnn.py
+++ b/tensorflow/contrib/learn/python/learn/estimators/dnn.py
@@ -24,6 +24,7 @@ from tensorflow.contrib import layers
 from tensorflow.contrib.framework import deprecated
 from tensorflow.contrib.framework import deprecated_arg_values
 from tensorflow.contrib.framework.python.ops import variables as contrib_variables
+from tensorflow.contrib.layers.python.layers import feature_column
 from tensorflow.contrib.layers.python.layers import optimizers
 from tensorflow.contrib.learn.python.learn import metric_spec
 from tensorflow.contrib.learn.python.learn.estimators import dnn_linear_combined
@@ -32,6 +33,7 @@ from tensorflow.contrib.learn.python.learn.estimators import head as head_lib
 from tensorflow.contrib.learn.python.learn.estimators import model_fn
 from tensorflow.contrib.learn.python.learn.estimators import prediction_key
 from tensorflow.contrib.learn.python.learn.utils import export
+from tensorflow.python.feature_column import feature_column as fc_core
 from tensorflow.python.ops import nn
 from tensorflow.python.ops import partitioned_variables
 from tensorflow.python.ops import variable_scope
@@ -125,11 +127,20 @@ def _dnn_model_fn(features, labels, mode, params, config=None):
         "input_from_feature_columns",
         values=tuple(six.itervalues(features)),
         partitioner=input_layer_partitioner) as input_layer_scope:
-      net = layers.input_from_feature_columns(
-          columns_to_tensors=features,
-          feature_columns=feature_columns,
-          weight_collections=[parent_scope],
-          scope=input_layer_scope)
+      if all([
+          isinstance(fc, feature_column._FeatureColumn)  # pylint: disable=protected-access
+          for fc in feature_columns
+      ]):
+        net = layers.input_from_feature_columns(
+            columns_to_tensors=features,
+            feature_columns=feature_columns,
+            weight_collections=[parent_scope],
+            scope=input_layer_scope)
+      else:
+        net = fc_core.input_layer(
+            features=features,
+            feature_columns=feature_columns,
+            weight_collections=[parent_scope])
 
     for layer_id, num_hidden_units in enumerate(hidden_units):
       with variable_scope.variable_scope(
diff --git a/tensorflow/contrib/learn/python/learn/estimators/dnn_linear_combined.py b/tensorflow/contrib/learn/python/learn/estimators/dnn_linear_combined.py
index 789b3b39f30a0515c6f0f5c9b06e751ffcd28f8e..726612235050def6e7addb503cc6646a25de0e42 100644
--- a/tensorflow/contrib/learn/python/learn/estimators/dnn_linear_combined.py
+++ b/tensorflow/contrib/learn/python/learn/estimators/dnn_linear_combined.py
@@ -33,6 +33,7 @@ from tensorflow.contrib.learn.python.learn.estimators import head as head_lib
 from tensorflow.contrib.learn.python.learn.estimators import model_fn
 from tensorflow.contrib.learn.python.learn.estimators import prediction_key
 from tensorflow.contrib.learn.python.learn.utils import export
+from tensorflow.python.feature_column import feature_column as fc_core
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import nn
@@ -102,9 +103,10 @@ def _linear_learning_rate(num_linear_feature_columns):
 def _add_hidden_layer_summary(value, tag):
   summary.scalar("%s/fraction_of_zero_values" % tag, nn.zero_fraction(value))
   summary.histogram("%s/activation" % tag, value)
+
+
 def _add_layer_summary(value, tag):
-  summary.scalar("%s/fraction_of_zero_values" % tag,
-                             nn.zero_fraction(value))
+  summary.scalar("%s/fraction_of_zero_values" % tag, nn.zero_fraction(value))
   summary.histogram("%s/activation" % tag, value)
 
 
@@ -229,11 +231,20 @@ def _dnn_linear_combined_model_fn(features, labels, mode, params, config=None):
           "input_from_feature_columns",
           values=tuple(six.itervalues(features)),
           partitioner=input_layer_partitioner) as dnn_input_scope:
-        net = layers.input_from_feature_columns(
-            columns_to_tensors=features,
-            feature_columns=dnn_feature_columns,
-            weight_collections=[dnn_parent_scope],
-            scope=dnn_input_scope)
+        if all([
+            isinstance(fc, feature_column_lib._FeatureColumn)  # pylint: disable=protected-access
+            for fc in dnn_feature_columns
+        ]):
+          net = layers.input_from_feature_columns(
+              columns_to_tensors=features,
+              feature_columns=dnn_feature_columns,
+              weight_collections=[dnn_parent_scope],
+              scope=dnn_input_scope)
+        else:
+          net = fc_core.input_layer(
+              features=features,
+              feature_columns=dnn_feature_columns,
+              weight_collections=[dnn_parent_scope])
 
       for layer_id, num_hidden_units in enumerate(dnn_hidden_units):
         with variable_scope.variable_scope(
@@ -276,20 +287,29 @@ def _dnn_linear_combined_model_fn(features, labels, mode, params, config=None):
         linear_parent_scope,
         values=tuple(six.itervalues(features)),
         partitioner=linear_partitioner) as scope:
-      if joint_linear_weights:
-        linear_logits, _, _ = layers.joint_weighted_sum_from_feature_columns(
-            columns_to_tensors=features,
-            feature_columns=linear_feature_columns,
-            num_outputs=head.logits_dimension,
-            weight_collections=[linear_parent_scope],
-            scope=scope)
+      if all([isinstance(fc, feature_column_lib._FeatureColumn)  # pylint: disable=protected-access
+              for fc in linear_feature_columns]):
+        if joint_linear_weights:
+          linear_logits, _, _ = layers.joint_weighted_sum_from_feature_columns(
+              columns_to_tensors=features,
+              feature_columns=linear_feature_columns,
+              num_outputs=head.logits_dimension,
+              weight_collections=[linear_parent_scope],
+              scope=scope)
+        else:
+          linear_logits, _, _ = layers.weighted_sum_from_feature_columns(
+              columns_to_tensors=features,
+              feature_columns=linear_feature_columns,
+              num_outputs=head.logits_dimension,
+              weight_collections=[linear_parent_scope],
+              scope=scope)
       else:
-        linear_logits, _, _ = layers.weighted_sum_from_feature_columns(
-            columns_to_tensors=features,
+        linear_logits = fc_core.linear_model(
+            features=features,
             feature_columns=linear_feature_columns,
-            num_outputs=head.logits_dimension,
-            weight_collections=[linear_parent_scope],
-            scope=scope)
+            units=head.logits_dimension,
+            weight_collections=[linear_parent_scope])
+
       _add_layer_summary(linear_logits, scope.name)
 
   # Combine logits and build full model.
@@ -503,9 +523,36 @@ class DNNLinearCombinedClassifier(estimator.Estimator):
     ...
   def input_fn_eval: # returns x, y (where y represents label's class index).
     ...
+  def input_fn_predict: # returns x, None.
+    ...
   estimator.fit(input_fn=input_fn_train)
   estimator.evaluate(input_fn=input_fn_eval)
-  estimator.predict(x=x) # returns predicted labels (i.e. label's class index).
+  # predict_classes returns class indices.
+  estimator.predict_classes(input_fn=input_fn_predict)
+  ```
+
+  If the user specifies `label_keys` in constructor, labels must be strings from
+  the `label_keys` vocabulary. Example:
+
+  ```python
+  label_keys = ['label0', 'label1', 'label2']
+  estimator = DNNLinearCombinedClassifier(
+      n_classes=n_classes,
+      linear_feature_columns=[sparse_feature_a_x_sparse_feature_b],
+      dnn_feature_columns=[sparse_feature_a_emb, sparse_feature_b_emb],
+      dnn_hidden_units=[1000, 500, 100],
+      label_keys=label_keys)
+
+  def input_fn_train: # returns x, y (where y is one of label_keys).
+    pass
+  estimator.fit(input_fn=input_fn_train)
+
+  def input_fn_eval: # returns x, y (where y is one of label_keys).
+    pass
+  estimator.evaluate(input_fn=input_fn_eval)
+  def input_fn_predict: # returns x, None
+  # predict_classes returns one of label_keys.
+  estimator.predict_classes(input_fn=input_fn_predict)
   ```
 
   Input of `fit` and `evaluate` should have following features,
@@ -545,6 +592,7 @@ class DNNLinearCombinedClassifier(estimator.Estimator):
                feature_engineering_fn=None,
                embedding_lr_multipliers=None,
                input_layer_min_slice_size=None,
+               label_keys=None,
                fix_global_step_increment_bug=False):
     """Constructs a DNNLinearCombinedClassifier instance.
 
@@ -596,6 +644,8 @@ class DNNLinearCombinedClassifier(estimator.Estimator):
         learning rate for the embedding variables.
       input_layer_min_slice_size: Optional. The min slice size of input layer
         partitions. If not provided, will use the default of 64M.
+      label_keys: Optional list of strings with size `[n_classes]` defining the
+        label vocabulary. Only supported for `n_classes` > 2.
       fix_global_step_increment_bug: If `False`, the estimator needs two fit
         steps to optimize both linear and dnn parts. If `True`, this bug is
         fixed. New users must set this to `True`, but it the default value is
@@ -609,7 +659,8 @@ class DNNLinearCombinedClassifier(estimator.Estimator):
     head = head_lib.multi_class_head(
         n_classes=n_classes,
         weight_column_name=weight_column_name,
-        enable_centered_bias=enable_centered_bias)
+        enable_centered_bias=enable_centered_bias,
+        label_keys=label_keys)
     linear_feature_columns = tuple(linear_feature_columns or [])
     dnn_feature_columns = tuple(dnn_feature_columns or [])
     self._feature_columns = linear_feature_columns + dnn_feature_columns
@@ -820,9 +871,11 @@ class DNNLinearCombinedRegressor(estimator.Estimator):
     ...
   def input_fn_eval: # returns x, y
     ...
+  def input_fn_predict: # returns x, None
+    ...
   estimator.train(input_fn_train)
   estimator.evaluate(input_fn_eval)
-  estimator.predict(x)
+  estimator.predict(input_fn_predict)
   ```
 
   Input of `fit`, `train`, and `evaluate` should have following features,
diff --git a/tensorflow/contrib/learn/python/learn/estimators/dnn_linear_combined_test.py b/tensorflow/contrib/learn/python/learn/estimators/dnn_linear_combined_test.py
index 301211ee82233e55a340ea706ca28dc82bd7dcc0..181a8cab1ce5f325686b51db2fb4fc2c7ee35110 100644
--- a/tensorflow/contrib/learn/python/learn/estimators/dnn_linear_combined_test.py
+++ b/tensorflow/contrib/learn/python/learn/estimators/dnn_linear_combined_test.py
@@ -36,6 +36,7 @@ from tensorflow.contrib.learn.python.learn.estimators import run_config
 from tensorflow.contrib.learn.python.learn.estimators import test_data
 from tensorflow.contrib.learn.python.learn.metric_spec import MetricSpec
 from tensorflow.contrib.metrics.python.ops import metric_ops
+from tensorflow.python.feature_column import feature_column as fc_core
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
@@ -420,6 +421,52 @@ class DNNLinearCombinedClassifierTest(test.TestCase):
     scores = classifier.evaluate(input_fn=_input_fn, steps=100)
     _assert_metrics_in_range(('accuracy', 'auc'), scores)
 
+  def testEstimatorWithCoreFeatureColumns(self):
+    """Tests binary classification using Tensor data as input."""
+
+    def _input_fn():
+      iris = test_data.prepare_iris_data_for_logistic_regression()
+      features = {}
+      for i in range(4):
+        # The following shows how to provide the Tensor data for
+        # RealValuedColumns.
+        features.update({
+            str(i):
+                array_ops.reshape(
+                    constant_op.constant(iris.data[:, i], dtype=dtypes.float32),
+                    [-1, 1])
+        })
+      # The following shows how to provide the SparseTensor data for
+      # a SparseColumn.
+      features['dummy_sparse_column'] = sparse_tensor.SparseTensor(
+          values=['en', 'fr', 'zh'],
+          indices=[[0, 0], [0, 1], [60, 0]],
+          dense_shape=[len(iris.target), 2])
+      labels = array_ops.reshape(
+          constant_op.constant(iris.target, dtype=dtypes.int32), [-1, 1])
+      return features, labels
+
+    iris = test_data.prepare_iris_data_for_logistic_regression()
+    cont_features = [fc_core.numeric_column(str(i)) for i in range(4)]
+    linear_features = [
+        fc_core.bucketized_column(
+            cont_features[i],
+            sorted(set(test_data.get_quantile_based_buckets(
+                iris.data[:, i], 10)))) for i in range(4)
+    ]
+    linear_features.append(
+        fc_core.categorical_column_with_hash_bucket(
+            'dummy_sparse_column', hash_bucket_size=100))
+
+    classifier = dnn_linear_combined.DNNLinearCombinedClassifier(
+        linear_feature_columns=linear_features,
+        dnn_feature_columns=cont_features,
+        dnn_hidden_units=[3, 3])
+
+    classifier.fit(input_fn=_input_fn, steps=100)
+    scores = classifier.evaluate(input_fn=_input_fn, steps=100)
+    _assert_metrics_in_range(('accuracy', 'auc'), scores)
+
   def testTrainWithPartitionedVariables(self):
     """Tests training with partitioned variables."""
 
@@ -493,6 +540,59 @@ class DNNLinearCombinedClassifierTest(test.TestCase):
         input_fn=test_data.iris_input_multiclass_fn, steps=100)
     _assert_metrics_in_range(('accuracy',), scores)
 
+  def testMultiClassLabelKeys(self):
+    """Tests n_classes > 2 with label_keys vocabulary for labels."""
+    # Byte literals needed for python3 test to pass.
+    label_keys = [b'label0', b'label1', b'label2']
+
+    def _input_fn(num_epochs=None):
+      features = {
+          'age':
+              input_lib.limit_epochs(
+                  constant_op.constant([[.8], [0.2], [.1]]),
+                  num_epochs=num_epochs),
+          'language':
+              sparse_tensor.SparseTensor(
+                  values=input_lib.limit_epochs(
+                      ['en', 'fr', 'zh'], num_epochs=num_epochs),
+                  indices=[[0, 0], [0, 1], [2, 0]],
+                  dense_shape=[3, 2])
+      }
+      labels = constant_op.constant(
+          [[label_keys[1]], [label_keys[0]], [label_keys[0]]],
+          dtype=dtypes.string)
+      return features, labels
+
+    language_column = feature_column.sparse_column_with_hash_bucket(
+        'language', hash_bucket_size=20)
+
+    classifier = dnn_linear_combined.DNNLinearCombinedClassifier(
+        n_classes=3,
+        linear_feature_columns=[language_column],
+        dnn_feature_columns=[
+            feature_column.embedding_column(
+                language_column, dimension=1),
+            feature_column.real_valued_column('age')
+        ],
+        dnn_hidden_units=[3, 3],
+        label_keys=label_keys)
+
+    classifier.fit(input_fn=_input_fn, steps=50)
+
+    scores = classifier.evaluate(input_fn=_input_fn, steps=1)
+    _assert_metrics_in_range(('accuracy',), scores)
+    self.assertIn('loss', scores)
+    predict_input_fn = functools.partial(_input_fn, num_epochs=1)
+    predicted_classes = list(
+        classifier.predict_classes(
+            input_fn=predict_input_fn, as_iterable=True))
+    self.assertEqual(3, len(predicted_classes))
+    for pred in predicted_classes:
+      self.assertIn(pred, label_keys)
+    predictions = list(
+        classifier.predict(input_fn=predict_input_fn, as_iterable=True))
+    self.assertAllEqual(predicted_classes, predictions)
+
   def testLoss(self):
     """Tests loss calculation."""
 
diff --git a/tensorflow/contrib/learn/python/learn/estimators/dnn_test.py b/tensorflow/contrib/learn/python/learn/estimators/dnn_test.py
index 193091511990d8c947f39a1e1d87f454207e6db0..615af24cd306d1b384b4668b91d715f251ffee01 100644
--- a/tensorflow/contrib/learn/python/learn/estimators/dnn_test.py
+++ b/tensorflow/contrib/learn/python/learn/estimators/dnn_test.py
@@ -38,6 +38,7 @@ from tensorflow.contrib.learn.python.learn.estimators import run_config
 from tensorflow.contrib.learn.python.learn.estimators import test_data
 from tensorflow.contrib.learn.python.learn.metric_spec import MetricSpec
 from tensorflow.contrib.metrics.python.ops import metric_ops
+from tensorflow.python.feature_column import feature_column as fc_core
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import sparse_tensor
@@ -325,6 +326,49 @@ class DNNClassifierTest(test.TestCase):
       for i in range(expected_n_classes):
         self._assertInRange(0.0, 1.0, probabilities[b][i])
 
+  def testEstimatorWithCoreFeatureColumns(self):
+
+    def _input_fn(num_epochs=None):
+      features = {
+          'age':
+              input_lib.limit_epochs(
+                  constant_op.constant([[.8], [0.2], [.1]]),
+                  num_epochs=num_epochs),
+          'language':
+              sparse_tensor.SparseTensor(
+                  values=input_lib.limit_epochs(
+                      ['en', 'fr', 'zh'], num_epochs=num_epochs),
+                  indices=[[0, 0], [0, 1], [2, 0]],
+                  dense_shape=[3, 2])
+      }
+      return features, constant_op.constant([[1], [0], [0]], dtype=dtypes.int32)
+
+    language_column = fc_core.categorical_column_with_hash_bucket(
+        'language', hash_bucket_size=20)
+    feature_columns = [
+        fc_core.embedding_column(language_column, dimension=1),
+        fc_core.numeric_column('age')
+    ]
+
+    classifier = dnn.DNNClassifier(
+        n_classes=2,
+        feature_columns=feature_columns,
+        hidden_units=[10, 10],
+        config=run_config.RunConfig(tf_random_seed=1))
+
+    classifier.fit(input_fn=_input_fn, steps=50)
+
+    scores = classifier.evaluate(input_fn=_input_fn, steps=1)
+    self._assertInRange(0.0, 1.0, scores['accuracy'])
+    self.assertIn('loss', scores)
+    predict_input_fn = functools.partial(_input_fn, num_epochs=1)
+    predicted_classes = list(
+        classifier.predict_classes(input_fn=predict_input_fn, as_iterable=True))
+    self._assertBinaryPredictions(3, predicted_classes)
+    predictions = list(
+        classifier.predict(input_fn=predict_input_fn, as_iterable=True))
+    self.assertAllEqual(predicted_classes, predictions)
+
   def testLogisticRegression_TensorData(self):
     """Tests binary classification using tensor data as input."""
 
diff --git a/tensorflow/contrib/learn/python/learn/estimators/dynamic_rnn_estimator.py b/tensorflow/contrib/learn/python/learn/estimators/dynamic_rnn_estimator.py
index cf80dafc37c21b1ec9b86eae18cf86f7115006f1..1724d7599d09873f969555cc9382c0753eba463f 100644
--- a/tensorflow/contrib/learn/python/learn/estimators/dynamic_rnn_estimator.py
+++ b/tensorflow/contrib/learn/python/learn/estimators/dynamic_rnn_estimator.py
@@ -19,7 +19,6 @@ from __future__ import division
 from __future__ import print_function
 
 from tensorflow.contrib import layers
-from tensorflow.contrib.framework.python.framework import deprecated
 from tensorflow.contrib.layers.python.layers import optimizers
 from tensorflow.contrib.learn.python.learn.estimators import constants
 from tensorflow.contrib.learn.python.learn.estimators import estimator
@@ -116,7 +115,7 @@ def dict_to_state_tuple(input_dict, cell):
 
 
 def _concatenate_context_input(sequence_input, context_input):
-  """Replicates `context_input` accross all timesteps of `sequence_input`.
+  """Replicates `context_input` across all timesteps of `sequence_input`.
 
   Expands dimension 1 of `context_input` then tiles it `sequence_length` times.
   This value is appended to `sequence_input` on dimension 2 and the result is
@@ -178,7 +177,7 @@ def build_sequence_input(features,
       describing sequence features. All items in the set should be instances
       of classes derived from `FeatureColumn`.
     context_feature_columns: An iterable containing all the feature columns
-      describing context features i.e. features that apply accross all time
+      describing context features i.e. features that apply across all time
       steps. All items in the set should be instances of classes derived from
       `FeatureColumn`.
     weight_collections: List of graph collections to which weights are added.
@@ -420,7 +419,7 @@ def _get_dynamic_rnn_model_fn(
       describing sequence features. All items in the set should be instances
       of classes derived from `FeatureColumn`.
     context_feature_columns: An iterable containing all the feature columns
-      describing context features, i.e., features that apply accross all time
+      describing context features, i.e., features that apply across all time
       steps. All items in the set should be instances of classes derived from
       `FeatureColumn`.
     predict_probabilities: A boolean indicating whether to predict probabilities
@@ -598,13 +597,13 @@ class DynamicRnnEstimator(estimator.Estimator):
         `ProblemType.CLASSIFICATION` or `ProblemType.LINEAR_REGRESSION`.
       prediction_type: whether the `Estimator` should return a value for each
         step in the sequence, or just a single value for the final time step.
-        Must be one of `ProblemType.SINGLE_VALUE` or
-        `ProblemType.MULTIPLE_VALUE`.
+        Must be one of `PredictionType.SINGLE_VALUE` or
+        `PredictionType.MULTIPLE_VALUE`.
       sequence_feature_columns: An iterable containing all the feature columns
         describing sequence features. All items in the iterable should be
         instances of classes derived from `FeatureColumn`.
       context_feature_columns: An iterable containing all the feature columns
-        describing context features, i.e., features that apply accross all time
+        describing context features, i.e., features that apply across all time
         steps. All items in the set should be instances of classes derived from
         `FeatureColumn`.
       num_classes: the number of classes for a classification problem. Only
diff --git a/tensorflow/contrib/learn/python/learn/estimators/dynamic_rnn_estimator_test.py b/tensorflow/contrib/learn/python/learn/estimators/dynamic_rnn_estimator_test.py
index 61a6168a9eb09c8915a11219ea20c4c77d9884ad..6fc028ab7069eaca46a736f1e96b36e31771a3bd 100644
--- a/tensorflow/contrib/learn/python/learn/estimators/dynamic_rnn_estimator_test.py
+++ b/tensorflow/contrib/learn/python/learn/estimators/dynamic_rnn_estimator_test.py
@@ -38,8 +38,8 @@ from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import random_seed
 from tensorflow.python.framework import sparse_tensor
 from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import data_flow_ops
 from tensorflow.python.ops import functional_ops
+from tensorflow.python.ops import lookup_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import random_ops
 from tensorflow.python.ops import variables
@@ -157,7 +157,7 @@ class DynamicRnnEstimatorTest(test.TestCase):
         self.context_feature_columns)
     with self.test_session() as sess:
       sess.run(variables.global_variables_initializer())
-      sess.run(data_flow_ops.tables_initializer())
+      sess.run(lookup_ops.tables_initializer())
       sequence_input_val = sess.run(sequence_input)
     expected_shape = np.array([
         3,  # expected batch size
@@ -178,7 +178,7 @@ class DynamicRnnEstimatorTest(test.TestCase):
     # Obtain values of activations and final state.
     with session.Session() as sess:
       sess.run(variables.global_variables_initializer())
-      sess.run(data_flow_ops.tables_initializer())
+      sess.run(lookup_ops.tables_initializer())
       activations, final_state = sess.run([activations_t, final_state_t])
 
     expected_activations_shape = np.array([3, 2, self.NUM_LABEL_COLUMNS])
diff --git a/tensorflow/contrib/learn/python/learn/estimators/estimator.py b/tensorflow/contrib/learn/python/learn/estimators/estimator.py
index 8a92809a0cec6fe8a41d56ef27d8ec524c36d992..9710e08029f5c95f9dd75121bf018f2b1d203b3b 100644
--- a/tensorflow/contrib/learn/python/learn/estimators/estimator.py
+++ b/tensorflow/contrib/learn/python/learn/estimators/estimator.py
@@ -40,6 +40,7 @@ from tensorflow.contrib.learn.python.learn import metric_spec
 from tensorflow.contrib.learn.python.learn import monitors as monitor_lib
 from tensorflow.contrib.learn.python.learn import trainable
 from tensorflow.contrib.learn.python.learn.estimators import _sklearn as sklearn
+from tensorflow.contrib.learn.python.learn.estimators import constants
 from tensorflow.contrib.learn.python.learn.estimators import metric_key
 from tensorflow.contrib.learn.python.learn.estimators import model_fn as model_fn_lib
 from tensorflow.contrib.learn.python.learn.estimators import run_config
@@ -56,7 +57,7 @@ from tensorflow.python.framework import ops
 from tensorflow.python.framework import random_seed
 from tensorflow.python.framework import sparse_tensor
 from tensorflow.python.ops import control_flow_ops
-from tensorflow.python.ops import data_flow_ops
+from tensorflow.python.ops import lookup_ops
 from tensorflow.python.ops import resources
 from tensorflow.python.ops import variables
 from tensorflow.python.platform import gfile
@@ -330,13 +331,19 @@ def _write_dict_to_summary(output_dir,
   for key in dictionary:
     if dictionary[key] is None:
       continue
+    if key  == "global_step":
+      continue
     value = summary_proto.value.add()
     value.tag = key
-    if (isinstance(dictionary[key], np.float32) or
+    if (isinstance(dictionary[key], np.float32) or 
         isinstance(dictionary[key], float)):
       value.simple_value = float(dictionary[key])
+    elif (isinstance(dictionary[key], np.int64) or
+          isinstance(dictionary[key], np.int32) or
+          isinstance(dictionary[key], int)):
+      value.simple_value = int(dictionary[key])
     else:
-      logging.warn('Skipping summary for %s, must be a float or np.float32.',
+      logging.warn('Skipping summary for %s, must be a float, np.float32, np.int64, np.int32 or int.',
                    key)
   summary_writer.add_summary(summary_proto, current_global_step)
   summary_writer.flush()
@@ -371,7 +378,6 @@ class BaseEstimator(
       logging.info('Using default config.')
     else:
       self._config = config
-    logging.info('Using config: %s', str(vars(self._config)))
 
     if self._config.session_config is None:
       self._session_config = config_pb2.ConfigProto(allow_soft_placement=True)
@@ -396,6 +402,7 @@ class BaseEstimator(
                       self._model_dir)
     if self._config.model_dir is None:
       self._config = self._config.replace(model_dir=self._model_dir)
+    logging.info('Using config: %s', str(vars(self._config)))
 
     # Set device function depending if there are replicas or not.
     self._device_fn = _get_replica_device_setter(self._config)
@@ -965,7 +972,8 @@ class BaseEstimator(
             saver.Saver(
                 sharded=True,
                 max_to_keep=self._config.keep_checkpoint_max,
-                defer_build=True))
+                defer_build=True,
+                save_relative_paths=True))
 
       chief_hooks = []
       if (self._config.save_checkpoints_secs or
@@ -1083,8 +1091,9 @@ class Estimator(BaseEstimator):
       # Check number of arguments of the given function matches requirements.
       model_fn_args = _model_fn_args(model_fn)
       if params is not None and 'params' not in model_fn_args:
-        raise ValueError('Estimator\'s model_fn (%s) has less than 4 '
-                         'arguments, but not None params (%s) are passed.' %
+        raise ValueError('Estimator\'s model_fn (%s) does not have a params '
+                         'argument, but params (%s) were passed to the '
+                         'Estimator\'s constructor.' %
                          (model_fn, params))
       if params is None and 'params' in model_fn_args:
         logging.warning('Estimator\'s model_fn (%s) includes params '
@@ -1251,6 +1260,13 @@ class Estimator(BaseEstimator):
       input_alternatives, features = (
           saved_model_export_utils.get_input_alternatives(input_ops))
 
+      # TODO(b/34388557) This is a stopgap, pending recording model provenance.
+      # Record which features are expected at serving time.  It is assumed that
+      # these are the features that were used in training.
+      for feature_key in input_ops.features.keys():
+        ops.add_to_collection(
+            constants.COLLECTION_DEF_KEY_FOR_INPUT_FEATURE_KEYS, feature_key)
+
       # Call the model_fn and collect the output alternatives.
       model_fn_ops = self._call_model_fn(features, None,
                                          model_fn_lib.ModeKeys.INFER)
@@ -1279,14 +1295,11 @@ class Estimator(BaseEstimator):
       else:
         saver_for_restore = saver.Saver(sharded=True)
       with tf_session.Session('') as session:
-        variables.initialize_local_variables()
-        data_flow_ops.tables_initializer()
-        resources.initialize_resources(resources.shared_resources())
         saver_for_restore.restore(session, checkpoint_path)
         init_op = control_flow_ops.group(
             variables.local_variables_initializer(),
             resources.initialize_resources(resources.shared_resources()),
-            data_flow_ops.tables_initializer())
+            lookup_ops.tables_initializer())
 
         # Perform the export
         builder = saved_model_builder.SavedModelBuilder(export_dir)
diff --git a/tensorflow/contrib/learn/python/learn/estimators/estimator_test.py b/tensorflow/contrib/learn/python/learn/estimators/estimator_test.py
index 6e10fdb97760ae8e58229d2b1ecdb337998bcd9b..c95df75356b70663180c5e3fbb5bb5b6d84aeffa 100644
--- a/tensorflow/contrib/learn/python/learn/estimators/estimator_test.py
+++ b/tensorflow/contrib/learn/python/learn/estimators/estimator_test.py
@@ -28,6 +28,8 @@ import numpy as np
 import six
 from six.moves import xrange  # pylint: disable=redefined-builtin
 
+from google.protobuf import text_format
+
 from tensorflow.contrib import learn
 from tensorflow.contrib import lookup
 from tensorflow.contrib.framework.python.ops import variables
@@ -38,6 +40,7 @@ from tensorflow.contrib.learn.python.learn import models
 from tensorflow.contrib.learn.python.learn import monitors as monitors_lib
 from tensorflow.contrib.learn.python.learn.datasets import base
 from tensorflow.contrib.learn.python.learn.estimators import _sklearn
+from tensorflow.contrib.learn.python.learn.estimators import constants
 from tensorflow.contrib.learn.python.learn.estimators import estimator
 from tensorflow.contrib.learn.python.learn.estimators import linear
 from tensorflow.contrib.learn.python.learn.estimators import model_fn
@@ -49,6 +52,7 @@ from tensorflow.python.client import session as session_lib
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
+from tensorflow.python.lib.io import file_io
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import check_ops
 from tensorflow.python.ops import control_flow_ops
@@ -60,6 +64,7 @@ from tensorflow.python.platform import test
 from tensorflow.python.saved_model import loader
 from tensorflow.python.saved_model import tag_constants
 from tensorflow.python.training import basic_session_run_hooks
+from tensorflow.python.training import checkpoint_state_pb2
 from tensorflow.python.training import input as input_lib
 from tensorflow.python.training import monitored_session
 from tensorflow.python.training import saver as saver_lib
@@ -673,6 +678,38 @@ class EstimatorTest(test.TestCase):
         metrics={'MSE': metric_ops.streaming_mean_squared_error})
     self.assertLess(scores3['MSE'], scores['MSE'])
 
+  def test_checkpoint_contains_relative_paths(self):
+    tmpdir = tempfile.mkdtemp()
+    est = estimator.Estimator(
+        model_dir=tmpdir,
+        model_fn=linear_model_fn_with_model_fn_ops)
+    est.fit(input_fn=boston_input_fn, steps=5)
+
+    checkpoint_file_content = file_io.read_file_to_string(
+        os.path.join(tmpdir, 'checkpoint'))
+    ckpt = checkpoint_state_pb2.CheckpointState()
+    text_format.Merge(checkpoint_file_content, ckpt)
+    self.assertEqual(ckpt.model_checkpoint_path, 'model.ckpt-5')
+    self.assertAllEqual(
+        ['model.ckpt-1', 'model.ckpt-5'], ckpt.all_model_checkpoint_paths)
+
+  def test_train_save_copy_reload(self):
+    tmpdir = tempfile.mkdtemp()
+    model_dir1 = os.path.join(tmpdir, 'model_dir1')
+    est1 = estimator.Estimator(
+        model_dir=model_dir1,
+        model_fn=linear_model_fn_with_model_fn_ops)
+    est1.fit(input_fn=boston_input_fn, steps=5)
+
+    model_dir2 = os.path.join(tmpdir, 'model_dir2')
+    os.renames(model_dir1, model_dir2)
+    est2 = estimator.Estimator(
+        model_dir=model_dir2,
+        model_fn=linear_model_fn_with_model_fn_ops)
+    self.assertEqual(5, est2.get_variable_value('global_step'))
+    est2.fit(input_fn=boston_input_fn, steps=5)
+    self.assertEqual(10, est2.get_variable_value('global_step'))
+
   def testEstimatorParams(self):
     boston = base.load_boston()
     est = estimator.SKCompat(
@@ -909,6 +946,10 @@ class EstimatorTest(test.TestCase):
         self.assertTrue('input_example_tensor' in graph_ops)
         self.assertTrue('ParseExample/ParseExample' in graph_ops)
         self.assertTrue('linear/linear/feature/matmul' in graph_ops)
+        self.assertSameElements(
+            ['bogus_lookup', 'feature'],
+            graph.get_collection(
+                constants.COLLECTION_DEF_KEY_FOR_INPUT_FEATURE_KEYS))
 
     # cleanup
     gfile.DeleteRecursively(tmpdir)
diff --git a/tensorflow/contrib/learn/python/learn/estimators/head.py b/tensorflow/contrib/learn/python/learn/estimators/head.py
index 12af78398b25ac67789f736c6b95c59f67298c49..e4ef6996d8d707d3d18f4144c802a7b3f96b817d 100644
--- a/tensorflow/contrib/learn/python/learn/estimators/head.py
+++ b/tensorflow/contrib/learn/python/learn/estimators/head.py
@@ -37,6 +37,7 @@ from tensorflow.python.framework import ops
 from tensorflow.python.framework import sparse_tensor
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import logging_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import nn
 from tensorflow.python.ops import sparse_ops
@@ -162,10 +163,10 @@ class Head(object):
           ModeFnOps.loss to compute and apply gradients.
       logits: logits `Tensor` to be used by the head.
       logits_input: `Tensor` from which to build logits, often needed when you
-        don't want to compute the logits. Typicaly this is the activation of the
-        last hidden layer in a DNN. Some heads (like the ones responsible for
-        candidate sampling) intrinsically avoid computing full logits and only
-        accepts logits_input.
+        don't want to compute the logits. Typically this is the activation of
+        the last hidden layer in a DNN. Some heads (like the ones responsible
+        for candidate sampling) intrinsically avoid computing full logits and
+        only accepts logits_input.
       scope: Optional scope for `variable_scope`.
 
     Returns:
@@ -378,7 +379,12 @@ def multi_label_head(n_classes,
                      loss_fn=None):
   """Creates a Head for multi label classification.
 
-  The Head uses sigmoid cross entropy loss.
+  Multi-label classification handles the case where each example may have zero
+  or more associated labels, from a discrete set.  This is distinct from
+  `multi_class_head` which has exactly one label from a discrete set.
+
+  This head by default uses sigmoid cross entropy loss, which expects as input
+  a multi-hot tensor of shape `(batch_size, num_classes)`.
 
   Args:
     n_classes: Integer, number of classes, must be >= 2
@@ -613,7 +619,9 @@ def _create_model_fn_ops(features,
   if (mode != model_fn.ModeKeys.INFER) and (labels is not None):
     weight_tensor = _weight_tensor(features, weight_column_name)
     loss, weighted_average_loss = loss_fn(labels, logits, weight_tensor)
-    summary.scalar(
+    # Uses the deprecated API to set the tag explicitly.
+    # Without it, trianing and eval losses will show up in different graphs.
+    logging_ops.scalar_summary(
         _summary_key(head_name, mkey.LOSS), weighted_average_loss)
 
     if mode == model_fn.ModeKeys.TRAIN:
@@ -918,12 +926,21 @@ def _softmax_cross_entropy_loss(labels, logits, weights=None):
     if not labels.dtype.is_integer:
       raise ValueError("Labels dtype should be integer "
                        "Instead got %s." % labels.dtype)
-    # TODO(ptucker): This will break for dynamic shapes.
+
     # sparse_softmax_cross_entropy_with_logits requires [batch_size] labels.
+    is_squeezed_labels = False
+    # TODO(ptucker): This will break for dynamic shapes.
     if len(labels.get_shape()) == 2:
       labels = array_ops.squeeze(labels, squeeze_dims=(1,))
+      is_squeezed_labels = True
+
     loss = nn.sparse_softmax_cross_entropy_with_logits(
         labels=labels, logits=logits, name=name)
+
+    # Restore squeezed dimension, if necessary, so loss matches weights shape.
+    if is_squeezed_labels:
+      loss = array_ops.expand_dims(loss, axis=(1,))
+
     return _compute_weighted_loss(loss, weights)
 
 
@@ -1629,12 +1646,27 @@ class _MultiHead(Head):
 
 
 def _weight_tensor(features, weight_column_name):
-  """Returns weights as 1d `Tensor`."""
+  """Returns weights as `Tensor` of rank 0, or at least 2."""
   if not weight_column_name:
     return None
-  with ops.name_scope(None, "weight_tensor",
-                      tuple(six.itervalues(features))):
-    return math_ops.to_float(features[weight_column_name])
+  if weight_column_name not in features:
+    raise ValueError("Weights {} missing from features.".format(
+        weight_column_name))
+  with ops.name_scope(None, "weight_tensor", tuple(six.itervalues(features))):
+    weight_tensor = math_ops.to_float(features[weight_column_name])
+    shape = weight_tensor.get_shape()
+    rank = shape.ndims
+    # We don't bother with expanding dims of non-staticly shaped tensors or
+    # scalars, and >1d is already in a good format.
+    if rank == 1:
+      logging.warning(
+          "Weights {} has shape {}, expanding to make it 2d.",
+          weight_column_name, shape)
+      return (
+          sparse_ops.sparse_reshape(weight_tensor, (-1, 1))
+          if isinstance(weight_tensor, sparse_tensor.SparseTensor) else
+          array_ops.reshape(weight_tensor, (-1, 1)))
+    return weight_tensor
 
 
 # TODO(zakaria): This function is needed for backward compatibility and should
diff --git a/tensorflow/contrib/learn/python/learn/estimators/head_test.py b/tensorflow/contrib/learn/python/learn/estimators/head_test.py
index e81b15a1725790d27e7b2f9463c6327e876e272e..012b919d63147f2472ff6a4fc03f0dee7a60968a 100644
--- a/tensorflow/contrib/learn/python/learn/estimators/head_test.py
+++ b/tensorflow/contrib/learn/python/learn/estimators/head_test.py
@@ -32,7 +32,7 @@ from tensorflow.core.framework import summary_pb2
 from tensorflow.python.client import session
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import sparse_tensor
-from tensorflow.python.ops import data_flow_ops
+from tensorflow.python.ops import lookup_ops
 from tensorflow.python.ops import variables
 from tensorflow.python.ops.losses import losses as losses_lib
 from tensorflow.python.platform import test
@@ -123,7 +123,7 @@ class PoissonHeadTest(test.TestCase):
           train_op_fn=head_lib.no_op_train_fn,
           logits=logits)
       self._assert_output_alternatives(model_fn_ops)
-      _assert_summary_tags(self, ["regression_head/loss"])
+      _assert_summary_tags(self, ["loss"])
       _assert_no_variables(self)
       loss = self._log_poisson_loss(logits, labels)
       _assert_metrics(self, loss, {"loss": loss}, model_fn_ops)
@@ -149,7 +149,7 @@ class RegressionHeadTest(test.TestCase):
           train_op_fn=head_lib.no_op_train_fn,
           logits=((1.,), (1.,), (3.,)))
       self._assert_output_alternatives(model_fn_ops)
-      _assert_summary_tags(self, ["regression_head/loss"])
+      _assert_summary_tags(self, ["loss"])
       _assert_no_variables(self)
       _assert_metrics(self, 5. / 3, {"loss": 5. / 3}, model_fn_ops)
 
@@ -179,7 +179,7 @@ class RegressionHeadTest(test.TestCase):
       _assert_variables(
           self, expected_global=w, expected_model=w, expected_trainable=w)
       variables.global_variables_initializer().run()
-      _assert_summary_tags(self, ["regression_head/loss"])
+      _assert_summary_tags(self, ["loss"])
       _assert_metrics(self, 2. / 3, {"loss": 2. / 3}, model_fn_ops)
 
   def testRegressionWithLogitsAndLogitsInput(self):
@@ -207,7 +207,7 @@ class RegressionHeadTest(test.TestCase):
       self._assert_output_alternatives(model_fn_ops)
       self.assertIsNone(model_fn_ops.train_op)
       _assert_no_variables(self)
-      _assert_summary_tags(self, ["regression_head/loss"])
+      _assert_summary_tags(self, ["loss"])
       _assert_metrics(self, 5. / 3, {"loss": 5. / 3}, model_fn_ops)
 
   def testRegressionWithLabelName(self):
@@ -222,23 +222,59 @@ class RegressionHeadTest(test.TestCase):
           logits=((1.,), (1.,), (3.,)))
       self._assert_output_alternatives(model_fn_ops)
       _assert_no_variables(self)
-      _assert_summary_tags(self, ["regression_head/loss"])
+      _assert_summary_tags(self, ["loss"])
       _assert_metrics(self, 5. / 3, {"loss": 5. / 3}, model_fn_ops)
 
-  def testRegressionWithWeights(self):
+  def testRegressionWithScalarWeights(self):
+    head = head_lib.regression_head(weight_column_name="label_weight")
+    with ops.Graph().as_default(), session.Session():
+      weights = 2.
+      labels = ((0.,), (1.,), (1.,))
+      model_fn_ops = head.create_model_fn_ops(
+          features={"label_weight": weights},
+          labels=labels,
+          mode=model_fn.ModeKeys.TRAIN,
+          train_op_fn=head_lib.no_op_train_fn,
+          logits=((1.,), (1.,), (3.,)))
+      self._assert_output_alternatives(model_fn_ops)
+      _assert_no_variables(self)
+      _assert_summary_tags(self, ["loss"])
+      _assert_metrics(self, (weights * 5.) / len(labels), {
+          "loss": (weights * 5.) / (weights * len(labels))
+      }, model_fn_ops)
+
+  def testRegressionWith1DWeights(self):
+    head = head_lib.regression_head(weight_column_name="label_weight")
+    with ops.Graph().as_default(), session.Session():
+      weights = (2., 5., 0.)
+      labels = ((0.,), (1.,), (1.,))
+      model_fn_ops = head.create_model_fn_ops(
+          features={"label_weight": weights},
+          labels=labels,
+          mode=model_fn.ModeKeys.TRAIN,
+          train_op_fn=head_lib.no_op_train_fn,
+          logits=((1.,), (1.,), (3.,)))
+      self._assert_output_alternatives(model_fn_ops)
+      _assert_no_variables(self)
+      _assert_summary_tags(self, ["loss"])
+      _assert_metrics(self, 2. / len(labels), {"loss": 2. / np.sum(weights)},
+                      model_fn_ops)
+
+  def testRegressionWith2DWeights(self):
     head = head_lib.regression_head(weight_column_name="label_weight")
     with ops.Graph().as_default(), session.Session():
       weights = ((2.,), (5.,), (0.,))
+      labels = ((0.,), (1.,), (1.,))
       model_fn_ops = head.create_model_fn_ops(
           features={"label_weight": weights},
-          labels=((0.,), (1.,), (1.,)),
+          labels=labels,
           mode=model_fn.ModeKeys.TRAIN,
           train_op_fn=head_lib.no_op_train_fn,
           logits=((1.,), (1.,), (3.,)))
       self._assert_output_alternatives(model_fn_ops)
       _assert_no_variables(self)
-      _assert_summary_tags(self, ["regression_head/loss"])
-      _assert_metrics(self, 2. / len(weights), {"loss": 2. / np.sum(weights)},
+      _assert_summary_tags(self, ["loss"])
+      _assert_metrics(self, 2. / len(labels), {"loss": 2. / np.sum(weights)},
                       model_fn_ops)
 
   def testRegressionWithCenteredBias(self):
@@ -260,7 +296,7 @@ class RegressionHeadTest(test.TestCase):
           expected_trainable=("regression_head/centered_bias_weight:0",))
       variables.global_variables_initializer().run()
       _assert_summary_tags(self, [
-          "regression_head/loss",
+          "loss",
           "regression_head/centered_bias/bias_0"
       ])
       _assert_metrics(self, 5. / 3, {"loss": 5. / 3}, model_fn_ops)
@@ -331,7 +367,7 @@ class MultiLabelHeadTest(test.TestCase):
           logits=self._logits)
       self._assert_output_alternatives(model_fn_ops)
       _assert_no_variables(self)
-      _assert_summary_tags(self, ["multi_label_head/loss"])
+      _assert_summary_tags(self, ["loss"])
       expected_loss = .89985204
       _assert_metrics(self, expected_loss,
                       self._expected_eval_metrics(expected_loss), model_fn_ops)
@@ -348,7 +384,7 @@ class MultiLabelHeadTest(test.TestCase):
           train_op_fn=head_lib.no_op_train_fn, logits=logits)
       self._assert_output_alternatives(model_fn_ops)
       _assert_no_variables(self)
-      _assert_summary_tags(self, ["multi_label_head/loss"])
+      _assert_summary_tags(self, ["loss"])
       expected_loss = 1.00320443
       _assert_metrics(self, expected_loss, {
           "accuracy": 0.,
@@ -388,7 +424,7 @@ class MultiLabelHeadTest(test.TestCase):
       _assert_variables(
           self, expected_global=w, expected_model=w, expected_trainable=w)
       variables.global_variables_initializer().run()
-      _assert_summary_tags(self, ["multi_label_head/loss"])
+      _assert_summary_tags(self, ["loss"])
       expected_loss = .69314718
       _assert_metrics(self, expected_loss, {
           "accuracy": 2. / 3,
@@ -433,7 +469,7 @@ class MultiLabelHeadTest(test.TestCase):
       self._assert_output_alternatives(model_fn_ops)
       self.assertIsNone(model_fn_ops.train_op)
       _assert_no_variables(self)
-      _assert_summary_tags(self, ["multi_label_head/loss"])
+      _assert_summary_tags(self, ["loss"])
       expected_loss = .89985204
       _assert_metrics(self, expected_loss,
                       self._expected_eval_metrics(expected_loss), model_fn_ops)
@@ -452,7 +488,7 @@ class MultiLabelHeadTest(test.TestCase):
       self._assert_output_alternatives(model_fn_ops)
       self.assertIsNone(model_fn_ops.train_op)
       _assert_no_variables(self)
-      _assert_summary_tags(self, ["multi_label_head/loss"])
+      _assert_summary_tags(self, ["loss"])
       expected_loss = 1.377779
       expected_eval_metrics = {
           "accuracy": 1. / 3,
@@ -520,12 +556,12 @@ class MultiLabelHeadTest(test.TestCase):
           head_lib.no_op_train_fn, logits=self._logits)
       self._assert_output_alternatives(model_fn_ops)
       _assert_no_variables(self)
-      _assert_summary_tags(self, ["multi_label_head/loss"])
+      _assert_summary_tags(self, ["loss"])
       expected_loss = .89985204
       _assert_metrics(self, expected_loss,
                       self._expected_eval_metrics(expected_loss), model_fn_ops)
 
-  def testMultiLabelWithWeight(self):
+  def testMultiLabelWithScalarWeight(self):
     n_classes = 3
     head = head_lib.multi_label_head(
         n_classes=n_classes,
@@ -540,11 +576,27 @@ class MultiLabelHeadTest(test.TestCase):
           logits=self._logits)
       self._assert_output_alternatives(model_fn_ops)
       _assert_no_variables(self)
-      _assert_summary_tags(self, ["multi_label_head/loss"])
+      _assert_summary_tags(self, ["loss"])
       _assert_metrics(self, .089985214,
                       self._expected_eval_metrics(.89985214), model_fn_ops)
 
-  def testMultiLabelWithMultiDimensionalWeight(self):
+  def testMultiLabelWith1DWeight(self):
+    n_classes = 3
+    head = head_lib.multi_label_head(
+        n_classes=n_classes,
+        weight_column_name="label_weight",
+        metric_class_ids=range(n_classes))
+    with ops.Graph().as_default(), session.Session():
+      with self.assertRaisesRegexp(
+          ValueError, "weights can not be broadcast to values"):
+        head.create_model_fn_ops(
+            features={"label_weight": (.1, .1, .1)},
+            labels=self._labels,
+            mode=model_fn.ModeKeys.TRAIN,
+            train_op_fn=head_lib.no_op_train_fn,
+            logits=self._logits)
+
+  def testMultiLabelWith2DWeight(self):
     n_classes = 3
     head = head_lib.multi_label_head(
         n_classes=n_classes,
@@ -559,7 +611,7 @@ class MultiLabelHeadTest(test.TestCase):
           logits=self._logits)
       self._assert_output_alternatives(model_fn_ops)
       _assert_no_variables(self)
-      _assert_summary_tags(self, ["multi_label_head/loss"])
+      _assert_summary_tags(self, ["loss"])
       _assert_metrics(self, .089985214,
                       self._expected_eval_metrics(.89985214), model_fn_ops)
 
@@ -579,7 +631,7 @@ class MultiLabelHeadTest(test.TestCase):
           logits=self._logits)
       self._assert_output_alternatives(model_fn_ops)
       _assert_no_variables(self)
-      _assert_summary_tags(self, ["multi_label_head/loss"])
+      _assert_summary_tags(self, ["loss"])
       expected_loss = .089985214
       _assert_metrics(self, expected_loss,
                       self._expected_eval_metrics(expected_loss), model_fn_ops)
@@ -604,7 +656,7 @@ class MultiLabelHeadTest(test.TestCase):
           expected_trainable=("multi_label_head/centered_bias_weight:0",))
       variables.global_variables_initializer().run()
       _assert_summary_tags(self, (
-          "multi_label_head/loss",
+          "loss",
           "multi_label_head/centered_bias/bias_0",
           "multi_label_head/centered_bias/bias_1",
           "multi_label_head/centered_bias/bias_2"
@@ -629,7 +681,7 @@ class MultiLabelHeadTest(test.TestCase):
           train_op_fn=head_lib.no_op_train_fn,
           logits=self._logits)
       _assert_no_variables(self)
-      _assert_summary_tags(self, ["multi_label_head/loss"])
+      _assert_summary_tags(self, ["loss"])
       expected_loss = .89985204
       _assert_metrics(self, expected_loss,
                       self._expected_eval_metrics(expected_loss), model_fn_ops)
@@ -695,7 +747,7 @@ class BinaryClassificationHeadTest(test.TestCase):
           logits=self._logits)
       self._assert_output_alternatives(model_fn_ops)
       _assert_no_variables(self)
-      _assert_summary_tags(self, ["binary_logistic_head/loss"])
+      _assert_summary_tags(self, ["loss"])
       expected_loss = .81326175
       _assert_metrics(self, expected_loss,
                       self._expected_eval_metrics(expected_loss), model_fn_ops)
@@ -723,7 +775,7 @@ class BinaryClassificationHeadTest(test.TestCase):
       _assert_variables(
           self, expected_global=w, expected_model=w, expected_trainable=w)
       variables.global_variables_initializer().run()
-      _assert_summary_tags(self, ["binary_logistic_head/loss"])
+      _assert_summary_tags(self, ["loss"])
       expected_loss = .69314718
       label_mean = np.mean(self._labels)
       _assert_metrics(self, expected_loss, {
@@ -759,7 +811,7 @@ class BinaryClassificationHeadTest(test.TestCase):
       self._assert_output_alternatives(model_fn_ops)
       self.assertIsNone(model_fn_ops.train_op)
       _assert_no_variables(self)
-      _assert_summary_tags(self, ["binary_logistic_head/loss"])
+      _assert_summary_tags(self, ["loss"])
       expected_loss = .81326175
       _assert_metrics(self, expected_loss,
                       self._expected_eval_metrics(expected_loss), model_fn_ops)
@@ -791,7 +843,7 @@ class BinaryClassificationHeadTest(test.TestCase):
             [b"0", b"1"], predicted_classes[0])
         self.assertIn("probabilities", six.iterkeys(predictions_for_serving))
 
-  def testBinaryClassificationInferMode_withWightColumn(self):
+  def testBinaryClassificationInferMode_withWeightColumn(self):
     n_classes = 2
     head = head_lib.multi_class_head(n_classes=n_classes,
                                      weight_column_name="label_weight")
@@ -838,12 +890,47 @@ class BinaryClassificationHeadTest(test.TestCase):
           logits=self._logits)
       self._assert_output_alternatives(model_fn_ops)
       _assert_no_variables(self)
-      _assert_summary_tags(self, ["binary_logistic_head/loss"])
+      _assert_summary_tags(self, ["loss"])
       expected_loss = .81326175
       _assert_metrics(self, expected_loss,
                       self._expected_eval_metrics(expected_loss), model_fn_ops)
 
-  def testBinaryClassificationWithWeights(self):
+  def testBinaryClassificationWith1DWeights(self):
+    n_classes = 2
+    head = head_lib.multi_class_head(
+        n_classes=n_classes, weight_column_name="label_weight")
+    with ops.Graph().as_default(), session.Session():
+      weights = (1., 0.)
+      # logloss: z:label, x:logit
+      # z * -log(sigmoid(x)) + (1 - z) * -log(1 - sigmoid(x))
+      model_fn_ops = head.create_model_fn_ops(
+          features={"label_weight": weights},
+          labels=self._labels,
+          mode=model_fn.ModeKeys.TRAIN,
+          train_op_fn=head_lib.no_op_train_fn,
+          logits=self._logits)
+      self._assert_output_alternatives(model_fn_ops)
+      _assert_no_variables(self)
+      _assert_summary_tags(self, ["loss"])
+      expected_total_loss = .31326166
+      _assert_metrics(
+          self,
+          expected_total_loss / len(weights),
+          {
+              "accuracy": 1. / 1,
+              "accuracy/baseline_label_mean": 1. / 1,
+              "accuracy/threshold_0.500000_mean": 1. / 1,
+              "auc": 0. / 1,
+              "labels/actual_label_mean": 1. / 1,
+              "labels/prediction_mean": .731059,  # softmax
+              # eval loss is weighted loss divided by sum of weights.
+              "loss": expected_total_loss,
+              "precision/positive_threshold_0.500000_mean": 1. / 1,
+              "recall/positive_threshold_0.500000_mean": 1. / 1,
+          },
+          model_fn_ops)
+
+  def testBinaryClassificationWith2DWeights(self):
     n_classes = 2
     head = head_lib.multi_class_head(
         n_classes=n_classes, weight_column_name="label_weight")
@@ -859,7 +946,7 @@ class BinaryClassificationHeadTest(test.TestCase):
           logits=self._logits)
       self._assert_output_alternatives(model_fn_ops)
       _assert_no_variables(self)
-      _assert_summary_tags(self, ["binary_logistic_head/loss"])
+      _assert_summary_tags(self, ["loss"])
       expected_total_loss = .31326166
       _assert_metrics(
           self,
@@ -892,10 +979,10 @@ class BinaryClassificationHeadTest(test.TestCase):
           logits=self._logits)
       self._assert_output_alternatives(model_fn_ops)
       _assert_no_variables(self)
-      _assert_summary_tags(self, ["binary_logistic_head/loss"])
+      _assert_summary_tags(self, ["loss"])
       # logloss: z:label, x:logit
       # z * -log(sigmoid(x)) + (1 - z) * -log(1 - sigmoid(x))
-      # expected_loss is (total_weighted_loss)/1 since htere is 1 nonzero
+      # expected_loss is (total_weighted_loss)/1 since there is 1 nonzero
       # weight.
       expected_loss = 0.062652342
       _assert_metrics(
@@ -932,7 +1019,7 @@ class BinaryClassificationHeadTest(test.TestCase):
           expected_trainable=("binary_logistic_head/centered_bias_weight:0",))
       variables.global_variables_initializer().run()
       _assert_summary_tags(self, [
-          "binary_logistic_head/loss",
+          "loss",
           "binary_logistic_head/centered_bias/bias_0"
       ])
       expected_loss = .81326175
@@ -951,7 +1038,7 @@ class MultiClassHeadTest(test.TestCase):
 
   def setUp(self):
     self._logits = ((1., 0., 0.),)
-    self._labels = (2,)
+    self._labels = ((2,),)
 
   def _expected_eval_metrics(self, expected_loss):
     return {
@@ -983,7 +1070,7 @@ class MultiClassHeadTest(test.TestCase):
           logits=self._logits)
       self._assert_output_alternatives(model_fn_ops)
       _assert_no_variables(self)
-      _assert_summary_tags(self, ["multi_class_head/loss"])
+      _assert_summary_tags(self, ["loss"])
       expected_loss = 1.5514447
       _assert_metrics(self, expected_loss,
                       self._expected_eval_metrics(expected_loss), model_fn_ops)
@@ -1022,7 +1109,7 @@ class MultiClassHeadTest(test.TestCase):
       _assert_variables(
           self, expected_global=w, expected_model=w, expected_trainable=w)
       variables.global_variables_initializer().run()
-      _assert_summary_tags(self, ["multi_class_head/loss"])
+      _assert_summary_tags(self, ["loss"])
       expected_loss = 1.0986123
       _assert_metrics(self, expected_loss, {
           "accuracy": 0.,
@@ -1073,7 +1160,7 @@ class MultiClassHeadTest(test.TestCase):
           expected_trainable=("multi_class_head/centered_bias_weight:0",))
       variables.global_variables_initializer().run()
       _assert_summary_tags(self,
-                           ["multi_class_head/loss",
+                           ["loss",
                             "multi_class_head/centered_bias/bias_0",
                             "multi_class_head/centered_bias/bias_1",
                             "multi_class_head/centered_bias/bias_2"])
@@ -1091,7 +1178,7 @@ class MultiClassHeadTest(test.TestCase):
       self._assert_output_alternatives(model_fn_ops)
       self.assertIsNone(model_fn_ops.train_op)
       _assert_no_variables(self)
-      _assert_summary_tags(self, ["multi_class_head/loss"])
+      _assert_summary_tags(self, ["loss"])
       expected_loss = 1.5514447
       _assert_metrics(self, expected_loss,
                       self._expected_eval_metrics(expected_loss), model_fn_ops)
@@ -1110,7 +1197,7 @@ class MultiClassHeadTest(test.TestCase):
       self._assert_output_alternatives(model_fn_ops)
       self.assertIsNone(model_fn_ops.train_op)
       _assert_no_variables(self)
-      _assert_summary_tags(self, ["multi_class_head/loss"])
+      _assert_summary_tags(self, ["loss"])
       expected_loss = 3.1698461
       expected_eval_metrics = {
           "accuracy": 0.,
@@ -1131,7 +1218,7 @@ class MultiClassHeadTest(test.TestCase):
       _assert_metrics(self, expected_loss,
                       expected_eval_metrics, model_fn_ops)
 
-  def testMultiClassWithWeight(self):
+  def testMultiClassWithScalarWeight(self):
     n_classes = 3
     head = head_lib.multi_class_head(
         n_classes=n_classes,
@@ -1149,7 +1236,55 @@ class MultiClassHeadTest(test.TestCase):
           logits=self._logits)
       self._assert_output_alternatives(model_fn_ops)
       _assert_no_variables(self)
-      _assert_summary_tags(self, ["multi_class_head/loss"])
+      _assert_summary_tags(self, ["loss"])
+      expected_loss = 1.5514447
+      _assert_metrics(self, expected_loss * weight,
+                      self._expected_eval_metrics(expected_loss), model_fn_ops)
+
+  def testMultiClassWith1DWeight(self):
+    n_classes = 3
+    head = head_lib.multi_class_head(
+        n_classes=n_classes,
+        weight_column_name="label_weight",
+        metric_class_ids=range(n_classes))
+    with ops.Graph().as_default(), session.Session():
+      weight = .1
+      weights = (weight,)
+      # logloss: z:label, x:logit
+      # z * -log(sigmoid(x)) + (1 - z) * -log(1 - sigmoid(x))
+      model_fn_ops = head.create_model_fn_ops(
+          features={"label_weight": weights},
+          labels=self._labels,
+          mode=model_fn.ModeKeys.TRAIN,
+          train_op_fn=head_lib.no_op_train_fn,
+          logits=self._logits)
+      self._assert_output_alternatives(model_fn_ops)
+      _assert_no_variables(self)
+      _assert_summary_tags(self, ["loss"])
+      expected_loss = 1.5514447
+      _assert_metrics(self, expected_loss * weight,
+                      self._expected_eval_metrics(expected_loss), model_fn_ops)
+
+  def testMultiClassWith2DWeight(self):
+    n_classes = 3
+    head = head_lib.multi_class_head(
+        n_classes=n_classes,
+        weight_column_name="label_weight",
+        metric_class_ids=range(n_classes))
+    with ops.Graph().as_default(), session.Session():
+      weight = .1
+      weights = ((weight,),)
+      # logloss: z:label, x:logit
+      # z * -log(sigmoid(x)) + (1 - z) * -log(1 - sigmoid(x))
+      model_fn_ops = head.create_model_fn_ops(
+          features={"label_weight": weights},
+          labels=self._labels,
+          mode=model_fn.ModeKeys.TRAIN,
+          train_op_fn=head_lib.no_op_train_fn,
+          logits=self._logits)
+      self._assert_output_alternatives(model_fn_ops)
+      _assert_no_variables(self)
+      _assert_summary_tags(self, ["loss"])
       expected_loss = 1.5514447
       _assert_metrics(self, expected_loss * weight,
                       self._expected_eval_metrics(expected_loss), model_fn_ops)
@@ -1173,7 +1308,7 @@ class MultiClassHeadTest(test.TestCase):
           logits=self._logits)
       self._assert_output_alternatives(model_fn_ops)
       _assert_no_variables(self)
-      _assert_summary_tags(self, ["multi_class_head/loss"])
+      _assert_summary_tags(self, ["loss"])
       expected_loss = 1.5514447 * weight
       _assert_metrics(self, expected_loss,
                       self._expected_eval_metrics(expected_loss), model_fn_ops)
@@ -1190,7 +1325,7 @@ class MultiClassHeadTest(test.TestCase):
           train_op_fn=head_lib.no_op_train_fn,
           logits=((1., 0., 0.), (0., 0., 1.),))
       with session.Session():
-        data_flow_ops.tables_initializer().run()
+        lookup_ops.tables_initializer().run()
         self.assertAllEqual(
             [0, 2],
             model_fn_ops.predictions["classes"].eval())
@@ -1242,7 +1377,7 @@ class MultiClassHeadTest(test.TestCase):
           train_op_fn=head_lib.no_op_train_fn,
           logits=((1., 0., 0.), (0., 0., 1.),))
       with session.Session():
-        data_flow_ops.tables_initializer().run()
+        lookup_ops.tables_initializer().run()
         self.assertAllEqual(
             [b"key0", b"key2"],
             model_fn_ops.predictions["classes"].eval())
@@ -1277,10 +1412,10 @@ class MultiClassHeadTest(test.TestCase):
           train_op_fn=head_lib.no_op_train_fn,
           logits=((1., 0., 0.),))
       with session.Session():
-        data_flow_ops.tables_initializer().run()
+        lookup_ops.tables_initializer().run()
         self.assertIsNone(model_fn_ops.train_op)
         _assert_no_variables(self)
-        _assert_summary_tags(self, ["multi_class_head/loss"])
+        _assert_summary_tags(self, ["loss"])
         expected_loss = 1.5514447
         expected_eval_metrics = {
             "accuracy": 0.,
@@ -1303,10 +1438,10 @@ class MultiClassHeadTest(test.TestCase):
           train_op_fn=head_lib.no_op_train_fn,
           logits=((0., 0., 1.),))
       with session.Session():
-        data_flow_ops.tables_initializer().run()
+        lookup_ops.tables_initializer().run()
         self.assertIsNone(model_fn_ops.train_op)
         _assert_no_variables(self)
-        _assert_summary_tags(self, ["multi_class_head/loss"])
+        _assert_summary_tags(self, ["loss"])
         expected_loss = 0.5514447
         expected_eval_metrics = {
             "accuracy": 1.,
@@ -1345,7 +1480,7 @@ class BinarySvmHeadTest(test.TestCase):
           logits=self._predictions)
       self._assert_output_alternatives(model_fn_ops)
       _assert_no_variables(self)
-      _assert_summary_tags(self, ["binary_svm_head/loss"])
+      _assert_summary_tags(self, ["loss"])
       expected_loss = np.average(self._expected_losses)
       _assert_metrics(self, expected_loss, {
           "accuracy": 1.,
@@ -1375,7 +1510,7 @@ class BinarySvmHeadTest(test.TestCase):
       _assert_variables(
           self, expected_global=w, expected_model=w, expected_trainable=w)
       variables.global_variables_initializer().run()
-      _assert_summary_tags(self, ["binary_svm_head/loss"])
+      _assert_summary_tags(self, ["loss"])
       expected_loss = 1.
       _assert_metrics(self, expected_loss, {
           "accuracy": .5,
@@ -1407,7 +1542,7 @@ class BinarySvmHeadTest(test.TestCase):
       self._assert_output_alternatives(model_fn_ops)
       self.assertIsNone(model_fn_ops.train_op)
       _assert_no_variables(self)
-      _assert_summary_tags(self, ["binary_svm_head/loss"])
+      _assert_summary_tags(self, ["loss"])
       expected_loss = np.average(self._expected_losses)
       _assert_metrics(self, expected_loss, {
           "accuracy": 1.,
@@ -1426,14 +1561,34 @@ class BinarySvmHeadTest(test.TestCase):
           logits=self._predictions)
       self._assert_output_alternatives(model_fn_ops)
       _assert_no_variables(self)
-      _assert_summary_tags(self, ["binary_svm_head/loss"])
+      _assert_summary_tags(self, ["loss"])
       expected_loss = np.average(self._expected_losses)
       _assert_metrics(self, expected_loss, {
           "accuracy": 1.,
           "loss": expected_loss,
       }, model_fn_ops)
 
-  def testBinarySVMWithWeights(self):
+  def testBinarySVMWith1DWeights(self):
+    head = head_lib.binary_svm_head(weight_column_name="weights")
+    with ops.Graph().as_default(), session.Session():
+      weights = (7., 11.)
+      model_fn_ops = head.create_model_fn_ops(
+          # We have to add an extra dim here for weights broadcasting to work.
+          features={"weights": weights},
+          mode=model_fn.ModeKeys.TRAIN,
+          labels=self._labels,
+          train_op_fn=head_lib.no_op_train_fn,
+          logits=self._predictions)
+      self._assert_output_alternatives(model_fn_ops)
+      _assert_no_variables(self)
+      _assert_summary_tags(self, ["loss"])
+      expected_weighted_losses = np.multiply(weights, self._expected_losses)
+      _assert_metrics(self, np.mean(expected_weighted_losses), {
+          "accuracy": 1.,
+          "loss": np.sum(expected_weighted_losses) / np.sum(weights),
+      }, model_fn_ops)
+
+  def testBinarySVMWith2DWeights(self):
     head = head_lib.binary_svm_head(weight_column_name="weights")
     with ops.Graph().as_default(), session.Session():
       weights = (7., 11.)
@@ -1446,7 +1601,7 @@ class BinarySvmHeadTest(test.TestCase):
           logits=self._predictions)
       self._assert_output_alternatives(model_fn_ops)
       _assert_no_variables(self)
-      _assert_summary_tags(self, ["binary_svm_head/loss"])
+      _assert_summary_tags(self, ["loss"])
       expected_weighted_losses = np.multiply(weights, self._expected_losses)
       _assert_metrics(self, np.mean(expected_weighted_losses), {
           "accuracy": 1.,
@@ -1473,7 +1628,7 @@ class BinarySvmHeadTest(test.TestCase):
           expected_trainable=("binary_svm_head/centered_bias_weight:0",))
       variables.global_variables_initializer().run()
       _assert_summary_tags(self, [
-          "binary_svm_head/loss",
+          "loss",
           "binary_svm_head/centered_bias/bias_0"
       ])
       expected_loss = np.average(self._expected_losses)
diff --git a/tensorflow/contrib/learn/python/learn/estimators/kmeans.py b/tensorflow/contrib/learn/python/learn/estimators/kmeans.py
index cbbd9671b798e3f3d3ab82946a2bc2f2d31f5def..a473cf46d59e25e5d20e4da271a92f8249003782 100644
--- a/tensorflow/contrib/learn/python/learn/estimators/kmeans.py
+++ b/tensorflow/contrib/learn/python/learn/estimators/kmeans.py
@@ -32,6 +32,7 @@ from tensorflow.python.ops import state_ops
 from tensorflow.python.summary import summary
 from tensorflow.python.ops.control_flow_ops import with_dependencies
 from tensorflow.python.platform import tf_logging as logging
+from tensorflow.python.summary import summary
 from tensorflow.python.training import session_run_hook
 from tensorflow.python.training.session_run_hook import SessionRunArgs
 
diff --git a/tensorflow/contrib/learn/python/learn/estimators/linear.py b/tensorflow/contrib/learn/python/learn/estimators/linear.py
index d1b4aedb81e0565cf4c8bbc85cd0baaac647f446..8a595a79016281b39a4f0f4d36083a1033085198 100644
--- a/tensorflow/contrib/learn/python/learn/estimators/linear.py
+++ b/tensorflow/contrib/learn/python/learn/estimators/linear.py
@@ -27,11 +27,13 @@ from tensorflow.contrib import layers
 from tensorflow.contrib.framework import deprecated
 from tensorflow.contrib.framework import deprecated_arg_values
 from tensorflow.contrib.framework.python.ops import variables as contrib_variables
+from tensorflow.contrib.layers.python.layers import feature_column
 from tensorflow.contrib.learn.python.learn.estimators import estimator
 from tensorflow.contrib.learn.python.learn.estimators import head as head_lib
 from tensorflow.contrib.learn.python.learn.estimators import prediction_key
 from tensorflow.contrib.learn.python.learn.utils import export
 from tensorflow.contrib.linear_optimizer.python import sdca_optimizer
+from tensorflow.python.feature_column import feature_column as fc_core
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import sparse_tensor
@@ -148,17 +150,24 @@ def _linear_model_fn(features, labels, mode, params, config=None):
       parent_scope,
       values=tuple(six.itervalues(features)),
       partitioner=partitioner) as scope:
-    if joint_weights:
-      layer_fn = layers.joint_weighted_sum_from_feature_columns
+    if all([isinstance(fc, feature_column._FeatureColumn)  # pylint: disable=protected-access
+            for fc in feature_columns]):
+      if joint_weights:
+        layer_fn = layers.joint_weighted_sum_from_feature_columns
+      else:
+        layer_fn = layers.weighted_sum_from_feature_columns
+      logits, _, _ = layer_fn(
+          columns_to_tensors=features,
+          feature_columns=feature_columns,
+          num_outputs=head.logits_dimension,
+          weight_collections=[parent_scope],
+          scope=scope)
     else:
-      layer_fn = layers.weighted_sum_from_feature_columns
-        
-    logits, _, _ = layer_fn(
-            columns_to_tensors=features,
-            feature_columns=feature_columns,
-            num_outputs=head.logits_dimension,
-            weight_collections=[parent_scope],
-            scope=scope)
+      logits = fc_core.linear_model(
+          features=features,
+          feature_columns=feature_columns,
+          units=head.logits_dimension,
+          weight_collections=[parent_scope])
 
     def _train_op_fn(loss):
       global_step = contrib_variables.get_global_step()
@@ -333,9 +342,34 @@ class LinearClassifier(estimator.Estimator):
     ...
   def input_fn_eval: # returns x, y (where y represents label's class index).
     ...
+  def input_fn_predict: # returns x, None.
+    ...
   estimator.fit(input_fn=input_fn_train)
   estimator.evaluate(input_fn=input_fn_eval)
-  estimator.predict(x=x) # returns predicted labels (i.e. label's class index).
+  # predict_classes returns class indices.
+  estimator.predict_classes(input_fn=input_fn_predict)
+  ```
+
+  If the user specifies `label_keys` in constructor, labels must be strings from
+  the `label_keys` vocabulary. Example:
+
+  ```python
+  label_keys = ['label0', 'label1', 'label2']
+  estimator = LinearClassifier(
+      n_classes=n_classes,
+      feature_columns=[sparse_column_a, sparse_feature_a_x_sparse_feature_b],
+      label_keys=label_keys)
+
+  def input_fn_train: # returns x, y (where y is one of label_keys).
+    pass
+  estimator.fit(input_fn=input_fn_train)
+
+  def input_fn_eval: # returns x, y (where y is one of label_keys).
+    pass
+  estimator.evaluate(input_fn=input_fn_eval)
+  def input_fn_predict: # returns x, None
+  # predict_classes returns one of label_keys.
+  estimator.predict_classes(input_fn=input_fn_predict)
   ```
 
   Input of `fit` and `evaluate` should have following features,
@@ -363,7 +397,8 @@ class LinearClassifier(estimator.Estimator):
                enable_centered_bias=False,
                _joint_weight=False,
                config=None,
-               feature_engineering_fn=None):
+               feature_engineering_fn=None,
+               label_keys=None):
     """Construct a `LinearClassifier` estimator object.
 
     Args:
@@ -398,6 +433,8 @@ class LinearClassifier(estimator.Estimator):
                         labels which are the output of `input_fn` and
                         returns features and labels which will be fed
                         into the model.
+      label_keys: Optional list of strings with size `[n_classes]` defining the
+        label vocabulary. Only supported for `n_classes` > 2.
 
     Returns:
       A `LinearClassifier` estimator.
@@ -419,7 +456,8 @@ class LinearClassifier(estimator.Estimator):
     head = head_lib.multi_class_head(
         n_classes,
         weight_column_name=weight_column_name,
-        enable_centered_bias=enable_centered_bias)
+        enable_centered_bias=enable_centered_bias,
+        label_keys=label_keys)
     params = {
         "head": head,
         "feature_columns": feature_columns,
diff --git a/tensorflow/contrib/learn/python/learn/estimators/linear_test.py b/tensorflow/contrib/learn/python/learn/estimators/linear_test.py
index fc6437745283f3e1ff12b8f0d7a479d68340f982..145d5c40fa2d6072ed4b01535e8da3e9f550ec94 100644
--- a/tensorflow/contrib/learn/python/learn/estimators/linear_test.py
+++ b/tensorflow/contrib/learn/python/learn/estimators/linear_test.py
@@ -37,6 +37,7 @@ from tensorflow.contrib.learn.python.learn.estimators import test_data
 from tensorflow.contrib.learn.python.learn.metric_spec import MetricSpec
 from tensorflow.contrib.linear_optimizer.python import sdca_optimizer as sdca_optimizer_lib
 from tensorflow.contrib.metrics.python.ops import metric_ops
+from tensorflow.python.feature_column import feature_column as fc_core
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import sparse_tensor
@@ -172,6 +173,49 @@ class LinearClassifierTest(test.TestCase):
     scores = classifier.evaluate(x=train_x, y=train_y, steps=1)
     self.assertGreater(scores['accuracy'], 0.9)
 
+  def testMultiClassLabelKeys(self):
+    """Tests n_classes > 2 with label_keys vocabulary for labels."""
+    # Byte literals needed for python3 test to pass.
+    label_keys = [b'label0', b'label1', b'label2']
+
+    def _input_fn(num_epochs=None):
+      features = {
+          'language':
+              sparse_tensor.SparseTensor(
+                  values=input_lib.limit_epochs(
+                      ['en', 'fr', 'zh'], num_epochs=num_epochs),
+                  indices=[[0, 0], [0, 1], [2, 0]],
+                  dense_shape=[3, 2])
+      }
+      labels = constant_op.constant(
+          [[label_keys[1]], [label_keys[0]], [label_keys[0]]],
+          dtype=dtypes.string)
+      return features, labels
+
+    language_column = feature_column_lib.sparse_column_with_hash_bucket(
+        'language', hash_bucket_size=20)
+
+    classifier = linear.LinearClassifier(
+        n_classes=3,
+        feature_columns=[language_column],
+        label_keys=label_keys)
+
+    classifier.fit(input_fn=_input_fn, steps=50)
+
+    scores = classifier.evaluate(input_fn=_input_fn, steps=1)
+    self.assertGreater(scores['accuracy'], 0.9)
+    self.assertIn('loss', scores)
+    predict_input_fn = functools.partial(_input_fn, num_epochs=1)
+    predicted_classes = list(
+        classifier.predict_classes(
+            input_fn=predict_input_fn, as_iterable=True))
+    self.assertEqual(3, len(predicted_classes))
+    for pred in predicted_classes:
+      self.assertIn(pred, label_keys)
+    predictions = list(
+        classifier.predict(input_fn=predict_input_fn, as_iterable=True))
+    self.assertAllEqual(predicted_classes, predictions)
+
   def testLogisticRegression_MatrixData(self):
     """Tests binary classification using matrix data as input."""
 
@@ -192,6 +236,32 @@ class LinearClassifierTest(test.TestCase):
     scores = classifier.evaluate(input_fn=_input_fn, steps=1)
     self.assertGreater(scores['accuracy'], 0.9)
 
+  def testEstimatorWithCoreFeatureColumns(self):
+
+    def _input_fn(num_epochs=None):
+      features = {
+          'age':
+              input_lib.limit_epochs(
+                  constant_op.constant([[.8], [0.2], [.1]]),
+                  num_epochs=num_epochs),
+          'language':
+              sparse_tensor.SparseTensor(
+                  values=input_lib.limit_epochs(
+                      ['en', 'fr', 'zh'], num_epochs=num_epochs),
+                  indices=[[0, 0], [0, 1], [2, 0]],
+                  dense_shape=[3, 2])
+      }
+      return features, constant_op.constant([[1], [0], [0]], dtype=dtypes.int32)
+
+    language_column = fc_core.categorical_column_with_hash_bucket(
+        'language', hash_bucket_size=20)
+    feature_columns = [language_column, fc_core.numeric_column('age')]
+
+    classifier = linear.LinearClassifier(feature_columns=feature_columns)
+    classifier.fit(input_fn=_input_fn, steps=100)
+    scores = classifier.evaluate(input_fn=_input_fn, steps=1)
+    self.assertGreater(scores['accuracy'], 0.9)
+
   def testLogisticRegression_MatrixData_Labels1D(self):
     """Same as the last test, but labels shape is [100] instead of [100, 1]."""
 
@@ -739,7 +809,7 @@ class LinearClassifierTest(test.TestCase):
           'example_id':
               constant_op.constant(['1', '2', '3']),
           'price':
-              constant_op.constant([[0.4], [0.6], [0.3]]),
+              constant_op.constant([0.4, 0.6, 0.3]),
           'country':
               sparse_tensor.SparseTensor(
                   values=['IT', 'US', 'GB'],
@@ -1408,7 +1478,7 @@ class LinearRegressorTest(test.TestCase):
           'example_id':
               constant_op.constant(['1', '2', '3']),
           'price':
-              constant_op.constant([[0.6], [0.8], [0.3]]),
+              constant_op.constant([0.6, 0.8, 0.3]),
           'sq_footage':
               constant_op.constant([[900.0], [700.0], [600.0]]),
           'country':
diff --git a/tensorflow/contrib/learn/python/learn/estimators/run_config.py b/tensorflow/contrib/learn/python/learn/estimators/run_config.py
index 109c8d25e12b62154f6bb9baec99e419a6380eda..7af1c541c6cd6ea506c7d8213fd5ca0f690d733d 100644
--- a/tensorflow/contrib/learn/python/learn/estimators/run_config.py
+++ b/tensorflow/contrib/learn/python/learn/estimators/run_config.py
@@ -19,7 +19,6 @@ from __future__ import division
 from __future__ import print_function
 
 import collections
-import copy
 import json
 import os
 
@@ -28,9 +27,24 @@ import six
 from tensorflow.contrib.framework.python.framework import experimental
 from tensorflow.core.protobuf import config_pb2
 from tensorflow.python.estimator import run_config as core_run_config
+from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.training import server_lib
 
 
+# A list of the property names in RunConfig user allows to change. They will
+# not affect the execution framework, so when execution framework checks the
+# `uid` of the RunConfig, it should be ingored.
+_DEFAULT_UID_WHITE_LIST = [
+    'tf_random_seed',
+    'save_summary_steps',
+    'save_checkpoints_steps',
+    'save_checkpoints_secs',
+    'session_config',
+    'keep_checkpoint_max',
+    'keep_checkpoint_every_n_hours',
+]
+
+
 class Environment(object):
   # For running general distributed training.
   CLOUD = 'cloud'
@@ -249,10 +263,12 @@ class RunConfig(ClusterConfig, core_run_config.RunConfig):
         the feature.
       evaluation_master: the master on which to perform evaluation.
       model_dir: directory where model parameters, graph etc are saved. If
-        `None`, see `Estimator` about where the model will be saved.
+        `None`, will use `model_dir` property in `TF_CONFIG` environment
+        variable. If both are set, must have same value. If both are `None`, see
+        `Estimator` about where the model will be saved.
       session_config: a ConfigProto used to set session parameters, or None.
-         Note - using this argument, it is easy to provide settings which break
-         otherwise perfectly good models. Use with care.
+        Note - using this argument, it is easy to provide settings which break
+        otherwise perfectly good models. Use with care.
     """
     super(RunConfig, self).__init__(
         master=master, evaluation_master=evaluation_master)
@@ -280,50 +296,32 @@ class RunConfig(ClusterConfig, core_run_config.RunConfig):
     # create Scaffold and Saver in their model_fn to set these.
     self._keep_checkpoint_max = keep_checkpoint_max
     self._keep_checkpoint_every_n_hours = keep_checkpoint_every_n_hours
-    self._model_dir = model_dir
-
-  def replace(self, **kwargs):
-    """Returns a new instance of `RunConfig` replacing specified properties.
-
-    Only the properties in the following list are allowed to be replaced:
-      - `model_dir`.
-
-    Args:
-      **kwargs: keyword named properties with new values.
-
-    Raises:
-      ValueError: If any property name in `kwargs` does not exist or is not
-        allowed to be replaced.
-
-    Returns:
-      a new instance of `RunConfig`.
-    """
-
-    new_copy = copy.deepcopy(self)
-
-    # TODO(b/33295821): Allow more fields to be replaced.
-    for key, new_value in six.iteritems(kwargs):
-      if key == 'model_dir':
-        new_copy._model_dir = new_value  # pylint: disable=protected-access
-        continue
-
-      raise ValueError('{} is not supported by RunConfig replace'.format(key))
-
-    return new_copy
+    self._model_dir = _get_model_dir(model_dir)
 
   @experimental
-  def uid(self):
+  def uid(self, whitelist=None):
     """Generates a 'Unique Identifier' based on all internal fields.
 
     Caller should use the uid string to check `RunConfig` instance integrity
     in one session use, but should not rely on the implementation details, which
     is subject to change.
 
+    Args:
+      whitelist: A list of the string names of the properties uid should not
+        include. If `None`, defaults to `_DEFAULT_UID_WHITE_LIST`, which
+        includes most properites user allowes to change.
+
     Returns:
       A uid string.
     """
-    # TODO(b/33295821): Allows user to specify a whitelist.
+    if whitelist is None:
+      whitelist = _DEFAULT_UID_WHITE_LIST
+
     state = {k: v for k, v in self.__dict__.items() if not k.startswith('__')}
+    # Pop out the keys in whitelist.
+    for k in whitelist:
+      state.pop('_' + k, None)
+
     ordered_state = collections.OrderedDict(
         sorted(state.items(), key=lambda t: t[0]))
     # For class instance without __repr__, some special cares are required.
@@ -412,3 +410,21 @@ def _get_master(cluster_spec, task_type, task_id):
   # For backwards compatibility, we return empty string if task_type was
   # not set (task_type did not previously exist).
   return ''
+
+
+def _get_model_dir(model_dir):
+  """Returns `model_dir` based user provided `model_dir` or `TF_CONFIG`."""
+
+  model_dir_in_tf_config = json.loads(
+      os.environ.get('TF_CONFIG') or '{}').get('model_dir', None)
+  if model_dir_in_tf_config is not None:
+    if model_dir is not None and model_dir_in_tf_config != model_dir:
+      raise ValueError(
+          '`model_dir` provided in RunConfig construct, if set, '
+          'must have the same value as the model_dir in TF_CONFIG. '
+          'model_dir: {}\nTF_CONFIG["model_dir"]: {}.\n'.format(
+              model_dir, model_dir_in_tf_config))
+
+    logging.info('Using model_dir in TF_CONFIG: %s', model_dir_in_tf_config)
+
+  return model_dir or model_dir_in_tf_config
diff --git a/tensorflow/contrib/learn/python/learn/estimators/run_config_test.py b/tensorflow/contrib/learn/python/learn/estimators/run_config_test.py
index 14cef7cc43dc051cdbfda9243cfaa9896c3d05c4..6e2a2690ae4629b29aad1e550448d8609b20a5a4 100644
--- a/tensorflow/contrib/learn/python/learn/estimators/run_config_test.py
+++ b/tensorflow/contrib/learn/python/learn/estimators/run_config_test.py
@@ -22,12 +22,14 @@ import copy
 import json
 
 from tensorflow.contrib.learn.python.learn.estimators import run_config as run_config_lib
+from tensorflow.core.protobuf import config_pb2
 from tensorflow.python.estimator import run_config as core_run_config
 from tensorflow.python.platform import test
 from tensorflow.python.training import server_lib
 
 TEST_DIR = "test_dir"
 ANOTHER_TEST_DIR = "another_test_dir"
+MASTER = "master_"
 RANDOM_SEED = 123
 
 patch = test.mock.patch
@@ -223,6 +225,27 @@ class RunConfigTest(test.TestCase):
     config = run_config_lib.RunConfig(model_dir=TEST_DIR)
     self.assertEqual(TEST_DIR, config.model_dir)
 
+  def test_model_dir_in_tf_config(self):
+    tf_config = {"model_dir": TEST_DIR}
+    with patch.dict("os.environ", {"TF_CONFIG": json.dumps(tf_config)}):
+      run_config = run_config_lib.RunConfig()
+    self.assertEqual(TEST_DIR, run_config.model_dir)
+
+  def test_model_dir_both_in_tf_config_and_constructor(self):
+    tf_config = {"model_dir": TEST_DIR}
+    with patch.dict("os.environ", {"TF_CONFIG": json.dumps(tf_config)}):
+      run_config = run_config_lib.RunConfig(model_dir=TEST_DIR)
+    self.assertEqual(TEST_DIR, run_config.model_dir)
+
+  def test_model_dir_fail_if_constructor_value_mismatch_tf_config(self):
+    tf_config = {"model_dir": TEST_DIR}
+    with patch.dict("os.environ", {"TF_CONFIG": json.dumps(tf_config)}):
+      with self.assertRaisesRegexp(
+          ValueError,
+          "`model_dir` provided in RunConfig .* must have "
+          "the same value .* in TF_CONFIG"):
+        run_config_lib.RunConfig(model_dir=TEST_DIR + "/sub_dir")
+
   def test_replace(self):
     config = run_config_lib.RunConfig(
         tf_random_seed=RANDOM_SEED, model_dir=TEST_DIR)
@@ -232,17 +255,8 @@ class RunConfigTest(test.TestCase):
     new_config = config.replace(model_dir=ANOTHER_TEST_DIR)
     self.assertEqual(ANOTHER_TEST_DIR, new_config.model_dir)
     self.assertEqual(RANDOM_SEED, new_config.tf_random_seed)
-
-    self.assertEqual(TEST_DIR, config.model_dir)
     self.assertEqual(RANDOM_SEED, config.tf_random_seed)
 
-    with self.assertRaises(ValueError):
-      # tf_random_seed is not allowed to be replaced.
-      config.replace(tf_random_seed=RANDOM_SEED)
-
-    with self.assertRaises(ValueError):
-      config.replace(some_undefined_property=RANDOM_SEED)
-
   def test_uid_for_different_configs(self):
     config = run_config_lib.RunConfig(
         tf_random_seed=RANDOM_SEED, model_dir=TEST_DIR)
@@ -257,6 +271,52 @@ class RunConfigTest(test.TestCase):
     self.assertNotEqual(expected_uid, new_config.uid())
     self.assertEqual(ANOTHER_TEST_DIR, new_config.model_dir)
 
+  def test_uid_for_whitelist(self):
+    whitelist = ["model_dir"]
+    config = run_config_lib.RunConfig(
+        tf_random_seed=RANDOM_SEED, model_dir=TEST_DIR)
+
+    expected_uid = config.uid(whitelist)
+    self.assertEqual(expected_uid, config.uid(whitelist))
+
+    new_config = config.replace(model_dir=ANOTHER_TEST_DIR)
+    self.assertEqual(TEST_DIR, config.model_dir)
+    self.assertEqual(expected_uid, new_config.uid(whitelist))
+    self.assertEqual(ANOTHER_TEST_DIR, new_config.model_dir)
+
+  def test_uid_for_default_whitelist(self):
+    config = run_config_lib.RunConfig(
+        tf_random_seed=11,
+        save_summary_steps=12,
+        save_checkpoints_steps=13,
+        save_checkpoints_secs=14,
+        session_config=config_pb2.ConfigProto(allow_soft_placement=True),
+        keep_checkpoint_max=16,
+        keep_checkpoint_every_n_hours=17)
+    self.assertEqual(11, config.tf_random_seed)
+    self.assertEqual(12, config.save_summary_steps)
+    self.assertEqual(13, config.save_checkpoints_steps)
+    self.assertEqual(14, config.save_checkpoints_secs)
+    self.assertEqual(config_pb2.ConfigProto(allow_soft_placement=True),
+                     config.session_config)
+    self.assertEqual(16, config.keep_checkpoint_max)
+    self.assertEqual(17, config.keep_checkpoint_every_n_hours)
+
+    new_config = run_config_lib.RunConfig(
+        tf_random_seed=21,
+        save_summary_steps=22,
+        save_checkpoints_steps=23,
+        save_checkpoints_secs=24,
+        session_config=config_pb2.ConfigProto(allow_soft_placement=False),
+        keep_checkpoint_max=26,
+        keep_checkpoint_every_n_hours=27)
+    self.assertEqual(config.uid(), new_config.uid())
+    # model_dir is not on the default whitelist.
+    self.assertNotEqual(config.uid(whitelist=[]),
+                        new_config.uid(whitelist=[]))
+    new_config = new_config.replace(model_dir=ANOTHER_TEST_DIR)
+    self.assertNotEqual(config.uid(), new_config.uid())
+
   def test_uid_for_deepcopy(self):
     tf_config = {
         "cluster": {
diff --git a/tensorflow/contrib/learn/python/learn/estimators/state_saving_rnn_estimator.py b/tensorflow/contrib/learn/python/learn/estimators/state_saving_rnn_estimator.py
index 64a97880c350e0d157ce4eaa170cd083104dcd9d..9cb4c3515a96c48d3c9ca53249e68096c5b26dcf 100644
--- a/tensorflow/contrib/learn/python/learn/estimators/state_saving_rnn_estimator.py
+++ b/tensorflow/contrib/learn/python/learn/estimators/state_saving_rnn_estimator.py
@@ -20,7 +20,6 @@ from __future__ import print_function
 
 from tensorflow.contrib import layers
 from tensorflow.contrib import rnn as rnn_cell
-from tensorflow.contrib.framework.python.framework import deprecated
 from tensorflow.contrib.layers.python.layers import feature_column_ops
 from tensorflow.contrib.layers.python.layers import optimizers
 from tensorflow.contrib.learn.python.learn.estimators import constants
@@ -145,7 +144,7 @@ def _prepare_features_for_sqss(features, labels, mode,
       describing sequence features. All items in the set should be instances
       of classes derived from `FeatureColumn`.
     context_feature_columns: An iterable containing all the feature columns
-      describing context features, i.e., features that apply accross all time
+      describing context features, i.e., features that apply across all time
       steps. All items in the set should be instances of classes derived from
       `FeatureColumn`.
 
@@ -262,7 +261,7 @@ def _read_batch(cell,
       describing sequence features. All items in the set should be instances
       of classes derived from `FeatureColumn`.
     context_feature_columns: An iterable containing all the feature columns
-      describing context features, i.e., features that apply accross all time
+      describing context features, i.e., features that apply across all time
       steps. All items in the set should be instances of classes derived from
       `FeatureColumn`.
     num_threads: The Python integer number of threads enqueuing input examples
@@ -421,7 +420,7 @@ def _get_rnn_model_fn(cell_type,
       describing sequence features. All items in the set should be instances
       of classes derived from `FeatureColumn`.
     context_feature_columns: An iterable containing all the feature columns
-      describing context features, i.e., features that apply accross all time
+      describing context features, i.e., features that apply across all time
       steps. All items in the set should be instances of classes derived from
       `FeatureColumn`.
     predict_probabilities: A boolean indicating whether to predict probabilities
@@ -564,7 +563,7 @@ class StateSavingRnnEstimator(estimator.Estimator):
         describing sequence features. All items in the set should be instances
         of classes derived from `FeatureColumn`.
       context_feature_columns: An iterable containing all the feature columns
-        describing context features, i.e., features that apply accross all time
+        describing context features, i.e., features that apply across all time
         steps. All items in the set should be instances of classes derived from
         `FeatureColumn`.
       num_classes: The number of classes for categorization. Used only and
diff --git a/tensorflow/contrib/learn/python/learn/estimators/state_saving_rnn_estimator_test.py b/tensorflow/contrib/learn/python/learn/estimators/state_saving_rnn_estimator_test.py
index e7470a544f06974b5aa959e0d0bca97121873e15..442247409dbc49052466c8b476be2ad1c840a814 100644
--- a/tensorflow/contrib/learn/python/learn/estimators/state_saving_rnn_estimator_test.py
+++ b/tensorflow/contrib/learn/python/learn/estimators/state_saving_rnn_estimator_test.py
@@ -35,8 +35,8 @@ from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import sparse_tensor
 from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import data_flow_ops
 from tensorflow.python.ops import init_ops
+from tensorflow.python.ops import lookup_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import random_ops
 from tensorflow.python.ops import variables
@@ -55,7 +55,7 @@ class PrepareInputsForRnnTest(test.TestCase):
 
     with self.test_session() as sess:
       sess.run(variables.global_variables_initializer())
-      sess.run(data_flow_ops.initialize_all_tables())
+      sess.run(lookup_ops.tables_initializer())
       features_val = sess.run(features_by_time)
       self.assertAllEqual(expected, features_val)
 
@@ -316,7 +316,7 @@ class StateSavingRnnEstimatorTest(test.TestCase):
 
     with self.test_session() as sess:
       sess.run(variables.global_variables_initializer())
-      sess.run(data_flow_ops.initialize_all_tables())
+      sess.run(lookup_ops.tables_initializer())
       actual_sequence, actual_context = sess.run(
           [sequence, context])
       assert_equal(expected_sequence, actual_sequence)
@@ -455,6 +455,7 @@ class LegacyConstructorTest(test.TestCase):
       return {'inputs': inputs}, labels
     return input_fn
 
+
 # TODO(jtbates): move all tests below to a benchmark test.
 class StateSavingRNNEstimatorLearningTest(test.TestCase):
   """Learning tests for state saving RNN Estimators."""
@@ -524,7 +525,7 @@ class StateSavingRNNEstimatorLearningTest(test.TestCase):
     num_classes = 2
     num_unroll = 32
     sequence_length = 32
-    train_steps = 200
+    train_steps = 300
     eval_steps = 20
     num_units = [4]
     learning_rate = 0.5
diff --git a/tensorflow/contrib/learn/python/learn/estimators/svm_test.py b/tensorflow/contrib/learn/python/learn/estimators/svm_test.py
index ccb33cae1e570db29528812903045f6b384c6f2a..f67f181d1ad629825aa7834f44199409cf15f774 100644
--- a/tensorflow/contrib/learn/python/learn/estimators/svm_test.py
+++ b/tensorflow/contrib/learn/python/learn/estimators/svm_test.py
@@ -59,9 +59,9 @@ class SVMTest(test.TestCase):
     def input_fn():
       return {
           'example_id': constant_op.constant(['1', '2', '3']),
-          'feature1': constant_op.constant([[0.5], [1.0], [1.0]]),
-          'feature2': constant_op.constant([[1.0], [-1.0], [0.5]]),
-      }, constant_op.constant([[1], [0], [1]])
+          'feature1': constant_op.constant([0.5, 1.0, 1.0]),
+          'feature2': constant_op.constant([1.0, -1.0, 0.5]),
+      }, constant_op.constant([1, 0, 1])
 
     feature1 = feature_column.real_valued_column('feature1')
     feature2 = feature_column.real_valued_column('feature2')
@@ -142,7 +142,7 @@ class SVMTest(test.TestCase):
     def input_fn():
       return {
           'example_id': constant_op.constant(['1', '2', '3']),
-          'feature1': constant_op.constant([[0.5], [1.0], [1.0]]),
+          'feature1': constant_op.constant([0.5, 1.0, 1.0]),
           'feature2': constant_op.constant([[1.0], [-1.0], [0.5]]),
       }, constant_op.constant([[1], [0], [1]])
 
@@ -223,7 +223,7 @@ class SVMTest(test.TestCase):
           'example_id':
               constant_op.constant(['1', '2', '3']),
           'price':
-              constant_op.constant([[0.6], [0.8], [0.3]]),
+              constant_op.constant([0.6, 0.8, 0.3]),
           'sq_footage':
               constant_op.constant([[900.0], [700.0], [600.0]]),
           'country':
diff --git a/tensorflow/contrib/learn/python/learn/experiment.py b/tensorflow/contrib/learn/python/learn/experiment.py
index 602d33e5f9b01adf133e1b8768a9bcc5be26159b..d82bc321e7634421b2192f4cf406c46f01f67efc 100644
--- a/tensorflow/contrib/learn/python/learn/experiment.py
+++ b/tensorflow/contrib/learn/python/learn/experiment.py
@@ -511,6 +511,8 @@ class Experiment(object):
     (via constructor). The model will be first trained for
     `train_steps_per_iteration`, and then be evaluated in turns.
 
+    This method is intended for single machine usage.
+
     This differs from `train_and_evaluate` as follows:
       1. The procedure will have train and evaluation in turns. The model
       will be trained for a number of steps (usuallly smaller than `train_steps`
@@ -647,6 +649,10 @@ class Experiment(object):
     if _sentinel is not None:
       raise ValueError("_call_train should be called with keyword args only")
 
+    # Estimator in core cannot work with monitors. We need to convert them
+    # to hooks. For Estimator in contrib, it is converted internally. So, it is
+    # safe to convert for both cases.
+    hooks = monitors.replace_monitors_with_hooks(hooks, self._estimator)
     if self._core_estimator_used:
       return self._estimator.train(input_fn=input_fn,
                                    steps=steps,
diff --git a/tensorflow/contrib/learn/python/learn/experiment_test.py b/tensorflow/contrib/learn/python/learn/experiment_test.py
index 4b5f3a195ce45e16a701bb935a7c8f6de55d912c..17feeb273625947fc3d59f0b3de71d08848c95e0 100644
--- a/tensorflow/contrib/learn/python/learn/experiment_test.py
+++ b/tensorflow/contrib/learn/python/learn/experiment_test.py
@@ -22,9 +22,9 @@ import os
 import tempfile
 import time
 
+from tensorflow.contrib.learn.python.learn import estimator as estimator_lib
 from tensorflow.contrib.learn.python.learn import evaluable
 from tensorflow.contrib.learn.python.learn import experiment
-from tensorflow.contrib.learn.python.learn import monitors
 from tensorflow.contrib.learn.python.learn import run_config
 from tensorflow.contrib.learn.python.learn import trainable
 from tensorflow.contrib.learn.python.learn.estimators import run_config as run_config_lib
@@ -39,6 +39,7 @@ from tensorflow.python.training import saver
 from tensorflow.python.training import server_lib
 from tensorflow.python.training import session_run_hook
 from tensorflow.python.util import compat
+from tensorflow.python.util import tf_inspect
 
 
 class SheepCounter(object):
@@ -120,6 +121,15 @@ class TestBaseEstimator(object):
         compat.as_bytes(export_dir_base), compat.as_bytes('bogus_timestamp'))
 
 
+def _check_method_supports_args(method, kwargs):
+  """Checks that the given method supports the given args."""
+  supported_args = tuple(tf_inspect.getargspec(method).args)
+  for kwarg in kwargs:
+    if kwarg not in supported_args:
+      raise ValueError(
+          'Argument `{}` is not supported in method {}.'.format(kwarg, method))
+
+
 class TestEstimator(
     TestBaseEstimator, evaluable.Evaluable, trainable.Trainable):
 
@@ -127,9 +137,12 @@ class TestEstimator(
     super(TestEstimator, self).__init__(config, max_evals, eval_dict)
     tf_logging.info('Create Estimator')
 
+  def evaluate(self, **kwargs):
+    _check_method_supports_args(evaluable.Evaluable.evaluate, kwargs)
+    return super(TestEstimator, self).evaluate(**kwargs)
+
   def fit(self, **kwargs):
-    if 'hooks' in kwargs:
-      raise ValueError('`hooks` is defined in core Estimator')
+    _check_method_supports_args(trainable.Trainable.fit, kwargs)
     if 'monitors' in kwargs:
       self.monitors = kwargs['monitors']
     return super(TestEstimator, self).train(**kwargs)
@@ -137,6 +150,13 @@ class TestEstimator(
   def train(self, **kwargs):
     raise ValueError('`train` is not defined in Estimator.')
 
+  def export_savedmodel(
+      self, export_dir_base, serving_input_fn, **kwargs):
+    _check_method_supports_args(
+        estimator_lib.Estimator.export_savedmodel, kwargs)
+    return super(TestEstimator, self).export_savedmodel(
+        export_dir_base, serving_input_fn, **kwargs)
+
 
 class TestCoreEstimator(TestBaseEstimator, core_estimator.Estimator):
 
@@ -145,17 +165,22 @@ class TestCoreEstimator(TestBaseEstimator, core_estimator.Estimator):
     tf_logging.info('Create Core Estimator')
 
   def evaluate(self, **kwargs):
-    if 'eval_metrics' in kwargs:
-      raise ValueError('`eval_metrics` is not defined in core Estimator')
+    _check_method_supports_args(core_estimator.Estimator.evaluate, kwargs)
     return super(TestCoreEstimator, self).evaluate(**kwargs)
 
   def train(self, **kwargs):
-    if 'monitors' in kwargs:
-      raise ValueError('`monitors` is not defined in core Estimator')
+    _check_method_supports_args(core_estimator.Estimator.train, kwargs)
     if 'hooks' in kwargs:
       self.monitors = kwargs['hooks']
     return super(TestCoreEstimator, self).train(**kwargs)
 
+  def export_savedmodel(
+      self, export_dir_base, serving_input_receiver_fn, **kwargs):
+    _check_method_supports_args(
+        core_estimator.Estimator.export_savedmodel, kwargs)
+    return super(TestCoreEstimator, self).export_savedmodel(
+        export_dir_base, serving_input_receiver_fn, **kwargs)
+
 
 class _NoopHook(session_run_hook.SessionRunHook):
   pass
@@ -185,6 +210,23 @@ class ExperimentTest(test.TestCase):
           eval_input_fn='eval_input',
           eval_metrics='eval_metrics')
 
+  def test_default_output_alternative_key_core_estimator(self):
+    est = TestCoreEstimator()
+    export_strategy = saved_model_export_utils.make_export_strategy(
+        est,
+        default_output_alternative_key='export_key',
+        exports_to_keep=None)
+    ex = experiment.Experiment(
+        est,
+        train_input_fn='train_input',
+        eval_input_fn='eval_input',
+        train_steps=100,
+        eval_steps=100,
+        export_strategies=export_strategy)
+    with self.assertRaisesRegexp(
+        ValueError, 'default_output_alternative_key is not supported'):
+      ex.train_and_evaluate()
+
   def test_train(self):
     for est in self._estimators_for_tests():
       eval_metrics = 'eval_metrics' if not isinstance(
@@ -461,7 +503,8 @@ class ExperimentTest(test.TestCase):
       self.assertEqual(1, est.eval_count)
       self.assertEqual(1, len(est.monitors))
       self.assertEqual([noop_hook], est.eval_hooks)
-      self.assertTrue(isinstance(est.monitors[0], monitors.ValidationMonitor))
+      self.assertTrue(isinstance(est.monitors[0],
+                                 session_run_hook.SessionRunHook))
 
   def test_train_hooks_extend_does_not_mutate_input_hooks(self):
     for est in self._estimators_for_tests():
@@ -508,7 +551,9 @@ class ExperimentTest(test.TestCase):
       eval_metrics = 'eval_metrics' if not isinstance(
           est, core_estimator.Estimator) else None
       export_strategy_1 = saved_model_export_utils.make_export_strategy(
-          est, 'export_input_1', exports_to_keep=None)
+          est,
+          None if isinstance(est, core_estimator.Estimator) else 'export_1',
+          exports_to_keep=None)
 
       ex = experiment.Experiment(
           est,
@@ -531,9 +576,13 @@ class ExperimentTest(test.TestCase):
       # After reset with list, the count should increase with the number of
       # items.
       export_strategy_2 = saved_model_export_utils.make_export_strategy(
-          est, 'export_input_2', exports_to_keep=None)
+          est,
+          None if isinstance(est, core_estimator.Estimator) else 'export_2',
+          exports_to_keep=None)
       export_strategy_3 = saved_model_export_utils.make_export_strategy(
-          est, 'export_input_3', exports_to_keep=None)
+          est,
+          None if isinstance(est, core_estimator.Estimator) else 'export_3',
+          exports_to_keep=None)
 
       old_es = ex.reset_export_strategies(
           [export_strategy_2, export_strategy_3])
@@ -547,7 +596,9 @@ class ExperimentTest(test.TestCase):
           est, core_estimator.Estimator) else None
       noop_hook = _NoopHook()
       export_strategy = saved_model_export_utils.make_export_strategy(
-          est, 'export_input', exports_to_keep=None)
+          est,
+          None if isinstance(est, core_estimator.Estimator) else 'export_input',
+          exports_to_keep=None)
       ex = experiment.Experiment(
           est,
           train_input_fn='train_input',
@@ -563,7 +614,8 @@ class ExperimentTest(test.TestCase):
       self.assertEqual(1, est.export_count)
       self.assertEqual(1, len(est.monitors))
       self.assertEqual([noop_hook], est.eval_hooks)
-      self.assertTrue(isinstance(est.monitors[0], monitors.ValidationMonitor))
+      self.assertTrue(isinstance(est.monitors[0],
+                                 session_run_hook.SessionRunHook))
 
   def test_train_and_evaluate_with_no_eval_during_training(self):
     for est in self._estimators_for_tests():
@@ -624,7 +676,9 @@ class ExperimentTest(test.TestCase):
           est, core_estimator.Estimator) else None
       noop_hook = _NoopHook()
       export_strategy = saved_model_export_utils.make_export_strategy(
-          est, 'export_input', exports_to_keep=None)
+          est,
+          None if isinstance(est, core_estimator.Estimator) else 'export_input',
+          exports_to_keep=None)
       ex = experiment.Experiment(
           est,
           train_input_fn='train_input',
@@ -645,7 +699,9 @@ class ExperimentTest(test.TestCase):
       eval_metrics = 'eval_metrics' if not isinstance(
           est, core_estimator.Estimator) else None
       export_strategy = saved_model_export_utils.make_export_strategy(
-          est, 'export_input', exports_to_keep=None)
+          est,
+          None if isinstance(est, core_estimator.Estimator) else 'export_input',
+          exports_to_keep=None)
       ex = experiment.Experiment(
           est,
           train_input_fn='train_input',
@@ -795,7 +851,9 @@ class ExperimentTest(test.TestCase):
   def test_test(self):
     for est in self._estimators_for_tests():
       exp_strategy = saved_model_export_utils.make_export_strategy(
-          est, 'export_input', exports_to_keep=None)
+          est,
+          None if isinstance(est, core_estimator.Estimator) else 'export_input',
+          exports_to_keep=None)
       ex = experiment.Experiment(
           est,
           train_input_fn='train_input',
diff --git a/tensorflow/contrib/learn/python/learn/graph_actions.py b/tensorflow/contrib/learn/python/learn/graph_actions.py
index 4b7867f2d0013012c7d988bbd84fa591942b7e04..98365c05f663e5d2a06703457fc5663d7135f7d9 100644
--- a/tensorflow/contrib/learn/python/learn/graph_actions.py
+++ b/tensorflow/contrib/learn/python/learn/graph_actions.py
@@ -37,8 +37,8 @@ from tensorflow.python.client import session as tf_session
 from tensorflow.python.framework import errors
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import control_flow_ops
-from tensorflow.python.ops import data_flow_ops
 from tensorflow.python.ops import logging_ops
+from tensorflow.python.ops import lookup_ops
 from tensorflow.python.ops import resources
 from tensorflow.python.ops import variables
 from tensorflow.python.platform import tf_logging as logging
@@ -429,11 +429,14 @@ def _get_ready_op():
 
 
 def _get_local_init_op():
+  """Returns the local init ops to initialize tables and local variables."""
   local_init_op = _get_first_op_from_collection(
       ops.GraphKeys.LOCAL_INIT_OP)
   if local_init_op is None:
-    op_list = [variables.local_variables_initializer(),
-               data_flow_ops.tables_initializer()]
+    op_list = [
+        variables.local_variables_initializer(),
+        lookup_ops.tables_initializer()
+    ]
     if op_list:
       local_init_op = control_flow_ops.group(*op_list)
       ops.add_to_collection(ops.GraphKeys.LOCAL_INIT_OP, local_init_op)
@@ -680,7 +683,7 @@ def run_feeds_iter(output_dict, feed_dicts, restore_checkpoint_path=None):
       else:
         session.run(variables.global_variables_initializer())
       session.run(variables.local_variables_initializer())
-      session.run(data_flow_ops.tables_initializer())
+      session.run(lookup_ops.tables_initializer())
       coord = coordinator.Coordinator()
       threads = None
       try:
diff --git a/tensorflow/contrib/learn/python/learn/learn_io/__init__.py b/tensorflow/contrib/learn/python/learn/learn_io/__init__.py
index 456792835827f86c0fbc42822e688240e6643ed4..06c3782a471537cf3879450e6bd20899a35d96ac 100644
--- a/tensorflow/contrib/learn/python/learn/learn_io/__init__.py
+++ b/tensorflow/contrib/learn/python/learn/learn_io/__init__.py
@@ -21,14 +21,14 @@ from __future__ import print_function
 from tensorflow.contrib.learn.python.learn.learn_io.dask_io import extract_dask_data
 from tensorflow.contrib.learn.python.learn.learn_io.dask_io import extract_dask_labels
 from tensorflow.contrib.learn.python.learn.learn_io.dask_io import HAS_DASK
-from tensorflow.contrib.learn.python.learn.learn_io.graph_io import _read_keyed_batch_examples_shared_queue
-from tensorflow.contrib.learn.python.learn.learn_io.graph_io import _read_keyed_batch_features_shared_queue
 from tensorflow.contrib.learn.python.learn.learn_io.graph_io import queue_parsed_features
 from tensorflow.contrib.learn.python.learn.learn_io.graph_io import read_batch_examples
 from tensorflow.contrib.learn.python.learn.learn_io.graph_io import read_batch_features
 from tensorflow.contrib.learn.python.learn.learn_io.graph_io import read_batch_record_features
 from tensorflow.contrib.learn.python.learn.learn_io.graph_io import read_keyed_batch_examples
+from tensorflow.contrib.learn.python.learn.learn_io.graph_io import read_keyed_batch_examples_shared_queue
 from tensorflow.contrib.learn.python.learn.learn_io.graph_io import read_keyed_batch_features
+from tensorflow.contrib.learn.python.learn.learn_io.graph_io import read_keyed_batch_features_shared_queue
 from tensorflow.contrib.learn.python.learn.learn_io.numpy_io import numpy_input_fn
 from tensorflow.contrib.learn.python.learn.learn_io.pandas_io import extract_pandas_data
 from tensorflow.contrib.learn.python.learn.learn_io.pandas_io import extract_pandas_labels
diff --git a/tensorflow/contrib/learn/python/learn/learn_io/graph_io.py b/tensorflow/contrib/learn/python/learn/learn_io/graph_io.py
index 9bdd3206b24fa592bcc2fbdd4f2eaa909f5357ee..6b552f59d080ab977876e5ff99628f51baab0856 100644
--- a/tensorflow/contrib/learn/python/learn/learn_io/graph_io.py
+++ b/tensorflow/contrib/learn/python/learn/learn_io/graph_io.py
@@ -174,17 +174,17 @@ def read_keyed_batch_examples(file_pattern,
       seed=seed)
 
 
-def _read_keyed_batch_examples_shared_queue(file_pattern,
-                                            batch_size,
-                                            reader,
-                                            randomize_input=True,
-                                            num_epochs=None,
-                                            queue_capacity=10000,
-                                            num_threads=1,
-                                            read_batch_size=1,
-                                            parse_fn=None,
-                                            name=None,
-                                            seed=None):
+def read_keyed_batch_examples_shared_queue(file_pattern,
+                                           batch_size,
+                                           reader,
+                                           randomize_input=True,
+                                           num_epochs=None,
+                                           queue_capacity=10000,
+                                           num_threads=1,
+                                           read_batch_size=1,
+                                           parse_fn=None,
+                                           name=None,
+                                           seed=None):
   """Adds operations to read, queue, batch `Example` protos.
 
   Given file pattern (or list of files), will setup a shared queue for file
@@ -512,18 +512,18 @@ def read_keyed_batch_features(file_pattern,
         name=scope)
 
 
-def _read_keyed_batch_features_shared_queue(file_pattern,
-                                            batch_size,
-                                            features,
-                                            reader,
-                                            randomize_input=True,
-                                            num_epochs=None,
-                                            queue_capacity=10000,
-                                            reader_num_threads=1,
-                                            feature_queue_capacity=100,
-                                            num_queue_runners=2,
-                                            parse_fn=None,
-                                            name=None):
+def read_keyed_batch_features_shared_queue(file_pattern,
+                                           batch_size,
+                                           features,
+                                           reader,
+                                           randomize_input=True,
+                                           num_epochs=None,
+                                           queue_capacity=10000,
+                                           reader_num_threads=1,
+                                           feature_queue_capacity=100,
+                                           num_queue_runners=2,
+                                           parse_fn=None,
+                                           name=None):
   """Adds operations to read, queue, batch and parse `Example` protos.
 
   Given file pattern (or list of files), will setup a shared queue for file
@@ -571,7 +571,7 @@ def _read_keyed_batch_features_shared_queue(file_pattern,
   """
 
   with ops.name_scope(name, 'read_batch_features', [file_pattern]) as scope:
-    keys, examples = _read_keyed_batch_examples_shared_queue(
+    keys, examples = read_keyed_batch_examples_shared_queue(
         file_pattern,
         batch_size,
         reader,
diff --git a/tensorflow/contrib/learn/python/learn/learn_io/graph_io_test.py b/tensorflow/contrib/learn/python/learn/learn_io/graph_io_test.py
index 542aaabc953be5f7deb3a2e9349df72355d4eef1..f25f7caf61574f4d6cbd4d64b99a5d4f18b6fb44 100644
--- a/tensorflow/contrib/learn/python/learn/learn_io/graph_io_test.py
+++ b/tensorflow/contrib/learn/python/learn/learn_io/graph_io_test.py
@@ -26,7 +26,6 @@ import tempfile
 from six.moves import xrange  # pylint: disable=redefined-builtin
 
 from tensorflow.contrib.learn.python.learn.learn_io import graph_io
-from tensorflow.contrib.learn.python.learn.learn_io.graph_io import _read_keyed_batch_examples_shared_queue
 from tensorflow.python.client import session as session_lib
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes as dtypes_lib
@@ -464,7 +463,7 @@ class GraphIOTest(test.TestCase):
     name = "my_batch"
 
     with ops.Graph().as_default() as g, self.test_session(graph=g) as session:
-      keys, inputs = _read_keyed_batch_examples_shared_queue(
+      keys, inputs = graph_io.read_keyed_batch_examples_shared_queue(
           filenames,
           batch_size,
           reader=io_ops.TextLineReader,
@@ -528,7 +527,7 @@ class GraphIOTest(test.TestCase):
 
     with ops.Graph().as_default() as g1, session_lib.Session(
         server.target, graph=g1) as session:
-      keys, inputs = _read_keyed_batch_examples_shared_queue(
+      keys, inputs = graph_io.read_keyed_batch_examples_shared_queue(
           filenames,
           batch_size,
           reader=io_ops.TextLineReader,
@@ -557,7 +556,7 @@ class GraphIOTest(test.TestCase):
 
     with ops.Graph().as_default() as g2, session_lib.Session(
         server.target, graph=g2) as session:
-      keys, inputs = _read_keyed_batch_examples_shared_queue(
+      keys, inputs = graph_io.read_keyed_batch_examples_shared_queue(
           filenames,
           batch_size,
           reader=io_ops.TextLineReader,
diff --git a/tensorflow/contrib/learn/python/learn/learn_runner.py b/tensorflow/contrib/learn/python/learn/learn_runner.py
index 983ac7462a2a5aa18e4043e7d6e34e97f8f698ec..a3398a87e1e44e64844446c378eea89cb5921404 100644
--- a/tensorflow/contrib/learn/python/learn/learn_runner.py
+++ b/tensorflow/contrib/learn/python/learn/learn_runner.py
@@ -68,7 +68,8 @@ def _wrapped_experiment_fn_with_uid_check(experiment_fn, require_hparams=False):
     if not isinstance(run_config, run_config_lib.RunConfig):
       raise ValueError('`run_config` must be `RunConfig` instance')
     if not run_config.model_dir:
-      raise ValueError('Must specify a model directory in `run_config`.')
+      raise ValueError(
+          'Must specify a model directory `model_dir` in `run_config`.')
     if hparams is not None and not isinstance(hparams, hparam_lib.HParams):
       raise ValueError('`hparams` must be `HParams` instance')
     if require_hparams and hparams is None:
@@ -110,6 +111,10 @@ def run(experiment_fn, output_dir=None, schedule=None, run_config=None,
   Example with `run_config` (Recommended):
   ```
     def _create_my_experiment(run_config, hparams):
+
+        # You can change a subset of the run_config properties as
+        #   run_config = run_config.replace(save_checkpoints_steps=500)
+
         return tf.contrib.learn.Experiment(
           estimator=my_estimator(config=run_config, hparams=hparams),
           train_input_fn=my_train_input,
@@ -118,8 +123,17 @@ def run(experiment_fn, output_dir=None, schedule=None, run_config=None,
     learn_runner.run(
       experiment_fn=_create_my_experiment,
       run_config=run_config_lib.RunConfig(model_dir="some/output/dir"),
-      schedule="train",
+      schedule="train_and_evaluate",
       hparams=_create_default_hparams())
+  ```
+  or simply as
+  ```
+    learn_runner.run(
+      experiment_fn=_create_my_experiment,
+      run_config=run_config_lib.RunConfig(model_dir="some/output/dir"))
+  ```
+  if `hparams` is not used by the `Estimator`. On a single machine, `schedule`
+  defaults to `train_and_evaluate`.
 
   Example with `output_dir` (deprecated):
   ```
@@ -147,7 +161,8 @@ def run(experiment_fn, output_dir=None, schedule=None, run_config=None,
       It must return an `Experiment`. For this case, `output_dir` must be None.
     output_dir: Base output directory [Deprecated].
     schedule: The name of the  method in the `Experiment` to run.
-    run_config: `RunConfig` instance. If set, `output_dir` must be None.
+    run_config: `RunConfig` instance. The `run_config.model_dir` must be
+      non-empty. If `run_config` is set, `output_dir` must be None.
     hparams: `HParams` instance. The default hyper-parameters, which will be
       passed to the `experiment_fn` if `run_config` is not None.
 
@@ -157,8 +172,8 @@ def run(experiment_fn, output_dir=None, schedule=None, run_config=None,
   Raises:
     ValueError: If both `output_dir` and `run_config` are empty or set,
       `schedule` is None but no task type is set in the built experiment's
-      config, the task type has no default, or `schedule` doesn't reference a
-      member of `Experiment`.
+      config, the task type has no default, `run_config.model_dir` is empty or
+      `schedule` doesn't reference a member of `Experiment`.
     TypeError: `schedule` references non-callable member.
   """
 
diff --git a/tensorflow/contrib/learn/python/learn/learn_runner_test.py b/tensorflow/contrib/learn/python/learn/learn_runner_test.py
index 6c8cde453f3a6b13ce9bef5e966964bf72157367..b61a42a1c762608df1344a5188176fab1dc25b65 100644
--- a/tensorflow/contrib/learn/python/learn/learn_runner_test.py
+++ b/tensorflow/contrib/learn/python/learn/learn_runner_test.py
@@ -36,7 +36,8 @@ patch = test.mock.patch
 _MODIR_DIR = "/tmp"
 _HPARAMS = hparam_lib.HParams(learning_rate=0.01)
 _MUST_SPECIFY_OUTPUT_DIR_MSG = "Must specify an output directory"
-_MISSING_MODEL_DIR_ERR_MSG = "Must specify a model directory in `run_config`."
+_MISSING_MODEL_DIR_ERR_MSG = (
+    "Must specify a model directory `model_dir` in `run_config`.")
 _EXP_NOT_CALLABLE_MSG = "Experiment builder .* is not callable"
 _INVALID_HPARAMS_ERR_MSG = "`hparams` must be `HParams` instance"
 _NOT_EXP_TYPE_MSG = "Experiment builder did not return an Experiment"
@@ -293,8 +294,7 @@ class LearnRunnerRunWithRunConfigTest(test.TestCase):
     def _experiment_fn(run_config, hparams):
       del run_config, hparams  # unused.
       # Explicitly use a new run_config.
-      new_config = run_config_lib.RunConfig(
-          model_dir=_MODIR_DIR, save_checkpoints_steps=123)
+      new_config = run_config_lib.RunConfig(model_dir=_MODIR_DIR + "/123")
 
       return TestExperiment(config=new_config)
 
diff --git a/tensorflow/contrib/learn/python/learn/monitors.py b/tensorflow/contrib/learn/python/learn/monitors.py
index 9f13392666041a6f234087b1339506ae4b3047ef..e97992fd209ddd6ad6ada2baef406b059f834255 100644
--- a/tensorflow/contrib/learn/python/learn/monitors.py
+++ b/tensorflow/contrib/learn/python/learn/monitors.py
@@ -47,6 +47,7 @@ from tensorflow.contrib.learn.python.learn import session_run_hook
 from tensorflow.contrib.learn.python.learn.summary_writer_cache import SummaryWriterCache
 from tensorflow.core.framework.summary_pb2 import Summary
 from tensorflow.core.util.event_pb2 import SessionLog
+from tensorflow.python.estimator import estimator as core_estimator
 from tensorflow.python.framework import ops
 from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.training import saver as saver_lib
@@ -473,7 +474,7 @@ class LoggingTrainable(EveryN):
 
   def every_n_step_begin(self, step):
     super(LoggingTrainable, self).every_n_step_begin(step)
-    # Get a list of trainable variables at the begining of every N steps.
+    # Get a list of trainable variables at the beginning of every N steps.
     # We cannot get this in __init__ because train_op has not been generated.
     trainables = ops.get_collection(ops.GraphKeys.TRAINABLE_VARIABLES,
                                     scope=self._scope)
@@ -634,6 +635,23 @@ class ValidationMonitor(EveryN):
     """Returns the best early stopping metric value found so far."""
     return self._best_value
 
+  def _evaluate_estimator(self):
+    if isinstance(self._estimator, core_estimator.Estimator):
+      if any((x is not None for x in
+              [self.x, self.y, self.batch_size, self.metrics])):
+        raise ValueError(
+            "tf.estimator.Estimator does not support following "
+            "arguments: x, y, batch_size, metrics. Should set as `None` "
+            "in ValidationMonitor")
+      return self._estimator.evaluate(
+          input_fn=self.input_fn, steps=self.eval_steps, hooks=self.hooks,
+          name=self.name)
+    else:
+      return self._estimator.evaluate(
+          x=self.x, y=self.y, input_fn=self.input_fn,
+          batch_size=self.batch_size, steps=self.eval_steps,
+          metrics=self.metrics, hooks=self.hooks, name=self.name)
+
   def every_n_step_end(self, step, outputs):
     super(ValidationMonitor, self).every_n_step_end(step, outputs)
     # TODO(mdan): The use of step below is probably misleading.
@@ -656,10 +674,7 @@ class ValidationMonitor(EveryN):
     self._latest_path_step = step
 
     # Run evaluation and log it.
-    validation_outputs = self._estimator.evaluate(
-        x=self.x, y=self.y, input_fn=self.input_fn, batch_size=self.batch_size,
-        steps=self.eval_steps, metrics=self.metrics, hooks=self.hooks,
-        name=self.name)
+    validation_outputs = self._evaluate_estimator()
     stats = []
     for name in validation_outputs:
       stats.append("%s = %s" % (name, str(validation_outputs[name])))
@@ -919,6 +934,10 @@ class ExportMonitor(EveryN):
   def every_n_step_end(self, step, outputs):
     super(ExportMonitor, self).every_n_step_end(step, outputs)
     try:
+      if isinstance(self._estimator, core_estimator.Estimator):
+        raise ValueError(
+            "ExportMonitor does not support `tf.estimator.Estimator. `. "
+            "Please pass an ExportStrategy to Experiment instead.")
       self._last_export_dir = self._estimator.export(
           self.export_dir,
           exports_to_keep=self.exports_to_keep,
@@ -946,6 +965,10 @@ class ExportMonitor(EveryN):
       logging.info("Skipping export at the end since model has not been saved "
                    "yet.")
       return
+    if isinstance(self._estimator, core_estimator.Estimator):
+      raise ValueError(
+          "ExportMonitor does not support `tf.estimator.Estimator. `. "
+          "Please pass an ExportStrategy to Experiment instead.")
     try:
       self._last_export_dir = self._estimator.export(
           self.export_dir,
diff --git a/tensorflow/contrib/learn/python/learn/monitors_test.py b/tensorflow/contrib/learn/python/learn/monitors_test.py
index f9ee03c94437e8d5e671b418a90c6a95e2037c40..221d5f1fef6b4a887e7d8f9f041d66db44b47e3e 100644
--- a/tensorflow/contrib/learn/python/learn/monitors_test.py
+++ b/tensorflow/contrib/learn/python/learn/monitors_test.py
@@ -31,6 +31,7 @@ from tensorflow.contrib.framework.python.ops import variables as variables_lib
 from tensorflow.contrib.learn.python import learn
 from tensorflow.contrib.learn.python.learn import estimators
 from tensorflow.python.client import session as session_lib
+from tensorflow.python.estimator import estimator as core_estimator
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import math_ops
@@ -449,6 +450,62 @@ class MonitorsTest(test.TestCase):
       monitor.epoch_end(epoch=0)
       monitor.end()
 
+  @test.mock.patch.object(saver, 'latest_checkpoint')
+  def test_validation_monitor_with_core_estimator(self, mock_latest_checkpoint):
+    estimator = test.mock.Mock(spec=core_estimator.Estimator)
+    model_dir = 'model/dir'
+    estimator.model_dir = model_dir
+    validation_outputs = {'loss': None}
+    estimator.evaluate.return_value = validation_outputs
+
+    monitor = learn.monitors.ValidationMonitor(
+        input_fn=lambda: constant_op.constant(2.0),
+        every_n_steps=0, early_stopping_rounds=2)
+    self._assert_validation_monitor(monitor)
+    monitor.set_estimator(estimator)
+    with ops.Graph().as_default() as g, self.test_session(g):
+      monitor.begin(max_steps=100)
+      monitor.epoch_begin(epoch=0)
+      self.assertEqual(0, estimator.evaluate.call_count)
+
+      # Step 0, initial loss.
+      step = 0
+      mock_latest_checkpoint.return_value = '%s/ckpt.%s' % (model_dir, step)
+      validation_outputs['loss'] = 42.0
+      self.assertEqual(0, len(monitor.step_begin(step=step)))
+      self.assertFalse(monitor.step_end(step=step, output={}))
+      self.assertEqual(1, estimator.evaluate.call_count)
+      self._assert_validation_monitor(
+          monitor, expected_best_step=0, expected_best_value=42.0)
+      monitor.post_step(step=step, session=None)
+
+  @test.mock.patch.object(saver, 'latest_checkpoint')
+  def test_validation_monitor_fail_with_core_estimator_and_metrics(
+      self, mock_latest_checkpoint):
+    estimator = test.mock.Mock(spec=core_estimator.Estimator)
+    model_dir = 'model/dir'
+    estimator.model_dir = model_dir
+    validation_outputs = {'loss': None}
+    estimator.evaluate.return_value = validation_outputs
+
+    monitor = learn.monitors.ValidationMonitor(
+        input_fn=lambda: constant_op.constant(2.0),
+        metrics=constant_op.constant(2.0),
+        every_n_steps=0, early_stopping_rounds=2)
+    monitor.set_estimator(estimator)
+    with ops.Graph().as_default() as g, self.test_session(g):
+      monitor.begin(max_steps=100)
+      monitor.epoch_begin(epoch=0)
+
+      with self.assertRaisesRegexp(
+          ValueError,
+          'tf.estimator.Estimator does not support .* metrics'):
+        step = 0
+        mock_latest_checkpoint.return_value = '%s/ckpt.%s' % (model_dir, step)
+        validation_outputs['loss'] = 42.0
+        self.assertEqual(0, len(monitor.step_begin(step=step)))
+        self.assertFalse(monitor.step_end(step=step, output={}))
+
   def test_graph_dump(self):
     monitor0 = learn.monitors.GraphDump()
     monitor1 = learn.monitors.GraphDump()
diff --git a/tensorflow/contrib/learn/python/learn/utils/export.py b/tensorflow/contrib/learn/python/learn/utils/export.py
index b53be292830c00eb4eb03cdd2cd0965b790aa170..36a1f5f60cddec4102aed057fac097eabfc249a7 100644
--- a/tensorflow/contrib/learn/python/learn/utils/export.py
+++ b/tensorflow/contrib/learn/python/learn/utils/export.py
@@ -28,7 +28,7 @@ from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
-from tensorflow.python.ops import data_flow_ops
+from tensorflow.python.ops import lookup_ops
 from tensorflow.python.ops import variables
 from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.training import saver as tf_saver
@@ -67,17 +67,17 @@ def _export_graph(graph, saver, checkpoint_path, export_dir,
   with graph.as_default():
     with tf_session.Session('') as session:
       variables.local_variables_initializer()
-      data_flow_ops.tables_initializer()
+      lookup_ops.tables_initializer()
       saver.restore(session, checkpoint_path)
 
       export = exporter.Exporter(saver)
-      export.init(init_op=control_flow_ops.group(
-          variables.local_variables_initializer(),
-          data_flow_ops.tables_initializer()),
-                  default_graph_signature=default_graph_signature,
-                  named_graph_signatures=named_graph_signatures,
-                  assets_collection=ops.get_collection(
-                      ops.GraphKeys.ASSET_FILEPATHS))
+      export.init(
+          init_op=control_flow_ops.group(
+              variables.local_variables_initializer(),
+              lookup_ops.tables_initializer()),
+          default_graph_signature=default_graph_signature,
+          named_graph_signatures=named_graph_signatures,
+          assets_collection=ops.get_collection(ops.GraphKeys.ASSET_FILEPATHS))
       return export.export(export_dir, contrib_variables.get_global_step(),
                            session, exports_to_keep=exports_to_keep)
 
diff --git a/tensorflow/contrib/learn/python/learn/utils/saved_model_export_utils.py b/tensorflow/contrib/learn/python/learn/utils/saved_model_export_utils.py
index 7ad3779314bab8be458cc078a9449048c56a9ffe..fa314e69c7adc9e707b49922de6e932797a6facf 100644
--- a/tensorflow/contrib/learn/python/learn/utils/saved_model_export_utils.py
+++ b/tensorflow/contrib/learn/python/learn/utils/saved_model_export_utils.py
@@ -42,6 +42,7 @@ from tensorflow.contrib.learn.python.learn.estimators import constants
 from tensorflow.contrib.learn.python.learn.estimators import prediction_key
 from tensorflow.contrib.learn.python.learn.utils import gc
 from tensorflow.contrib.learn.python.learn.utils import input_fn_utils
+from tensorflow.python.estimator import estimator as core_estimator
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors_impl
 from tensorflow.python.platform import gfile
@@ -352,7 +353,8 @@ def make_export_strategy(serving_input_fn,
       `InputFnOps`.
     default_output_alternative_key: the name of the head to serve when an
       incoming serving request does not explicitly request a specific head.
-      Not needed for single-headed models.
+      Must be `None` if the estimator inherits from ${tf.estimator.Estimator}
+      or for single-headed models.
     assets_extra: A dict specifying how to populate the assets.extra directory
       within the exported SavedModel.  Each key should give the destination
       path (including the filename) relative to the assets.extra directory.
@@ -384,14 +386,30 @@ def make_export_strategy(serving_input_fn,
 
     Returns:
       The string path to the exported directory.
+
+    Raises:
+      ValueError: If `estimator` is a ${tf.estimator.Estimator} instance
+        and `default_output_alternative_key` was specified.
     """
-    export_result = estimator.export_savedmodel(
-        export_dir_base,
-        serving_input_fn,
-        default_output_alternative_key=default_output_alternative_key,
-        assets_extra=assets_extra,
-        as_text=as_text,
-        checkpoint_path=checkpoint_path)
+    if isinstance(estimator, core_estimator.Estimator):
+      if default_output_alternative_key is not None:
+        raise ValueError(
+            'default_output_alternative_key is not supported in core '
+            'Estimator. Given: {}'.format(default_output_alternative_key))
+      export_result = estimator.export_savedmodel(
+          export_dir_base,
+          serving_input_fn,
+          assets_extra=assets_extra,
+          as_text=as_text,
+          checkpoint_path=checkpoint_path)
+    else:
+      export_result = estimator.export_savedmodel(
+          export_dir_base,
+          serving_input_fn,
+          default_output_alternative_key=default_output_alternative_key,
+          assets_extra=assets_extra,
+          as_text=as_text,
+          checkpoint_path=checkpoint_path)
 
     garbage_collect_exports(export_dir_base, exports_to_keep)
     return export_result
diff --git a/tensorflow/contrib/legacy_seq2seq/python/kernel_tests/seq2seq_test.py b/tensorflow/contrib/legacy_seq2seq/python/kernel_tests/seq2seq_test.py
index 8dcfb775b2711a356c2ff31f95019bfa5df08ee7..2898935a47892aea78c408d5d8d22b7a29908e5f 100644
--- a/tensorflow/contrib/legacy_seq2seq/python/kernel_tests/seq2seq_test.py
+++ b/tensorflow/contrib/legacy_seq2seq/python/kernel_tests/seq2seq_test.py
@@ -942,8 +942,8 @@ class Seq2SeqTest(test.TestCase):
         perplexities[bucket].append(math.exp(float(res[1])))
       for bucket in range(len(buckets)):
         if len(perplexities[bucket]) > 1:  # Assert that perplexity went down.
-          self.assertLess(perplexities[bucket][-1],  # 10% margin of error.
-                          1.1 * perplexities[bucket][0])
+          self.assertLess(perplexities[bucket][-1],  # 20% margin of error.
+                          1.2 * perplexities[bucket][0])
 
   def testModelWithBooleanFeedPrevious(self):
     """Test the model behavior when feed_previous is True.
diff --git a/tensorflow/contrib/linalg/python/kernel_tests/linear_operator_composition_test.py b/tensorflow/contrib/linalg/python/kernel_tests/linear_operator_composition_test.py
index 998073e28bd36f8e2ae5ef5e547f302225886f51..e2a7f5fbe10caaf578134dbea4395fd19f1a3a96 100644
--- a/tensorflow/contrib/linalg/python/kernel_tests/linear_operator_composition_test.py
+++ b/tensorflow/contrib/linalg/python/kernel_tests/linear_operator_composition_test.py
@@ -65,18 +65,21 @@ class SquareLinearOperatorCompositionTest(
       # feed_dict.
       matrices = sess.run(matrices)
       operator = linalg.LinearOperatorComposition(
-          [linalg.LinearOperatorFullMatrix(m_ph) for m_ph in matrices_ph])
+          [linalg.LinearOperatorFullMatrix(m_ph) for m_ph in matrices_ph],
+          is_square=True)
       feed_dict = {m_ph: m for (m_ph, m) in zip(matrices_ph, matrices)}
     else:
       operator = linalg.LinearOperatorComposition(
           [linalg.LinearOperatorFullMatrix(m) for m in matrices])
       feed_dict = None
+      # Should be auto-set.
+      self.assertTrue(operator.is_square)
 
     # Convert back to Tensor.  Needed if use_placeholder, since then we have
     # already evaluated each matrix to a numpy array.
-    apply_order_list = list(reversed(matrices))
-    mat = ops.convert_to_tensor(apply_order_list[0])
-    for other_mat in apply_order_list[1:]:
+    matmul_order_list = list(reversed(matrices))
+    mat = ops.convert_to_tensor(matmul_order_list[0])
+    for other_mat in matmul_order_list[1:]:
       mat = math_ops.matmul(other_mat, mat)
 
     return operator, mat, feed_dict
@@ -185,9 +188,9 @@ class NonSquareLinearOperatorCompositionTest(
 
     # Convert back to Tensor.  Needed if use_placeholder, since then we have
     # already evaluated each matrix to a numpy array.
-    apply_order_list = list(reversed(matrices))
-    mat = ops.convert_to_tensor(apply_order_list[0])
-    for other_mat in apply_order_list[1:]:
+    matmul_order_list = list(reversed(matrices))
+    mat = ops.convert_to_tensor(matmul_order_list[0])
+    for other_mat in matmul_order_list[1:]:
       mat = math_ops.matmul(other_mat, mat)
 
     return operator, mat, feed_dict
diff --git a/tensorflow/contrib/linalg/python/kernel_tests/linear_operator_diag_test.py b/tensorflow/contrib/linalg/python/kernel_tests/linear_operator_diag_test.py
index 3bb81a4333cf678153d5643d79359021e6614df8..397bfa22156e2f9398180b8fa57f34a10334906d 100644
--- a/tensorflow/contrib/linalg/python/kernel_tests/linear_operator_diag_test.py
+++ b/tensorflow/contrib/linalg/python/kernel_tests/linear_operator_diag_test.py
@@ -122,7 +122,7 @@ class LinearOperatorDiagTest(
     with self.assertRaisesRegexp(ValueError, "must have at least 1 dimension"):
       linalg.LinearOperatorDiag(1.)
 
-  def test_broadcast_apply_and_solve(self):
+  def test_broadcast_matmul_and_solve(self):
     # These cannot be done in the automated (base test class) tests since they
     # test shapes that tf.matmul cannot handle.
     # In particular, tf.matmul does not broadcast.
@@ -130,7 +130,7 @@ class LinearOperatorDiagTest(
       x = random_ops.random_normal(shape=(2, 2, 3, 4))
 
       # This LinearOperatorDiag will be brodacast to (2, 2, 3, 3) during solve
-      # and apply with 'x' as the argument.
+      # and matmul with 'x' as the argument.
       diag = random_ops.random_uniform(shape=(2, 1, 3))
       operator = linalg.LinearOperatorDiag(diag, is_self_adjoint=True)
       self.assertAllEqual((2, 1, 3, 3), operator.shape)
@@ -140,10 +140,10 @@ class LinearOperatorDiagTest(
       mat = array_ops.matrix_diag(diag_broadcast)
       self.assertAllEqual((2, 2, 3, 3), mat.get_shape())  # being pedantic.
 
-      operator_apply = operator.apply(x)
-      mat_apply = math_ops.matmul(mat, x)
-      self.assertAllEqual(operator_apply.get_shape(), mat_apply.get_shape())
-      self.assertAllClose(*sess.run([operator_apply, mat_apply]))
+      operator_matmul = operator.matmul(x)
+      mat_matmul = math_ops.matmul(mat, x)
+      self.assertAllEqual(operator_matmul.get_shape(), mat_matmul.get_shape())
+      self.assertAllClose(*sess.run([operator_matmul, mat_matmul]))
 
       operator_solve = operator.solve(x)
       mat_solve = linalg_ops.matrix_solve(mat, x)
diff --git a/tensorflow/contrib/linalg/python/kernel_tests/linear_operator_full_matrix_test.py b/tensorflow/contrib/linalg/python/kernel_tests/linear_operator_full_matrix_test.py
index d4a9e97ce7a86510b805af8b9f9cbd633147891e..528bc3ed124e96fe4630a3a99beb8c18635b6f8e 100644
--- a/tensorflow/contrib/linalg/python/kernel_tests/linear_operator_full_matrix_test.py
+++ b/tensorflow/contrib/linalg/python/kernel_tests/linear_operator_full_matrix_test.py
@@ -17,12 +17,15 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import numpy as np
+
 from tensorflow.contrib import linalg as linalg_lib
 from tensorflow.contrib.linalg.python.ops import linear_operator_test_util
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import random_seed
 from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import math_ops
 from tensorflow.python.platform import test
 
 linalg = linalg_lib
@@ -45,9 +48,10 @@ class SquareLinearOperatorFullMatrixTest(
       # values are random and we want the same value used for both mat and
       # feed_dict.
       matrix = matrix.eval()
-      operator = linalg.LinearOperatorFullMatrix(matrix_ph)
+      operator = linalg.LinearOperatorFullMatrix(matrix_ph, is_square=True)
       feed_dict = {matrix_ph: matrix}
     else:
+      # is_square should be auto-detected here.
       operator = linalg.LinearOperatorFullMatrix(matrix)
       feed_dict = None
 
@@ -68,6 +72,46 @@ class SquareLinearOperatorFullMatrixTest(
     self.assertTrue(operator.is_positive_definite)
     self.assertTrue(operator.is_non_singular)
     self.assertFalse(operator.is_self_adjoint)
+    # Auto-detected.
+    self.assertTrue(operator.is_square)
+
+  def test_assert_non_singular_raises_if_cond_too_big_but_finite(self):
+    with self.test_session():
+      tril = linear_operator_test_util.random_tril_matrix(
+          shape=(50, 50), dtype=np.float32)
+      diag = np.logspace(-2, 2, 50).astype(np.float32)
+      tril = array_ops.matrix_set_diag(tril, diag)
+      matrix = math_ops.matmul(tril, tril, transpose_b=True).eval()
+      operator = linalg.LinearOperatorFullMatrix(matrix)
+      with self.assertRaisesOpError("Singular matrix"):
+        # Ensure that we have finite condition number...just HUGE.
+        cond = np.linalg.cond(matrix)
+        self.assertTrue(np.isfinite(cond))
+        self.assertGreater(cond, 1e12)
+        operator.assert_non_singular().run()
+
+  def test_assert_non_singular_raises_if_cond_infinite(self):
+    with self.test_session():
+      matrix = [[1., 1.], [1., 1.]]
+      # We don't pass the is_self_adjoint hint here, which means we take the
+      # generic code path.
+      operator = linalg.LinearOperatorFullMatrix(matrix)
+      with self.assertRaisesOpError("Singular matrix"):
+        operator.assert_non_singular().run()
+
+  def test_assert_self_adjoint(self):
+    matrix = [[0., 1.], [0., 1.]]
+    operator = linalg.LinearOperatorFullMatrix(matrix)
+    with self.test_session():
+      with self.assertRaisesOpError("not equal to its adjoint"):
+        operator.assert_self_adjoint().run()
+
+  def test_assert_positive_definite(self):
+    matrix = [[1., 1.], [1., 1.]]
+    operator = linalg.LinearOperatorFullMatrix(matrix, is_self_adjoint=True)
+    with self.test_session():
+      with self.assertRaisesOpError("Cholesky decomposition was not success"):
+        operator.assert_positive_definite().run()
 
 
 class SquareLinearOperatorFullMatrixSymmetricPositiveDefiniteTest(
@@ -104,6 +148,7 @@ class SquareLinearOperatorFullMatrixSymmetricPositiveDefiniteTest(
       # values are random and we want the same value used for both mat and
       # feed_dict.
       matrix = matrix.eval()
+      # is_square is auto-set because of self_adjoint/pd.
       operator = linalg.LinearOperatorFullMatrix(
           matrix_ph, is_self_adjoint=True, is_positive_definite=True)
       feed_dict = {matrix_ph: matrix}
@@ -129,7 +174,36 @@ class SquareLinearOperatorFullMatrixSymmetricPositiveDefiniteTest(
 
     # Should be auto-set
     self.assertTrue(operator.is_non_singular)
-    self.assertTrue(operator._is_spd)
+    self.assertTrue(operator._can_use_cholesky)
+    self.assertTrue(operator.is_square)
+
+  def test_assert_non_singular(self):
+    matrix = [[1., 1.], [1., 1.]]
+    operator = linalg.LinearOperatorFullMatrix(
+        matrix, is_self_adjoint=True, is_positive_definite=True)
+    with self.test_session():
+      # Cholesky decomposition may fail, so the error is not specific to
+      # non-singular.
+      with self.assertRaisesOpError(""):
+        operator.assert_non_singular().run()
+
+  def test_assert_self_adjoint(self):
+    matrix = [[0., 1.], [0., 1.]]
+    operator = linalg.LinearOperatorFullMatrix(
+        matrix, is_self_adjoint=True, is_positive_definite=True)
+    with self.test_session():
+      with self.assertRaisesOpError("not equal to its adjoint"):
+        operator.assert_self_adjoint().run()
+
+  def test_assert_positive_definite(self):
+    matrix = [[1., 1.], [1., 1.]]
+    operator = linalg.LinearOperatorFullMatrix(
+        matrix, is_self_adjoint=True, is_positive_definite=True)
+    with self.test_session():
+      # Cholesky decomposition may fail, so the error is not specific to
+      # non-singular.
+      with self.assertRaisesOpError(""):
+        operator.assert_positive_definite().run()
 
 
 class NonSquareLinearOperatorFullMatrixTest(
@@ -157,16 +231,14 @@ class NonSquareLinearOperatorFullMatrixTest(
     return operator, mat, feed_dict
 
   def test_is_x_flags(self):
-    # Matrix with two positive eigenvalues.
-    matrix = [[3., 0.], [1., 1.]]
+    matrix = [[3., 2., 1.], [1., 1., 1.]]
     operator = linalg.LinearOperatorFullMatrix(
         matrix,
-        is_positive_definite=True,
-        is_non_singular=True,
         is_self_adjoint=False)
-    self.assertTrue(operator.is_positive_definite)
-    self.assertTrue(operator.is_non_singular)
+    self.assertEqual(operator.is_positive_definite, None)
+    self.assertEqual(operator.is_non_singular, None)
     self.assertFalse(operator.is_self_adjoint)
+    self.assertFalse(operator.is_square)
 
   def test_matrix_must_have_at_least_two_dims_or_raises(self):
     with self.assertRaisesRegexp(ValueError, "at least 2 dimensions"):
diff --git a/tensorflow/contrib/linalg/python/kernel_tests/linear_operator_identity_test.py b/tensorflow/contrib/linalg/python/kernel_tests/linear_operator_identity_test.py
index 36a255f3d506c96a2f67263c383e0f763cf47ccb..5faf2c432b6610863864717fd5f693b1aa781915 100644
--- a/tensorflow/contrib/linalg/python/kernel_tests/linear_operator_identity_test.py
+++ b/tensorflow/contrib/linalg/python/kernel_tests/linear_operator_identity_test.py
@@ -77,14 +77,14 @@ class LinearOperatorIdentityTest(
       operator = linalg_lib.LinearOperatorIdentity(num_rows=2)
       operator.assert_self_adjoint().run()  # Should not fail
 
-  def test_float16_apply(self):
+  def test_float16_matmul(self):
     # float16 cannot be tested by base test class because tf.matrix_solve does
     # not work with float16.
     with self.test_session():
       operator = linalg_lib.LinearOperatorIdentity(
           num_rows=2, dtype=dtypes.float16)
       x = rng.randn(2, 3).astype(np.float16)
-      y = operator.apply(x)
+      y = operator.matmul(x)
       self.assertAllClose(x, y.eval())
 
   def test_non_scalar_num_rows_raises_static(self):
@@ -147,7 +147,7 @@ class LinearOperatorIdentityTest(
     operator = linalg_lib.LinearOperatorIdentity(num_rows=2)
     x = rng.randn(3, 3).astype(np.float32)
     with self.assertRaisesRegexp(ValueError, "Dimensions.*not compatible"):
-      operator.apply(x)
+      operator.matmul(x)
 
   def test_wrong_matrix_dimensions_raises_dynamic(self):
     num_rows = array_ops.placeholder(dtypes.int32)
@@ -156,7 +156,7 @@ class LinearOperatorIdentityTest(
     with self.test_session():
       operator = linalg_lib.LinearOperatorIdentity(
           num_rows, assert_proper_shapes=True)
-      y = operator.apply(x)
+      y = operator.matmul(x)
       with self.assertRaisesOpError("Incompatible.*dimensions"):
         y.eval(feed_dict={num_rows: 2, x: rng.rand(3, 3)})
 
@@ -168,11 +168,11 @@ class LinearOperatorIdentityTest(
       x = random_ops.random_normal(shape=(1, 2, 3, 4))
       operator = linalg_lib.LinearOperatorIdentity(num_rows=3, dtype=x.dtype)
 
-      operator_apply = operator.apply(x)
+      operator_matmul = operator.matmul(x)
       expected = x
 
-      self.assertAllEqual(operator_apply.get_shape(), expected.get_shape())
-      self.assertAllClose(*sess.run([operator_apply, expected]))
+      self.assertAllEqual(operator_matmul.get_shape(), expected.get_shape())
+      self.assertAllClose(*sess.run([operator_matmul, expected]))
 
   def test_default_batch_shape_broadcasts_with_everything_dynamic(self):
     # These cannot be done in the automated (base test class) tests since they
@@ -182,15 +182,15 @@ class LinearOperatorIdentityTest(
       x = array_ops.placeholder(dtypes.float32)
       operator = linalg_lib.LinearOperatorIdentity(num_rows=3, dtype=x.dtype)
 
-      operator_apply = operator.apply(x)
+      operator_matmul = operator.matmul(x)
       expected = x
 
       feed_dict = {x: rng.randn(1, 2, 3, 4)}
 
       self.assertAllClose(
-          *sess.run([operator_apply, expected], feed_dict=feed_dict))
+          *sess.run([operator_matmul, expected], feed_dict=feed_dict))
 
-  def test_broadcast_apply_static_shapes(self):
+  def test_broadcast_matmul_static_shapes(self):
     # These cannot be done in the automated (base test class) tests since they
     # test shapes that tf.batch_matmul cannot handle.
     # In particular, tf.batch_matmul does not broadcast.
@@ -204,14 +204,14 @@ class LinearOperatorIdentityTest(
       # Batch matrix of zeros with the broadcast shape of x and operator.
       zeros = array_ops.zeros(shape=(2, 2, 3, 4), dtype=x.dtype)
 
-      # Expected result of apply and solve.
+      # Expected result of matmul and solve.
       expected = x + zeros
 
-      operator_apply = operator.apply(x)
-      self.assertAllEqual(operator_apply.get_shape(), expected.get_shape())
-      self.assertAllClose(*sess.run([operator_apply, expected]))
+      operator_matmul = operator.matmul(x)
+      self.assertAllEqual(operator_matmul.get_shape(), expected.get_shape())
+      self.assertAllClose(*sess.run([operator_matmul, expected]))
 
-  def test_broadcast_apply_dynamic_shapes(self):
+  def test_broadcast_matmul_dynamic_shapes(self):
     # These cannot be done in the automated (base test class) tests since they
     # test shapes that tf.batch_matmul cannot handle.
     # In particular, tf.batch_matmul does not broadcast.
@@ -229,12 +229,12 @@ class LinearOperatorIdentityTest(
       # Batch matrix of zeros with the broadcast shape of x and operator.
       zeros = array_ops.zeros(shape=(2, 2, 3, 4), dtype=x.dtype)
 
-      # Expected result of apply and solve.
+      # Expected result of matmul and solve.
       expected = x + zeros
 
-      operator_apply = operator.apply(x)
+      operator_matmul = operator.matmul(x)
       self.assertAllClose(
-          *sess.run([operator_apply, expected], feed_dict=feed_dict))
+          *sess.run([operator_matmul, expected], feed_dict=feed_dict))
 
   def test_is_x_flags(self):
     # The is_x flags are by default all True.
@@ -332,7 +332,7 @@ class LinearOperatorScaledIdentityTest(
       with self.assertRaisesOpError("not self-adjoint"):
         operator.assert_self_adjoint().run()
 
-  def test_float16_apply(self):
+  def test_float16_matmul(self):
     # float16 cannot be tested by base test class because tf.matrix_solve does
     # not work with float16.
     with self.test_session():
@@ -340,7 +340,7 @@ class LinearOperatorScaledIdentityTest(
       operator = linalg_lib.LinearOperatorScaledIdentity(
           num_rows=2, multiplier=multiplier)
       x = rng.randn(2, 3).astype(np.float16)
-      y = operator.apply(x)
+      y = operator.matmul(x)
       self.assertAllClose(multiplier[..., None, None] * x, y.eval())
 
   def test_non_scalar_num_rows_raises_static(self):
@@ -354,7 +354,7 @@ class LinearOperatorScaledIdentityTest(
         num_rows=2, multiplier=2.2)
     x = rng.randn(3, 3).astype(np.float32)
     with self.assertRaisesRegexp(ValueError, "Dimensions.*not compatible"):
-      operator.apply(x)
+      operator.matmul(x)
 
   def test_wrong_matrix_dimensions_raises_dynamic(self):
     num_rows = array_ops.placeholder(dtypes.int32)
@@ -363,11 +363,11 @@ class LinearOperatorScaledIdentityTest(
     with self.test_session():
       operator = linalg_lib.LinearOperatorScaledIdentity(
           num_rows, multiplier=[1., 2], assert_proper_shapes=True)
-      y = operator.apply(x)
+      y = operator.matmul(x)
       with self.assertRaisesOpError("Incompatible.*dimensions"):
         y.eval(feed_dict={num_rows: 2, x: rng.rand(3, 3)})
 
-  def test_broadcast_apply_and_solve(self):
+  def test_broadcast_matmul_and_solve(self):
     # These cannot be done in the automated (base test class) tests since they
     # test shapes that tf.batch_matmul cannot handle.
     # In particular, tf.batch_matmul does not broadcast.
@@ -383,11 +383,11 @@ class LinearOperatorScaledIdentityTest(
       # Batch matrix of zeros with the broadcast shape of x and operator.
       zeros = array_ops.zeros(shape=(2, 2, 3, 4), dtype=x.dtype)
 
-      # Test apply
+      # Test matmul
       expected = x * 2.2 + zeros
-      operator_apply = operator.apply(x)
-      self.assertAllEqual(operator_apply.get_shape(), expected.get_shape())
-      self.assertAllClose(*sess.run([operator_apply, expected]))
+      operator_matmul = operator.matmul(x)
+      self.assertAllEqual(operator_matmul.get_shape(), expected.get_shape())
+      self.assertAllClose(*sess.run([operator_matmul, expected]))
 
       # Test solve
       expected = x / 2.2 + zeros
@@ -395,7 +395,7 @@ class LinearOperatorScaledIdentityTest(
       self.assertAllEqual(operator_solve.get_shape(), expected.get_shape())
       self.assertAllClose(*sess.run([operator_solve, expected]))
 
-  def test_broadcast_apply_and_solve_scalar_scale_multiplier(self):
+  def test_broadcast_matmul_and_solve_scalar_scale_multiplier(self):
     # These cannot be done in the automated (base test class) tests since they
     # test shapes that tf.batch_matmul cannot handle.
     # In particular, tf.batch_matmul does not broadcast.
@@ -409,11 +409,11 @@ class LinearOperatorScaledIdentityTest(
       operator = linalg_lib.LinearOperatorScaledIdentity(
           num_rows=3, multiplier=2.2)
 
-      # Test apply
+      # Test matmul
       expected = x * 2.2
-      operator_apply = operator.apply(x)
-      self.assertAllEqual(operator_apply.get_shape(), expected.get_shape())
-      self.assertAllClose(*sess.run([operator_apply, expected]))
+      operator_matmul = operator.matmul(x)
+      self.assertAllEqual(operator_matmul.get_shape(), expected.get_shape())
+      self.assertAllClose(*sess.run([operator_matmul, expected]))
 
       # Test solve
       expected = x / 2.2
diff --git a/tensorflow/contrib/linalg/python/kernel_tests/linear_operator_test.py b/tensorflow/contrib/linalg/python/kernel_tests/linear_operator_test.py
index c5bfc6e1fd57de54dc0ddbdecbefbce059b6c2f4..78a4822c177c8d36fcbe82d3b557b2e6cb3630af 100644
--- a/tensorflow/contrib/linalg/python/kernel_tests/linear_operator_test.py
+++ b/tensorflow/contrib/linalg/python/kernel_tests/linear_operator_test.py
@@ -23,6 +23,7 @@ from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_shape
 from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import linalg_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.platform import test
 
@@ -54,9 +55,12 @@ class LinearOperatorShape(linalg.LinearOperator):
   def _shape_tensor(self):
     return constant_op.constant(self._stored_shape, dtype=dtypes.int32)
 
+  def _matmul(self):
+    raise NotImplementedError("Not needed for this test.")
 
-class LinearOperatorApplyOnly(linalg.LinearOperator):
-  """LinearOperator that simply wraps a [batch] matrix and implements apply."""
+
+class LinearOperatorMatmulSolve(linalg.LinearOperator):
+  """LinearOperator that wraps a [batch] matrix and implements matmul/solve."""
 
   def __init__(self,
                matrix,
@@ -65,8 +69,8 @@ class LinearOperatorApplyOnly(linalg.LinearOperator):
                is_positive_definite=None,
                is_square=None):
     self._matrix = ops.convert_to_tensor(matrix, name="matrix")
-    super(LinearOperatorApplyOnly, self).__init__(
-        dtype=matrix.dtype,
+    super(LinearOperatorMatmulSolve, self).__init__(
+        dtype=self._matrix.dtype,
         is_non_singular=is_non_singular,
         is_self_adjoint=is_self_adjoint,
         is_positive_definite=is_positive_definite,
@@ -78,10 +82,16 @@ class LinearOperatorApplyOnly(linalg.LinearOperator):
   def _shape_tensor(self):
     return array_ops.shape(self._matrix)
 
-  def _apply(self, x, adjoint=False, adjoint_arg=False):
+  def _matmul(self, x, adjoint=False, adjoint_arg=False):
+    x = ops.convert_to_tensor(x, name="x")
     return math_ops.matmul(
         self._matrix, x, adjoint_a=adjoint, adjoint_b=adjoint_arg)
 
+  def _solve(self, rhs, adjoint=False, adjoint_arg=False):
+    rhs = ops.convert_to_tensor(rhs, name="rhs")
+    assert not adjoint_arg, "Not implemented for this test class."
+    return linalg_ops.matrix_solve(self._matrix, rhs, adjoint=adjoint)
+
 
 class LinearOperatorTest(test.TestCase):
 
@@ -119,7 +129,7 @@ class LinearOperatorTest(test.TestCase):
 
   def test_generic_to_dense_method_non_square_matrix_static(self):
     matrix = rng.randn(2, 3, 4)
-    operator = LinearOperatorApplyOnly(matrix)
+    operator = LinearOperatorMatmulSolve(matrix)
     with self.test_session():
       operator_dense = operator.to_dense()
       self.assertAllEqual((2, 3, 4), operator_dense.get_shape())
@@ -128,12 +138,30 @@ class LinearOperatorTest(test.TestCase):
   def test_generic_to_dense_method_non_square_matrix_tensor(self):
     matrix = rng.randn(2, 3, 4)
     matrix_ph = array_ops.placeholder(dtypes.float64)
-    operator = LinearOperatorApplyOnly(matrix_ph)
+    operator = LinearOperatorMatmulSolve(matrix_ph)
     with self.test_session():
       operator_dense = operator.to_dense()
       self.assertAllClose(
           matrix, operator_dense.eval(feed_dict={matrix_ph: matrix}))
 
+  def test_matvec(self):
+    matrix = [[1., 0], [0., 2.]]
+    operator = LinearOperatorMatmulSolve(matrix)
+    x = [1., 1.]
+    with self.test_session():
+      y = operator.matvec(x)
+      self.assertAllEqual((2,), y.get_shape())
+      self.assertAllClose([1., 2.], y.eval())
+
+  def test_solvevec(self):
+    matrix = [[1., 0], [0., 2.]]
+    operator = LinearOperatorMatmulSolve(matrix)
+    y = [1., 1.]
+    with self.test_session():
+      x = operator.solvevec(y)
+      self.assertAllEqual((2,), x.get_shape())
+      self.assertAllClose([1., 1 / 2.], x.eval())
+
   def test_is_square_set_to_true_for_square_static_shapes(self):
     operator = LinearOperatorShape(shape=(2, 4, 4))
     self.assertTrue(operator.is_square)
@@ -149,11 +177,11 @@ class LinearOperatorTest(test.TestCase):
   def test_is_square_set_inconsistent_with_other_hints_raises(self):
     with self.assertRaisesRegexp(ValueError, "is always square"):
       matrix = array_ops.placeholder(dtypes.float32)
-      LinearOperatorApplyOnly(matrix, is_non_singular=True, is_square=False)
+      LinearOperatorMatmulSolve(matrix, is_non_singular=True, is_square=False)
 
     with self.assertRaisesRegexp(ValueError, "is always square"):
       matrix = array_ops.placeholder(dtypes.float32)
-      LinearOperatorApplyOnly(
+      LinearOperatorMatmulSolve(
           matrix, is_positive_definite=True, is_square=False)
 
   def test_non_square_operators_raise_on_determinant_and_solve(self):
@@ -167,16 +195,16 @@ class LinearOperatorTest(test.TestCase):
 
     with self.assertRaisesRegexp(ValueError, "is always square"):
       matrix = array_ops.placeholder(dtypes.float32)
-      LinearOperatorApplyOnly(
+      LinearOperatorMatmulSolve(
           matrix, is_positive_definite=True, is_square=False)
 
   def test_is_square_manual_set_works(self):
     matrix = array_ops.placeholder(dtypes.float32)
     # Default is None.
-    operator = LinearOperatorApplyOnly(matrix)
+    operator = LinearOperatorMatmulSolve(matrix)
     self.assertEqual(None, operator.is_square)
     # Set to True
-    operator = LinearOperatorApplyOnly(matrix, is_square=True)
+    operator = LinearOperatorMatmulSolve(matrix, is_square=True)
     self.assertTrue(operator.is_square)
 
 
diff --git a/tensorflow/contrib/linalg/python/ops/linear_operator.py b/tensorflow/contrib/linalg/python/ops/linear_operator.py
index 454411d93cf94bf6222006c81017808d6c807618..6cdfa8618932d0e9ae1198d68e78f36583022390 100644
--- a/tensorflow/contrib/linalg/python/ops/linear_operator.py
+++ b/tensorflow/contrib/linalg/python/ops/linear_operator.py
@@ -18,13 +18,19 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import abc
 import contextlib
 
+import numpy as np
+
 from tensorflow.contrib import framework as contrib_framework
 from tensorflow.contrib.linalg.python.ops import linear_operator_util
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import check_ops
 from tensorflow.python.ops import linalg_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.platform import tf_logging as logging
 
 __all__ = ["LinearOperator"]
 
@@ -45,16 +51,14 @@ class LinearOperator(object):
   To enable a public method, subclasses should implement the leading-underscore
   version of the method.  The argument signature should be identical except for
   the omission of `name="..."`.  For example, to enable
-  `apply(x, adjoint=False, name="apply")` a subclass should implement
-  `_apply(x, adjoint=False)`.
+  `matmul(x, adjoint=False, name="matmul")` a subclass should implement
+  `_matmul(x, adjoint=False)`.
 
   #### Performance contract
 
-  Subclasses should implement a method only if it can be done with a reasonable
-  performance increase over generic dense operations, either in time, parallel
-  scalability, or memory usage.  For example, if the determinant can only be
-  computed using `tf.matrix_determinant(self.to_dense())`, then determinants
-  should not be implemented.
+  Subclasses should only implement the assert methods
+  (e.g. `assert_non_singular`) if they can be done in less than `O(N^3)`
+  time.
 
   Class docstrings should contain an explanation of computational complexity.
   Since this is a high-performance library, attention should be paid to detail,
@@ -68,7 +72,7 @@ class LinearOperator(object):
 
   An example is:
 
-  `x` is a batch matrix with compatible shape for `apply` if
+  `x` is a batch matrix with compatible shape for `matmul` if
 
   ```
   operator.shape = [B1,...,Bb] + [M, N],  b >= 0,
@@ -100,12 +104,12 @@ class LinearOperator(object):
   operator.shape()
   ==> [2, 4, 4]
 
-  operator.log_determinant()
+  operator.log_abs_determinant()
   ==> Shape [2] Tensor
 
   x = ... Shape [2, 4, 5] Tensor
 
-  operator.apply(x)
+  operator.matmul(x)
   ==> Shape [2, 4, 5] Tensor
   ```
 
@@ -131,6 +135,7 @@ class LinearOperator(object):
   * If `is_X == None` (the default), callers should have no expectation either
     way.
   """
+  __metaclass__ = abc.ABCMeta
 
   def __init__(self,
                dtype,
@@ -146,7 +151,7 @@ class LinearOperator(object):
     **Subclasses should copy-paste this `__init__` documentation.**
 
     Args:
-      dtype: The type of the this `LinearOperator`.  Arguments to `apply` and
+      dtype: The type of the this `LinearOperator`.  Arguments to `matmul` and
         `solve` will have to be this type.
       graph_parents: Python list of graph prerequisites of this `LinearOperator`
         Typically tensors that are passed during initialization.
@@ -167,17 +172,23 @@ class LinearOperator(object):
       ValueError:  If hints are set incorrectly.
     """
     # Check and auto-set flags.
-    if is_square is False:
-      if is_non_singular or is_positive_definite:
-        raise ValueError(
-            "A non-singular or positive definite operator is always square.")
-    self._is_square_set_by_user = is_square
-
     if is_positive_definite:
       if is_non_singular is False:
         raise ValueError("A positive definite matrix is always non-singular.")
       is_non_singular = True
 
+    if is_non_singular:
+      if is_square is False:
+        raise ValueError("A non-singular matrix is always square.")
+      is_square = True
+
+    if is_self_adjoint:
+      if is_square is False:
+        raise ValueError("A self-adjoint matrix is always square.")
+      is_square = True
+
+    self._is_square_set_or_implied_by_hints = is_square
+
     graph_parents = [] if graph_parents is None else graph_parents
     for i, t in enumerate(graph_parents):
       if t is None or not contrib_framework.is_tensor(t):
@@ -239,15 +250,16 @@ class LinearOperator(object):
     """Return `True/False` depending on if this operator is square."""
     # Static checks done after __init__.  Why?  Because domain/range dimension
     # sometimes requires lots of work done in the derived class after init.
-    static_square_check = self.domain_dimension == self.range_dimension
-    if self._is_square_set_by_user is False and static_square_check:
+    auto_square_check = self.domain_dimension == self.range_dimension
+    if self._is_square_set_or_implied_by_hints is False and auto_square_check:
       raise ValueError(
           "User set is_square hint to False, but the operator was square.")
-    if self._is_square_set_by_user is None:
-      return static_square_check
+    if self._is_square_set_or_implied_by_hints is None:
+      return auto_square_check
 
-    return self._is_square_set_by_user
+    return self._is_square_set_or_implied_by_hints
 
+  @abc.abstractmethod
   def _shape(self):
     # Write this in derived class to enable all static shape methods.
     raise NotImplementedError("_shape is not implemented.")
@@ -265,6 +277,7 @@ class LinearOperator(object):
     """
     return self._shape()
 
+  @abc.abstractmethod
   def _shape_tensor(self):
     raise NotImplementedError("_shape_tensor is not implemented.")
 
@@ -367,8 +380,7 @@ class LinearOperator(object):
           self._cached_tensor_rank_tensor = ops.convert_to_tensor(
               self.tensor_rank)
         else:
-          self._cached_tensor_rank_tensor = array_ops.size(
-              self.shape_tensor())
+          self._cached_tensor_rank_tensor = array_ops.size(self.shape_tensor())
       return self._cached_tensor_rank_tensor
 
   @property
@@ -448,14 +460,70 @@ class LinearOperator(object):
       return self._cached_range_dimension_tensor
 
   def _assert_non_singular(self):
+    """Private default implementation of _assert_non_singular."""
+    logging.warn(
+        "Using (possibly slow) default implementation of assert_non_singular."
+        "  Requires conversion to a dense matrix and O(N^3) operations.")
+    if self._can_use_cholesky():
+      return self.assert_positive_definite()
+    else:
+      singular_values = linalg_ops.svd(
+          self._get_cached_dense_matrix(), compute_uv=False)
+      # TODO(langmore) Add .eig and .cond as methods.
+      cond = (math_ops.reduce_max(singular_values, axis=-1) /
+              math_ops.reduce_min(singular_values, axis=-1))
+      return check_ops.assert_less(
+          cond,
+          self._max_condition_number_to_be_non_singular(),
+          message="Singular matrix up to precision epsilon.")
     raise NotImplementedError("assert_non_singular is not implemented.")
 
+  def _max_condition_number_to_be_non_singular(self):
+    """Return the maximum condition number that we consider nonsingular."""
+    with ops.name_scope("max_nonsingular_condition_number"):
+      dtype_eps = np.finfo(self.dtype.as_numpy_dtype).eps
+      eps = math_ops.cast(
+          math_ops.reduce_max([
+              100.,
+              math_ops.cast(self.range_dimension_tensor(), self.dtype),
+              math_ops.cast(self.domain_dimension_tensor(), self.dtype)
+          ]), self.dtype) * dtype_eps
+      return 1. / eps
+
   def assert_non_singular(self, name="assert_non_singular"):
-    """Returns an `Op` that asserts this operator is non singular."""
+    """Returns an `Op` that asserts this operator is non singular.
+
+    This operator is considered non-singular if
+
+    ```
+    ConditionNumber < max{100, range_dimension, domain_dimension} * eps,
+    eps := np.finfo(self.dtype.as_numpy_dtype).eps
+    ```
+
+    Args:
+      name:  A string name to prepend to created ops.
+
+    Returns:
+      An `Assert` `Op`, that, when run, will raise an `InvalidArgumentError` if
+        the operator is singular.
+    """
     with self._name_scope(name):
       return self._assert_non_singular()
 
   def _assert_positive_definite(self):
+    """Default implementation of _assert_positive_definite."""
+    logging.warn(
+        "Using (possibly slow) default implementation of "
+        "assert_positive_definite."
+        "  Requires conversion to a dense matrix and O(N^3) operations.")
+    # If the operator is self-adjoint, then checking that
+    # Cholesky decomposition succeeds + results in positive diag is necessary
+    # and sufficient.
+    if self.is_self_adjoint:
+      return check_ops.assert_positive(
+          array_ops.matrix_diag_part(self._get_cached_chol()),
+          message="Matrix was not positive definite.")
+    # We have no generic check for positive definite.
     raise NotImplementedError("assert_positive_definite is not implemented.")
 
   def assert_positive_definite(self, name="assert_positive_definite"):
@@ -469,16 +537,35 @@ class LinearOperator(object):
       name:  A name to give this `Op`.
 
     Returns:
-      An `Op` that asserts this operator is positive definite.
+      An `Assert` `Op`, that, when run, will raise an `InvalidArgumentError` if
+        the operator is not positive definite.
     """
     with self._name_scope(name):
       return self._assert_positive_definite()
 
   def _assert_self_adjoint(self):
-    raise NotImplementedError("assert_self_adjoint is not implemented.")
+    dense = self._get_cached_dense_matrix()
+    logging.warn(
+        "Using (possibly slow) default implementation of assert_self_adjoint."
+        "  Requires conversion to a dense matrix.")
+    return check_ops.assert_equal(
+        dense,
+        linear_operator_util.matrix_adjoint(dense),
+        message="Matrix was not equal to its adjoint.")
 
   def assert_self_adjoint(self, name="assert_self_adjoint"):
-    """Returns an `Op` that asserts this operator is self-adjoint."""
+    """Returns an `Op` that asserts this operator is self-adjoint.
+
+    Here we check that this operator is *exactly* equal to its hermitian
+    transpose.
+
+    Args:
+      name:  A string name to prepend to created ops.
+
+    Returns:
+      An `Assert` `Op`, that, when run, will raise an `InvalidArgumentError` if
+        the operator is not self-adjoint.
+    """
     with self._name_scope(name):
       return self._assert_self_adjoint()
 
@@ -486,14 +573,29 @@ class LinearOperator(object):
     """Check that arg.dtype == self.dtype."""
     if arg.dtype != self.dtype:
       raise TypeError(
-          "Expected argument to have dtype %s.  Found: %s in tensor %s"
-          % (self.dtype, arg.dtype, arg))
+          "Expected argument to have dtype %s.  Found: %s in tensor %s" %
+          (self.dtype, arg.dtype, arg))
 
-  def _apply(self, x, adjoint=False, adjoint_arg=False):
-    raise NotImplementedError("_apply is not implemented.")
+  @abc.abstractmethod
+  def _matmul(self, x, adjoint=False, adjoint_arg=False):
+    raise NotImplementedError("_matmul is not implemented.")
 
-  def apply(self, x, adjoint=False, adjoint_arg=False, name="apply"):
-    """Transform `x` with left multiplication:  `x --> Ax`.
+  def matmul(self, x, adjoint=False, adjoint_arg=False, name="matmul"):
+    """Transform [batch] matrix `x` with left multiplication:  `x --> Ax`.
+
+    ```python
+    # Make an operator acting like batch matrix A.  Assume A.shape = [..., M, N]
+    operator = LinearOperator(...)
+    operator.shape = [..., M, N]
+
+    X = ... # shape [..., N, R], batch matrix, R > 0.
+
+    Y = operator.matmul(X)
+    Y.shape
+    ==> [..., M, R]
+
+    Y[..., :, r] = sum_j A[..., :, j] X[j, r]
+    ```
 
     Args:
       x: `Tensor` with compatible shape and same `dtype` as `self`.
@@ -514,10 +616,54 @@ class LinearOperator(object):
       arg_dim = -1 if adjoint_arg else -2
       self.shape[self_dim].assert_is_compatible_with(x.get_shape()[arg_dim])
 
-      return self._apply(x, adjoint=adjoint, adjoint_arg=adjoint_arg)
+      return self._matmul(x, adjoint=adjoint, adjoint_arg=adjoint_arg)
+
+  def _matvec(self, x, adjoint=False):
+    x_mat = array_ops.expand_dims(x, axis=-1)
+    y_mat = self.matmul(x_mat, adjoint=adjoint)
+    return array_ops.squeeze(y_mat, axis=-1)
+
+  def matvec(self, x, adjoint=False, name="matvec"):
+    """Transform [batch] vector `x` with left multiplication:  `x --> Ax`.
+
+    ```python
+    # Make an operator acting like batch matric A.  Assume A.shape = [..., M, N]
+    operator = LinearOperator(...)
+
+    X = ... # shape [..., N], batch vector
+
+    Y = operator.matvec(X)
+    Y.shape
+    ==> [..., M]
+
+    Y[..., :] = sum_j A[..., :, j] X[..., j]
+    ```
+
+    Args:
+      x: `Tensor` with compatible shape and same `dtype` as `self`.
+        `x` is treated as a [batch] vector meaning for every set of leading
+        dimensions, the last dimension defines a vector.
+        See class docstring for definition of compatibility.
+      adjoint: Python `bool`.  If `True`, left multiply by the adjoint: `A^H x`.
+      name:  A name for this `Op.
+
+    Returns:
+      A `Tensor` with shape `[..., M]` and same `dtype` as `self`.
+    """
+    with self._name_scope(name, values=[x]):
+      x = ops.convert_to_tensor(x, name="x")
+      self._check_input_dtype(x)
+      self_dim = -2 if adjoint else -1
+      self.shape[self_dim].assert_is_compatible_with(x.get_shape()[-1])
+      return self._matvec(x, adjoint=adjoint)
 
   def _determinant(self):
-    raise NotImplementedError("_det is not implemented.")
+    logging.warn(
+        "Using (possibly slow) default implementation of determinant."
+        "  Requires conversion to a dense matrix and O(N^3) operations.")
+    if self._can_use_cholesky():
+      return math_ops.exp(self.log_abs_determinant())
+    return linalg_ops.matrix_determinant(self._matrix)
 
   def determinant(self, name="det"):
     """Determinant for every batch member.
@@ -539,7 +685,14 @@ class LinearOperator(object):
       return self._determinant()
 
   def _log_abs_determinant(self):
-    raise NotImplementedError("_log_abs_det is not implemented.")
+    logging.warn(
+        "Using (possibly slow) default implementation of determinant."
+        "  Requires conversion to a dense matrix and O(N^3) operations.")
+    if self._can_use_cholesky():
+      diag = array_ops.matrix_diag_part(self._get_cached_chol())
+      return 2 * math_ops.reduce_sum(math_ops.log(diag), reduction_indices=[-1])
+    abs_det = math_ops.abs(self.determinant())
+    return math_ops.log(abs_det)
 
   def log_abs_determinant(self, name="log_abs_det"):
     """Log absolute value of determinant for every batch member.
@@ -561,33 +714,47 @@ class LinearOperator(object):
       return self._log_abs_determinant()
 
   def _solve(self, rhs, adjoint=False, adjoint_arg=False):
-    # Since this is an exact solve method for all rhs, this will only be
-    # available for non-singular (batch) operators, in particular the operator
-    # must be square.
-    raise NotImplementedError("_solve is not implemented.")
+    """Default implementation of _solve."""
+    if self.is_square is False:
+      raise NotImplementedError(
+          "Solve is not yet implemented for non-square operators.")
+    logging.warn(
+        "Using (possibly slow) default implementation of solve."
+        "  Requires conversion to a dense matrix and O(N^3) operations.")
+    rhs = linear_operator_util.matrix_adjoint(rhs) if adjoint_arg else rhs
+    if self._can_use_cholesky():
+      return linalg_ops.cholesky_solve(self._get_cached_chol(), rhs)
+    return linalg_ops.matrix_solve(
+        self._get_cached_dense_matrix(), rhs, adjoint=adjoint)
 
   def solve(self, rhs, adjoint=False, adjoint_arg=False, name="solve"):
-    """Solve `R` (batch) systems of equations exactly: `A X = rhs`.
+    """Solve (exact or approx) `R` (batch) systems of equations: `A X = rhs`.
+
+    The returned `Tensor` will be close to an exact solution if `A` is well
+    conditioned. Otherwise closeness will vary. See class docstring for details.
 
     Examples:
 
     ```python
-    # Create an operator acting like a 10 x 2 x 2 matrix.
+    # Make an operator acting like batch matrix A.  Assume A.shape = [..., M, N]
     operator = LinearOperator(...)
-    operator.shape # = 10 x 2 x 2
+    operator.shape = [..., M, N]
 
-    # Solve one linear system (R = 1) for every member of the length 10 batch.
-    RHS = ... # shape 10 x 2 x 1
-    X = operator.solve(RHS)  # shape 10 x 2 x 1
+    # Solve R > 0 linear systems for every member of the batch.
+    RHS = ... # shape [..., M, R]
 
-    # Solve five linear systems (R = 5) for every member of the length 10 batch.
-    RHS = ... # shape 10 x 2 x 5
     X = operator.solve(RHS)
-    X[3, :, 2]  # Solution to the linear system A[3, :, :] X = RHS[3, :, 2]
+    # X[..., :, r] is the solution to the r'th linear system
+    # sum_j A[..., :, j] X[..., j, r] = RHS[..., :, r]
+
+    operator.matmul(X)
+    ==> RHS
     ```
 
     Args:
       rhs: `Tensor` with same `dtype` as this operator and compatible shape.
+        `rhs` is treated like a [batch] matrix meaning for every set of leading
+        dimensions, the last two dimensions defines a matrix.
         See class docstring for definition of compatibility.
       adjoint: Python `bool`.  If `True`, solve the system involving the adjoint
         of this `LinearOperator`:  `A^H X = rhs`.
@@ -619,8 +786,63 @@ class LinearOperator(object):
 
       return self._solve(rhs, adjoint=adjoint, adjoint_arg=adjoint_arg)
 
+  def _solvevec(self, rhs, adjoint=False):
+    """Default implementation of _solvevec."""
+    rhs_mat = array_ops.expand_dims(rhs, axis=-1)
+    solution_mat = self.solve(rhs_mat, adjoint=adjoint)
+    return array_ops.squeeze(solution_mat, axis=-1)
+
+  def solvevec(self, rhs, adjoint=False, name="solve"):
+    """Solve single equation with best effort: `A X = rhs`.
+
+    The returned `Tensor` will be close to an exact solution if `A` is well
+    conditioned. Otherwise closeness will vary. See class docstring for details.
+
+    Examples:
+
+    ```python
+    # Make an operator acting like batch matrix A.  Assume A.shape = [..., M, N]
+    operator = LinearOperator(...)
+    operator.shape = [..., M, N]
+
+    # Solve one linear system for every member of the batch.
+    RHS = ... # shape [..., M]
+
+    X = operator.solvevec(RHS)
+    # X is the solution to the linear system
+    # sum_j A[..., :, j] X[..., j] = RHS[..., :]
+
+    operator.matvec(X)
+    ==> RHS
+    ```
+
+    Args:
+      rhs: `Tensor` with same `dtype` as this operator.
+        `rhs` is treated like a [batch] vector meaning for every set of leading
+        dimensions, the last dimension defines a vector.  See class docstring
+        for definition of compatibility regarding batch dimensions.
+      adjoint: Python `bool`.  If `True`, solve the system involving the adjoint
+        of this `LinearOperator`:  `A^H X = rhs`.
+      name:  A name scope to use for ops added by this method.
+
+    Returns:
+      `Tensor` with shape `[...,N]` and same `dtype` as `rhs`.
+
+    Raises:
+      NotImplementedError:  If `self.is_non_singular` or `is_square` is False.
+    """
+    with self._name_scope(name, values=[rhs]):
+      rhs = ops.convert_to_tensor(rhs, name="rhs")
+      self._check_input_dtype(rhs)
+      self_dim = -1 if adjoint else -2
+      self.shape[self_dim].assert_is_compatible_with(rhs.get_shape()[-1])
+
+      return self._solvevec(rhs, adjoint=adjoint)
+
   def _to_dense(self):
     """Generic and often inefficient implementation.  Override often."""
+    logging.warn("Using (possibly slow) default implementation of to_dense."
+                 "  Converts by self.matmul(identity).")
     if self.batch_shape.is_fully_defined():
       batch_shape = self.batch_shape
     else:
@@ -632,7 +854,7 @@ class LinearOperator(object):
       n = self.domain_dimension_tensor()
 
     eye = linalg_ops.eye(num_rows=n, batch_shape=batch_shape, dtype=self.dtype)
-    return self.apply(eye)
+    return self.matmul(eye)
 
   def to_dense(self, name="to_dense"):
     """Return a dense (batch) matrix representing this operator."""
@@ -641,7 +863,7 @@ class LinearOperator(object):
 
   def _diag_part(self):
     """Generic and often inefficient implementation.  Override often."""
-    return array_ops.matrix_diag_part(self.to_dense())
+    return array_ops.matrix_diag_part(self._get_cached_dense_matrix())
 
   def diag_part(self, name="diag_part"):
     """Efficiently get the [batch] diagonal part of this operator.
@@ -673,7 +895,7 @@ class LinearOperator(object):
 
   def _add_to_tensor(self, x):
     # Override if a more efficient implementation is available.
-    return self.to_dense() + x
+    return self._get_cached_dense_matrix() + x
 
   def add_to_tensor(self, x, name="add_to_tensor"):
     """Add matrix represented by this operator to `x`.  Equivalent to `A + x`.
@@ -689,3 +911,18 @@ class LinearOperator(object):
       x = ops.convert_to_tensor(x, name="x")
       self._check_input_dtype(x)
       return self._add_to_tensor(x)
+
+  def _can_use_cholesky(self):
+    # TODO(langmore) Add complex types when tf.cholesky can use them.
+    return (not self.dtype.is_complex and self.is_self_adjoint and
+            self.is_positive_definite)
+
+  def _get_cached_dense_matrix(self):
+    if not hasattr(self, "_cached_dense_matrix"):
+      self._cached_dense_matrix = self.to_dense()
+    return self._cached_dense_matrix
+
+  def _get_cached_chol(self):
+    if not hasattr(self, "_cached_chol"):
+      self._cached_chol = linalg_ops.cholesky(self._get_cached_dense_matrix())
+    return self._cached_chol
diff --git a/tensorflow/contrib/linalg/python/ops/linear_operator_addition.py b/tensorflow/contrib/linalg/python/ops/linear_operator_addition.py
index 7617e1b591d8f6dff3513a226e1faacb6dafe8d4..16c4c6e6d67f17d1674b8d1d39f006bc688bc6ce 100644
--- a/tensorflow/contrib/linalg/python/ops/linear_operator_addition.py
+++ b/tensorflow/contrib/linalg/python/ops/linear_operator_addition.py
@@ -43,7 +43,7 @@ def add_operators(operators,
   Given operators `[A1, A2,...]`, this `Op` returns a possibly shorter list of
   operators `[B1, B2,...]` such that
 
-  ```sum_k Ak.apply(x) = sum_k Bk.apply(x).```
+  ```sum_k Ak.matmul(x) = sum_k Bk.matmul(x).```
 
   The operators `Bk` result by adding some of the `Ak`, as allowed by
   `addition_tiers`.
diff --git a/tensorflow/contrib/linalg/python/ops/linear_operator_composition.py b/tensorflow/contrib/linalg/python/ops/linear_operator_composition.py
index b1557769b222cf5b1d4ce11210b2fb4ddebecacb..9dec621ab29d0bd19aa9cdbe1393755d68366b38 100644
--- a/tensorflow/contrib/linalg/python/ops/linear_operator_composition.py
+++ b/tensorflow/contrib/linalg/python/ops/linear_operator_composition.py
@@ -63,11 +63,11 @@ class LinearOperatorComposition(linear_operator.LinearOperator):
   operator.shape
   ==> [2, 2]
 
-  operator.log_determinant()
+  operator.log_abs_determinant()
   ==> scalar Tensor
 
   x = ... Shape [2, 4] Tensor
-  operator.apply(x)
+  operator.matmul(x)
   ==> Shape [2, 4] Tensor
 
   # Create a [2, 3] batch of 4 x 5 linear operators.
@@ -83,7 +83,7 @@ class LinearOperatorComposition(linear_operator.LinearOperator):
 
   # Create a shape [2, 3, 6, 2] vector.
   x = tf.random_normal(shape=[2, 3, 6, 2])
-  operator.apply(x)
+  operator.matmul(x)
   ==> Shape [2, 3, 4, 2] Tensor
   ```
 
@@ -96,7 +96,7 @@ class LinearOperatorComposition(linear_operator.LinearOperator):
   #### Matrix property hints
 
   This `LinearOperator` is initialized with boolean flags of the form `is_X`,
-  for `X = non_singular, self_adjoint, positive_definite`.
+  for `X = non_singular, self_adjoint, positive_definite, square`.
   These have the following meaning
   * If `is_X == True`, callers should expect the operator to have the
     property `X`.  This is a promise that should be fulfilled, but is *not* a
@@ -112,12 +112,13 @@ class LinearOperatorComposition(linear_operator.LinearOperator):
                is_non_singular=None,
                is_self_adjoint=None,
                is_positive_definite=None,
+               is_square=None,
                name=None):
     r"""Initialize a `LinearOperatorComposition`.
 
     `LinearOperatorComposition` is initialized with a list of operators
-    `[op_1,...,op_J]`.  For the `apply` method to be well defined, the
-    composition `op_i.apply(op_{i+1}(x))` must be defined.  Other methods have
+    `[op_1,...,op_J]`.  For the `matmul` method to be well defined, the
+    composition `op_i.matmul(op_{i+1}(x))` must be defined.  Other methods have
     similar constraints.
 
     Args:
@@ -132,6 +133,7 @@ class LinearOperatorComposition(linear_operator.LinearOperator):
         self-adjoint to be positive-definite.  See:
         https://en.wikipedia.org/wiki/Positive-definite_matrix\
             #Extension_for_non_symmetric_matrices
+      is_square:  Expect that this operator acts like square [batch] matrices.
       name: A name for this `LinearOperator`.  Default is the individual
         operators names joined with `_o_`.
 
@@ -177,6 +179,7 @@ class LinearOperatorComposition(linear_operator.LinearOperator):
           is_non_singular=is_non_singular,
           is_self_adjoint=is_self_adjoint,
           is_positive_definite=is_positive_definite,
+          is_square=is_square,
           name=name)
 
   @property
@@ -225,19 +228,19 @@ class LinearOperatorComposition(linear_operator.LinearOperator):
 
     return array_ops.concat((batch_shape, matrix_shape), 0)
 
-  def _apply(self, x, adjoint=False, adjoint_arg=False):
+  def _matmul(self, x, adjoint=False, adjoint_arg=False):
     # If self.operators = [A, B], and not adjoint, then
-    # apply_order_list = [B, A].
-    # As a result, we return A.apply(B.apply(x))
+    # matmul_order_list = [B, A].
+    # As a result, we return A.matmul(B.matmul(x))
     if adjoint:
-      apply_order_list = self.operators
+      matmul_order_list = self.operators
     else:
-      apply_order_list = list(reversed(self.operators))
+      matmul_order_list = list(reversed(self.operators))
 
-    result = apply_order_list[0].apply(
+    result = matmul_order_list[0].matmul(
         x, adjoint=adjoint, adjoint_arg=adjoint_arg)
-    for operator in apply_order_list[1:]:
-      result = operator.apply(result, adjoint=adjoint)
+    for operator in matmul_order_list[1:]:
+      result = operator.matmul(result, adjoint=adjoint)
     return result
 
   def _determinant(self):
diff --git a/tensorflow/contrib/linalg/python/ops/linear_operator_diag.py b/tensorflow/contrib/linalg/python/ops/linear_operator_diag.py
index 97e52d08a436e86941371c94d6a91d0d011f8483..56bc967706a9f2b15aabead4d6864d02e3e5ed08 100644
--- a/tensorflow/contrib/linalg/python/ops/linear_operator_diag.py
+++ b/tensorflow/contrib/linalg/python/ops/linear_operator_diag.py
@@ -52,11 +52,11 @@ class LinearOperatorDiag(linear_operator.LinearOperator):
   operator.shape
   ==> [2, 2]
 
-  operator.log_determinant()
+  operator.log_abs_determinant()
   ==> scalar Tensor
 
   x = ... Shape [2, 4] Tensor
-  operator.apply(x)
+  operator.matmul(x)
   ==> Shape [2, 4] Tensor
 
   # Create a [2, 3] batch of 4 x 4 linear operators.
@@ -68,13 +68,13 @@ class LinearOperatorDiag(linear_operator.LinearOperator):
   # operator.batch_shape = [2, 3].
   y = tf.random_normal(shape=[2, 1, 4, 2])
   x = operator.solve(y)
-  ==> operator.apply(x) = y
+  ==> operator.matmul(x) = y
   ```
 
   #### Shape compatibility
 
   This operator acts on [batch] matrix with compatible shape.
-  `x` is a batch matrix with compatible shape for `apply` and `solve` if
+  `x` is a batch matrix with compatible shape for `matmul` and `solve` if
 
   ```
   operator.shape = [B1,...,Bb] + [N, N],  with b >= 0
@@ -87,7 +87,7 @@ class LinearOperatorDiag(linear_operator.LinearOperator):
   Suppose `operator` is a `LinearOperatorDiag` of shape `[N, N]`,
   and `x.shape = [N, R]`.  Then
 
-  * `operator.apply(x)` involves `N * R` multiplications.
+  * `operator.matmul(x)` involves `N * R` multiplications.
   * `operator.solve(x)` involves `N` divisions and `N * R` multiplications.
   * `operator.determinant()` involves a size `N` `reduce_prod`.
 
@@ -97,7 +97,7 @@ class LinearOperatorDiag(linear_operator.LinearOperator):
   #### Matrix property hints
 
   This `LinearOperator` is initialized with boolean flags of the form `is_X`,
-  for `X = non_singular, self_adjoint, positive_definite`.
+  for `X = non_singular, self_adjoint, positive_definite, square`.
   These have the following meaning
   * If `is_X == True`, callers should expect the operator to have the
     property `X`.  This is a promise that should be fulfilled, but is *not* a
@@ -113,6 +113,7 @@ class LinearOperatorDiag(linear_operator.LinearOperator):
                is_non_singular=None,
                is_self_adjoint=None,
                is_positive_definite=None,
+               is_square=None,
                name="LinearOperatorDiag"):
     r"""Initialize a `LinearOperatorDiag`.
 
@@ -129,6 +130,7 @@ class LinearOperatorDiag(linear_operator.LinearOperator):
         self-adjoint to be positive-definite.  See:
         https://en.wikipedia.org/wiki/Positive-definite_matrix\
             #Extension_for_non_symmetric_matrices
+      is_square:  Expect that this operator acts like square [batch] matrices.
       name: A name for this `LinearOperator`.
 
     Raises:
@@ -147,12 +149,17 @@ class LinearOperatorDiag(linear_operator.LinearOperator):
         else:
           is_self_adjoint = True
 
+      if is_square is False:
+        raise ValueError("Only square diagonal operators currently supported.")
+      is_square = True
+
       super(LinearOperatorDiag, self).__init__(
           dtype=self._diag.dtype,
           graph_parents=[self._diag],
           is_non_singular=is_non_singular,
           is_self_adjoint=is_self_adjoint,
           is_positive_definite=is_positive_definite,
+          is_square=is_square,
           name=name)
 
   def _check_diag(self, diag):
@@ -206,7 +213,7 @@ class LinearOperatorDiag(linear_operator.LinearOperator):
             "This diagonal operator contained non-zero imaginary values.  "
             " Thus it was not self-adjoint."))
 
-  def _apply(self, x, adjoint=False, adjoint_arg=False):
+  def _matmul(self, x, adjoint=False, adjoint_arg=False):
     diag_term = math_ops.conj(self._diag) if adjoint else self._diag
     x = linear_operator_util.matrix_adjoint(x) if adjoint_arg else x
     diag_mat = array_ops.expand_dims(diag_term, -1)
diff --git a/tensorflow/contrib/linalg/python/ops/linear_operator_full_matrix.py b/tensorflow/contrib/linalg/python/ops/linear_operator_full_matrix.py
index 64ab561457789776003ce56038c47ca32dacffdd..67889511cbffcbec934855d67914e40b157bdc91 100644
--- a/tensorflow/contrib/linalg/python/ops/linear_operator_full_matrix.py
+++ b/tensorflow/contrib/linalg/python/ops/linear_operator_full_matrix.py
@@ -19,11 +19,9 @@ from __future__ import division
 from __future__ import print_function
 
 from tensorflow.contrib.linalg.python.ops import linear_operator
-from tensorflow.contrib.linalg.python.ops import linear_operator_util
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import linalg_ops
 from tensorflow.python.ops import math_ops
 
 __all__ = ["LinearOperatorFullMatrix"]
@@ -49,11 +47,11 @@ class LinearOperatorFullMatrix(linear_operator.LinearOperator):
   operator.shape
   ==> [2, 2]
 
-  operator.log_determinant()
+  operator.log_abs_determinant()
   ==> scalar Tensor
 
   x = ... Shape [2, 4] Tensor
-  operator.apply(x)
+  operator.matmul(x)
   ==> Shape [2, 4] Tensor
 
   # Create a [2, 3] batch of 4 x 4 linear operators.
@@ -64,7 +62,7 @@ class LinearOperatorFullMatrix(linear_operator.LinearOperator):
   #### Shape compatibility
 
   This operator acts on [batch] matrix with compatible shape.
-  `x` is a batch matrix with compatible shape for `apply` and `solve` if
+  `x` is a batch matrix with compatible shape for `matmul` and `solve` if
 
   ```
   operator.shape = [B1,...,Bb] + [M, N],  with b >= 0
@@ -83,7 +81,7 @@ class LinearOperatorFullMatrix(linear_operator.LinearOperator):
   In all cases, suppose `operator` is a `LinearOperatorFullMatrix` of shape
   `[M, N]`, and `x.shape = [N, R]`.  Then
 
-  * `operator.apply(x)` is `O(M * N * R)`.
+  * `operator.matmul(x)` is `O(M * N * R)`.
   * If `M=N`, `operator.solve(x)` is `O(N^3 * R)`.
   * If `M=N`, `operator.determinant()` is `O(N^3)`.
 
@@ -93,7 +91,7 @@ class LinearOperatorFullMatrix(linear_operator.LinearOperator):
   #### Matrix property hints
 
   This `LinearOperator` is initialized with boolean flags of the form `is_X`,
-  for `X = non_singular, self_adjoint, positive_definite`.
+  for `X = non_singular, self_adjoint, positive_definite, square`.
   These have the following meaning
   * If `is_X == True`, callers should expect the operator to have the
     property `X`.  This is a promise that should be fulfilled, but is *not* a
@@ -109,6 +107,7 @@ class LinearOperatorFullMatrix(linear_operator.LinearOperator):
                is_non_singular=None,
                is_self_adjoint=None,
                is_positive_definite=None,
+               is_square=None,
                name="LinearOperatorFullMatrix"):
     r"""Initialize a `LinearOperatorFullMatrix`.
 
@@ -124,6 +123,7 @@ class LinearOperatorFullMatrix(linear_operator.LinearOperator):
         self-adjoint to be positive-definite.  See:
         https://en.wikipedia.org/wiki/Positive-definite_matrix\
             #Extension_for_non_symmetric_matrices
+      is_square:  Expect that this operator acts like square [batch] matrices.
       name: A name for this `LinearOperator`.
 
     Raises:
@@ -134,19 +134,13 @@ class LinearOperatorFullMatrix(linear_operator.LinearOperator):
       self._matrix = ops.convert_to_tensor(matrix, name="matrix")
       self._check_matrix(self._matrix)
 
-      # Special treatment for (real) Symmetric Positive Definite.
-      self._is_spd = (
-          (not self._matrix.dtype.is_complex)
-          and is_self_adjoint and is_positive_definite)
-      if self._is_spd:
-        self._chol = linalg_ops.cholesky(self._matrix)
-
       super(LinearOperatorFullMatrix, self).__init__(
           dtype=self._matrix.dtype,
           graph_parents=[self._matrix],
           is_non_singular=is_non_singular,
           is_self_adjoint=is_self_adjoint,
           is_positive_definite=is_positive_definite,
+          is_square=is_square,
           name=name)
 
   def _check_matrix(self, matrix):
@@ -173,27 +167,9 @@ class LinearOperatorFullMatrix(linear_operator.LinearOperator):
   def _shape_tensor(self):
     return array_ops.shape(self._matrix)
 
-  def _apply(self, x, adjoint=False, adjoint_arg=False):
+  def _matmul(self, x, adjoint=False, adjoint_arg=False):
     return math_ops.matmul(
         self._matrix, x, adjoint_a=adjoint, adjoint_b=adjoint_arg)
 
-  def _determinant(self):
-    if self._is_spd:
-      return math_ops.exp(self.log_abs_determinant())
-    return linalg_ops.matrix_determinant(self._matrix)
-
-  def _log_abs_determinant(self):
-    if self._is_spd:
-      diag = array_ops.matrix_diag_part(self._chol)
-      return 2 * math_ops.reduce_sum(math_ops.log(diag), reduction_indices=[-1])
-    abs_det = math_ops.abs(self.determinant())
-    return math_ops.log(abs_det)
-
-  def _solve(self, rhs, adjoint=False, adjoint_arg=False):
-    rhs = linear_operator_util.matrix_adjoint(rhs) if adjoint_arg else rhs
-    if self._is_spd:
-      return linalg_ops.cholesky_solve(self._chol, rhs)
-    return linalg_ops.matrix_solve(self._matrix, rhs, adjoint=adjoint)
-
   def _to_dense(self):
     return self._matrix
diff --git a/tensorflow/contrib/linalg/python/ops/linear_operator_identity.py b/tensorflow/contrib/linalg/python/ops/linear_operator_identity.py
index 845bf25192eafdb142e23d709e6737f5d790adeb..acba1c7035d738d878d801463b857104b98cfc83 100644
--- a/tensorflow/contrib/linalg/python/ops/linear_operator_identity.py
+++ b/tensorflow/contrib/linalg/python/ops/linear_operator_identity.py
@@ -112,11 +112,11 @@ class LinearOperatorIdentity(BaseLinearOperatorIdentity):
   operator.shape
   ==> [2, 2]
 
-  operator.log_determinant()
+  operator.log_abs_determinant()
   ==> 0.
 
   x = ... Shape [2, 4] Tensor
-  operator.apply(x)
+  operator.matmul(x)
   ==> Shape [2, 4] Tensor, same as x.
 
   y = tf.random_normal(shape=[3, 2, 4])
@@ -141,20 +141,20 @@ class LinearOperatorIdentity(BaseLinearOperatorIdentity):
   # to detect that no broadcast is necessary because both x and the operator
   # have statically defined shape.
   x = ... Shape [2, 2, 3]
-  operator.apply(x)
+  operator.matmul(x)
   ==> Shape [2, 2, 3] Tensor, same as x
 
   # Here the operator and x have different batch_shape, and are broadcast.
   # This requires a copy, since the output is different size than the input.
   x = ... Shape [1, 2, 3]
-  operator.apply(x)
+  operator.matmul(x)
   ==> Shape [2, 2, 3] Tensor, equal to [x, x]
   ```
 
   ### Shape compatibility
 
   This operator acts on [batch] matrix with compatible shape.
-  `x` is a batch matrix with compatible shape for `apply` and `solve` if
+  `x` is a batch matrix with compatible shape for `matmul` and `solve` if
 
   ```
   operator.shape = [B1,...,Bb] + [N, N],  with b >= 0
@@ -166,21 +166,21 @@ class LinearOperatorIdentity(BaseLinearOperatorIdentity):
 
   If `batch_shape` initialization arg is `None`:
 
-  * `operator.apply(x)` is `O(1)`
+  * `operator.matmul(x)` is `O(1)`
   * `operator.solve(x)` is `O(1)`
   * `operator.determinant()` is `O(1)`
 
   If `batch_shape` initialization arg is provided, and static checks cannot
   rule out the need to broadcast:
 
-  * `operator.apply(x)` is `O(D1*...*Dd*N*R)`
+  * `operator.matmul(x)` is `O(D1*...*Dd*N*R)`
   * `operator.solve(x)` is `O(D1*...*Dd*N*R)`
   * `operator.determinant()` is `O(B1*...*Bb)`
 
   #### Matrix property hints
 
   This `LinearOperator` is initialized with boolean flags of the form `is_X`,
-  for `X = non_singular, self_adjoint, positive_definite`.
+  for `X = non_singular, self_adjoint, positive_definite, square`.
   These have the following meaning
   * If `is_X == True`, callers should expect the operator to have the
     property `X`.  This is a promise that should be fulfilled, but is *not* a
@@ -198,6 +198,7 @@ class LinearOperatorIdentity(BaseLinearOperatorIdentity):
                is_non_singular=True,
                is_self_adjoint=True,
                is_positive_definite=True,
+               is_square=True,
                assert_proper_shapes=False,
                name="LinearOperatorIdentity"):
     r"""Initialize a `LinearOperatorIdentity`.
@@ -224,6 +225,7 @@ class LinearOperatorIdentity(BaseLinearOperatorIdentity):
         self-adjoint to be positive-definite.  See:
         https://en.wikipedia.org/wiki/Positive-definite_matrix\
             #Extension_for_non_symmetric_matrices
+      is_square:  Expect that this operator acts like square [batch] matrices.
       assert_proper_shapes:  Python `bool`.  If `False`, only perform static
         checks that initialization and method arguments have proper shape.
         If `True`, and static checks are inconclusive, add asserts to the graph.
@@ -248,12 +250,15 @@ class LinearOperatorIdentity(BaseLinearOperatorIdentity):
         raise ValueError("An identity operator is always non-singular.")
       if not is_positive_definite:
         raise ValueError("An identity operator is always positive-definite.")
+      if not is_square:
+        raise ValueError("An identity operator is always square.")
 
       super(LinearOperatorIdentity, self).__init__(
           dtype=dtype,
           is_non_singular=is_non_singular,
           is_self_adjoint=is_self_adjoint,
           is_positive_definite=is_positive_definite,
+          is_square=is_square,
           name=name)
 
       self._num_rows = linear_operator_util.shape_tensor(
@@ -329,7 +334,7 @@ class LinearOperatorIdentity(BaseLinearOperatorIdentity):
     zeros = array_ops.zeros(shape=special_shape, dtype=self.dtype)
     return x + zeros
 
-  def _apply(self, x, adjoint=False, adjoint_arg=False):
+  def _matmul(self, x, adjoint=False, adjoint_arg=False):
     # Note that adjoint has no effect since this matrix is self-adjoint.
     x = linear_operator_util.matrix_adjoint(x) if adjoint_arg else x
     if self._assert_proper_shapes:
@@ -345,7 +350,7 @@ class LinearOperatorIdentity(BaseLinearOperatorIdentity):
     return array_ops.zeros(shape=self.batch_shape_tensor(), dtype=self.dtype)
 
   def _solve(self, rhs, adjoint=False, adjoint_arg=False):
-    return self._apply(rhs, adjoint_arg=adjoint_arg)
+    return self._matmul(rhs, adjoint_arg=adjoint_arg)
 
   def _diag_part(self):
     return self._ones_diag()
@@ -459,11 +464,11 @@ class LinearOperatorScaledIdentity(BaseLinearOperatorIdentity):
   operator.shape
   ==> [2, 2]
 
-  operator.log_determinant()
+  operator.log_abs_determinant()
   ==> 2 * Log[3]
 
   x = ... Shape [2, 4] Tensor
-  operator.apply(x)
+  operator.matmul(x)
   ==> 3 * x
 
   y = tf.random_normal(shape=[3, 2, 4])
@@ -481,19 +486,19 @@ class LinearOperatorScaledIdentity(BaseLinearOperatorIdentity):
         [0., 5.]]]
 
   x = ... Shape [2, 2, 3]
-  operator.apply(x)
+  operator.matmul(x)
   ==> 5 * x
 
   # Here the operator and x have different batch_shape, and are broadcast.
   x = ... Shape [1, 2, 3]
-  operator.apply(x)
+  operator.matmul(x)
   ==> 5 * x
   ```
 
   ### Shape compatibility
 
   This operator acts on [batch] matrix with compatible shape.
-  `x` is a batch matrix with compatible shape for `apply` and `solve` if
+  `x` is a batch matrix with compatible shape for `matmul` and `solve` if
 
   ```
   operator.shape = [B1,...,Bb] + [N, N],  with b >= 0
@@ -503,14 +508,14 @@ class LinearOperatorScaledIdentity(BaseLinearOperatorIdentity):
 
   ### Performance
 
-  * `operator.apply(x)` is `O(D1*...*Dd*N*R)`
+  * `operator.matmul(x)` is `O(D1*...*Dd*N*R)`
   * `operator.solve(x)` is `O(D1*...*Dd*N*R)`
   * `operator.determinant()` is `O(D1*...*Dd)`
 
   #### Matrix property hints
 
   This `LinearOperator` is initialized with boolean flags of the form `is_X`,
-  for `X = non_singular, self_adjoint, positive_definite`.
+  for `X = non_singular, self_adjoint, positive_definite, square`.
   These have the following meaning
   * If `is_X == True`, callers should expect the operator to have the
     property `X`.  This is a promise that should be fulfilled, but is *not* a
@@ -527,6 +532,7 @@ class LinearOperatorScaledIdentity(BaseLinearOperatorIdentity):
                is_non_singular=None,
                is_self_adjoint=None,
                is_positive_definite=None,
+               is_square=True,
                assert_proper_shapes=False,
                name="LinearOperatorScaledIdentity"):
     r"""Initialize a `LinearOperatorScaledIdentity`.
@@ -550,6 +556,7 @@ class LinearOperatorScaledIdentity(BaseLinearOperatorIdentity):
         self-adjoint to be positive-definite.  See:
         https://en.wikipedia.org/wiki/Positive-definite_matrix\
             #Extension_for_non_symmetric_matrices
+      is_square:  Expect that this operator acts like square [batch] matrices.
       assert_proper_shapes:  Python `bool`.  If `False`, only perform static
         checks that initialization and method arguments have proper shape.
         If `True`, and static checks are inconclusive, add asserts to the graph.
@@ -561,6 +568,9 @@ class LinearOperatorScaledIdentity(BaseLinearOperatorIdentity):
     """
     self._assert_proper_shapes = assert_proper_shapes
 
+    if not is_square:
+      raise ValueError("A ScaledIdentity operator is always square.")
+
     with ops.name_scope(name, values=[multiplier, num_rows]):
       self._multiplier = ops.convert_to_tensor(multiplier, name="multiplier")
 
@@ -569,6 +579,7 @@ class LinearOperatorScaledIdentity(BaseLinearOperatorIdentity):
           is_non_singular=is_non_singular,
           is_self_adjoint=is_self_adjoint,
           is_positive_definite=is_positive_definite,
+          is_square=is_square,
           name=name)
 
       # Shape [B1,...Bb, 1, 1]
@@ -617,7 +628,7 @@ class LinearOperatorScaledIdentity(BaseLinearOperatorIdentity):
         imag_multiplier,
         message="LinearOperator was not self-adjoint")
 
-  def _apply(self, x, adjoint=False, adjoint_arg=False):
+  def _matmul(self, x, adjoint=False, adjoint_arg=False):
     x = linear_operator_util.matrix_adjoint(x) if adjoint_arg else x
     if adjoint:
       matrix = self._multiplier_matrix_conj
diff --git a/tensorflow/contrib/linalg/python/ops/linear_operator_test_util.py b/tensorflow/contrib/linalg/python/ops/linear_operator_test_util.py
index c8bc62eeef9d492bf1046e1d799e74864e98b157..b2d7b10157b02ff2814de12459b1e417c22128b5 100644
--- a/tensorflow/contrib/linalg/python/ops/linear_operator_test_util.py
+++ b/tensorflow/contrib/linalg/python/ops/linear_operator_test_util.py
@@ -116,7 +116,7 @@ class LinearOperatorDerivedClassTest(test.TestCase):
 
   @abc.abstractmethod
   def _make_x(self, operator, adjoint):
-    """Make an 'x' appropriate for calling operator.apply(x).
+    """Make an 'x' appropriate for calling operator.matmul(x).
 
     Args:
       operator:  A `LinearOperator`
@@ -208,8 +208,8 @@ class LinearOperatorDerivedClassTest(test.TestCase):
                 feed_dict=feed_dict)
             self.assertAC(op_log_abs_det_v, mat_log_abs_det_v)
 
-  def test_apply(self):
-    self._skip_if_tests_to_skip_contains("apply")
+  def test_matmul(self):
+    self._skip_if_tests_to_skip_contains("matmul")
     for use_placeholder in False, True:
       for shape in self._shapes_to_test:
         for dtype in self._dtypes_to_test:
@@ -222,18 +222,18 @@ class LinearOperatorDerivedClassTest(test.TestCase):
                 x = self._make_x(operator, adjoint=adjoint)
                 # If adjoint_arg, compute A X^H^H = A X.
                 if adjoint_arg:
-                  op_apply = operator.apply(
+                  op_matmul = operator.matmul(
                       linear_operator_util.matrix_adjoint(x),
                       adjoint=adjoint, adjoint_arg=adjoint_arg)
                 else:
-                  op_apply = operator.apply(x, adjoint=adjoint)
-                mat_apply = math_ops.matmul(mat, x, adjoint_a=adjoint)
+                  op_matmul = operator.matmul(x, adjoint=adjoint)
+                mat_matmul = math_ops.matmul(mat, x, adjoint_a=adjoint)
                 if not use_placeholder:
                   self.assertAllEqual(
-                      op_apply.get_shape(), mat_apply.get_shape())
-                op_apply_v, mat_apply_v = sess.run([op_apply, mat_apply],
-                                                   feed_dict=feed_dict)
-                self.assertAC(op_apply_v, mat_apply_v)
+                      op_matmul.get_shape(), mat_matmul.get_shape())
+                op_matmul_v, mat_matmul_v = sess.run(
+                    [op_matmul, mat_matmul], feed_dict=feed_dict)
+                self.assertAC(op_matmul_v, mat_matmul_v)
 
   def test_solve(self):
     self._skip_if_tests_to_skip_contains("solve")
@@ -376,7 +376,7 @@ class NonSquareLinearOperatorDerivedClassTest(LinearOperatorDerivedClassTest):
         "_make_rhs not implemented because we don't test solve")
 
   def _make_x(self, operator, adjoint):
-    # Return the number of systems for the argument 'x' for .apply(x)
+    # Return the number of systems for the argument 'x' for .matmul(x)
     r = self._get_num_systems(operator)
     # If operator.shape = [B1,...,Bb, M, N] this returns a random matrix of
     # shape [B1,...,Bb, N, R], R = 1 or 2.
diff --git a/tensorflow/contrib/linalg/python/ops/linear_operator_tril.py b/tensorflow/contrib/linalg/python/ops/linear_operator_tril.py
index 756e26cc130d4d6c4add5d580061be15b277ba6c..8a152a9b475f4e3fdfd8e3045ab1028eb467997b 100644
--- a/tensorflow/contrib/linalg/python/ops/linear_operator_tril.py
+++ b/tensorflow/contrib/linalg/python/ops/linear_operator_tril.py
@@ -53,11 +53,11 @@ class LinearOperatorTriL(linear_operator.LinearOperator):
   operator.shape
   ==> [2, 2]
 
-  operator.log_determinant()
+  operator.log_abs_determinant()
   ==> scalar Tensor
 
   x = ... Shape [2, 4] Tensor
-  operator.apply(x)
+  operator.matmul(x)
   ==> Shape [2, 4] Tensor
 
   # Create a [2, 3] batch of 4 x 4 linear operators.
@@ -68,7 +68,7 @@ class LinearOperatorTriL(linear_operator.LinearOperator):
   #### Shape compatibility
 
   This operator acts on [batch] matrix with compatible shape.
-  `x` is a batch matrix with compatible shape for `apply` and `solve` if
+  `x` is a batch matrix with compatible shape for `matmul` and `solve` if
 
   ```
   operator.shape = [B1,...,Bb] + [N, N],  with b >= 0
@@ -80,7 +80,7 @@ class LinearOperatorTriL(linear_operator.LinearOperator):
   Suppose `operator` is a `LinearOperatorTriL` of shape `[N, N]`,
   and `x.shape = [N, R]`.  Then
 
-  * `operator.apply(x)` involves `N^2 * R` multiplications.
+  * `operator.matmul(x)` involves `N^2 * R` multiplications.
   * `operator.solve(x)` involves `N * R` size `N` back-substitutions.
   * `operator.determinant()` involves a size `N` `reduce_prod`.
 
@@ -90,7 +90,7 @@ class LinearOperatorTriL(linear_operator.LinearOperator):
   #### Matrix property hints
 
   This `LinearOperator` is initialized with boolean flags of the form `is_X`,
-  for `X = non_singular, self_adjoint, positive_definite`.
+  for `X = non_singular, self_adjoint, positive_definite, square`.
   These have the following meaning
   * If `is_X == True`, callers should expect the operator to have the
     property `X`.  This is a promise that should be fulfilled, but is *not* a
@@ -106,6 +106,7 @@ class LinearOperatorTriL(linear_operator.LinearOperator):
                is_non_singular=None,
                is_self_adjoint=None,
                is_positive_definite=None,
+               is_square=None,
                name="LinearOperatorTriL"):
     r"""Initialize a `LinearOperatorTriL`.
 
@@ -126,12 +127,19 @@ class LinearOperatorTriL(linear_operator.LinearOperator):
         self-adjoint to be positive-definite.  See:
         https://en.wikipedia.org/wiki/Positive-definite_matrix\
             #Extension_for_non_symmetric_matrices
+      is_square:  Expect that this operator acts like square [batch] matrices.
       name: A name for this `LinearOperator`.
 
     Raises:
       TypeError:  If `diag.dtype` is not an allowed type.
+      ValueError:  If `is_square` is `False`.
     """
 
+    if is_square is False:
+      raise ValueError(
+          "Only square lower triangular operators supported at this time.")
+    is_square = True
+
     with ops.name_scope(name, values=[tril]):
       self._tril = ops.convert_to_tensor(tril, name="tril")
       self._check_tril(self._tril)
@@ -144,6 +152,7 @@ class LinearOperatorTriL(linear_operator.LinearOperator):
           is_non_singular=is_non_singular,
           is_self_adjoint=is_self_adjoint,
           is_positive_definite=is_positive_definite,
+          is_square=is_square,
           name=name)
 
   def _check_tril(self, tril):
@@ -173,7 +182,7 @@ class LinearOperatorTriL(linear_operator.LinearOperator):
         self._diag,
         message="Singular operator:  Diagonal contained zero values.")
 
-  def _apply(self, x, adjoint=False, adjoint_arg=False):
+  def _matmul(self, x, adjoint=False, adjoint_arg=False):
     return math_ops.matmul(
         self._tril, x, adjoint_a=adjoint, adjoint_b=adjoint_arg)
 
diff --git a/tensorflow/contrib/linalg/python/ops/linear_operator_udvh_update.py b/tensorflow/contrib/linalg/python/ops/linear_operator_udvh_update.py
index 4ca77ab147181190dfc64c6c7b2a0f178e1c2fd4..546d899e74e53d529dd58fc75a4e06f2fb920d1b 100644
--- a/tensorflow/contrib/linalg/python/ops/linear_operator_udvh_update.py
+++ b/tensorflow/contrib/linalg/python/ops/linear_operator_udvh_update.py
@@ -74,18 +74,18 @@ class LinearOperatorUDVHUpdate(linear_operator.LinearOperator):
   operator.shape
   ==> [3, 3]
 
-  operator.log_determinant()
+  operator.log_abs_determinant()
   ==> scalar Tensor
 
   x = ... Shape [3, 4] Tensor
-  operator.apply(x)
+  operator.matmul(x)
   ==> Shape [3, 4] Tensor
   ```
 
   ### Shape compatibility
 
   This operator acts on [batch] matrix with compatible shape.
-  `x` is a batch matrix with compatible shape for `apply` and `solve` if
+  `x` is a batch matrix with compatible shape for `matmul` and `solve` if
 
   ```
   operator.shape = [B1,...,Bb] + [M, N],  with b >= 0
@@ -95,15 +95,15 @@ class LinearOperatorUDVHUpdate(linear_operator.LinearOperator):
   ### Performance
 
   Suppose `operator` is a `LinearOperatorUDVHUpdate` of shape `[M, N]`,
-  made from a rank `K` update of `base_operator` which performs `.apply(x)` on
-  `x` having `x.shape = [N, R]` with `O(L_apply*N*R)` complexity (and similarly
+  made from a rank `K` update of `base_operator` which performs `.matmul(x)` on
+  `x` having `x.shape = [N, R]` with `O(L_matmul*N*R)` complexity (and similarly
   for `solve`, `determinant`.  Then, if `x.shape = [N, R]`,
 
-  * `operator.apply(x)` is `O(L_apply*N*R + K*N*R)`
+  * `operator.matmul(x)` is `O(L_matmul*N*R + K*N*R)`
 
   and if `M = N`,
 
-  * `operator.solve(x)` is `O(L_apply*N*R + N*K*R + K^2*R + K^3)`
+  * `operator.solve(x)` is `O(L_matmul*N*R + N*K*R + K^2*R + K^3)`
   * `operator.determinant()` is `O(L_determinant + L_solve*N*K + K^2*N + K^3)`
 
   If instead `operator` and `x` have shape `[B1,...,Bb, M, N]` and
@@ -348,22 +348,22 @@ class LinearOperatorUDVHUpdate(linear_operator.LinearOperator):
     return array_ops.concat(
         [batch_shape, self.base_operator.shape_tensor()[-2:]], axis=0)
 
-  def _apply(self, x, adjoint=False, adjoint_arg=False):
+  def _matmul(self, x, adjoint=False, adjoint_arg=False):
     u = self.u
     v = self.v
     l = self.base_operator
     d = self.diag_operator
 
-    leading_term = l.apply(x, adjoint=adjoint, adjoint_arg=adjoint_arg)
+    leading_term = l.matmul(x, adjoint=adjoint, adjoint_arg=adjoint_arg)
 
     if adjoint:
       uh_x = math_ops.matmul(u, x, adjoint_a=True, adjoint_b=adjoint_arg)
-      d_uh_x = d.apply(uh_x, adjoint=adjoint)
+      d_uh_x = d.matmul(uh_x, adjoint=adjoint)
       v_d_uh_x = math_ops.matmul(v, d_uh_x)
       return leading_term + v_d_uh_x
     else:
       vh_x = math_ops.matmul(v, x, adjoint_a=True, adjoint_b=adjoint_arg)
-      d_vh_x = d.apply(vh_x, adjoint=adjoint)
+      d_vh_x = d.matmul(vh_x, adjoint=adjoint)
       u_d_vh_x = math_ops.matmul(u, d_vh_x)
       return leading_term + u_d_vh_x
 
diff --git a/tensorflow/contrib/linalg/python/ops/linear_operator_util.py b/tensorflow/contrib/linalg/python/ops/linear_operator_util.py
index 9f8cb23169399fb499f9c80892473596d99bf248..2659bd32e9a96b2117b7af1350e8773e1321d855 100644
--- a/tensorflow/contrib/linalg/python/ops/linear_operator_util.py
+++ b/tensorflow/contrib/linalg/python/ops/linear_operator_util.py
@@ -69,10 +69,10 @@ def assert_zero_imag_part(x, message=None, name="assert_zero_imag_part"):
 
 
 def assert_compatible_matrix_dimensions(operator, x):
-  """Assert that an argument to solve/apply has proper domain dimension.
+  """Assert that an argument to solve/matmul has proper domain dimension.
 
   If `operator.shape[-2:] = [M, N]`, and `x.shape[-2:] = [Q, R]`, then
-  `operator.apply(x)` is defined only if `N = Q`.  This `Op` returns an
+  `operator.matmul(x)` is defined only if `N = Q`.  This `Op` returns an
   `Assert` that "fires" if this is not the case.  Static checks are already
   done by the base class `LinearOperator`.
 
diff --git a/tensorflow/contrib/linear_optimizer/BUILD b/tensorflow/contrib/linear_optimizer/BUILD
index 21f02b0a9674c17fd2c143e7697a65fc9c41d2b5..1fde6e5c6cb0e2d6097c63dcd707c35a491acaaa 100644
--- a/tensorflow/contrib/linear_optimizer/BUILD
+++ b/tensorflow/contrib/linear_optimizer/BUILD
@@ -111,13 +111,11 @@ py_library(
     srcs_version = "PY2AND3",
     deps = [
         ":sdca_ops_py",
-        ":sparse_feature_column_py",
         "//tensorflow/contrib/framework:framework_py",
         "//tensorflow/contrib/layers:layers_py",
         "//tensorflow/contrib/learn",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:dtypes",
-        "//tensorflow/python:math_ops",
         "//tensorflow/python:sparse_tensor",
         "//tensorflow/python:tensor_util",
         "//tensorflow/python:training",
@@ -129,6 +127,7 @@ py_test(
     name = "sdca_estimator_test",
     srcs = ["python/sdca_estimator_test.py"],
     srcs_version = "PY2AND3",
+    tags = ["notsan"],
     deps = [
         ":sdca_estimator_py",
         "//tensorflow/contrib/layers:layers_py",
diff --git a/tensorflow/contrib/linear_optimizer/python/sdca_estimator.py b/tensorflow/contrib/linear_optimizer/python/sdca_estimator.py
index 733b03eed36251a4079f48619fc85274981118b8..f4961ab9dbf98905df65c3b5be057fde1edca768 100644
--- a/tensorflow/contrib/linear_optimizer/python/sdca_estimator.py
+++ b/tensorflow/contrib/linear_optimizer/python/sdca_estimator.py
@@ -24,13 +24,10 @@ from tensorflow.contrib.learn.python.learn.estimators import estimator
 from tensorflow.contrib.learn.python.learn.estimators import head as head_lib
 from tensorflow.contrib.learn.python.learn.estimators import prediction_key
 from tensorflow.contrib.linear_optimizer.python import sdca_optimizer
-from tensorflow.contrib.linear_optimizer.python.ops import sdca_ops
-from tensorflow.contrib.linear_optimizer.python.ops.sparse_feature_column import SparseFeatureColumn
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import sparse_tensor
 from tensorflow.python.framework import tensor_util
 from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import variable_scope
 from tensorflow.python.training import session_run_hook
 
@@ -76,131 +73,6 @@ def _add_bias_column(feature_columns, columns_to_tensors, bias_variable,
   columns_to_variables[bias_column] = [bias_variable]
 
 
-def _get_sdca_train_step(optimizer, columns_to_variables, weight_column_name,
-                         loss_type, features, targets, global_step):
-  """Returns the training operation of an SdcaModel optimizer."""
-
-  def _dense_tensor_to_sparse_feature_column(dense_tensor):
-    """Returns SparseFeatureColumn for the input dense_tensor."""
-    ignore_value = 0.0
-    sparse_indices = array_ops.where(
-        math_ops.not_equal(dense_tensor,
-                           math_ops.cast(ignore_value, dense_tensor.dtype)))
-    sparse_values = array_ops.gather_nd(dense_tensor, sparse_indices)
-    # TODO(sibyl-Aix6ihai, sibyl-vie3Poto): Makes this efficient, as now SDCA supports
-    # very sparse features with weights and not weights.
-    return SparseFeatureColumn(
-        array_ops.reshape(
-            array_ops.split(value=sparse_indices, num_or_size_splits=2,
-                            axis=1)[0], [-1]),
-        array_ops.reshape(
-            array_ops.split(value=sparse_indices, num_or_size_splits=2,
-                            axis=1)[1], [-1]),
-        array_ops.reshape(math_ops.to_float(sparse_values), [-1]))
-
-  def _training_examples_and_variables():
-    """Returns dictionaries for training examples and variables."""
-    batch_size = targets.get_shape()[0]
-
-    # Iterate over all feature columns and create appropriate lists for dense
-    # and sparse features as well as dense and sparse weights (variables) for
-    # SDCA.
-    # TODO(sibyl-vie3Poto): Reshape variables stored as values in column_to_variables
-    # dict as 1-dimensional tensors.
-    dense_features, sparse_features, sparse_feature_with_values = [], [], []
-    dense_feature_weights = []
-    sparse_feature_weights, sparse_feature_with_values_weights = [], []
-    for column in sorted(columns_to_variables.keys(), key=lambda x: x.key):
-      transformed_tensor = features[column]
-      if isinstance(column, layers.feature_column._RealValuedColumn):  # pylint: disable=protected-access
-        # A real-valued column corresponds to a dense feature in SDCA. A
-        # transformed tensor corresponding to a RealValuedColumn has rank 2
-        # (its shape is typically [batch_size, column.dimension]) and so it
-        # can be passed to SDCA as is.
-        dense_features.append(transformed_tensor)
-        # For real valued columns, the variables list contains exactly one
-        # element.
-        dense_feature_weights.append(columns_to_variables[column][0])
-      elif isinstance(column, layers.feature_column._BucketizedColumn):  # pylint: disable=protected-access
-        # A bucketized column corresponds to a sparse feature in SDCA. The
-        # bucketized feature is "sparsified" for SDCA by converting it to a
-        # SparseFeatureColumn respresenting the one-hot encoding of the
-        # bucketized feature.
-        #
-        # TODO(sibyl-vie3Poto): Explore whether it is more efficient to translate a
-        # bucketized feature column to a dense feature in SDCA. This will likely
-        # depend on the number of buckets.
-        dense_bucket_tensor = column._to_dnn_input_layer(transformed_tensor)  # pylint: disable=protected-access
-        sparse_feature_column = _dense_tensor_to_sparse_feature_column(
-            dense_bucket_tensor)
-        sparse_feature_with_values.append(sparse_feature_column)
-        # For bucketized columns, the variables list contains exactly one
-        # element.
-        sparse_feature_with_values_weights.append(
-            columns_to_variables[column][0])
-      elif isinstance(
-          column,
-          (
-              layers.feature_column._CrossedColumn,  # pylint: disable=protected-access
-              layers.feature_column._SparseColumn)):  # pylint: disable=protected-access
-        sparse_features.append(
-            SparseFeatureColumn(
-                array_ops.reshape(
-                    array_ops.split(
-                        value=transformed_tensor.indices,
-                        num_or_size_splits=2,
-                        axis=1)[0], [-1]),
-                array_ops.reshape(transformed_tensor.values, [-1]), None))
-        sparse_feature_weights.append(columns_to_variables[column][0])
-      elif isinstance(column, layers.feature_column._WeightedSparseColumn):  # pylint: disable=protected-access
-        id_tensor = column.id_tensor(transformed_tensor)
-        weight_tensor = column.weight_tensor(transformed_tensor)
-        sparse_feature_with_values.append(
-            SparseFeatureColumn(
-                array_ops.reshape(
-                    array_ops.split(
-                        value=id_tensor.indices, num_or_size_splits=2, axis=1)[
-                            0], [-1]),
-                array_ops.reshape(id_tensor.values, [-1]),
-                array_ops.reshape(weight_tensor.values, [-1])))
-        sparse_feature_with_values_weights.append(
-            columns_to_variables[column][0])
-      else:
-        raise ValueError("SDCAOptimizer does not support column type {}".format(
-            type(column).__name__))
-
-    example_weights = array_ops.reshape(
-        features[weight_column_name],
-        shape=[-1]) if weight_column_name else array_ops.ones([batch_size])
-    example_ids = features[optimizer.example_id_column]
-    sparse_feature_with_values.extend(sparse_features)
-    sparse_feature_with_values_weights.extend(sparse_feature_weights)
-    examples = dict(
-        sparse_features=sparse_feature_with_values,
-        dense_features=dense_features,
-        example_labels=math_ops.to_float(
-            array_ops.reshape(targets, shape=[-1])),
-        example_weights=example_weights,
-        example_ids=example_ids)
-    sdca_variables = dict(
-        sparse_features_weights=sparse_feature_with_values_weights,
-        dense_features_weights=dense_feature_weights)
-    return examples, sdca_variables
-
-  training_examples, training_variables = _training_examples_and_variables()
-  sdca_model = sdca_ops.SdcaModel(
-      examples=training_examples,
-      variables=training_variables,
-      options=dict(
-          symmetric_l1_regularization=optimizer.symmetric_l1_regularization,
-          symmetric_l2_regularization=optimizer.symmetric_l2_regularization,
-          num_loss_partitions=optimizer.num_loss_partitions,
-          num_table_shards=optimizer.num_table_shards,
-          loss_type=loss_type))
-  train_op = sdca_model.minimize(global_step=global_step)
-  return sdca_model, train_op
-
-
 def sdca_model_fn(features, labels, mode, params, config=None):
   """A model_fn for linear models that use the SDCA optimizer.
 
@@ -283,9 +155,9 @@ def sdca_model_fn(features, labels, mode, params, config=None):
 
   def _train_op_fn(unused_loss):
     global_step = contrib_variables.get_global_step()
-    sdca_model, train_op = _get_sdca_train_step(optimizer, columns_to_variables,
-                                                weight_column_name, loss_type,
-                                                features, labels, global_step)
+    sdca_model, train_op = optimizer.get_train_step(
+        columns_to_variables, weight_column_name, loss_type, features, labels,
+        global_step)
     if update_weights_hook is not None:
       update_weights_hook.set_parameters(sdca_model, train_op)
     return train_op
diff --git a/tensorflow/contrib/linear_optimizer/python/sdca_estimator_test.py b/tensorflow/contrib/linear_optimizer/python/sdca_estimator_test.py
index 81434621bd6fbc0e80ddc9e0006122ddc10e48df..32b7f956e476ca79cc77338cde496cd0c517c401 100644
--- a/tensorflow/contrib/linear_optimizer/python/sdca_estimator_test.py
+++ b/tensorflow/contrib/linear_optimizer/python/sdca_estimator_test.py
@@ -35,7 +35,7 @@ class SDCALogisticClassifierTest(test.TestCase):
     def input_fn():
       return {
           'example_id': constant_op.constant(['1', '2']),
-          'maintenance_cost': constant_op.constant([[500.0], [200.0]]),
+          'maintenance_cost': constant_op.constant([500.0, 200.0]),
           'sq_footage': constant_op.constant([[800.0], [600.0]]),
           'weights': constant_op.constant([[1.0], [1.0]])
       }, constant_op.constant([[0], [1]])
@@ -77,7 +77,7 @@ class SDCALogisticClassifierTest(test.TestCase):
     def input_fn():
       return {
           'example_id': constant_op.constant(['1', '2', '3']),
-          'price': constant_op.constant([[600.0], [1000.0], [400.0]]),
+          'price': constant_op.constant([600.0, 1000.0, 400.0]),
           'sq_footage': constant_op.constant([[1000.0], [600.0], [700.0]]),
           'weights': constant_op.constant([[1.0], [1.0], [1.0]])
       }, constant_op.constant([[1], [0], [1]])
@@ -196,7 +196,7 @@ class SDCALogisticClassifierTest(test.TestCase):
           'price':
               constant_op.constant([[0.6], [0.8], [0.3]]),
           'sq_footage':
-              constant_op.constant([[900.0], [700.0], [600.0]]),
+              constant_op.constant([900.0, 700.0, 600.0]),
           'country':
               sparse_tensor.SparseTensor(
                   values=['IT', 'US', 'GB'],
@@ -296,7 +296,7 @@ class SDCALinearRegressorTest(test.TestCase):
           'example_id':
               constant_op.constant(['1', '2', '3']),
           'price':
-              constant_op.constant([[0.4], [0.6], [0.3]]),
+              constant_op.constant([0.4, 0.6, 0.3]),
           'country':
               sparse_tensor.SparseTensor(
                   values=['IT', 'US', 'GB'],
diff --git a/tensorflow/contrib/linear_optimizer/python/sdca_optimizer.py b/tensorflow/contrib/linear_optimizer/python/sdca_optimizer.py
index f9d69d6dea9dd3ac3eca3b64be15c8dd131b862d..92d022f2a30ffeb77e81d3bd01365afcd14826b5 100644
--- a/tensorflow/contrib/linear_optimizer/python/sdca_optimizer.py
+++ b/tensorflow/contrib/linear_optimizer/python/sdca_optimizer.py
@@ -19,7 +19,9 @@ from __future__ import print_function
 from tensorflow.contrib import layers
 from tensorflow.contrib.linear_optimizer.python.ops import sdca_ops
 from tensorflow.contrib.linear_optimizer.python.ops.sparse_feature_column import SparseFeatureColumn
+from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import math_ops
 
 
@@ -99,16 +101,16 @@ class SDCAOptimizer(object):
   def symmetric_l2_regularization(self):
     return self._symmetric_l2_regularization
 
-  def get_train_step(self, columns_to_variables,
-                     weight_column_name, loss_type, features, targets,
-                     global_step):
+  def get_train_step(self, columns_to_variables, weight_column_name, loss_type,
+                     features, targets, global_step):
     """Returns the training operation of an SdcaModel optimizer."""
 
-    def _tensor_to_sparse_feature_column(dense_tensor):
+    def _dense_tensor_to_sparse_feature_column(dense_tensor):
       """Returns SparseFeatureColumn for the input dense_tensor."""
       ignore_value = 0.0
-      sparse_indices = array_ops.where(math_ops.not_equal(
-          dense_tensor, math_ops.cast(ignore_value, dense_tensor.dtype)))
+      sparse_indices = array_ops.where(
+          math_ops.not_equal(dense_tensor,
+                             math_ops.cast(ignore_value, dense_tensor.dtype)))
       sparse_values = array_ops.gather_nd(dense_tensor, sparse_indices)
       # TODO(sibyl-Aix6ihai, sibyl-vie3Poto): Makes this efficient, as now SDCA supports
       # very sparse features with weights and not weights.
@@ -133,34 +135,48 @@ class SDCAOptimizer(object):
       dense_features, sparse_features, sparse_feature_with_values = [], [], []
       dense_feature_weights = []
       sparse_feature_weights, sparse_feature_with_values_weights = [], []
-      # pylint: disable=protected-access
       for column in sorted(columns_to_variables.keys(), key=lambda x: x.key):
         transformed_tensor = features[column]
-        if isinstance(column, layers.feature_column._RealValuedColumn):
+        if isinstance(column, layers.feature_column._RealValuedColumn):  # pylint: disable=protected-access
           # A real-valued column corresponds to a dense feature in SDCA. A
-          # transformed tensor corresponding to a RealValuedColumn has rank 2
-          # (its shape is typically [batch_size, column.dimension]) and so it
-          # can be passed to SDCA as is.
+          # transformed tensor corresponding to a RealValuedColumn should have
+          # rank at most 2. In order to be passed to SDCA, its rank needs to be
+          # exactly 2 (i.e., its shape should be [batch_size, column.dim]).
+          check_rank_op = control_flow_ops.Assert(
+              math_ops.less_equal(array_ops.rank(transformed_tensor), 2),
+              ['transformed_tensor shouls have rank at most 2.'])
+          # Reshape to [batch_size, dense_column_dimension].
+          with ops.control_dependencies([check_rank_op]):
+            transformed_tensor = array_ops.reshape(transformed_tensor, [
+                array_ops.shape(transformed_tensor)[0], -1
+            ])
+
           dense_features.append(transformed_tensor)
           # For real valued columns, the variables list contains exactly one
           # element.
           dense_feature_weights.append(columns_to_variables[column][0])
-        elif isinstance(column, layers.feature_column._BucketizedColumn):
+        elif isinstance(column, layers.feature_column._BucketizedColumn):  # pylint: disable=protected-access
           # A bucketized column corresponds to a sparse feature in SDCA. The
           # bucketized feature is "sparsified" for SDCA by converting it to a
           # SparseFeatureColumn respresenting the one-hot encoding of the
           # bucketized feature.
-          dense_bucket_tensor = layers.input_from_feature_columns(
-              {column: transformed_tensor}, [column])
-          sparse_feature_column = _tensor_to_sparse_feature_column(
+          #
+          # TODO(sibyl-vie3Poto): Explore whether it is more efficient to translate a
+          # bucketized feature column to a dense feature in SDCA. This will
+          # likely depend on the number of buckets.
+          dense_bucket_tensor = column._to_dnn_input_layer(transformed_tensor)  # pylint: disable=protected-access
+          sparse_feature_column = _dense_tensor_to_sparse_feature_column(
               dense_bucket_tensor)
           sparse_feature_with_values.append(sparse_feature_column)
           # For bucketized columns, the variables list contains exactly one
           # element.
           sparse_feature_with_values_weights.append(
               columns_to_variables[column][0])
-        elif isinstance(column, (layers.feature_column._CrossedColumn,
-                                 layers.feature_column._SparseColumn)):
+        elif isinstance(
+            column,
+            (
+                layers.feature_column._CrossedColumn,  # pylint: disable=protected-access
+                layers.feature_column._SparseColumn)):  # pylint: disable=protected-access
           sparse_features.append(
               SparseFeatureColumn(
                   array_ops.reshape(
@@ -168,10 +184,9 @@ class SDCAOptimizer(object):
                           value=transformed_tensor.indices,
                           num_or_size_splits=2,
                           axis=1)[0], [-1]),
-                  array_ops.reshape(transformed_tensor.values, [-1]),
-                  None))
+                  array_ops.reshape(transformed_tensor.values, [-1]), None))
           sparse_feature_weights.append(columns_to_variables[column][0])
-        elif isinstance(column, layers.feature_column._WeightedSparseColumn):
+        elif isinstance(column, layers.feature_column._WeightedSparseColumn):  # pylint: disable=protected-access
           id_tensor = column.id_tensor(transformed_tensor)
           weight_tensor = column.weight_tensor(transformed_tensor)
           sparse_feature_with_values.append(
@@ -183,11 +198,10 @@ class SDCAOptimizer(object):
                   array_ops.reshape(id_tensor.values, [-1]),
                   array_ops.reshape(weight_tensor.values, [-1])))
           sparse_feature_with_values_weights.append(
-            columns_to_variables[column][0])
+              columns_to_variables[column][0])
         else:
           raise ValueError('SDCAOptimizer does not support column type %s.' %
                            type(column).__name__)
-      # pylint: enable=protected-access
 
       example_weights = array_ops.reshape(
           features[weight_column_name],
@@ -195,12 +209,13 @@ class SDCAOptimizer(object):
       example_ids = features[self._example_id_column]
       sparse_feature_with_values.extend(sparse_features)
       sparse_feature_with_values_weights.extend(sparse_feature_weights)
-      examples = dict(sparse_features=sparse_feature_with_values,
-                      dense_features=dense_features,
-                      example_labels=math_ops.to_float(array_ops.reshape(
-                          targets, shape=[-1])),
-                      example_weights=example_weights,
-                      example_ids=example_ids)
+      examples = dict(
+          sparse_features=sparse_feature_with_values,
+          dense_features=dense_features,
+          example_labels=math_ops.to_float(
+              array_ops.reshape(targets, shape=[-1])),
+          example_weights=example_weights,
+          example_ids=example_ids)
       sdca_variables = dict(
           sparse_features_weights=sparse_feature_with_values_weights,
           dense_features_weights=dense_feature_weights)
diff --git a/tensorflow/contrib/lookup/BUILD b/tensorflow/contrib/lookup/BUILD
index b3316ee8c4fe167385dcc33135cc877c81a3509d..b0475c41c954713f0711fd497710478bacfdece4 100644
--- a/tensorflow/contrib/lookup/BUILD
+++ b/tensorflow/contrib/lookup/BUILD
@@ -9,6 +9,7 @@ package(default_visibility = ["//tensorflow:internal"])
 
 load("//tensorflow:tensorflow.bzl", "py_test")
 
+# TODO(yleon): Refactor after one we switching to the V2 kernels.
 py_library(
     name = "lookup_py",
     srcs = [
@@ -19,9 +20,9 @@ py_library(
     deps = [
         "//tensorflow/python:array_ops",
         "//tensorflow/python:control_flow_ops",
-        "//tensorflow/python:data_flow_ops_gen",
         "//tensorflow/python:framework",
         "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:lookup_ops_gen",
         "//tensorflow/python:math_ops",
         "//tensorflow/python:string_ops",
         "//tensorflow/python:training",
@@ -39,11 +40,11 @@ py_test(
         "//tensorflow/python:array_ops",
         "//tensorflow/python:client",
         "//tensorflow/python:client_testlib",
-        "//tensorflow/python:data_flow_ops",
         "//tensorflow/python:errors",
         "//tensorflow/python:framework",
         "//tensorflow/python:framework_for_generated_wrappers",
         "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:lookup_ops",
         "//tensorflow/python:platform_test",
         "//tensorflow/python:training",
         "//tensorflow/python:variables",
diff --git a/tensorflow/contrib/lookup/lookup_ops.py b/tensorflow/contrib/lookup/lookup_ops.py
index eec197782da73863cf614dc7de6c257ceefb6616..b415235b994c01ac7b0e5c9950aa7a8a16b525dd 100644
--- a/tensorflow/contrib/lookup/lookup_ops.py
+++ b/tensorflow/contrib/lookup/lookup_ops.py
@@ -12,8 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Lookup table Operations."""
-# pylint: disable=g-bad-name
+"""Lookup table operations."""
+
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
@@ -27,7 +27,7 @@ from tensorflow.python.framework import sparse_tensor
 from tensorflow.python.framework import tensor_shape
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
-from tensorflow.python.ops import gen_data_flow_ops
+from tensorflow.python.ops import gen_lookup_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import string_ops
 from tensorflow.python.training.saver import BaseSaverBuilder
@@ -151,7 +151,7 @@ class InitializableLookupTableBase(LookupInterface):
     with ops.name_scope(name, "%s_Size" % self._name,
                         [self._table_ref]) as scope:
       # pylint: disable=protected-access
-      return gen_data_flow_ops._lookup_table_size(self._table_ref, name=scope)
+      return gen_lookup_ops._lookup_table_size(self._table_ref, name=scope)
       # pylint: enable=protected-access
 
   def lookup(self, keys, name=None):
@@ -182,7 +182,7 @@ class InitializableLookupTableBase(LookupInterface):
         name, "%s_Lookup" % self._name,
         (self._table_ref, key_tensor, self._default_value)) as scope:
       # pylint: disable=protected-access
-      values = gen_data_flow_ops._lookup_table_find(
+      values = gen_lookup_ops._lookup_table_find(
           self._table_ref, key_tensor, self._default_value, name=scope)
       # pylint: enable=protected-access
 
@@ -229,7 +229,7 @@ class HashTable(InitializableLookupTableBase):
     with ops.name_scope(
         name, "hash_table", (initializer, default_value)) as scope:
       # pylint: disable=protected-access
-      table_ref = gen_data_flow_ops._hash_table(
+      table_ref = gen_lookup_ops._hash_table(
           shared_name=shared_name,
           key_dtype=initializer.key_dtype,
           value_dtype=initializer.value_dtype,
@@ -308,10 +308,8 @@ class KeyValueTensorInitializer(TableInitializerBase):
         self._name,
         values=(table.table_ref, self._keys, self._values)) as scope:
       # pylint: disable=protected-access
-      init_op = gen_data_flow_ops._initialize_table(table.table_ref,
-                                                    self._keys,
-                                                    self._values,
-                                                    name=scope)
+      init_op = gen_lookup_ops._initialize_table(
+          table.table_ref, self._keys, self._values, name=scope)
       # pylint: enable=protected-access
     ops.add_to_collection(ops.GraphKeys.TABLE_INITIALIZERS, init_op)
     return init_op
@@ -477,7 +475,7 @@ class TextFileInitializer(TableInitializerBase):
                                        dtypes.string,
                                        name="asset_filepath")
       # pylint: disable=protected-access
-      init_op = gen_data_flow_ops._initialize_table_from_text_file(
+      init_op = gen_lookup_ops._initialize_table_from_text_file(
           table.table_ref,
           filename,
           self._key_index,
@@ -608,7 +606,7 @@ class HasherSpec(collections.namedtuple("HasherSpec", ["hasher", "key"])):
   __slots__ = ()
 
 
-FastHashSpec = HasherSpec("fasthash", None)
+FastHashSpec = HasherSpec("fasthash", None)  # pylint: disable=invalid-name
 
 
 class StrongHashSpec(HasherSpec):
@@ -882,7 +880,7 @@ def index_table_from_file(vocabulary_file=None,
     name: A name for this op (optional).
 
   Returns:
-    The lookup table to map a string `Tensor` to index `int64` `Tensor`.
+    The lookup table to map a `key_dtype` `Tensor` to index `int64` `Tensor`.
 
   Raises:
     ValueError: If `vocabulary_file` is not set.
@@ -974,7 +972,7 @@ def index_table_from_tensor(mapping,
   Sample Usages:
 
   ```python
-  mapping_strings = t.constant(["emerson", "lake", "palmer")
+  mapping_strings = t.constant(["emerson", "lake", "palmer"])
   table = tf.contrib.lookup.index_table_from_tensor(
       mapping=mapping_strings, num_oov_buckets=1, default_value=-1)
   features = tf.constant(["emerson", "lake", "and", "palmer"])
@@ -1066,7 +1064,7 @@ def string_to_index(tensor, mapping, default_value=-1, name=None):
   For example:
 
   ```python
-  mapping_strings = tf.constant(["emerson", "lake", "palmer")
+  mapping_strings = tf.constant(["emerson", "lake", "palmer"])
   feats = tf.constant(["emerson", "lake", "and", "palmer"])
   ids = tf.contrib.lookup.string_to_index(
       feats, mapping=mapping_strings, default_value=-1)
@@ -1333,14 +1331,14 @@ class MutableHashTable(LookupInterface):
     use_node_name_sharing = checkpoint and shared_name is None
     # pylint: disable=protected-access
     if self._default_value.get_shape().ndims == 0:
-      self._table_ref = gen_data_flow_ops._mutable_hash_table(
+      self._table_ref = gen_lookup_ops._mutable_hash_table(
           shared_name=shared_name,
           use_node_name_sharing=use_node_name_sharing,
           key_dtype=key_dtype,
           value_dtype=value_dtype,
           name=name)
     else:
-      self._table_ref = gen_data_flow_ops._mutable_hash_table_of_tensors(
+      self._table_ref = gen_lookup_ops._mutable_hash_table_of_tensors(
           shared_name=shared_name,
           use_node_name_sharing=use_node_name_sharing,
           key_dtype=key_dtype,
@@ -1368,7 +1366,7 @@ class MutableHashTable(LookupInterface):
     with ops.name_scope(name, "%s_Size" % self._name,
                         [self._table_ref]) as name:
       # pylint: disable=protected-access
-      return gen_data_flow_ops._lookup_table_size(self._table_ref, name=name)
+      return gen_lookup_ops._lookup_table_size(self._table_ref, name=name)
 
   def lookup(self, keys, name=None):
     """Looks up `keys` in a table, outputs the corresponding values.
@@ -1394,10 +1392,8 @@ class MutableHashTable(LookupInterface):
     with ops.name_scope(name, "%s_lookup_table_find" % self._name,
                         (self._table_ref, keys, self._default_value)) as name:
       # pylint: disable=protected-access
-      values = gen_data_flow_ops._lookup_table_find(self._table_ref,
-                                                    keys,
-                                                    self._default_value,
-                                                    name=name)
+      values = gen_lookup_ops._lookup_table_find(
+          self._table_ref, keys, self._default_value, name=name)
 
     values.set_shape(keys.get_shape().concatenate(self._value_shape))
     return values
@@ -1423,7 +1419,7 @@ class MutableHashTable(LookupInterface):
     with ops.name_scope(name, "%s_lookup_table_insert" % self._name,
                         [self._table_ref, keys, values]) as name:
       # pylint: disable=protected-access
-      op = gen_data_flow_ops._lookup_table_insert(
+      op = gen_lookup_ops._lookup_table_insert(
           self._table_ref, keys, values, name=name)
       return op
 
@@ -1440,11 +1436,8 @@ class MutableHashTable(LookupInterface):
     with ops.name_scope(name, "%s_lookup_table_export_values" % self._name,
                         [self._table_ref]) as name:
       # pylint: disable=protected-access
-      exported_keys, exported_values = gen_data_flow_ops._lookup_table_export(
-          self._table_ref,
-          self._key_dtype,
-          self._value_dtype,
-          name=name)
+      exported_keys, exported_values = gen_lookup_ops._lookup_table_export(
+          self._table_ref, self._key_dtype, self._value_dtype, name=name)
 
     exported_values.set_shape(exported_keys.get_shape().concatenate(
         self._value_shape))
@@ -1464,7 +1457,7 @@ class MutableHashTable(LookupInterface):
 
     def restore(self, restored_tensors, unused_restored_shapes):
       # pylint: disable=protected-access
-      return gen_data_flow_ops._lookup_table_import(
+      return gen_lookup_ops._lookup_table_import(
           self.op._table_ref, restored_tensors[0], restored_tensors[1])
 
 
@@ -1539,7 +1532,7 @@ class MutableDenseHashTable(LookupInterface):
     use_node_name_sharing = checkpoint and shared_name is None
     empty_key = ops.convert_to_tensor(empty_key, dtype=key_dtype)
     # pylint: disable=protected-access
-    self._table_ref = gen_data_flow_ops._mutable_dense_hash_table(
+    self._table_ref = gen_lookup_ops._mutable_dense_hash_table(
         empty_key=empty_key,
         shared_name=shared_name,
         use_node_name_sharing=use_node_name_sharing,
@@ -1567,7 +1560,7 @@ class MutableDenseHashTable(LookupInterface):
     with ops.name_scope(name, "%s_Size" % self._name,
                         [self._table_ref]) as name:
       # pylint: disable=protected-access
-      return gen_data_flow_ops._lookup_table_size(self._table_ref, name=name)
+      return gen_lookup_ops._lookup_table_size(self._table_ref, name=name)
 
   def lookup(self, keys, name=None):
     """Looks up `keys` in a table, outputs the corresponding values.
@@ -1593,7 +1586,7 @@ class MutableDenseHashTable(LookupInterface):
     with ops.name_scope(name, "%s_lookup_table_find" % self._name,
                         [self._table_ref, keys]) as name:
       # pylint: disable=protected-access
-      values = gen_data_flow_ops._lookup_table_find(
+      values = gen_lookup_ops._lookup_table_find(
           self._table_ref, keys, self._default_value, name=name)
 
     if keys.get_shape().ndims is not None and keys.get_shape().ndims > 0:
@@ -1623,7 +1616,7 @@ class MutableDenseHashTable(LookupInterface):
     with ops.name_scope(name, "%s_lookup_table_insert" % self._name,
                         [self._table_ref, keys, values]) as name:
       # pylint: disable=protected-access
-      op = gen_data_flow_ops._lookup_table_insert(
+      op = gen_lookup_ops._lookup_table_insert(
           self._table_ref, keys, values, name=name)
       return op
 
@@ -1640,7 +1633,7 @@ class MutableDenseHashTable(LookupInterface):
     with ops.name_scope(name, "%s_lookup_table_export_values" % self._name,
                         [self._table_ref]) as name:
       # pylint: disable=protected-access
-      exported_keys, exported_values = gen_data_flow_ops._lookup_table_export(
+      exported_keys, exported_values = gen_lookup_ops._lookup_table_export(
           self._table_ref, self._key_dtype, self._value_dtype, name=name)
 
     exported_values.set_shape(exported_keys.get_shape().concatenate(
@@ -1661,6 +1654,5 @@ class MutableDenseHashTable(LookupInterface):
 
     def restore(self, restored_tensors, unused_restored_shapes):
       # pylint: disable=protected-access
-      return gen_data_flow_ops._lookup_table_import(self.op._table_ref,
-                                                    restored_tensors[0],
-                                                    restored_tensors[1])
+      return gen_lookup_ops._lookup_table_import(
+          self.op._table_ref, restored_tensors[0], restored_tensors[1])
diff --git a/tensorflow/contrib/lookup/lookup_ops_test.py b/tensorflow/contrib/lookup/lookup_ops_test.py
index 0ec40a63f26e7139bda8cc73dcec034ff47a0532..5ec169b6db4f60439a3b9f233e30a862669fa7de 100644
--- a/tensorflow/contrib/lookup/lookup_ops_test.py
+++ b/tensorflow/contrib/lookup/lookup_ops_test.py
@@ -31,7 +31,7 @@ from tensorflow.python.framework import ops
 from tensorflow.python.framework import sparse_tensor
 from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import data_flow_ops
+from tensorflow.python.ops import lookup_ops
 from tensorflow.python.ops import variables
 from tensorflow.python.platform import test
 from tensorflow.python.training import saver
@@ -125,7 +125,7 @@ class HashTableOpTest(test.TestCase):
       table3 = lookup.HashTable(
           lookup.KeyValueTensorInitializer(keys, values), default_val)
 
-      data_flow_ops.tables_initializer().run()
+      lookup_ops.tables_initializer().run()
       self.assertAllEqual(3, table1.size().eval())
       self.assertAllEqual(3, table2.size().eval())
       self.assertAllEqual(3, table3.size().eval())
@@ -1184,7 +1184,7 @@ class IndexTableFromFile(test.TestCase):
       ids = table.lookup(constant_op.constant(["salad", "surgery", "tarkus"]))
 
       self.assertRaises(errors_impl.OpError, ids.eval)
-      data_flow_ops.tables_initializer().run()
+      lookup_ops.tables_initializer().run()
       self.assertAllEqual((1, 2, 3), ids.eval())
 
   def test_int32_index_table_from_file(self):
@@ -1198,7 +1198,7 @@ class IndexTableFromFile(test.TestCase):
           constant_op.constant((1, -1000, 11), dtype=dtypes.int32))
 
       self.assertRaises(errors_impl.OpError, ids.eval)
-      data_flow_ops.tables_initializer().run()
+      lookup_ops.tables_initializer().run()
       self.assertAllEqual((1, 2, 3), ids.eval())
 
   def test_int64_index_table_from_file(self):
@@ -1212,7 +1212,7 @@ class IndexTableFromFile(test.TestCase):
           constant_op.constant((1, -1000, 11), dtype=dtypes.int64))
 
       self.assertRaises(errors_impl.OpError, ids.eval)
-      data_flow_ops.tables_initializer().run()
+      lookup_ops.tables_initializer().run()
       self.assertAllEqual((1, 2, 3), ids.eval())
 
   def test_index_table_from_file_with_default_value(self):
@@ -1224,7 +1224,7 @@ class IndexTableFromFile(test.TestCase):
       ids = table.lookup(constant_op.constant(["salad", "surgery", "tarkus"]))
 
       self.assertRaises(errors_impl.OpError, ids.eval)
-      data_flow_ops.tables_initializer().run()
+      lookup_ops.tables_initializer().run()
       self.assertAllEqual((1, 2, default_value), ids.eval())
 
   def test_index_table_from_file_with_oov_buckets(self):
@@ -1236,7 +1236,7 @@ class IndexTableFromFile(test.TestCase):
           constant_op.constant(["salad", "surgery", "tarkus", "toccata"]))
 
       self.assertRaises(errors_impl.OpError, ids.eval)
-      data_flow_ops.tables_initializer().run()
+      lookup_ops.tables_initializer().run()
       self.assertAllEqual(
           (
               1,  # From vocabulary file.
@@ -1259,7 +1259,7 @@ class IndexTableFromFile(test.TestCase):
       ids = table.lookup(constant_op.constant(["salad", "surgery", "tarkus"]))
 
       self.assertRaises(errors_impl.OpError, ids.eval)
-      data_flow_ops.tables_initializer().run()
+      lookup_ops.tables_initializer().run()
       self.assertAllEqual((1, -1, -1), ids.eval())
       self.assertEqual(2, table.size().eval())
 
@@ -1286,7 +1286,7 @@ class IndexTableFromFile(test.TestCase):
       ids = table.lookup(constant_op.constant(["salad", "surgery", "tarkus"]))
 
       self.assertRaises(errors_impl.OpError, ids.eval)
-      data_flow_ops.tables_initializer().run()
+      lookup_ops.tables_initializer().run()
       self.assertAllEqual((1, 2, -1), ids.eval())
       self.assertEqual(3, table.size().eval())
 
@@ -1345,7 +1345,7 @@ class IndexTableFromTensor(test.TestCase):
       ids = table.lookup(constant_op.constant(("salad", "surgery", "tarkus")))
 
       self.assertRaises(errors_impl.OpError, ids.eval)
-      data_flow_ops.tables_initializer().run()
+      lookup_ops.tables_initializer().run()
       self.assertAllEqual((1, 2, 3), ids.eval())
 
   def test_int32_index_table_from_tensor_with_tensor_init(self):
@@ -1356,7 +1356,7 @@ class IndexTableFromTensor(test.TestCase):
           constant_op.constant((1, -1000, 11), dtype=dtypes.int32))
 
       self.assertRaises(errors_impl.OpError, ids.eval)
-      data_flow_ops.tables_initializer().run()
+      lookup_ops.tables_initializer().run()
       self.assertAllEqual((1, 2, 3), ids.eval())
 
   def test_int64_index_table_from_tensor_with_tensor_init(self):
@@ -1367,7 +1367,7 @@ class IndexTableFromTensor(test.TestCase):
           constant_op.constant((1, -1000, 11), dtype=dtypes.int64))
 
       self.assertRaises(errors_impl.OpError, ids.eval)
-      data_flow_ops.tables_initializer().run()
+      lookup_ops.tables_initializer().run()
       self.assertAllEqual((1, 2, 3), ids.eval())
 
   def test_index_table_from_tensor_with_default_value(self):
@@ -1378,7 +1378,7 @@ class IndexTableFromTensor(test.TestCase):
       ids = table.lookup(constant_op.constant(["salad", "surgery", "tarkus"]))
 
       self.assertRaises(errors_impl.OpError, ids.eval)
-      data_flow_ops.tables_initializer().run()
+      lookup_ops.tables_initializer().run()
       self.assertAllEqual((1, 2, default_value), ids.eval())
 
   def test_index_table_from_tensor_missing_mapping(self):
@@ -1394,7 +1394,7 @@ class IndexTableFromTensor(test.TestCase):
       self.assertRaises(errors_impl.OpError, ids.eval)
       with self.assertRaisesRegexp(
           errors_impl.OpError, "keys and values cannot be empty"):
-        data_flow_ops.tables_initializer().run()
+        lookup_ops.tables_initializer().run()
 
   def test_index_table_from_tensor_with_invalid_hashers(self):
     with self.test_session():
@@ -1422,7 +1422,7 @@ class StringToIndexTest(test.TestCase):
       indices = lookup.string_to_index(feats, mapping=mapping_strings)
 
       self.assertRaises(errors_impl.OpError, indices.eval)
-      data_flow_ops.tables_initializer().run()
+      lookup_ops.tables_initializer().run()
 
       self.assertAllEqual((1, 2, -1), indices.eval())
 
@@ -1433,7 +1433,7 @@ class StringToIndexTest(test.TestCase):
       _ = lookup.string_to_index(feats, mapping=mapping_strings)
 
       self.assertRaises(errors_impl.OpError,
-                        data_flow_ops.tables_initializer().run)
+                        lookup_ops.tables_initializer().run)
 
   def test_string_to_index_with_default_value(self):
     default_value = -42
@@ -1444,7 +1444,7 @@ class StringToIndexTest(test.TestCase):
           feats, mapping=mapping_strings, default_value=default_value)
       self.assertRaises(errors_impl.OpError, indices.eval)
 
-      data_flow_ops.tables_initializer().run()
+      lookup_ops.tables_initializer().run()
       self.assertAllEqual((1, 2, default_value), indices.eval())
 
 
@@ -1463,7 +1463,7 @@ class IndexToStringTableFromFileTest(test.TestCase):
           vocabulary_file=vocabulary_file)
       features = table.lookup(constant_op.constant([0, 1, 2, 3], dtypes.int64))
       self.assertRaises(errors_impl.OpError, features.eval)
-      data_flow_ops.tables_initializer().run()
+      lookup_ops.tables_initializer().run()
       self.assertAllEqual((b"brain", b"salad", b"surgery", b"UNK"),
                           features.eval())
 
@@ -1475,7 +1475,7 @@ class IndexToStringTableFromFileTest(test.TestCase):
           vocabulary_file=vocabulary_file, default_value=default_value)
       features = table.lookup(constant_op.constant([1, 2, 4], dtypes.int64))
       self.assertRaises(errors_impl.OpError, features.eval)
-      data_flow_ops.tables_initializer().run()
+      lookup_ops.tables_initializer().run()
       self.assertAllEqual((b"salad", b"surgery", default_value),
                           features.eval())
 
@@ -1489,7 +1489,7 @@ class IndexToStringTableFromFileTest(test.TestCase):
           default_value=default_value)
       features = table.lookup(constant_op.constant([1, 2, 4], dtypes.int64))
       self.assertRaises(errors_impl.OpError, features.eval)
-      data_flow_ops.tables_initializer().run()
+      lookup_ops.tables_initializer().run()
       self.assertAllEqual((b"salad", default_value, default_value),
                           features.eval())
 
@@ -1501,7 +1501,7 @@ class IndexToStringTableFromFileTest(test.TestCase):
       features = table.lookup(constant_op.constant([1, 2, 4], dtypes.int64))
 
       self.assertRaises(errors_impl.OpError, features.eval)
-      init = data_flow_ops.tables_initializer()
+      init = lookup_ops.tables_initializer()
       self.assertRaisesRegexp(errors_impl.InvalidArgumentError,
                               "Invalid vocab_size", init.run)
 
@@ -1513,7 +1513,7 @@ class IndexToStringTableFromFileTest(test.TestCase):
       features = table.lookup(constant_op.constant([1, 2, 4], dtypes.int64))
 
       self.assertRaises(errors_impl.OpError, features.eval)
-      data_flow_ops.tables_initializer().run()
+      lookup_ops.tables_initializer().run()
       self.assertAllEqual((b"salad", b"surgery", b"UNK"), features.eval())
 
 
@@ -1528,7 +1528,7 @@ class IndexToStringTableFromTensorTest(test.TestCase):
       indices = constant_op.constant([0, 1, 2, 3], dtypes.int64)
       features = table.lookup(indices)
       self.assertRaises(errors_impl.OpError, features.eval)
-      data_flow_ops.tables_initializer().run()
+      lookup_ops.tables_initializer().run()
 
       self.assertAllEqual((b"brain", b"salad", b"surgery", b"UNK"),
                           features.eval())
@@ -1540,7 +1540,7 @@ class IndexToStringTableFromTensorTest(test.TestCase):
           mapping=mapping_strings)
       indices = constant_op.constant([0, 1, 4], dtypes.int64)
       features = table.lookup(indices)
-      data_flow_ops.tables_initializer().run()
+      lookup_ops.tables_initializer().run()
       self.assertAllEqual((b"hello", b"hello", b"UNK"), features.eval())
 
   def test_index_to_string_with_default_value(self):
@@ -1553,7 +1553,7 @@ class IndexToStringTableFromTensorTest(test.TestCase):
       features = table.lookup(indices)
       self.assertRaises(errors_impl.OpError, features.eval)
 
-      data_flow_ops.tables_initializer().run()
+      lookup_ops.tables_initializer().run()
       self.assertAllEqual((b"salad", b"surgery", default_value),
                           features.eval())
 
@@ -1567,7 +1567,7 @@ class IndexToStringTest(test.TestCase):
       feats = lookup.index_to_string(indices, mapping=mapping_strings)
 
       self.assertRaises(errors_impl.OpError, feats.eval)
-      data_flow_ops.tables_initializer().run()
+      lookup_ops.tables_initializer().run()
 
       self.assertAllEqual((b"brain", b"salad", b"surgery", b"UNK"),
                           feats.eval())
@@ -1577,11 +1577,11 @@ class IndexToStringTest(test.TestCase):
       mapping_strings = constant_op.constant(["hello", "hello"])
       indices = constant_op.constant([0, 1, 4], dtypes.int64)
       feats = lookup.index_to_string(indices, mapping=mapping_strings)
-      data_flow_ops.tables_initializer().run()
+      lookup_ops.tables_initializer().run()
       self.assertAllEqual((b"hello", b"hello", b"UNK"), feats.eval())
 
       self.assertRaises(errors_impl.OpError,
-                        data_flow_ops.tables_initializer().run)
+                        lookup_ops.tables_initializer().run)
 
   def test_index_to_string_with_default_value(self):
     default_value = b"NONE"
@@ -1592,7 +1592,7 @@ class IndexToStringTest(test.TestCase):
           indices, mapping=mapping_strings, default_value=default_value)
       self.assertRaises(errors_impl.OpError, feats.eval)
 
-      data_flow_ops.tables_initializer().run()
+      lookup_ops.tables_initializer().run()
       self.assertAllEqual((b"salad", b"surgery", default_value), feats.eval())
 
 
@@ -1755,7 +1755,7 @@ class InitializeTableFromFileOpTest(test.TestCase):
           default_value,
           shared_name=shared_name)
 
-      data_flow_ops.tables_initializer().run()
+      lookup_ops.tables_initializer().run()
 
       input_string = constant_op.constant(["brain", "salad", "tank"])
 
@@ -2081,7 +2081,7 @@ class IdTableWithHashBucketsTest(test.TestCase):
           hasher_spec=lookup.StrongHashSpec((1, 2)),
           name="table2")
 
-      data_flow_ops.tables_initializer().run()
+      lookup_ops.tables_initializer().run()
 
       input_string = constant_op.constant(
           ["fruit", "brain", "salad", "surgery", "UNK"])
@@ -2167,7 +2167,7 @@ class IdTableWithHashBucketsTest(test.TestCase):
               default_value2),
           oov_buckets)
 
-      data_flow_ops.tables_initializer().run()
+      lookup_ops.tables_initializer().run()
 
       input_string_1 = constant_op.constant(
           ["brain", "salad", "surgery", "UNK"])
diff --git a/tensorflow/contrib/losses/__init__.py b/tensorflow/contrib/losses/__init__.py
index 9861ecc1f87a0f453e07b267f727f0c44439cd61..790bf61367d85b79bae4b153328b229b10721b38 100644
--- a/tensorflow/contrib/losses/__init__.py
+++ b/tensorflow/contrib/losses/__init__.py
@@ -22,10 +22,26 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-# pylint: disable=unused-import,wildcard-import
-from tensorflow.contrib.losses.python import losses
+# pylint: disable=wildcard-import
 from tensorflow.contrib.losses.python.losses import *
-# pylint: enable=unused-import,wildcard-import
+# pylint: enable=wildcard-import
 
 from tensorflow.python.util.all_util import remove_undocumented
-remove_undocumented(__name__, doc_string_modules=[losses])
+
+_allowed_symbols = [
+    'absolute_difference',
+    'add_loss',
+    'hinge_loss',
+    'compute_weighted_loss',
+    'cosine_distance',
+    'get_losses',
+    'get_regularization_losses',
+    'get_total_loss',
+    'log_loss',
+    'mean_pairwise_squared_error',
+    'mean_squared_error',
+    'sigmoid_cross_entropy',
+    'softmax_cross_entropy',
+    'sparse_softmax_cross_entropy',
+]
+remove_undocumented(__name__, _allowed_symbols)
diff --git a/tensorflow/contrib/losses/python/losses/__init__.py b/tensorflow/contrib/losses/python/losses/__init__.py
index 1b57f0baeef0c1e7016dcd95c725ee88566b1a9d..6e9d1d4a773b3a2c9b7b1accbb3ccb3000c8164a 100644
--- a/tensorflow/contrib/losses/python/losses/__init__.py
+++ b/tensorflow/contrib/losses/python/losses/__init__.py
@@ -12,127 +12,15 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""## Loss operations for use in neural networks.
+"""Ops for building neural network losses.
 
-Note: By default all the losses are collected into the `GraphKeys.LOSSES`
-collection.
-
-All of the loss functions take a pair of predictions and ground truth labels,
-from which the loss is computed. It is assumed that the shape of both these
-tensors is of the form [batch_size, d1, ... dN] where `batch_size` is the number
-of samples in the batch and `d1` ... `dN` are the remaining dimensions.
-
-It is common, when training with multiple loss functions, to adjust the relative
-strengths of individual losses. This is performed by rescaling the losses via
-a `weight` parameter passed to the loss functions. For example, if we were
-training with both log_loss and sum_of_squares_loss, and we wished that the
-log_loss penalty be twice as severe as the sum_of_squares_loss, we would
-implement this as:
-
-  # Explicitely set the weight.
-  tf.contrib.losses.log(predictions, labels, weight=2.0)
-
-  # Uses default weight of 1.0
-  tf.contrib.losses.sum_of_squares(predictions, labels)
-
-  # All the losses are collected into the `GraphKeys.LOSSES` collection.
-  losses = tf.get_collection(tf.GraphKeys.LOSSES)
-
-While specifying a scalar loss rescales the loss over the entire batch,
-we sometimes want to rescale the loss per batch sample. For example, if we have
-certain examples that matter more to us to get correctly, we might want to have
-a higher loss that other samples whose mistakes matter less. In this case, we
-can provide a weight vector of length `batch_size` which results in the loss
-for each sample in the batch being scaled by the corresponding weight element.
-For example, consider the case of a classification problem where we want to
-maximize our accuracy but we especially interested in obtaining high accuracy
-for a specific class:
-
-  inputs, labels = LoadData(batch_size=3)
-  logits = MyModelPredictions(inputs)
-
-  # Ensures that the loss for examples whose ground truth class is `3` is 5x
-  # higher than the loss for all other examples.
-  weight = tf.multiply(4, tf.cast(tf.equal(labels, 3), tf.float32)) + 1
-
-  onehot_labels = tf.one_hot(labels, num_classes=5)
-  tf.contrib.losses.softmax_cross_entropy(logits, onehot_labels, weight=weight)
-
-Finally, in certain cases, we may want to specify a different loss for every
-single measurable value. For example, if we are performing per-pixel depth
-prediction, or per-pixel denoising, a single batch sample has P values where P
-is the number of pixels in the image. For many losses, the number of measurable
-values matches the number of elements in the predictions and labels tensors.
-For others, such as softmax_cross_entropy and cosine_distance, the
-loss functions reduces the dimensions of the inputs to produces a tensor of
-losses for each measurable value. For example, softmax_cross_entropy takes as
-input predictions and labels of dimension [batch_size, num_classes] but the
-number of measurable values is [batch_size]. Consequently, when passing a weight
-tensor to specify a different loss for every measurable value, the dimension of
-the tensor will depend on the loss being used.
-
-For a concrete example, consider the case of per-pixel depth prediction where
-certain ground truth depth values are missing (due to sensor noise in the
-capture process). In this case, we want to assign zero weight to losses for
-these predictions.
-
-  # 'depths' that are missing have a value of 0:
-  images, depths = LoadData(...)
-  predictions = MyModelPredictions(images)
-
-  weight = tf.cast(tf.greater(depths, 0), tf.float32)
-  loss  = tf.contrib.losses.sum_of_squares(predictions, depths, weight)
-
-Note that when using weights for the losses, the final average is computed
-by rescaling the losses by the weights and then dividing by the total number of
-non-zero samples. For an arbitrary set of weights, this may not necessarily
-produce a weighted average. Instead, it simply and transparently rescales the
-per-element losses before averaging over the number of observations. For example
-if the losses computed by the loss function is an array [4, 1, 2, 3] and the
-weights are an array [1, 0.5, 3, 9], then the average loss is:
-
-  (4*1 + 1*0.5 + 2*3 + 3*9) / 4
-
-However, with a single loss function and an arbitrary set of weights, one can
-still easily create a loss function such that the resulting loss is a
-weighted average over the individual prediction errors:
-
-  images, labels = LoadData(...)
-  predictions = MyModelPredictions(images)
-
-  weight = MyComplicatedWeightingFunction(labels)
-  weight = tf.div(weight, tf.size(weight))
-  loss = tf.contrib.losses.sum_of_squares(predictions, depths, weight)
-
-@@absolute_difference
-@@add_loss
-@@hinge_loss
-@@compute_weighted_loss
-@@cosine_distance
-@@get_losses
-@@get_regularization_losses
-@@get_total_loss
-@@log_loss
-@@mean_pairwise_squared_error
-@@mean_squared_error
-@@sigmoid_cross_entropy
-@@softmax_cross_entropy
-@@sparse_softmax_cross_entropy
-
-The following are deprecated in favor of `mean_pairwise_squared_error` and
-`mean_squared_error`.
-@@sum_of_pairwise_squares
-@@sum_of_squares
+See @{$python/contrib.losses}.
 """
 
-
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-# pylint: disable=unused-import,wildcard-import
+# pylint: disable=wildcard-import
 from tensorflow.contrib.losses.python.losses.loss_ops import *
-from tensorflow.python.util.all_util import make_all
-# pylint: enable=unused-import,wildcard-import
-
-__all__ = make_all(__name__)
+# pylint: enable=wildcard-import
diff --git a/tensorflow/contrib/losses/python/losses/loss_ops.py b/tensorflow/contrib/losses/python/losses/loss_ops.py
index 5ca8c8a18bc08f06e3afdf97d11bdf44f72b1a9b..f6d3601c7dc6002673a7d056313939bf99cbaa44 100644
--- a/tensorflow/contrib/losses/python/losses/loss_ops.py
+++ b/tensorflow/contrib/losses/python/losses/loss_ops.py
@@ -236,7 +236,7 @@ def get_regularization_losses(scope=None):
     scope: an optional scope for filtering the losses to return.
 
   Returns:
-    A list of loss variables.
+    A list of regularization losses as Tensors.
   """
   return ops.get_collection(ops.GraphKeys.REGULARIZATION_LOSSES, scope)
 
diff --git a/tensorflow/contrib/makefile/Makefile b/tensorflow/contrib/makefile/Makefile
index 2b2e885689d63aa6bb9b7856f24625ebd0e303a4..305ed0d11ec11ef24971a47f6b4d7f3bb25f82b2 100644
--- a/tensorflow/contrib/makefile/Makefile
+++ b/tensorflow/contrib/makefile/Makefile
@@ -281,6 +281,10 @@ ifeq ($(TARGET),ANDROID)
 		CXXFLAGS += -DUSE_HEXAGON_LIBS
 	endif
 
+	ifdef ENABLE_EXPERIMENTAL_HEXNN_OPS
+		CXXFLAGS += -DENABLE_EXPERIMENTAL_HEXNN_OPS
+	endif
+
 endif  # ANDROID
 # LINT.ThenChange(//tensorflow/contrib/android/cmake/CMakeLists.txt)
 
@@ -293,7 +297,7 @@ ifeq ($(TARGET),IOS)
 	IPHONESIMULATOR_SYSROOT := $(shell xcrun --sdk iphonesimulator \
 	--show-sdk-path)
 	IOS_SDK_VERSION := $(shell xcrun --sdk iphoneos --show-sdk-version)
-	MIN_SDK_VERSION := 8.2
+	MIN_SDK_VERSION := 8.0
 # Override IOS_ARCH with ARMV7, ARMV7S, ARM64, or I386.
 	IOS_ARCH := X86_64
 	ifeq ($(IOS_ARCH),ARMV7)
diff --git a/tensorflow/contrib/makefile/README.md b/tensorflow/contrib/makefile/README.md
index f061b58775e87a56c745b0cfb1c7c4bc5cbcde4c..9ba5c035a269e4a76a7f6214394c6577ed6a6471 100644
--- a/tensorflow/contrib/makefile/README.md
+++ b/tensorflow/contrib/makefile/README.md
@@ -295,7 +295,7 @@ itself, you'll see it's broken up into host and target sections. If you are
 cross-compiling, you should look at customizing the target settings to match
 what you need for your desired system.
 
-## Dependency Managment
+## Dependency Management
 
 The Makefile loads in a list of dependencies stored in text files. These files
 are generated from the main Bazel build by running 
diff --git a/tensorflow/contrib/makefile/build_all_android.sh b/tensorflow/contrib/makefile/build_all_android.sh
index 2f3649dac3a958534715bdbe5bf8e4983a835f16..161f2df5b27044971c6fd7e13c321c95e0ab4d02 100755
--- a/tensorflow/contrib/makefile/build_all_android.sh
+++ b/tensorflow/contrib/makefile/build_all_android.sh
@@ -19,6 +19,7 @@ set -e
 
 usage() {
   echo "Usage: NDK_ROOT=<path to ndk root> $(basename "$0") [-s:t:Tx:X]"
+  echo "-E enable experimental hexnn ops"
   echo "-s [sub_makefiles] sub makefiles separated by white space"
   echo "-t [build_target] build target for Android makefile [default=all]"
   echo "-T only build tensorflow"
@@ -31,8 +32,9 @@ if [[ -z "${NDK_ROOT}" ]]; then
     exit 1
 fi
 
-while getopts "s:t:Tx:" opt_name; do
+while getopts "Es:t:Tx:" opt_name; do
   case "$opt_name" in
+    E) ENABLE_EXPERIMENTAL_HEXNN_OPS="true";;
     s) SUB_MAKEFILES="${OPTARG}";;
     t) BUILD_TARGET="${OPTARG}";;
     T) ONLY_MAKE_TENSORFLOW="true";;
@@ -83,16 +85,20 @@ if [[ "${USE_HEXAGON}" == "true" ]]; then
     HEXAGON_INCLUDE=$(cd "tensorflow/core/platform/hexagon" >/dev/null && pwd)
 fi
 
+if [[ "${ENABLE_EXPERIMENTAL_HEXNN_OPS}" == "true" ]]; then
+    EXTRA_MAKE_ARGS+=("ENABLE_EXPERIMENTAL_HEXNN_OPS=true")
+fi
+
 if [[ -z "${BUILD_TARGET}" ]]; then
     make -j"${JOB_COUNT}" -f tensorflow/contrib/makefile/Makefile \
          TARGET=ANDROID NDK_ROOT="${NDK_ROOT}" CC_PREFIX="${CC_PREFIX}" \
 HEXAGON_LIBS="${HEXAGON_LIBS}" HEXAGON_INCLUDE="${HEXAGON_INCLUDE}" \
-SUB_MAKEFILES="${SUB_MAKEFILES}"
+SUB_MAKEFILES="${SUB_MAKEFILES}" ${EXTRA_MAKE_ARGS[@]}
 else
     # BUILD_TARGET explicitly uncommented to allow multiple targets to be
     # passed to make in a single build_all_android.sh invocation.
     make -j"${JOB_COUNT}" -f tensorflow/contrib/makefile/Makefile \
          TARGET=ANDROID NDK_ROOT="${NDK_ROOT}" CC_PREFIX="${CC_PREFIX}" \
 HEXAGON_LIBS="${HEXAGON_LIBS}" HEXAGON_INCLUDE="${HEXAGON_INCLUDE}" \
-SUB_MAKEFILES="${SUB_MAKEFILES}" ${BUILD_TARGET}
+SUB_MAKEFILES="${SUB_MAKEFILES}" ${EXTRA_MAKE_ARGS[@]} ${BUILD_TARGET}
 fi
diff --git a/tensorflow/contrib/makefile/build_helper.subr b/tensorflow/contrib/makefile/build_helper.subr
index f0452944e23d308b86ad382995a296b6055e0b7d..d58b2c0a9be80da28159e869b0ed2c331e2f0191 100644
--- a/tensorflow/contrib/makefile/build_helper.subr
+++ b/tensorflow/contrib/makefile/build_helper.subr
@@ -31,7 +31,7 @@ get_cpu_count() {
 }
 
 get_job_count() {
-  echo $(($(get_cpu_count) * 2))
+  echo $(($(get_cpu_count)))
 }
 
 make_host_protoc() {
diff --git a/tensorflow/contrib/makefile/compile_ios_protobuf.sh b/tensorflow/contrib/makefile/compile_ios_protobuf.sh
index 12f34b38d0875864e0514ce51afab197a4ab8a50..d1012a6c9351eddcc306fb5261e872230aaf6deb 100755
--- a/tensorflow/contrib/makefile/compile_ios_protobuf.sh
+++ b/tensorflow/contrib/makefile/compile_ios_protobuf.sh
@@ -41,7 +41,7 @@ IPHONEOS_SYSROOT=`xcrun --sdk iphoneos --show-sdk-path`
 IPHONESIMULATOR_PLATFORM=`xcrun --sdk iphonesimulator --show-sdk-platform-path`
 IPHONESIMULATOR_SYSROOT=`xcrun --sdk iphonesimulator --show-sdk-path`
 IOS_SDK_VERSION=`xcrun --sdk iphoneos --show-sdk-version`
-MIN_SDK_VERSION=8.2
+MIN_SDK_VERSION=8.0
 
 CFLAGS="-DNDEBUG -Os -pipe -fPIC -fno-exceptions"
 CXXFLAGS="${CFLAGS} -std=c++11 -stdlib=libc++"
diff --git a/tensorflow/contrib/makefile/download_dependencies.sh b/tensorflow/contrib/makefile/download_dependencies.sh
index 58d1dc6f0afd22d894e2bfb5fd9dc2bf5554f953..f123111df84fa59e8c0da94329f8c0103f88a0ed 100755
--- a/tensorflow/contrib/makefile/download_dependencies.sh
+++ b/tensorflow/contrib/makefile/download_dependencies.sh
@@ -30,9 +30,13 @@ RE2_URL="$(grep -o 'http.*github.com/google/re2/.*tar\.gz' "${BZL_FILE_PATH}" |
 replace_by_sed() {
   local regex="${1}"
   shift
-  if echo "${OSTYPE}" | grep -q darwin; then
+  # Detect the version of sed by the return value of "--version" flag. GNU-sed
+  # supports "--version" while BSD-sed doesn't.
+  if ! sed --version >/dev/null 2>&1; then
+    # BSD-sed.
     sed -i '' -e "${regex}" "$@"
   else
+    # GNU-sed.
     sed -i -e "${regex}" "$@"
   fi
 }
diff --git a/tensorflow/contrib/makefile/proto_text_pb_cc_files.txt b/tensorflow/contrib/makefile/proto_text_pb_cc_files.txt
index c0969e6dee2553a46fb98d6c1327f2629a64c7fd..5ade8942af39f1d308c5f6e308e1cee754510926 100644
--- a/tensorflow/contrib/makefile/proto_text_pb_cc_files.txt
+++ b/tensorflow/contrib/makefile/proto_text_pb_cc_files.txt
@@ -7,9 +7,11 @@ tensorflow/core/protobuf/saver.pb.cc
 tensorflow/core/protobuf/queue_runner.pb.cc
 tensorflow/core/protobuf/named_tensor.pb.cc
 tensorflow/core/protobuf/meta_graph.pb.cc
+tensorflow/core/protobuf/cluster.pb.cc
 tensorflow/core/protobuf/config.pb.cc
 tensorflow/core/protobuf/rewriter_config.pb.cc
 tensorflow/core/protobuf/debug.pb.cc
+tensorflow/core/protobuf/device_properties.pb.cc
 tensorflow/core/lib/core/error_codes.pb.cc
 tensorflow/core/framework/versions.pb.cc
 tensorflow/core/framework/variable.pb.cc
@@ -35,3 +37,4 @@ tensorflow/core/framework/attr_value.pb.cc
 tensorflow/core/framework/allocation_description.pb.cc
 tensorflow/core/example/feature.pb.cc
 tensorflow/core/example/example.pb.cc
+tensorflow/core/grappler/costs/op_performance_data.pb.cc
diff --git a/tensorflow/contrib/makefile/proto_text_pb_h_files.txt b/tensorflow/contrib/makefile/proto_text_pb_h_files.txt
index 132b4775962aaef478f90bd254702015ba498cd6..1f0ad06cdc5b98ae9c08ea63dad70eb02b6ef46b 100644
--- a/tensorflow/contrib/makefile/proto_text_pb_h_files.txt
+++ b/tensorflow/contrib/makefile/proto_text_pb_h_files.txt
@@ -7,8 +7,10 @@ tensorflow/core/protobuf/saver.pb.h
 tensorflow/core/protobuf/queue_runner.pb.h
 tensorflow/core/protobuf/named_tensor.pb.h
 tensorflow/core/protobuf/meta_graph.pb.h
+tensorflow/core/protobuf/cluster.pb.h
 tensorflow/core/protobuf/config.pb.h
 tensorflow/core/protobuf/debug.pb.h
+tensorflow/core/protobuf/device_properties.pb.h
 tensorflow/core/protobuf/rewriter_config.pb.h
 tensorflow/core/protobuf/tensor_bundle.pb.h
 tensorflow/core/lib/core/error_codes.pb.h
@@ -36,3 +38,4 @@ tensorflow/core/framework/attr_value.pb.h
 tensorflow/core/framework/allocation_description.pb.h
 tensorflow/core/example/feature.pb.h
 tensorflow/core/example/example.pb.h
+tensorflow/core/grappler/costs/op_performance_data.pb.h
diff --git a/tensorflow/contrib/makefile/samples/build_and_run_inception_hexagon.sh b/tensorflow/contrib/makefile/samples/build_and_run_inception_hexagon.sh
index 8ec2c8d034ee1d0929d4ed74745699ac13150374..861bb885c7031b996b48dbc50887cfce55c638f3 100755
--- a/tensorflow/contrib/makefile/samples/build_and_run_inception_hexagon.sh
+++ b/tensorflow/contrib/makefile/samples/build_and_run_inception_hexagon.sh
@@ -22,6 +22,7 @@ usage() {
   echo "Optional: NNLIB_DIR=<path to downloaded nnlib dir>"
   echo "-b build only"
   echo "-c test count"
+  echo "-E enable experimental hexnn ops"
   echo "-p use prebuilt hexagon binaries"
   echo "-s skip download if files already exist"
   exit 1
@@ -30,10 +31,11 @@ usage() {
 TEST_COUNT=1
 SKIP_DOWNLOAD_IF_EXIST=false
 
-while getopts "bc:ps" opt_name; do
+while getopts "bc:Eps" opt_name; do
   case "$opt_name" in
-    c) TEST_COUNT="${OPTARG}";;
     b) BUILD_ONLY="true";;
+    c) TEST_COUNT="${OPTARG}";;
+    E) ENABLE_EXPERIMENTAL_HEXNN_OPS="true";;
     p) USE_PREBUILT_HEXAOGON_BINARIES="true";;
     s) SKIP_DOWNLOAD_IF_EXIST="true";;
     *) usage;;
@@ -158,7 +160,11 @@ fi
 if [[ -d "${TF_ROOT_DIR}/tensorflow/contrib/makefile/gen/protobuf" &&
       -d "${TF_ROOT_DIR}/tensorflow/contrib/makefile/gen/protobuf-host" ]]; then
     echo "generated protobuf and protobuf-host found."
-    extra_args+=("-T")
+    EXTRA_ARGS+=("-T")
+fi
+
+if [[ "${ENABLE_EXPERIMENTAL_HEXNN_OPS}" == "true" ]]; then
+    EXTRA_ARGS+=("-E")
 fi
 
 if [[ -z "${CC_PREFIX}" ]]; then
@@ -168,7 +174,7 @@ fi
 CC_PREFIX=${CC_PREFIX} NDK_ROOT=${NDK_ROOT} "${BUILD_ALL_ANDROID_PATH}" \
 -x "${GEN_LIBS_DIR}" \
 -s "${TF_ROOT_DIR}/tensorflow/contrib/makefile/sub_makefiles/hexagon_graph_execution/Makefile.in" \
--t "hexagon_graph_execution" ${extra_args[@]}
+-t "hexagon_graph_execution" ${EXTRA_ARGS[@]}
 
 echo "Download and push inception image"
 HEXAGON_DOWNLOAD_PATH=\
diff --git a/tensorflow/contrib/makefile/sub_makefiles/hexagon_graph_execution/Makefile.in b/tensorflow/contrib/makefile/sub_makefiles/hexagon_graph_execution/Makefile.in
index ccbbfa41324657cc7a9b812339fa5956270d36e3..2a6f66edcb72f10fe44ff1b8351bedca6a72d52e 100644
--- a/tensorflow/contrib/makefile/sub_makefiles/hexagon_graph_execution/Makefile.in
+++ b/tensorflow/contrib/makefile/sub_makefiles/hexagon_graph_execution/Makefile.in
@@ -47,7 +47,6 @@ GRAPH_TRANSFER_SRCS := \
 tensorflow/cc/framework/scope.cc \
 tensorflow/cc/framework/ops.cc \
 tensorflow/cc/ops/const_op.cc \
-tensorflow/core/kernels/function_ops.cc \
 tensorflow/core/kernels/hexagon/graph_transfer_utils.cc \
 tensorflow/core/kernels/hexagon/graph_transferer.cc \
 tensorflow/core/kernels/hexagon/hexagon_control_wrapper.cc \
diff --git a/tensorflow/contrib/makefile/tf_op_files.txt b/tensorflow/contrib/makefile/tf_op_files.txt
index 974d3f3287a9e2f8aeae988ebc00ee8348fc1e0c..2143f3b925207558a0a26aa42a95573cc8b0a8fd 100644
--- a/tensorflow/contrib/makefile/tf_op_files.txt
+++ b/tensorflow/contrib/makefile/tf_op_files.txt
@@ -4,6 +4,7 @@ tensorflow/core/kernels/variable_ops.cc
 tensorflow/core/kernels/unpack_op.cc
 tensorflow/core/kernels/transpose_op.cc
 tensorflow/core/kernels/transpose_functor_cpu.cc
+tensorflow/core/kernels/training_op_helpers.cc
 tensorflow/core/kernels/training_ops.cc
 tensorflow/core/kernels/topk_op.cc
 tensorflow/core/kernels/tile_ops.cc
@@ -73,6 +74,7 @@ tensorflow/core/kernels/reduction_ops_mean.cc
 tensorflow/core/kernels/reduction_ops_max.cc
 tensorflow/core/kernels/reduction_ops_common.cc
 tensorflow/core/kernels/reduction_ops_any.cc
+tensorflow/core/kernels/reduction_ops_all.cc
 tensorflow/core/kernels/queue_ops.cc
 tensorflow/core/kernels/queue_base.cc
 tensorflow/core/kernels/pooling_ops_common.cc
@@ -137,12 +139,15 @@ tensorflow/core/kernels/cwise_op_less.cc
 tensorflow/core/kernels/cwise_op_isfinite.cc
 tensorflow/core/kernels/cwise_op_greater_equal.cc
 tensorflow/core/kernels/cwise_op_greater.cc
+tensorflow/core/kernels/cwise_op_floor_div.cc
+tensorflow/core/kernels/cwise_op_floor.cc
 tensorflow/core/kernels/cwise_op_exp.cc
 tensorflow/core/kernels/cwise_op_equal_to_2.cc
 tensorflow/core/kernels/cwise_op_equal_to_1.cc
 tensorflow/core/kernels/cwise_op_div.cc
 tensorflow/core/kernels/cwise_op_add_2.cc
 tensorflow/core/kernels/cwise_op_add_1.cc
+tensorflow/core/kernels/cwise_op_abs.cc
 tensorflow/core/kernels/ctc_decoder_ops.cc
 tensorflow/core/kernels/crop_and_resize_op.cc
 tensorflow/core/kernels/conv_ops_using_gemm.cc
@@ -224,4 +229,4 @@ tensorflow/core/ops/array_grad.cc
 tensorflow/core/kernels/spacetobatch_functor.cc
 tensorflow/core/kernels/spacetobatch_op.cc
 tensorflow/core/kernels/batchtospace_op.cc
-
+tensorflow/core/kernels/warn_about_ints.cc
diff --git a/tensorflow/contrib/makefile/tf_pb_text_files.txt b/tensorflow/contrib/makefile/tf_pb_text_files.txt
index f1da05e4c6ec197ba5e4a8fe0296de5464e3fe92..c39257ffa91fef184e8bd5258b19c4323a1b7fe0 100644
--- a/tensorflow/contrib/makefile/tf_pb_text_files.txt
+++ b/tensorflow/contrib/makefile/tf_pb_text_files.txt
@@ -1,6 +1,7 @@
 tensorflow/core/util/saved_tensor_slice.pb_text.cc
 tensorflow/core/util/memmapped_file_system.pb_text.cc
 tensorflow/core/protobuf/saver.pb_text.cc
+tensorflow/core/protobuf/cluster.pb_text.cc
 tensorflow/core/protobuf/config.pb_text.cc
 tensorflow/core/protobuf/debug.pb_text.cc
 tensorflow/core/protobuf/rewriter_config.pb_text.cc
diff --git a/tensorflow/contrib/makefile/tf_proto_files.txt b/tensorflow/contrib/makefile/tf_proto_files.txt
index 2a78ea610166410c8b4a899786c5b021ddebdba3..36d9cb74a704172a44e77952d021cab671806b03 100644
--- a/tensorflow/contrib/makefile/tf_proto_files.txt
+++ b/tensorflow/contrib/makefile/tf_proto_files.txt
@@ -7,8 +7,10 @@ tensorflow/core/protobuf/saver.proto
 tensorflow/core/protobuf/queue_runner.proto
 tensorflow/core/protobuf/named_tensor.proto
 tensorflow/core/protobuf/meta_graph.proto
+tensorflow/core/protobuf/cluster.proto
 tensorflow/core/protobuf/config.proto
 tensorflow/core/protobuf/debug.proto
+tensorflow/core/protobuf/device_properties.proto
 tensorflow/core/protobuf/rewriter_config.proto
 tensorflow/core/protobuf/tensor_bundle.proto
 tensorflow/core/lib/core/error_codes.proto
diff --git a/tensorflow/contrib/metrics/python/ops/metric_ops.py b/tensorflow/contrib/metrics/python/ops/metric_ops.py
index d57203c042de0ca0a71540a77f97b925d838db9b..727cdd9597a6267702f705497ac6af6819a51e6a 100644
--- a/tensorflow/contrib/metrics/python/ops/metric_ops.py
+++ b/tensorflow/contrib/metrics/python/ops/metric_ops.py
@@ -1338,6 +1338,87 @@ def streaming_sparse_precision_at_top_k(top_k_predictions,
         name=name_scope)
 
 
+def sparse_recall_at_top_k(labels,
+                           top_k_predictions,
+                           class_id=None,
+                           weights=None,
+                           metrics_collections=None,
+                           updates_collections=None,
+                           name=None):
+  """Computes recall@k of top-k predictions with respect to sparse labels.
+
+  If `class_id` is specified, we calculate recall by considering only the
+      entries in the batch for which `class_id` is in the label, and computing
+      the fraction of them for which `class_id` is in the top-k `predictions`.
+  If `class_id` is not specified, we'll calculate recall as how often on
+      average a class among the labels of a batch entry is in the top-k
+      `predictions`.
+
+  `sparse_recall_at_top_k` creates two local variables, `true_positive_at_<k>`
+  and `false_negative_at_<k>`, that are used to compute the recall_at_k
+  frequency. This frequency is ultimately returned as `recall_at_<k>`: an
+  idempotent operation that simply divides `true_positive_at_<k>` by total
+  (`true_positive_at_<k>` + `false_negative_at_<k>`).
+
+  For estimation of the metric over a stream of data, the function creates an
+  `update_op` operation that updates these variables and returns the
+  `recall_at_<k>`. Set operations applied to `top_k` and `labels` calculate the
+  true positives and false negatives weighted by `weights`. Then `update_op`
+  increments `true_positive_at_<k>` and `false_negative_at_<k>` using these
+  values.
+
+  If `weights` is `None`, weights default to 1. Use weights of 0 to mask values.
+
+  Args:
+    labels: `int64` `Tensor` or `SparseTensor` with shape
+      [D1, ... DN, num_labels], where N >= 1 and num_labels is the number of
+      target classes for the associated prediction. Commonly, N=1 and `labels`
+      has shape [batch_size, num_labels]. [D1, ... DN] must match
+      `top_k_predictions`. Values should be in range [0, num_classes), where
+      num_classes is the last dimension of `predictions`. Values outside this
+      range always count towards `false_negative_at_<k>`.
+    top_k_predictions: Integer `Tensor` with shape [D1, ... DN, k] where
+      N >= 1. Commonly, N=1 and top_k_predictions has shape [batch size, k].
+      The final dimension contains the indices of top-k labels. [D1, ... DN]
+      must match `labels`.
+    class_id: Integer class ID for which we want binary metrics. This should be
+      in range [0, num_classes), where num_classes is the last dimension of
+      `predictions`. If class_id is outside this range, the method returns NAN.
+    weights: `Tensor` whose rank is either 0, or n-1, where n is the rank of
+      `labels`. If the latter, it must be broadcastable to `labels` (i.e., all
+      dimensions must be either `1`, or the same as the corresponding `labels`
+      dimension).
+    metrics_collections: An optional list of collections that values should
+      be added to.
+    updates_collections: An optional list of collections that updates should
+      be added to.
+    name: Name of new update operation, and namespace for other dependent ops.
+
+  Returns:
+    recall: Scalar `float64` `Tensor` with the value of `true_positives` divided
+      by the sum of `true_positives` and `false_negatives`.
+    update_op: `Operation` that increments `true_positives` and
+      `false_negatives` variables appropriately, and whose value matches
+      `recall`.
+
+  Raises:
+    ValueError: If `weights` is not `None` and its shape doesn't match
+    `predictions`, or if either `metrics_collections` or `updates_collections`
+    are not a list or tuple.
+  """
+  default_name = _at_k_name('recall', class_id=class_id)
+  with ops.name_scope(name, default_name, (top_k_predictions, labels,
+                                           weights)) as name_scope:
+    return metrics_impl._sparse_recall_at_top_k(  # pylint: disable=protected-access
+        labels=labels,
+        predictions_idx=top_k_predictions,
+        class_id=class_id,
+        weights=weights,
+        metrics_collections=metrics_collections,
+        updates_collections=updates_collections,
+        name=name_scope)
+
+
 def streaming_sparse_average_precision_at_k(predictions,
                                             labels,
                                             k,
@@ -2288,6 +2369,7 @@ def _remove_squeezable_dimensions(predictions, labels, weights):
 __all__ = [
     'aggregate_metric_map',
     'aggregate_metrics',
+    'sparse_recall_at_top_k',
     'streaming_accuracy',
     'streaming_auc',
     'streaming_false_negatives',
@@ -2310,7 +2392,9 @@ __all__ = [
     'streaming_root_mean_squared_error',
     'streaming_sensitivity_at_specificity',
     'streaming_sparse_average_precision_at_k',
+    'streaming_sparse_average_precision_at_top_k',
     'streaming_sparse_precision_at_k',
+    'streaming_sparse_precision_at_top_k',
     'streaming_sparse_recall_at_k',
     'streaming_specificity_at_sensitivity',
     'streaming_true_negatives',
diff --git a/tensorflow/contrib/metrics/python/ops/metric_ops_test.py b/tensorflow/contrib/metrics/python/ops/metric_ops_test.py
index b960e1310ecc2a1dfd18beef56204d0f60893126..f42e974e238c4a44a45326a915ee4d2aa9b83bfe 100644
--- a/tensorflow/contrib/metrics/python/ops/metric_ops_test.py
+++ b/tensorflow/contrib/metrics/python/ops/metric_ops_test.py
@@ -2958,8 +2958,38 @@ class StreamingSparseRecallTest(test.TestCase):
         self.assertEqual(expected, update.eval())
         self.assertEqual(expected, metric.eval())
 
+  def _test_sparse_recall_at_top_k(self,
+                                   labels,
+                                   top_k_predictions,
+                                   expected,
+                                   class_id=None,
+                                   weights=None):
+    with ops.Graph().as_default() as g, self.test_session(g):
+      if weights is not None:
+        weights = constant_op.constant(weights, dtypes_lib.float32)
+      metric, update = metric_ops.sparse_recall_at_top_k(
+          labels=labels,
+          top_k_predictions=constant_op.constant(top_k_predictions,
+                                                 dtypes_lib.int32),
+          class_id=class_id,
+          weights=weights)
+
+      # Fails without initialized vars.
+      self.assertRaises(errors_impl.OpError, metric.eval)
+      self.assertRaises(errors_impl.OpError, update.eval)
+      variables.variables_initializer(variables.local_variables()).run()
+
+      # Run per-step op and assert expected values.
+      if math.isnan(expected):
+        self.assertTrue(math.isnan(update.eval()))
+        self.assertTrue(math.isnan(metric.eval()))
+      else:
+        self.assertEqual(expected, update.eval())
+        self.assertEqual(expected, metric.eval())
+
   def test_one_label_at_k1_nan(self):
     predictions = [[0.1, 0.3, 0.2, 0.4], [0.1, 0.2, 0.3, 0.4]]
+    top_k_predictions = [[3], [3]]
     sparse_labels = _binary_2d_label_to_sparse_value(
         [[0, 0, 0, 1], [0, 0, 1, 0]])
     dense_labels = np.array([[3], [2]], dtype=np.int64)
@@ -2970,9 +3000,12 @@ class StreamingSparseRecallTest(test.TestCase):
       for class_id in (-1, 0, 1, 4):
         self._test_streaming_sparse_recall_at_k(
             predictions, labels, k=1, expected=NAN, class_id=class_id)
+        self._test_sparse_recall_at_top_k(
+            labels, top_k_predictions, expected=NAN, class_id=class_id)
 
   def test_one_label_at_k1_no_predictions(self):
     predictions = [[0.1, 0.3, 0.2, 0.4], [0.1, 0.2, 0.3, 0.4]]
+    top_k_predictions = [[3], [3]]
     sparse_labels = _binary_2d_label_to_sparse_value(
         [[0, 0, 0, 1], [0, 0, 1, 0]])
     dense_labels = np.array([[3], [2]], dtype=np.int64)
@@ -2981,9 +3014,12 @@ class StreamingSparseRecallTest(test.TestCase):
       # Class 2: 0 predictions.
       self._test_streaming_sparse_recall_at_k(
           predictions, labels, k=1, expected=0.0, class_id=2)
+      self._test_sparse_recall_at_top_k(
+          labels, top_k_predictions, expected=0.0, class_id=2)
 
   def test_one_label_at_k1(self):
     predictions = [[0.1, 0.3, 0.2, 0.4], [0.1, 0.2, 0.3, 0.4]]
+    top_k_predictions = [[3], [3]]
     sparse_labels = _binary_2d_label_to_sparse_value(
         [[0, 0, 0, 1], [0, 0, 1, 0]])
     dense_labels = np.array([[3], [2]], dtype=np.int64)
@@ -2992,13 +3028,18 @@ class StreamingSparseRecallTest(test.TestCase):
       # Class 3: 1 label, 2 predictions, 1 correct.
       self._test_streaming_sparse_recall_at_k(
           predictions, labels, k=1, expected=1.0 / 1, class_id=3)
+      self._test_sparse_recall_at_top_k(
+          labels, top_k_predictions, expected=1.0 / 1, class_id=3)
 
       # All classes: 2 labels, 2 predictions, 1 correct.
       self._test_streaming_sparse_recall_at_k(
           predictions, labels, k=1, expected=1.0 / 2)
+      self._test_sparse_recall_at_top_k(
+          labels, top_k_predictions, expected=1.0 / 2)
 
   def test_one_label_at_k1_weighted(self):
     predictions = [[0.1, 0.3, 0.2, 0.4], [0.1, 0.2, 0.3, 0.4]]
+    top_k_predictions = [[3], [3]]
     sparse_labels = _binary_2d_label_to_sparse_value(
         [[0, 0, 0, 1], [0, 0, 1, 0]])
     dense_labels = np.array([[3], [2]], dtype=np.int64)
@@ -3007,6 +3048,8 @@ class StreamingSparseRecallTest(test.TestCase):
       # Class 3: 1 label, 2 predictions, 1 correct.
       self._test_streaming_sparse_recall_at_k(
           predictions, labels, k=1, expected=NAN, class_id=3, weights=(0.0,))
+      self._test_sparse_recall_at_top_k(
+          labels, top_k_predictions, expected=NAN, class_id=3, weights=(0.0,))
       self._test_streaming_sparse_recall_at_k(
           predictions,
           labels,
@@ -3014,6 +3057,12 @@ class StreamingSparseRecallTest(test.TestCase):
           expected=1.0 / 1,
           class_id=3,
           weights=(1.0,))
+      self._test_sparse_recall_at_top_k(
+          labels,
+          top_k_predictions,
+          expected=1.0 / 1,
+          class_id=3,
+          weights=(1.0,))
       self._test_streaming_sparse_recall_at_k(
           predictions,
           labels,
@@ -3021,6 +3070,12 @@ class StreamingSparseRecallTest(test.TestCase):
           expected=1.0 / 1,
           class_id=3,
           weights=(2.0,))
+      self._test_sparse_recall_at_top_k(
+          labels,
+          top_k_predictions,
+          expected=1.0 / 1,
+          class_id=3,
+          weights=(2.0,))
       self._test_streaming_sparse_recall_at_k(
           predictions,
           labels,
@@ -3028,6 +3083,12 @@ class StreamingSparseRecallTest(test.TestCase):
           expected=NAN,
           class_id=3,
           weights=(0.0, 0.0))
+      self._test_sparse_recall_at_top_k(
+          labels,
+          top_k_predictions,
+          expected=NAN,
+          class_id=3,
+          weights=(0.0, 0.0))
       self._test_streaming_sparse_recall_at_k(
           predictions,
           labels,
@@ -3035,6 +3096,12 @@ class StreamingSparseRecallTest(test.TestCase):
           expected=NAN,
           class_id=3,
           weights=(0.0, 1.0))
+      self._test_sparse_recall_at_top_k(
+          labels,
+          top_k_predictions,
+          expected=NAN,
+          class_id=3,
+          weights=(0.0, 1.0))
       self._test_streaming_sparse_recall_at_k(
           predictions,
           labels,
@@ -3042,6 +3109,12 @@ class StreamingSparseRecallTest(test.TestCase):
           expected=1.0 / 1,
           class_id=3,
           weights=(1.0, 0.0))
+      self._test_sparse_recall_at_top_k(
+          labels,
+          top_k_predictions,
+          expected=1.0 / 1,
+          class_id=3,
+          weights=(1.0, 0.0))
       self._test_streaming_sparse_recall_at_k(
           predictions,
           labels,
@@ -3049,6 +3122,12 @@ class StreamingSparseRecallTest(test.TestCase):
           expected=1.0 / 1,
           class_id=3,
           weights=(1.0, 1.0))
+      self._test_sparse_recall_at_top_k(
+          labels,
+          top_k_predictions,
+          expected=1.0 / 1,
+          class_id=3,
+          weights=(1.0, 1.0))
       self._test_streaming_sparse_recall_at_k(
           predictions,
           labels,
@@ -3056,6 +3135,12 @@ class StreamingSparseRecallTest(test.TestCase):
           expected=2.0 / 2,
           class_id=3,
           weights=(2.0, 3.0))
+      self._test_sparse_recall_at_top_k(
+          labels,
+          top_k_predictions,
+          expected=2.0 / 2,
+          class_id=3,
+          weights=(2.0, 3.0))
       self._test_streaming_sparse_recall_at_k(
           predictions,
           labels,
@@ -3063,6 +3148,12 @@ class StreamingSparseRecallTest(test.TestCase):
           expected=3.0 / 3,
           class_id=3,
           weights=(3.0, 2.0))
+      self._test_sparse_recall_at_top_k(
+          labels,
+          top_k_predictions,
+          expected=3.0 / 3,
+          class_id=3,
+          weights=(3.0, 2.0))
       self._test_streaming_sparse_recall_at_k(
           predictions,
           labels,
@@ -3070,6 +3161,12 @@ class StreamingSparseRecallTest(test.TestCase):
           expected=0.3 / 0.3,
           class_id=3,
           weights=(0.3, 0.6))
+      self._test_sparse_recall_at_top_k(
+          labels,
+          top_k_predictions,
+          expected=0.3 / 0.3,
+          class_id=3,
+          weights=(0.3, 0.6))
       self._test_streaming_sparse_recall_at_k(
           predictions,
           labels,
@@ -3077,32 +3174,70 @@ class StreamingSparseRecallTest(test.TestCase):
           expected=0.6 / 0.6,
           class_id=3,
           weights=(0.6, 0.3))
+      self._test_sparse_recall_at_top_k(
+          labels,
+          top_k_predictions,
+          expected=0.6 / 0.6,
+          class_id=3,
+          weights=(0.6, 0.3))
 
       # All classes: 2 labels, 2 predictions, 1 correct.
       self._test_streaming_sparse_recall_at_k(
           predictions, labels, k=1, expected=NAN, weights=(0.0,))
+      self._test_sparse_recall_at_top_k(
+          labels, top_k_predictions, expected=NAN, weights=(0.0,))
       self._test_streaming_sparse_recall_at_k(
           predictions, labels, k=1, expected=1.0 / 2, weights=(1.0,))
+      self._test_sparse_recall_at_top_k(
+          labels, top_k_predictions, expected=1.0 / 2, weights=(1.0,))
+
       self._test_streaming_sparse_recall_at_k(
           predictions, labels, k=1, expected=1.0 / 2, weights=(2.0,))
+      self._test_sparse_recall_at_top_k(
+          labels, top_k_predictions, expected=1.0 / 2, weights=(2.0,))
+
       self._test_streaming_sparse_recall_at_k(
           predictions, labels, k=1, expected=1.0 / 1, weights=(1.0, 0.0))
+      self._test_sparse_recall_at_top_k(
+          labels, top_k_predictions, expected=1.0 / 1, weights=(1.0, 0.0))
+
       self._test_streaming_sparse_recall_at_k(
           predictions, labels, k=1, expected=0.0 / 1, weights=(0.0, 1.0))
+      self._test_sparse_recall_at_top_k(
+          labels, top_k_predictions, expected=0.0 / 1, weights=(0.0, 1.0))
+
       self._test_streaming_sparse_recall_at_k(
           predictions, labels, k=1, expected=1.0 / 2, weights=(1.0, 1.0))
+      self._test_sparse_recall_at_top_k(
+          labels, top_k_predictions, expected=1.0 / 2, weights=(1.0, 1.0))
+
       self._test_streaming_sparse_recall_at_k(
           predictions, labels, k=1, expected=2.0 / 5, weights=(2.0, 3.0))
+      self._test_sparse_recall_at_top_k(
+          labels, top_k_predictions, expected=2.0 / 5, weights=(2.0, 3.0))
+
       self._test_streaming_sparse_recall_at_k(
           predictions, labels, k=1, expected=3.0 / 5, weights=(3.0, 2.0))
+      self._test_sparse_recall_at_top_k(
+          labels, top_k_predictions, expected=3.0 / 5, weights=(3.0, 2.0))
+
       self._test_streaming_sparse_recall_at_k(
           predictions, labels, k=1, expected=0.3 / 0.9, weights=(0.3, 0.6))
+      self._test_sparse_recall_at_top_k(
+          labels, top_k_predictions, expected=0.3 / 0.9, weights=(0.3, 0.6))
+
       self._test_streaming_sparse_recall_at_k(
           predictions, labels, k=1, expected=0.6 / 0.9, weights=(0.6, 0.3))
+      self._test_sparse_recall_at_top_k(
+          labels, top_k_predictions, expected=0.6 / 0.9, weights=(0.6, 0.3))
 
   def test_three_labels_at_k5_nan(self):
     predictions = [[0.5, 0.1, 0.6, 0.3, 0.8, 0.0, 0.7, 0.2, 0.4, 0.9],
                    [0.3, 0.0, 0.7, 0.2, 0.4, 0.9, 0.5, 0.8, 0.1, 0.6]]
+    top_k_predictions = [
+        [9, 4, 6, 2, 0],
+        [5, 7, 2, 9, 6],
+    ]
     sparse_labels = _binary_2d_label_to_sparse_value(
         [[0, 0, 1, 0, 0, 0, 0, 1, 1, 0], [0, 1, 1, 0, 0, 1, 0, 0, 0, 0]])
     dense_labels = np.array([[2, 7, 8], [1, 2, 5]], dtype=np.int64)
@@ -3112,10 +3247,16 @@ class StreamingSparseRecallTest(test.TestCase):
       for class_id in (0, 3, 4, 6, 9, 10):
         self._test_streaming_sparse_recall_at_k(
             predictions, labels, k=5, expected=NAN, class_id=class_id)
+        self._test_sparse_recall_at_top_k(
+            labels, top_k_predictions, expected=NAN, class_id=class_id)
 
   def test_three_labels_at_k5_no_predictions(self):
     predictions = [[0.5, 0.1, 0.6, 0.3, 0.8, 0.0, 0.7, 0.2, 0.4, 0.9],
                    [0.3, 0.0, 0.7, 0.2, 0.4, 0.9, 0.5, 0.8, 0.1, 0.6]]
+    top_k_predictions = [
+        [9, 4, 6, 2, 0],
+        [5, 7, 2, 9, 6],
+    ]
     sparse_labels = _binary_2d_label_to_sparse_value(
         [[0, 0, 1, 0, 0, 0, 0, 1, 1, 0], [0, 1, 1, 0, 0, 1, 0, 0, 0, 0]])
     dense_labels = np.array([[2, 7, 8], [1, 2, 5]], dtype=np.int64)
@@ -3124,10 +3265,16 @@ class StreamingSparseRecallTest(test.TestCase):
       # Class 8: 1 label, no predictions.
       self._test_streaming_sparse_recall_at_k(
           predictions, labels, k=5, expected=0.0 / 1, class_id=8)
+      self._test_sparse_recall_at_top_k(
+          labels, top_k_predictions, expected=0.0 / 1, class_id=8)
 
   def test_three_labels_at_k5(self):
     predictions = [[0.5, 0.1, 0.6, 0.3, 0.8, 0.0, 0.7, 0.2, 0.4, 0.9],
                    [0.3, 0.0, 0.7, 0.2, 0.4, 0.9, 0.5, 0.8, 0.1, 0.6]]
+    top_k_predictions = [
+        [9, 4, 6, 2, 0],
+        [5, 7, 2, 9, 6],
+    ]
     sparse_labels = _binary_2d_label_to_sparse_value(
         [[0, 0, 1, 0, 0, 0, 0, 1, 1, 0], [0, 1, 1, 0, 0, 1, 0, 0, 0, 0]])
     dense_labels = np.array([[2, 7, 8], [1, 2, 5]], dtype=np.int64)
@@ -3136,23 +3283,35 @@ class StreamingSparseRecallTest(test.TestCase):
       # Class 2: 2 labels, both correct.
       self._test_streaming_sparse_recall_at_k(
           predictions, labels, k=5, expected=2.0 / 2, class_id=2)
+      self._test_sparse_recall_at_top_k(
+          labels, top_k_predictions, expected=2.0 / 2, class_id=2)
 
       # Class 5: 1 label, incorrect.
       self._test_streaming_sparse_recall_at_k(
           predictions, labels, k=5, expected=1.0 / 1, class_id=5)
+      self._test_sparse_recall_at_top_k(
+          labels, top_k_predictions, expected=1.0 / 1, class_id=5)
 
       # Class 7: 1 label, incorrect.
       self._test_streaming_sparse_recall_at_k(
           predictions, labels, k=5, expected=0.0 / 1, class_id=7)
+      self._test_sparse_recall_at_top_k(
+          labels, top_k_predictions, expected=0.0 / 1, class_id=7)
 
       # All classes: 6 labels, 3 correct.
       self._test_streaming_sparse_recall_at_k(
           predictions, labels, k=5, expected=3.0 / 6)
+      self._test_sparse_recall_at_top_k(
+          labels, top_k_predictions, expected=3.0 / 6)
 
   def test_three_labels_at_k5_some_out_of_range(self):
     """Tests that labels outside the [0, n_classes) count in denominator."""
     predictions = [[0.5, 0.1, 0.6, 0.3, 0.8, 0.0, 0.7, 0.2, 0.4, 0.9],
                    [0.3, 0.0, 0.7, 0.2, 0.4, 0.9, 0.5, 0.8, 0.1, 0.6]]
+    top_k_predictions = [
+        [9, 4, 6, 2, 0],
+        [5, 7, 2, 9, 6],
+    ]
     sp_labels = sparse_tensor.SparseTensorValue(
         indices=[[0, 0], [0, 1], [0, 2], [0, 3], [1, 0], [1, 1], [1, 2],
                  [1, 3]],
@@ -3167,6 +3326,11 @@ class StreamingSparseRecallTest(test.TestCase):
         k=5,
         expected=2.0 / 2,
         class_id=2)
+    self._test_sparse_recall_at_top_k(
+        sp_labels,
+        top_k_predictions,
+        expected=2.0 / 2,
+        class_id=2)
 
     # Class 5: 1 label, incorrect.
     self._test_streaming_sparse_recall_at_k(
@@ -3175,6 +3339,11 @@ class StreamingSparseRecallTest(test.TestCase):
         k=5,
         expected=1.0 / 1,
         class_id=5)
+    self._test_sparse_recall_at_top_k(
+        sp_labels,
+        top_k_predictions,
+        expected=1.0 / 1,
+        class_id=5)
 
     # Class 7: 1 label, incorrect.
     self._test_streaming_sparse_recall_at_k(
@@ -3183,16 +3352,30 @@ class StreamingSparseRecallTest(test.TestCase):
         k=5,
         expected=0.0 / 1,
         class_id=7)
+    self._test_sparse_recall_at_top_k(
+        sp_labels,
+        top_k_predictions,
+        expected=0.0 / 1,
+        class_id=7)
 
     # All classes: 8 labels, 3 correct.
     self._test_streaming_sparse_recall_at_k(
         predictions=predictions, labels=sp_labels, k=5, expected=3.0 / 8)
+    self._test_sparse_recall_at_top_k(
+        sp_labels, top_k_predictions, expected=3.0 / 8)
 
   def test_3d_nan(self):
     predictions = [[[0.5, 0.1, 0.6, 0.3, 0.8, 0.0, 0.7, 0.2, 0.4, 0.9],
                     [0.3, 0.0, 0.7, 0.2, 0.4, 0.9, 0.5, 0.8, 0.1, 0.6]],
                    [[0.3, 0.0, 0.7, 0.2, 0.4, 0.9, 0.5, 0.8, 0.1, 0.6],
                     [0.5, 0.1, 0.6, 0.3, 0.8, 0.0, 0.7, 0.2, 0.4, 0.9]]]
+    top_k_predictions = [[
+        [9, 4, 6, 2, 0],
+        [5, 7, 2, 9, 6],
+    ], [
+        [5, 7, 2, 9, 6],
+        [9, 4, 6, 2, 0],
+    ]]
     sparse_labels = _binary_3d_label_to_sparse_value(
         [[[0, 0, 1, 0, 0, 0, 0, 1, 1, 0], [0, 1, 1, 0, 0, 1, 0, 0, 0, 0]],
          [[0, 1, 1, 0, 0, 1, 0, 0, 0, 0], [0, 0, 1, 0, 0, 0, 0, 1, 1, 0]]])
@@ -3207,12 +3390,21 @@ class StreamingSparseRecallTest(test.TestCase):
       for class_id in (0, 3, 4, 6, 9, 10):
         self._test_streaming_sparse_recall_at_k(
             predictions, labels, k=5, expected=NAN, class_id=class_id)
+        self._test_sparse_recall_at_top_k(
+            labels, top_k_predictions, expected=NAN, class_id=class_id)
 
   def test_3d_no_predictions(self):
     predictions = [[[0.5, 0.1, 0.6, 0.3, 0.8, 0.0, 0.7, 0.2, 0.4, 0.9],
                     [0.3, 0.0, 0.7, 0.2, 0.4, 0.9, 0.5, 0.8, 0.1, 0.6]],
                    [[0.3, 0.0, 0.7, 0.2, 0.4, 0.9, 0.5, 0.8, 0.1, 0.6],
                     [0.5, 0.1, 0.6, 0.3, 0.8, 0.0, 0.7, 0.2, 0.4, 0.9]]]
+    top_k_predictions = [[
+        [9, 4, 6, 2, 0],
+        [5, 7, 2, 9, 6],
+    ], [
+        [5, 7, 2, 9, 6],
+        [9, 4, 6, 2, 0],
+    ]]
     sparse_labels = _binary_3d_label_to_sparse_value(
         [[[0, 0, 1, 0, 0, 0, 0, 1, 1, 0],
           [0, 1, 1, 0, 0, 1, 0, 0, 0, 0]],
@@ -3229,12 +3421,21 @@ class StreamingSparseRecallTest(test.TestCase):
       for class_id in (1, 8):
         self._test_streaming_sparse_recall_at_k(
             predictions, labels, k=5, expected=0.0, class_id=class_id)
+        self._test_sparse_recall_at_top_k(
+            labels, top_k_predictions, expected=0.0, class_id=class_id)
 
   def test_3d(self):
     predictions = [[[0.5, 0.1, 0.6, 0.3, 0.8, 0.0, 0.7, 0.2, 0.4, 0.9],
                     [0.3, 0.0, 0.7, 0.2, 0.4, 0.9, 0.5, 0.8, 0.1, 0.6]],
                    [[0.3, 0.0, 0.7, 0.2, 0.4, 0.9, 0.5, 0.8, 0.1, 0.6],
                     [0.5, 0.1, 0.6, 0.3, 0.8, 0.0, 0.7, 0.2, 0.4, 0.9]]]
+    top_k_predictions = [[
+        [9, 4, 6, 2, 0],
+        [5, 7, 2, 9, 6],
+    ], [
+        [5, 7, 2, 9, 6],
+        [9, 4, 6, 2, 0],
+    ]]
     labels = _binary_3d_label_to_sparse_value(
         [[[0, 0, 1, 0, 0, 0, 0, 1, 1, 0],
           [0, 1, 1, 0, 0, 1, 0, 0, 0, 0]],
@@ -3244,24 +3445,39 @@ class StreamingSparseRecallTest(test.TestCase):
     # Class 2: 4 labels, all correct.
     self._test_streaming_sparse_recall_at_k(
         predictions, labels, k=5, expected=4.0 / 4, class_id=2)
+    self._test_sparse_recall_at_top_k(
+        labels, top_k_predictions, expected=4.0 / 4, class_id=2)
 
     # Class 5: 2 labels, both correct.
     self._test_streaming_sparse_recall_at_k(
         predictions, labels, k=5, expected=2.0 / 2, class_id=5)
+    self._test_sparse_recall_at_top_k(
+        labels, top_k_predictions, expected=2.0 / 2, class_id=5)
 
     # Class 7: 2 labels, 1 incorrect.
     self._test_streaming_sparse_recall_at_k(
         predictions, labels, k=5, expected=1.0 / 2, class_id=7)
+    self._test_sparse_recall_at_top_k(
+        labels, top_k_predictions, expected=1.0 / 2, class_id=7)
 
     # All classes: 12 labels, 7 correct.
     self._test_streaming_sparse_recall_at_k(
         predictions, labels, k=5, expected=7.0 / 12)
+    self._test_sparse_recall_at_top_k(
+        labels, top_k_predictions, expected=7.0 / 12)
 
   def test_3d_ignore_all(self):
     predictions = [[[0.5, 0.1, 0.6, 0.3, 0.8, 0.0, 0.7, 0.2, 0.4, 0.9],
                     [0.3, 0.0, 0.7, 0.2, 0.4, 0.9, 0.5, 0.8, 0.1, 0.6]],
                    [[0.3, 0.0, 0.7, 0.2, 0.4, 0.9, 0.5, 0.8, 0.1, 0.6],
                     [0.5, 0.1, 0.6, 0.3, 0.8, 0.0, 0.7, 0.2, 0.4, 0.9]]]
+    top_k_predictions = [[
+        [9, 4, 6, 2, 0],
+        [5, 7, 2, 9, 6],
+    ], [
+        [5, 7, 2, 9, 6],
+        [9, 4, 6, 2, 0],
+    ]]
     labels = _binary_3d_label_to_sparse_value(
         [[[0, 0, 1, 0, 0, 0, 0, 1, 1, 0],
           [0, 1, 1, 0, 0, 1, 0, 0, 0, 0]],
@@ -3276,6 +3492,12 @@ class StreamingSparseRecallTest(test.TestCase):
           expected=NAN,
           class_id=class_id,
           weights=[[0], [0]])
+      self._test_sparse_recall_at_top_k(
+          labels,
+          top_k_predictions,
+          expected=NAN,
+          class_id=class_id,
+          weights=[[0], [0]])
       self._test_streaming_sparse_recall_at_k(
           predictions,
           labels,
@@ -3283,16 +3505,33 @@ class StreamingSparseRecallTest(test.TestCase):
           expected=NAN,
           class_id=class_id,
           weights=[[0, 0], [0, 0]])
+      self._test_sparse_recall_at_top_k(
+          labels,
+          top_k_predictions,
+          expected=NAN,
+          class_id=class_id,
+          weights=[[0, 0], [0, 0]])
     self._test_streaming_sparse_recall_at_k(
         predictions, labels, k=5, expected=NAN, weights=[[0], [0]])
+    self._test_sparse_recall_at_top_k(
+        labels, top_k_predictions, expected=NAN, weights=[[0], [0]])
     self._test_streaming_sparse_recall_at_k(
         predictions, labels, k=5, expected=NAN, weights=[[0, 0], [0, 0]])
+    self._test_sparse_recall_at_top_k(
+        labels, top_k_predictions, expected=NAN, weights=[[0, 0], [0, 0]])
 
   def test_3d_ignore_some(self):
     predictions = [[[0.5, 0.1, 0.6, 0.3, 0.8, 0.0, 0.7, 0.2, 0.4, 0.9],
                     [0.3, 0.0, 0.7, 0.2, 0.4, 0.9, 0.5, 0.8, 0.1, 0.6]],
                    [[0.3, 0.0, 0.7, 0.2, 0.4, 0.9, 0.5, 0.8, 0.1, 0.6],
                     [0.5, 0.1, 0.6, 0.3, 0.8, 0.0, 0.7, 0.2, 0.4, 0.9]]]
+    top_k_predictions = [[
+        [9, 4, 6, 2, 0],
+        [5, 7, 2, 9, 6],
+    ], [
+        [5, 7, 2, 9, 6],
+        [9, 4, 6, 2, 0],
+    ]]
     labels = _binary_3d_label_to_sparse_value(
         [[[0, 0, 1, 0, 0, 0, 0, 1, 1, 0],
           [0, 1, 1, 0, 0, 1, 0, 0, 0, 0]],
@@ -3307,6 +3546,12 @@ class StreamingSparseRecallTest(test.TestCase):
         expected=2.0 / 2.0,
         class_id=2,
         weights=[[1], [0]])
+    self._test_sparse_recall_at_top_k(
+        labels,
+        top_k_predictions,
+        expected=2.0 / 2.0,
+        class_id=2,
+        weights=[[1], [0]])
 
     # Class 2: 2 labels, both correct.
     self._test_streaming_sparse_recall_at_k(
@@ -3316,6 +3561,12 @@ class StreamingSparseRecallTest(test.TestCase):
         expected=2.0 / 2.0,
         class_id=2,
         weights=[[0], [1]])
+    self._test_sparse_recall_at_top_k(
+        labels,
+        top_k_predictions,
+        expected=2.0 / 2.0,
+        class_id=2,
+        weights=[[0], [1]])
 
     # Class 7: 1 label, correct.
     self._test_streaming_sparse_recall_at_k(
@@ -3325,6 +3576,12 @@ class StreamingSparseRecallTest(test.TestCase):
         expected=1.0 / 1.0,
         class_id=7,
         weights=[[0], [1]])
+    self._test_sparse_recall_at_top_k(
+        labels,
+        top_k_predictions,
+        expected=1.0 / 1.0,
+        class_id=7,
+        weights=[[0], [1]])
 
     # Class 7: 1 label, incorrect.
     self._test_streaming_sparse_recall_at_k(
@@ -3334,6 +3591,12 @@ class StreamingSparseRecallTest(test.TestCase):
         expected=0.0 / 1.0,
         class_id=7,
         weights=[[1], [0]])
+    self._test_sparse_recall_at_top_k(
+        labels,
+        top_k_predictions,
+        expected=0.0 / 1.0,
+        class_id=7,
+        weights=[[1], [0]])
 
     # Class 7: 2 labels, 1 correct.
     self._test_streaming_sparse_recall_at_k(
@@ -3343,6 +3606,12 @@ class StreamingSparseRecallTest(test.TestCase):
         expected=1.0 / 2.0,
         class_id=7,
         weights=[[1, 0], [1, 0]])
+    self._test_sparse_recall_at_top_k(
+        labels,
+        top_k_predictions,
+        expected=1.0 / 2.0,
+        class_id=7,
+        weights=[[1, 0], [1, 0]])
 
     # Class 7: No labels.
     self._test_streaming_sparse_recall_at_k(
@@ -3352,6 +3621,12 @@ class StreamingSparseRecallTest(test.TestCase):
         expected=NAN,
         class_id=7,
         weights=[[0, 1], [0, 1]])
+    self._test_sparse_recall_at_top_k(
+        labels,
+        top_k_predictions,
+        expected=NAN,
+        class_id=7,
+        weights=[[0, 1], [0, 1]])
 
   def test_sparse_tensor_value(self):
     predictions = [[0.1, 0.3, 0.2, 0.4],
diff --git a/tensorflow/contrib/nccl/BUILD b/tensorflow/contrib/nccl/BUILD
index c7f32baa2d55f80276c31b63e92f4862394d2d69..b6314079e17ebc32bdc073ca59bcb9fb35512e23 100644
--- a/tensorflow/contrib/nccl/BUILD
+++ b/tensorflow/contrib/nccl/BUILD
@@ -1,10 +1,8 @@
 # Description:
 #   Wrap NVIDIA (https://github.com/NVIDIA/nccl) NCCL with tensorflow ops.
 #   APIs are meant to change over time.
-package(
-    default_visibility = ["//visibility:private"],
-    features = ["-parse_headers"],
-)
+
+package(default_visibility = ["//tensorflow:__subpackages__"])
 
 licenses(["notice"])  # Apache 2.0
 
@@ -19,6 +17,8 @@ load(
 )
 load("//tensorflow:tensorflow.bzl", "cuda_py_test")
 load("@local_config_cuda//cuda:build_defs.bzl", "if_cuda")
+load("//tensorflow:tensorflow.bzl", "tf_kernel_library")
+load("//tensorflow:tensorflow.bzl", "tf_custom_op_py_library")
 
 tf_custom_op_library(
     name = "python/ops/_nccl_ops.so",
@@ -34,6 +34,47 @@ tf_custom_op_library(
     ],
 )
 
+tf_cuda_cc_test(
+    name = "nccl_manager_test",
+    size = "medium",
+    srcs = if_cuda(
+        [
+            "kernels/nccl_manager.cc",
+            "kernels/nccl_manager.h",
+            "kernels/nccl_manager_test.cc",
+        ],
+        [],
+    ),
+    tags = ["manual"],  # Disabled until errors finding nvmlShutdown are found.
+    deps = if_cuda(
+        [
+            "@nccl_archive//:nccl",
+            "//tensorflow/core",
+            "//tensorflow/core:cuda",
+        ],
+        [],
+    ) + [
+        "//tensorflow/core:test",
+        "//tensorflow/core:test_main",
+        "//tensorflow/core:testlib",
+    ],
+)
+
+tf_kernel_library(
+    name = "nccl_kernels",
+    srcs = [
+        "kernels/nccl_manager.cc",
+        "kernels/nccl_manager.h",
+        "kernels/nccl_ops.cc",
+    ],
+    deps = [
+        "//tensorflow/core:framework",
+        "//tensorflow/core:gpu_headers_lib",
+        "@nccl_archive//:nccl",
+    ],
+    alwayslink = 1,
+)
+
 tf_gen_op_libs(
     op_lib_names = ["nccl_ops"],
     deps = [
@@ -46,15 +87,19 @@ tf_gen_op_wrapper_py(
     deps = [":nccl_ops_op_lib"],
 )
 
-py_library(
+tf_custom_op_py_library(
     name = "nccl_py",
     srcs = [
         "__init__.py",
         "python/ops/nccl_ops.py",
     ],
-    data = [
+    dso = [
         ":python/ops/_nccl_ops.so",
     ],
+    kernels = [
+        ":nccl_kernels",
+        ":nccl_ops_op_lib",
+    ],
     srcs_version = "PY2AND3",
     visibility = ["//visibility:public"],
     deps = [
@@ -82,32 +127,6 @@ cuda_py_test(
     ],
 )
 
-tf_cuda_cc_test(
-    name = "nccl_manager_test",
-    size = "medium",
-    srcs = if_cuda(
-        [
-            "kernels/nccl_manager.cc",
-            "kernels/nccl_manager.h",
-            "kernels/nccl_manager_test.cc",
-        ],
-        [],
-    ),
-    tags = ["manual"],  # Disabled until errors finding nvmlShutdown are found.
-    deps = if_cuda(
-        [
-            "@nccl_archive//:nccl",
-            "//tensorflow/core",
-            "//tensorflow/core:cuda",
-        ],
-        [],
-    ) + [
-        "//tensorflow/core:test",
-        "//tensorflow/core:test_main",
-        "//tensorflow/core:testlib",
-    ],
-)
-
 filegroup(
     name = "all_files",
     srcs = glob(
diff --git a/tensorflow/contrib/nccl/kernels/nccl_manager.cc b/tensorflow/contrib/nccl/kernels/nccl_manager.cc
index dfdfbc8eeab6c48642e7c8f19d7235150bbc6443..b289c91bb8ab3cfddcc1c0e25e0dccf4295fa160 100644
--- a/tensorflow/contrib/nccl/kernels/nccl_manager.cc
+++ b/tensorflow/contrib/nccl/kernels/nccl_manager.cc
@@ -14,6 +14,8 @@ limitations under the License.
 ==============================================================================*/
 #include "tensorflow/contrib/nccl/kernels/nccl_manager.h"
 
+#include <utility>
+
 #ifdef GOOGLE_CUDA
 
 #include "tensorflow/core/lib/core/threadpool.h"
@@ -287,7 +289,7 @@ void NcclManager::AddBroadcastSend(
     const Tensor* in_t, DoneCallback done_callback) {
   std::unique_ptr<Participant> participant(
       new Participant(in_t, nullptr /* out_t */, event_mgr, tensor_stream,
-                      executor, gpu_device_id, done_callback));
+                      executor, gpu_device_id, std::move(done_callback)));
   participant->root = true;
   AddParticipant(num_devices, key, std::move(participant), in_t->dtype(),
                  kBroadcast, ncclSum /* unused */);
@@ -300,7 +302,7 @@ void NcclManager::AddBroadcastRecv(
     Tensor* out_t, DoneCallback done_callback) {
   std::unique_ptr<Participant> participant(
       new Participant(nullptr /* in_t */, out_t, event_mgr, tensor_stream,
-                      executor, gpu_device_id, done_callback));
+                      executor, gpu_device_id, std::move(done_callback)));
   AddParticipant(num_devices, key, std::move(participant), out_t->dtype(),
                  kBroadcast, ncclSum /* unused */);
 }
diff --git a/tensorflow/contrib/opt/BUILD b/tensorflow/contrib/opt/BUILD
index d50ee906fb3171f4899941e964d08d9a726de42e..edc9c0ef333497dbfd5bf841b09523802617984f 100644
--- a/tensorflow/contrib/opt/BUILD
+++ b/tensorflow/contrib/opt/BUILD
@@ -8,15 +8,18 @@ exports_files(["LICENSE"])
 package(default_visibility = ["//tensorflow:__subpackages__"])
 
 load("//tensorflow:tensorflow.bzl", "py_test")
+load("//tensorflow:tensorflow.bzl", "tf_py_test")
 
 py_library(
     name = "opt_py",
     srcs = [
         "__init__.py",
         "python/training/delay_compensated_gradient_descent.py",
+        "python/training/drop_stale_gradient_optimizer.py",
         "python/training/external_optimizer.py",
         "python/training/lazy_adam_optimizer.py",
         "python/training/moving_average_optimizer.py",
+        "python/training/nadam_optimizer.py",
         "python/training/variable_clipping_optimizer.py",
     ],
     srcs_version = "PY2AND3",
@@ -122,6 +125,39 @@ py_test(
     ],
 )
 
+py_test(
+    name = "nadam_optimizer_test",
+    srcs = ["python/training/nadam_optimizer_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":opt_py",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:constant_op",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:variables",
+        "//third_party/py/numpy",
+    ],
+)
+
+tf_py_test(
+    name = "drop_stale_gradient_optimizer_test",
+    srcs = ["python/training/drop_stale_gradient_optimizer_test.py"],
+    additional_deps = [
+        ":opt_py",
+        "//third_party/py/numpy",
+        "//tensorflow/python:client",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:training",
+        "//tensorflow/python:variables",
+    ],
+)
+
 filegroup(
     name = "all_files",
     srcs = glob(
diff --git a/tensorflow/contrib/opt/__init__.py b/tensorflow/contrib/opt/__init__.py
index 036e7057994dca45e287703a985a988a5e9dab13..a12afb12a0b742fa1e1c817cf7d61289cb717887 100644
--- a/tensorflow/contrib/opt/__init__.py
+++ b/tensorflow/contrib/opt/__init__.py
@@ -20,17 +20,22 @@ from __future__ import print_function
 
 # pylint: disable=wildcard-import
 from tensorflow.contrib.opt.python.training.delay_compensated_gradient_descent import *
+from tensorflow.contrib.opt.python.training.drop_stale_gradient_optimizer import *
 from tensorflow.contrib.opt.python.training.external_optimizer import *
 from tensorflow.contrib.opt.python.training.lazy_adam_optimizer import *
+from tensorflow.contrib.opt.python.training.nadam_optimizer import *
 from tensorflow.contrib.opt.python.training.moving_average_optimizer import *
 from tensorflow.contrib.opt.python.training.variable_clipping_optimizer import *
 # pylint: enable=wildcard-import
 
 from tensorflow.python.util.all_util import remove_undocumented
 
+
 _allowed_symbols = ['DelayCompensatedGradientDescentOptimizer',
+                    'DropStaleGradientOptimizer',
                     'ExternalOptimizerInterface',
                     'LazyAdamOptimizer',
+                    'NadamOptimizer',
                     'MovingAverageOptimizer',
                     'ScipyOptimizerInterface',
                     'VariableClippingOptimizer']
diff --git a/tensorflow/contrib/opt/python/training/drop_stale_gradient_optimizer.py b/tensorflow/contrib/opt/python/training/drop_stale_gradient_optimizer.py
new file mode 100644
index 0000000000000000000000000000000000000000..f20c172ee376d0a808a21fe96bec80367bf2e9f4
--- /dev/null
+++ b/tensorflow/contrib/opt/python/training/drop_stale_gradient_optimizer.py
@@ -0,0 +1,113 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+"""Wrapper optimizer for checking and dropping stale gradients."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import gen_array_ops
+from tensorflow.python.ops import gen_math_ops
+from tensorflow.python.ops import init_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import variable_scope
+from tensorflow.python.summary import summary
+from tensorflow.python.training import optimizer
+from tensorflow.python.training import training_util
+
+
+class DropStaleGradientOptimizer(optimizer.Optimizer):
+  """Wrapper optimizer that checks and drops stale gradient.
+
+  This optimizer records the global step for each worker before computing
+  gradients and compares it with the global step at the time of applying the
+  gradients. If the difference is larger than a threshold, it will drop all
+  the computed gradients.
+  """
+
+  def __init__(self,
+               opt,
+               staleness,
+               use_locking=False,
+               name="DropStaleGradient"):
+    """Constructs a new DropStaleGradientOptimizer.
+
+    Args:
+      opt: The actual optimizer that will be used to compute and apply the
+           gradients. Must be one of the Optimizer classes.
+      staleness: The maximum staleness allowed for the optimizer.
+      use_locking: If `True` use locks for clip update operations.
+      name: Optional name prefix for the operations created when applying
+            gradients. Defaults to "DropStaleGradient".
+    """
+    super(DropStaleGradientOptimizer, self).__init__(use_locking, name)
+    self._opt = opt
+    self._staleness = staleness
+
+  def compute_gradients(self, loss, *args, **kwargs):
+    # Record current global step for worker.
+    with ops.colocate_with(loss):
+      self._local_step = training_util.get_global_step() + 0
+
+    with ops.control_dependencies([self._local_step]):
+      loss = gen_array_ops.identity(loss)
+      return self._opt.compute_gradients(loss, *args, **kwargs)
+
+  def get_slot(self, *args, **kwargs):
+    return self._opt.get_slot(*args, **kwargs)
+
+  def get_slot_names(self, *args, **kwargs):
+    return self._opt.get_slot_names(*args, **kwargs)
+
+  def apply_gradients(self, grads_and_vars, global_step=None, name=None):
+    gradients = []
+    # Number of stale gradients.
+    stale_counter = variable_scope.get_variable(
+        "stale_counter", [],
+        initializer=init_ops.zeros_initializer(),
+        trainable=False)
+
+    def _AcceptGradientOp():
+      with ops.control_dependencies(
+          [self._opt.apply_gradients(
+              grads_and_vars, global_step=global_step, name=name)]):
+        return gen_array_ops.identity(0.0)
+
+    def _DropGradientOp():
+      return gen_array_ops.identity(1.0)
+
+    for grad_and_var in grads_and_vars:
+      grad = grad_and_var[0]
+      if isinstance(grad, ops.Tensor):
+        gradients.append(grad)
+      elif grad is not None:
+        gradients.append(grad.op)
+
+    with ops.control_dependencies(gradients), ops.colocate_with(global_step):
+      staleness = gen_array_ops.reshape(
+          global_step - self._local_step, shape=())
+
+    conditional_update = stale_counter.assign_add(control_flow_ops.cond(
+        gen_math_ops.less_equal(staleness, self._staleness),
+        _AcceptGradientOp, _DropGradientOp))
+
+    summary.scalar(
+        "Gradient staleness percentage",
+        stale_counter / (math_ops.cast(global_step + 1, dtypes.float32)))
+    return conditional_update
diff --git a/tensorflow/contrib/opt/python/training/drop_stale_gradient_optimizer_test.py b/tensorflow/contrib/opt/python/training/drop_stale_gradient_optimizer_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..53232082e16fa76db0befb3cdc1e6579f998a7b5
--- /dev/null
+++ b/tensorflow/contrib/opt/python/training/drop_stale_gradient_optimizer_test.py
@@ -0,0 +1,297 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for DropStaleGradientOptimizer."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import portpicker
+
+from tensorflow.contrib.opt.python.training import drop_stale_gradient_optimizer
+from tensorflow.python.client import session
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import data_flow_ops
+from tensorflow.python.ops import variables
+from tensorflow.python.platform import test
+from tensorflow.python.training import gradient_descent
+from tensorflow.python.training import server_lib
+from tensorflow.python.training import training_util
+
+
+# Creates the workers and return their sessions, graphs, train_ops.
+def _get_workers(num_workers, staleness):
+  worker_ports = [portpicker.pick_unused_port() for _ in range(num_workers)]
+  cluster_dict = {
+      'worker': ['localhost:%s' % port for port in worker_ports],
+      'ps': ['localhost:%s' % portpicker.pick_unused_port()]
+  }
+  cs = server_lib.ClusterSpec(cluster_dict)
+  workers = [
+      server_lib.Server(
+          cs, job_name='worker', task_index=ix, start=True)
+      for ix in range(num_workers)
+  ]
+  server_lib.Server(cs, job_name='ps', task_index=0, start=True)
+
+  sessions = []
+  graphs = []
+  train_ops = []
+
+  # To simulate stale cases, maintaining two queues for computing and
+  # applying gradients respectively. In the phase of computing gradients,
+  # all workers except chief worker compute gradients together and chief worker
+  # computes after all other worers' computing finished. In the phase of
+  # applying gradients, chief worker will first apply gradients, then all other
+  # workers will apply gradients one by one. Therefore, the chief worker will
+  # always have 0 staleness, each of all other workers will have a unique
+  # staleness value from [1, num_workers).
+  for worker_id in range(num_workers):
+    graph = ops.Graph()
+    with graph.as_default():
+      global_step = training_util.create_global_step()
+      var_0 = variables.Variable(0.0, name='v0')
+      var_1 = variables.Variable(1.0, name='v1')
+      compute_gradients_queue = data_flow_ops.FIFOQueue(
+          -1, global_step.dtype.base_dtype, shapes=(),
+          name='compute_gradients_queue', shared_name='compute_gradients_queue')
+      apply_gradients_queue = data_flow_ops.FIFOQueue(
+          -1, global_step.dtype.base_dtype, shapes=(),
+          name='apply_gradients_queue', shared_name='apply_gradients_queue')
+
+      # Gradients for loss on var_0 and var_1 will be 1.0.
+      loss = 0 - var_0 - var_1
+      sgd_opt = gradient_descent.GradientDescentOptimizer(1.0)
+      stale_check_opt = (
+          drop_stale_gradient_optimizer.DropStaleGradientOptimizer(
+              sgd_opt, staleness))
+
+      # Compute gradients.
+      if worker_id == 0:
+        with ops.control_dependencies(
+            [compute_gradients_queue.dequeue_many(num_workers - 1)]):
+          grad_and_vars = stale_check_opt.compute_gradients(loss)
+      else:
+        grad_and_vars = stale_check_opt.compute_gradients(loss)
+        with ops.control_dependencies([t[0] for t in grad_and_vars]):
+          worker_enqueue_op = compute_gradients_queue.enqueue(global_step)
+
+      # Apply gradients.
+      if worker_id == 0:
+        with ops.control_dependencies(
+            [stale_check_opt.apply_gradients(grad_and_vars, global_step)]):
+          train_op = apply_gradients_queue.enqueue(global_step)
+      else:
+        with ops.control_dependencies([worker_enqueue_op]):
+          with ops.control_dependencies([apply_gradients_queue.dequeue()]):
+            with ops.control_dependencies(
+                [stale_check_opt.apply_gradients(
+                    grad_and_vars, global_step)]):
+              train_op = apply_gradients_queue.enqueue(global_step)
+
+      sess = session.Session(workers[worker_id].target)
+
+    sessions.append(sess)
+    graphs.append(graph)
+    train_ops.append(train_op)
+
+  return sessions, graphs, train_ops
+
+
+class DropStaleGradientOptimizerTest(test.TestCase):
+
+  def _run(self, train_op, sess):
+    sess.run(train_op)
+
+  def test1Worker(self):
+    num_workers = 1
+    sessions, graphs, train_ops = _get_workers(num_workers, 0)
+    with graphs[0].as_default():
+      sessions[0].run(variables.global_variables_initializer())
+    global_step = training_util.get_global_step(graphs[0])
+    var_0 = graphs[0].get_tensor_by_name('v0:0')
+    var_1 = graphs[0].get_tensor_by_name('v1:0')
+    stale_counter = graphs[0].get_tensor_by_name('stale_counter:0')
+    # Verify the initialized value.
+    self.assertAllEqual(0.0, sessions[0].run(var_0))
+    self.assertAllEqual(1.0, sessions[0].run(var_1))
+    self.assertAllEqual(0.0, sessions[0].run(stale_counter))
+    self.assertAllEqual(0, sessions[0].run(global_step))
+
+    sessions[0].run(train_ops[0])
+
+    # Verify the updated value after 1 step.
+    self.assertAllEqual(1, sessions[0].run(global_step))
+    self.assertAllEqual(0.0 + 1.0, sessions[0].run(var_0))
+    self.assertAllEqual(1.0 + 1.0, sessions[0].run(var_1))
+    self.assertAllEqual(1, sessions[0].run(global_step))
+
+  def test1WorkerNegativeStaleness(self):
+    num_workers = 1
+    sessions, graphs, train_ops = _get_workers(num_workers, -1)
+    with graphs[0].as_default():
+      sessions[0].run(variables.global_variables_initializer())
+    global_step = training_util.get_global_step(graphs[0])
+    var_0 = graphs[0].get_tensor_by_name('v0:0')
+    var_1 = graphs[0].get_tensor_by_name('v1:0')
+    stale_counter = graphs[0].get_tensor_by_name('stale_counter:0')
+    # Verify the initialized value.
+    self.assertAllEqual(0.0, sessions[0].run(var_0))
+    self.assertAllEqual(1.0, sessions[0].run(var_1))
+    self.assertAllEqual(0.0, sessions[0].run(stale_counter))
+    self.assertAllEqual(0, sessions[0].run(global_step))
+
+    sessions[0].run(train_ops[0])
+
+    # Verify no updates because max staleness is negative.
+    self.assertAllEqual(0, sessions[0].run(global_step))
+    self.assertAllEqual(1.0, sessions[0].run(stale_counter))
+    self.assertAllEqual(0.0, sessions[0].run(var_0))
+    self.assertAllEqual(1.0, sessions[0].run(var_1))
+
+  def test2WorkersStaleness0(self):
+    num_workers = 2
+    sessions, graphs, train_ops = _get_workers(num_workers, 0)
+    with graphs[0].as_default():
+      sessions[0].run(variables.global_variables_initializer())
+    global_step = training_util.get_global_step(graphs[0])
+    var_0 = graphs[0].get_tensor_by_name('v0:0')
+    var_1 = graphs[0].get_tensor_by_name('v1:0')
+    stale_counter = graphs[0].get_tensor_by_name('stale_counter:0')
+    # Verify the initialized value.
+    self.assertAllEqual(0.0, sessions[0].run(var_0))
+    self.assertAllEqual(1.0, sessions[0].run(var_1))
+    self.assertAllEqual(0.0, sessions[0].run(stale_counter))
+    self.assertAllEqual(0, sessions[0].run(global_step))
+
+    thread_0 = self.checkedThread(
+        target=self._run, args=(train_ops[0], sessions[0]))
+    thread_1 = self.checkedThread(
+        target=self._run, args=(train_ops[1], sessions[1]))
+    thread_0.start()
+    thread_1.start()
+    thread_0.join()
+    thread_1.join()
+
+    # With 2 workers and max staleness set to 0, only chief worker will update
+    # var_0 and var_1.
+    self.assertAllEqual(1, sessions[0].run(global_step))
+    self.assertAllEqual(1.0, sessions[0].run(stale_counter))
+    self.assertAllEqual(0.0 + 1.0, sessions[0].run(var_0))
+    self.assertAllEqual(1.0 + 1.0, sessions[0].run(var_1))
+
+  def test2WorkersStaleness1(self):
+    num_workers = 2
+    sessions, graphs, train_ops = _get_workers(num_workers, 1)
+    with graphs[0].as_default():
+      sessions[0].run(variables.global_variables_initializer())
+    global_step = training_util.get_global_step(graphs[0])
+    var_0 = graphs[0].get_tensor_by_name('v0:0')
+    var_1 = graphs[0].get_tensor_by_name('v1:0')
+    stale_counter = graphs[0].get_tensor_by_name('stale_counter:0')
+    # Verify the initialized value.
+    self.assertAllEqual(0.0, sessions[0].run(var_0))
+    self.assertAllEqual(1.0, sessions[0].run(var_1))
+    self.assertAllEqual(0.0, sessions[0].run(stale_counter))
+    self.assertAllEqual(0, sessions[0].run(global_step))
+
+    thread_0 = self.checkedThread(
+        target=self._run, args=(train_ops[0], sessions[0]))
+    thread_1 = self.checkedThread(
+        target=self._run, args=(train_ops[1], sessions[1]))
+    thread_0.start()
+    thread_1.start()
+    thread_0.join()
+    thread_1.join()
+
+    # With 2 workers and max staleness set to 1, both workers will update
+    # var_0 and var_1.
+    self.assertAllEqual(2, sessions[0].run(global_step))
+    self.assertAllEqual(0.0, sessions[0].run(stale_counter))
+    self.assertAllEqual(0.0 + 2.0, sessions[0].run(var_0))
+    self.assertAllEqual(1.0 + 2.0, sessions[0].run(var_1))
+
+  def test3WorkersStaleness0(self):
+    num_workers = 3
+    sessions, graphs, train_ops = _get_workers(num_workers, 0)
+    with graphs[0].as_default():
+      sessions[0].run(variables.global_variables_initializer())
+    global_step = training_util.get_global_step(graphs[0])
+    var_0 = graphs[0].get_tensor_by_name('v0:0')
+    var_1 = graphs[0].get_tensor_by_name('v1:0')
+    stale_counter = graphs[0].get_tensor_by_name('stale_counter:0')
+    # Verify the initialized value.
+    self.assertAllEqual(0.0, sessions[0].run(var_0))
+    self.assertAllEqual(1.0, sessions[0].run(var_1))
+    self.assertAllEqual(0.0, sessions[0].run(stale_counter))
+    self.assertAllEqual(0, sessions[0].run(global_step))
+
+    thread_0 = self.checkedThread(
+        target=self._run, args=(train_ops[0], sessions[0]))
+    thread_1 = self.checkedThread(
+        target=self._run, args=(train_ops[1], sessions[1]))
+    thread_2 = self.checkedThread(
+        target=self._run, args=(train_ops[2], sessions[2]))
+    thread_0.start()
+    thread_1.start()
+    thread_2.start()
+    thread_0.join()
+    thread_1.join()
+    thread_2.join()
+
+    # With 3 workers and max staleness set to 0, only chief worker will update
+    # var_0 and var_1.
+    self.assertAllEqual(1, sessions[0].run(global_step))
+    self.assertAllEqual(2.0, sessions[0].run(stale_counter))
+    self.assertAllEqual(0.0 + 1.0, sessions[0].run(var_0))
+    self.assertAllEqual(1.0 + 1.0, sessions[0].run(var_1))
+
+  def test3WorkersStaleness1(self):
+    num_workers = 3
+    sessions, graphs, train_ops = _get_workers(num_workers, 1)
+    with graphs[0].as_default():
+      sessions[0].run(variables.global_variables_initializer())
+    global_step = training_util.get_global_step(graphs[0])
+    var_0 = graphs[0].get_tensor_by_name('v0:0')
+    var_1 = graphs[0].get_tensor_by_name('v1:0')
+    stale_counter = graphs[0].get_tensor_by_name('stale_counter:0')
+    # Verify the initialized value.
+    self.assertAllEqual(0.0, sessions[0].run(var_0))
+    self.assertAllEqual(1.0, sessions[0].run(var_1))
+    self.assertAllEqual(0.0, sessions[0].run(stale_counter))
+    self.assertAllEqual(0, sessions[0].run(global_step))
+
+    thread_0 = self.checkedThread(
+        target=self._run, args=(train_ops[0], sessions[0]))
+    thread_1 = self.checkedThread(
+        target=self._run, args=(train_ops[1], sessions[1]))
+    thread_2 = self.checkedThread(
+        target=self._run, args=(train_ops[2], sessions[2]))
+    thread_0.start()
+    thread_1.start()
+    thread_2.start()
+    thread_0.join()
+    thread_1.join()
+    thread_2.join()
+
+    # With 3 workers and max staleness set to 1, chief worker and only one of
+    # the two other workers will update var_0 and var_1.
+    self.assertAllEqual(2, sessions[0].run(global_step))
+    self.assertAllEqual(1.0, sessions[0].run(stale_counter))
+    self.assertAllEqual(0.0 + 2.0, sessions[0].run(var_0))
+    self.assertAllEqual(1.0 + 2.0, sessions[0].run(var_1))
+
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/contrib/opt/python/training/nadam_optimizer.py b/tensorflow/contrib/opt/python/training/nadam_optimizer.py
new file mode 100644
index 0000000000000000000000000000000000000000..07521bd4ce8fd02960bc397fef287a43062c11af
--- /dev/null
+++ b/tensorflow/contrib/opt/python/training/nadam_optimizer.py
@@ -0,0 +1,90 @@
+# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+"""Nadam for TensorFlow."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import state_ops
+from tensorflow.python.training import training_ops
+from tensorflow.python.training import adam
+
+
+class NadamOptimizer(adam.AdamOptimizer):
+  """Optimizer that implements the Nadam algorithm.
+
+  See [Dozat, T., 2015](http://cs229.stanford.edu/proj2015/054_report.pdf).
+  """
+
+  def _apply_dense(self, grad, var):
+    m = self.get_slot(var, "m")
+    v = self.get_slot(var, "v")
+    return training_ops.apply_adam(
+        var, m, v,
+        math_ops.cast(self._beta1_power, var.dtype.base_dtype),
+        math_ops.cast(self._beta2_power, var.dtype.base_dtype),
+        math_ops.cast(self._lr_t, var.dtype.base_dtype),
+        math_ops.cast(self._beta1_t, var.dtype.base_dtype),
+        math_ops.cast(self._beta2_t, var.dtype.base_dtype),
+        math_ops.cast(self._epsilon_t, var.dtype.base_dtype),
+        grad, use_locking=self._use_locking,
+        use_nesterov=True).op
+
+  def _resource_apply_dense(self, grad, var):
+    m = self.get_slot(var, "m")
+    v = self.get_slot(var, "v")
+    return training_ops.resource_apply_adam(
+        var.handle, m.handle, v.handle,
+        math_ops.cast(self._beta1_power, grad.dtype.base_dtype),
+        math_ops.cast(self._beta2_power, grad.dtype.base_dtype),
+        math_ops.cast(self._lr_t, grad.dtype.base_dtype),
+        math_ops.cast(self._beta1_t, grad.dtype.base_dtype),
+        math_ops.cast(self._beta2_t, grad.dtype.base_dtype),
+        math_ops.cast(self._epsilon_t, grad.dtype.base_dtype),
+        grad, use_locking=self._use_locking,
+        use_nesterov=True)
+
+  def _apply_sparse_shared(self, grad, var, indices, scatter_add):
+    beta1_power = math_ops.cast(self._beta1_power, var.dtype.base_dtype)
+    beta2_power = math_ops.cast(self._beta2_power, var.dtype.base_dtype)
+    lr_t = math_ops.cast(self._lr_t, var.dtype.base_dtype)
+    beta1_t = math_ops.cast(self._beta1_t, var.dtype.base_dtype)
+    beta2_t = math_ops.cast(self._beta2_t, var.dtype.base_dtype)
+    epsilon_t = math_ops.cast(self._epsilon_t, var.dtype.base_dtype)
+    lr = (lr_t * math_ops.sqrt(1 - beta2_power) / (1 - beta1_power))
+    # m_t = beta1 * m + (1 - beta1) * g_t
+    m = self.get_slot(var, "m")
+    m_scaled_g_values = grad * (1 - beta1_t)
+    m_t = state_ops.assign(m, m * beta1_t,
+                           use_locking=self._use_locking)
+    with ops.control_dependencies([m_t]):
+      m_t = scatter_add(m, indices, m_scaled_g_values)
+      # m_bar = (1 - beta1) * g_t + beta1 * m_t
+      m_bar = m_scaled_g_values + beta1_t * m_t
+    # v_t = beta2 * v + (1 - beta2) * (g_t * g_t)
+    v = self.get_slot(var, "v")
+    v_scaled_g_values = (grad * grad) * (1 - beta2_t)
+    v_t = state_ops.assign(v, v * beta2_t, use_locking=self._use_locking)
+    with ops.control_dependencies([v_t]):
+      v_t = scatter_add(v, indices, v_scaled_g_values)
+    v_sqrt = math_ops.sqrt(v_t)
+    var_update = state_ops.assign_sub(var,
+                                      lr * m_bar / (v_sqrt + epsilon_t),
+                                      use_locking=self._use_locking)
+    return control_flow_ops.group(*[var_update, m_bar, v_t])
diff --git a/tensorflow/contrib/opt/python/training/nadam_optimizer_test.py b/tensorflow/contrib/opt/python/training/nadam_optimizer_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..3d48684f53a5652fa3fa59f1715521bd16505d14
--- /dev/null
+++ b/tensorflow/contrib/opt/python/training/nadam_optimizer_test.py
@@ -0,0 +1,158 @@
+# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for Nadam."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.python.client import session
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import resource_variable_ops
+from tensorflow.python.ops import variables
+from tensorflow.python.platform import test
+from tensorflow.contrib.opt.python.training import nadam_optimizer
+
+
+def nadam_update_numpy(param,
+                       g_t,
+                       t,
+                       m,
+                       v,
+                       alpha=0.001,
+                       beta1=0.9,
+                       beta2=0.999,
+                       epsilon=1e-8):
+  alpha_t = alpha * np.sqrt(1 - beta2**t) / (1 - beta1**t)
+
+  m_t = beta1 * m + (1 - beta1) * g_t
+  v_t = beta2 * v + (1 - beta2) * g_t * g_t
+
+  m_bar = (1 - beta1) * g_t + beta1 * m_t
+
+  param_t = param - alpha_t * m_bar / (np.sqrt(v_t) + epsilon)
+  return param_t, m_t, v_t
+
+
+class NadamOptimizerTest(test.TestCase):
+
+  def doTestSparse(self, use_resource=False):
+    for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
+      with self.test_session():
+        # Initialize variables for numpy implementation.
+        m0, v0, m1, v1 = 0.0, 0.0, 0.0, 0.0
+        var0_np = np.array([1.0, 2.0], dtype=dtype.as_numpy_dtype)
+        grads0_np = np.array([0.1, 0.1], dtype=dtype.as_numpy_dtype)
+        var1_np = np.array([3.0, 4.0], dtype=dtype.as_numpy_dtype)
+        grads1_np = np.array([0.01, 0.01], dtype=dtype.as_numpy_dtype)
+
+        if use_resource:
+          var0 = resource_variable_ops.ResourceVariable(var0_np)
+          var1 = resource_variable_ops.ResourceVariable(var1_np)
+        else:
+          var0 = variables.Variable(var0_np)
+          var1 = variables.Variable(var1_np)
+        grads0_np_indices = np.array([0, 1], dtype=np.int32)
+        grads0 = ops.IndexedSlices(
+            constant_op.constant(grads0_np),
+            constant_op.constant(grads0_np_indices), constant_op.constant([2]))
+        grads1_np_indices = np.array([0, 1], dtype=np.int32)
+        grads1 = ops.IndexedSlices(
+            constant_op.constant(grads1_np),
+            constant_op.constant(grads1_np_indices), constant_op.constant([2]))
+        opt = nadam_optimizer.NadamOptimizer()
+        update = opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
+        variables.global_variables_initializer().run()
+
+        # Fetch params to validate initial values
+        self.assertAllClose([1.0, 2.0], var0.eval())
+        self.assertAllClose([3.0, 4.0], var1.eval())
+
+        beta1_power, beta2_power = opt._get_beta_accumulators()
+
+        # Run 3 steps of Nadam
+        for t in range(1, 4):
+          self.assertAllCloseAccordingToType(0.9**t, beta1_power.eval())
+          self.assertAllCloseAccordingToType(0.999**t, beta2_power.eval())
+          update.run()
+
+          var0_np, m0, v0 = nadam_update_numpy(var0_np, grads0_np, t, m0, v0)
+          var1_np, m1, v1 = nadam_update_numpy(var1_np, grads1_np, t, m1, v1)
+
+          # Validate updated params
+          self.assertAllCloseAccordingToType(var0_np, var0.eval())
+          self.assertAllCloseAccordingToType(var1_np, var1.eval())
+
+  def testSparse(self):
+    self.doTestSparse(use_resource=False)
+
+  def testResourceSparse(self):
+    self.doTestSparse(use_resource=True)
+
+  def doTestBasic(self, use_resource=False):
+    for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
+      with self.test_session():
+        # Initialize variables for numpy implementation.
+        m0, v0, m1, v1 = 0.0, 0.0, 0.0, 0.0
+        var0_np = np.array([1.0, 2.0], dtype=dtype.as_numpy_dtype)
+        grads0_np = np.array([0.1, 0.1], dtype=dtype.as_numpy_dtype)
+        var1_np = np.array([3.0, 4.0], dtype=dtype.as_numpy_dtype)
+        grads1_np = np.array([0.01, 0.01], dtype=dtype.as_numpy_dtype)
+
+        if use_resource:
+          var0 = resource_variable_ops.ResourceVariable(var0_np)
+          var1 = resource_variable_ops.ResourceVariable(var1_np)
+        else:
+          var0 = variables.Variable(var0_np)
+          var1 = variables.Variable(var1_np)
+        grads0 = constant_op.constant(grads0_np)
+        grads1 = constant_op.constant(grads1_np)
+        opt = nadam_optimizer.NadamOptimizer()
+        update = opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
+        variables.global_variables_initializer().run()
+
+        # Fetch params to validate initial values
+        self.assertAllClose([1.0, 2.0], var0.eval())
+        self.assertAllClose([3.0, 4.0], var1.eval())
+
+        beta1_power, beta2_power = opt._get_beta_accumulators()
+
+        # Run 3 steps of Nadam
+        for t in range(1, 4):
+          self.assertAllCloseAccordingToType(0.9**t, beta1_power.eval())
+          self.assertAllCloseAccordingToType(0.999**t, beta2_power.eval())
+          update.run()
+
+          var0_np, m0, v0 = nadam_update_numpy(var0_np, grads0_np, t, m0, v0)
+          var1_np, m1, v1 = nadam_update_numpy(var1_np, grads1_np, t, m1, v1)
+
+          # Validate updated params
+          self.assertAllCloseAccordingToType(var0_np, var0.eval())
+          self.assertAllCloseAccordingToType(var1_np, var1.eval())
+
+  def testBasic(self):
+    self.doTestBasic(use_resource=False)
+
+  def testResourceBasic(self):
+    self.doTestBasic(use_resource=True)
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/contrib/rnn/BUILD b/tensorflow/contrib/rnn/BUILD
index ab443eab6f6a63c68b2ffe66d8f0a87d24605bce..9d67563eddd47b19f404ed589002db2fe5de467f 100644
--- a/tensorflow/contrib/rnn/BUILD
+++ b/tensorflow/contrib/rnn/BUILD
@@ -304,6 +304,7 @@ filegroup(
         exclude = [
             "**/METADATA",
             "**/OWNERS",
+            "tools/**",
         ],
     ),
     visibility = ["//tensorflow:__subpackages__"],
@@ -351,3 +352,27 @@ tf_kernel_library(
         "//third_party/eigen3",
     ],
 )
+
+py_binary(
+    name = "checkpoint_convert",
+    srcs = ["python/tools/checkpoint_convert.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        "//tensorflow/core:protos_all_py",
+        "//tensorflow/python:platform",
+        "//tensorflow/python:training",
+        "//tensorflow/python:variables",
+    ],
+)
+
+py_test(
+    name = "checkpoint_convert_test",
+    size = "small",
+    srcs = ["python/tools/checkpoint_convert_test.py"],
+    srcs_version = "PY2AND3",
+    tags = ["no_pip"],
+    deps = [
+        ":checkpoint_convert",
+        "//tensorflow/python:client_testlib",
+    ],
+)
diff --git a/tensorflow/contrib/rnn/__init__.py b/tensorflow/contrib/rnn/__init__.py
index 24406e36cedbde86595463101e96997c5c06329a..2420c3e179b73ac52ad6222bb9944acbef156971 100644
--- a/tensorflow/contrib/rnn/__init__.py
+++ b/tensorflow/contrib/rnn/__init__.py
@@ -45,6 +45,8 @@ See @{$python/contrib.rnn} guide.
 @@UGRNNCell
 @@IntersectionRNNCell
 @@PhasedLSTMCell
+@@HighwayWrapper
+@@GLSTMCell
 
 ### RNNCell wrappers
 @@AttentionCellWrapper
diff --git a/tensorflow/contrib/rnn/ops/lstm_ops_test.cc b/tensorflow/contrib/rnn/ops/lstm_ops_test.cc
index 7c4d003b833b39aded2f8002cf0a864331691e4f..544cd163c50062093acf7f5e942f67606936c0e3 100644
--- a/tensorflow/contrib/rnn/ops/lstm_ops_test.cc
+++ b/tensorflow/contrib/rnn/ops/lstm_ops_test.cc
@@ -37,7 +37,7 @@ class LSTMOpsTest : public ::testing::Test {
   }
 };
 
-static string JoinedCopies(string s, int copies) {
+static string JoinedCopies(const string& s, int copies) {
   string res;
   for (int i = 0; i < copies; ++i) {
     strings::StrAppend(&res, i > 0 ? ";" : "", s);
diff --git a/tensorflow/contrib/rnn/python/kernel_tests/core_rnn_cell_test.py b/tensorflow/contrib/rnn/python/kernel_tests/core_rnn_cell_test.py
index 15afac982374b2171819e796166adbc9537c0110..89ad0fcd75329952d9b56d369f2ba1424e0930fb 100644
--- a/tensorflow/contrib/rnn/python/kernel_tests/core_rnn_cell_test.py
+++ b/tensorflow/contrib/rnn/python/kernel_tests/core_rnn_cell_test.py
@@ -74,7 +74,41 @@ class RNNCellTest(test.TestCase):
           "root", initializer=init_ops.constant_initializer(0.5)):
         x = array_ops.zeros([1, 2])
         m = array_ops.zeros([1, 2])
-        g, _ = core_rnn_cell_impl.BasicRNNCell(2)(x, m)
+        cell = core_rnn_cell_impl.BasicRNNCell(2)
+        g, _ = cell(x, m)
+        self.assertEqual(
+            ["root/basic_rnn_cell/%s:0"
+             % core_rnn_cell_impl._WEIGHTS_VARIABLE_NAME,
+             "root/basic_rnn_cell/%s:0"
+             % core_rnn_cell_impl._BIAS_VARIABLE_NAME],
+            [v.name for v in cell.trainable_variables])
+        self.assertFalse(cell.non_trainable_variables)
+        sess.run([variables_lib.global_variables_initializer()])
+        res = sess.run(
+            [g], {x.name: np.array([[1., 1.]]),
+                  m.name: np.array([[0.1, 0.1]])})
+        self.assertEqual(res[0].shape, (1, 2))
+
+  def testBasicRNNCellNotTrainable(self):
+    with self.test_session() as sess:
+      def not_trainable_getter(getter, *args, **kwargs):
+        kwargs["trainable"] = False
+        return getter(*args, **kwargs)
+
+      with variable_scope.variable_scope(
+          "root", initializer=init_ops.constant_initializer(0.5),
+          custom_getter=not_trainable_getter):
+        x = array_ops.zeros([1, 2])
+        m = array_ops.zeros([1, 2])
+        cell = core_rnn_cell_impl.BasicRNNCell(2)
+        g, _ = cell(x, m)
+        self.assertFalse(cell.trainable_variables)
+        self.assertEqual(
+            ["root/basic_rnn_cell/%s:0"
+             % core_rnn_cell_impl._WEIGHTS_VARIABLE_NAME,
+             "root/basic_rnn_cell/%s:0"
+             % core_rnn_cell_impl._BIAS_VARIABLE_NAME],
+            [v.name for v in cell.non_trainable_variables])
         sess.run([variables_lib.global_variables_initializer()])
         res = sess.run(
             [g], {x.name: np.array([[1., 1.]]),
@@ -114,10 +148,23 @@ class RNNCellTest(test.TestCase):
           "root", initializer=init_ops.constant_initializer(0.5)):
         x = array_ops.zeros([1, 2])
         m = array_ops.zeros([1, 8])
-        g, out_m = core_rnn_cell_impl.MultiRNNCell(
+        cell = core_rnn_cell_impl.MultiRNNCell(
             [core_rnn_cell_impl.BasicLSTMCell(
                 2, state_is_tuple=False) for _ in range(2)],
-            state_is_tuple=False)(x, m)
+            state_is_tuple=False)
+        g, out_m = cell(x, m)
+        expected_variable_names = [
+            "root/multi_rnn_cell/cell_0/basic_lstm_cell/%s:0"
+            % core_rnn_cell_impl._WEIGHTS_VARIABLE_NAME,
+            "root/multi_rnn_cell/cell_0/basic_lstm_cell/%s:0"
+            % core_rnn_cell_impl._BIAS_VARIABLE_NAME,
+            "root/multi_rnn_cell/cell_1/basic_lstm_cell/%s:0"
+            % core_rnn_cell_impl._WEIGHTS_VARIABLE_NAME,
+            "root/multi_rnn_cell/cell_1/basic_lstm_cell/%s:0"
+            % core_rnn_cell_impl._BIAS_VARIABLE_NAME]
+        self.assertEqual(
+            expected_variable_names, [v.name for v in cell.trainable_variables])
+        self.assertFalse(cell.non_trainable_variables)
         sess.run([variables_lib.global_variables_initializer()])
         res = sess.run(
             [g, out_m],
@@ -125,15 +172,7 @@ class RNNCellTest(test.TestCase):
              m.name: 0.1 * np.ones([1, 8])})
         self.assertEqual(len(res), 2)
         variables = variables_lib.global_variables()
-        self.assertEqual(4, len(variables))
-        self.assertEquals(variables[0].op.name,
-                          "root/multi_rnn_cell/cell_0/basic_lstm_cell/weights")
-        self.assertEquals(variables[1].op.name,
-                          "root/multi_rnn_cell/cell_0/basic_lstm_cell/biases")
-        self.assertEquals(variables[2].op.name,
-                          "root/multi_rnn_cell/cell_1/basic_lstm_cell/weights")
-        self.assertEquals(variables[3].op.name,
-                          "root/multi_rnn_cell/cell_1/basic_lstm_cell/biases")
+        self.assertEqual(expected_variable_names, [v.name for v in variables])
         # The numbers in results were not calculated, this is just a smoke test.
         self.assertAllClose(res[0], [[0.24024698, 0.24024698]])
         expected_mem = np.array([[
@@ -155,6 +194,44 @@ class RNNCellTest(test.TestCase):
              m.name: 0.1 * np.ones([1, 4])})
         self.assertEqual(len(res), 2)
 
+  def testBasicLSTMCellDimension0Error(self):
+    """Tests that dimension 0 in both(x and m) shape must be equal."""
+    with self.test_session() as sess:
+      with variable_scope.variable_scope(
+          "root", initializer=init_ops.constant_initializer(0.5)):
+        num_units = 2
+        state_size = num_units * 2
+        batch_size = 3
+        input_size = 4
+        x = array_ops.zeros([batch_size, input_size])
+        m = array_ops.zeros([batch_size - 1, state_size])
+        with self.assertRaises(ValueError):
+          g, out_m = core_rnn_cell_impl.BasicLSTMCell(
+              num_units, state_is_tuple=False)(x, m)
+          sess.run([variables_lib.global_variables_initializer()])
+          sess.run([g, out_m],
+                   {x.name: 1 * np.ones([batch_size, input_size]),
+               m.name: 0.1 * np.ones([batch_size - 1, state_size])})
+
+  def testBasicLSTMCellStateSizeError(self):
+    """Tests that state_size must be num_units * 2."""
+    with self.test_session() as sess:
+      with variable_scope.variable_scope(
+          "root", initializer=init_ops.constant_initializer(0.5)):
+        num_units = 2
+        state_size = num_units * 3 # state_size must be num_units * 2
+        batch_size = 3
+        input_size = 4
+        x = array_ops.zeros([batch_size, input_size])
+        m = array_ops.zeros([batch_size, state_size])
+        with self.assertRaises(ValueError):
+          g, out_m = core_rnn_cell_impl.BasicLSTMCell(
+              num_units, state_is_tuple=False)(x, m)
+          sess.run([variables_lib.global_variables_initializer()])
+          sess.run([g, out_m],
+                   {x.name: 1 * np.ones([batch_size, input_size]),
+                    m.name: 0.1 * np.ones([batch_size, state_size])})
+
   def testBasicLSTMCellStateTupleType(self):
     with self.test_session():
       with variable_scope.variable_scope(
diff --git a/tensorflow/contrib/rnn/python/kernel_tests/rnn_cell_test.py b/tensorflow/contrib/rnn/python/kernel_tests/rnn_cell_test.py
index 8b40fc068fe4ecbb2fb931382cc89b330e56893a..334baa5f9c5a65a81fe17359186003dadf738be3 100644
--- a/tensorflow/contrib/rnn/python/kernel_tests/rnn_cell_test.py
+++ b/tensorflow/contrib/rnn/python/kernel_tests/rnn_cell_test.py
@@ -849,14 +849,12 @@ class RNNCellTest(test.TestCase):
       batch_size = 3
       input_size = 4
       expected_state_c = np.array(
-          [[2.954548e-01, 8.354891e-04],
-           [2.834632e-01, 8.158963e-01],
-           [2.291694e-01, 1.325745e-04]],
+          [[0.00072015, 0.00036633], [0.00083481, 0.00047266],
+           [0.00085111, 0.00053054]],
           dtype=np.float32)
       expected_state_h = np.array(
-          [[2.116566e-01, 5.985238e-04],
-           [2.137760e-01, 6.153145e-01],
-           [1.742966e-01, 1.008306e-04]],
+          [[0.0005159, 0.00026243], [0.00062958, 0.00035646],
+           [0.00064732, 0.00040351]],
           dtype=np.float32)
       with variable_scope.variable_scope(
           "root", initializer=init_ops.constant_initializer(0.5)):
@@ -882,6 +880,88 @@ class RNNCellTest(test.TestCase):
         self.assertAllClose(res[1].c, expected_state_c)
         self.assertAllClose(res[1].h, expected_state_h)
 
+  def testHighwayWrapper(self):
+    with self.test_session() as sess:
+      with variable_scope.variable_scope(
+          "base_cell", initializer=init_ops.constant_initializer(0.5)):
+        x = array_ops.zeros([1, 3])
+        m = array_ops.zeros([1, 3])
+        base_cell = core_rnn_cell_impl.GRUCell(3)
+        g, m_new = base_cell(x, m)
+      with variable_scope.variable_scope(
+          "hw_cell", initializer=init_ops.constant_initializer(0.5)):
+        hw_cell = rnn_cell.HighwayWrapper(
+            core_rnn_cell_impl.GRUCell(3), carry_bias_init=-100.0)
+        g_res, m_new_res = hw_cell(x, m)
+        sess.run([variables.global_variables_initializer()])
+      res = sess.run([g, g_res, m_new, m_new_res], {
+          x: np.array([[1., 1., 1.]]),
+          m: np.array([[0.1, 0.1, 0.1]])
+      })
+      # As carry_bias_init is very negative, the carry gate is 'open' and the
+      # transform gate is 'closed'. This means the output equals the input.
+      self.assertAllClose(res[1], res[0])
+      # States are left untouched
+      self.assertAllClose(res[2], res[3])
+
+  def testGLSTMCell(self):
+    # Ensure that G-LSTM matches LSTM when number_of_groups = 1
+    batch_size = 2
+    num_units = 4
+    number_of_groups = 1
+
+    with self.test_session() as sess:
+      with variable_scope.variable_scope(
+          "root1", initializer=init_ops.constant_initializer(0.5)):
+        x = array_ops.ones([batch_size, num_units])
+        # When number_of_groups = 1, G-LSTM is equivalent to regular LSTM
+        gcell = rnn_cell.GLSTMCell(num_units=num_units,
+                                   number_of_groups=number_of_groups)
+        cell = core_rnn_cell_impl.LSTMCell(num_units=num_units)
+        self.assertTrue(isinstance(gcell.state_size, tuple))
+        zero_state = gcell.zero_state(batch_size=batch_size,
+                                      dtype=dtypes.float32)
+        gh, gs = gcell(x, zero_state)
+        h, g = cell(x, zero_state)
+
+        sess.run([variables.global_variables_initializer()])
+        glstm_result = sess.run([gh, gs])
+        lstm_result = sess.run([h, g])
+
+        self.assertAllClose(glstm_result[0], lstm_result[0], 1e-5)
+        self.assertAllClose(glstm_result[1], lstm_result[1], 1e-5)
+
+    # Test that G-LSTM subgroup act like corresponding sub-LSTMs
+    batch_size = 2
+    num_units = 4
+    number_of_groups = 2
+
+    with self.test_session() as sess:
+      with variable_scope.variable_scope(
+          "root2", initializer=init_ops.constant_initializer(0.5)):
+        # input for G-LSTM with 2 groups
+        glstm_input = array_ops.ones([batch_size, num_units])
+        gcell = rnn_cell.GLSTMCell(num_units=num_units,
+                                   number_of_groups=number_of_groups)
+        gcell_zero_state = gcell.zero_state(batch_size=batch_size,
+                                            dtype=dtypes.float32)
+        gh, gs = gcell(glstm_input, gcell_zero_state)
+
+        # input for LSTM cell simulating single G-LSTM group
+        lstm_input = array_ops.ones([batch_size, num_units / number_of_groups])
+        # note division by number_of_groups. This cell one simulates G-LSTM group
+        cell = core_rnn_cell_impl.LSTMCell(num_units=
+                                           int(num_units / number_of_groups))
+        cell_zero_state = cell.zero_state(batch_size=batch_size,
+                                          dtype=dtypes.float32)
+        h, g = cell(lstm_input, cell_zero_state)
+
+        sess.run([variables.global_variables_initializer()])
+        [gh_res, h_res] = sess.run([gh, h])
+        self.assertAllClose(gh_res[:, 0:int(num_units / number_of_groups)],
+                            h_res, 1e-5)
+        self.assertAllClose(gh_res[:, int(num_units / number_of_groups):],
+                            h_res, 1e-5)
 
 class LayerNormBasicLSTMCellTest(test.TestCase):
 
diff --git a/tensorflow/contrib/rnn/python/ops/core_rnn.py b/tensorflow/contrib/rnn/python/ops/core_rnn.py
index d254e717d5556b98161c2f66e1670233cfa53b4a..3ce075ce9c344eedd6018ec2ce400259f3a9aeff 100644
--- a/tensorflow/contrib/rnn/python/ops/core_rnn.py
+++ b/tensorflow/contrib/rnn/python/ops/core_rnn.py
@@ -19,7 +19,6 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.contrib.rnn.python.ops import core_rnn_cell
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_shape
 from tensorflow.python.ops import array_ops
@@ -31,7 +30,8 @@ from tensorflow.python.util import nest
 
 
 # pylint: disable=protected-access
-_state_size_with_prefix = rnn_cell_impl._state_size_with_prefix
+_concat = rnn_cell_impl._concat
+_like_rnncell = rnn_cell_impl._like_rnncell
 _infer_state_dtype = rnn._infer_state_dtype
 _reverse_seq = rnn._reverse_seq
 _rnn_step = rnn._rnn_step
@@ -99,7 +99,7 @@ def static_rnn(cell, inputs, initial_state=None, dtype=None,
       (column size) cannot be inferred from inputs via shape inference.
   """
 
-  if not isinstance(cell, core_rnn_cell.RNNCell):
+  if not _like_rnncell(cell):
     raise TypeError("cell must be an instance of RNNCell")
   if not nest.is_sequence(inputs):
     raise TypeError("inputs must be a sequence")
@@ -159,11 +159,10 @@ def static_rnn(cell, inputs, initial_state=None, dtype=None,
             "sequence_length must be a vector of length batch_size")
       def _create_zero_output(output_size):
         # convert int to TensorShape if necessary
-        size = _state_size_with_prefix(output_size, prefix=[batch_size])
+        size = _concat(batch_size, output_size)
         output = array_ops.zeros(
             array_ops.stack(size), _infer_state_dtype(dtype, state))
-        shape = _state_size_with_prefix(
-            output_size, prefix=[fixed_batch_size.value])
+        shape = _concat(fixed_batch_size.value, output_size, static=True)
         output.set_shape(tensor_shape.TensorShape(shape))
         return output
 
@@ -320,9 +319,9 @@ def static_bidirectional_rnn(cell_fw, cell_bw, inputs,
     ValueError: If inputs is None or an empty list.
   """
 
-  if not isinstance(cell_fw, core_rnn_cell.RNNCell):
+  if not _like_rnncell(cell_fw):
     raise TypeError("cell_fw must be an instance of RNNCell")
-  if not isinstance(cell_bw, core_rnn_cell.RNNCell):
+  if not _like_rnncell(cell_bw):
     raise TypeError("cell_bw must be an instance of RNNCell")
   if not nest.is_sequence(inputs):
     raise TypeError("inputs must be a sequence")
diff --git a/tensorflow/contrib/rnn/python/ops/core_rnn_cell_impl.py b/tensorflow/contrib/rnn/python/ops/core_rnn_cell_impl.py
index 884b51926ebdedce5529d8760fcc09b132365ce8..51a12b2912774ad42417220db3b799cbb2b7a077 100644
--- a/tensorflow/contrib/rnn/python/ops/core_rnn_cell_impl.py
+++ b/tensorflow/contrib/rnn/python/ops/core_rnn_cell_impl.py
@@ -27,7 +27,6 @@ from __future__ import division
 from __future__ import print_function
 
 import collections
-import contextlib
 import hashlib
 import math
 import numbers
@@ -43,67 +42,25 @@ from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import nn_ops
 from tensorflow.python.ops import partitioned_variables
 from tensorflow.python.ops import random_ops
+from tensorflow.python.ops import rnn_cell_impl
 from tensorflow.python.ops import variable_scope as vs
 
 from tensorflow.python.ops.math_ops import sigmoid
 from tensorflow.python.ops.math_ops import tanh
-from tensorflow.python.ops.rnn_cell_impl import _RNNCell as RNNCell
 
 from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.util import nest
 
 
+# pylint: disable=protected-access
+RNNCell = rnn_cell_impl._RNNCell  # pylint: disable=invalid-name
+_like_rnncell = rnn_cell_impl._like_rnncell
+# pylint: enable=protected-access
+
 _BIAS_VARIABLE_NAME = "biases"
 _WEIGHTS_VARIABLE_NAME = "weights"
 
 
-@contextlib.contextmanager
-def _checked_scope(cell, scope, reuse=None, **kwargs):
-  if reuse is not None:
-    kwargs["reuse"] = reuse
-  with vs.variable_scope(scope, **kwargs) as checking_scope:
-    scope_name = checking_scope.name
-    if hasattr(cell, "_scope"):
-      cell_scope = cell._scope  # pylint: disable=protected-access
-      if cell_scope.name != checking_scope.name:
-        raise ValueError(
-            "Attempt to reuse RNNCell %s with a different variable scope than "
-            "its first use.  First use of cell was with scope '%s', this "
-            "attempt is with scope '%s'.  Please create a new instance of the "
-            "cell if you would like it to use a different set of weights.  "
-            "If before you were using: MultiRNNCell([%s(...)] * num_layers), "
-            "change to: MultiRNNCell([%s(...) for _ in range(num_layers)]).  "
-            "If before you were using the same cell instance as both the "
-            "forward and reverse cell of a bidirectional RNN, simply create "
-            "two instances (one for forward, one for reverse).  "
-            "In May 2017, we will start transitioning this cell's behavior "
-            "to use existing stored weights, if any, when it is called "
-            "with scope=None (which can lead to silent model degradation, so "
-            "this error will remain until then.)"
-            % (cell, cell_scope.name, scope_name, type(cell).__name__,
-               type(cell).__name__))
-    else:
-      weights_found = False
-      try:
-        with vs.variable_scope(checking_scope, reuse=True):
-          vs.get_variable(_WEIGHTS_VARIABLE_NAME)
-        weights_found = True
-      except ValueError:
-        pass
-      if weights_found and reuse is None:
-        raise ValueError(
-            "Attempt to have a second RNNCell use the weights of a variable "
-            "scope that already has weights: '%s'; and the cell was not "
-            "constructed as %s(..., reuse=True).  "
-            "To share the weights of an RNNCell, simply "
-            "reuse it in your second calculation, or create a new one with "
-            "the argument reuse=True." % (scope_name, type(cell).__name__))
-
-    # Everything is OK.  Update the cell's scope and yield it.
-    cell._scope = checking_scope  # pylint: disable=protected-access
-    yield checking_scope
-
-
 class BasicRNNCell(RNNCell):
   """The most basic RNN cell."""
 
@@ -131,12 +88,15 @@ class BasicRNNCell(RNNCell):
 class GRUCell(RNNCell):
   """Gated Recurrent Unit cell (cf. http://arxiv.org/abs/1406.1078)."""
 
-  def __init__(self, num_units, input_size=None, activation=tanh, reuse=None):
+  def __init__(self, num_units, input_size=None, activation=tanh, reuse=None,
+               kernel_initializer=None, bias_initializer=None):
     super(GRUCell, self).__init__(_reuse=reuse)
     if input_size is not None:
       logging.warn("%s: The input_size parameter is deprecated.", self)
     self._num_units = num_units
     self._activation = activation
+    self._kernel_initializer = kernel_initializer
+    self._bias_initializer = bias_initializer
 
   @property
   def state_size(self):
@@ -150,10 +110,16 @@ class GRUCell(RNNCell):
     """Gated recurrent unit (GRU) with nunits cells."""
     with vs.variable_scope("gates"):  # Reset gate and update gate.
       # We start with bias of 1.0 to not reset and not update.
-      value = sigmoid(_linear([inputs, state], 2 * self._num_units, True, 1.0))
+      bias_ones = self._bias_initializer
+      if self._bias_initializer is None:
+        dtype = [a.dtype for a in [inputs, state]][0]
+        bias_ones = init_ops.constant_initializer(1.0, dtype=dtype)
+      value = sigmoid(_linear([inputs, state], 2 * self._num_units, True,
+          bias_ones, self._kernel_initializer))
       r, u = array_ops.split(value=value, num_or_size_splits=2, axis=1)
     with vs.variable_scope("candidate"):
-      c = self._activation(_linear([inputs, r * state], self._num_units, True))
+      c = self._activation(_linear([inputs, r * state], self._num_units, True,
+          self._bias_initializer, self._kernel_initializer))
     new_h = u * state + (1 - u) * c
     return new_h, new_h
 
@@ -472,7 +438,7 @@ class OutputProjectionWrapper(RNNCell):
       ValueError: if output_size is not positive.
     """
     super(OutputProjectionWrapper, self).__init__(_reuse=reuse)
-    if not isinstance(cell, RNNCell):
+    if not _like_rnncell(cell):
       raise TypeError("The parameter cell is not RNNCell.")
     if output_size < 1:
       raise ValueError("Parameter output_size must be > 0: %d." % output_size)
@@ -528,7 +494,7 @@ class InputProjectionWrapper(RNNCell):
     super(InputProjectionWrapper, self).__init__(_reuse=reuse)
     if input_size is not None:
       logging.warn("%s: The input_size parameter is deprecated.", self)
-    if not isinstance(cell, RNNCell):
+    if not _like_rnncell(cell):
       raise TypeError("The parameter cell is not RNNCell.")
     self._cell = cell
     self._num_proj = num_proj
@@ -604,7 +570,7 @@ class DropoutWrapper(RNNCell):
       TypeError: if cell is not an RNNCell.
       ValueError: if any of the keep_probs are not between 0 and 1.
     """
-    if not isinstance(cell, RNNCell):
+    if not _like_rnncell(cell):
       raise TypeError("The parameter cell is not a RNNCell.")
     with ops.name_scope("DropoutWrapperInit"):
       def tensor_and_const_value(v):
@@ -839,7 +805,7 @@ class EmbeddingWrapper(RNNCell):
       ValueError: if embedding_classes is not positive.
     """
     super(EmbeddingWrapper, self).__init__(_reuse=reuse)
-    if not isinstance(cell, RNNCell):
+    if not _like_rnncell(cell):
       raise TypeError("The parameter cell is not RNNCell.")
     if embedding_classes <= 0 or embedding_size <= 0:
       raise ValueError("Both embedding_classes and embedding_size must be > 0: "
@@ -1011,14 +977,16 @@ class _SlimRNNCell(RNNCell):
     return output, state
 
 
-def _linear(args, output_size, bias, bias_start=0.0):
+def _linear(args, output_size, bias, bias_initializer=None,
+            kernel_initializer=None):
   """Linear map: sum_i(args[i] * W[i]), where W[i] is a variable.
 
   Args:
     args: a 2D Tensor or a list of 2D, batch x n, Tensors.
     output_size: int, second dimension of W[i].
     bias: boolean, whether to add a bias term or not.
-    bias_start: starting value to initialize the bias; 0 by default.
+    bias_initializer: starting value to initialize the bias; None by default.
+    kernel_initializer: starting value to initialize the weight; None by default.
 
   Returns:
     A 2D Tensor with shape [batch x output_size] equal to
@@ -1050,7 +1018,8 @@ def _linear(args, output_size, bias, bias_start=0.0):
   scope = vs.get_variable_scope()
   with vs.variable_scope(scope) as outer_scope:
     weights = vs.get_variable(
-        _WEIGHTS_VARIABLE_NAME, [total_arg_size, output_size], dtype=dtype)
+        _WEIGHTS_VARIABLE_NAME, [total_arg_size, output_size], dtype=dtype,
+        initializer=kernel_initializer)
     if len(args) == 1:
       res = math_ops.matmul(args[0], weights)
     else:
@@ -1059,8 +1028,10 @@ def _linear(args, output_size, bias, bias_start=0.0):
       return res
     with vs.variable_scope(outer_scope) as inner_scope:
       inner_scope.set_partitioner(None)
+      if bias_initializer is None:
+        bias_initializer = init_ops.constant_initializer(0.0, dtype=dtype)
       biases = vs.get_variable(
           _BIAS_VARIABLE_NAME, [output_size],
           dtype=dtype,
-          initializer=init_ops.constant_initializer(bias_start, dtype=dtype))
+          initializer=bias_initializer)
     return nn_ops.bias_add(res, biases)
diff --git a/tensorflow/contrib/rnn/python/ops/rnn_cell.py b/tensorflow/contrib/rnn/python/ops/rnn_cell.py
index 83e8c2777f6fdf1665370358d82e6f2cc8cf8e9d..217c379c36f0fc54bdf50b691ef6e615b3562cf0 100644
--- a/tensorflow/contrib/rnn/python/ops/rnn_cell.py
+++ b/tensorflow/contrib/rnn/python/ops/rnn_cell.py
@@ -34,14 +34,12 @@ from tensorflow.python.ops import init_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import nn_ops
 from tensorflow.python.ops import random_ops
+from tensorflow.python.ops import rnn_cell_impl
 from tensorflow.python.ops import variable_scope as vs
 from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.util import nest
 
 
-_checked_scope = core_rnn_cell_impl._checked_scope  # pylint: disable=protected-access
-
-
 def _get_concat_variable(name, shape, dtype, num_shards):
   """Get a sharded variable concatenated into one tensor."""
   sharded_variable = _get_sharded_variable(name, shape, dtype, num_shards)
@@ -1060,7 +1058,7 @@ class AttentionCellWrapper(core_rnn_cell.RNNCell):
           `state_is_tuple` is `False` or if attn_length is zero or less.
     """
     super(AttentionCellWrapper, self).__init__(_reuse=reuse)
-    if not isinstance(cell, core_rnn_cell.RNNCell):
+    if not rnn_cell_impl._like_rnncell(cell):  # pylint: disable=protected-access
       raise TypeError("The parameter cell is not RNNCell.")
     if nest.is_sequence(cell.state_size) and not state_is_tuple:
       raise ValueError("Cell returns tuple of states, but the flag "
@@ -1157,6 +1155,89 @@ class AttentionCellWrapper(core_rnn_cell.RNNCell):
       return new_attns, new_attn_states
 
 
+class HighwayWrapper(core_rnn_cell.RNNCell):
+  """RNNCell wrapper that adds highway connection on cell input and output.
+
+  Based on:
+    R. K. Srivastava, K. Greff, and J. Schmidhuber, "Highway networks",
+    arXiv preprint arXiv:1505.00387, 2015.
+    https://arxiv.org/abs/1505.00387
+  """
+
+  def __init__(self, cell,
+               couple_carry_transform_gates=True,
+               carry_bias_init=1.0):
+    """Constructs a `HighwayWrapper` for `cell`.
+
+    Args:
+      cell: An instance of `RNNCell`.
+      couple_carry_transform_gates: boolean, should the Carry and Transform gate
+        be coupled.
+      carry_bias_init: float, carry gates bias initialization.
+    """
+    self._cell = cell
+    self._couple_carry_transform_gates = couple_carry_transform_gates
+    self._carry_bias_init = carry_bias_init
+
+  @property
+  def state_size(self):
+    return self._cell.state_size
+
+  @property
+  def output_size(self):
+    return self._cell.output_size
+
+  def zero_state(self, batch_size, dtype):
+    with ops.name_scope(type(self).__name__ + "ZeroState", values=[batch_size]):
+      return self._cell.zero_state(batch_size, dtype)
+
+  def _highway(self, inp, out):
+    input_size = inp.get_shape().with_rank(2)[1].value
+    carry_weight = vs.get_variable("carry_w", [input_size, input_size])
+    carry_bias = vs.get_variable(
+        "carry_b", [input_size],
+        initializer=init_ops.constant_initializer(
+            self._carry_bias_init))
+    carry = math_ops.sigmoid(nn_ops.xw_plus_b(inp, carry_weight, carry_bias))
+    if self._couple_carry_transform_gates:
+      transform = 1 - carry
+    else:
+      transform_weight = vs.get_variable("transform_w",
+                                         [input_size, input_size])
+      transform_bias = vs.get_variable(
+          "transform_b", [input_size],
+          initializer=init_ops.constant_initializer(
+              -self._carry_bias_init))
+      transform = math_ops.sigmoid(nn_ops.xw_plus_b(inp,
+                                                    transform_weight,
+                                                    transform_bias))
+    return inp * carry + out * transform
+
+  def __call__(self, inputs, state, scope=None):
+    """Run the cell and add its inputs to its outputs.
+
+    Args:
+      inputs: cell inputs.
+      state: cell state.
+      scope: optional cell scope.
+
+    Returns:
+      Tuple of cell outputs and new state.
+
+    Raises:
+      TypeError: If cell inputs and outputs have different structure (type).
+      ValueError: If cell inputs and outputs have different structure (value).
+    """
+    outputs, new_state = self._cell(inputs, state, scope=scope)
+    nest.assert_same_structure(inputs, outputs)
+    # Ensure shapes match
+    def assert_shape_match(inp, out):
+      inp.get_shape().assert_is_compatible_with(out.get_shape())
+    nest.map_structure(assert_shape_match, inputs, outputs)
+    res_outputs = nest.map_structure(self._highway, inputs, outputs)
+    return (res_outputs, new_state)
+
+
 class LayerNormBasicLSTMCell(core_rnn_cell.RNNCell):
   """LSTM unit with layer normalization and recurrent dropout.
 
@@ -1843,3 +1924,178 @@ class PhasedLSTMCell(core_rnn_cell.RNNCell):
     new_state = core_rnn_cell.LSTMStateTuple(new_c, new_h)
 
     return new_h, new_state
+
+
+class GLSTMCell(core_rnn_cell.RNNCell):
+  """Group LSTM cell (G-LSTM).
+
+  The implementation is based on:
+
+    https://arxiv.org/abs/1703.10722
+
+  O. Kuchaiev and B. Ginsburg
+  "Factorization Tricks for LSTM Networks", ICLR 2017 workshop.
+  """
+
+  def __init__(self, num_units, initializer=None, num_proj=None,
+               number_of_groups=1, forget_bias=1.0, activation=math_ops.tanh,
+               reuse=None):
+    """Initialize the parameters of G-LSTM cell.
+
+    Args:
+      num_units: int, The number of units in the G-LSTM cell
+      initializer: (optional) The initializer to use for the weight and
+        projection matrices.
+      num_proj: (optional) int, The output dimensionality for the projection
+        matrices.  If None, no projection is performed.
+      number_of_groups: (optional) int, number of groups to use.
+        If `number_of_groups` is 1, then it should be equivalent to LSTM cell
+      forget_bias: Biases of the forget gate are initialized by default to 1
+        in order to reduce the scale of forgetting at the beginning of
+        the training.
+      activation: Activation function of the inner states.
+      reuse: (optional) Python boolean describing whether to reuse variables
+        in an existing scope.  If not `True`, and the existing scope already
+        has the given variables, an error is raised.
+
+    Raises:
+      ValueError: If `num_units` or `num_proj` is not divisible by 
+        `number_of_groups`.
+    """
+    super(GLSTMCell, self).__init__(_reuse=reuse)
+    self._num_units = num_units
+    self._initializer = initializer
+    self._num_proj = num_proj
+    self._forget_bias = forget_bias
+    self._activation = activation
+    self._number_of_groups = number_of_groups
+
+    if self._num_units % self._number_of_groups != 0:
+      raise ValueError("num_units must be divisible by number_of_groups")
+    if self._num_proj:
+      if self._num_proj % self._number_of_groups != 0:
+        raise ValueError("num_proj must be divisible by number_of_groups")
+      self._group_shape = [int(self._num_proj / self._number_of_groups),
+                           int(self._num_units / self._number_of_groups)]
+    else:
+      self._group_shape = [int(self._num_units / self._number_of_groups),
+                           int(self._num_units / self._number_of_groups)]
+
+    if num_proj:
+      self._state_size = core_rnn_cell.LSTMStateTuple(num_units, num_proj)
+      self._output_size = num_proj
+    else:
+      self._state_size = core_rnn_cell.LSTMStateTuple(num_units, num_units)
+      self._output_size = num_units
+
+  @property
+  def state_size(self):
+    return self._state_size
+
+  @property
+  def output_size(self):
+    return self._output_size
+
+  def _get_input_for_group(self, inputs, group_id, group_size):
+    """Slices inputs into groups to prepare for processing by cell's groups
+
+    Args:
+      inputs: cell input or it's previous state,
+              a Tensor, 2D, [batch x num_units]
+      group_id: group id, a Scalar, for which to prepare input
+      group_size: size of the group
+
+    Returns:
+      subset of inputs corresponding to group "group_id",
+      a Tensor, 2D, [batch x num_units/number_of_groups]
+    """
+    return array_ops.slice(input_=inputs,
+                           begin=[0, group_id * group_size],
+                           size=[self._batch_size, group_size],
+                           name=("GLSTM_group%d_input_generation" % group_id))
+
+  def call(self, inputs, state):
+    """Run one step of G-LSTM.
+
+    Args:
+      inputs: input Tensor, 2D, [batch x num_units].
+      state: this must be a tuple of state Tensors, both `2-D`,
+      with column sizes `c_state` and `m_state`.
+
+    Returns:
+      A tuple containing:
+
+      - A `2-D, [batch x output_dim]`, Tensor representing the output of the
+        G-LSTM after reading `inputs` when previous state was `state`.
+        Here output_dim is:
+           num_proj if num_proj was set,
+           num_units otherwise.
+      - LSTMStateTuple representing the new state of G-LSTM  cell
+        after reading `inputs` when the previous state was `state`.
+
+    Raises:
+      ValueError: If input size cannot be inferred from inputs via
+        static shape inference.
+    """
+    (c_prev, m_prev) = state
+
+    self._batch_size = inputs.shape[0].value or array_ops.shape(inputs)[0]
+    dtype = inputs.dtype
+    scope = vs.get_variable_scope()
+    with vs.variable_scope(scope, initializer=self._initializer):
+      i_parts = []
+      j_parts = []
+      f_parts = []
+      o_parts = []
+
+      for group_id in range(self._number_of_groups):
+        with vs.variable_scope("group%d" % group_id):
+          x_g_id = array_ops.concat(
+            [self._get_input_for_group(inputs, group_id,
+                                       self._group_shape[0]),
+             self._get_input_for_group(m_prev, group_id,
+                                       self._group_shape[0])], axis=1)
+          R_k = _linear(x_g_id, 4 * self._group_shape[1], bias=False)
+          i_k, j_k, f_k, o_k = array_ops.split(R_k, 4, 1)
+
+        i_parts.append(i_k)
+        j_parts.append(j_k)
+        f_parts.append(f_k)
+        o_parts.append(o_k)
+
+      bi = vs.get_variable(name="bias_i",
+                           shape=[self._num_units],
+                           dtype=dtype,
+                           initializer=
+                           init_ops.constant_initializer(0.0, dtype=dtype))
+      bj = vs.get_variable(name="bias_j",
+                           shape=[self._num_units],
+                           dtype=dtype,
+                           initializer=
+                           init_ops.constant_initializer(0.0, dtype=dtype))
+      bf = vs.get_variable(name="bias_f",
+                           shape=[self._num_units],
+                           dtype=dtype,
+                           initializer=
+                           init_ops.constant_initializer(0.0, dtype=dtype))
+      bo = vs.get_variable(name="bias_o",
+                           shape=[self._num_units],
+                           dtype=dtype,
+                           initializer=
+                           init_ops.constant_initializer(0.0, dtype=dtype))
+
+      i = nn_ops.bias_add(array_ops.concat(i_parts, axis=1), bi)
+      j = nn_ops.bias_add(array_ops.concat(j_parts, axis=1), bj)
+      f = nn_ops.bias_add(array_ops.concat(f_parts, axis=1), bf)
+      o = nn_ops.bias_add(array_ops.concat(o_parts, axis=1), bo)
+
+    c = (math_ops.sigmoid(f + self._forget_bias) * c_prev +
+         math_ops.sigmoid(i) * math_ops.tanh(j))
+    m = math_ops.sigmoid(o) * self._activation(c)
+
+    if self._num_proj is not None:
+      with vs.variable_scope("projection"):
+        m = _linear(m, self._num_proj, bias=False)
+
+    new_state = core_rnn_cell.LSTMStateTuple(c, m)
+    return m, new_state
diff --git a/tensorflow/contrib/rnn/python/tools/checkpoint_convert.py b/tensorflow/contrib/rnn/python/tools/checkpoint_convert.py
new file mode 100644
index 0000000000000000000000000000000000000000..1e29114b0cc5da1e6fe7495cd6aaec4c13d9bd85
--- /dev/null
+++ b/tensorflow/contrib/rnn/python/tools/checkpoint_convert.py
@@ -0,0 +1,231 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+r"""Convert checkpoints using RNNCells to new name convention.
+
+Usage:
+
+  python checkpoint_convert [--write_v1_checkpoint] \
+      '/path/to/checkpoint' '/path/to/new_checkpoint'
+"""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import argparse
+import collections
+import re
+import sys
+
+from tensorflow.core.protobuf import saver_pb2
+from tensorflow.python import pywrap_tensorflow
+from tensorflow.python.client import session
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import variables
+from tensorflow.python.platform import app
+from tensorflow.python.platform import tf_logging as logging
+from tensorflow.python.training import saver as saver_lib
+
+_RNN_NAME_REPLACEMENTS = collections.OrderedDict([
+    ############################################################################
+    # contrib/rnn/python/ops/core_rnn_cell_impl.py
+    # BasicRNNCell
+    ('basic_rnn_cell/weights', 'basic_rnn_cell/kernel'),
+    ('basic_rnn_cell/biases', 'basic_rnn_cell/bias'),
+    # GRUCell
+    ('gru_cell/weights', 'gru_cell/kernel'),
+    ('gru_cell/biases', 'gru_cell/bias'),
+    ('gru_cell/gates/weights', 'gru_cell/gates/kernel'),
+    ('gru_cell/gates/biases', 'gru_cell/gates/bias'),
+    ('gru_cell/candidate/weights', 'gru_cell/candidate/kernel'),
+    ('gru_cell/candidate/biases', 'gru_cell/candidate/bias'),
+    # BasicLSTMCell
+    ('basic_lstm_cell/weights', 'basic_lstm_cell/kernel'),
+    ('basic_lstm_cell/biases', 'basic_lstm_cell/bias'),
+    # LSTMCell
+    ('lstm_cell/weights', 'lstm_cell/kernel'),
+    ('lstm_cell/biases', 'lstm_cell/bias'),
+    ('lstm_cell/projection/weights', 'lstm_cell/projection/kernel'),
+    ('lstm_cell/projection/biases', 'lstm_cell/projection/bias'),
+    # OutputProjectionWrapper
+    ('output_projection_wrapper/weights', 'output_projection_wrapper/kernel'),
+    ('output_projection_wrapper/biases', 'output_projection_wrapper/bias'),
+    # InputProjectionWrapper
+    ('input_projection_wrapper/weights', 'input_projection_wrapper/kernel'),
+    ('input_projection_wrapper/biases', 'input_projection_wrapper/bias'),
+    ############################################################################
+    # contrib/rnn/python/ops/lstm_ops.py
+    # LSTMBlockFusedCell ??
+    ('lstm_block_wrapper/weights', 'lstm_block_wrapper/kernel'),
+    ('lstm_block_wrapper/biases', 'lstm_block_wrapper/bias'),
+    ############################################################################
+    # contrib/rnn/python/ops/rnn_cell.py
+    # LayerNormBasicLSTMCell
+    ('layer_norm_basic_lstm_cell/weights', 'layer_norm_basic_lstm_cell/kernel'),
+    ('layer_norm_basic_lstm_cell/biases', 'layer_norm_basic_lstm_cell/bias'),
+    # UGRNNCell, not found in g3, but still need it?
+    ('ugrnn_cell/weights', 'ugrnn_cell/kernel'),
+    ('ugrnn_cell/biases', 'ugrnn_cell/bias'),
+    # NASCell
+    ('nas_rnn/weights', 'nas_rnn/kernel'),
+    ('nas_rnn/recurrent_weights', 'nas_rnn/recurrent_kernel'),
+    # IntersectionRNNCell
+    ('intersection_rnn_cell/weights', 'intersection_rnn_cell/kernel'),
+    ('intersection_rnn_cell/biases', 'intersection_rnn_cell/bias'),
+    ('intersection_rnn_cell/in_projection/weights',
+     'intersection_rnn_cell/in_projection/kernel'),
+    ('intersection_rnn_cell/in_projection/biases',
+     'intersection_rnn_cell/in_projection/bias'),
+    # PhasedLSTMCell
+    ('phased_lstm_cell/mask_gates/weights',
+     'phased_lstm_cell/mask_gates/kernel'),
+    ('phased_lstm_cell/mask_gates/biases', 'phased_lstm_cell/mask_gates/bias'),
+    ('phased_lstm_cell/new_input/weights', 'phased_lstm_cell/new_input/kernel'),
+    ('phased_lstm_cell/new_input/biases', 'phased_lstm_cell/new_input/bias'),
+    ('phased_lstm_cell/output_gate/weights',
+     'phased_lstm_cell/output_gate/kernel'),
+    ('phased_lstm_cell/output_gate/biases',
+     'phased_lstm_cell/output_gate/bias'),
+    # AttentionCellWrapper
+    ('attention_cell_wrapper/weights', 'attention_cell_wrapper/kernel'),
+    ('attention_cell_wrapper/biases', 'attention_cell_wrapper/bias'),
+    ('attention_cell_wrapper/attn_output_projection/weights',
+     'attention_cell_wrapper/attn_output_projection/kernel'),
+    ('attention_cell_wrapper/attn_output_projection/biases',
+     'attention_cell_wrapper/attn_output_projection/bias'),
+    ('attention_cell_wrapper/attention/weights',
+     'attention_cell_wrapper/attention/kernel'),
+    ('attention_cell_wrapper/attention/biases',
+     'attention_cell_wrapper/attention/bias'),
+])
+
+_RNN_SHARDED_NAME_REPLACEMENTS = collections.OrderedDict([
+    ('LSTMCell/W_', 'lstm_cell/weights/part_'),
+    ('BasicLSTMCell/Linear/Matrix_', 'basic_lstm_cell/weights/part_'),
+    ('GRUCell/W_', 'gru_cell/weights/part_'),
+    ('MultiRNNCell/Cell', 'multi_rnn_cell/cell_'),
+])
+
+
+def _rnn_name_replacement(var_name):
+  for pattern in _RNN_NAME_REPLACEMENTS:
+    if pattern in var_name:
+      old_var_name = var_name
+      var_name = var_name.replace(pattern, _RNN_NAME_REPLACEMENTS[pattern])
+      logging.info('Converted: %s --> %s' % (old_var_name, var_name))
+      break
+  return var_name
+
+
+def _rnn_name_replacement_sharded(var_name):
+  for pattern in _RNN_SHARDED_NAME_REPLACEMENTS:
+    if pattern in var_name:
+      old_var_name = var_name
+      var_name = var_name.replace(pattern,
+                                  _RNN_SHARDED_NAME_REPLACEMENTS[pattern])
+      logging.info('Converted: %s --> %s' % (old_var_name, var_name))
+  return var_name
+
+
+def _split_sharded_vars(name_shape_map):
+  """Split shareded variables.
+
+  Args:
+    name_shape_map: A dict from variable name to variable shape.
+
+  Returns:
+    not_sharded: Names of the non-sharded variables.
+    sharded: Names of the sharded varibales.
+  """
+  sharded = []
+  not_sharded = []
+  for name in name_shape_map:
+    if re.match(name, '_[0-9]+$'):
+      if re.sub('_[0-9]+$', '_1', name) in name_shape_map:
+        sharded.append(name)
+      else:
+        not_sharded.append(name)
+    else:
+      not_sharded.append(name)
+  return not_sharded, sharded
+
+
+def convert_names(checkpoint_from_path,
+                  checkpoint_to_path,
+                  write_v1_checkpoint=False):
+  """Migrates the names of variables within a checkpoint.
+
+  Args:
+    checkpoint_from_path: Path to source checkpoint to be read in.
+    checkpoint_to_path: Path to checkpoint to be written out.
+    write_v1_checkpoint: Whether the output checkpoint will be in V1 format.
+
+  Returns:
+    A dictionary that maps the new variable names to the Variable objects.
+    A dictionary that maps the old variable names to the new variable names.
+  """
+  with ops.Graph().as_default():
+    logging.info('Reading checkpoint_from_path %s' % checkpoint_from_path)
+    reader = pywrap_tensorflow.NewCheckpointReader(checkpoint_from_path)
+    name_shape_map = reader.get_variable_to_shape_map()
+    not_sharded, sharded = _split_sharded_vars(name_shape_map)
+    new_variable_map = {}
+    conversion_map = {}
+    for var_name in not_sharded:
+      new_var_name = _rnn_name_replacement(var_name)
+      tensor = reader.get_tensor(var_name)
+      var = variables.Variable(tensor, name=var_name)
+      new_variable_map[new_var_name] = var
+      if new_var_name != var_name:
+        conversion_map[var_name] = new_var_name
+    for var_name in sharded:
+      new_var_name = _rnn_name_replacement_sharded(var_name)
+      var = variables.Variable(tensor, name=var_name)
+      new_variable_map[new_var_name] = var
+      if new_var_name != var_name:
+        conversion_map[var_name] = new_var_name
+
+    write_version = (saver_pb2.SaverDef.V1
+                     if write_v1_checkpoint else saver_pb2.SaverDef.V2)
+    saver = saver_lib.Saver(new_variable_map, write_version=write_version)
+
+    with session.Session() as sess:
+      sess.run(variables.global_variables_initializer())
+      logging.info('Writing checkpoint_to_path %s' % checkpoint_to_path)
+      saver.save(sess, checkpoint_to_path)
+
+  logging.info('Summary:')
+  logging.info('  Converted %d variable name(s).' % len(new_variable_map))
+  return new_variable_map, conversion_map
+
+
+def main(_):
+  convert_names(
+      FLAGS.checkpoint_from_path,
+      FLAGS.checkpoint_to_path,
+      write_v1_checkpoint=FLAGS.write_v1_checkpoint)
+
+
+if __name__ == '__main__':
+  parser = argparse.ArgumentParser()
+  parser.register('type', 'bool', lambda v: v.lower() == 'true')
+  parser.add_argument('checkpoint_from_path', type=str,
+                      help='Path to source checkpoint to be read in.')
+  parser.add_argument('checkpoint_to_path', type=str,
+                      help='Path to checkpoint to be written out.')
+  parser.add_argument('--write_v1_checkpoint', action='store_true',
+                      help='Write v1 checkpoint')
+  FLAGS, unparsed = parser.parse_known_args()
+
+  app.run(main=main, argv=[sys.argv[0]] + unparsed)
diff --git a/tensorflow/contrib/rnn/python/tools/checkpoint_convert_test.py b/tensorflow/contrib/rnn/python/tools/checkpoint_convert_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..e2fc2fa80eacc853c75e8a6019976c2583edc0f5
--- /dev/null
+++ b/tensorflow/contrib/rnn/python/tools/checkpoint_convert_test.py
@@ -0,0 +1,108 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Unit tests for checkpoint converter."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import glob
+import os
+import tempfile
+
+from tensorflow.contrib.rnn.python.tools import checkpoint_convert
+from tensorflow.python.client import session
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import variables
+from tensorflow.python.platform import test
+from tensorflow.python.training import saver as saver_lib
+
+
+class CheckpointConvertTest(test.TestCase):
+
+  def setUp(self):
+    self._old_ckpt_path = tempfile.mktemp()
+    self._new_ckpt_path = tempfile.mktemp()
+    ops.reset_default_graph()
+
+  def tearDown(self):
+    for file_name in glob.glob(self._old_ckpt_path + "*"):
+      os.remove(file_name)
+    for file_name in glob.glob(self._new_ckpt_path + "*"):
+      os.remove(file_name)
+
+  def testReplacementDictsContainUniqueAndNonEmptyVariableNames(self):
+    for old_name in checkpoint_convert._RNN_NAME_REPLACEMENTS:
+      new_name = checkpoint_convert._RNN_NAME_REPLACEMENTS[old_name]
+      self.assertTrue(old_name)
+      self.assertTrue(new_name)
+      self.assertNotEqual(old_name, new_name)
+    for old_name in checkpoint_convert._RNN_SHARDED_NAME_REPLACEMENTS:
+      new_name = checkpoint_convert._RNN_SHARDED_NAME_REPLACEMENTS[old_name]
+      self.assertTrue(old_name)
+      self.assertTrue(new_name)
+      self.assertNotEqual(old_name, new_name)
+
+  def testConversionFromV2WithConvertedVariableNamesSucceeds(self):
+    variables.Variable(10.0, name="a")
+    for old_name in checkpoint_convert._RNN_NAME_REPLACEMENTS:
+      variables.Variable(20.0, name=old_name)
+    with session.Session() as sess:
+      saver = saver_lib.Saver()
+      sess.run(variables.global_variables_initializer())
+      saver.save(sess, self._old_ckpt_path)
+
+    new_var_map, conversion_map = checkpoint_convert.convert_names(
+        self._old_ckpt_path, self._new_ckpt_path)
+    self.assertTrue(glob.glob(self._new_ckpt_path + "*"))
+    self.assertItemsEqual(
+        ["a"] + list(checkpoint_convert._RNN_NAME_REPLACEMENTS.values()),
+        new_var_map.keys())
+    self.assertEqual(checkpoint_convert._RNN_NAME_REPLACEMENTS, conversion_map)
+
+  def testConversionFromV2WithoutConvertedVariableNamesSucceeds(self):
+    variables.Variable(10.0, name="a")
+    with session.Session() as sess:
+      saver = saver_lib.Saver()
+      sess.run(variables.global_variables_initializer())
+      saver.save(sess, self._old_ckpt_path)
+
+    new_var_map, conversion_map = checkpoint_convert.convert_names(
+        self._old_ckpt_path, self._new_ckpt_path)
+    self.assertItemsEqual(["a"], new_var_map.keys())
+    self.assertFalse(conversion_map)
+
+  def testConversionToV1Succeeds(self):
+    variables.Variable(10.0, name="a")
+    variables.Variable(
+        20.0, name=list(checkpoint_convert._RNN_NAME_REPLACEMENTS.keys())[-1])
+
+    with session.Session() as sess:
+      saver = saver_lib.Saver()
+      sess.run(variables.global_variables_initializer())
+      saver.save(sess, self._old_ckpt_path)
+
+    new_var_map, conversion_map = checkpoint_convert.convert_names(
+        self._old_ckpt_path, self._new_ckpt_path, write_v1_checkpoint=True)
+    self.assertItemsEqual(
+        ["a", list(checkpoint_convert._RNN_NAME_REPLACEMENTS.values())[-1]],
+        new_var_map.keys())
+    self.assertEqual(
+        {list(checkpoint_convert._RNN_NAME_REPLACEMENTS.keys())[-1]:
+         list(checkpoint_convert._RNN_NAME_REPLACEMENTS.values())[-1]},
+        conversion_map)
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/contrib/seq2seq/__init__.py b/tensorflow/contrib/seq2seq/__init__.py
index dd497197e34fda21125c1c08e840c878babf2172..dc159b93a3781cb2cf90eb99a0a9d9e1aecf573b 100644
--- a/tensorflow/contrib/seq2seq/__init__.py
+++ b/tensorflow/contrib/seq2seq/__init__.py
@@ -16,36 +16,6 @@
 """Ops for building neural network seq2seq decoders and losses.
 
 See the @{$python/contrib.seq2seq} guide.
-
-@@Decoder
-@@dynamic_decode
-
-@@BasicDecoderOutput
-@@BasicDecoder
-
-@@BeamSearchDecoderOutput
-@@BeamSearchDecoderState
-@@BeamSearchDecoder
-@@FinalBeamSearchDecoderOutput
-
-@@Helper
-@@CustomHelper
-@@GreedyEmbeddingHelper
-@@ScheduledEmbeddingTrainingHelper
-@@ScheduledOutputTrainingHelper
-@@TrainingHelper
-
-@@BahdanauAttention
-@@LuongAttention
-
-@@hardmax
-
-@@AttentionWrapperState
-@@AttentionWrapper
-
-@@gather_tree
-
-@@tile_batch
 """
 
 from __future__ import absolute_import
@@ -63,6 +33,30 @@ from tensorflow.contrib.seq2seq.python.ops.loss import *
 from tensorflow.python.util.all_util import remove_undocumented
 # pylint: enable=unused-import,widcard-import,line-too-long
 
-_allowed_symbols = ["sequence_loss"]
+_allowed_symbols = [
+    "sequence_loss",
+    "Decoder",
+    "dynamic_decode",
+    "BasicDecoder",
+    "BasicDecoderOutput",
+    "BeamSearchDecoder",
+    "BeamSearchDecoderOutput",
+    "BeamSearchDecoderState",
+    "Helper",
+    "CustomHelper",
+    "FinalBeamSearchDecoderOutput",
+    "gather_tree",
+    "GreedyEmbeddingHelper",
+    "ScheduledEmbeddingTrainingHelper",
+    "ScheduledOutputTrainingHelper",
+    "TrainingHelper",
+    "BahdanauAttention",
+    "LuongAttention",
+    "hardmax",
+    "AttentionWrapperState",
+    "AttentionWrapper",
+    "AttentionMechanism",
+    "tile_batch"]
+
 
 remove_undocumented(__name__, _allowed_symbols)
diff --git a/tensorflow/contrib/seq2seq/python/kernel_tests/attention_wrapper_test.py b/tensorflow/contrib/seq2seq/python/kernel_tests/attention_wrapper_test.py
index 40b50338adc17fbaa48d09e0f8be4c01fd302c55..b8b420e10a7e3d6c5cb5d4d78aecbf754ab638fc 100644
--- a/tensorflow/contrib/seq2seq/python/kernel_tests/attention_wrapper_test.py
+++ b/tensorflow/contrib/seq2seq/python/kernel_tests/attention_wrapper_test.py
@@ -19,7 +19,7 @@ from __future__ import division
 from __future__ import print_function
 # pylint: enable=unused-import
 
-import sys
+import collections
 import functools
 
 import numpy as np
@@ -46,15 +46,27 @@ BasicDecoderOutput = basic_decoder.BasicDecoderOutput  # pylint: disable=invalid
 float32 = np.float32
 int32 = np.int32
 array = np.array
+dtype = np.dtype
+
+
+class ResultSummary(
+    collections.namedtuple('ResultSummary', ('shape', 'dtype', 'mean'))):
+  pass
+
+
+def get_result_summary(x):
+  if isinstance(x, np.ndarray):
+    return ResultSummary(x.shape, x.dtype, x.mean())
+  return x
 
 
 class AttentionWrapperTest(test.TestCase):
 
-  def assertAllClose(self, *args, **kwargs):
-    kwargs["atol"] = 1e-4  # For GPU tests
-    kwargs["rtol"] = 1e-4  # For GPU tests
-    return super(AttentionWrapperTest, self).assertAllClose(
-        *args, **kwargs)
+  def assertAllCloseOrEqual(self, x, y, **kwargs):
+    if isinstance(x, np.ndarray) or isinstance(x, float):
+      return super(AttentionWrapperTest, self).assertAllClose(x, y, **kwargs)
+    else:
+      self.assertAllEqual(x, y, **kwargs)
 
   def testAttentionWrapperState(self):
     num_fields = len(wrapper.AttentionWrapperState._fields)  # pylint: disable=protected-access
@@ -71,7 +83,7 @@ class AttentionWrapperTest(test.TestCase):
                          alignment_history=False,
                          expected_final_alignment_history=None,
                          attention_layer_size=6,
-                         name=""):
+                         name=''):
     encoder_sequence_length = [3, 2, 3, 1, 0]
     decoder_sequence_length = [2, 0, 1, 2, 3]
     batch_size = 5
@@ -98,7 +110,7 @@ class AttentionWrapperTest(test.TestCase):
 
     with self.test_session(use_gpu=True) as sess:
       with vs.variable_scope(
-          "root",
+          'root',
           initializer=init_ops.random_normal_initializer(stddev=0.01, seed=3)):
         cell = core_rnn_cell.LSTMCell(cell_depth)
         cell = wrapper.AttentionWrapper(
@@ -147,192 +159,53 @@ class AttentionWrapperTest(test.TestCase):
 
       sess.run(variables.global_variables_initializer())
       sess_results = sess.run({
-          "final_outputs": final_outputs,
-          "final_state": final_state,
-          "state_alignment_history": state_alignment_history,
+          'final_outputs': final_outputs,
+          'final_state': final_state,
+          'state_alignment_history': state_alignment_history,
       })
 
-      print("Copy/paste (%s)\nexpected_final_output = " % name,
-            sess_results["final_outputs"])
-      sys.stdout.flush()
-      print("Copy/paste (%s)\nexpected_final_state = " % name,
-            sess_results["final_state"])
-      sys.stdout.flush()
-      print("Copy/paste (%s)\nexpected_final_alignment_history = " % name,
-            np.asarray(sess_results["state_alignment_history"]))
-      sys.stdout.flush()
-      nest.map_structure(self.assertAllClose, expected_final_output,
-                         sess_results["final_outputs"])
-      nest.map_structure(self.assertAllClose, expected_final_state,
-                         sess_results["final_state"])
+      final_output_info = nest.map_structure(get_result_summary,
+                                             sess_results['final_outputs'])
+      final_state_info = nest.map_structure(get_result_summary,
+                                            sess_results['final_state'])
+      print('Copy/paste:\nexpected_final_output = %s' % str(final_output_info))
+      print('expected_final_state = %s' % str(final_state_info))
+      nest.map_structure(self.assertAllCloseOrEqual, expected_final_output,
+                         final_output_info)
+      nest.map_structure(self.assertAllCloseOrEqual, expected_final_state,
+                         final_state_info)
       if alignment_history:  # by default, the wrapper emits attention as output
-        self.assertAllClose(
+        final_alignment_history_info = nest.map_structure(
+            get_result_summary, sess_results['state_alignment_history'])
+        print('expected_final_alignment_history = %s' %
+              str(final_alignment_history_info))
+        nest.map_structure(
+            self.assertAllCloseOrEqual,
             # outputs are batch major but the stacked TensorArray is time major
-            sess_results["state_alignment_history"],
-            expected_final_alignment_history)
+            expected_final_alignment_history,
+            final_alignment_history_info)
 
   def testBahdanauNotNormalized(self):
     create_attention_mechanism = wrapper.BahdanauAttention
 
     expected_final_output = BasicDecoderOutput(
-        rnn_output=array(
-            [[[
-                2.04633363e-03, 1.89259532e-03, 2.09550979e-03, -3.81628517e-03,
-                -4.36160620e-03, -6.43933658e-03
-            ], [
-                2.41885195e-03, 2.02089013e-03, 2.05879519e-03, -3.85483308e-03,
-                -3.51473060e-03, -6.14458136e-03
-            ], [
-                2.02294230e-03, 2.06955452e-03, 2.34797411e-03, -3.62816593e-03,
-                -3.80352931e-03, -6.27150526e-03
-            ]], [[
-                4.89025004e-03, -1.97221269e-03, 3.34283570e-03,
-                -2.79326970e-03, 3.63148772e-03, -4.79645561e-03
-            ], [
-                5.13446378e-03, -2.03941623e-03, 3.51774949e-03,
-                -2.83448119e-03, 3.14159272e-03, -5.31486655e-03
-            ], [
-                5.20701287e-03, -2.21262546e-03, 3.58187454e-03,
-                -2.85831164e-03, 3.20822699e-03, -5.20829484e-03
-            ]], [[
-                -1.34046993e-03, -9.99792013e-04, -2.11631414e-03,
-                -1.85202830e-03, -5.26227616e-03, -9.08544939e-03
-            ], [
-                -1.35486713e-03, -1.04408595e-03, -1.96779310e-03,
-                -1.80004584e-03, -5.61304903e-03, -9.34211537e-03
-            ], [
-                -1.12452905e-03, -7.68281636e-04, -1.99770415e-03,
-                -1.88058324e-03, -5.01882844e-03, -9.32228006e-03
-            ]], [[
-                1.52967637e-03, -3.97213362e-03, -9.64699371e-04,
-                8.51419638e-04, -1.29806029e-03, 6.56482670e-03
-            ], [
-                1.22562144e-03, -4.56351135e-03, -1.08190742e-03,
-                8.27267300e-04, -2.10060296e-03, 6.43097097e-03
-            ], [
-                9.93521884e-04, -4.37386986e-03, -1.41534151e-03,
-                6.44790183e-04, -2.16482091e-03, 6.68301852e-03
-            ]], [[
-                -3.78854020e-04, 5.62231544e-05, 1.06837302e-04, 1.87137164e-04,
-                -1.56512906e-04, 9.63474595e-05
-            ], [
-                -1.04306288e-04, -1.37411975e-04, 2.82689070e-05,
-                6.56487318e-05, -1.48634164e-04, -1.84347919e-05
-            ], [
-                1.24452345e-04, 2.20821079e-04, 4.07114130e-04, 2.18028668e-04,
-                2.73401442e-04, -2.69805576e-04
-            ]]],
-            dtype=float32),
-        sample_id=array(
-            [[2, 0, 2], [0, 0, 0], [1, 1, 1], [5, 5, 5], [3, 3, 2]],
-            dtype=int32))
-
+        rnn_output=ResultSummary(
+            shape=(5, 3, 6), dtype=dtype('float32'), mean=-0.00083043973),
+        sample_id=ResultSummary(shape=(5, 3), dtype=dtype('int32'), mean=2.0))
     expected_final_state = AttentionWrapperState(
         cell_state=LSTMStateTuple(
-            c=array(
-                [[
-                    -2.18977481e-02, -8.04181397e-03, -1.48273818e-03,
-                    1.61075518e-02, -1.37986457e-02, -7.57964421e-03,
-                    -8.28644261e-03, -1.18742418e-02, 1.78838037e-02
-                ], [
-                    1.74201727e-02, -1.41931782e-02, -3.88098788e-03,
-                    3.19711640e-02, -3.54694054e-02, -2.14694049e-02,
-                    -6.21706853e-03, -1.69323490e-03, -1.94494929e-02
-                ], [
-                    -1.14532551e-02, 8.77828151e-03, -1.62972715e-02,
-                    -1.39963031e-02, 1.34832524e-02, -1.04488730e-02,
-                    6.16201758e-03, -9.41041857e-03, -6.57599326e-03
-                ], [
-                    -4.74753827e-02, -1.19123599e-02, -7.40140676e-05,
-                    4.10552323e-02, -1.36711076e-03, 2.11795494e-02,
-                    -2.80460101e-02, -5.44509329e-02, -2.91906092e-02
-                ], [
-                    2.25644894e-02, -1.40382675e-03, 1.92396250e-02,
-                    5.49034867e-03, -1.27930511e-02, -3.15603940e-03,
-                    -5.05525898e-03, 2.19191350e-02, 1.62497871e-02
-                ]],
-                dtype=float32),
-            h=array(
-                [[
-                    -1.09847616e-02, -3.97357112e-03, -7.54502777e-04,
-                    7.91223347e-03, -7.02199014e-03, -3.80705344e-03,
-                    -4.22102772e-03, -6.05491130e-03, 8.92073940e-03
-                ], [
-                    8.68115202e-03, -7.16950046e-03, -1.88387593e-03,
-                    1.62680726e-02, -1.76830068e-02, -1.06620435e-02,
-                    -3.07523785e-03, -8.46023730e-04, -9.99386702e-03
-                ], [
-                    -5.71225956e-03, 4.50055022e-03, -8.07653368e-03,
-                    -6.94842264e-03, 6.75687613e-03, -5.12083014e-03,
-                    3.06244940e-03, -4.61752573e-03, -3.23935854e-03
-                ], [
-                    -2.37231534e-02, -5.88526297e-03, -3.72226204e-05,
-                    2.01789513e-02, -6.75848918e-04, 1.06686372e-02,
-                    -1.42624676e-02, -2.69628745e-02, -1.45034352e-02
-                ], [
-                    1.12585640e-02, -6.92534202e-04, 9.88917705e-03,
-                    2.75237625e-03, -6.56115822e-03, -1.57997780e-03,
-                    -2.54477374e-03, 1.11598391e-02, 7.94144534e-03
-                ]],
-                dtype=float32)),
-        attention=array(
-            [[
-                0.00202294, 0.00206955, 0.00234797, -0.00362817, -0.00380353,
-                -0.00627151
-            ], [
-                0.00520701, -0.00221263, 0.00358187, -0.00285831, 0.00320823,
-                -0.00520829
-            ], [
-                -0.00112453, -0.00076828, -0.0019977, -0.00188058, -0.00501883,
-                -0.00932228
-            ], [
-                0.00099352, -0.00437387, -0.00141534, 0.00064479, -0.00216482,
-                0.00668302
-            ], [
-                0.00012445, 0.00022082, 0.00040711, 0.00021803, 0.0002734,
-                -0.00026981
-            ]],
-            dtype=float32),
+            c=ResultSummary(
+                shape=(5, 9), dtype=dtype('float32'), mean=-0.0039763632),
+            h=ResultSummary(
+                shape=(5, 9), dtype=dtype('float32'), mean=-0.0019849765)),
+        attention=ResultSummary(
+            shape=(5, 6), dtype=dtype('float32'), mean=-0.00081052497),
         time=3,
+        alignments=ResultSummary(
+            shape=(5, 8), dtype=dtype('float32'), mean=0.125),
         alignment_history=())
-
-    expected_final_alignment_history = [[[
-        0.12586178, 0.12272788, 0.1271652, 0.12484902, 0.12484902, 0.12484902,
-        0.12484902, 0.12484902
-    ], [
-        0.12612638, 0.12516938, 0.12478404, 0.12478404, 0.12478404, 0.12478404,
-        0.12478404, 0.12478404
-    ], [
-        0.12595113, 0.12515794, 0.1255464, 0.1246689, 0.1246689, 0.1246689,
-        0.1246689, 0.1246689
-    ], [
-        0.12492912, 0.12501013, 0.12501013, 0.12501013, 0.12501013, 0.12501013,
-        0.12501013, 0.12501013
-    ], [0.125, 0.125, 0.125, 0.125, 0.125, 0.125, 0.125, 0.125]], [[
-        0.12586173, 0.12272781, 0.12716517, 0.12484905, 0.12484905, 0.12484905,
-        0.12484905, 0.12484905
-    ], [
-        0.12612617, 0.1251694, 0.12478408, 0.12478408, 0.12478408, 0.12478408,
-        0.12478408, 0.12478408
-    ], [
-        0.12595108, 0.12515777, 0.1255464, 0.12466895, 0.12466895, 0.12466895,
-        0.12466895, 0.12466895
-    ], [
-        0.12492914, 0.12501012, 0.12501012, 0.12501012, 0.12501012, 0.12501012,
-        0.12501012, 0.12501012
-    ], [0.125, 0.125, 0.125, 0.125, 0.125, 0.125, 0.125, 0.125]], [[
-        0.12586181, 0.12272815, 0.12716556, 0.12484891, 0.12484891, 0.12484891,
-        0.12484891, 0.12484891
-    ], [
-        0.12612608, 0.12516941, 0.12478409, 0.12478409, 0.12478409, 0.12478409,
-        0.12478409, 0.12478409
-    ], [
-        0.12595116, 0.12515792, 0.12554643, 0.1246689, 0.1246689, 0.1246689,
-        0.1246689, 0.1246689
-    ], [
-        0.1249292, 0.12501012, 0.12501012, 0.12501012, 0.12501012, 0.12501012,
-        0.12501012, 0.12501012
-    ], [0.125, 0.125, 0.125, 0.125, 0.125, 0.125, 0.125, 0.125]]]
+    expected_final_alignment_history = ResultSummary(
+        shape=(3, 5, 8), dtype=dtype('float32'), mean=0.12500001)
 
     self._testWithAttention(
         create_attention_mechanism,
@@ -340,263 +213,54 @@ class AttentionWrapperTest(test.TestCase):
         expected_final_state,
         alignment_history=True,
         expected_final_alignment_history=expected_final_alignment_history,
-        name="testBahdanauNotNormalized")
+        name='testBahdanauNotNormalized')
 
   def testBahdanauNormalized(self):
     create_attention_mechanism = functools.partial(
         wrapper.BahdanauAttention, normalize=True)
 
     expected_final_output = BasicDecoderOutput(
-        rnn_output=array(
-            [[[
-                1.27064800e-02, 3.57783446e-03, 8.22613202e-03, -1.61504047e-03,
-                -1.12555185e-02, -3.92740499e-03
-            ], [
-                1.30781950e-02, 3.70747922e-03, 8.18992872e-03, -1.65389013e-03,
-                -1.04098395e-02, -3.63383139e-03
-            ], [
-                1.26833543e-02, 3.75790196e-03, 8.48123431e-03, -1.42690970e-03,
-                -1.07016256e-02, -3.76088684e-03
-            ]], [[
-                6.88417302e-03, -2.04071682e-03, 4.17768257e-03,
-                -4.51408979e-03, 4.90086433e-03, -6.85973791e-03
-            ], [
-                7.12782983e-03, -2.10783770e-03, 4.35227761e-03,
-                -4.55496181e-03, 4.41066315e-03, -7.37757795e-03
-            ], [
-                7.20011396e-03, -2.28102156e-03, 4.41620918e-03,
-                -4.57867794e-03, 4.47713351e-03, -7.27072079e-03
-            ]], [[
-                -2.20676698e-03, -1.43745833e-03, -1.99429039e-03,
-                -1.44722988e-03, -7.45461835e-03, -9.80243273e-03
-            ], [
-                -2.22120387e-03, -1.48139545e-03, -1.84528576e-03,
-                -1.39490096e-03, -7.80559657e-03, -1.00586927e-02
-            ], [
-                -1.99079141e-03, -1.20571791e-03, -1.87507609e-03,
-                -1.47541985e-03, -7.21158786e-03, -1.00391749e-02
-            ]], [[
-                1.48755650e-03, -3.89118027e-03, -9.40889120e-04,
-                8.36852356e-04, -1.28285377e-03, 6.41521579e-03
-            ], [
-                1.18351437e-03, -4.48258361e-03, -1.05809816e-03,
-                8.12723883e-04, -2.08540238e-03, 6.28142804e-03
-            ], [
-                9.51444614e-04, -4.29300033e-03, -1.39154412e-03,
-                6.30271854e-04, -2.14963360e-03, 6.53359853e-03
-            ]], [[
-                -3.78854020e-04, 5.62231544e-05, 1.06837302e-04, 1.87137164e-04,
-                -1.56512906e-04, 9.63474595e-05
-            ], [
-                -1.04306288e-04, -1.37411975e-04, 2.82689070e-05,
-                6.56487318e-05, -1.48634164e-04, -1.84347919e-05
-            ], [
-                1.24452345e-04, 2.20821079e-04, 4.07114130e-04, 2.18028668e-04,
-                2.73401442e-04, -2.69805576e-04
-            ]]],
-            dtype=float32),
-        sample_id=array(
-            [[0, 0, 0], [0, 0, 0], [1, 3, 1], [5, 5, 5], [3, 3, 2]],
-            dtype=int32))
-
+        rnn_output=ResultSummary(
+            shape=(5, 3, 6), dtype=dtype('float32'), mean=-0.00040482997),
+        sample_id=ResultSummary(
+            shape=(5, 3), dtype=dtype('int32'), mean=1.8666666666666667))
     expected_final_state = AttentionWrapperState(
         cell_state=LSTMStateTuple(
-            c=array(
-                [[
-                    -2.19953191e-02, -7.81358499e-03, -1.42740645e-03,
-                    1.62037201e-02, -1.38600282e-02, -7.60386931e-03,
-                    -8.42390209e-03, -1.18884994e-02, 1.78821683e-02
-                ], [
-                    1.74096227e-02, -1.41773149e-02, -3.89175024e-03,
-                    3.19635086e-02, -3.54669318e-02, -2.14924756e-02,
-                    -6.20695669e-03, -1.73213519e-03, -1.94583312e-02
-                ], [
-                    -1.14590004e-02, 8.76899902e-03, -1.62825100e-02,
-                    -1.39863417e-02, 1.34333782e-02, -1.04652103e-02,
-                    6.13503950e-03, -9.39247012e-03, -6.57595927e-03
-                ], [
-                    -4.74739373e-02, -1.19136302e-02, -7.36713409e-05,
-                    4.10547927e-02, -1.36768632e-03, 2.11772211e-02,
-                    -2.80480143e-02, -5.44514954e-02, -2.91903671e-02
-                ], [
-                    2.25644894e-02, -1.40382675e-03, 1.92396250e-02,
-                    5.49034867e-03, -1.27930511e-02, -3.15603940e-03,
-                    -5.05525898e-03, 2.19191350e-02, 1.62497871e-02
-                ]],
-                dtype=float32),
-            h=array(
-                [[
-                    -1.10325804e-02, -3.86056723e-03, -7.26287195e-04,
-                    7.95945339e-03, -7.05253659e-03, -3.81913339e-03,
-                    -4.29130904e-03, -6.06246945e-03, 8.91948957e-03
-                ], [
-                    8.67583323e-03, -7.16136536e-03, -1.88911252e-03,
-                    1.62639488e-02, -1.76817775e-02, -1.06735229e-02,
-                    -3.07015004e-03, -8.65494134e-04, -9.99815390e-03
-                ], [
-                    -5.71519835e-03, 4.49585915e-03, -8.06909613e-03,
-                    -6.94347266e-03, 6.73189852e-03, -5.12895826e-03,
-                    3.04909074e-03, -4.60868096e-03, -3.23936995e-03
-                ], [
-                    -2.37224363e-02, -5.88588836e-03, -3.70502457e-05,
-                    2.01787297e-02, -6.76134136e-04, 1.06674768e-02,
-                    -1.42634623e-02, -2.69631669e-02, -1.45033086e-02
-                ], [
-                    1.12585640e-02, -6.92534202e-04, 9.88917705e-03,
-                    2.75237625e-03, -6.56115822e-03, -1.57997780e-03,
-                    -2.54477374e-03, 1.11598391e-02, 7.94144534e-03
-                ]],
-                dtype=float32)),
-        attention=array(
-            [[
-                0.01268335, 0.0037579, 0.00848123, -0.00142691, -0.01070163,
-                -0.00376089
-            ], [
-                0.00720011, -0.00228102, 0.00441621, -0.00457868, 0.00447713,
-                -0.00727072
-            ], [
-                -0.00199079, -0.00120572, -0.00187508, -0.00147542, -0.00721159,
-                -0.01003917
-            ], [
-                0.00095144, -0.004293, -0.00139154, 0.00063027, -0.00214963,
-                0.0065336
-            ], [
-                0.00012445, 0.00022082, 0.00040711, 0.00021803, 0.0002734,
-                -0.00026981
-            ]],
-            dtype=float32),
+            c=ResultSummary(
+                shape=(5, 9), dtype=dtype('float32'), mean=-0.0039785588),
+            h=ResultSummary(
+                shape=(5, 9), dtype=dtype('float32'), mean=-0.0019861322)),
+        attention=ResultSummary(
+            shape=(5, 6), dtype=dtype('float32'), mean=-0.00038488387),
         time=3,
+        alignments=ResultSummary(
+            shape=(5, 8), dtype=dtype('float32'), mean=0.125),
         alignment_history=())
 
     self._testWithAttention(
         create_attention_mechanism,
         expected_final_output,
         expected_final_state,
-        name="testBahdanauNormalized")
+        name='testBahdanauNormalized')
 
   def testLuongNotNormalized(self):
     create_attention_mechanism = wrapper.LuongAttention
 
     expected_final_output = BasicDecoderOutput(
-        rnn_output=array(
-            [[[
-                1.74922391e-03, 1.85935036e-03, 1.90880906e-03, -3.96941090e-03,
-                -4.17229906e-03, -6.65769773e-03
-            ], [
-                1.99638237e-03, 1.91135216e-03, 1.73234346e-03, -4.00905171e-03,
-                -3.15058464e-03, -6.34974428e-03
-            ], [
-                2.08854163e-03, 2.13832827e-03, 2.49780947e-03, -3.52849509e-03,
-                -3.96897132e-03, -6.12034509e-03
-            ]], [[
-                4.76492243e-03, -1.97180966e-03, 3.29327444e-03,
-                -2.68205139e-03, 3.55229783e-03, -4.66645230e-03
-            ], [
-                5.24956919e-03, -2.00631656e-03, 3.53828911e-03,
-                -2.96283513e-03, 3.20920302e-03, -5.43697737e-03
-            ], [
-                5.30424621e-03, -2.17913301e-03, 3.59509978e-03,
-                -2.97106663e-03, 3.26450402e-03, -5.31189423e-03
-            ]], [[
-                -1.36440888e-03, -9.75572329e-04, -2.11284542e-03,
-                -1.84616144e-03, -5.31351101e-03, -9.12462734e-03
-            ], [
-                -1.41863467e-03, -1.11081311e-03, -1.94056751e-03,
-                -1.74311269e-03, -5.76282106e-03, -9.29267984e-03
-            ], [
-                -1.12129003e-03, -8.15156149e-04, -2.01535341e-03,
-                -1.89556007e-03, -5.04226238e-03, -9.37188603e-03
-            ]], [[
-                1.55163277e-03, -4.01433324e-03, -9.77111282e-04,
-                8.59013060e-04, -1.30598655e-03, 6.64281659e-03
-            ], [
-                1.26811734e-03, -4.64518648e-03, -1.10593368e-03,
-                8.41954607e-04, -2.11594440e-03, 6.58190623e-03
-            ], [
-                1.02682540e-03, -4.43787826e-03, -1.43417739e-03,
-                6.56281307e-04, -2.17684195e-03, 6.80128345e-03
-            ]], [[
-                -3.78854020e-04, 5.62231544e-05, 1.06837302e-04, 1.87137164e-04,
-                -1.56512906e-04, 9.63474595e-05
-            ], [
-                -1.04306288e-04, -1.37411975e-04, 2.82689070e-05,
-                6.56487318e-05, -1.48634164e-04, -1.84347919e-05
-            ], [
-                1.24452345e-04, 2.20821079e-04, 4.07114130e-04, 2.18028668e-04,
-                2.73401442e-04, -2.69805576e-04
-            ]]],
-            dtype=float32),
-        sample_id=array(
-            [[2, 0, 2], [0, 0, 0], [1, 1, 1], [5, 5, 5], [3, 3, 2]],
-            dtype=int32))
-
+        rnn_output=ResultSummary(
+            shape=(5, 3, 6), dtype=dtype('float32'), mean=-0.00084602338),
+        sample_id=ResultSummary(shape=(5, 3), dtype=dtype('int32'), mean=2.0))
     expected_final_state = AttentionWrapperState(
         cell_state=LSTMStateTuple(
-            c=array(
-                [[
-                    -2.18942575e-02, -8.05099495e-03, -1.48526859e-03,
-                    1.61030665e-02, -1.37967104e-02, -7.57982396e-03,
-                    -8.28088820e-03, -1.18743815e-02, 1.78839806e-02
-                ], [
-                    1.74203254e-02, -1.41929490e-02, -3.88103351e-03,
-                    3.19709182e-02, -3.54691371e-02, -2.14697979e-02,
-                    -6.21709181e-03, -1.69324467e-03, -1.94495786e-02
-                ], [
-                    -1.14536462e-02, 8.77809525e-03, -1.62965059e-02,
-                    -1.39955431e-02, 1.34810507e-02, -1.04491040e-02,
-                    6.16097450e-03, -9.40943789e-03, -6.57613343e-03
-                ], [
-                    -4.74765450e-02, -1.19113335e-02, -7.42897391e-05,
-                    4.10555862e-02, -1.36665069e-03, 2.11814232e-02,
-                    -2.80444007e-02, -5.44504896e-02, -2.91908123e-02
-                ], [
-                    2.25644894e-02, -1.40382675e-03, 1.92396250e-02,
-                    5.49034867e-03, -1.27930511e-02, -3.15603940e-03,
-                    -5.05525898e-03, 2.19191350e-02, 1.62497871e-02
-                ]],
-                dtype=float32),
-            h=array(
-                [[
-                    -1.09830676e-02, -3.97811923e-03, -7.55793473e-04,
-                    7.91002903e-03, -7.02103321e-03, -3.80714820e-03,
-                    -4.21818346e-03, -6.05497835e-03, 8.92084371e-03
-                ], [
-                    8.68122280e-03, -7.16937613e-03, -1.88389909e-03,
-                    1.62679367e-02, -1.76828820e-02, -1.06622437e-02,
-                    -3.07524228e-03, -8.46030540e-04, -9.99389403e-03
-                ], [
-                    -5.71245840e-03, 4.50045895e-03, -8.07614625e-03,
-                    -6.94804778e-03, 6.75577158e-03, -5.12094703e-03,
-                    3.06193763e-03, -4.61703911e-03, -3.23943049e-03
-                ], [
-                    -2.37237271e-02, -5.88475820e-03, -3.73612711e-05,
-                    2.01791357e-02, -6.75620860e-04, 1.06695695e-02,
-                    -1.42616741e-02, -2.69626491e-02, -1.45035451e-02
-                ], [
-                    1.12585640e-02, -6.92534202e-04, 9.88917705e-03,
-                    2.75237625e-03, -6.56115822e-03, -1.57997780e-03,
-                    -2.54477374e-03, 1.11598391e-02, 7.94144534e-03
-                ]],
-                dtype=float32)),
-        attention=array(
-            [[
-                0.00208854, 0.00213833, 0.00249781, -0.0035285, -0.00396897,
-                -0.00612035
-            ], [
-                0.00530425, -0.00217913, 0.0035951, -0.00297107, 0.0032645,
-                -0.00531189
-            ], [
-                -0.00112129, -0.00081516, -0.00201535, -0.00189556, -0.00504226,
-                -0.00937189
-            ], [
-                0.00102683, -0.00443788, -0.00143418, 0.00065628, -0.00217684,
-                0.00680128
-            ], [
-                0.00012445, 0.00022082, 0.00040711, 0.00021803, 0.0002734,
-                -0.00026981
-            ]],
-            dtype=float32),
+            c=ResultSummary(
+                shape=(5, 9), dtype=dtype('float32'), mean=-0.0039764317),
+            h=ResultSummary(
+                shape=(5, 9), dtype=dtype('float32'), mean=-0.0019850098)),
+        attention=ResultSummary(
+            shape=(5, 6), dtype=dtype('float32'), mean=-0.00080144603),
         time=3,
+        alignments=ResultSummary(
+            shape=(5, 8), dtype=dtype('float32'), mean=0.125),
         alignment_history=())
 
     self._testWithAttention(
@@ -604,132 +268,27 @@ class AttentionWrapperTest(test.TestCase):
         expected_final_output,
         expected_final_state,
         attention_mechanism_depth=9,
-        name="testLuongNotNormalized")
+        name='testLuongNotNormalized')
 
   def testLuongScaled(self):
     create_attention_mechanism = functools.partial(
         wrapper.LuongAttention, scale=True)
 
     expected_final_output = BasicDecoderOutput(
-        rnn_output=array(
-            [[[
-                1.74922391e-03, 1.85935036e-03, 1.90880906e-03, -3.96941090e-03,
-                -4.17229906e-03, -6.65769773e-03
-            ], [
-                1.99638237e-03, 1.91135216e-03, 1.73234346e-03, -4.00905171e-03,
-                -3.15058464e-03, -6.34974428e-03
-            ], [
-                2.08854163e-03, 2.13832827e-03, 2.49780947e-03, -3.52849509e-03,
-                -3.96897132e-03, -6.12034509e-03
-            ]], [[
-                4.76492243e-03, -1.97180966e-03, 3.29327444e-03,
-                -2.68205139e-03, 3.55229783e-03, -4.66645230e-03
-            ], [
-                5.24956919e-03, -2.00631656e-03, 3.53828911e-03,
-                -2.96283513e-03, 3.20920302e-03, -5.43697737e-03
-            ], [
-                5.30424621e-03, -2.17913301e-03, 3.59509978e-03,
-                -2.97106663e-03, 3.26450402e-03, -5.31189423e-03
-            ]], [[
-                -1.36440888e-03, -9.75572329e-04, -2.11284542e-03,
-                -1.84616144e-03, -5.31351101e-03, -9.12462734e-03
-            ], [
-                -1.41863467e-03, -1.11081311e-03, -1.94056751e-03,
-                -1.74311269e-03, -5.76282106e-03, -9.29267984e-03
-            ], [
-                -1.12129003e-03, -8.15156149e-04, -2.01535341e-03,
-                -1.89556007e-03, -5.04226238e-03, -9.37188603e-03
-            ]], [[
-                1.55163277e-03, -4.01433324e-03, -9.77111282e-04,
-                8.59013060e-04, -1.30598655e-03, 6.64281659e-03
-            ], [
-                1.26811734e-03, -4.64518648e-03, -1.10593368e-03,
-                8.41954607e-04, -2.11594440e-03, 6.58190623e-03
-            ], [
-                1.02682540e-03, -4.43787826e-03, -1.43417739e-03,
-                6.56281307e-04, -2.17684195e-03, 6.80128345e-03
-            ]], [[
-                -3.78854020e-04, 5.62231544e-05, 1.06837302e-04, 1.87137164e-04,
-                -1.56512906e-04, 9.63474595e-05
-            ], [
-                -1.04306288e-04, -1.37411975e-04, 2.82689070e-05,
-                6.56487318e-05, -1.48634164e-04, -1.84347919e-05
-            ], [
-                1.24452345e-04, 2.20821079e-04, 4.07114130e-04, 2.18028668e-04,
-                2.73401442e-04, -2.69805576e-04
-            ]]],
-            dtype=float32),
-        sample_id=array(
-            [[2, 0, 2], [0, 0, 0], [1, 1, 1], [5, 5, 5], [3, 3, 2]],
-            dtype=int32))
-
+        rnn_output=ResultSummary(
+            shape=(5, 3, 6), dtype=dtype('float32'), mean=-0.00084602338),
+        sample_id=ResultSummary(shape=(5, 3), dtype=dtype('int32'), mean=2.0))
     expected_final_state = AttentionWrapperState(
         cell_state=LSTMStateTuple(
-            c=array(
-                [[
-                    -2.18942575e-02, -8.05099495e-03, -1.48526859e-03,
-                    1.61030665e-02, -1.37967104e-02, -7.57982396e-03,
-                    -8.28088820e-03, -1.18743815e-02, 1.78839806e-02
-                ], [
-                    1.74203254e-02, -1.41929490e-02, -3.88103351e-03,
-                    3.19709182e-02, -3.54691371e-02, -2.14697979e-02,
-                    -6.21709181e-03, -1.69324467e-03, -1.94495786e-02
-                ], [
-                    -1.14536462e-02, 8.77809525e-03, -1.62965059e-02,
-                    -1.39955431e-02, 1.34810507e-02, -1.04491040e-02,
-                    6.16097450e-03, -9.40943789e-03, -6.57613343e-03
-                ], [
-                    -4.74765450e-02, -1.19113335e-02, -7.42897391e-05,
-                    4.10555862e-02, -1.36665069e-03, 2.11814232e-02,
-                    -2.80444007e-02, -5.44504896e-02, -2.91908123e-02
-                ], [
-                    2.25644894e-02, -1.40382675e-03, 1.92396250e-02,
-                    5.49034867e-03, -1.27930511e-02, -3.15603940e-03,
-                    -5.05525898e-03, 2.19191350e-02, 1.62497871e-02
-                ]],
-                dtype=float32),
-            h=array(
-                [[
-                    -1.09830676e-02, -3.97811923e-03, -7.55793473e-04,
-                    7.91002903e-03, -7.02103321e-03, -3.80714820e-03,
-                    -4.21818346e-03, -6.05497835e-03, 8.92084371e-03
-                ], [
-                    8.68122280e-03, -7.16937613e-03, -1.88389909e-03,
-                    1.62679367e-02, -1.76828820e-02, -1.06622437e-02,
-                    -3.07524228e-03, -8.46030540e-04, -9.99389403e-03
-                ], [
-                    -5.71245840e-03, 4.50045895e-03, -8.07614625e-03,
-                    -6.94804778e-03, 6.75577158e-03, -5.12094703e-03,
-                    3.06193763e-03, -4.61703911e-03, -3.23943049e-03
-                ], [
-                    -2.37237271e-02, -5.88475820e-03, -3.73612711e-05,
-                    2.01791357e-02, -6.75620860e-04, 1.06695695e-02,
-                    -1.42616741e-02, -2.69626491e-02, -1.45035451e-02
-                ], [
-                    1.12585640e-02, -6.92534202e-04, 9.88917705e-03,
-                    2.75237625e-03, -6.56115822e-03, -1.57997780e-03,
-                    -2.54477374e-03, 1.11598391e-02, 7.94144534e-03
-                ]],
-                dtype=float32)),
-        attention=array(
-            [[
-                0.00208854, 0.00213833, 0.00249781, -0.0035285, -0.00396897,
-                -0.00612035
-            ], [
-                0.00530425, -0.00217913, 0.0035951, -0.00297107, 0.0032645,
-                -0.00531189
-            ], [
-                -0.00112129, -0.00081516, -0.00201535, -0.00189556, -0.00504226,
-                -0.00937189
-            ], [
-                0.00102683, -0.00443788, -0.00143418, 0.00065628, -0.00217684,
-                0.00680128
-            ], [
-                0.00012445, 0.00022082, 0.00040711, 0.00021803, 0.0002734,
-                -0.00026981
-            ]],
-            dtype=float32),
+            c=ResultSummary(
+                shape=(5, 9), dtype=dtype('float32'), mean=-0.0039764317),
+            h=ResultSummary(
+                shape=(5, 9), dtype=dtype('float32'), mean=-0.0019850098)),
+        attention=ResultSummary(
+            shape=(5, 6), dtype=dtype('float32'), mean=-0.00080144603),
         time=3,
+        alignments=ResultSummary(
+            shape=(5, 8), dtype=dtype('float32'), mean=0.125),
         alignment_history=())
 
     self._testWithAttention(
@@ -737,116 +296,27 @@ class AttentionWrapperTest(test.TestCase):
         expected_final_output,
         expected_final_state,
         attention_mechanism_depth=9,
-        name="testLuongScaled")
+        name='testLuongScaled')
 
   def testNotUseAttentionLayer(self):
     create_attention_mechanism = wrapper.BahdanauAttention
 
     expected_final_output = BasicDecoderOutput(
-        rnn_output=array(
-            [[[
-                -0.24223405, -0.07791166, 0.15451428, 0.24738294, 0.30900395,
-                -0.24685201, 0.04992372, 0.18749543, -0.15878429, -0.13678923
-            ], [
-                -0.2422339, -0.07791159, 0.15451418, 0.24738279, 0.30900383,
-                -0.24685188, 0.04992369, 0.18749531, -0.15878411, -0.13678911
-            ], [
-                -0.2422343, -0.07791215, 0.15451413, 0.24738336, 0.30900475,
-                -0.2468522, 0.04992349, 0.18749571, -0.158785, -0.13678965
-            ]], [[
-                0.40035266, 0.12299616, -0.06085059, -0.09197108, 0.11368551,
-                -0.15302914, 0.00566157, -0.26885766, 0.08546552, 0.18886778
-            ], [
-                0.40035242, 0.12299603, -0.06085056, -0.09197091, 0.11368536,
-                -0.15302882, 0.0056615, -0.26885763, 0.08546554, 0.18886763
-            ], [
-                0.40035242, 0.122996, -0.06085056, -0.09197087, 0.11368532,
-                -0.1530287, 0.00566146, -0.26885769, 0.08546556, 0.18886761
-            ]], [[
-                -0.4311333, 0.07519469, -0.01551808, 0.1913045, -0.02693807,
-                -0.21668895, -0.02155721, 0.0013397, 0.21180844, 0.25578707
-            ], [
-                -0.43113309, 0.07519454, -0.01551818, 0.19130446, -0.0269379,
-                -0.21668854, -0.021557, 0.00133975, 0.21180828, 0.25578681
-            ], [
-                -0.43113324, 0.07519463, -0.01551815, 0.1913045, -0.02693798,
-                -0.21668874, -0.02155712, 0.00133973, 0.21180835, 0.25578696
-            ]], [[
-                0.07059932, 0.16451572, 0.01174669, 0.04646531, 0.1427598,
-                0.0794456, -0.10852993, 0.15306188, 0.02151393, -0.05590061
-            ], [
-                0.07059933, 0.16451576, 0.01174669, 0.04646532, 0.14275983,
-                0.07944562, -0.10852996, 0.15306193, 0.02151394, -0.05590062
-            ], [
-                0.07059937, 0.16451585, 0.0117467, 0.04646534, 0.1427599,
-                0.07944567, -0.10853001, 0.153062, 0.02151395, -0.05590065
-            ]], [[0., 0., 0., 0., 0., 0., 0., 0., 0.,
-                  0.], [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
-                 [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]]],
-            dtype=float32),
-        sample_id=array(
-            [[4, 4, 4], [0, 0, 0], [9, 9, 9], [1, 1, 1], [0, 0, 0]],
-            dtype=int32))
-
+        rnn_output=ResultSummary(
+            shape=(5, 3, 10), dtype=dtype('float32'), mean=0.019546926),
+        sample_id=ResultSummary(
+            shape=(5, 3), dtype=dtype('int32'), mean=2.7999999999999998))
     expected_final_state = AttentionWrapperState(
         cell_state=LSTMStateTuple(
-            c=array(
-                [[
-                    -0.0181195, -0.01675365, -0.00510353, 0.01559796,
-                    -0.01251448, -0.00437002, -0.01243257, -0.01720199,
-                    0.02274928
-                ], [
-                    0.01259979, -0.00839985, -0.00374037, 0.03136262,
-                    -0.03486227, -0.02466441, -0.00496157, -0.00461032,
-                    -0.02098336
-                ], [
-                    -0.00781067, 0.00315682, -0.0138283, -0.01149793,
-                    0.00485562, -0.01343193, 0.0085915, -0.00632846, -0.01052086
-                ], [
-                    -0.04184828, -0.01223641, 0.0009445, 0.03911434, 0.0043249,
-                    0.02220661, -0.03006243, -0.05418363, -0.02615385
-                ], [
-                    0.02282745, -0.00143833, 0.01918138, 0.00545033,
-                    -0.01258384, -0.00303765, -0.00511231, 0.02166323,
-                    0.01638841
-                ]],
-                dtype=float32),
-            h=array(
-                [[
-                    -0.00910065, -0.00827571, -0.00259689, 0.00764857,
-                    -0.00635579, -0.00218579, -0.00633918, -0.00875511,
-                    0.01134532
-                ], [
-                    0.00626597, -0.004241, -0.00181303, 0.01597157, -0.0173375,
-                    -0.01224921, -0.00244522, -0.00231299, -0.0107822
-                ], [
-                    -0.00391383, 0.00162017, -0.00682621, -0.00570264,
-                    0.00244099, -0.00659772, 0.00426475, -0.00309861,
-                    -0.00520028
-                ], [
-                    -0.02087484, -0.00603306, 0.00047561, 0.01920062,
-                    0.00213875, 0.01115329, -0.0152659, -0.02687523, -0.01297523
-                ], [
-                    0.01138975, -0.00070959, 0.00986007, 0.0027323, -0.00645386,
-                    -0.00152054, -0.00257339, 0.01103063, 0.00800891
-                ]],
-                dtype=float32)),
-        attention=array(
-            [[
-                -0.2422343, -0.07791215, 0.15451413, 0.24738336, 0.30900475,
-                -0.2468522, 0.04992349, 0.18749571, -0.158785, -0.13678965
-            ], [
-                0.40035242, 0.122996, -0.06085056, -0.09197087, 0.11368532,
-                -0.1530287, 0.00566146, -0.26885769, 0.08546556, 0.18886761
-            ], [
-                -0.43113324, 0.07519463, -0.01551815, 0.1913045, -0.02693798,
-                -0.21668874, -0.02155712, 0.00133973, 0.21180835, 0.25578696
-            ], [
-                0.07059937, 0.16451585, 0.0117467, 0.04646534, 0.1427599,
-                0.07944567, -0.10853001, 0.153062, 0.02151395, -0.05590065
-            ], [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]],
-            dtype=float32),
+            c=ResultSummary(
+                shape=(5, 9), dtype=dtype('float32'), mean=-0.0041728448),
+            h=ResultSummary(
+                shape=(5, 9), dtype=dtype('float32'), mean=-0.002085865)),
+        attention=ResultSummary(
+            shape=(5, 10), dtype=dtype('float32'), mean=0.019546915),
         time=3,
+        alignments=ResultSummary(
+            shape=(5, 8), dtype=dtype('float32'), mean=0.125),
         alignment_history=())
 
     self._testWithAttention(
@@ -854,8 +324,8 @@ class AttentionWrapperTest(test.TestCase):
         expected_final_output,
         expected_final_state,
         attention_layer_size=None,
-        name="testNotUseAttentionLayer")
+        name='testNotUseAttentionLayer')
 
 
-if __name__ == "__main__":
+if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/contrib/seq2seq/python/kernel_tests/basic_decoder_test.py b/tensorflow/contrib/seq2seq/python/kernel_tests/basic_decoder_test.py
index 6b57293c6f72fc4eb3b68c091cfcd3d8a13106ca..8fc4ecfc82a14b9b4218a8818485ccbdc5274555 100644
--- a/tensorflow/contrib/seq2seq/python/kernel_tests/basic_decoder_test.py
+++ b/tensorflow/contrib/seq2seq/python/kernel_tests/basic_decoder_test.py
@@ -124,7 +124,7 @@ class BasicDecoderTest(test.TestCase):
     vocabulary_size = 7
     cell_depth = vocabulary_size  # cell's logits must match vocabulary size
     input_depth = 10
-    start_tokens = [0] * batch_size
+    start_tokens = np.random.randint(0, vocabulary_size, size=batch_size)
     end_token = 1
 
     with self.test_session(use_gpu=True) as sess:
diff --git a/tensorflow/contrib/seq2seq/python/ops/attention_wrapper.py b/tensorflow/contrib/seq2seq/python/ops/attention_wrapper.py
index 04b38159bb9d03acb572378806bfd243e326c5f0..fd76882d8463283796631a9caa43c86e8af3b8bc 100644
--- a/tensorflow/contrib/seq2seq/python/ops/attention_wrapper.py
+++ b/tensorflow/contrib/seq2seq/python/ops/attention_wrapper.py
@@ -39,6 +39,7 @@ from tensorflow.python.util import nest
 
 
 __all__ = [
+    "AttentionMechanism",
     "AttentionWrapper",
     "AttentionWrapperState",
     "LuongAttention",
@@ -73,8 +74,9 @@ def _prepare_memory(memory, memory_sequence_length, check_inner_dims_defined):
   """
   memory = nest.map_structure(
       lambda m: ops.convert_to_tensor(m, name="memory"), memory)
-  memory_sequence_length = ops.convert_to_tensor(
-      memory_sequence_length, name="memory_sequence_length")
+  if memory_sequence_length is not None:
+    memory_sequence_length = ops.convert_to_tensor(
+        memory_sequence_length, name="memory_sequence_length")
   if check_inner_dims_defined:
     def _check_dims(m):
       if not m.get_shape()[2:].is_fully_defined():
@@ -119,8 +121,13 @@ class _BaseAttentionMechanism(AttentionMechanism):
     2. Preprocessing and storing the memory.
   """
 
-  def __init__(self, query_layer, memory, memory_sequence_length=None,
-               memory_layer=None, check_inner_dims_defined=True,
+  def __init__(self,
+               query_layer,
+               memory,
+               probability_fn,
+               memory_sequence_length=None,
+               memory_layer=None,
+               check_inner_dims_defined=True,
                name=None):
     """Construct base AttentionMechanism class.
 
@@ -130,6 +137,9 @@ class _BaseAttentionMechanism(AttentionMechanism):
         provided, the shape of `query` must match that of `memory_layer`.
       memory: The memory to query; usually the output of an RNN encoder.  This
         tensor should be shaped `[batch_size, max_time, ...]`.
+      probability_fn: A `callable`.  Converts the score and previous alignments
+        to probabilities. Its signature should be:
+        `probabilities = probability_fn(score, previous_alignments)`.
       memory_sequence_length (optional): Sequence lengths for the batch entries
         in memory.  If provided, the memory tensor rows are masked with zeros
         for values past the respective sequence lengths.
@@ -143,15 +153,19 @@ class _BaseAttentionMechanism(AttentionMechanism):
       name: Name to use when creating ops.
     """
     if (query_layer is not None
-        and not isinstance(query_layer, layers_base._Layer)):  # pylint: disable=protected-access
+        and not isinstance(query_layer, layers_base.Layer)):
       raise TypeError(
           "query_layer is not a Layer: %s" % type(query_layer).__name__)
     if (memory_layer is not None
-        and not isinstance(memory_layer, layers_base._Layer)):  # pylint: disable=protected-access
+        and not isinstance(memory_layer, layers_base.Layer)):
       raise TypeError(
           "memory_layer is not a Layer: %s" % type(memory_layer).__name__)
     self._query_layer = query_layer
     self._memory_layer = memory_layer
+    if not callable(probability_fn):
+      raise TypeError("probability_fn must be callable, saw type: %s" %
+                      type(probability_fn).__name__)
+    self._probability_fn = probability_fn
     with ops.name_scope(
         name, "BaseAttentionMechanismInit", nest.flatten(memory)):
       self._values = _prepare_memory(
@@ -162,6 +176,8 @@ class _BaseAttentionMechanism(AttentionMechanism):
           else self._values)
       self._batch_size = (
           self._keys.shape[0].value or array_ops.shape(self._keys)[0])
+      self._alignments_size = (self._keys.shape[1].value or
+                               array_ops.shape(self._keys)[1])
 
   @property
   def memory_layer(self):
@@ -183,6 +199,29 @@ class _BaseAttentionMechanism(AttentionMechanism):
   def batch_size(self):
     return self._batch_size
 
+  @property
+  def alignments_size(self):
+    return self._alignments_size
+
+  def initial_alignments(self, batch_size, dtype):
+    """Creates the initial alignment values for the `AttentionWrapper` class.
+
+    This is important for AttentionMechanisms that use the previous alignment
+    to calculate the alignment at the next time step (e.g. monotonic attention).
+
+    The default behavior is to return a tensor of all zeros.
+
+    Args:
+      batch_size: `int32` scalar, the batch_size.
+      dtype: The `dtype`.
+
+    Returns:
+      A `dtype` tensor shaped `[batch_size, alignments_size]`
+      (`alignments_size` is the values' `max_time`).
+    """
+    max_time = self._alignments_size
+    return _zero_state_tensors(max_time, batch_size, dtype)
+
 
 class LuongAttention(_BaseAttentionMechanism):
   """Implements Luong-style (multiplicative) attention scoring.
@@ -206,6 +245,7 @@ class LuongAttention(_BaseAttentionMechanism):
                memory,
                memory_sequence_length=None,
                scale=False,
+               probability_fn=None,
                name="LuongAttention"):
     """Construct the AttentionMechanism mechanism.
 
@@ -217,31 +257,43 @@ class LuongAttention(_BaseAttentionMechanism):
         in memory.  If provided, the memory tensor rows are masked with zeros
         for values past the respective sequence lengths.
       scale: Python boolean.  Whether to scale the energy term.
+      probability_fn: (optional) A `callable`.  Converts the score to
+        probabilities.  The default is @{tf.nn.softmax}. Other options include
+        @{tf.contrib.seq2seq.hardmax} and @{tf.contrib.sparsemax.sparsemax}.
+        Its signature should be: `probabilities = probability_fn(score)`.
       name: Name to use when creating ops.
     """
     # For LuongAttention, we only transform the memory layer; thus
     # num_units **must** match expected the query depth.
+    if probability_fn is None:
+      probability_fn = nn_ops.softmax
+    wrapped_probability_fn = lambda score, _: probability_fn(score)
     super(LuongAttention, self).__init__(
         query_layer=None,
         memory_layer=layers_core.Dense(
             num_units, name="memory_layer", use_bias=False),
         memory=memory,
+        probability_fn=wrapped_probability_fn,
         memory_sequence_length=memory_sequence_length,
         name=name)
     self._num_units = num_units
     self._scale = scale
     self._name = name
 
-  def __call__(self, query):
+  def __call__(self, query, previous_alignments):
     """Score the query based on the keys and values.
 
     Args:
       query: Tensor of dtype matching `self.values` and shape
         `[batch_size, query_depth]`.
+      previous_alignments: Tensor of dtype matching `self.values` and shape
+        `[batch_size, alignments_size]`
+        (`alignments_size` is memory's `max_time`).
 
     Returns:
-      score: Tensor of dtype matching `self.values` and shape
-        `[batch_size, max_time]` (`max_time` is memory's `max_time`).
+      alignments: Tensor of dtype matching `self.values` and shape
+        `[batch_size, alignments_size]` (`alignments_size` is memory's
+        `max_time`).
 
     Raises:
       ValueError: If `key` and `query` depths do not match.
@@ -279,7 +331,8 @@ class LuongAttention(_BaseAttentionMechanism):
             "attention_g", dtype=dtype, initializer=1.)
         score = g * score
 
-    return score
+    alignments = self._probability_fn(score, previous_alignments)
+    return alignments
 
 
 class BahdanauAttention(_BaseAttentionMechanism):
@@ -309,6 +362,7 @@ class BahdanauAttention(_BaseAttentionMechanism):
                memory,
                memory_sequence_length=None,
                normalize=False,
+               probability_fn=None,
                name="BahdanauAttention"):
     """Construct the Attention mechanism.
 
@@ -320,30 +374,42 @@ class BahdanauAttention(_BaseAttentionMechanism):
         in memory.  If provided, the memory tensor rows are masked with zeros
         for values past the respective sequence lengths.
       normalize: Python boolean.  Whether to normalize the energy term.
+      probability_fn: (optional) A `callable`.  Converts the score to
+        probabilities.  The default is @{tf.nn.softmax}. Other options include
+        @{tf.contrib.seq2seq.hardmax} and @{tf.contrib.sparsemax.sparsemax}.
+        Its signature should be: `probabilities = probability_fn(score)`.
       name: Name to use when creating ops.
     """
+    if probability_fn is None:
+      probability_fn = nn_ops.softmax
+    wrapped_probability_fn = lambda score, _: probability_fn(score)
     super(BahdanauAttention, self).__init__(
         query_layer=layers_core.Dense(
             num_units, name="query_layer", use_bias=False),
         memory_layer=layers_core.Dense(
             num_units, name="memory_layer", use_bias=False),
         memory=memory,
+        probability_fn=wrapped_probability_fn,
         memory_sequence_length=memory_sequence_length,
         name=name)
     self._num_units = num_units
     self._normalize = normalize
     self._name = name
 
-  def __call__(self, query):
+  def __call__(self, query, previous_alignments):
     """Score the query based on the keys and values.
 
     Args:
       query: Tensor of dtype matching `self.values` and shape
         `[batch_size, query_depth]`.
+      previous_alignments: Tensor of dtype matching `self.values` and shape
+        `[batch_size, alignments_size]`
+        (`alignments_size` is memory's `max_time`).
 
     Returns:
-      score: Tensor of dtype matching `self.values` and shape
-        `[batch_size, max_time]` (`max_time` is memory's `max_time`).
+      alignments: Tensor of dtype matching `self.values` and shape
+        `[batch_size, alignments_size]` (`alignments_size` is memory's
+        `max_time`).
     """
     with variable_scope.variable_scope(None, "bahdanau_attention", [query]):
       processed_query = self.query_layer(query) if self.query_layer else query
@@ -371,20 +437,23 @@ class BahdanauAttention(_BaseAttentionMechanism):
         score = math_ops.reduce_sum(v * math_ops.tanh(keys + processed_query),
                                     [2])
 
-    return score
+    alignments = self._probability_fn(score, previous_alignments)
+    return alignments
 
 
 class AttentionWrapperState(
     collections.namedtuple("AttentionWrapperState",
-                           ("cell_state", "attention", "time",
+                           ("cell_state", "attention", "time", "alignments",
                             "alignment_history"))):
   """`namedtuple` storing the state of a `AttentionWrapper`.
 
   Contains:
 
-    - `cell_state`: The state of the wrapped `RNNCell`.
+    - `cell_state`: The state of the wrapped `RNNCell` at the previous time
+      step.
     - `attention`: The attention emitted at the previous time step.
     - `time`: int32 scalar containing the current time step.
+    - `alignments`: The alignment emitted at the previous time step.
     - `alignment_history`: (if enabled) a `TensorArray` containing alignment
        matrices from all time steps.  Call `stack()` to convert to a `Tensor`.
   """
@@ -441,7 +510,6 @@ class AttentionWrapper(core_rnn_cell.RNNCell):
                attention_layer_size=None,
                alignment_history=False,
                cell_input_fn=None,
-               probability_fn=None,
                output_attention=True,
                initial_cell_state=None,
                name=None):
@@ -459,9 +527,6 @@ class AttentionWrapper(core_rnn_cell.RNNCell):
         time major `TensorArray` on which you must call `stack()`).
       cell_input_fn: (optional) A `callable`.  The default is:
         `lambda inputs, attention: array_ops.concat([inputs, attention], -1)`.
-      probability_fn: (optional) A `callable`.  Converts the score to
-        probabilities.  The default is @{tf.nn.softmax}. Other options include
-        @{tf.contrib.seq2seq.hardmax} and @{tf.contrib.sparsemax.sparsemax}.
       output_attention: Python bool.  If `True` (default), the output at each
         time step is the attention value.  This is the behavior of Luong-style
         attention mechanisms.  If `False`, the output at each time step is
@@ -477,8 +542,8 @@ class AttentionWrapper(core_rnn_cell.RNNCell):
         behavior is not guaranteed.
       name: Name to use when creating ops.
     """
-    super(AttentionWrapper, self).__init__()
-    if not isinstance(cell, core_rnn_cell.RNNCell):
+    super(AttentionWrapper, self).__init__(name=name)
+    if not rnn_cell_impl._like_rnncell(cell):  # pylint: disable=protected-access
       raise TypeError(
           "cell must be an RNNCell, saw type: %s" % type(cell).__name__)
     if not isinstance(attention_mechanism, AttentionMechanism):
@@ -493,13 +558,6 @@ class AttentionWrapper(core_rnn_cell.RNNCell):
         raise TypeError(
             "cell_input_fn must be callable, saw type: %s"
             % type(cell_input_fn).__name__)
-    if probability_fn is None:
-      probability_fn = nn_ops.softmax
-    else:
-      if not callable(cell_input_fn):
-        raise TypeError(
-            "probability_fn must be callable, saw type: %s"
-            % type(probability_fn).__name__)
 
     if attention_layer_size is not None:
       self._attention_layer = layers_core.Dense(
@@ -512,7 +570,6 @@ class AttentionWrapper(core_rnn_cell.RNNCell):
     self._cell = cell
     self._attention_mechanism = attention_mechanism
     self._cell_input_fn = cell_input_fn
-    self._probability_fn = probability_fn
     self._output_attention = output_attention
     self._alignment_history = alignment_history
     with ops.name_scope(name, "AttentionWrapperInit"):
@@ -551,6 +608,7 @@ class AttentionWrapper(core_rnn_cell.RNNCell):
         cell_state=self._cell.state_size,
         time=tensor_shape.TensorShape([]),
         attention=self._attention_size,
+        alignments=self._attention_mechanism.alignments_size,
         alignment_history=())  # alignment_history is sometimes a TensorArray
 
   def zero_state(self, batch_size, dtype):
@@ -584,6 +642,8 @@ class AttentionWrapper(core_rnn_cell.RNNCell):
           time=array_ops.zeros([], dtype=dtypes.int32),
           attention=_zero_state_tensors(self._attention_size, batch_size,
                                         dtype),
+          alignments=self._attention_mechanism.initial_alignments(
+              batch_size, dtype),
           alignment_history=alignment_history)
 
   def call(self, inputs, state):
@@ -613,64 +673,64 @@ class AttentionWrapper(core_rnn_cell.RNNCell):
       - `next_state` is an instance of `DynamicAttentionWrapperState`
          containing the state calculated at this time step.
     """
-    with variable_scope.variable_scope("attention"):
-      # Step 1: Calculate the true inputs to the cell based on the
-      # previous attention value.
-      cell_inputs = self._cell_input_fn(inputs, state.attention)
-      cell_state = state.cell_state
-      cell_output, next_cell_state = self._cell(cell_inputs, cell_state)
-
-      cell_batch_size = (
-          cell_output.shape[0].value or array_ops.shape(cell_output)[0])
-      error_message = (
-          "When applying AttentionWrapper %s: " % self.name +
-          "Non-matching batch sizes between the memory "
-          "(encoder output) and the query (decoder output).  Are you using "
-          "the BeamSearchDecoder?  You may need to tile your memory input via "
-          "the tf.contrib.seq2seq.tile_batch function with argument "
-          "multiple=beam_width.")
-      with ops.control_dependencies(
-          [check_ops.assert_equal(cell_batch_size,
-                                  self._attention_mechanism.batch_size,
-                                  message=error_message)]):
-        cell_output = array_ops.identity(
-            cell_output, name="checked_cell_output")
-
-      score = self._attention_mechanism(cell_output)
-      alignments = self._probability_fn(score)
-
-      # Reshape from [batch_size, memory_time] to [batch_size, 1, memory_time]
-      expanded_alignments = array_ops.expand_dims(alignments, 1)
-      # Context is the inner product of alignments and values along the
-      # memory time dimension.
-      # alignments shape is
-      #   [batch_size, 1, memory_time]
-      # attention_mechanism.values shape is
-      #   [batch_size, memory_time, attention_mechanism.num_units]
-      # the batched matmul is over memory_time, so the output shape is
-      #   [batch_size, 1, attention_mechanism.num_units].
-      # we then squeeze out the singleton dim.
-      attention_mechanism_values = self._attention_mechanism.values
-      context = math_ops.matmul(expanded_alignments, attention_mechanism_values)
-      context = array_ops.squeeze(context, [1])
-
-      if self._attention_layer is not None:
-        attention = self._attention_layer(
-            array_ops.concat([cell_output, context], 1))
-      else:
-        attention = context
-
-      if self._alignment_history:
-        alignment_history = state.alignment_history.write(
-            state.time, alignments)
-      else:
-        alignment_history = ()
+    # Step 1: Calculate the true inputs to the cell based on the
+    # previous attention value.
+    cell_inputs = self._cell_input_fn(inputs, state.attention)
+    cell_state = state.cell_state
+    cell_output, next_cell_state = self._cell(cell_inputs, cell_state)
+
+    cell_batch_size = (
+        cell_output.shape[0].value or array_ops.shape(cell_output)[0])
+    error_message = (
+        "When applying AttentionWrapper %s: " % self.name +
+        "Non-matching batch sizes between the memory "
+        "(encoder output) and the query (decoder output).  Are you using "
+        "the BeamSearchDecoder?  You may need to tile your memory input via "
+        "the tf.contrib.seq2seq.tile_batch function with argument "
+        "multiple=beam_width.")
+    with ops.control_dependencies(
+        [check_ops.assert_equal(cell_batch_size,
+                                self._attention_mechanism.batch_size,
+                                message=error_message)]):
+      cell_output = array_ops.identity(
+          cell_output, name="checked_cell_output")
+
+    alignments = self._attention_mechanism(
+        cell_output, previous_alignments=state.alignments)
+
+    # Reshape from [batch_size, memory_time] to [batch_size, 1, memory_time]
+    expanded_alignments = array_ops.expand_dims(alignments, 1)
+    # Context is the inner product of alignments and values along the
+    # memory time dimension.
+    # alignments shape is
+    #   [batch_size, 1, memory_time]
+    # attention_mechanism.values shape is
+    #   [batch_size, memory_time, attention_mechanism.num_units]
+    # the batched matmul is over memory_time, so the output shape is
+    #   [batch_size, 1, attention_mechanism.num_units].
+    # we then squeeze out the singleton dim.
+    attention_mechanism_values = self._attention_mechanism.values
+    context = math_ops.matmul(expanded_alignments, attention_mechanism_values)
+    context = array_ops.squeeze(context, [1])
+
+    if self._attention_layer is not None:
+      attention = self._attention_layer(
+          array_ops.concat([cell_output, context], 1))
+    else:
+      attention = context
 
-      next_state = AttentionWrapperState(
-          time=state.time + 1,
-          cell_state=next_cell_state,
-          attention=attention,
-          alignment_history=alignment_history)
+    if self._alignment_history:
+      alignment_history = state.alignment_history.write(
+          state.time, alignments)
+    else:
+      alignment_history = ()
+
+    next_state = AttentionWrapperState(
+        time=state.time + 1,
+        cell_state=next_cell_state,
+        attention=attention,
+        alignments=alignments,
+        alignment_history=alignment_history)
 
     if self._output_attention:
       return attention, next_state
diff --git a/tensorflow/contrib/seq2seq/python/ops/basic_decoder.py b/tensorflow/contrib/seq2seq/python/ops/basic_decoder.py
index e73a637027c8f07eb52b6063b0f6374e95a164cc..8ae175b6b59a88b0516326967dd03c419957545e 100644
--- a/tensorflow/contrib/seq2seq/python/ops/basic_decoder.py
+++ b/tensorflow/contrib/seq2seq/python/ops/basic_decoder.py
@@ -21,13 +21,13 @@ from __future__ import print_function
 
 import collections
 
-from tensorflow.contrib.rnn import core_rnn_cell
 from tensorflow.contrib.seq2seq.python.ops import decoder
 from tensorflow.contrib.seq2seq.python.ops import helper as helper_py
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_shape
 from tensorflow.python.layers import base as layers_base
+from tensorflow.python.ops import rnn_cell_impl
 from tensorflow.python.util import nest
 
 
@@ -60,12 +60,12 @@ class BasicDecoder(decoder.Decoder):
     Raises:
       TypeError: if `cell`, `helper` or `output_layer` have an incorrect type.
     """
-    if not isinstance(cell, core_rnn_cell.RNNCell):
+    if not rnn_cell_impl._like_rnncell(cell):  # pylint: disable=protected-access
       raise TypeError("cell must be an RNNCell, received: %s" % type(cell))
     if not isinstance(helper, helper_py.Helper):
       raise TypeError("helper must be a Helper, received: %s" % type(helper))
     if (output_layer is not None
-        and not isinstance(output_layer, layers_base._Layer)):  # pylint: disable=protected-access
+        and not isinstance(output_layer, layers_base.Layer)):
       raise TypeError(
           "output_layer must be a Layer, received: %s" % type(output_layer))
     self._cell = cell
diff --git a/tensorflow/contrib/seq2seq/python/ops/beam_search_decoder.py b/tensorflow/contrib/seq2seq/python/ops/beam_search_decoder.py
index 289da8e6aec5ab59adc9920832181f5afe9b8c9c..c9be517fadc69407d289eb89201bedcd32cd28a4 100644
--- a/tensorflow/contrib/seq2seq/python/ops/beam_search_decoder.py
+++ b/tensorflow/contrib/seq2seq/python/ops/beam_search_decoder.py
@@ -20,7 +20,6 @@ from __future__ import print_function
 
 import collections
 
-from tensorflow.contrib.rnn import core_rnn_cell
 from tensorflow.contrib.seq2seq.python.ops import beam_search_ops
 from tensorflow.contrib.seq2seq.python.ops import decoder
 from tensorflow.python.framework import dtypes
@@ -33,6 +32,7 @@ from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import embedding_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import nn_ops
+from tensorflow.python.ops import rnn_cell_impl
 from tensorflow.python.ops import tensor_array_ops
 from tensorflow.python.util import nest
 
@@ -143,10 +143,10 @@ class BeamSearchDecoder(decoder.Decoder):
       ValueError: If `start_tokens` is not a vector or
         `end_token` is not a scalar.
     """
-    if not isinstance(cell, core_rnn_cell.RNNCell):
+    if not rnn_cell_impl._like_rnncell(cell):  # pylint: disable=protected-access
       raise TypeError("cell must be an RNNCell, received: %s" % type(cell))
     if (output_layer is not None
-        and not isinstance(output_layer, layers_base._Layer)):  # pylint: disable=protected-access
+        and not isinstance(output_layer, layers_base.Layer)):
       raise TypeError(
           "output_layer must be a Layer, received: %s" % type(output_layer))
     self._cell = cell
@@ -452,10 +452,12 @@ def _beam_search_step(time, logits, beam_state, batch_size, beam_width,
     time: Beam search time step, should start at 0. At time 0 we assume
       that all beams are equal and consider only the first beam for
       continuations.
-    logits: Logits at the current time step. A tensor of shape `[B, vocab_size]`
-    beam_state: Current state of the beam search. An instance of `BeamState`
+    logits: Logits at the current time step. A tensor of shape
+      `[batch_size, beam_width, vocab_size]`
+    beam_state: Current state of the beam search.
+      An instance of `BeamSearchDecoderState`.
     batch_size: The batch size for this input.
-    beam_width: The size of the beams.
+    beam_width: Python int.  The size of the beams.
     end_token: The int32 end token.
     length_penalty_weight: Float weight to penalize length. Disabled with 0.0.
 
@@ -470,20 +472,22 @@ def _beam_search_step(time, logits, beam_state, batch_size, beam_width,
 
   # Calculate the total log probs for the new hypotheses
   # Final Shape: [batch_size, beam_width, vocab_size]
-  probs = nn_ops.log_softmax(logits)
-  probs = _mask_probs(probs, end_token, previously_finished)
-  total_probs = array_ops.expand_dims(beam_state.log_probs, 2) + probs
+  step_log_probs = nn_ops.log_softmax(logits)
+  step_log_probs = _mask_probs(step_log_probs, end_token, previously_finished)
+  total_probs = array_ops.expand_dims(beam_state.log_probs, 2) + step_log_probs
 
   # Calculate the continuation lengths by adding to all continuing beams.
-  vocab_size = logits.get_shape().as_list()[-1]
+  vocab_size = logits.shape[-1].value
   lengths_to_add = array_ops.one_hot(
-      array_ops.tile(
+      indices=array_ops.tile(
           array_ops.reshape(end_token, [1, 1]), [batch_size, beam_width]),
-      vocab_size, 0, 1)
+      depth=vocab_size,
+      on_value=0,
+      off_value=1)
   add_mask = (1 - math_ops.to_int32(previously_finished))
   lengths_to_add = array_ops.expand_dims(add_mask, 2) * lengths_to_add
-  new_prediction_lengths = array_ops.expand_dims(prediction_lengths,
-                                                 2) + lengths_to_add
+  new_prediction_lengths = (
+      lengths_to_add + array_ops.expand_dims(prediction_lengths, 2))
 
   # Calculate the scores for each beam
   scores = _get_scores(
@@ -491,14 +495,24 @@ def _beam_search_step(time, logits, beam_state, batch_size, beam_width,
       sequence_lengths=new_prediction_lengths,
       length_penalty_weight=length_penalty_weight)
 
-  scores_flat = array_ops.reshape(scores, [batch_size, -1])
+  time = ops.convert_to_tensor(time, name="time")
   # During the first time step we only consider the initial beam
+  scores_shape = array_ops.shape(scores)
   scores_flat = control_flow_ops.cond(
-      ops.convert_to_tensor(time) > 0, lambda: scores_flat,
+      time > 0,
+      lambda: array_ops.reshape(scores, [batch_size, -1]),
       lambda: scores[:, 0])
+  num_available_beam = control_flow_ops.cond(
+      time > 0,
+      lambda: math_ops.reduce_prod(scores_shape[1:]),
+      lambda: math_ops.reduce_prod(scores_shape[2:]))
 
   # Pick the next beams according to the specified successors function
-  next_beam_scores, word_indices = nn_ops.top_k(scores_flat, k=beam_width)
+  next_beam_size = math_ops.minimum(
+      ops.convert_to_tensor(
+          beam_width, dtype=dtypes.int32, name="beam_width"),
+      num_available_beam)
+  next_beam_scores, word_indices = nn_ops.top_k(scores_flat, k=next_beam_size)
   next_beam_scores.set_shape([static_batch_size, beam_width])
   word_indices.set_shape([static_batch_size, beam_width])
 
@@ -556,7 +570,8 @@ def _get_scores(log_probs, sequence_lengths, length_penalty_weight):
   """Calculates scores for beam search hypotheses.
 
   Args:
-    log_probs: The log probabilities with shape [batch_size, beam_width].
+    log_probs: The log probabilities with shape
+      `[batch_size, beam_width, vocab_size]`.
     sequence_lengths: The array of sequence lengths.
     length_penalty_weight: Float weight to penalize length. Disabled with 0.0.
 
@@ -579,7 +594,11 @@ def _length_penalty(sequence_lengths, penalty_factor):
   Returns:
     The length penalty factor, a tensor fo shape [beam_size].
   """
-  # TODO(ebrevdo): cleanup based on constant-value of penalty_factor.
+  penalty_factor = ops.convert_to_tensor(penalty_factor, name="penalty_factor")
+  penalty_factor.set_shape(())  # penalty should be a scalar.
+  static_penalty = tensor_util.constant_value(penalty_factor)
+  if static_penalty is not None and static_penalty == 0:
+    return 1.0
   return math_ops.div((5. + math_ops.to_float(sequence_lengths))
                       **penalty_factor, (5. + 1.)**penalty_factor)
 
@@ -613,9 +632,9 @@ def _mask_probs(probs, eos_token, finished):
   finished_row = array_ops.one_hot(
       eos_token,
       vocab_size,
-      dtype=dtypes.float32,
+      dtype=probs.dtype,
       on_value=0.,
-      off_value=dtypes.float32.min)
+      off_value=probs.dtype.min)
   finished_examples = (1. - finished_mask) * finished_row
   return finished_examples + non_finished_examples
 
diff --git a/tensorflow/contrib/seq2seq/python/ops/decoder.py b/tensorflow/contrib/seq2seq/python/ops/decoder.py
index ff705715e01b84a3b2cfbc7eb655d9c3bd3206fb..4795dfb8c91bf83dc8642a9cb760043e75143a5d 100644
--- a/tensorflow/contrib/seq2seq/python/ops/decoder.py
+++ b/tensorflow/contrib/seq2seq/python/ops/decoder.py
@@ -107,6 +107,9 @@ class Decoder(object):
     """
     raise NotImplementedError
 
+  def finalize(self, outputs, final_state, sequence_lengths):
+    raise NotImplementedError
+
 
 def _create_zero_outputs(size, dtype, batch_size):
   """Create a zero outputs Tensor structure."""
@@ -164,7 +167,7 @@ def dynamic_decode(decoder,
     raise TypeError("Expected decoder to be type Decoder, but saw: %s" %
                     type(decoder))
 
-  with variable_scope.variable_scope(scope or "decoder") as varscope:
+  with variable_scope.variable_scope(scope, "decoder") as varscope:
     # Properly cache variable values inside the while_loop
     if varscope.caching_device is None:
       varscope.set_caching_device(lambda op: op.device)
@@ -288,9 +291,11 @@ def dynamic_decode(decoder,
 
     final_outputs = nest.map_structure(lambda ta: ta.stack(), final_outputs_ta)
 
-    if hasattr(decoder, "finalize"):
+    try:
       final_outputs, final_state = decoder.finalize(
           final_outputs, final_state, final_sequence_lengths)
+    except NotImplementedError:
+      pass
 
     if not output_time_major:
       final_outputs = nest.map_structure(_transpose_batch_time, final_outputs)
diff --git a/tensorflow/contrib/seq2seq/python/ops/helper.py b/tensorflow/contrib/seq2seq/python/ops/helper.py
index e2d56063a29dd24044d8dd400f1a194446cf64cb..bdd7d7ca73e2cecc777ff610a9ff89c97990ebe4 100644
--- a/tensorflow/contrib/seq2seq/python/ops/helper.py
+++ b/tensorflow/contrib/seq2seq/python/ops/helper.py
@@ -23,8 +23,6 @@ import abc
 
 import six
 
-from tensorflow.contrib.distributions.python.ops import bernoulli
-from tensorflow.contrib.distributions.python.ops import categorical
 from tensorflow.contrib.seq2seq.python.ops import decoder
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
@@ -35,6 +33,8 @@ from tensorflow.python.ops import embedding_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import random_ops
 from tensorflow.python.ops import tensor_array_ops
+from tensorflow.python.ops.distributions import bernoulli
+from tensorflow.python.ops.distributions import categorical
 from tensorflow.python.util import nest
 
 __all__ = [
@@ -363,7 +363,7 @@ class ScheduledOutputTrainingHelper(TrainingHelper):
       self._seed = seed
 
       if (next_input_layer is not None and not isinstance(next_input_layer,
-                                                          layers_base._Layer)):  # pylint: disable=protected-access
+                                                          layers_base.Layer)):
         raise TypeError("next_input_layer must be a Layer, received: %s" %
                         type(next_input_layer))
       self._next_input_layer = next_input_layer
diff --git a/tensorflow/contrib/session_bundle/README.md b/tensorflow/contrib/session_bundle/README.md
index 6df63cba807b5a121481aa8f7ee1e391c9b57b7c..5bcc8fab70f8f492f687fa37b022ee324429f530 100644
--- a/tensorflow/contrib/session_bundle/README.md
+++ b/tensorflow/contrib/session_bundle/README.md
@@ -1,5 +1,8 @@
 # TensorFlow Inference Model Format
 
+WARNING: SessionBundle has been deprecated. Please use
+[SavedModel](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/python/saved_model/README.md) instead.
+
 [TOC]
 
 ## Overview
diff --git a/tensorflow/contrib/session_bundle/example/export_half_plus_two.py b/tensorflow/contrib/session_bundle/example/export_half_plus_two.py
index 08ca47058c8e563befcd6ef1f924fee242265e44..4a56509e596d1308a4e07a31965e44d03d26aa3d 100644
--- a/tensorflow/contrib/session_bundle/example/export_half_plus_two.py
+++ b/tensorflow/contrib/session_bundle/example/export_half_plus_two.py
@@ -97,7 +97,7 @@ def Export(export_dir, use_checkpoint_v2):
     }
 
     # Create two filename assets and corresponding tensors.
-    # TODO(b/26254158) Consider adding validation of file existance as well as
+    # TODO(b/26254158) Consider adding validation of file existence as well as
     # hashes (e.g. sha1) for consistency.
     original_filename1 = tf.constant("hello1.txt")
     tf.add_to_collection(tf.GraphKeys.ASSET_FILEPATHS, original_filename1)
diff --git a/tensorflow/contrib/session_bundle/session_bundle_test.cc b/tensorflow/contrib/session_bundle/session_bundle_test.cc
index fc80b9bec796547496bbcceab34a806058352d5a..ad6264d5c8aa159e579092da0443d83438452b21 100644
--- a/tensorflow/contrib/session_bundle/session_bundle_test.cc
+++ b/tensorflow/contrib/session_bundle/session_bundle_test.cc
@@ -275,7 +275,7 @@ class SessionBundleTest : public ::testing::Test {
   }
   // SetupExport that allows for the variables and meta_graph_def filenames
   // to be overridden.
-  string SetupExport(MetaGraphDefTwiddler twiddler,
+  string SetupExport(const MetaGraphDefTwiddler& twiddler,
                      const string& variables_filename,
                      const string& meta_graph_def_filename) {
     // Construct a unique path name based on the test name.
diff --git a/tensorflow/contrib/signal/BUILD b/tensorflow/contrib/signal/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..5b65a6ae05ed98eb0ac5218c804eca37ea4743e6
--- /dev/null
+++ b/tensorflow/contrib/signal/BUILD
@@ -0,0 +1,46 @@
+package(default_visibility = ["//tensorflow:__subpackages__"])
+
+licenses(["notice"])  # Apache 2.0
+
+exports_files(["LICENSE"])
+
+load("//tensorflow:tensorflow.bzl", "cuda_py_tests")
+
+py_library(
+    name = "signal_py",
+    srcs = ["__init__.py"] + glob(["python/ops/*.py"]),
+    srcs_version = "PY2AND3",
+    deps = [
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:framework",
+        "//tensorflow/python:math_ops",
+    ],
+)
+
+cuda_py_tests(
+    name = "shape_ops_test",
+    size = "small",
+    srcs = ["python/kernel_tests/shape_ops_test.py"],
+    additional_deps = [
+        ":signal_py",
+        "//third_party/py/numpy",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:framework",
+        "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:platform_test",
+    ],
+)
+
+filegroup(
+    name = "all_files",
+    srcs = glob(
+        ["**/*"],
+        exclude = [
+            "**/METADATA",
+            "**/OWNERS",
+        ],
+    ),
+    visibility = ["//tensorflow:__subpackages__"],
+)
diff --git a/tensorflow/contrib/distributions/python/ops/bijectors/identity.py b/tensorflow/contrib/signal/__init__.py
similarity index 75%
rename from tensorflow/contrib/distributions/python/ops/bijectors/identity.py
rename to tensorflow/contrib/signal/__init__.py
index 749dd268f98afafefd15c0a417c6ae49a62d124d..9f906dd28e8dc9130d87f4cd4a126e033fa66293 100644
--- a/tensorflow/contrib/distributions/python/ops/bijectors/identity.py
+++ b/tensorflow/contrib/signal/__init__.py
@@ -12,18 +12,16 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Identity bijector."""
+"""##Signal ops.
+
+@@frames
+"""
 
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-# go/tf-wildcard-import
-# pylint: disable=wildcard-import
-from tensorflow.contrib.distributions.python.ops.bijectors.identity_impl import *
-# pylint: enable=wildcard-import
-from tensorflow.python.util.all_util import remove_undocumented
-
-_allowed_symbols = ["Identity"]
+from tensorflow.contrib.signal.python.ops.shape_ops import frames
 
-remove_undocumented(__name__, _allowed_symbols)
+from tensorflow.python.util.all_util import remove_undocumented
+remove_undocumented(__name__)
diff --git a/tensorflow/contrib/signal/python/__init__.py b/tensorflow/contrib/signal/python/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e672d1146c53a813613c9076c0cb6056f7081441
--- /dev/null
+++ b/tensorflow/contrib/signal/python/__init__.py
@@ -0,0 +1,19 @@
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Signal ops."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
diff --git a/tensorflow/contrib/signal/python/kernel_tests/shape_ops_test.py b/tensorflow/contrib/signal/python/kernel_tests/shape_ops_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..e07942875fdf3d0266824cf546a2a9dda94b1877
--- /dev/null
+++ b/tensorflow/contrib/signal/python/kernel_tests/shape_ops_test.py
@@ -0,0 +1,68 @@
+# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for shape_ops."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.contrib.signal.python.ops import shape_ops
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.ops import array_ops
+from tensorflow.python.platform import test
+
+
+class FramesTest(test.TestCase):
+
+  def test_mapping_of_indices_without_padding(self):
+    with self.test_session():
+      tensor = constant_op.constant(np.arange(9152), dtypes.int32)
+      tensor = array_ops.expand_dims(tensor, 0)
+
+      result = shape_ops.frames(tensor, 512, 180)
+      result = result.eval()
+
+      expected = np.tile(np.arange(512), (49, 1))
+      expected += np.tile(np.arange(49) * 180, (512, 1)).T
+
+      expected = np.expand_dims(expected, axis=0)
+      expected = np.array(expected, dtype=np.int32)
+
+      self.assertAllEqual(expected, result)
+
+  def test_mapping_of_indices_with_padding(self):
+    with self.test_session():
+      tensor = constant_op.constant(np.arange(10000), dtypes.int32)
+      tensor = array_ops.expand_dims(tensor, 0)
+
+      result = shape_ops.frames(tensor, 512, 192)
+      result = result.eval()
+
+      expected = np.tile(np.arange(512), (51, 1))
+      expected += np.tile(np.arange(51) * 192, (512, 1)).T
+
+      expected[expected >= 10000] = 0
+
+      expected = np.expand_dims(expected, axis=0)
+      expected = np.array(expected, dtype=np.int32)
+
+      self.assertAllEqual(expected, result)
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/contrib/signal/python/ops/__init__.py b/tensorflow/contrib/signal/python/ops/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e672d1146c53a813613c9076c0cb6056f7081441
--- /dev/null
+++ b/tensorflow/contrib/signal/python/ops/__init__.py
@@ -0,0 +1,19 @@
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Signal ops."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
diff --git a/tensorflow/contrib/signal/python/ops/shape_ops.py b/tensorflow/contrib/signal/python/ops/shape_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..4914f19be75398d50dc47fad0e8d7ab42e7d44aa
--- /dev/null
+++ b/tensorflow/contrib/signal/python/ops/shape_ops.py
@@ -0,0 +1,87 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""General shape ops for frames."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import math_ops
+
+
+def frames(signal, frame_length, frame_step, name=None):
+  """Frame a signal into overlapping frames.
+
+  May be used in front of spectral functions.
+
+  For example:
+
+  ```python
+  pcm = tf.placeholder(tf.float32, [None, 9152])
+  frames = tf.contrib.signal.frames(pcm, 512, 180)
+  magspec = tf.abs(tf.spectral.rfft(frames, [512]))
+  image = tf.expand_dims(magspec, 3)
+  ```
+
+  Args:
+    signal: A `Tensor` of shape `[batch_size, signal_length]`.
+    frame_length: An `int32` or `int64` `Tensor`. The length of each frame.
+    frame_step: An `int32` or `int64` `Tensor`. The step between frames.
+    name: A name for the operation (optional).
+
+  Returns:
+    A `Tensor` of frames with shape `[batch_size, num_frames, frame_length]`.
+
+  Raises:
+    ValueError: if signal does not have rank 2.
+  """
+  with ops.name_scope(name, "frames", [signal, frame_length, frame_step]):
+    signal = ops.convert_to_tensor(signal, name="signal")
+    frame_length = ops.convert_to_tensor(frame_length, name="frame_length")
+    frame_step = ops.convert_to_tensor(frame_step, name="frame_step")
+
+    signal_rank = signal.shape.ndims
+
+    if signal_rank != 2:
+      raise ValueError("expected signal to have rank 2 but was " + signal_rank)
+
+    signal_length = array_ops.shape(signal)[1]
+
+    num_frames = math_ops.ceil((signal_length - frame_length) / frame_step)
+    num_frames = 1 + math_ops.cast(num_frames, dtypes.int32)
+
+    pad_length = (num_frames - 1) * frame_step + frame_length
+    pad_signal = array_ops.pad(signal, [[0, 0], [0,
+                                                 pad_length - signal_length]])
+
+    indices_frame = array_ops.expand_dims(math_ops.range(frame_length), 0)
+    indices_frames = array_ops.tile(indices_frame, [num_frames, 1])
+
+    indices_step = array_ops.expand_dims(
+        math_ops.range(num_frames) * frame_step, 1)
+    indices_steps = array_ops.tile(indices_step, [1, frame_length])
+
+    indices = indices_frames + indices_steps
+
+    # TODO(androbin): remove `transpose` when `gather` gets `axis` support
+    pad_signal = array_ops.transpose(pad_signal)
+    signal_frames = array_ops.gather(pad_signal, indices)
+    signal_frames = array_ops.transpose(signal_frames, perm=[2, 0, 1])
+
+    return signal_frames
diff --git a/tensorflow/contrib/slim/python/slim/data/README.md b/tensorflow/contrib/slim/python/slim/data/README.md
index 858c69499023311bec37b20b68d5015d25663bef..fe15a10b99dcac384268986d012bafd70b3d360d 100644
--- a/tensorflow/contrib/slim/python/slim/data/README.md
+++ b/tensorflow/contrib/slim/python/slim/data/README.md
@@ -71,27 +71,27 @@ for item in data_decoder.list_items():
   print(item)
 ```
 
-## Example: TFExampleDataDecoder
+## Example: TFExampleDecoder
 
 The
-[tfexample_data_decoder.py](https://www.tensorflow.org/code/tensorflow/contrib/slim/python/slim/data/tfexample_data_decoder.py)
+[tfexample_decoder.py](https://www.tensorflow.org/code/tensorflow/contrib/slim/python/slim/data/tfexample_decoder.py)
 is a data decoder which decodes serialized `TFExample` protocol buffers. A
 `TFExample` protocol buffer is a map from keys (strings) to either a
 `tf.FixedLenFeature` or `tf.VarLenFeature`. Consequently, to decode a
 `TFExample`, one must provide a mapping from one or more `TFExample` fields
-to each of the `items` that the `tfexample_data_decoder` can provide. For
+to each of the `items` that the `tfexample_decoder` can provide. For
 example, a dataset of `TFExamples` might store images in various formats and
 each `TFExample` might contain an `encoding` key and a `format` key which can
 be used to decode the image using the appropriate decoder (jpg, png, etc).
 
-To make this possible, the `tfexample_data_decoder` is constructed by specifying
+To make this possible, the `tfexample_decoder` is constructed by specifying
 the a map of `TFExample` keys to either `tf.FixedLenFeature` or
 `tf.VarLenFeature` as well as a set of `ItemHandlers`. An `ItemHandler`
 provides a mapping from `TFExample` keys to the item being provided. Because a
-`tfexample_data_decoder` might return multiple `items`, one often constructs a
-`tfexample_data_decoder` using multiple `ItemHandlers`.
+`tfexample_decoder` might return multiple `items`, one often constructs a
+`tfexample_decoder` using multiple `ItemHandlers`.
 
-`tfexample_data_decoder` provides some predefined `ItemHandlers` which take care
+`tfexample_decoder` provides some predefined `ItemHandlers` which take care
 of the common cases of mapping `TFExamples` to images, `Tensors` and
 `SparseTensors`. For example, the following specification might be
 used to decode a dataset of images:
diff --git a/tensorflow/contrib/slim/python/slim/data/dataset_data_provider.py b/tensorflow/contrib/slim/python/slim/data/dataset_data_provider.py
index f1b425aab7300f528fe78485a5687b095da74cac..3a78c0471d37a041ec00edc3c6e16fd2f2335a9a 100644
--- a/tensorflow/contrib/slim/python/slim/data/dataset_data_provider.py
+++ b/tensorflow/contrib/slim/python/slim/data/dataset_data_provider.py
@@ -59,7 +59,8 @@ class DatasetDataProvider(data_provider.DataProvider):
                common_queue_capacity=256,
                common_queue_min=128,
                record_key='record_key',
-               seed=None):
+               seed=None,
+               scope=None):
     """Creates a DatasetDataProvider.
 
     Args:
@@ -76,6 +77,7 @@ class DatasetDataProvider(data_provider.DataProvider):
       record_key: The item name to use for the dataset record keys in the
         provided tensors.
       seed: The seed to use if shuffling.
+      scope: Optional name scope for the ops.
     Raises:
       ValueError: If `record_key` matches one of the items in the dataset.
     """
@@ -88,7 +90,8 @@ class DatasetDataProvider(data_provider.DataProvider):
         shuffle=shuffle,
         capacity=common_queue_capacity,
         min_after_dequeue=common_queue_min,
-        seed=seed)
+        seed=seed,
+        scope=scope)
 
     items = dataset.decoder.list_items()
     tensors = dataset.decoder.decode(data, items)
diff --git a/tensorflow/contrib/slim/python/slim/data/tfexample_decoder.py b/tensorflow/contrib/slim/python/slim/data/tfexample_decoder.py
index 1e24f84b22c3012ff9bbf9b0400eb3ce9248b131..f0e028cd778865267340373cc72c1097488e4bcd 100644
--- a/tensorflow/contrib/slim/python/slim/data/tfexample_decoder.py
+++ b/tensorflow/contrib/slim/python/slim/data/tfexample_decoder.py
@@ -30,6 +30,7 @@ from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import sparse_tensor
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import functional_ops
 from tensorflow.python.ops import image_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import parsing_ops
@@ -274,7 +275,8 @@ class Image(ItemHandler):
                format_key=None,
                shape=None,
                channels=3,
-               dtype=dtypes.uint8):
+               dtype=dtypes.uint8,
+               repeated=False):
     """Initializes the image.
 
     Args:
@@ -289,9 +291,10 @@ class Image(ItemHandler):
       channels: the number of channels in the image.
       dtype: images will be decoded at this bit depth. Different formats
         support different bit depths.
-          See tf.image.decode_png,
+          See tf.image.decode_image,
               tf.decode_raw,
-              tf.image.decode_jpeg: only supports tf.uint8
+      repeated: if False, decodes a single image. If True, decodes a
+        variable number of image strings from a 1D tensor of strings.
     """
     if not image_key:
       image_key = 'image/encoded'
@@ -304,61 +307,47 @@ class Image(ItemHandler):
     self._shape = shape
     self._channels = channels
     self._dtype = dtype
+    self._repeated = repeated
 
   def tensors_to_item(self, keys_to_tensors):
     """See base class."""
     image_buffer = keys_to_tensors[self._image_key]
     image_format = keys_to_tensors[self._format_key]
 
-    return self._decode(image_buffer, image_format)
+    if self._repeated:
+      return functional_ops.map_fn(lambda x: self._decode(x, image_format),
+                                   image_buffer, dtype=self._dtype)
+    else:
+      return self._decode(image_buffer, image_format)
 
   def _decode(self, image_buffer, image_format):
     """Decodes the image buffer.
 
     Args:
       image_buffer: The tensor representing the encoded image tensor.
-      image_format: The image format for the image in `image_buffer`.
+      image_format: The image format for the image in `image_buffer`. If image
+        format is `raw`, all images are expected to be in this format, otherwise
+        this op can decode a mix of `jpg` and `png` formats.
 
     Returns:
       A tensor that represents decoded image of self._shape, or
       (?, ?, self._channels) if self._shape is not specified.
     """
-
-    def decode_png():
-      return image_ops.decode_png(
-          image_buffer, self._channels, dtype=self._dtype)
+    def decode_image():
+      """Decodes a png or jpg based on the headers."""
+      return image_ops.decode_image(image_buffer, self._channels)
 
     def decode_raw():
+      """Decodes a raw image."""
       return parsing_ops.decode_raw(image_buffer, out_type=self._dtype)
 
-    def decode_jpg():
-      if self._dtype != dtypes.uint8:
-        raise ValueError(
-            'jpeg decoder can only be used to decode to tf.uint8 but %s was '
-            'requested for a jpeg image.' % self._dtype)
-      return image_ops.decode_jpeg(image_buffer, self._channels)
-
-    # For RGBA images JPEG is not a valid decoder option.
-    if self._channels > 3:
-      pred_fn_pairs = {
-          math_ops.logical_or(
-              math_ops.equal(image_format, 'raw'),
-              math_ops.equal(image_format, 'RAW')): decode_raw,
-      }
-      default_decoder = decode_png
-    else:
-      pred_fn_pairs = {
-          math_ops.logical_or(
-              math_ops.equal(image_format, 'png'),
-              math_ops.equal(image_format, 'PNG')): decode_png,
-          math_ops.logical_or(
-              math_ops.equal(image_format, 'raw'),
-              math_ops.equal(image_format, 'RAW')): decode_raw,
-      }
-      default_decoder = decode_jpg
-
+    pred_fn_pairs = {
+        math_ops.logical_or(
+            math_ops.equal(image_format, 'raw'),
+            math_ops.equal(image_format, 'RAW')): decode_raw,
+    }
     image = control_flow_ops.case(
-        pred_fn_pairs, default=default_decoder, exclusive=True)
+        pred_fn_pairs, default=decode_image, exclusive=True)
 
     image.set_shape([None, None, self._channels])
     if self._shape is not None:
diff --git a/tensorflow/contrib/slim/python/slim/data/tfexample_decoder_test.py b/tensorflow/contrib/slim/python/slim/data/tfexample_decoder_test.py
index dd3c6a39a244e66b9a7860e2b6702969a7de7038..506f4bd8777dd4229f8e76f424b87f899608e386 100644
--- a/tensorflow/contrib/slim/python/slim/data/tfexample_decoder_test.py
+++ b/tensorflow/contrib/slim/python/slim/data/tfexample_decoder_test.py
@@ -228,9 +228,7 @@ class TFExampleDecoderTest(test.TestCase):
     image_shape = (2, 3, 3)
     unused_image, serialized_example = self.GenerateImage(
         image_format='jpeg', image_shape=image_shape)
-    expected_regex = ('jpeg decoder can only be used to decode to tf.uint8 but '
-                      '.* was requested for a jpeg image.')
-    with self.assertRaisesRegexp(ValueError, expected_regex):
+    with self.assertRaises(TypeError):
       unused_decoded_image = self.RunDecodeExample(
           serialized_example,
           tfexample_decoder.Image(dtype=dtypes.uint16),
@@ -730,6 +728,43 @@ class TFExampleDecoderTest(test.TestCase):
 
     self.assertAllClose(np_bboxes, bboxes)
 
+  def testDecodeExampleWithRepeatedImages(self):
+    image_shape = (2, 3, 3)
+    image_format = 'png'
+    image, _ = self.GenerateImage(
+        image_format=image_format, image_shape=image_shape)
+    tf_encoded = self._Encoder(image, image_format)
+    with self.test_session():
+      tf_string = tf_encoded.eval()
+
+    example = example_pb2.Example(features=feature_pb2.Features(feature={
+        'image/encoded': feature_pb2.Feature(bytes_list=feature_pb2.BytesList(
+            value=[tf_string, tf_string])),
+        'image/format': self._StringFeature(image_format),
+    }))
+    serialized_example = example.SerializeToString()
+
+    with self.test_session():
+      serialized_example = array_ops.reshape(serialized_example, shape=[])
+
+      decoder = tfexample_decoder.TFExampleDecoder(
+          keys_to_features={
+              'image/encoded':
+                  parsing_ops.FixedLenFeature(
+                      (2,), dtypes.string),
+              'image/format':
+                  parsing_ops.FixedLenFeature(
+                      (), dtypes.string, default_value=image_format),
+          },
+          items_to_handlers={'image': tfexample_decoder.Image(repeated=True)})
+      [tf_image] = decoder.decode(serialized_example, ['image'])
+
+      output_image = tf_image.eval()
+
+      self.assertEqual(output_image.shape, (2, 2, 3, 3))
+      self.assertAllEqual(np.squeeze(output_image[0, :, :, :]), image)
+      self.assertAllEqual(np.squeeze(output_image[1, :, :, :]), image)
+
 
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/contrib/slim/python/slim/learning.py b/tensorflow/contrib/slim/python/slim/learning.py
index 5ced8a4f08930edec34583e11cef517bb8bb3328..43a1193d13568f846608c8fc118d135bb675f8ca 100644
--- a/tensorflow/contrib/slim/python/slim/learning.py
+++ b/tensorflow/contrib/slim/python/slim/learning.py
@@ -261,7 +261,7 @@ from tensorflow.python.framework import ops
 from tensorflow.python.lib.io import file_io
 from tensorflow.python.ops import clip_ops
 from tensorflow.python.ops import control_flow_ops
-from tensorflow.python.ops import data_flow_ops
+from tensorflow.python.ops import lookup_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import variables as tf_variables
 from tensorflow.python.platform import tf_logging as logging
@@ -657,7 +657,7 @@ def train(train_op,
       if local_init_op == _USE_DEFAULT:
         local_init_op = control_flow_ops.group(
             tf_variables.local_variables_initializer(),
-            data_flow_ops.tables_initializer())
+            lookup_ops.tables_initializer())
 
       if sync_optimizer is not None and isinstance(
           sync_optimizer, sync_replicas_optimizer.SyncReplicasOptimizer):
@@ -696,8 +696,9 @@ def train(train_op,
         else:
           should_stop_op = constant_op.constant(False)
         train_step_kwargs['should_stop'] = should_stop_op
-        train_step_kwargs['should_log'] = math_ops.equal(
-            math_ops.mod(global_step, log_every_n_steps), 0)
+        if log_every_n_steps > 0:
+          train_step_kwargs['should_log'] = math_ops.equal(
+              math_ops.mod(global_step, log_every_n_steps), 0)
         if is_chief and trace_every_n_steps is not None:
           train_step_kwargs['should_trace'] = math_ops.equal(
               math_ops.mod(global_step, trace_every_n_steps), 0)
diff --git a/tensorflow/contrib/slim/python/slim/learning_test.py b/tensorflow/contrib/slim/python/slim/learning_test.py
index cf3a878450d776e9d0c94892d63dbb9f2f803200..83d45f6f5adaccfca0a04629172ee803bab10ba7 100644
--- a/tensorflow/contrib/slim/python/slim/learning_test.py
+++ b/tensorflow/contrib/slim/python/slim/learning_test.py
@@ -840,7 +840,7 @@ class TrainTest(test.TestCase):
         # Initialize the variables.
         sess.run(variables_lib.global_variables_initializer())
 
-        # Get the intial weights and biases values.
+        # Get the initial weights and biases values.
         weights_values, biases_values = sess.run([weights, biases])
         self.assertGreater(np.linalg.norm(weights_values), 0)
         self.assertAlmostEqual(np.linalg.norm(biases_values), 0)
diff --git a/tensorflow/contrib/slim/python/slim/nets/resnet_v2.py b/tensorflow/contrib/slim/python/slim/nets/resnet_v2.py
index bd811e3726a2393a80a311f230f0faadc44609e1..63e8f1ff356dfcf0427d5170a03faa47ee06298c 100644
--- a/tensorflow/contrib/slim/python/slim/nets/resnet_v2.py
+++ b/tensorflow/contrib/slim/python/slim/nets/resnet_v2.py
@@ -25,8 +25,6 @@ introduced by:
 
 The key difference of the full preactivation 'v2' variant compared to the
 'v1' variant in [1] is the use of batch normalization before every weight layer.
-Another difference is that 'v2' ResNets do not include an activation function in
-the main pathway. Also see [2; Fig. 4e].
 
 Typical use:
 
diff --git a/tensorflow/contrib/sparsemax/python/kernel_tests/sparsemax_loss_test.py b/tensorflow/contrib/sparsemax/python/kernel_tests/sparsemax_loss_test.py
index 89dbcd96f8640f470293c271250b0d44d2aabf7c..c8b4e472c99e0bf081a7222a7976b1fbbb680825 100644
--- a/tensorflow/contrib/sparsemax/python/kernel_tests/sparsemax_loss_test.py
+++ b/tensorflow/contrib/sparsemax/python/kernel_tests/sparsemax_loss_test.py
@@ -159,7 +159,7 @@ class SparsemaxLossTest(test.TestCase):
     self.assertShapeEqual(q, tf_sparsemax_op)
 
   def _test_gradient_against_estimate(self, dtype, random, use_gpu):
-    """check sparsemax-loss Rop, aginst estimated-loss Rop"""
+    """check sparsemax-loss Rop, against estimated-loss Rop"""
     z = random.uniform(low=-3, high=3, size=(test_obs, 10)).astype(dtype)
     q = np.zeros((test_obs, 10)).astype(dtype)
     q[np.arange(0, test_obs), np.random.randint(0, 10, size=test_obs)] = 1
@@ -178,7 +178,7 @@ class SparsemaxLossTest(test.TestCase):
     self.assertLess(err, 1e-4)
 
   def _test_gradient_against_numpy(self, dtype, random, use_gpu):
-    """check sparsemax-loss Rop, aginst numpy Rop"""
+    """check sparsemax-loss Rop, against numpy Rop"""
     z = random.uniform(low=-3, high=3, size=(test_obs, 10))
     q = np.zeros((test_obs, 10))
     q[np.arange(0, test_obs), np.random.randint(0, 10, size=test_obs)] = 1
diff --git a/tensorflow/contrib/sparsemax/python/kernel_tests/sparsemax_test.py b/tensorflow/contrib/sparsemax/python/kernel_tests/sparsemax_test.py
index eafac1b9ae778ece44e36722cd85d28ed0b0c8d5..82d36ee9cb21fb822e6df0c3632c49a4fd616825 100644
--- a/tensorflow/contrib/sparsemax/python/kernel_tests/sparsemax_test.py
+++ b/tensorflow/contrib/sparsemax/python/kernel_tests/sparsemax_test.py
@@ -188,7 +188,7 @@ class SparsemaxTest(test.TestCase):
     self.assertShapeEqual(z, tf_sparsemax_op)
 
   def _test_gradient_against_estimate(self, dtype, random, use_gpu):
-    """check sparsemax Rop, aginst estimated Rop"""
+    """check sparsemax Rop, against estimated Rop"""
     z = random.uniform(low=-3, high=3, size=(test_obs, 10)).astype(dtype)
 
     logits = array_ops.placeholder(dtype, name='z')
@@ -204,7 +204,7 @@ class SparsemaxTest(test.TestCase):
     self.assertLess(err, 1e-4)
 
   def _test_gradient_against_numpy(self, dtype, random, use_gpu):
-    """check sparsemax Rop, aginst numpy Rop"""
+    """check sparsemax Rop, against numpy Rop"""
     z = random.uniform(low=-3, high=3, size=(test_obs, 10)).astype(dtype)
 
     logits = constant_op.constant(z, name='z')
diff --git a/tensorflow/contrib/tensor_forest/client/eval_metrics.py b/tensorflow/contrib/tensor_forest/client/eval_metrics.py
index c99f9b7c12923a3d034cb013af2c11a1375012c4..17269863542a38724d6fc9d7f9958aa563370ea9 100644
--- a/tensorflow/contrib/tensor_forest/client/eval_metrics.py
+++ b/tensorflow/contrib/tensor_forest/client/eval_metrics.py
@@ -27,8 +27,8 @@ from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import nn
 
-INFERENCE_PROB_NAME = prediction_key.PredictionKey.CLASSES
-INFERENCE_PRED_NAME = prediction_key.PredictionKey.PROBABILITIES
+INFERENCE_PROB_NAME = prediction_key.PredictionKey.PROBABILITIES
+INFERENCE_PRED_NAME = prediction_key.PredictionKey.CLASSES
 
 FEATURE_IMPORTANCE_NAME = 'global_feature_importance'
 
diff --git a/tensorflow/contrib/tensor_forest/client/random_forest.py b/tensorflow/contrib/tensor_forest/client/random_forest.py
index 01697df086dacd486483bc807dccc5e37d1f6a47..0da1f78755456e01397e6fdaca46d9ecf43f1eed 100644
--- a/tensorflow/contrib/tensor_forest/client/random_forest.py
+++ b/tensorflow/contrib/tensor_forest/client/random_forest.py
@@ -19,8 +19,10 @@ from __future__ import print_function
 
 from tensorflow.contrib import framework as contrib_framework
 
+from tensorflow.contrib.learn.python.learn.estimators import constants
 from tensorflow.contrib.learn.python.learn.estimators import estimator
 from tensorflow.contrib.learn.python.learn.estimators import model_fn as model_fn_lib
+from tensorflow.contrib.learn.python.learn.estimators import prediction_key
 
 from tensorflow.contrib.tensor_forest.client import eval_metrics
 from tensorflow.contrib.tensor_forest.python import tensor_forest
@@ -79,7 +81,7 @@ class TensorForestLossHook(session_run_hook.SessionRunHook):
     current_loss = run_values.results['current_loss']
     current_step = run_values.results['global_step']
     self.steps += 1
-    # Gaurd against the global step going backwards, which might happen
+    # Guard against the global step going backwards, which might happen
     # if we recover from something.
     if self.last_step == -1 or self.last_step > current_step:
       logging.info('TensorForestLossHook resetting last_step.')
@@ -117,6 +119,7 @@ def get_model_fn(params,
                  graph_builder_class,
                  device_assigner,
                  weights_name=None,
+                 keys_name=None,
                  early_stopping_rounds=100,
                  num_trainers=1,
                  trainer_id=0,
@@ -130,6 +133,10 @@ def get_model_fn(params,
     if weights_name and weights_name in features:
       weights = features.pop(weights_name)
 
+    keys = None
+    if keys_name and keys_name in features:
+      keys = features.pop(keys_name)
+
     # If we're doing eval, optionally ignore device_assigner.
     # Also ignore device assigner if we're exporting (mode == INFER)
     dev_assn = device_assigner
@@ -140,19 +147,36 @@ def get_model_fn(params,
     graph_builder = graph_builder_class(params,
                                         device_assigner=dev_assn)
     inference = {}
+    output_alternatives = None
     if (mode == model_fn_lib.ModeKeys.EVAL or
         mode == model_fn_lib.ModeKeys.INFER):
       inference[eval_metrics.INFERENCE_PROB_NAME] = (
           graph_builder.inference_graph(features))
 
-      if not params.regression:
+      if params.regression:
+        predictions = {
+            None: inference[eval_metrics.INFERENCE_PROB_NAME]}
+        output_alternatives = {
+            None: (constants.ProblemType.LINEAR_REGRESSION, predictions)}
+      else:
         inference[eval_metrics.INFERENCE_PRED_NAME] = math_ops.argmax(
             inference[eval_metrics.INFERENCE_PROB_NAME], 1)
 
+        predictions = {
+            prediction_key.PredictionKey.PROBABILITIES:
+                inference[eval_metrics.INFERENCE_PROB_NAME],
+            prediction_key.PredictionKey.CLASSES:
+                inference[eval_metrics.INFERENCE_PRED_NAME]}
+        output_alternatives = {
+            None: (constants.ProblemType.CLASSIFICATION, predictions)}
+
       if report_feature_importances:
         inference[eval_metrics.FEATURE_IMPORTANCE_NAME] = (
             graph_builder.feature_importances())
 
+      if keys is not None:
+        inference[keys_name] = keys
+
     # labels might be None if we're doing prediction (which brings up the
     # question of why we force everything to adhere to a single model_fn).
     loss_deps = []
@@ -197,7 +221,8 @@ def get_model_fn(params,
         loss=training_loss,
         train_op=training_graph,
         training_hooks=training_hooks,
-        scaffold=scaffold)
+        scaffold=scaffold,
+        output_alternatives=output_alternatives)
 
   return _model_fn
 
@@ -236,7 +261,7 @@ class TensorForestEstimator(estimator.Estimator):
 
   def __init__(self, params, device_assigner=None, model_dir=None,
                graph_builder_class=tensor_forest.RandomForestGraphs,
-               config=None, weights_name=None,
+               config=None, weights_name=None, keys_name=None,
                feature_engineering_fn=None,
                early_stopping_rounds=100,
                num_trainers=1, trainer_id=0,
@@ -260,6 +285,9 @@ class TensorForestEstimator(estimator.Estimator):
       weights_name: A string defining feature column name representing
         weights. Will be multiplied by the loss of the example. Used to
         downweight or boost examples during training.
+      keys_name: A string naming one of the features to strip out and
+        pass through into the inference/eval results dict.  Useful for
+        associating specific examples with their prediction.
       feature_engineering_fn: Feature engineering function. Takes features and
         labels which are the output of `input_fn` and returns features and
         labels which will be fed into the model.
@@ -284,6 +312,7 @@ class TensorForestEstimator(estimator.Estimator):
             graph_builder_class,
             device_assigner,
             weights_name=weights_name,
+            keys_name=keys_name,
             early_stopping_rounds=early_stopping_rounds,
             num_trainers=num_trainers,
             trainer_id=trainer_id,
diff --git a/tensorflow/contrib/tensor_forest/kernels/tree_utils.cc b/tensorflow/contrib/tensor_forest/kernels/tree_utils.cc
index 3692b89f79d37114f1192167bc0675110c44d96e..fde0e87c9e3e0a4a87760d8b7034dd4ef4564d98 100644
--- a/tensorflow/contrib/tensor_forest/kernels/tree_utils.cc
+++ b/tensorflow/contrib/tensor_forest/kernels/tree_utils.cc
@@ -43,8 +43,8 @@ DataColumnTypes FindSparseFeatureSpec(
   return static_cast<DataColumnTypes>(spec.sparse(column_num).original_type());
 }
 
-void GetTwoBest(int max, std::function<float(int)> score_fn, float* best_score,
-                int* best_index, float* second_best_score,
+void GetTwoBest(int max, const std::function<float(int)>& score_fn,
+                float* best_score, int* best_index, float* second_best_score,
                 int* second_best_index) {
   *best_index = -1;
   *second_best_index = -1;
diff --git a/tensorflow/contrib/tensor_forest/python/ops/data_ops.py b/tensorflow/contrib/tensor_forest/python/ops/data_ops.py
index 92f7faf7ac4db56bc28fc92752ad19b1a251bc1e..2e54f620d5b306096a146a8cb9b6c0e89f317e3e 100644
--- a/tensorflow/contrib/tensor_forest/python/ops/data_ops.py
+++ b/tensorflow/contrib/tensor_forest/python/ops/data_ops.py
@@ -145,8 +145,7 @@ def ParseDataTensorOrDict(data):
           col_spec.size = shape[1].value
 
         dense_features_size += col_spec.size
-        x = array_ops.reshape(data[k], [-1, 1])
-        dense_features.append(CastToFloat(x))
+        dense_features.append(CastToFloat(data[k]))
 
     processed_dense_features = None
     processed_sparse_features = None
diff --git a/tensorflow/contrib/testing/python/framework/fake_summary_writer.py b/tensorflow/contrib/testing/python/framework/fake_summary_writer.py
index c6fe518bbb278c15f934c8a9b1c2ad222dbc6475..f2065c666255984c8ab770fc10f682b1eabad095 100644
--- a/tensorflow/contrib/testing/python/framework/fake_summary_writer.py
+++ b/tensorflow/contrib/testing/python/framework/fake_summary_writer.py
@@ -127,6 +127,6 @@ class FakeSummaryWriter(object):
 
   def reopen(self):
     pass
-  
+
   def close(self):
     pass
diff --git a/tensorflow/contrib/tfprof/README.md b/tensorflow/contrib/tfprof/README.md
index c7ff4a2921eb7ca834d9df8f695b3f7b6f0a69ba..5bfa0247a51c22ea8387f62d416fbf76ea4d38fb 100644
--- a/tensorflow/contrib/tfprof/README.md
+++ b/tensorflow/contrib/tfprof/README.md
@@ -2,75 +2,25 @@
 
 # Full Docment in tensorflow/tools/tfprof/README.md
 
-Author: Xin Pan (xpan@google.com, github: panyx0718)
+Author: Xin Pan (xpan@google.com, github: panyx0718), Jon Shlens, Yao Zhang
 
 Consultants: Jon Shlens, Pete Warden
 
 ###Major Features
 
 1.  Measure model parameters, float operations, tensor shapes.
-2.  Measure op execution times, requested memory size and device placement.
+2.  Profile op execution times, requested memory size and device placement.
 3.  Inspect checkpoint tensors' shapes and their values.
-4.  Explore model based on name scope or graph structure.
-5.  Selectively grouping/filtering/accounting/ordering ops.
+4.  Selectively group, filter, account and order ops.
 
-tfprof can be used as Python API, Interactive CLI and One-shot Script.
+####tfprof supports 3 views to organize TensorFlow model profiles
 
-## Python API Tutorials
+    *  code view: Stats are associated your Python codes and organized as call stacks.
+    *  scope view: Stats are organized as name scope hierarchies.
+    *  graph view: Stats are organized as Tensorflow Op graph.
 
-tfprof is part of TensorFlow core. Simply ```import tensorflow as tf```.
+####For each view, there are 3 ways to display outputs:
 
-### Examine the shapes and sizes of all trainiable Variables.
-```python
-# Print trainable variable parameter statistics to stdout.
-param_stats = tf.contrib.tfprof.model_analyzer.print_model_analysis(
-    tf.get_default_graph(),
-    tfprof_options=tf.contrib.tfprof.model_analyzer.
-        TRAINABLE_VARS_PARAMS_STAT_OPTIONS)
-
-# param_stats is tensorflow.tfprof.TFProfNode proto. It organize the statistics
-# of each graph node in tree scructure. Let's print the root below.
-sys.stdout.write('total_params: %d\n' % param_stats.total_parameters)
-```
-
-### Examine the number of floating point operations
-``` python
-# Print to stdout an analysis of the number of floating point operations in the
-# model broken down by individual operations.
-#
-# Note: Only Ops with RegisterStatistics('flops') defined have flop stats. It
-# also requires complete shape information. It is common that shape is unknown
-# statically. To complete the shape, provide run-time shape information with
-# tf.RunMetadata to the API (See next example on how to provide RunMetadata).
-tf.contrib.tfprof.model_analyzer.print_model_analysis(
-    tf.get_default_graph(),
-    tfprof_options=tf.contrib.tfprof.model_analyzer.FLOAT_OPS_OPTIONS)
-```
-
-### Examine the timing and memory usage
-You will first need to run the following set up in your model in order to
-compute the memory and timing statistics.
-
-```python
-# Generate the meta information for the model that contains the memory usage
-# and timing information.
-run_metadata = tf.RunMetadata()
-with tf.Session() as sess:
-  _ = sess.run(train_op,
-               options=tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE),
-               run_metadata=run_metadata)
-```
-
-Finally, you may run `print_model_analysis` to explore the timing and memory
-demands of the model.
-
-``` python
-# Print to stdout an analysis of the memory usage and the timing information
-# from running the graph broken down by operations.
-tf.contrib.tfprof.model_analyzer.print_model_analysis(
-    tf.get_default_graph(),
-    run_meta=run_metadata,
-    tfprof_options=tf.contrib.tfprof.model_analyzer.PRINT_ALL_TIMING_MEMORY)
-```
-
-Users can change ```tfprof_options``` to fully leverage tfprof's power.
+    *  stdout: Results are written to stdout.
+    *  timeline: Visualized in chrome browser as time series.
+    *  file: Results are dumped to file.
diff --git a/tensorflow/contrib/tfprof/python/tools/tfprof/BUILD b/tensorflow/contrib/tfprof/python/tools/tfprof/BUILD
index 9c3b10b22c554e76b856d32cd72bbf4681542227..c96f6719e7ed4db2cf24d2600bf2134a6529bcd2 100644
--- a/tensorflow/contrib/tfprof/python/tools/tfprof/BUILD
+++ b/tensorflow/contrib/tfprof/python/tools/tfprof/BUILD
@@ -21,16 +21,34 @@ py_test(
     name = "model_analyzer_test",
     srcs = ["model_analyzer_test.py"],
     srcs_version = "PY2AND3",
+    tags = ["no_pip"],
     deps = [
         ":model_analyzer",
-        "//tensorflow/core:protos_all_py",
-        "//tensorflow/python:array_ops",
+        ":model_analyzer_testlib",
         "//tensorflow/python:client",
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:platform",
+        "//tensorflow/python:variables",
+    ],
+)
+
+py_library(
+    name = "model_analyzer_testlib",
+    srcs = ["model_analyzer_testlib.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":model_analyzer",
+        "//tensorflow/contrib/rnn:rnn_py",
+        "//tensorflow/core:protos_all_py",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:framework_for_generated_wrappers",
         "//tensorflow/python:init_ops",
+        "//tensorflow/python:math_ops",
         "//tensorflow/python:nn_ops",
         "//tensorflow/python:platform",
+        "//tensorflow/python:rnn",
+        "//tensorflow/python:training",
         "//tensorflow/python:variable_scope",
         "//tensorflow/python:variables",
     ],
@@ -97,6 +115,29 @@ py_test(
     ],
 )
 
+py_library(
+    name = "pprof_profiler",
+    srcs = ["pprof_profiler.py"],
+    srcs_version = "PY2AND3",
+    deps = ["@com_google_pprof//:pprof_proto_py"],
+)
+
+py_test(
+    name = "pprof_profiler_test",
+    srcs = ["pprof_profiler_test.py"],
+    main = "pprof_profiler_test.py",
+    srcs_version = "PY2AND3",
+    tags = ["no_pip"],  # TODO(annarev): get it working with pip.
+    deps = [
+        ":pprof_profiler",
+        "//tensorflow/python:client",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:platform_test",
+        "@com_google_pprof//:pprof_proto_py",
+    ],
+)
+
 # -----------------------------------------------------------------------------
 # Google-internal targets.  These must be at the end for syncrepo.
 
diff --git a/tensorflow/contrib/tfprof/python/tools/tfprof/model_analyzer.py b/tensorflow/contrib/tfprof/python/tools/tfprof/model_analyzer.py
index cc94fd65b53a73113b389b5fea75ade90f00d368..17dff69edd633482325171898a016710b58d8731 100644
--- a/tensorflow/contrib/tfprof/python/tools/tfprof/model_analyzer.py
+++ b/tensorflow/contrib/tfprof/python/tools/tfprof/model_analyzer.py
@@ -45,7 +45,7 @@ TRAINABLE_VARS_PARAMS_STAT_OPTIONS = {
     'hide_name_regexes': [],
     'account_displayed_op_only': True,
     'select': ['params'],
-    'viz': False,
+    'output': 'stdout',
     'dump_to_file': ''
 }
 
@@ -65,7 +65,7 @@ FLOAT_OPS_OPTIONS = {
     'hide_name_regexes': [],
     'account_displayed_op_only': True,
     'select': ['float_ops'],
-    'viz': False,
+    'output': 'stdout',
     'dump_to_file': ''
 }
 
@@ -87,7 +87,7 @@ PRINT_PARAMS_ON_DEVICE = {
     'hide_name_regexes': [],
     'account_displayed_op_only': False,
     'select': ['device', 'params'],
-    'viz': False,
+    'output': 'stdout',
     'dump_to_file': ''
 }
 
@@ -107,7 +107,7 @@ PRINT_ALL_TIMING_MEMORY = {
     'hide_name_regexes': [],
     'account_displayed_op_only': True,
     'select': ['micros', 'bytes'],
-    'viz': False,
+    'output': 'stdout',
     'dump_to_file': ''
 }
 
@@ -123,7 +123,7 @@ def print_model_analysis(graph,
   """Print model statistics.
 
     Prints the model statistics to stdout. Also returns the results
-    in a TFProfNode proto. See go/tfprof or run tfprof tool:
+    in a TFGraphNodeProto proto. See go/tfprof or run tfprof tool:
     'bazel run third_party/tensorflow/tools/tfprof help'
 
     Examples:
@@ -142,15 +142,19 @@ def print_model_analysis(graph,
               'micros' and 'bytes'.
     op_log: tensorflow::tfprof::OpLog proto. users can use this proto to
             group together ops and use a op_type to select the group.
-    tfprof_cmd: string. Either 'scope' or 'graph'. 'scope' view organize
-                ops using their name scopes. 'graph' view organize ops using
-                their graph inputs.
+    tfprof_cmd: string. Either 'scope', 'graph', 'code'.
+                'scope' view organize outputs using ops' name scope.
+                'graph' view organize outputs using op's inputs/outputs.
+                'code' view organize outputs using Python call stack.
     tfprof_options: See 'tfprof help' for details.
   Returns:
-    TFProfNode proto. Side effect: a formatted output to stdout.
+    If tfprof_cmd is 'scope' or 'graph', returns TFGraphNodeProto proto.
+    If tfprof_cmd is 'code', returns TFCodeNodeProto proto.
+    Side effect: a formatted output to stdout.
   """
   # pylint: disable=protected-access
-  op_log = tfprof_logger._merge_default_with_oplog(graph, op_log, run_meta)
+  op_log = tfprof_logger._merge_default_with_oplog(
+      graph, op_log, run_meta, add_trace=tfprof_cmd == 'code')
   # pylint: enable=protected-access
   opts = tfprof_options_pb2.OptionsProto()
   opts.max_depth = tfprof_options['max_depth']
@@ -174,15 +178,28 @@ def print_model_analysis(graph,
   opts.account_displayed_op_only = tfprof_options['account_displayed_op_only']
   for p in tfprof_options['select']:
     opts.select.append(p)
-  opts.viz = tfprof_options['viz']
+  opts.output = tfprof_options['output']
   opts.dump_to_file = tfprof_options['dump_to_file']
 
   run_meta_str = run_meta.SerializeToString() if run_meta else b''
-  op_log_str = op_log.SerializeToString() if op_log else b''
 
-  tfprof_node = tfprof_output_pb2.TFProfNode()
-  tfprof_node.ParseFromString(
-      print_mdl.PrintModelAnalysis(
-          graph.as_graph_def().SerializeToString(), run_meta_str, op_log_str,
-          tfprof_cmd.encode('utf-8'), opts.SerializeToString()))
+  if tfprof_cmd == 'code':
+    tfprof_node = tfprof_output_pb2.TFCodeNodeProto()
+    tfprof_node.ParseFromString(
+        print_mdl.PrintModelAnalysis(
+            graph.as_graph_def().SerializeToString(),
+            run_meta_str,
+            op_log.SerializeToString(),
+            tfprof_cmd.encode('utf-8'),
+            opts.SerializeToString()))
+  else:
+    tfprof_node = tfprof_output_pb2.TFGraphNodeProto()
+    tfprof_node.ParseFromString(
+        print_mdl.PrintModelAnalysis(
+            graph.as_graph_def().SerializeToString(),
+            run_meta_str,
+            op_log.SerializeToString(),
+            tfprof_cmd.encode('utf-8'),
+            opts.SerializeToString()))
+
   return tfprof_node
diff --git a/tensorflow/contrib/tfprof/python/tools/tfprof/model_analyzer_test.py b/tensorflow/contrib/tfprof/python/tools/tfprof/model_analyzer_test.py
index 66b9267cbec03568e581d1f846bc6a3f8e4ae2fb..afd8563e78d434710df85176c73c2bb938963669 100644
--- a/tensorflow/contrib/tfprof/python/tools/tfprof/model_analyzer_test.py
+++ b/tensorflow/contrib/tfprof/python/tools/tfprof/model_analyzer_test.py
@@ -18,52 +18,31 @@ from __future__ import division
 from __future__ import print_function
 
 import os
-
 from tensorflow.core.protobuf import config_pb2
 from tensorflow.python.client import session
-from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
-from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import init_ops
-from tensorflow.python.ops import nn_ops
-from tensorflow.python.ops import variable_scope
 from tensorflow.python.ops import variables
 from tensorflow.python.platform import gfile
 from tensorflow.python.platform import test
 
 # XXX: this depends on pywrap_tensorflow and must come later
 from tensorflow.contrib.tfprof.python.tools.tfprof import model_analyzer
+from tensorflow.contrib.tfprof.python.tools.tfprof import model_analyzer_testlib as lib
 
 
 class PrintModelAnalysisTest(test.TestCase):
 
-  def _BuildSmallModel(self):
-    image = array_ops.zeros([2, 6, 6, 3])
-    _ = variable_scope.get_variable(
-        'ScalarW', [],
-        dtypes.float32,
-        initializer=init_ops.random_normal_initializer(stddev=0.001))
-    kernel = variable_scope.get_variable(
-        'DW', [3, 3, 3, 6],
-        dtypes.float32,
-        initializer=init_ops.random_normal_initializer(stddev=0.001))
-    x = nn_ops.conv2d(image, kernel, [1, 2, 2, 1], padding='SAME')
-    kernel = variable_scope.get_variable(
-        'DW2', [2, 2, 6, 12],
-        dtypes.float32,
-        initializer=init_ops.random_normal_initializer(stddev=0.001))
-    x = nn_ops.conv2d(x, kernel, [1, 2, 2, 1], padding='SAME')
-    return x
-
   def testDumpToFile(self):
+    ops.reset_default_graph()
     opts = model_analyzer.TRAINABLE_VARS_PARAMS_STAT_OPTIONS
-    opts['dump_to_file'] = os.path.join(test.get_temp_dir(), 'dump')
+    outfile = os.path.join(test.get_temp_dir(), 'dump')
+    opts['output'] = 'file:outfile=' + outfile
 
     with session.Session() as sess, ops.device('/cpu:0'):
-      _ = self._BuildSmallModel()
+      _ = lib.BuildSmallModel()
       model_analyzer.print_model_analysis(sess.graph, tfprof_options=opts)
 
-      with gfile.Open(opts['dump_to_file'], 'r') as f:
+      with gfile.Open(outfile, 'r') as f:
         self.assertEqual(u'_TFProfRoot (--/451 params)\n'
                          '  DW (3x3x3x6, 162/162 params)\n'
                          '  DW2 (2x2x6x12, 288/288 params)\n'
@@ -71,15 +50,17 @@ class PrintModelAnalysisTest(test.TestCase):
                          f.read())
 
   def testSelectEverything(self):
+    ops.reset_default_graph()
     opts = model_analyzer.TRAINABLE_VARS_PARAMS_STAT_OPTIONS
-    opts['dump_to_file'] = os.path.join(test.get_temp_dir(), 'dump')
+    outfile = os.path.join(test.get_temp_dir(), 'dump')
+    opts['output'] = 'file:outfile=' + outfile
     opts['account_type_regexes'] = ['.*']
     opts['select'] = [
         'bytes', 'params', 'float_ops', 'num_hidden_ops', 'device', 'op_types'
     ]
 
     with session.Session() as sess, ops.device('/cpu:0'):
-      x = self._BuildSmallModel()
+      x = lib.BuildSmallModel()
 
       sess.run(variables.global_variables_initializer())
       run_meta = config_pb2.RunMetadata()
@@ -91,13 +72,121 @@ class PrintModelAnalysisTest(test.TestCase):
       model_analyzer.print_model_analysis(
           sess.graph, run_meta, tfprof_options=opts)
 
-      with gfile.Open(opts['dump_to_file'], 'r') as f:
+      with gfile.Open(outfile, 'r') as f:
+        # pylint: disable=line-too-long
+        self.assertEqual(
+            '_TFProfRoot (0/451 params, 0/10.44k flops, 0B/5.28KB, _kTFScopeParent)\n  Conv2D (0/0 params, 5.83k/5.83k flops, 432B/432B, /job:localhost/replica:0/task:0/cpu:0, /job:localhost/replica:0/task:0/cpu:0|Conv2D)\n  Conv2D_1 (0/0 params, 4.61k/4.61k flops, 384B/384B, /job:localhost/replica:0/task:0/cpu:0, /job:localhost/replica:0/task:0/cpu:0|Conv2D)\n  DW (3x3x3x6, 162/162 params, 0/0 flops, 648B/1.30KB, /job:localhost/replica:0/task:0/cpu:0, /job:localhost/replica:0/task:0/cpu:0|VariableV2|_trainable_variables)\n    DW/Assign (0/0 params, 0/0 flops, 0B/0B, Assign)\n    DW/Initializer (0/0 params, 0/0 flops, 0B/0B, _kTFScopeParent)\n      DW/Initializer/random_normal (0/0 params, 0/0 flops, 0B/0B, Add)\n        DW/Initializer/random_normal/RandomStandardNormal (0/0 params, 0/0 flops, 0B/0B, RandomStandardNormal)\n        DW/Initializer/random_normal/mean (0/0 params, 0/0 flops, 0B/0B, Const)\n        DW/Initializer/random_normal/mul (0/0 params, 0/0 flops, 0B/0B, Mul)\n        DW/Initializer/random_normal/shape (0/0 params, 0/0 flops, 0B/0B, Const)\n        DW/Initializer/random_normal/stddev (0/0 params, 0/0 flops, 0B/0B, Const)\n    DW/read (0/0 params, 0/0 flops, 648B/648B, /job:localhost/replica:0/task:0/cpu:0, /job:localhost/replica:0/task:0/cpu:0|Identity)\n  DW2 (2x2x6x12, 288/288 params, 0/0 flops, 1.15KB/2.30KB, /job:localhost/replica:0/task:0/cpu:0, /job:localhost/replica:0/task:0/cpu:0|VariableV2|_trainable_variables)\n    DW2/Assign (0/0 params, 0/0 flops, 0B/0B, Assign)\n    DW2/Initializer (0/0 params, 0/0 flops, 0B/0B, _kTFScopeParent)\n      DW2/Initializer/random_normal (0/0 params, 0/0 flops, 0B/0B, Add)\n        DW2/Initializer/random_normal/RandomStandardNormal (0/0 params, 0/0 flops, 0B/0B, RandomStandardNormal)\n        DW2/Initializer/random_normal/mean (0/0 params, 0/0 flops, 0B/0B, Const)\n        DW2/Initializer/random_normal/mul (0/0 params, 0/0 flops, 0B/0B, Mul)\n        DW2/Initializer/random_normal/shape (0/0 params, 0/0 flops, 0B/0B, Const)\n        DW2/Initializer/random_normal/stddev (0/0 params, 0/0 flops, 0B/0B, Const)\n    DW2/read (0/0 params, 0/0 flops, 1.15KB/1.15KB, /job:localhost/replica:0/task:0/cpu:0, /job:localhost/replica:0/task:0/cpu:0|Identity)\n  ScalarW (1, 1/1 params, 0/0 flops, 0B/0B, VariableV2|_trainable_variables)\n    ScalarW/Assign (0/0 params, 0/0 flops, 0B/0B, Assign)\n    ScalarW/Initializer (0/0 params, 0/0 flops, 0B/0B, _kTFScopeParent)\n      ScalarW/Initializer/random_normal (0/0 params, 0/0 flops, 0B/0B, Add)\n        ScalarW/Initializer/random_normal/RandomStandardNormal (0/0 params, 0/0 flops, 0B/0B, RandomStandardNormal)\n        ScalarW/Initializer/random_normal/mean (0/0 params, 0/0 flops, 0B/0B, Const)\n        ScalarW/Initializer/random_normal/mul (0/0 params, 0/0 flops, 0B/0B, Mul)\n        ScalarW/Initializer/random_normal/shape (0/0 params, 0/0 flops, 0B/0B, Const)\n        ScalarW/Initializer/random_normal/stddev (0/0 params, 0/0 flops, 0B/0B, Const)\n    ScalarW/read (0/0 params, 0/0 flops, 0B/0B, Identity)\n  init (0/0 params, 0/0 flops, 0B/0B, NoOp)\n  zeros (0/0 params, 0/0 flops, 864B/864B, /job:localhost/replica:0/task:0/cpu:0, /job:localhost/replica:0/task:0/cpu:0|Const)\n',
+            f.read())
+        # pylint: enable=line-too-long
+
+  def testSimpleCodeView(self):
+    ops.reset_default_graph()
+    opts = model_analyzer.TRAINABLE_VARS_PARAMS_STAT_OPTIONS.copy()
+    outfile = os.path.join(test.get_temp_dir(), 'dump')
+    opts['output'] = 'file:outfile=' + outfile
+    opts['account_type_regexes'] = ['.*']
+    opts['show_name_regexes'] = ['.*model_analyzer_testlib.*']
+    opts['account_displayed_op_only'] = False
+    # TODO(xpan): Test 'micros'. Since the execution time changes each run,
+    # it's a bit difficult to test it now.
+    opts['select'] = [
+        'bytes', 'params', 'float_ops', 'num_hidden_ops', 'device',
+    ]
+
+    with session.Session() as sess, ops.device('/cpu:0'):
+      x = lib.BuildSmallModel()
+
+      sess.run(variables.global_variables_initializer())
+      run_meta = config_pb2.RunMetadata()
+      _ = sess.run(x,
+                   options=config_pb2.RunOptions(
+                       trace_level=config_pb2.RunOptions.FULL_TRACE),
+                   run_metadata=run_meta)
+
+      model_analyzer.print_model_analysis(
+          sess.graph, run_meta, tfprof_cmd='code', tfprof_options=opts)
+
+      with gfile.Open(outfile, 'r') as f:
         # pylint: disable=line-too-long
         self.assertEqual(
-            '_TFProfRoot (0/451 params, 0/10.44k flops, 0B/5.28KB, _kTFScopeParent)\n  Conv2D (0/0 params, 5.83k/5.83k flops, 432B/432B, /job:localhost/replica:0/task:0/cpu:0, /job:localhost/replica:0/task:0/cpu:0|Conv2D)\n  Conv2D_1 (0/0 params, 4.61k/4.61k flops, 384B/384B, /job:localhost/replica:0/task:0/cpu:0, /job:localhost/replica:0/task:0/cpu:0|Conv2D)\n  DW (3x3x3x6, 162/162 params, 0/0 flops, 648B/1.30KB, /job:localhost/replica:0/task:0/cpu:0, /job:localhost/replica:0/task:0/cpu:0|VariableV2|_trainable_variables)\n    DW/Assign (0/0 params, 0/0 flops, 0B/0B, /device:CPU:0, /device:CPU:0|Assign)\n    DW/Initializer (0/0 params, 0/0 flops, 0B/0B, _kTFScopeParent)\n      DW/Initializer/random_normal (0/0 params, 0/0 flops, 0B/0B, Add)\n        DW/Initializer/random_normal/RandomStandardNormal (0/0 params, 0/0 flops, 0B/0B, RandomStandardNormal)\n        DW/Initializer/random_normal/mean (0/0 params, 0/0 flops, 0B/0B, Const)\n        DW/Initializer/random_normal/mul (0/0 params, 0/0 flops, 0B/0B, Mul)\n        DW/Initializer/random_normal/shape (0/0 params, 0/0 flops, 0B/0B, Const)\n        DW/Initializer/random_normal/stddev (0/0 params, 0/0 flops, 0B/0B, Const)\n    DW/read (0/0 params, 0/0 flops, 648B/648B, /job:localhost/replica:0/task:0/cpu:0, /job:localhost/replica:0/task:0/cpu:0|Identity)\n  DW2 (2x2x6x12, 288/288 params, 0/0 flops, 1.15KB/2.30KB, /job:localhost/replica:0/task:0/cpu:0, /job:localhost/replica:0/task:0/cpu:0|VariableV2|_trainable_variables)\n    DW2/Assign (0/0 params, 0/0 flops, 0B/0B, /device:CPU:0, /device:CPU:0|Assign)\n    DW2/Initializer (0/0 params, 0/0 flops, 0B/0B, _kTFScopeParent)\n      DW2/Initializer/random_normal (0/0 params, 0/0 flops, 0B/0B, Add)\n        DW2/Initializer/random_normal/RandomStandardNormal (0/0 params, 0/0 flops, 0B/0B, RandomStandardNormal)\n        DW2/Initializer/random_normal/mean (0/0 params, 0/0 flops, 0B/0B, Const)\n        DW2/Initializer/random_normal/mul (0/0 params, 0/0 flops, 0B/0B, Mul)\n        DW2/Initializer/random_normal/shape (0/0 params, 0/0 flops, 0B/0B, Const)\n        DW2/Initializer/random_normal/stddev (0/0 params, 0/0 flops, 0B/0B, Const)\n    DW2/read (0/0 params, 0/0 flops, 1.15KB/1.15KB, /job:localhost/replica:0/task:0/cpu:0, /job:localhost/replica:0/task:0/cpu:0|Identity)\n  ScalarW (1, 1/1 params, 0/0 flops, 0B/0B, /device:CPU:0, /device:CPU:0|VariableV2|_trainable_variables)\n    ScalarW/Assign (0/0 params, 0/0 flops, 0B/0B, /device:CPU:0, /device:CPU:0|Assign)\n    ScalarW/Initializer (0/0 params, 0/0 flops, 0B/0B, _kTFScopeParent)\n      ScalarW/Initializer/random_normal (0/0 params, 0/0 flops, 0B/0B, Add)\n        ScalarW/Initializer/random_normal/RandomStandardNormal (0/0 params, 0/0 flops, 0B/0B, RandomStandardNormal)\n        ScalarW/Initializer/random_normal/mean (0/0 params, 0/0 flops, 0B/0B, Const)\n        ScalarW/Initializer/random_normal/mul (0/0 params, 0/0 flops, 0B/0B, Mul)\n        ScalarW/Initializer/random_normal/shape (0/0 params, 0/0 flops, 0B/0B, Const)\n        ScalarW/Initializer/random_normal/stddev (0/0 params, 0/0 flops, 0B/0B, Const)\n    ScalarW/read (0/0 params, 0/0 flops, 0B/0B, /device:CPU:0, /device:CPU:0|Identity)\n  init (0/0 params, 0/0 flops, 0B/0B, /device:CPU:0, /device:CPU:0|NoOp)\n  zeros (0/0 params, 0/0 flops, 864B/864B, /job:localhost/replica:0/task:0/cpu:0, /job:localhost/replica:0/task:0/cpu:0|Const)\n',
+            '_TFProfRoot (0/451 params, 0/10.44k flops, 0B/5.28KB)\n  model_analyzer_testlib.py:33:BuildSmallModel:image = array_ops... (0/0 params, 0/0 flops, 0B/864B)\n  model_analyzer_testlib.py:37:BuildSmallModel:initializer=init_... (0/1 params, 0/0 flops, 0B/0B)\n  model_analyzer_testlib.py:41:BuildSmallModel:initializer=init_... (0/162 params, 0/0 flops, 0B/1.30KB)\n  model_analyzer_testlib.py:42:BuildSmallModel:x = nn_ops.conv2d... (0/0 params, 0/5.83k flops, 0B/432B)\n  model_analyzer_testlib.py:46:BuildSmallModel:initializer=init_... (0/288 params, 0/0 flops, 0B/2.30KB)\n  model_analyzer_testlib.py:47:BuildSmallModel:x = nn_ops.conv2d... (0/0 params, 0/4.61k flops, 0B/384B)\n',
             f.read())
         # pylint: enable=line-too-long
 
+  def testComplexCodeView(self):
+    ops.reset_default_graph()
+    opts = model_analyzer.TRAINABLE_VARS_PARAMS_STAT_OPTIONS.copy()
+    outfile = os.path.join(test.get_temp_dir(), 'dump')
+    opts['output'] = 'file:outfile=' + outfile
+    opts['account_type_regexes'] = ['.*']
+    opts['show_name_regexes'] = ['.*model_analyzer_testlib.py.*']
+    opts['account_displayed_op_only'] = False
+    opts['select'] = ['params', 'float_ops']
+
+    with session.Session() as sess, ops.device('/cpu:0'):
+      x = lib.BuildFullModel()
+
+      sess.run(variables.global_variables_initializer())
+      run_meta = config_pb2.RunMetadata()
+      _ = sess.run(x,
+                   options=config_pb2.RunOptions(
+                       trace_level=config_pb2.RunOptions.FULL_TRACE),
+                   run_metadata=run_meta)
+
+      tfprof_node = model_analyzer.print_model_analysis(
+          sess.graph, run_meta, tfprof_cmd='code', tfprof_options=opts)
+
+      # pylint: disable=line-too-long
+      with gfile.Open(outfile, 'r') as f:
+        self.assertEqual(
+            '_TFProfRoot (0/2.84k params, 0/54.08k flops)\n  model_analyzer_testlib.py:56:BuildFullModel:seq.append(array_... (0/1.80k params, 0/41.76k flops)\n    model_analyzer_testlib.py:33:BuildSmallModel:image = array_ops... (0/0 params, 0/0 flops)\n    model_analyzer_testlib.py:37:BuildSmallModel:initializer=init_... (0/4 params, 0/0 flops)\n    model_analyzer_testlib.py:41:BuildSmallModel:initializer=init_... (0/648 params, 0/0 flops)\n    model_analyzer_testlib.py:42:BuildSmallModel:x = nn_ops.conv2d... (0/0 params, 0/23.33k flops)\n    model_analyzer_testlib.py:46:BuildSmallModel:initializer=init_... (0/1.15k params, 0/0 flops)\n    model_analyzer_testlib.py:47:BuildSmallModel:x = nn_ops.conv2d... (0/0 params, 0/18.43k flops)\n  model_analyzer_testlib.py:60:BuildFullModel:cell, array_ops.c... (0/1.04k params, 0/4.13k flops)\n  model_analyzer_testlib.py:62:BuildFullModel:target = array_op... (0/0 params, 0/0 flops)\n  model_analyzer_testlib.py:63:BuildFullModel:loss = nn_ops.l2_... (0/0 params, 0/0 flops)\n  model_analyzer_testlib.py:65:BuildFullModel:return sgd_op.min... (0/0 params, 0/8.19k flops)\n',
+            f.read())
+
+      self.assertLess(0, tfprof_node.total_exec_micros)
+      self.assertEqual(2844, tfprof_node.total_parameters)
+      self.assertEqual(54080, tfprof_node.total_float_ops)
+      self.assertEqual(5, len(tfprof_node.children))
+      self.assertEqual('_TFProfRoot', tfprof_node.name)
+      self.assertEqual('model_analyzer_testlib.py:56:BuildFullModel:seq.append(array_...',
+                       tfprof_node.children[0].name)
+      self.assertEqual('model_analyzer_testlib.py:60:BuildFullModel:cell, array_ops.c...',
+                       tfprof_node.children[1].name)
+      self.assertEqual('model_analyzer_testlib.py:62:BuildFullModel:target = array_op...',
+                       tfprof_node.children[2].name)
+      self.assertEqual('model_analyzer_testlib.py:63:BuildFullModel:loss = nn_ops.l2_...',
+                       tfprof_node.children[3].name)
+      self.assertEqual('model_analyzer_testlib.py:65:BuildFullModel:return sgd_op.min...',
+                       tfprof_node.children[4].name)
+      # pylint: enable=line-too-long
+
+  def testCodeViewLeafGraphNode(self):
+    ops.reset_default_graph()
+    opts = model_analyzer.TRAINABLE_VARS_PARAMS_STAT_OPTIONS.copy()
+    opts['account_type_regexes'] = ['.*']
+    opts['account_displayed_op_only'] = False
+    opts['select'] = [
+        'bytes', 'params', 'float_ops', 'num_hidden_ops', 'device'
+    ]
+
+    with session.Session() as sess, ops.device('/cpu:0'):
+      x = lib.BuildSmallModel()
+
+      sess.run(variables.global_variables_initializer())
+      run_meta = config_pb2.RunMetadata()
+      _ = sess.run(x,
+                   options=config_pb2.RunOptions(
+                       trace_level=config_pb2.RunOptions.FULL_TRACE),
+                   run_metadata=run_meta)
+
+      tfprof_node = model_analyzer.print_model_analysis(
+          sess.graph, run_meta, tfprof_cmd='code', tfprof_options=opts)
+
+      leaf = tfprof_node
+      while leaf.children:
+        self.assertEqual(0, len(leaf.graph_nodes))
+        leaf = leaf.children[0]
+      self.assertEqual(1, len(leaf.graph_nodes))
+
 
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/contrib/tfprof/python/tools/tfprof/model_analyzer_testlib.py b/tensorflow/contrib/tfprof/python/tools/tfprof/model_analyzer_testlib.py
new file mode 100644
index 0000000000000000000000000000000000000000..ed26f001c2e74eee5c2efca5a2356b08a94463ae
--- /dev/null
+++ b/tensorflow/contrib/tfprof/python/tools/tfprof/model_analyzer_testlib.py
@@ -0,0 +1,67 @@
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""A test lib that defines some models."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.contrib.rnn.python.ops.core_rnn_cell import BasicRNNCell
+from tensorflow.python.framework import dtypes
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import init_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import nn_ops
+from tensorflow.python.ops import rnn
+from tensorflow.python.ops import variable_scope
+from tensorflow.python.training import gradient_descent
+
+
+def BuildSmallModel():
+  """Build a small forward conv model."""
+  image = array_ops.zeros([2, 6, 6, 3])
+  _ = variable_scope.get_variable(
+      'ScalarW', [],
+      dtypes.float32,
+      initializer=init_ops.random_normal_initializer(stddev=0.001))
+  kernel = variable_scope.get_variable(
+      'DW', [3, 3, 3, 6],
+      dtypes.float32,
+      initializer=init_ops.random_normal_initializer(stddev=0.001))
+  x = nn_ops.conv2d(image, kernel, [1, 2, 2, 1], padding='SAME')
+  kernel = variable_scope.get_variable(
+      'DW2', [2, 2, 6, 12],
+      dtypes.float32,
+      initializer=init_ops.random_normal_initializer(stddev=0.001))
+  x = nn_ops.conv2d(x, kernel, [1, 2, 2, 1], padding='SAME')
+  return x
+
+
+def BuildFullModel():
+  """Build the full model with conv,rnn,opt."""
+  seq = []
+  for i in range(4):
+    with variable_scope.variable_scope('inp_%d' % i):
+      seq.append(array_ops.reshape(BuildSmallModel(), [2, 1, -1]))
+
+  cell = BasicRNNCell(16, 48)
+  out = rnn.dynamic_rnn(
+      cell, array_ops.concat(seq, axis=1), dtype=dtypes.float32)[0]
+
+  target = array_ops.ones_like(out)
+  loss = nn_ops.l2_loss(math_ops.reduce_mean(target - out))
+  sgd_op = gradient_descent.GradientDescentOptimizer(1e-2)
+  return sgd_op.minimize(loss)
+
+
diff --git a/tensorflow/contrib/tfprof/python/tools/tfprof/pprof_profiler.py b/tensorflow/contrib/tfprof/python/tools/tfprof/pprof_profiler.py
new file mode 100644
index 0000000000000000000000000000000000000000..c57e45748d2c9503d8a26c4e3e23477c28146f46
--- /dev/null
+++ b/tensorflow/contrib/tfprof/python/tools/tfprof/pprof_profiler.py
@@ -0,0 +1,445 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the 'License');
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an 'AS IS' BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Profiler for TensorFlow models that outputs data in pprof format.
+
+See https://github.com/google/pprof/blob/master/proto/profile.proto for pprof
+profile format.
+The following needs to be set for profiler to work:
+  * trace_level needs to be set to FULL_TRACE
+  * run_metadata object should be passed in to session.run call
+
+Sample usage:
+  options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE)
+  run_metadata = tf.RunMetadata()
+
+  with tf.Session as sess:
+    ...
+    sess.run(computation, run_metadata=run_metadata, options=options)
+  pprof_profiler.profile(sess.graph, run_metadata, output_dir)
+
+
+  The code above would output a pprof profile to separate output_dir/.*.pb.gz
+  file for each device. These files can be passed to pprof for formatting.
+  For e.g.:
+     pprof -png --nodecount=100 --sample_index=1 output_dir/profile_output.pb.gz
+"""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from collections import defaultdict
+from collections import namedtuple
+import gzip
+import os
+import string
+import sys
+import time
+
+from proto import profile_pb2
+
+
+if sys.version_info < (3,):
+  maketrans = string.maketrans
+else:
+  maketrans = str.maketrans
+
+
+ProfileDatum = namedtuple('ProfileDatum', [
+    'node_exec_stats', 'op_type', 'traceback'])
+
+
+class StringTable(object):
+  """Keeps track of strings to add to string_table in pprof proto."""
+
+  def __init__(self):
+    # Pprof requires first entry in string_table to be ''.
+    self._string_table = ['']
+    self._string_to_index = {'': 0}
+
+  def index_of(self, value_str):
+    """Get index of value_str in the string table.
+
+    If value_str is not in the string table, we will add it at the end
+    and then return the new index.
+    Args:
+      value_str: (string) Value to lookup/add in/to the string table.
+
+    Returns:
+      Index of value_str in the string table.
+    """
+    if value_str is None:
+      value_str = ''
+    if value_str in self._string_to_index:
+      return self._string_to_index[value_str]
+    index = len(self._string_table)
+    self._string_table.append(value_str)
+    self._string_to_index[value_str] = index
+    return index
+
+  def next_index(self):
+    """Gets index that would be assigned to the next added string.
+
+    Returns:
+      Index of the next string if it was added.
+    """
+    return len(self._string_table)
+
+  def string_table(self):
+    """Returns a list of strings to store in pprof's string_table."""
+    return self._string_table
+
+
+class Functions(object):
+  """Keeps track of `Function` protos for pprof profile."""
+
+  def __init__(self, string_table):
+    """Constructor.
+
+    Args:
+      string_table: A `StringTable` object.
+    """
+    self._string_table = string_table
+    # Maps tuples in the form (file_path, function_name, start_line_number)
+    # to `Function` protos.
+    self._function_key_to_function = {}
+
+  def index_of(self, file_path, function_name, function_start_line):
+    """Returns index of the function, adding the function if needed.
+
+    Args:
+      file_path: (string) Path to file where the function is defined.
+      function_name: (string) Function name.
+      function_start_line: (integer) Start line number of function definition.
+
+    Returns:
+      Function index.
+    """
+    function_key = (file_path, function_name, function_start_line)
+    if function_key in self._function_key_to_function:
+      return self._function_key_to_function[function_key].id
+    else:
+      # Function indexes should start from 1
+      function_index = len(self._function_key_to_function) + 1
+      function = profile_pb2.Function()
+      function.id = function_index
+      function.name = self._string_table.index_of(function_name)
+      function.filename = self._string_table.index_of(file_path)
+      function.start_line = function_start_line
+      self._function_key_to_function[function_key] = function
+      return function_index
+
+  def function_protos(self):
+    """Returns list of `profile_pb2.Function` protos."""
+    return self._function_key_to_function.values()
+
+
+class Locations(object):
+  """Keeps track of `Location` protos for pprof profile.
+
+  `Locations` store information about function call locations.
+  """
+
+  def __init__(self, functions):
+    """Constructor.
+
+    Args:
+      functions: A `Functions` object.
+    """
+    self._functions = functions
+    # Maps tuples in the form (file_path, called_function_name, line_number)
+    # to `Location` protos.
+    self._location_key_to_location = {}
+
+  def index_of(
+      self, file_path, line_number, called_function_name, called_file_path,
+      called_function_start_line):
+    """Returns index of the location, adding the location if needed.
+
+    Args:
+      file_path: (string) Path to file that makes the call.
+      line_number: (integer) Call line number.
+      called_function_name: (string) Function name of the function called at
+        `file_path` and `line_number`.
+      called_file_path: (string) Path to file where the called function is
+        defined.
+      called_function_start_line: (integer) Start line number of called
+        function definition in `called_file_path` file.
+
+    Returns:
+      Index of location.
+    """
+    location_key = (file_path, called_function_name, line_number)
+    if location_key in self._location_key_to_location:
+      location = self._location_key_to_location[location_key]
+      return location.id
+    else:
+      # Location indexes should start from 1
+      location_index = len(self._location_key_to_location) + 1
+      location = profile_pb2.Location()
+      location.id = location_index
+      self._location_key_to_location[location_key] = location
+
+      line = location.line.add()
+      line.function_id = self._functions.index_of(
+          called_file_path, called_function_name, called_function_start_line)
+      line.line = line_number
+      return location_index
+
+  def location_protos(self):
+    """Returns list of `profile_pb2.Location` protos."""
+    return self._location_key_to_location.values()
+
+
+class Samples(object):
+  """Keeps track of `Sample` protos for pprof profile.
+
+  Samples store the following statistics in order:
+  count, all_time, op_time
+  """
+
+  def __init__(self, string_table):
+    """Constructor.
+
+    Args:
+      string_table: A `StringTable` object.
+    """
+    self._string_table = string_table
+    # TODO(annarev): figure out if location is unique for each node name.
+    # If not, also key this dictionary based on location ids.
+    self._node_name_to_sample = {}
+
+  def add(self, datum, location_ids):
+    """Adds a sample data point.
+
+    Args:
+      datum: `ProfileDatum` to add a sample for.
+      location_ids: List of numberic location ids for this
+        sample.
+    """
+    node_name = datum.node_exec_stats.node_name
+    if node_name in self._node_name_to_sample:
+      sample = self._node_name_to_sample[node_name]
+      sample.location_id.extend(location_ids)
+    else:
+      sample = profile_pb2.Sample()
+      # Sample stores 3 values: count, all_time, op_time
+      sample.value.extend([0, 0, 0])
+
+      label = sample.label.add()
+      label.key = self._string_table.index_of('node_name')
+      label.str = self._string_table.index_of(node_name)
+      label = sample.label.add()
+      label.key = self._string_table.index_of('op_type')
+      label.str = self._string_table.index_of(datum.op_type)
+      self._node_name_to_sample[node_name] = sample
+    sample.value[0] += 1
+    sample.value[1] += datum.node_exec_stats.all_end_rel_micros
+    sample.value[2] += (
+        datum.node_exec_stats.op_end_rel_micros -
+        datum.node_exec_stats.op_start_rel_micros)
+
+  def get_sample_protos(self):
+    """Returns list of `Sample` protos for pprof profile."""
+    return self._node_name_to_sample.values()
+
+
+class PprofProfiler(object):
+  """Creates profiles in pprof format."""
+
+  def __init__(self, graph, run_metadata):
+    """Constructor.
+
+    Args:
+      graph: A `Graph` instance.
+      run_metadata: A list of `RunMetadata` objects.
+    """
+    self._graph = graph
+    self._run_metadata = run_metadata
+    self._string_table = StringTable()
+    self._functions = Functions(self._string_table)
+    self._locations = Locations(self._functions)
+
+  def profile(self):
+    """Generates pprof profiles.
+
+    Returns:
+      Dictionary mapping from device name to proto in `profile_pb2.Profile`
+      format.
+    """
+    profiles = {}
+    data_generator_func = self._get_profile_data_generator()
+    for device_index, device_stats in enumerate(
+        self._run_metadata.step_stats.dev_stats):
+      # Create profile
+      pprof_proto = self._get_pprof_proto(data_generator_func(device_stats))
+      if not pprof_proto.sample:
+        print(
+            'Not enough data to create profile for device %s. Did you pass '
+            'RunMetadata to session.run call?' % device_stats.device)
+        continue
+      # Add device name comment
+      device_count = len(self._run_metadata.step_stats.dev_stats)
+      device_description = (
+          'Device %d of %d: %s' %
+          (device_index + 1, device_count, device_stats.device))
+      device_description_str_index = self._string_table.next_index()
+      pprof_proto.string_table.append(device_description)
+      pprof_proto.comment.append(device_description_str_index)
+      profiles[device_stats.device] = pprof_proto
+    return profiles
+
+  def _get_pprof_proto(self, profile_datum_generator):
+    """Returns profile data in pprof proto format.
+
+    Args:
+      profile_datum_generator: Generator outputting `ProfileDatum` objects.
+
+    Returns:
+      A proto in pprof format.
+    """
+    pprof_profile = profile_pb2.Profile()
+    samples = Samples(self._string_table)
+
+    for datum in profile_datum_generator:
+      if not datum.traceback:
+        continue
+
+      stack_frame = datum.traceback[-1]
+      after_apply_op = False
+      location_ids = []
+
+      # We add locations from stack trace in bottom-up order.
+      for stack_frame_index in reversed(range(len(datum.traceback) - 1)):
+        prev_stack_frame = stack_frame
+        stack_frame = datum.traceback[stack_frame_index]
+
+        # Call at current frame calls function at previous frame.
+        prev_file_path = prev_stack_frame[0]
+        prev_function = prev_stack_frame[2]
+        prev_function_start_line = prev_stack_frame[4]
+        curr_file_path = stack_frame[0]
+        curr_line_number = stack_frame[1]
+
+        # Skip all calls up to apply_op since they are the same for all ops.
+        if not after_apply_op:
+          if prev_function == 'apply_op':
+            after_apply_op = True
+          continue
+        location_index = self._locations.index_of(
+            curr_file_path, curr_line_number,
+            prev_function, prev_file_path, prev_function_start_line)
+        location_ids.append(location_index)
+      samples.add(datum, location_ids)
+
+    sample_type_description = 'count'
+    sample_type = pprof_profile.sample_type.add()
+    sample_type.type = self._string_table.index_of(sample_type_description)
+    sample_type.unit = self._string_table.index_of('count')
+    sample_type_description = 'all_time'
+    sample_type = pprof_profile.sample_type.add()
+    sample_type.type = self._string_table.index_of(sample_type_description)
+    sample_type.unit = self._string_table.index_of('nanoseconds')
+    sample_type_description = 'op_time'
+    sample_type = pprof_profile.sample_type.add()
+    sample_type.type = self._string_table.index_of(sample_type_description)
+    sample_type.unit = self._string_table.index_of('nanoseconds')
+
+    pprof_profile.string_table.extend(self._string_table.string_table())
+    pprof_profile.sample.extend(samples.get_sample_protos())
+    pprof_profile.function.extend(self._functions.function_protos())
+    pprof_profile.location.extend(self._locations.location_protos())
+    return pprof_profile
+
+  def _get_profile_data_generator(self):
+    """Get function that generates `ProfileDatum` objects.
+
+    Returns:
+      A function that generates `ProfileDatum` objects.
+    """
+    node_to_traceback = defaultdict(list)
+    node_to_op_type = defaultdict(str)
+    for op in self._graph.get_operations():
+      node_to_traceback[op.name] = op.traceback_with_start_lines
+      node_to_op_type[op.name] = op.type
+
+    def profile_data_generator(device_step_stats):
+      for node_stats in device_step_stats.node_stats:
+        if node_stats.node_name == '_SOURCE' or node_stats.node_name == '_SINK':
+          continue
+        yield ProfileDatum(
+            node_stats,
+            node_to_op_type[node_stats.node_name],
+            node_to_traceback[node_stats.node_name])
+
+    return profile_data_generator
+
+
+def get_profiles(graph, run_metadata):
+  """Generate profiles in pprof format.
+
+  See https://github.com/google/pprof/blob/master/proto/profile.proto
+  for pprof proto format.
+
+  Args:
+    graph: A `Graph` object.
+    run_metadata: A `RunMetadata` proto.
+
+  Returns:
+    A dictionary mapping from device name to pprof proto for that device.
+  """
+  return PprofProfiler(graph, run_metadata).profile()
+
+
+def profile(graph, run_metadata, output_dir=None):
+  """Generate profiles in pprof format.
+
+  See https://github.com/google/pprof/blob/master/proto/profile.proto
+  for pprof proto format.
+
+  Args:
+    graph: A `Graph` object.
+    run_metadata: A `RunMetadata` proto.
+    output_dir: (string) Directory to output pprof profile to.
+      Profile files for each device will be stored in compressed
+      serialized proto format. If output_dir is None, profile protos
+      will be printed to stdout instead.
+
+  Returns:
+    List of output files created by this profile call.
+    (Note: this list will be empty if output_dir is None)
+  """
+  profiles = get_profiles(graph, run_metadata)
+  output_file_template = None
+  if output_dir:
+    if not os.path.isdir(output_dir):
+      os.makedirs(output_dir)
+    time_suffix = time.strftime('%Y%m%d%H%M%S')
+    output_file_template = os.path.join(
+        output_dir, '%s_' + time_suffix + '.pb.gz')
+
+  profile_files = []
+  for device, pprof_proto in profiles.items():
+    if output_file_template is None:
+      print('No output directory specified, printing to stdout instead.')
+      print(pprof_proto)
+    else:
+      device_name = str(device).strip('/').translate(
+          maketrans('/:', '__'))
+      profile_file = output_file_template % device_name
+      profile_files.append(profile_file)
+      with gzip.open(profile_file, 'w') as output_file:
+        print('Writing profile to %s...' % profile_file)
+        output_file.write(pprof_proto.SerializeToString())
+  return profile_files
diff --git a/tensorflow/contrib/tfprof/python/tools/tfprof/pprof_profiler_test.py b/tensorflow/contrib/tfprof/python/tools/tfprof/pprof_profiler_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..6487adf99204d7d2f22f47e937a6921c2a54e220
--- /dev/null
+++ b/tensorflow/contrib/tfprof/python/tools/tfprof/pprof_profiler_test.py
@@ -0,0 +1,164 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the 'License');
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an 'AS IS' BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for pprof_profiler."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import gzip
+
+from proto import profile_pb2
+from tensorflow.contrib.tfprof.python.tools.tfprof import pprof_profiler
+from tensorflow.core.framework import step_stats_pb2
+from tensorflow.core.protobuf import config_pb2
+from tensorflow.python.framework import constant_op
+from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.platform import test
+
+
+class PprofProfilerTest(test.TestCase):
+
+  def testDataEmpty(self):
+    output_dir = test.get_temp_dir()
+    run_metadata = config_pb2.RunMetadata()
+    graph = test.mock.MagicMock()
+    graph.get_operations.return_value = []
+
+    profiles = pprof_profiler.get_profiles(graph, run_metadata)
+    self.assertEquals(0, len(profiles))
+    profile_files = pprof_profiler.profile(
+        graph, run_metadata, output_dir)
+    self.assertEquals(0, len(profile_files))
+
+  def testRunMetadataEmpty(self):
+    output_dir = test.get_temp_dir()
+    run_metadata = config_pb2.RunMetadata()
+    graph = test.mock.MagicMock()
+    op1 = test.mock.MagicMock()
+    op1.name = 'Add/123'
+    op1.traceback = [('a/b/file1', 10, 'some_var')]
+    op1.type = 'add'
+    graph.get_operations.return_value = [op1]
+
+    profiles = pprof_profiler.get_profiles(graph, run_metadata)
+    self.assertEquals(0, len(profiles))
+    profile_files = pprof_profiler.profile(
+        graph, run_metadata, output_dir)
+    self.assertEquals(0, len(profile_files))
+
+  def testValidProfile(self):
+    output_dir = test.get_temp_dir()
+    run_metadata = config_pb2.RunMetadata()
+
+    node1 = step_stats_pb2.NodeExecStats(
+        node_name='Add/123',
+        op_start_rel_micros=3,
+        op_end_rel_micros=5,
+        all_end_rel_micros=4)
+
+    run_metadata = config_pb2.RunMetadata()
+    device1 = run_metadata.step_stats.dev_stats.add()
+    device1.device = 'deviceA'
+    device1.node_stats.extend([node1])
+
+    graph = test.mock.MagicMock()
+    op1 = test.mock.MagicMock()
+    op1.name = 'Add/123'
+    op1.traceback = [
+        ('a/b/file1', 10, 'apply_op', 'abc'), ('a/c/file2', 12, 'my_op', 'def')]
+    op1.type = 'add'
+    graph.get_operations.return_value = [op1]
+
+    expected_proto = """sample_type {
+  type: 5
+  unit: 5
+}
+sample_type {
+  type: 6
+  unit: 7
+}
+sample_type {
+  type: 8
+  unit: 7
+}
+sample {
+  value: 1
+  value: 4
+  value: 2
+  label {
+    key: 1
+    str: 2
+  }
+  label {
+    key: 3
+    str: 4
+  }
+}
+string_table: ""
+string_table: "node_name"
+string_table: "Add/123"
+string_table: "op_type"
+string_table: "add"
+string_table: "count"
+string_table: "all_time"
+string_table: "nanoseconds"
+string_table: "op_time"
+string_table: "Device 1 of 1: deviceA"
+comment: 9
+"""
+    # Test with protos
+    profiles = pprof_profiler.get_profiles(graph, run_metadata)
+    self.assertEquals(1, len(profiles))
+    self.assertTrue('deviceA' in profiles)
+    self.assertEquals(expected_proto, str(profiles['deviceA']))
+    # Test with files
+    profile_files = pprof_profiler.profile(
+        graph, run_metadata, output_dir)
+    self.assertEquals(1, len(profile_files))
+    with gzip.open(profile_files[0]) as profile_file:
+      profile_contents = profile_file.read()
+      profile = profile_pb2.Profile()
+      profile.ParseFromString(profile_contents)
+      self.assertEquals(expected_proto, str(profile))
+
+  def testProfileWithWhileLoop(self):
+    options = config_pb2.RunOptions()
+    options.trace_level = config_pb2.RunOptions.FULL_TRACE
+    run_metadata = config_pb2.RunMetadata()
+
+    num_iters = 5
+    with self.test_session() as sess:
+      i = constant_op.constant(0)
+      c = lambda i: math_ops.less(i, num_iters)
+      b = lambda i: math_ops.add(i, 1)
+      r = control_flow_ops.while_loop(c, b, [i])
+      sess.run(r, options=options, run_metadata=run_metadata)
+      profiles = pprof_profiler.get_profiles(sess.graph, run_metadata)
+      self.assertEquals(1, len(profiles))
+      profile = next(iter(profiles.values()))
+      add_samples = []  # Samples for the while/Add node
+      for sample in profile.sample:
+        if profile.string_table[sample.label[0].str] == 'while/Add':
+          add_samples.append(sample)
+      # Values for same nodes are aggregated.
+      self.assertEquals(1, len(add_samples))
+      # Value of "count" should be equal to number of iterations.
+      self.assertEquals(num_iters, add_samples[0].value[0])
+
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/contrib/tfprof/python/tools/tfprof/print_model_analysis_test.py b/tensorflow/contrib/tfprof/python/tools/tfprof/print_model_analysis_test.py
index f0ac36c66a11b0b985a0b91817795419990ef119..c3e9fc9cc099f144f81235a944221fa05b6b398c 100644
--- a/tensorflow/contrib/tfprof/python/tools/tfprof/print_model_analysis_test.py
+++ b/tensorflow/contrib/tfprof/python/tools/tfprof/print_model_analysis_test.py
@@ -51,7 +51,7 @@ TEST_OPTIONS = {
     'hide_name_regexes': [],
     'account_displayed_op_only': True,
     'select': ['params'],
-    'viz': False
+    'output': 'stdout',
 }
 
 # pylint: enable=bad-whitespace
@@ -92,16 +92,17 @@ class PrintModelAnalysisTest(test.TestCase):
     opts.account_displayed_op_only = TEST_OPTIONS['account_displayed_op_only']
     for p in TEST_OPTIONS['select']:
       opts.select.append(p)
-    opts.viz = TEST_OPTIONS['viz']
+    opts.output = TEST_OPTIONS['output']
 
     with session.Session() as sess, ops.device('/cpu:0'):
       _ = self._BuildSmallModel()
-      tfprof_pb = tfprof_output_pb2.TFProfNode()
+      tfprof_pb = tfprof_output_pb2.TFGraphNodeProto()
       tfprof_pb.ParseFromString(
-          print_mdl.PrintModelAnalysis(sess.graph.as_graph_def(
-          ).SerializeToString(), b'', b'', b'scope', opts.SerializeToString()))
+          print_mdl.PrintModelAnalysis(
+              sess.graph.as_graph_def().SerializeToString(),
+              b'', b'', b'scope', opts.SerializeToString()))
 
-      expected_pb = tfprof_output_pb2.TFProfNode()
+      expected_pb = tfprof_output_pb2.TFGraphNodeProto()
       text_format.Merge(r"""name: "_TFProfRoot"
       exec_micros: 0
       requested_bytes: 0
@@ -115,7 +116,6 @@ class PrintModelAnalysisTest(test.TestCase):
       total_exec_micros: 0
       total_requested_bytes: 0
       total_parameters: 0
-      device: "/device:CPU:0"
       float_ops: 0
       total_float_ops: 0
       }
@@ -127,7 +127,6 @@ class PrintModelAnalysisTest(test.TestCase):
       total_exec_micros: 0
       total_requested_bytes: 0
       total_parameters: 648
-      device: "/device:CPU:0"
       children {
       name: "DW/Assign"
       exec_micros: 0
@@ -135,7 +134,6 @@ class PrintModelAnalysisTest(test.TestCase):
       total_exec_micros: 0
       total_requested_bytes: 0
       total_parameters: 0
-      device: "/device:CPU:0"
       float_ops: 0
       total_float_ops: 0
       }
@@ -216,7 +214,6 @@ class PrintModelAnalysisTest(test.TestCase):
       total_exec_micros: 0
       total_requested_bytes: 0
       total_parameters: 0
-      device: "/device:CPU:0"
       float_ops: 0
       total_float_ops: 0
       }
@@ -230,7 +227,6 @@ class PrintModelAnalysisTest(test.TestCase):
       total_exec_micros: 0
       total_requested_bytes: 0
       total_parameters: 0
-      device: "/device:CPU:0"
       float_ops: 0
       total_float_ops: 0
       }
diff --git a/tensorflow/contrib/tfprof/python/tools/tfprof/tfprof_logger.py b/tensorflow/contrib/tfprof/python/tools/tfprof/tfprof_logger.py
index e8cf84b6c7703078c88bf369aa6f5aedae68243a..e6d504d5165d4608033f2de7ef386e662912e451 100644
--- a/tensorflow/contrib/tfprof/python/tools/tfprof/tfprof_logger.py
+++ b/tensorflow/contrib/tfprof/python/tools/tfprof/tfprof_logger.py
@@ -62,12 +62,13 @@ def _fill_missing_graph_shape(graph, run_meta):
   return graph
 
 
-def _get_logged_ops(graph, run_meta=None):
+def _get_logged_ops(graph, run_meta=None, add_trace=True):
   """Extract trainable model parameters and FLOPs for ops from a Graph.
 
   Args:
     graph: tf.Graph.
     run_meta: RunMetadata proto used to complete shape information.
+    add_trace: Whether to add op trace information.
   Returns:
     logged_ops: dict mapping from op_name to OpLogEntry.
   """
@@ -76,21 +77,32 @@ def _get_logged_ops(graph, run_meta=None):
 
   op_missing_shape = 0
   logged_ops = {}
-  graph_def = graph.as_graph_def()
-  for node in graph_def.node:
+  for op in graph.get_operations():
     try:
-      stats = ops.get_stats_for_node_def(graph, node, REGISTERED_FLOP_STATS)
+      stats = ops.get_stats_for_node_def(
+          graph, op.node_def, REGISTERED_FLOP_STATS)
     except ValueError:
       # Catch Exception When shape is incomplete. Skip it.
       op_missing_shape += 1
       stats = None
 
-    if not stats or not stats.value:
-      continue
-    if node.name not in logged_ops:
-      entry = tfprof_log_pb2.OpLogEntry()
-      entry.name = node.name
+    entry = tfprof_log_pb2.OpLogEntry()
+    entry.name = op.name
+    add_entry = False
+    if stats and stats.value:
       entry.float_ops = int(stats.value)
+      add_entry = True
+
+    if add_trace:
+      for tb in op.traceback:
+        trace = entry.code_def.traces.add()
+        trace.file = tb[0] if tb[0] else 'none'
+        trace.lineno = tb[1] if tb[1] else -1
+        trace.function = tb[2] if tb[2] else 'none'
+        trace.line = tb[3] if tb[3] else 'none'
+      add_entry = True
+
+    if add_entry:
       logged_ops[entry.name] = entry
 
   for v in graph.get_collection(ops.GraphKeys.TRAINABLE_VARIABLES):
@@ -108,18 +120,20 @@ def _get_logged_ops(graph, run_meta=None):
   return logged_ops
 
 
-def _merge_default_with_oplog(graph, op_log=None, run_meta=None):
+def _merge_default_with_oplog(graph, op_log=None, run_meta=None,
+                              add_trace=True):
   """Merge the tfprof default extra info with caller's op_log.
 
   Args:
     graph: tf.Graph.
     op_log: OpLog proto.
     run_meta: RunMetadata proto used to complete shape information.
+    add_trace: Whether to add op trace information.
   Returns:
     tmp_op_log: Merged OpLog proto.
   """
   tmp_op_log = tfprof_log_pb2.OpLog()
-  logged_ops = _get_logged_ops(graph, run_meta)
+  logged_ops = _get_logged_ops(graph, run_meta, add_trace=add_trace)
   if not op_log:
     tmp_op_log.log_entries.extend(logged_ops.values())
   else:
@@ -131,13 +145,15 @@ def _merge_default_with_oplog(graph, op_log=None, run_meta=None):
         all_ops[op_name].types.extend(entry.types)
         if entry.float_ops > 0 and all_ops[op_name].float_ops == 0:
           all_ops[op_name].float_ops = entry.float_ops
+        if entry.code_def.traces and not all_ops[op_name].code_def.traces:
+          all_ops[op_name].code_def.MergeFrom(entry.code_def)
       else:
         all_ops[op_name] = entry
     tmp_op_log.log_entries.extend(all_ops.values())
   return tmp_op_log
 
 
-def write_op_log(graph, log_dir, op_log=None, run_meta=None):
+def write_op_log(graph, log_dir, op_log=None, run_meta=None, add_trace=True):
   """Log provided 'op_log', and add additional model information below.
 
     The API also assigns ops in tf.trainable_variables() an op type called
@@ -154,8 +170,9 @@ def write_op_log(graph, log_dir, op_log=None, run_meta=None):
         one is created.
     run_meta: (Optional) RunMetadata proto that helps flops computation using
         run time shape information.
+    add_trace: Whether to add op trace information. Used to support "code" view.
   """
-  op_log = _merge_default_with_oplog(graph, op_log, run_meta)
+  op_log = _merge_default_with_oplog(graph, op_log, run_meta, add_trace)
 
   with gfile.Open(os.path.join(log_dir, 'tfprof_log'), 'w') as log:
     log.write(op_log.SerializeToString())
diff --git a/tensorflow/contrib/training/python/training/evaluation.py b/tensorflow/contrib/training/python/training/evaluation.py
index f4effaf6efeeac3c50084463ddca0443ec4fbc51..bc0c60c85ce029dcd0bc109e4f10d36b2e2c374e 100644
--- a/tensorflow/contrib/training/python/training/evaluation.py
+++ b/tensorflow/contrib/training/python/training/evaluation.py
@@ -254,7 +254,7 @@ def checkpoints_iterator(checkpoint_dir,
         logging.info('Timed-out waiting for a checkpoint.')
         return
       if timeout_fn():
-        # The timeout_fn indicated that we are truely done.
+        # The timeout_fn indicated that we are truly done.
         return
       else:
         # The timeout_fn indicated that more checkpoints may come.
@@ -290,17 +290,17 @@ class SummaryAtEndHook(session_run_hook.SessionRunHook):
       ValueError: If both `log_dir` and `summary_writer` are `None`.
     """
     self._summary_op = summary_op
+    self._replace_summary_op = summary_op is None
     self._feed_dict = feed_dict
     self._summary_writer = summary_writer
     self._log_dir = log_dir
-    self._summary_writer = summary_writer
     if self._log_dir is None and self._summary_writer is None:
       raise ValueError('One of log_dir or summary_writer should be used.')
-    self._global_step = variables.get_or_create_global_step()
 
   def begin(self):
-    if self._summary_op is None:
+    if self._replace_summary_op:
       self._summary_op = summary.merge_all()
+    self._global_step = variables.get_or_create_global_step()
 
   def after_create_session(self, session, coord):
     if self._summary_writer is None and self._log_dir:
diff --git a/tensorflow/contrib/training/python/training/feeder.py b/tensorflow/contrib/training/python/training/feeder.py
index a7f43cc07e9e48748c0aef46f31639f28382d8f0..a5cd7c5c947efff9154f9752d9bcf01e38a382a2 100644
--- a/tensorflow/contrib/training/python/training/feeder.py
+++ b/tensorflow/contrib/training/python/training/feeder.py
@@ -18,7 +18,7 @@
 
 This helper handles the plumbing in order to set up a feeder task to
 push generated inputs to a pool of remote consumers; or to run an
-identical feeding mechanism in a seperate thread in the same process.
+identical feeding mechanism in a separate thread in the same process.
 
 Example usage for distributed feeding:
 
@@ -331,7 +331,7 @@ class Feeder(object):
     they never close their queue. Second, they are added to the
     `Feeder.REMOTE_QUEUE_RUNNERS` collection, rather than
     `ops.GraphKeys.QUEUE_RUNNERS`, so they can be started/stopped
-    seperately.
+    separately.
 
     Args:
       queue: The queue.
diff --git a/tensorflow/contrib/training/python/training/feeder_test.py b/tensorflow/contrib/training/python/training/feeder_test.py
index 4d5cf9eff26041a26cc6dbb2ee02692e281df021..f3a2fee0463f25a18418e01c6240196326ef4965 100644
--- a/tensorflow/contrib/training/python/training/feeder_test.py
+++ b/tensorflow/contrib/training/python/training/feeder_test.py
@@ -156,7 +156,7 @@ class FeederTest(test.TestCase):
     coord.join()
 
   def testFeederSeparateThread(self):
-    # Start a feeder on a seperate thread, but with a shared local queue
+    # Start a feeder on a separate thread, but with a shared local queue
     servers = self._create_local_cluster(worker=1)
     coord = coordinator.Coordinator()
     feed_thread = FeederThread(self, coord, servers, 'worker', 0)
diff --git a/tensorflow/contrib/training/python/training/hparam.py b/tensorflow/contrib/training/python/training/hparam.py
index 987d2a0b70b6453a6a29f1904d13177e210e083e..2e08593699783e6544791fd21dedb2759a2ff2ac 100644
--- a/tensorflow/contrib/training/python/training/hparam.py
+++ b/tensorflow/contrib/training/python/training/hparam.py
@@ -84,9 +84,12 @@ def parse_values(values, type_map):
     name = m_dict['name']
     if name not in type_map:
       raise ValueError('Unknown hyperparameter type for %s' % name)
-    def parse_fail():
-      raise ValueError('Could not parse hparam %s in %s' % (name, values))
-    if type_map[name] == bool:
+    type_ = type_map[name]
+    def parse_fail(value):
+      raise ValueError(
+          'Could not parse hparam \'%s\' of type \'%s\' with value \'%s\' in %s'
+          % (name, type_.__name__, value, values))
+    if type_ == bool:
       def parse_bool(value):
         if value == 'true':
           return True
@@ -95,24 +98,24 @@ def parse_values(values, type_map):
         else:
           try:
             return bool(int(value))
-          except ValueError:
-            parse_fail()
+          except (ValueError, TypeError):
+            parse_fail(value)
       parse = parse_bool
     else:
-      parse = type_map[name]
+      parse = type_
     if m_dict['val'] is not None:
       try:
         ret[name] = parse(m_dict['val'])
-      except ValueError:
-        parse_fail()
+      except (ValueError, TypeError):
+        parse_fail(m_dict['val'])
     elif m_dict['vals'] is not None:
       elements = filter(None, re.split('[ ,]', m_dict['vals']))
       try:
         ret[name] = [parse(e) for e in elements]
-      except ValueError:
-        parse_fail()
+      except (ValueError, TypeError):
+        parse_fail(m_dict['vals'])
     else:
-      parse_fail()
+      parse_fail('')
   return ret
 
 
@@ -161,7 +164,7 @@ class HParams(object):
   import argparse
   parser = argparse.ArgumentParser(description='Train my model.')
   parser.add_argument('--hparams', type=str,
-                      help='Comma seperated list of "name=value" pairs.')
+                      help='Comma separated list of "name=value" pairs.')
   args = parser.parse_args()
   ...
   def my_program():
diff --git a/tensorflow/contrib/training/python/training/sequence_queueing_state_saver.py b/tensorflow/contrib/training/python/training/sequence_queueing_state_saver.py
index 7b85f998ea8e233a4ed24c60f4ccf41b1b9adf6d..2c7c30911c72d2395f1ea9d1ac6e258862cac943 100644
--- a/tensorflow/contrib/training/python/training/sequence_queueing_state_saver.py
+++ b/tensorflow/contrib/training/python/training/sequence_queueing_state_saver.py
@@ -1443,6 +1443,7 @@ def batch_sequences_with_states(input_key,
       input_length = input_length if input_length is not None else length
     elif input_sequences:
       # Assert that value_length is a multiple of num_unroll.
+      checked_input_sequences = {}
       for key, value in input_sequences.items():
         if (isinstance(value, sparse_tensor.SparseTensor) or
             isinstance(value, sparse_tensor.SparseTensorValue)):
@@ -1460,11 +1461,13 @@ def batch_sequences_with_states(input_key,
                           ", but saw value: ",
                           string_ops.as_string(value_length),
                           ". Consider setting pad=True."])])]):
-            input_sequences[key] = sparse_tensor.SparseTensor(
-                indices=value.indices,
+            checked_input_sequences[key] = sparse_tensor.SparseTensor(
+                indices=array_ops.identity(
+                    value.indices, name="multiple_of_checked"),
                 values=array_ops.identity(
                     value.values, name="multiple_of_checked"),
-                dense_shape=value.dense_shape)
+                dense_shape=array_ops.identity(
+                    value.dense_shape, name="multiple_of_checked"))
         else:
           if not isinstance(value, ops.Tensor):
             try:
@@ -1490,9 +1493,9 @@ def batch_sequences_with_states(input_key,
                       ])
                   ])
           ]):
-            input_sequences[key] = array_ops.identity(
+            checked_input_sequences[key] = array_ops.identity(
                 value, name="multiple_of_checked")
-
+      input_sequences = checked_input_sequences
     # Move SparseTensors in context into input_sequences.
     _move_sparse_tensor_out_context(input_context, input_sequences, num_unroll)
     # Deconstruct SparseTensors in sequence into a dense Tensor before inputting
@@ -1691,7 +1694,6 @@ def _move_sparse_tensor_out_context(input_context, input_sequences, num_unroll):
     ind = array_ops.expand_dims(ind, 1)
     ind = array_ops.expand_dims(ind, 2)
     ind = array_ops.tile(ind, [1, dim0, 1])
-    array_ops.reshape(ind, array_ops.stack([n, dim0, 1]))
 
     # Concatenate both and reshape.
     indices = array_ops.concat([ind, multiplied_indices], 2)
diff --git a/tensorflow/contrib/training/python/training/training_test.py b/tensorflow/contrib/training/python/training/training_test.py
index e7c8fcd2a09e8579da0cc8e15db75c14394c18a2..0af79cf2e3613eabfa64991ee94809974d777c33 100644
--- a/tensorflow/contrib/training/python/training/training_test.py
+++ b/tensorflow/contrib/training/python/training/training_test.py
@@ -508,7 +508,7 @@ class TrainTest(test.TestCase):
         # Initialize the variables.
         session.run(variables_lib2.global_variables_initializer())
 
-        # Get the intial weights and biases values.
+        # Get the initial weights and biases values.
         weights_values, biases_values = session.run([weights, biases])
         self.assertGreater(np.linalg.norm(weights_values), 0)
         self.assertAlmostEqual(np.linalg.norm(biases_values), 0)
diff --git a/tensorflow/contrib/verbs/grpc_verbs_service.cc b/tensorflow/contrib/verbs/grpc_verbs_service.cc
index e73b2700bd9ed180cc9989d8d7f8756a90bc12a9..f2af6b79fba6a480afbfe88fcbefcbf8a6670ce6 100644
--- a/tensorflow/contrib/verbs/grpc_verbs_service.cc
+++ b/tensorflow/contrib/verbs/grpc_verbs_service.cc
@@ -117,6 +117,8 @@ Status GrpcVerbsService::GetRemoteAddressSync(
   ra.lid = request->channel().lid();
   ra.qpn = request->channel().qpn();
   ra.psn = request->channel().psn();
+  ra.snp = request->channel().snp();
+  ra.iid = request->channel().iid();
   rc->SetRemoteAddress(ra, false);
   rc->Connect();
   int i = 0;
@@ -146,6 +148,8 @@ Status GrpcVerbsService::GetRemoteAddressSync(
   channel_info->set_lid(rc->self().lid);
   channel_info->set_qpn(rc->self().qpn);
   channel_info->set_psn(rc->self().psn);
+  channel_info->set_snp(rc->self().snp);
+  channel_info->set_iid(rc->self().iid);
   for (int i = 0; i < RdmaChannel::kNumMessageBuffers; i++) {
     MemoryRegion* mr = response->add_mr();
     mr->set_remote_addr(reinterpret_cast<uint64>(mb[i]->buffer()));
diff --git a/tensorflow/contrib/verbs/rdma.cc b/tensorflow/contrib/verbs/rdma.cc
index 53d840f5d1c14167cebdd8aff25f19eda2f1871b..c9ce754a371241078f72c471174e1462fc27558b 100644
--- a/tensorflow/contrib/verbs/rdma.cc
+++ b/tensorflow/contrib/verbs/rdma.cc
@@ -271,6 +271,11 @@ RdmaChannel::RdmaChannel(const RdmaAdapter* adapter, const string local_name,
     self_.lid = attr.lid;
     self_.qpn = qp_->qp_num;
     self_.psn = static_cast<uint32_t>(random::New64()) & 0xffffff;
+    union ibv_gid gid;
+    CHECK(!ibv_query_gid(adapter_->context_, (uint8_t)1, 0, &gid))
+        << "Query gid";
+    self_.snp = gid.global.subnet_prefix;
+    self_.iid = gid.global.interface_id;
   }
 
   // create message and ack buffers, then initialize the tables.
@@ -320,11 +325,15 @@ void RdmaChannel::SetRemoteAddress(const RdmaAddress& ra, bool override) {
     remote_.lid = ra.lid;
     remote_.qpn = ra.qpn;
     remote_.psn = ra.psn;
+    remote_.snp = ra.snp;
+    remote_.iid = ra.iid;
     remote_set_ = true;
   } else {
     CHECK(remote_.lid == ra.lid);
     CHECK(remote_.qpn == ra.qpn);
     CHECK(remote_.psn == ra.psn);
+    CHECK(remote_.snp == ra.snp);
+    CHECK(remote_.iid == ra.iid);
   }
 }
 
@@ -467,12 +476,20 @@ void RdmaChannel::Connect(const RdmaAddress& remoteAddr) {
     struct ibv_qp_attr attr;
     memset(&attr, 0, sizeof(ibv_qp_attr));
     attr.qp_state = IBV_QPS_RTR;
-    attr.path_mtu = IBV_MTU_4096;
+    struct ibv_port_attr port_attr;
+    CHECK(!ibv_query_port(adapter_->context_, (uint8_t)1, &port_attr))
+        << "Query port failed";
+    // This assumes both QP's ports are configured with the same MTU
+    attr.path_mtu = port_attr.active_mtu;
     attr.dest_qp_num = remoteAddr.qpn;
     attr.rq_psn = remoteAddr.psn;
     attr.max_dest_rd_atomic = 1;
     attr.min_rnr_timer = 12;
-    attr.ah_attr.is_global = 0;
+    attr.ah_attr.is_global = 1;
+    attr.ah_attr.grh.dgid.global.subnet_prefix = remoteAddr.snp;
+    attr.ah_attr.grh.dgid.global.interface_id = remoteAddr.iid;
+    attr.ah_attr.grh.flow_label = 0;
+    attr.ah_attr.grh.hop_limit = 255;
     attr.ah_attr.dlid = remoteAddr.lid;
     attr.ah_attr.sl = 0;
     attr.ah_attr.src_path_bits = 0;
@@ -765,11 +782,8 @@ void RdmaTensorBuffer::SendNextItem() {
         EnqueueItem(key_with_step_id);
       }
     };
-    // Use default session (legacy_session_)
-    // TODO use WorkerSessionForSession
-    // need to pass in session handle
-    channel_->adapter_->worker_env_->session_mgr->LegacySession()
-        ->rendezvous_mgr->RecvLocalAsync(step_id, parsed, cb);
+    channel_->adapter_->worker_env_->rendezvous_mgr
+        ->RecvLocalAsync(step_id, parsed, cb);
   }
 }
 
diff --git a/tensorflow/contrib/verbs/rdma.h b/tensorflow/contrib/verbs/rdma.h
index ae2aa63e3f6c5b82c9d85d3412bf45f00248a798..10cbbe58d9a81cbb0cf287922c28219fc4b06f4f 100644
--- a/tensorflow/contrib/verbs/rdma.h
+++ b/tensorflow/contrib/verbs/rdma.h
@@ -40,6 +40,8 @@ struct RdmaAddress {
   uint32_t lid;
   uint32_t qpn;
   uint32_t psn;
+  uint64_t snp;
+  uint64_t iid;
 };
 // structure to save information for remote memory regions.
 struct RemoteMR {
diff --git a/tensorflow/contrib/verbs/rdma_mgr.cc b/tensorflow/contrib/verbs/rdma_mgr.cc
index e28b80c6f6bfcb4596b1cd3fb994f1ad976f01ed..09b878843f52c910f78f3769522d1fa80319c7d7 100644
--- a/tensorflow/contrib/verbs/rdma_mgr.cc
+++ b/tensorflow/contrib/verbs/rdma_mgr.cc
@@ -69,6 +69,8 @@ void RdmaMgr::SetupChannels() {
     channel_info->set_lid(rc->self_.lid);
     channel_info->set_qpn(rc->self_.qpn);
     channel_info->set_psn(rc->self_.psn);
+    channel_info->set_snp(rc->self_.snp);
+    channel_info->set_iid(rc->self_.iid);
     for (int i = 0; i < RdmaChannel::kNumMessageBuffers; i++) {
       MemoryRegion* mr = req.add_mr();
       mr->set_remote_addr(
@@ -85,6 +87,8 @@ void RdmaMgr::SetupChannels() {
       ra.lid = resp.channel().lid();
       ra.qpn = resp.channel().qpn();
       ra.psn = resp.channel().psn();
+      ra.snp = resp.channel().snp();
+      ra.iid = resp.channel().iid();
       rc->SetRemoteAddress(ra, false);
       rc->Connect();
       int i = 0;
diff --git a/tensorflow/contrib/verbs/rdma_rendezvous_mgr.cc b/tensorflow/contrib/verbs/rdma_rendezvous_mgr.cc
index 8cbdfaa94391a4c127fcbcb620ad07400826cb2e..d665f92cd924291ab99a760287e99ec942a3b165 100644
--- a/tensorflow/contrib/verbs/rdma_rendezvous_mgr.cc
+++ b/tensorflow/contrib/verbs/rdma_rendezvous_mgr.cc
@@ -29,9 +29,9 @@ namespace tensorflow {
 
 class RdmaRemoteRendezvous : public BaseRemoteRendezvous {
  public:
-  RdmaRemoteRendezvous(const WorkerEnv* env, const string& worker_name,
+  RdmaRemoteRendezvous(const WorkerEnv* env,
                        int64 step_id, RdmaMgr* rdma_mgr)
-      : BaseRemoteRendezvous(env, worker_name, step_id, true),
+      : BaseRemoteRendezvous(env, step_id, true),
         rdma_mgr_(rdma_mgr) {}
 
  protected:
@@ -133,15 +133,12 @@ void RdmaRemoteRendezvous::RecvFromRemoteAsync(
   rb->SendNextItem();
 }
 
-RdmaRendezvousMgr::RdmaRendezvousMgr(const WorkerEnv* env,
-                                     const string& worker_name,
-                                     WorkerCacheInterface* worker_cache)
-    : BaseRendezvousMgr(env, worker_name) {}
+RdmaRendezvousMgr::RdmaRendezvousMgr(const WorkerEnv* env)
+    : BaseRendezvousMgr(env) {}
 
 BaseRemoteRendezvous* RdmaRendezvousMgr::Create(int64 step_id,
-                                                const WorkerEnv* worker_env,
-                                                const string& worker_name) {
-  return new RdmaRemoteRendezvous(worker_env, worker_name, step_id, rdma_mgr_);
+                                                const WorkerEnv* worker_env) {
+  return new RdmaRemoteRendezvous(worker_env, step_id, rdma_mgr_);
 }
 
 }  // end namespace tensorflow
diff --git a/tensorflow/contrib/verbs/rdma_rendezvous_mgr.h b/tensorflow/contrib/verbs/rdma_rendezvous_mgr.h
index 57cd4bf5e4e725a2958720029a79b92859c5504b..2dedd6c48f96a6ecf2b69c757f525ac1bfd6f2d0 100644
--- a/tensorflow/contrib/verbs/rdma_rendezvous_mgr.h
+++ b/tensorflow/contrib/verbs/rdma_rendezvous_mgr.h
@@ -45,13 +45,12 @@ namespace tensorflow {
 // RendezvousMgr must have keys generated by Rendezvous::CreateKey.
 class RdmaRendezvousMgr : public BaseRendezvousMgr {
  public:
-  explicit RdmaRendezvousMgr(const WorkerEnv* env, const string& worker_name,
-                             WorkerCacheInterface* worker_cache);
+  explicit RdmaRendezvousMgr(const WorkerEnv* env);
   void SetRdmaMgr(RdmaMgr* rdma_mgr) { rdma_mgr_ = rdma_mgr; }
 
  protected:
-  BaseRemoteRendezvous* Create(int64 step_id, const WorkerEnv* worker_env,
-                               const string& worker_name) override;
+  BaseRemoteRendezvous* Create(int64 step_id,
+                               const WorkerEnv* worker_env) override;
 
  private:
   RdmaMgr* rdma_mgr_;
diff --git a/tensorflow/contrib/verbs/verbs_server_lib.cc b/tensorflow/contrib/verbs/verbs_server_lib.cc
index b061c81d2d898907896ef6eb33ec8607c32ef1ed..c3597249354491186d0f654207b93c5e42559348 100644
--- a/tensorflow/contrib/verbs/verbs_server_lib.cc
+++ b/tensorflow/contrib/verbs/verbs_server_lib.cc
@@ -27,10 +27,8 @@ namespace tensorflow {
 
 namespace {
 // static utility function
-RendezvousMgrInterface* NewRdmaRendezvousMgr(
-    const WorkerEnv* env, const string& worker_name,
-    WorkerCacheInterface* worker_cache) {
-  return new RdmaRendezvousMgr(env, worker_name, worker_cache);
+RendezvousMgrInterface* NewRdmaRendezvousMgr(const WorkerEnv* env) {
+  return new RdmaRendezvousMgr(env);
 }
 
 }  // namespace
@@ -56,7 +54,7 @@ Status VerbsServer::ChannelCacheFactory(const ServerDef& server_def,
   TF_RETURN_IF_ERROR(ParseChannelSpec(server_def, &channel_spec));
 
   *channel_cache =
-      NewGrpcChannelCache(channel_spec, GetChannelCreationFunction(server_def));
+      NewGrpcChannelCache(channel_spec, GetChannelCreationFunction());
 
   const string host_port = (*channel_cache)->TranslateTask(name_prefix);
   int requested_port;
@@ -86,11 +84,7 @@ Status VerbsServer::Init(ServiceInitFunction service_func,
     rdma_mgr_ = new RdmaMgr(worker_env(), channel_cache_);
     // set rdma_mgr for verbs_service and rdma_rendezvous_mgr
     verbs_service_->SetRdmaMgr(rdma_mgr_);
-    // hardcoded to default session (legacy_session_)
-    // TODO: use WorkerSessionForSession
-    // need to pass in session handle
-    dynamic_cast<RdmaRendezvousMgr*>(
-        worker_env()->session_mgr->LegacySession()->rendezvous_mgr.get())
+    dynamic_cast<RdmaRendezvousMgr*>(worker_env()->rendezvous_mgr)
         ->SetRdmaMgr(rdma_mgr_);
   }
   return s;
diff --git a/tensorflow/contrib/verbs/verbs_service.proto b/tensorflow/contrib/verbs/verbs_service.proto
index b985febfb8c35a351a449de5a79971c4ad562f77..0df1fed4b9de81d7d99be3de9fba4be8b88ad404 100644
--- a/tensorflow/contrib/verbs/verbs_service.proto
+++ b/tensorflow/contrib/verbs/verbs_service.proto
@@ -30,6 +30,8 @@ message Channel {
   int32 lid = 1;
   int32 qpn = 2;
   int32 psn = 3;
+  uint64 snp = 4;
+  uint64 iid = 5;
 }
 
 message MemoryRegion {
diff --git a/tensorflow/contrib/xla_tf_graph/xla_tf_graph_util.cc b/tensorflow/contrib/xla_tf_graph/xla_tf_graph_util.cc
index f0dabc08a43821f0fcb5279f62d4a9bb07e34ef8..302aa6457ab08a30bca9c28a5f162331111c4b77 100644
--- a/tensorflow/contrib/xla_tf_graph/xla_tf_graph_util.cc
+++ b/tensorflow/contrib/xla_tf_graph/xla_tf_graph_util.cc
@@ -195,7 +195,6 @@ Status ConvertOpRequestToXlaNode(const xla::OperationRequest& operation_request,
 }
 
 void SetupXlaCpuClient(std::unique_ptr<FunctionLibraryDefinition>* flib_def,
-                       std::unique_ptr<FunctionLibraryRuntime>* flr,
                        std::unique_ptr<XlaCompiler>* compiler) {
   xla::Client* client = xla::ClientLibrary::LocalClientOrDie();
   XlaOpRegistry::RegisterCompilationKernels();
@@ -205,14 +204,11 @@ void SetupXlaCpuClient(std::unique_ptr<FunctionLibraryDefinition>* flib_def,
 
   // Setup compiler options
   XlaCompiler::Options options;
-  options.device_type = DeviceType(DEVICE_CPU_XLA_JIT);
+  DeviceType device_type(DEVICE_CPU_XLA_JIT);
+  options.device_type = &device_type;
+  options.flib_def = flib_def->get();
   options.client = client;
   compiler->reset(new XlaCompiler(options));
-
-  flr->reset(NewFunctionLibraryRuntime(
-      compiler->get()->device_mgr(), /*env=*/nullptr, compiler->get()->device(),
-      TF_GRAPH_DEF_VERSION, flib_def->get(), OptimizerOptions(),
-      /*custom_kernel_creator=*/nullptr));
 }
 
 }  // namespace
@@ -223,17 +219,16 @@ ConvertTfGraphToXlaSessionModule(const std::vector<XlaCompiler::Argument>& args,
   CHECK(graph);
 
   std::unique_ptr<FunctionLibraryDefinition> flib_def;
-  std::unique_ptr<FunctionLibraryRuntime> flr;
   std::unique_ptr<XlaCompiler> compiler;
 
-  SetupXlaCpuClient(&flib_def, &flr, &compiler);
+  SetupXlaCpuClient(&flib_def, &compiler);
 
   // Compile graph and build computation
   XlaCompiler::CompilationResult result;
-  TF_CHECK_OK(compiler->CompileGraph(GRAPH_NAME, std::move(graph), flr.get(),
-                                     args, &result));
+  TF_CHECK_OK(compiler->CompileGraph(XlaCompiler::CompileOptions(), GRAPH_NAME,
+                                     std::move(graph), args, &result));
 
-  return result.computation.Snapshot();
+  return result.computation->Snapshot();
 }
 
 xla::StatusOr<std::unordered_map<int64, XlaNode>>
diff --git a/tensorflow/contrib/xla_tf_graph/xla_tf_graph_util_test.cc b/tensorflow/contrib/xla_tf_graph/xla_tf_graph_util_test.cc
index 23649957f3a6ed233a4b5a343ee7f4ec64174150..beb4c8009b8e6f137af6b0d76275d8301a80460f 100644
--- a/tensorflow/contrib/xla_tf_graph/xla_tf_graph_util_test.cc
+++ b/tensorflow/contrib/xla_tf_graph/xla_tf_graph_util_test.cc
@@ -67,22 +67,19 @@ static void DumpHloGraphForDebug(const std::vector<XlaCompiler::Argument>& args,
 
   // Compiles the graph.
   XlaCompiler::Options options;
-  options.device_type = DeviceType("XLA_CPU_JIT");
+  DeviceType device_type("XLA_CPU_JIT");
+  options.device_type = &device_type;
   options.client = client;
+  options.flib_def = flib_def.get();
   compiler.reset(new XlaCompiler(options));
 
-  flr.reset(NewFunctionLibraryRuntime(compiler->device_mgr(), /*env=*/nullptr,
-                                      compiler->device(), TF_GRAPH_DEF_VERSION,
-                                      flib_def.get(), OptimizerOptions(),
-                                      /*custom_kernel_creator=*/nullptr));
-
   // Compile graph
   XlaCompiler::CompilationResult result;
-  TF_CHECK_OK(compiler->CompileGraph("dump", std::move(graph), flr.get(), args,
-                                     &result));
+  TF_CHECK_OK(compiler->CompileGraph(XlaCompiler::CompileOptions(), "dump",
+                                     std::move(graph), args, &result));
 
   // Convert to hlo
-  xla::Computation& computation = result.computation;
+  xla::Computation& computation = *result.computation;
 
   xla::Service* service(
       static_cast<xla::Service*>(xla::ClientLibrary::GetXlaService(
diff --git a/tensorflow/core/BUILD b/tensorflow/core/BUILD
index 0d2b0192e11f1e88c7bbd9081d67fd56610e7792..0006aaa0b5f140fc01e2f5097d8f0a8d251fa4d9 100644
--- a/tensorflow/core/BUILD
+++ b/tensorflow/core/BUILD
@@ -154,7 +154,9 @@ CORE_PROTO_SRCS = [
     "framework/versions.proto",
     "lib/core/error_codes.proto",
     "protobuf/config.proto",
+    "protobuf/cluster.proto",
     "protobuf/debug.proto",
+    "protobuf/device_properties.proto",
     "protobuf/queue_runner.proto",
     "protobuf/rewriter_config.proto",
     "protobuf/tensor_bundle.proto",
@@ -506,6 +508,7 @@ tf_gen_op_libs(
         "image_ops",
         "io_ops",
         "linalg_ops",
+        "lookup_ops",
         "logging_ops",
         "math_ops",
         "nn_ops",
@@ -582,6 +585,7 @@ cc_library(
         ":image_ops_op_lib",
         ":io_ops_op_lib",
         ":linalg_ops_op_lib",
+        ":lookup_ops_op_lib",
         ":logging_ops_op_lib",
         ":math_ops_op_lib",
         ":nn_ops_op_lib",
@@ -708,6 +712,7 @@ cc_library(
         "//tensorflow/core/kernels:image",
         "//tensorflow/core/kernels:io",
         "//tensorflow/core/kernels:linalg",
+        "//tensorflow/core/kernels:lookup",
         "//tensorflow/core/kernels:logging",
         "//tensorflow/core/kernels:math",
         "//tensorflow/core/kernels:multinomial_op",
@@ -736,6 +741,7 @@ cc_library(
         "//tensorflow/core/kernels:mkl_concat_op",
         "//tensorflow/core/kernels:mkl_conv_op",
         "//tensorflow/core/kernels:mkl_fused_batch_norm_op",
+        "//tensorflow/core/kernels:mkl_identity_op",
         "//tensorflow/core/kernels:mkl_lrn_op",
         "//tensorflow/core/kernels:mkl_pooling_ops",
         "//tensorflow/core/kernels:mkl_relu_op",
@@ -848,7 +854,6 @@ filegroup(
         "//tensorflow/core/platform/default/build_config:android_srcs",
         "//tensorflow/core/util/ctc:android_srcs",
         "//tensorflow/core/util/tensor_bundle:android_srcs",
-        "//tensorflow/core/grappler:android_srcs",
         "common_runtime/gpu/gpu_tracer.cc",
         "common_runtime/gpu/gpu_tracer.h",
     ] + glob(
@@ -1014,6 +1019,27 @@ cc_library(
     alwayslink = 1,
 )
 
+# Android library for use with the SELECTIVE_REGISTRATION feature with
+# no proto_rtti.
+cc_library(
+    name = "android_tensorflow_lib_selective_registration_nortti",
+    srcs = if_android(["//tensorflow/core:android_srcs"]),
+    copts = tf_copts() + tf_opts_nortti_if_android() + [
+        "-Os",
+        "-DSUPPORT_SELECTIVE_REGISTRATION",
+    ],
+    tags = [
+        "manual",
+        "notap",
+    ],
+    visibility = ["//visibility:public"],
+    deps = [
+        ":protos_cc",
+        "//third_party/eigen3",
+    ],
+    alwayslink = 1,
+)
+
 filegroup(
     name = "android_op_registrations_and_gradients",
     srcs = glob(
@@ -1282,7 +1308,10 @@ cc_library(
     ] + tf_additional_verbs_lib_defines(),
     linkopts = select({
         "//tensorflow:freebsd": [],
-        "//conditions:default": ["-ldl"],
+        "//conditions:default": [
+            "-ldl",
+            "-lpthread",
+        ],
     }),
     deps = tf_additional_lib_deps() + [
         ":lib_hash_crc32c_accelerate_internal",
@@ -1379,6 +1408,11 @@ tf_cuda_library(
             "framework/**/*.cc",
             "util/**/*.h",
             "util/**/*.cc",
+        ] + [
+            "graph/edgeset.h",
+            "graph/edgeset.cc",
+            "graph/graph.h",
+            "graph/graph.cc",
         ],
         exclude = [
             "**/*test*",
@@ -1483,42 +1517,151 @@ cc_library(
     deps = ["//tensorflow/core/platform/default/build_config:protos_cc"],
 )
 
+CORE_CPU_BASE_HDRS = [
+    "common_runtime/device.h",
+    "common_runtime/graph_runner.h",
+    "common_runtime/shape_refiner.h",
+    "framework/versions.h",
+    "graph/algorithm.h",
+    "graph/colors.h",
+    "graph/control_flow.h",
+    "graph/costmodel.h",
+    "graph/default_device.h",
+    "graph/edgeset.h",
+    "graph/graph.h",
+    "graph/graph_constructor.h",
+    "graph/graph_def_builder.h",
+    "graph/graph_partition.h",
+    "graph/mkl_layout_pass.h",
+    "graph/mkl_tfconversion_pass.h",
+    "graph/node_builder.h",
+    "graph/optimizer_cse.h",
+    "graph/subgraph.h",
+    "graph/tensor_id.h",
+    "graph/testlib.h",
+    "graph/types.h",
+    "graph/validate.h",
+]
+
+tf_cuda_library(
+    name = "core_cpu_base",
+    srcs = [
+        "common_runtime/shape_refiner.cc",
+        "common_runtime/shape_refiner.h",
+        "framework/versions.h",
+        "graph/algorithm.cc",
+        "graph/colors.cc",
+        "graph/control_flow.cc",
+        "graph/costmodel.cc",
+        "graph/graph_constructor.cc",
+        "graph/graph_def_builder.cc",
+        "graph/graph_partition.cc",
+        "graph/mkl_layout_pass.cc",
+        "graph/mkl_tfconversion_pass.cc",
+        "graph/node_builder.cc",
+        "graph/optimizer_cse.cc",
+        "graph/subgraph.cc",
+        "graph/tensor_id.cc",
+        "graph/validate.cc",
+        "public/session.h",
+        "public/session_options.h",
+        "public/version.h",
+    ],
+    hdrs = CORE_CPU_BASE_HDRS,
+    copts = tf_copts(),
+    deps = [
+        ":framework",
+        ":framework_internal",
+        ":lib",
+        ":lib_internal",
+        ":proto_text",
+        ":protos_all_cc",
+        "//tensorflow/core/kernels:required",
+        "//third_party/eigen3",
+    ],
+    alwayslink = 1,
+)
+
 tf_cuda_library(
     name = "core_cpu_internal",
-    srcs = glob(
-        [
-            "client/**/*.cc",
-            "common_runtime/*.h",
-            "common_runtime/*.cc",
-            "framework/versions.h",
-            "graph/**/*.h",
-            "graph/**/*.cc",
-            "public/session.h",
-            "public/session_options.h",
-            "public/version.h",
-        ],
-        exclude = [
-            "**/*test*",
-            "**/*main.cc",
-            "common_runtime/direct_session.cc",
-            "common_runtime/direct_session.h",
-            "common_runtime/gpu_device_context.h",
-        ],
-    ),
-    hdrs = glob(
-        [
-            "common_runtime/*.h",
-            "framework/versions.h",
-            "graph/**/*.h",
-        ],
-        exclude = [
-            "**/*test*",
-            "common_runtime/direct_session.h",
-            "common_runtime/gpu_device_context.h",
-        ],
-    ),
+    srcs = [
+        "common_runtime/allocator_retry.cc",
+        "common_runtime/bfc_allocator.cc",
+        "common_runtime/build_graph_options.cc",
+        "common_runtime/constant_folding.cc",
+        "common_runtime/copy_tensor.cc",
+        "common_runtime/costmodel_manager.cc",
+        "common_runtime/debugger_state_interface.cc",
+        "common_runtime/device.cc",
+        "common_runtime/device_factory.cc",
+        "common_runtime/device_mgr.cc",
+        "common_runtime/device_set.cc",
+        "common_runtime/executor.cc",
+        "common_runtime/function.cc",
+        "common_runtime/graph_optimizer.cc",
+        "common_runtime/graph_runner.cc",
+        "common_runtime/local_device.cc",
+        "common_runtime/memory_types.cc",
+        "common_runtime/optimization_registry.cc",
+        "common_runtime/parallel_concat_optimizer.cc",
+        "common_runtime/process_util.cc",
+        "common_runtime/renamed_device.cc",
+        "common_runtime/rendezvous_mgr.cc",
+        "common_runtime/resource_variable_read_optimizer.cc",
+        "common_runtime/session.cc",
+        "common_runtime/session_factory.cc",
+        "common_runtime/session_options.cc",
+        "common_runtime/session_state.cc",
+        "common_runtime/simple_graph_execution_state.cc",
+        "common_runtime/simple_placer.cc",
+        "common_runtime/stats_publisher_interface.cc",
+        "common_runtime/step_stats_collector.cc",
+        "common_runtime/threadpool_device.cc",
+        "common_runtime/threadpool_device_factory.cc",
+        "graph/gradients.cc",
+        "graph/quantize_training.cc",
+        "public/session.h",
+        "public/session_options.h",
+        "public/version.h",
+    ],
+    hdrs = CORE_CPU_BASE_HDRS + [
+        "common_runtime/allocator_retry.h",
+        "common_runtime/bfc_allocator.h",
+        "common_runtime/build_graph_options.h",
+        "common_runtime/constant_folding.h",
+        "common_runtime/copy_tensor.h",
+        "common_runtime/costmodel_manager.h",
+        "common_runtime/debugger_state_interface.h",
+        "common_runtime/device_factory.h",
+        "common_runtime/device_mgr.h",
+        "common_runtime/device_set.h",
+        "common_runtime/dma_helper.h",
+        "common_runtime/eigen_thread_pool.h",
+        "common_runtime/executor.h",
+        "common_runtime/function.h",
+        "common_runtime/graph_optimizer.h",
+        "common_runtime/local_device.h",
+        "common_runtime/memory_types.h",
+        "common_runtime/mkl_cpu_allocator.h",
+        "common_runtime/optimization_registry.h",
+        "common_runtime/pending_counts.h",
+        "common_runtime/process_util.h",
+        "common_runtime/profile_handler.h",
+        "common_runtime/renamed_device.h",
+        "common_runtime/rendezvous_mgr.h",
+        "common_runtime/session_factory.h",
+        "common_runtime/simple_graph_execution_state.h",
+        "common_runtime/simple_placer.h",
+        "common_runtime/stats_publisher_interface.h",
+        "common_runtime/step_stats_collector.h",
+        "common_runtime/threadpool_device.h",
+        "common_runtime/visitable_allocator.h",
+        "graph/gradients.h",
+        "graph/quantize_training.h",
+    ],
     copts = tf_copts(),
     deps = [
+               ":core_cpu_base",
                ":framework",
                ":framework_internal",
                ":function_ops_op_lib",
@@ -1529,6 +1672,8 @@ tf_cuda_library(
                ":proto_text",
                ":protos_all_cc",
                "//tensorflow/core/grappler:grappler_item",
+               "//tensorflow/core/grappler/clusters:utils",
+               "//tensorflow/core/grappler/clusters:virtual_cluster",
                "//tensorflow/core/grappler/optimizers:meta_optimizer",
                "//third_party/eigen3",
                "//tensorflow/core/kernels:required",
@@ -1570,6 +1715,7 @@ tf_cuda_library(
         ":lib_internal",
         ":proto_text",
         ":protos_all_cc",
+        "//tensorflow/core/debug:debug_graph_utils",
         "//tensorflow/core/kernels:function_ops",
     ],
     alwayslink = 1,
@@ -2119,6 +2265,7 @@ tf_cc_test_mkl(
         "//tensorflow/core/kernels:mkl_concat_op",
         "//tensorflow/core/kernels:mkl_conv_op",
         "//tensorflow/core/kernels:mkl_fused_batch_norm_op",
+        "//tensorflow/core/kernels:mkl_identity_op",
         "//tensorflow/core/kernels:mkl_lrn_op",
         "//tensorflow/core/kernels:mkl_pooling_ops",
         "//tensorflow/core/kernels:mkl_relu_op",
@@ -2219,9 +2366,12 @@ tf_cc_test(
         ":test_main",
         ":testlib",
         "//tensorflow/cc:cc_ops",
+        "//tensorflow/cc:cc_ops_internal",
+        "//tensorflow/cc:sendrecv_ops",
         "//tensorflow/core/kernels:bcast_ops",
         "//tensorflow/core/kernels:cast_op",
         "//tensorflow/core/kernels:concat_op",
+        "//tensorflow/core/kernels:cwise_op",
         "//tensorflow/core/kernels:identity_op",
         "//tensorflow/core/kernels:immutable_constant_op",
         "//tensorflow/core/kernels:matmul_op",
@@ -2413,6 +2563,9 @@ tf_cc_test(
         ":test_main",
         ":testlib",
         "//tensorflow/cc:cc_ops",
+        "//tensorflow/cc:cc_ops_internal",
+        "//tensorflow/cc:function_ops",
+        "//tensorflow/cc:functional_ops",
         "//tensorflow/core/kernels:cast_op",
         "//tensorflow/core/kernels:cwise_op",
         "//tensorflow/core/kernels:function_ops",
@@ -2717,6 +2870,7 @@ filegroup(
         # -- hand-edited variant: stops after a restart marker
         "lib/jpeg/testdata/corrupt34_4.jpg",
         # GIF data
+        "lib/gif/testdata/lena.gif",
         "lib/gif/testdata/scan.gif",
         # GIF data with optimization
         "lib/gif/testdata/optimized.gif",
diff --git a/tensorflow/core/common_runtime/build_graph_options.h b/tensorflow/core/common_runtime/build_graph_options.h
index 49566c8fa8fda1d46a86860b2f8f59d91c81ac71..5f0e8f170b9e9b0c6a3094e475fcc3bbf47756ea 100644
--- a/tensorflow/core/common_runtime/build_graph_options.h
+++ b/tensorflow/core/common_runtime/build_graph_options.h
@@ -19,6 +19,7 @@ limitations under the License.
 #include <vector>
 
 #include "tensorflow/core/platform/types.h"
+#include "tensorflow/core/protobuf/debug.pb.h"
 
 namespace tensorflow {
 
@@ -35,6 +36,8 @@ struct BuildGraphOptions {
   // TODO(mrry): Remove this when the distributed runtime supports Arg/Retval.
   bool use_function_convention = false;
 
+  DebugOptions debug_options;
+
   string DebugString() const;
 };
 
diff --git a/tensorflow/core/common_runtime/constant_folding.cc b/tensorflow/core/common_runtime/constant_folding.cc
index 3cd29c8e86e9dac3c8e36cbcafdc284437f4a8d9..8fa61d098eb702b6e0f7d06fdc57b2242f288627 100644
--- a/tensorflow/core/common_runtime/constant_folding.cc
+++ b/tensorflow/core/common_runtime/constant_folding.cc
@@ -35,6 +35,7 @@ limitations under the License.
 #include "tensorflow/core/graph/subgraph.h"
 #include "tensorflow/core/lib/core/threadpool.h"
 #include "tensorflow/core/lib/gtl/cleanup.h"
+#include "tensorflow/core/lib/gtl/flatset.h"
 #include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/public/session_options.h"
 
@@ -44,6 +45,9 @@ namespace {
 
 bool IsConstantFoldable(const Node* n,
                         const std::function<bool(const Node*)>& consider) {
+  if (n->IsConstant()) {
+    return true;
+  }
   if (n->op_def().is_stateful()) {
     return false;
   }
@@ -78,50 +82,65 @@ bool IsConstantFoldable(const Node* n,
   return true;
 }
 
-// Returns the constant foldable nodes in `nodes_result` in data flow order.
-void FindConstantFoldableNodes(const Graph* graph,
-                               const FunctionLibraryDefinition* flib_def,
-                               ConstantFoldingOptions opts,
-                               std::vector<Node*>* nodes_result) {
-  std::set<const Node*> node_set;
-  std::vector<Node*>& nodes = *nodes_result;
+// Returns the constant foldable nodes in `nodes` in topological order.
+// Populates `constant_control_deps` with the non-constant control depedencies
+// of each constant node.
+void FindConstantFoldableNodes(
+    const Graph* graph, ConstantFoldingOptions opts, std::vector<Node*>* nodes,
+    std::unordered_map<const Node*, gtl::FlatSet<Node*>>*
+        constant_control_deps) {
   bool internal_node_inserted = false;
   // Walk the nodes in data flow order
-  ReverseDFS(*graph, nullptr, [&nodes, &node_set, &internal_node_inserted, opts,
-                               flib_def](Node* n) {
-    if (n->IsConstant()) {
-      // Constants with no control inputs (except from _SOURCE node)
-      // are definitely constant foldable.
-      if (n->in_edges().size() == 0 ||
-          (n->in_edges().size() == 1 &&
-           (*n->in_edges().begin())->src()->IsSource())) {
-        node_set.insert(n);
-        nodes.push_back(n);
-      }
-    } else if (IsConstantFoldable(n, opts.consider)) {
-      // Check whether the set of this node's in_nodes is completely
-      // included in the set of constant foldable nodes. If true,
-      // then this node is also constant foldable.
-      bool all_parents_constant = true;
-      for (const Node* parent : n->in_nodes()) {
-        if (node_set.count(parent) == 0 && !parent->IsSource()) {
-          all_parents_constant = false;
-          break;
+  ReverseDFS(
+      *graph, nullptr,
+      [nodes, constant_control_deps, &internal_node_inserted, opts](Node* n) {
+        if (IsConstantFoldable(n, opts.consider)) {
+          // A node is constant provided all of its non-control
+          // incoming Tensors come from constant nodes.
+          //
+          // We allow control dependencies from non-constant nodes to constant
+          // nodes, but to preserve the graph structure we must transfer the
+          // control dependency onto any constant replacement.
+          bool all_parents_constant = true;
+          for (const Edge* in : n->in_edges()) {
+            // Allows non-constant -> constant control edges.
+            if (!in->IsControlEdge() &&
+                constant_control_deps->count(in->src()) == 0) {
+              all_parents_constant = false;
+              break;
+            }
+          }
+          if (all_parents_constant) {
+            gtl::FlatSet<Node*>& control_deps = (*constant_control_deps)[n];
+            for (const Edge* e : n->in_edges()) {
+              if (constant_control_deps->count(e->src()) == 0) {
+                if (!e->src()->IsSource()) {
+                  control_deps.insert(e->src());
+                }
+              } else {
+                // If the parent is constant, add all of its transitive control
+                // deps.
+                const gtl::FlatSet<Node*>& parent_deps =
+                    (*constant_control_deps)[e->src()];
+                control_deps.insert(parent_deps.begin(), parent_deps.end());
+              }
+            }
+            nodes->push_back(n);
+            if (!n->IsConstant()) {
+              internal_node_inserted = true;
+            }
+          }
         }
-      }
-      if (all_parents_constant) {
-        node_set.insert(n);
-        nodes.push_back(n);
-        internal_node_inserted = true;
-      }
-    }
-  });
+      });
   // If we have inserted just leaf level nodes, then there is nothing to fold.
   if (!internal_node_inserted) {
-    nodes.clear();
+    nodes->clear();
+    constant_control_deps->clear();
   }
 }
 
+typedef std::pair<Node*, int> NodeAndOutput;
+
 // Given the constant foldable nodes in 'nodes', returns a new graph 'g'. 'g'
 // will contain copies of the nodes in 'nodes'. In addition, if there is an edge
 // going from a node 'n' in 'nodes' to another node in 'orig_graph' but not in
@@ -132,23 +151,21 @@ Graph* GetConstantGraph(const Graph* orig_graph,
                         std::map<NodeAndOutput, Node*>* tensors_to_fetch) {
   Graph* constant_graph = new Graph(orig_graph->op_registry());
   std::unordered_map<Node*, Node*> node_map;
-  std::set<Node*> already_added;
-  already_added.insert(constant_graph->source_node());
-  already_added.insert(constant_graph->sink_node());
   node_map[orig_graph->source_node()] = constant_graph->source_node();
   node_map[orig_graph->sink_node()] = constant_graph->sink_node();
   for (Node* n : nodes) {
     Node* added = constant_graph->CopyNode(n);
     node_map[n] = added;
-    already_added.insert(added);
     for (const Edge* in_edge : n->in_edges()) {
-      Node* in = in_edge->src();
-      CHECK_GT(node_map.count(in), size_t{0}) << n->DebugString() << " <-"
-                                              << in->DebugString();
-      CHECK_GT(already_added.count(node_map[in]), size_t{0})
-          << in->DebugString();
-      constant_graph->AddEdge(node_map[in], in_edge->src_output(), added,
-                              in_edge->dst_input());
+      // Don't copy control edges to the constant graph.
+      if (!in_edge->IsControlEdge()) {
+        Node* in = in_edge->src();
+        auto it = node_map.find(in);
+        CHECK(it != node_map.end())
+            << n->DebugString() << " <-" << in->DebugString();
+        constant_graph->AddEdge(it->second, in_edge->src_output(), added,
+                                in_edge->dst_input());
+      }
     }
   }
 
@@ -170,10 +187,15 @@ int64 UniqueConstantId() {
   return id.fetch_add(1);
 }
 
-}  // namespace
-
+// Replaces the identified Tensor in 'graph' by a 'Const' node with
+// the value supplied in 'constant'. 'partition_device', if non-null
+// is the device where the graph executes. Returns true if the
+// replacement was successful, false otherwise.
+// 'control_deps' is the set of nodes that should be control predecessors of the
+// new constant node.
 bool ReplaceTensorWithConstant(Graph* graph, Device* partition_device,
-                               NodeAndOutput tensor, const Tensor& constant) {
+                               NodeAndOutput tensor, const Tensor& constant,
+                               const gtl::FlatSet<Node*>& control_deps) {
   // Be conservative when replacing a tensor with a constant, when not
   // running on CPU.
   // 1) If the destination tensor is not an int32 tensor, and has HOST_MEMORY
@@ -237,8 +259,8 @@ bool ReplaceTensorWithConstant(Graph* graph, Device* partition_device,
     return false;
   }
 
-  VLOG(1) << "Replacing " << tensor.first->DebugString()
-          << " :: " << tensor.second << " with a constant";
+  VLOG(1) << "Replacing " << tensor.first->name() << " :: " << tensor.second
+          << " with a constant";
 
   if (!NodeBuilder(builder).Finalize(graph, &constant_node).ok()) {
     return false;
@@ -247,35 +269,30 @@ bool ReplaceTensorWithConstant(Graph* graph, Device* partition_device,
     graph->AddEdge(constant_node, 0, edge->dst(), edge->dst_input());
     graph->RemoveEdge(edge);
   }
-  graph->AddEdge(graph->source_node(), -1, constant_node, -1);
+  if (control_deps.empty()) {
+    graph->AddControlEdge(graph->source_node(), constant_node);
+  } else {
+    for (Node* node : control_deps) {
+      graph->AddControlEdge(node, constant_node);
+    }
+  }
   if (partition_device) {
     constant_node->set_assigned_device_name(partition_device->name());
   }
   return true;
 }
 
-bool DoConstantFolding(const ConstantFoldingOptions& opts,
-                       FunctionLibraryRuntime* function_library, Env* env,
-                       Device* partition_device, Graph* graph) {
-  bool was_mutated;
-  Status unused_status = DoConstantFoldingWithStatus(
-      opts, function_library, env, partition_device, graph, &was_mutated);
-  return was_mutated;
-}
+}  // namespace
 
-Status DoConstantFoldingWithStatus(const ConstantFoldingOptions& opts,
-                                   FunctionLibraryRuntime* function_library,
-                                   Env* env, Device* partition_device,
-                                   Graph* graph, bool* was_mutated) {
+Status ConstantFold(const ConstantFoldingOptions& opts,
+                    FunctionLibraryRuntime* function_library, Env* env,
+                    Device* partition_device, Graph* graph, bool* was_mutated) {
   DumpGraph("Before", graph);
 
-  const FunctionLibraryDefinition* flib_def = nullptr;
-  if (function_library) {
-    flib_def = function_library->GetFunctionLibraryDefinition();
-  }
-
   std::vector<Node*> constant_foldable_nodes;
-  FindConstantFoldableNodes(graph, flib_def, opts, &constant_foldable_nodes);
+  std::unordered_map<const Node*, gtl::FlatSet<Node*>> constant_control_deps;
+  FindConstantFoldableNodes(graph, opts, &constant_foldable_nodes,
+                            &constant_control_deps);
   if (constant_foldable_nodes.empty()) {
     VLOG(1) << "No constant foldable nodes found";
     *was_mutated = false;
@@ -328,8 +345,11 @@ Status DoConstantFoldingWithStatus(const ConstantFoldingOptions& opts,
   // original graph with those constants.
   int32 num_nodes_replaced = 0;
   for (size_t c = 0; c < outputs.size(); ++c) {
+    const gtl::FlatSet<Node*>& control_deps =
+        constant_control_deps[tensors_to_replace[c].first];
     if (ReplaceTensorWithConstant(graph, partition_device,
-                                  tensors_to_replace[c], outputs[c])) {
+                                  tensors_to_replace[c], outputs[c],
+                                  control_deps)) {
       ++num_nodes_replaced;
     }
   }
diff --git a/tensorflow/core/common_runtime/constant_folding.h b/tensorflow/core/common_runtime/constant_folding.h
index 9e3479e50b094be7d3829f78629319c1e2a422ac..93289b875f5266558baecf1df3308c6430e04a9a 100644
--- a/tensorflow/core/common_runtime/constant_folding.h
+++ b/tensorflow/core/common_runtime/constant_folding.h
@@ -1,4 +1,4 @@
-/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -17,12 +17,20 @@ limitations under the License.
 #define TENSORFLOW_COMMON_RUNTIME_CONSTANT_FOLDING_H_
 
 #include "tensorflow/core/common_runtime/device.h"
+#include "tensorflow/core/framework/function.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/graph/graph.h"
-#include "tensorflow/core/graph/graph_constructor.h"
+#include "tensorflow/core/platform/env.h"
 
 namespace tensorflow {
 
+// Options specific to constant folding optimizations.
+struct ConstantFoldingOptions {
+  // If "consider" is not a nullptr, then only constant fold a node "n" if
+  // consider(n) returns true.
+  std::function<bool(const Node*)> consider = nullptr;
+};
+
 // Perform constant folding optimization on "graph".
 // Looks for nodes in "graph" that can be completely evaluated statically, i.e.,
 // that are only dependent on constants. Evaluates those nodes on a CPU device
@@ -32,25 +40,9 @@ namespace tensorflow {
 // Sets `was_mutated` to true if and only if "graph" has been mutated.
 // The status is only set to a non-OK state if an unexpected error is hit
 // running the graph.
-Status DoConstantFoldingWithStatus(const ConstantFoldingOptions& opts,
-                                   FunctionLibraryRuntime* function_library,
-                                   Env* env, Device* partition_device,
-                                   Graph* graph, bool* was_mutated);
-
-// Version of the function that doesn't return a Status, for backwards
-// compatibility.
-bool DoConstantFolding(const ConstantFoldingOptions& opts,
-                       FunctionLibraryRuntime* function_library, Env* env,
-                       Device* partition_device, Graph* graph);
-
-typedef std::pair<Node*, int> NodeAndOutput;
-
-// Replaces the identified Tensor in 'graph' by a 'Const' node with
-// the value supplied in 'constant'. 'partition_device', if non-null
-// is the device where the graph executes. Returns true if the
-// replacement was successful, false otherwise.
-bool ReplaceTensorWithConstant(Graph* graph, Device* partition_device,
-                               NodeAndOutput tensor, const Tensor& constant);
+Status ConstantFold(const ConstantFoldingOptions& opts,
+                    FunctionLibraryRuntime* function_library, Env* env,
+                    Device* partition_device, Graph* graph, bool* was_mutated);
 
 }  // namespace tensorflow
 
diff --git a/tensorflow/core/common_runtime/constant_folding_test.cc b/tensorflow/core/common_runtime/constant_folding_test.cc
index 3a9bdfe14419945a1ded24783015cb54d64d28b7..4a8560960ed522995f78d56f6ab092cbcb65d9a9 100644
--- a/tensorflow/core/common_runtime/constant_folding_test.cc
+++ b/tensorflow/core/common_runtime/constant_folding_test.cc
@@ -20,6 +20,8 @@ limitations under the License.
 
 #include "tensorflow/core/common_runtime/constant_folding.h"
 
+#include "tensorflow/cc/ops/array_ops_internal.h"
+#include "tensorflow/cc/ops/sendrecv_ops.h"
 #include "tensorflow/cc/ops/standard_ops.h"
 #include "tensorflow/core/common_runtime/device_factory.h"
 #include "tensorflow/core/common_runtime/device_mgr.h"
@@ -30,7 +32,6 @@ limitations under the License.
 #include "tensorflow/core/framework/tensor_testutil.h"
 #include "tensorflow/core/framework/types.h"
 #include "tensorflow/core/graph/node_builder.h"
-#include "tensorflow/core/graph/testlib.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
 #include "tensorflow/core/lib/core/threadpool.h"
 #include "tensorflow/core/lib/strings/strcat.h"
@@ -42,27 +43,14 @@ namespace {
 
 class ConstantFoldingTest : public ::testing::Test {
  protected:
-  ConstantFoldingTest() { Reset(); }
-  void Reset() { g_.reset(new Graph(OpRegistry::Global())); }
-
-  template <typename T>
-  Node* Constant(gtl::ArraySlice<T> values, TensorShape shape) {
-    return test::graph::Constant(g_.get(), test::AsTensor(values, shape));
-  }
-
-  template <typename T>
-  Node* Constant(T v) {
-    return test::graph::Constant(g_.get(), test::AsScalar(v));
-  }
-
   template <typename T>
   void ExpectNodeClose(const Node* n, gtl::ArraySlice<T> values,
                        TensorShape shape) {
     EXPECT_TRUE(n->IsConstant());
     const TensorProto* tensor_proto;
-    TF_EXPECT_OK(GetNodeAttr(n->def(), "value", &tensor_proto));
+    TF_EXPECT_OK(GetNodeAttr(n->attrs(), "value", &tensor_proto));
     DataType dtype;
-    TF_EXPECT_OK(GetNodeAttr(n->def(), "dtype", &dtype));
+    TF_EXPECT_OK(GetNodeAttr(n->attrs(), "dtype", &dtype));
     Tensor t(dtype);
     EXPECT_TRUE(t.FromProto(*tensor_proto));
     test::ExpectClose(t, test::AsTensor(values, shape));
@@ -73,46 +61,57 @@ class ConstantFoldingTest : public ::testing::Test {
                        TensorShape shape) {
     EXPECT_TRUE(n->IsConstant());
     const TensorProto* tensor_proto;
-    TF_EXPECT_OK(GetNodeAttr(n->def(), "value", &tensor_proto));
+    TF_EXPECT_OK(GetNodeAttr(n->attrs(), "value", &tensor_proto));
     DataType dtype;
-    TF_EXPECT_OK(GetNodeAttr(n->def(), "dtype", &dtype));
+    TF_EXPECT_OK(GetNodeAttr(n->attrs(), "dtype", &dtype));
     Tensor t(dtype);
     EXPECT_TRUE(t.FromProto(*tensor_proto));
     test::ExpectTensorEqual<T>(t, test::AsTensor(values, shape));
   }
 
-// Construct the following graph
-/*
-      s1  s2
-      |    |
-      m1   m2
-      / \ / \
-     a   b   c
-*/
-#define SIMPLE_GRAPH                                                  \
-  Reset();                                                            \
-  Graph* g = g_.get();                                                \
-  Node* a = Constant<float>({1.0, 0.0, 0.0, 1.0}, {2, 2});            \
-  Node* b = Constant<float>({1.0, 2.0, 3.0, 4.0}, {2, 2});            \
-  Node* c = Constant<float>({0.0, 1.0, 1.0, 0.0}, {2, 2});            \
-  g->AddControlEdge(g->source_node(), a);                             \
-  g->AddControlEdge(g->source_node(), b);                             \
-  g->AddControlEdge(g->source_node(), c);                             \
-  Node* m1 = test::graph::Matmul(g, a, b, false, false);              \
-  Node* s1 = test::graph::Send(g, m1, "m1", "sender", 0, "receiver"); \
-  Node* m2 = test::graph::Matmul(g, b, c, false, false);              \
-  Node* s2 = test::graph::Send(g, m2, "m2", "sender", 0, "receiver"); \
-  g->AddControlEdge(s1, g->sink_node());                              \
-  g->AddControlEdge(s2, g->sink_node());
-
-  std::unique_ptr<Graph> g_;
+  // Builds a map from node name to Node* for `graph`.
+  std::unordered_map<string, Node*> NodeNameIndex(const Graph& graph) {
+    std::unordered_map<string, Node*> index;
+    for (Node* node : graph.nodes()) {
+      index[node->name()] = node;
+    }
+    return index;
+  }
+
+  // Constructs the following graph.
+  /*
+        s1  s2
+        |    |
+        m1   m2
+        / \ / \
+       a   b   c
+  */
+  void BuildSimpleGraph(Scope* scope) {
+    Scope& s = *scope;
+    auto a = ops::Const<float>(s, {1.0, 0.0, 0.0, 1.0}, {2, 2});
+    auto b = ops::Const<float>(s, {1.0, 2.0, 3.0, 4.0}, {2, 2});
+    auto c = ops::Const<float>(s, {0.0, 1.0, 1.0, 0.0}, {2, 2});
+    auto m1 = ops::MatMul(s, a, b);
+    auto s1 = ops::_Send(s.WithOpName("s1"), m1, "m1", "sender", 0, "receiver");
+    auto m2 = ops::MatMul(s.WithOpName("m2"), b, c);
+    auto s2 = ops::_Send(s.WithOpName("s2"), m2, "m2", "sender", 0, "receiver");
+  }
 };
 
 TEST_F(ConstantFoldingTest, Basic) {
-  SIMPLE_GRAPH;
-  EXPECT_TRUE(DoConstantFolding(ConstantFoldingOptions{}, nullptr,
-                                Env::Default(), nullptr, g));
+  Scope s = Scope::NewRootScope();
+  BuildSimpleGraph(&s);
+  Graph g(OpRegistry::Global());
+  TF_ASSERT_OK(s.ToGraph(&g));
 
+  bool was_mutated;
+  TF_ASSERT_OK(ConstantFold(ConstantFoldingOptions{}, nullptr, Env::Default(),
+                            nullptr, &g, &was_mutated));
+  EXPECT_TRUE(was_mutated);
+
+  std::unordered_map<string, Node*> index = NodeNameIndex(g);
+  Node* s1 = index.at("s1");
+  Node* s2 = index.at("s2");
   // Nodes s1 and s2 now should now have a constant input
   EXPECT_EQ(1, s1->num_inputs());
   ExpectNodeClose<float>(*(s1->in_nodes().begin()), {1.0, 2.0, 3.0, 4.0},
@@ -123,11 +122,23 @@ TEST_F(ConstantFoldingTest, Basic) {
 }
 
 TEST_F(ConstantFoldingTest, ConsiderFunction) {
-  SIMPLE_GRAPH;
+  Scope s = Scope::NewRootScope();
+  BuildSimpleGraph(&s);
+  Graph g(OpRegistry::Global());
+  TF_ASSERT_OK(s.ToGraph(&g));
+
   ConstantFoldingOptions opts;
   // Do not allow constant folding of m2
-  opts.consider = [m2](const Node* n) { return m2 != n; };
-  EXPECT_TRUE(DoConstantFolding(opts, nullptr, Env::Default(), nullptr, g));
+  opts.consider = [](const Node* n) { return "m2" != n->name(); };
+  bool was_mutated;
+  TF_ASSERT_OK(
+      ConstantFold(opts, nullptr, Env::Default(), nullptr, &g, &was_mutated));
+  EXPECT_TRUE(was_mutated);
+
+  std::unordered_map<string, Node*> index = NodeNameIndex(g);
+  Node* s1 = index.at("s1");
+  Node* s2 = index.at("s2");
+  Node* m2 = index.at("m2");
 
   // Node s1 now should now have a constant input
   EXPECT_EQ(1, s1->num_inputs());
@@ -139,40 +150,52 @@ TEST_F(ConstantFoldingTest, ConsiderFunction) {
 }
 
 TEST_F(ConstantFoldingTest, TestNoReplaceAnotherConstant) {
-  SIMPLE_GRAPH;
-  Node* d = Constant<float>({1.0, 0.0, 0.0, 1.0}, {2, 2});
-  g->AddControlEdge(g->source_node(), d);
-  Node* s3 = test::graph::Send(g, d, "d", "sender", 0, "receiver");
-  g->AddControlEdge(s3, g->sink_node());
-  EXPECT_TRUE(DoConstantFolding(ConstantFoldingOptions{}, nullptr,
-                                Env::Default(), nullptr, g));
+  Graph g(OpRegistry::Global());
+  {
+    Scope s = Scope::NewRootScope();
+    BuildSimpleGraph(&s);
+    auto d = ops::Const<float>(s.WithOpName("d"), {1.0, 0.0, 0.0, 1.0}, {2, 2});
+    auto s3 = ops::_Send(s.WithOpName("s3"), d, "d", "sender", 0, "receiver");
+    TF_ASSERT_OK(s.ToGraph(&g));
+  }
+
+  bool was_mutated;
+  TF_ASSERT_OK(ConstantFold(ConstantFoldingOptions{}, nullptr, Env::Default(),
+                            nullptr, &g, &was_mutated));
+  EXPECT_TRUE(was_mutated);
+
+  std::unordered_map<string, Node*> index = NodeNameIndex(g);
+  Node* d = index.at("d");
+  Node* s3 = index.at("s3");
 
   // Nodes s3 should still have d as input
   EXPECT_EQ(1, s3->num_inputs());
   EXPECT_EQ(*(s3->in_nodes().begin()), d);
 }
 
-#undef SIMPLE_GRAPH
-
 TEST_F(ConstantFoldingTest, TwoOutputs) {
-  Reset();
-  Graph* g = g_.get();
-  Node* s0 = Constant<int>({1}, {1});
-  Node* s1 = Constant<int>({2, 2}, {2});
-  g->AddControlEdge(g->source_node(), s0);
-  g->AddControlEdge(g->source_node(), s1);
-  Node* b = test::graph::BroadcastGradientArgs(g, s0, s1);
-  Node* b0 = test::graph::Send(g, test::graph::Identity(g, b, 0),
-                               strings::StrCat(b->name(), "0"), "sender", 0,
-                               "receiver");
-  Node* b1 = test::graph::Send(g, test::graph::Identity(g, b, 1),
-                               strings::StrCat(b->name(), "1"), "sender", 0,
-                               "receiver");
-  g->AddControlEdge(b0, g->sink_node());
-  g->AddControlEdge(b1, g->sink_node());
-
-  EXPECT_TRUE(DoConstantFolding(ConstantFoldingOptions{}, nullptr,
-                                Env::Default(), nullptr, g));
+  Graph g(OpRegistry::Global());
+  {
+    Scope s = Scope::NewRootScope();
+    auto s0 = ops::Const<int>(s, {1}, {1});
+    auto s1 = ops::Const<int>(s, {2, 2}, {2});
+    auto b = ops::internal::BroadcastGradientArgs(s, s0, s1);
+    auto b0 = ops::_Send(s.WithOpName("b0"), ops::Identity(s, b.r0), "b0",
+                         "sender", 0, "receiver");
+    auto b1 = ops::_Send(s.WithOpName("b1"), ops::Identity(s, b.r1), "b1",
+                         "sender", 0, "receiver");
+    TF_ASSERT_OK(s.ToGraph(&g));
+  }
+
+  bool was_mutated;
+  TF_ASSERT_OK(ConstantFold(ConstantFoldingOptions{}, nullptr, Env::Default(),
+                            nullptr, &g, &was_mutated));
+  EXPECT_TRUE(was_mutated);
+
+  std::unordered_map<string, Node*> index = NodeNameIndex(g);
+  Node* b0 = index.at("b0");
+  Node* b1 = index.at("b1");
+
   EXPECT_EQ(1, b0->num_inputs());
   ExpectNodeEqual<int>(*(b0->in_nodes().begin()), {0, 1}, {2});
   EXPECT_EQ(1, b1->num_inputs());
@@ -180,126 +203,164 @@ TEST_F(ConstantFoldingTest, TwoOutputs) {
 }
 
 TEST_F(ConstantFoldingTest, TwoOutputsFoldOneOutput) {
-  Reset();
-  Graph* g = g_.get();
-  Node* s0 = Constant<int>({1}, {1});
-  Node* s1 = Constant<int>({2, 2}, {2});
-  g->AddControlEdge(g->source_node(), s0);
-  g->AddControlEdge(g->source_node(), s1);
-  Node* b = test::graph::BroadcastGradientArgs(g, s0, s1);
-  Node* b0 = test::graph::Send(g, test::graph::Identity(g, b, 0),
-                               strings::StrCat(b->name(), "0"), "sender", 0,
-                               "receiver");
-  Node* b1_ident = test::graph::Identity(g, b, 1);
-  Node* b1 = test::graph::Send(g, b1_ident, strings::StrCat(b->name(), "1"),
-                               "sender", 0, "receiver");
-  g->AddControlEdge(b0, g->sink_node());
-  g->AddControlEdge(b1, g->sink_node());
+  Graph g(OpRegistry::Global());
+  {
+    Scope s = Scope::NewRootScope();
+    auto s0 = ops::Const<int>(s, {1}, {1});
+    auto s1 = ops::Const<int>(s, {2, 2}, {2});
+    auto b = ops::internal::BroadcastGradientArgs(s, s0, s1);
+    auto b0 = ops::_Send(s.WithOpName("b0"), ops::Identity(s, b.r0), "b0",
+                         "sender", 0, "receiver");
+    auto b1_ident = ops::Identity(s.WithOpName("b1_ident"), b.r1);
+    auto b1 =
+        ops::_Send(s.WithOpName("b1"), b1_ident, "b1", "sender", 0, "receiver");
+    TF_ASSERT_OK(s.ToGraph(&g));
+  }
 
   ConstantFoldingOptions opts;
-  opts.consider = [b1_ident](const Node* n) { return b1_ident != n; };
-  EXPECT_TRUE(DoConstantFolding(opts, nullptr, Env::Default(), nullptr, g));
+  opts.consider = [](const Node* n) { return "b1_ident" != n->name(); };
+  bool was_mutated;
+  TF_ASSERT_OK(
+      ConstantFold(opts, nullptr, Env::Default(), nullptr, &g, &was_mutated));
+  EXPECT_TRUE(was_mutated);
+
+  std::unordered_map<string, Node*> index = NodeNameIndex(g);
+  Node* b0 = index.at("b0");
+  Node* b1 = index.at("b1");
+  Node* b1_ident = index.at("b1_ident");
+
   // 0th output of b should have been folded.
-  EXPECT_EQ(1, b0->num_inputs());
+  ASSERT_EQ(1, b0->num_inputs());
   ExpectNodeEqual<int>(*(b0->in_nodes().begin()), {0, 1}, {2});
   // 1st output of b should still be b1_ident. However, b1_ident's input must
   // have been replaced with a constant.
-  EXPECT_EQ(1, b1->num_inputs());
+  ASSERT_EQ(1, b1->num_inputs());
   EXPECT_EQ(*(b1->in_nodes().begin()), b1_ident);
 
-  EXPECT_EQ(1, b1_ident->num_inputs());
+  ASSERT_EQ(1, b1_ident->num_inputs());
   ExpectNodeEqual<int>(*(b1_ident->in_nodes().begin()), {}, {0});
 }
 
 TEST_F(ConstantFoldingTest, TestNoReplaceLargeConstant) {
-  Reset();
-  Graph* g = g_.get();
-  Node* s0 =
-      Constant<int>(std::vector<int>(5 * 1024 * 256, 0), {5 * 1024 * 256});
-  Node* s1 = Constant<int>(std::vector<int>(5 * 1024 * 256 + 1, 0),
-                           {5 * 1024 * 256 + 1});
-  Node* concat_dim = Constant<int>(0);
-  g->AddControlEdge(g->source_node(), s0);
-  g->AddControlEdge(g->source_node(), s1);
-  // Concat s0 and s1. The resulting tensor would be of size 10M + 1 bytes
-  Node* concat = test::graph::Concat(g, concat_dim, {s0, s1});
-  Node* concat_send =
-      test::graph::Send(g, concat, "concat_send", "sender", 0, "receiver");
-  g->AddControlEdge(concat_send, g->sink_node());
+  Graph g(OpRegistry::Global());
+  {
+    Scope s = Scope::NewRootScope();
+    auto s0 = ops::Const<int>(s, 0, {5 * 1024 * 256});
+    auto s1 = ops::Const<int>(s, 0, {5 * 1024 * 256 + 1});
+    auto concat_dim = ops::Const<int>(s, 0);
+    auto concat = ops::Concat(s, {s0, s1}, concat_dim);
+    auto concat_send = ops::_Send(s.WithOpName("concat_send"), concat,
+                                  "concat_send", "sender", 0, "receiver");
+    TF_ASSERT_OK(s.ToGraph(&g));
+  }
 
   // The above concat should not have been constant folded.
   bool was_mutated;
-  Status status =
-      DoConstantFoldingWithStatus(ConstantFoldingOptions{}, nullptr,
-                                  Env::Default(), nullptr, g, &was_mutated);
+  TF_EXPECT_OK(ConstantFold(ConstantFoldingOptions{}, nullptr, Env::Default(),
+                            nullptr, &g, &was_mutated));
   EXPECT_FALSE(was_mutated);
-  TF_EXPECT_OK(status);
 }
 
 TEST_F(ConstantFoldingTest, TestNoReplaceFunctionCall) {
-  FunctionDefLibrary fdef_lib;
-  *fdef_lib.add_function() = test::function::XTimesTwo();
-
-  FunctionLibraryDefinition flib_def(OpRegistry::Global(), fdef_lib);
-  g_.reset(new Graph(&flib_def));
-
-  Graph* g = g_.get();
-  Node* s =
-      Constant<int>(std::vector<int>(5 * 1024 * 256, 0), {5 * 1024 * 256});
-  g->AddControlEdge(g->source_node(), s);
-
-  NodeDef def;
-  TF_ASSERT_OK(NodeDefBuilder("times_two", "XTimesTwo", g->op_registry())
-                   .Input(s->name(), 0, DT_INT32)
-                   .Finalize(&def));
-  Status status;
-  Node* times_two = g->AddNode(def, &status);
-  TF_ASSERT_OK(status);
-
-  Node* times_two_send = test::graph::Send(g, times_two, "times_two_send",
-                                           "sender", 0, "receiver");
-  g->AddControlEdge(times_two_send, g->sink_node());
+  FunctionDefLibrary flib;
+  *flib.add_function() = test::function::XTimesTwo();
+
+  FunctionLibraryDefinition flib_def(OpRegistry::Global(), flib);
+  Graph g(flib_def);
+  {
+    Scope s = Scope::NewRootScope();
+    auto c = ops::Const<int32>(s.WithOpName("c"), {1}, {1});
+    TF_EXPECT_OK(s.graph()->AddFunctionLibrary(flib));
+
+    // TODO(phawkins): there is no way to make a function call using the C++
+    // graph builder API.
+    NodeDef def;
+    TF_ASSERT_OK(
+        NodeDefBuilder("times_two", "XTimesTwo", s.graph()->op_registry())
+            .Input(c.name(), 0, DT_INT32)
+            .Finalize(&def));
+    Status status;
+    Node* times_two = s.graph()->AddNode(def, &status);
+    TF_ASSERT_OK(status);
+    s.graph()->AddEdge(c.node(), 0, times_two, 0);
+
+    auto times_two_send =
+        ops::_Send(s.WithOpName("times_two_send"), Output(times_two),
+                   "times_two_send", "sender", 0, "receiver");
+    TF_ASSERT_OK(s.ToGraph(&g));
+  }
 
   // The above function call should not have been constant folded.
   bool was_mutated;
-  status =
-      DoConstantFoldingWithStatus(ConstantFoldingOptions{}, nullptr,
-                                  Env::Default(), nullptr, g, &was_mutated);
+  TF_EXPECT_OK(ConstantFold(ConstantFoldingOptions{}, nullptr, Env::Default(),
+                            nullptr, &g, &was_mutated));
   EXPECT_FALSE(was_mutated);
-  EXPECT_TRUE(status.ok());
-
-  g_ = nullptr;
 }
 
 REGISTER_OP("ConstantFoldingTestOp").Input("a: int64").Output("b: int64");
 
 TEST_F(ConstantFoldingTest, TestNoReplaceNonCPUOp) {
-  Graph* g = g_.get();
-
-  Node* aconst = Constant<int64>(std::vector<int64>(5, 0), {5});
-  g->AddControlEdge(g->source_node(), aconst);
-
-  NodeDef def;
-  TF_ASSERT_OK(
-      NodeDefBuilder("testop", "ConstantFoldingTestOp", g->op_registry())
-          .Input(aconst->name(), 0, DT_INT64)
-          .Finalize(&def));
-  Status status;
-  Node* non_cpu = g->AddNode(def, &status);
-  TF_ASSERT_OK(status);
-  g->AddEdge(aconst, 0, non_cpu, 0);
-
-  Node* non_cpu_send =
-      test::graph::Send(g, non_cpu, "non_cpu_send", "sender", 0, "receiver");
-  g->AddControlEdge(non_cpu_send, g->sink_node());
+  Graph g(OpRegistry::Global());
+  {
+    Scope s = Scope::NewRootScope();
+    auto aconst = ops::Const<int64>(s, 0, {5});
+
+    NodeDef def;
+    TF_ASSERT_OK(NodeDefBuilder("testop", "ConstantFoldingTestOp")
+                     .Input(aconst.name(), 0, DT_INT64)
+                     .Finalize(&def));
+    Status status;
+    Node* non_cpu = s.graph()->AddNode(def, &status);
+    TF_ASSERT_OK(status);
+
+    auto non_cpu_send =
+        ops::_Send(s.WithOpName("non_cpu_send"), Output(non_cpu),
+                   "non_cpu_send", "sender", 0, "receiver");
+    TF_ASSERT_OK(s.ToGraph(&g));
+  }
 
   // The non-CPU op should not have been constant folded.
   bool was_mutated;
-  status =
-      DoConstantFoldingWithStatus(ConstantFoldingOptions{}, nullptr,
-                                  Env::Default(), nullptr, g, &was_mutated);
+  TF_EXPECT_OK(ConstantFold(ConstantFoldingOptions{}, nullptr, Env::Default(),
+                            nullptr, &g, &was_mutated));
   EXPECT_FALSE(was_mutated);
-  EXPECT_TRUE(status.ok());
+}
+
+TEST_F(ConstantFoldingTest, ControlDependencies) {
+  Graph g(OpRegistry::Global());
+  {
+    Scope s = Scope::NewRootScope();
+    auto c0 = ops::Const<int>(s, 1);
+    auto recv1 = ops::_Recv(s.WithOpName("recv1"), DT_FLOAT, "recv1", "sender",
+                            0, "receiver");
+    auto c1 = ops::Const<int>(s.WithControlDependencies(recv1), 2);
+    auto recv2 = ops::_Recv(s.WithOpName("recv2"), DT_FLOAT, "recv2", "sender",
+                            0, "receiver");
+    auto c2 = ops::Const<int>(s.WithControlDependencies(recv2), 3);
+    auto add = ops::Add(s.WithControlDependencies(c2), c0, c1);
+    auto send =
+        ops::_Send(s.WithOpName("send"), add, "send", "sender", 0, "receiver");
+    TF_ASSERT_OK(s.ToGraph(&g));
+  }
+  bool was_mutated;
+  TF_EXPECT_OK(ConstantFold(ConstantFoldingOptions{}, nullptr, Env::Default(),
+                            nullptr, &g, &was_mutated));
+  EXPECT_TRUE(was_mutated);
+
+  std::unordered_map<string, Node*> index = NodeNameIndex(g);
+  Node* recv1 = index.at("recv1");
+  Node* recv2 = index.at("recv2");
+  Node* send = index.at("send");
+
+  ASSERT_EQ(1, send->num_inputs());
+  Node* p = *(send->in_nodes().begin());
+  ExpectNodeEqual<int>(p, {3}, {});
+
+  ASSERT_EQ(2, p->in_edges().size());
+  for (const Edge* e : p->in_edges()) {
+    EXPECT_TRUE(e->IsControlEdge());
+    EXPECT_TRUE(e->src() == recv1 || e->src() == recv2) << e->src()->name();
+  }
 }
 
 namespace {
@@ -365,8 +426,7 @@ class TestTFEnvironment : public ::tensorflow::EnvWrapper {
 }  // namespace
 
 TEST_F(ConstantFoldingTest, TestImmutableConst) {
-  Reset();
-  Graph* g = g_.get();
+  Graph g(OpRegistry::Global());
   Scope root = Scope::NewRootScope();
 
   auto a = ops::ImmutableConst(root, DT_DOUBLE, {2, 2}, kTestMemRegionName);
@@ -374,18 +434,16 @@ TEST_F(ConstantFoldingTest, TestImmutableConst) {
   auto c = ops::RandomGamma(root, {2, 2}, 2.0);
   auto result1 = ops::MatMul(root, a, b);
   auto result2 = ops::MatMul(root, result1, c);
-  TF_ASSERT_OK(root.ToGraph(g));
+  TF_ASSERT_OK(root.ToGraph(&g));
   TestTFEnvironment test_env;
   bool was_mutated;
-  Status status =
-      DoConstantFoldingWithStatus(ConstantFoldingOptions{}, nullptr,
-                                  Env::Default(), nullptr, g, &was_mutated);
+  Status status = ConstantFold(ConstantFoldingOptions{}, nullptr,
+                               Env::Default(), nullptr, &g, &was_mutated);
   EXPECT_FALSE(was_mutated);
   EXPECT_FALSE(status.ok());
-  status = DoConstantFoldingWithStatus(ConstantFoldingOptions{}, nullptr,
-                                       &test_env, nullptr, g, &was_mutated);
+  TF_EXPECT_OK(ConstantFold(ConstantFoldingOptions{}, nullptr, &test_env,
+                            nullptr, &g, &was_mutated));
   EXPECT_TRUE(was_mutated);
-  TF_EXPECT_OK(status);
 }
 
 }  // namespace
diff --git a/tensorflow/core/common_runtime/debugger_state_interface.cc b/tensorflow/core/common_runtime/debugger_state_interface.cc
index 2e2fbcd7f402a001c25d0453e2d57ba0478e2c09..c1a92f9a2214131565a5a0a930781702147658bf 100644
--- a/tensorflow/core/common_runtime/debugger_state_interface.cc
+++ b/tensorflow/core/common_runtime/debugger_state_interface.cc
@@ -15,10 +15,43 @@ limitations under the License.
 
 #include "tensorflow/core/common_runtime/debugger_state_interface.h"
 
+#include "tensorflow/core/lib/core/errors.h"
+
 namespace tensorflow {
 
+// static
 DebuggerStateFactory* DebuggerStateRegistry::factory_ = nullptr;
 
+// static
+DebugGraphDecoratorFactory* DebugGraphDecoratorRegistry::factory_ = nullptr;
+
+const string SummarizeDebugTensorWatches(
+    const protobuf::RepeatedPtrField<DebugTensorWatch>& watches) {
+  std::ostringstream oss;
+
+  for (const DebugTensorWatch& watch : watches) {
+    string tensor_name =
+        strings::StrCat(watch.node_name(), ":", watch.output_slot());
+    if (watch.tolerate_debug_op_creation_failures()) {
+      oss << "(TOL)";  // Shorthand for "tolerate".
+    }
+    oss << tensor_name << "|";
+
+    for (const string& debug_op : watch.debug_ops()) {
+      oss << debug_op << ",";
+    }
+
+    oss << "@";
+    for (const string& debug_url : watch.debug_urls()) {
+      oss << debug_url << ",";
+    }
+
+    oss << ";";
+  }
+
+  return oss.str();
+}
+
 // static
 void DebuggerStateRegistry::RegisterFactory(
     const DebuggerStateFactory& factory) {
@@ -27,11 +60,38 @@ void DebuggerStateRegistry::RegisterFactory(
 }
 
 // static
-std::unique_ptr<DebuggerStateInterface> DebuggerStateRegistry::CreateState(
-    const DebugOptions& debug_options) {
-  return (factory_ == nullptr || *factory_ == nullptr)
-             ? nullptr
-             : (*factory_)(debug_options);
+Status DebuggerStateRegistry::CreateState(
+    const DebugOptions& debug_options,
+    std::unique_ptr<DebuggerStateInterface>* state) {
+  if (factory_ == nullptr || *factory_ == nullptr) {
+    return errors::Internal(
+        "Creation of debugger state failed. "
+        "It appears that TFDBG is not linked in this TensorFlow build.");
+  } else {
+    *state = (*factory_)(debug_options);
+    return Status::OK();
+  }
+}
+
+// static
+void DebugGraphDecoratorRegistry::RegisterFactory(
+    const DebugGraphDecoratorFactory& factory) {
+  delete factory_;
+  factory_ = new DebugGraphDecoratorFactory(factory);
+}
+
+// static
+Status DebugGraphDecoratorRegistry::CreateDecorator(
+    const DebugOptions& options,
+    std::unique_ptr<DebugGraphDecoratorInterface>* decorator) {
+  if (factory_ == nullptr || *factory_ == nullptr) {
+    return errors::Internal(
+        "Creation of graph decorator failed. "
+        "It appears that TFDBG is not linked in this TensorFlow build.");
+  } else {
+    *decorator = (*factory_)(options);
+    return Status::OK();
+  }
 }
 
 }  // end namespace tensorflow
diff --git a/tensorflow/core/common_runtime/debugger_state_interface.h b/tensorflow/core/common_runtime/debugger_state_interface.h
index fb72f9fa3ea0bfc86500691f345c1f3343c8b884..6f197f372faa3fd0bec8fe1e95daf538224ed3e9 100644
--- a/tensorflow/core/common_runtime/debugger_state_interface.h
+++ b/tensorflow/core/common_runtime/debugger_state_interface.h
@@ -18,28 +18,24 @@ limitations under the License.
 
 #include <memory>
 
+#include "tensorflow/core/common_runtime/device.h"
+#include "tensorflow/core/graph/graph.h"
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/platform/macros.h"
+#include "tensorflow/core/platform/protobuf.h"
+#include "tensorflow/core/protobuf/debug.pb.h"
 
 namespace tensorflow {
 
-class DebugOptions;  // Defined in core/protobuf/debug.h.
-class Device;
-class Graph;
+// Returns a summary string for the list of debug tensor watches.
+const string SummarizeDebugTensorWatches(
+    const protobuf::RepeatedPtrField<DebugTensorWatch>& watches);
 
 // An abstract interface for storing and retrieving debugging information.
 class DebuggerStateInterface {
  public:
   virtual ~DebuggerStateInterface() {}
 
-  // Returns a summary string for RepeatedPtrFields of DebugTensorWatches.
-  virtual const string SummarizeDebugTensorWatches() = 0;
-
-  // Insert special-purpose debug nodes to graph and dump the graph for
-  // record. See the documentation of DebugNodeInserter::InsertNodes() for
-  // details.
-  virtual Status DecorateGraphForDebug(Graph* graph, Device* device) = 0;
-
   // Publish metadata about the debugged Session::Run() call.
   //
   // Args:
@@ -59,6 +55,19 @@ class DebuggerStateInterface {
       const std::vector<string>& target_nodes) = 0;
 };
 
+class DebugGraphDecoratorInterface {
+ public:
+  virtual ~DebugGraphDecoratorInterface() {}
+
+  // Insert special-purpose debug nodes to graph and dump the graph for
+  // record. See the documentation of DebugNodeInserter::InsertNodes() for
+  // details.
+  virtual Status DecorateGraph(Graph* graph, Device* device) = 0;
+
+  // Publish Graph to debug URLs.
+  virtual Status PublishGraph(const Graph& graph) = 0;
+};
+
 typedef std::function<std::unique_ptr<DebuggerStateInterface>(
     const DebugOptions& options)>
     DebuggerStateFactory;
@@ -74,11 +83,12 @@ class DebuggerStateRegistry {
   // implementation based on DebugOptions.
   static void RegisterFactory(const DebuggerStateFactory& factory);
 
-  // If RegisterFactory() has been called, creates and returns a concrete
+  // If RegisterFactory() has been called, creates and supplies a concrete
   // DebuggerStateInterface implementation using the registered factory,
-  // owned by the caller.  Otherwise returns nullptr.
-  static std::unique_ptr<DebuggerStateInterface> CreateState(
-      const DebugOptions& debug_options);
+  // owned by the caller and return an OK Status. Otherwise returns an error
+  // Status.
+  static Status CreateState(const DebugOptions& debug_options,
+                            std::unique_ptr<DebuggerStateInterface>* state);
 
  private:
   static DebuggerStateFactory* factory_;
@@ -86,6 +96,24 @@ class DebuggerStateRegistry {
   TF_DISALLOW_COPY_AND_ASSIGN(DebuggerStateRegistry);
 };
 
+typedef std::function<std::unique_ptr<DebugGraphDecoratorInterface>(
+    const DebugOptions& options)>
+    DebugGraphDecoratorFactory;
+
+class DebugGraphDecoratorRegistry {
+ public:
+  static void RegisterFactory(const DebugGraphDecoratorFactory& factory);
+
+  static Status CreateDecorator(
+      const DebugOptions& options,
+      std::unique_ptr<DebugGraphDecoratorInterface>* decorator);
+
+ private:
+  static DebugGraphDecoratorFactory* factory_;
+
+  TF_DISALLOW_COPY_AND_ASSIGN(DebugGraphDecoratorRegistry);
+};
+
 }  // end namespace tensorflow
 
 #endif  // TENSORFLOW_COMMON_RUNTIME_DEBUGGER_STATE_INTERFACE_H_
diff --git a/tensorflow/core/common_runtime/device.cc b/tensorflow/core/common_runtime/device.cc
index 78649afeb93aa7cb2231c35b9e69651b08ac6fb2..aa8a2d989bf5479254bb4b6fc5bdfb32e17c7325 100644
--- a/tensorflow/core/common_runtime/device.cc
+++ b/tensorflow/core/common_runtime/device.cc
@@ -23,8 +23,7 @@ limitations under the License.
 
 namespace tensorflow {
 
-Device::Device(Env* env, const DeviceAttributes& device_attributes,
-               Allocator* device_allocator)
+Device::Device(Env* env, const DeviceAttributes& device_attributes)
     : DeviceBase(env), device_attributes_(device_attributes) {
   CHECK(DeviceNameUtils::ParseFullName(name(), &parsed_name_))
       << "Invalid device name: " << name();
diff --git a/tensorflow/core/common_runtime/device.h b/tensorflow/core/common_runtime/device.h
index 07c6bdd6831923c02176206120df276fc180c985..c0e58f143e350ea9300c38b00adee9d423bdd64f 100644
--- a/tensorflow/core/common_runtime/device.h
+++ b/tensorflow/core/common_runtime/device.h
@@ -53,8 +53,7 @@ namespace tensorflow {
 
 class Device : public DeviceBase {
  public:
-  Device(Env* env, const DeviceAttributes& device_attributes,
-         Allocator* device_allocator);
+  Device(Env* env, const DeviceAttributes& device_attributes);
   ~Device() override;
 
   // Full name of this device (see top comment).
diff --git a/tensorflow/core/common_runtime/device_mgr.cc b/tensorflow/core/common_runtime/device_mgr.cc
index 7807656cb25cb27a5472b4d120b22f529d552da7..31f12d4833793ef80646bd8936b50d4f6e812af1 100644
--- a/tensorflow/core/common_runtime/device_mgr.cc
+++ b/tensorflow/core/common_runtime/device_mgr.cc
@@ -29,10 +29,18 @@ DeviceMgr::DeviceMgr(const std::vector<Device*>& devices)
   for (Device* d : devices) {
     devices_.push_back(d);
 
-    // Register under both the full name and the local name.
+    // Register under the (1) full name, (2) canonical name, and (3) local name.
     string full_name = d->name();
     device_map_[CopyToBackingStore(full_name)] = d;
 
+    DeviceNameUtils::ParsedName parsed_name = d->parsed_name();
+    if (parsed_name.has_job && parsed_name.has_replica &&
+        parsed_name.has_task && parsed_name.has_type && parsed_name.has_id) {
+      string canonical_name = DeviceNameUtils::FullName(
+          parsed_name.job, parsed_name.replica, parsed_name.task,
+          parsed_name.type, parsed_name.id);
+      device_map_[CopyToBackingStore(canonical_name)] = d;
+    }
     string lname = DeviceNameUtils::LocalName(d->name());
     device_map_[CopyToBackingStore(lname)] = d;
     device_type_counts_[d->device_type()]++;
@@ -40,7 +48,8 @@ DeviceMgr::DeviceMgr(const std::vector<Device*>& devices)
 }
 
 DeviceMgr::~DeviceMgr() {
-  for (auto p : devices_) delete p;
+  // TODO(b/37437134): Remove destructor after converting to std::unique_ptr.
+  for (Device* p : devices_) delete p;
 }
 
 StringPiece DeviceMgr::CopyToBackingStore(StringPiece s) {
@@ -85,6 +94,12 @@ Status DeviceMgr::LookupDevice(StringPiece name, Device** device) const {
   Status s;
   auto iter = device_map_.find(name);
   if (iter == device_map_.end()) {
+    std::vector<StringPiece> device_names;
+    for (auto&& itr : device_map_) {
+      device_names.push_back(itr.first);
+    }
+    LOG(WARNING) << "Unknown device: " << name
+                 << " all devices: " << str_util::Join(device_names, ", ");
     return errors::InvalidArgument(name, " unknown device.");
   }
   *device = iter->second;
diff --git a/tensorflow/core/common_runtime/device_mgr.h b/tensorflow/core/common_runtime/device_mgr.h
index bb1ed726408b5d778517d8b76c224d2a070c69a3..d16681ac59d3bc34a54f63b8b55f372c661591b4 100644
--- a/tensorflow/core/common_runtime/device_mgr.h
+++ b/tensorflow/core/common_runtime/device_mgr.h
@@ -36,6 +36,7 @@ class DeviceMgr {
  public:
   // Takes ownership of each device in 'devices'.
   // TODO(zhifengc): Other initialization information.
+  // TODO(b/37437134): Use std::unique_ptr's to track ownership.
   explicit DeviceMgr(const std::vector<Device*>& devices);
   ~DeviceMgr();
 
@@ -61,6 +62,7 @@ class DeviceMgr {
   int NumDeviceType(const string& type) const;
 
  private:
+  // TODO(b/37437134): Use std::unique_ptr's to track ownership.
   typedef gtl::InlinedVector<Device*, 8> DeviceVec;
   DeviceVec devices_;
 
diff --git a/tensorflow/core/common_runtime/device_set.h b/tensorflow/core/common_runtime/device_set.h
index b0540dfa95b3e3d34c5eef770236bdde695a7cd5..4cd56e583c09f70cd375e775eb2db9071871311f 100644
--- a/tensorflow/core/common_runtime/device_set.h
+++ b/tensorflow/core/common_runtime/device_set.h
@@ -39,7 +39,10 @@ class DeviceSet {
 
   // Set the device designated as the "client".  This device
   // must also be registered via AddDevice().
-  void set_client_device(Device* device) { client_device_ = device; }
+  void set_client_device(Device* device) {
+    DCHECK(client_device_ == nullptr);
+    client_device_ = device;
+  }
 
   // Returns a pointer to the device designated as the "client".
   Device* client_device() const { return client_device_; }
diff --git a/tensorflow/core/common_runtime/device_set_test.cc b/tensorflow/core/common_runtime/device_set_test.cc
index ff20ee94a7de317bbc04470de3d2f2adbc8747ac..0507076c8c3734083ac0ef7ffea0edebf180ad1a 100644
--- a/tensorflow/core/common_runtime/device_set_test.cc
+++ b/tensorflow/core/common_runtime/device_set_test.cc
@@ -27,8 +27,7 @@ namespace {
 static Device* Dev(const char* type, const char* name) {
   class FakeDevice : public Device {
    public:
-    explicit FakeDevice(const DeviceAttributes& attr)
-        : Device(nullptr, attr, nullptr) {}
+    explicit FakeDevice(const DeviceAttributes& attr) : Device(nullptr, attr) {}
     Status Sync() override { return Status::OK(); }
     Allocator* GetAllocator(AllocatorAttributes) override { return nullptr; }
   };
diff --git a/tensorflow/core/common_runtime/direct_session.cc b/tensorflow/core/common_runtime/direct_session.cc
index 002e246b80de41ed06c6f0a7584553920ed0293e..f1c17d778863563b49c433398a0c037db04ea4bd 100644
--- a/tensorflow/core/common_runtime/direct_session.cc
+++ b/tensorflow/core/common_runtime/direct_session.cc
@@ -370,6 +370,31 @@ Status DirectSession::Run(const NamedTensorList& inputs,
              &run_metadata);
 }
 
+Status DirectSession::CreateDebuggerState(
+    const DebugOptions& debug_options, int64 session_run_count,
+    int64 executor_step_count, const std::vector<string>& input_names,
+    const std::vector<string>& output_names,
+    const std::vector<string>& target_names,
+    std::unique_ptr<DebuggerStateInterface>* debugger_state) {
+  TF_RETURN_IF_ERROR(
+      DebuggerStateRegistry::CreateState(debug_options, debugger_state));
+  TF_RETURN_IF_ERROR(debugger_state->get()->PublishDebugMetadata(
+      debug_options.global_step(), session_run_count, executor_step_count,
+      input_names, output_names, target_names));
+  return Status::OK();
+}
+
+Status DirectSession::DecorateAndPublishGraphForDebug(
+    const DebugOptions& debug_options, Graph* graph, Device* device) {
+  std::unique_ptr<DebugGraphDecoratorInterface> decorator;
+  TF_RETURN_IF_ERROR(
+      DebugGraphDecoratorRegistry::CreateDecorator(debug_options, &decorator));
+
+  TF_RETURN_IF_ERROR(decorator->DecorateGraph(graph, device));
+  TF_RETURN_IF_ERROR(decorator->PublishGraph(*graph));
+  return Status::OK();
+}
+
 Status DirectSession::Run(const RunOptions& run_options,
                           const NamedTensorList& inputs,
                           const std::vector<string>& output_names,
@@ -402,27 +427,21 @@ Status DirectSession::Run(const RunOptions& run_options,
 
   // Check if we already have an executor for these arguments.
   ExecutorsAndKeys* executors_and_keys;
-  RunStateArgs run_state_args;
+  RunStateArgs run_state_args(run_options.debug_options());
 
   Executor::Args args;
   args.step_id = step_id_counter_.fetch_add(1);
 
-  // EXPERIMENTAL: Options that allow the client to insert nodes into partition
-  // graphs for debugging.
-  if (!run_options.debug_options().debug_tensor_watch_opts().empty()) {
-    run_state_args.debugger_state =
-        DebuggerStateRegistry::CreateState(run_options.debug_options());
-  }
-
   TF_RETURN_IF_ERROR(
       GetOrCreateExecutors(pool, input_tensor_names, output_names, target_nodes,
                            &executors_and_keys, &run_state_args));
   const int64 executor_step_count = executors_and_keys->step_count.fetch_add(1);
 
-  if (run_state_args.debugger_state) {
-    TF_RETURN_IF_ERROR(run_state_args.debugger_state->PublishDebugMetadata(
-        run_options.debug_options().global_step(), args.step_id,
-        executor_step_count, input_tensor_names, output_names, target_nodes));
+  std::unique_ptr<DebuggerStateInterface> debugger_state;
+  if (!run_options.debug_options().debug_tensor_watch_opts().empty()) {
+    TF_RETURN_IF_ERROR(CreateDebuggerState(
+        run_options.debug_options(), args.step_id, executor_step_count,
+        input_tensor_names, output_names, target_nodes, &debugger_state));
   }
 
   // Configure a call frame for the step, which we use to feed and
@@ -629,7 +648,9 @@ Status DirectSession::PRunSetup(const std::vector<string>& input_names,
 
   // Check if we already have an executor for these arguments.
   ExecutorsAndKeys* executors_and_keys;
-  RunStateArgs run_state_args;
+  // TODO(cais): TFDBG support for partial runs.
+  DebugOptions debug_options;
+  RunStateArgs run_state_args(debug_options);
   run_state_args.is_partial_run = true;
   TF_RETURN_IF_ERROR(GetOrCreateExecutors(pool, input_names, output_names,
                                           target_nodes, &executors_and_keys,
@@ -720,16 +741,21 @@ Status DirectSession::PRun(const string& handle, const NamedTensorList& inputs,
       if (it == run_state->pending_inputs.end()) {
         return errors::InvalidArgument(
             "The feed ", input.first,
-            " has already been fed or was not specified in partial_run_setup.");
+            " was not specified in partial_run_setup.");
+      } else if (it->second) {
+        return errors::InvalidArgument("The feed ", input.first,
+                                       " has already been fed.");
       }
     }
     // Check that this is a new set of fetches that are still pending.
     for (const auto& output : output_names) {
       auto it = run_state->pending_outputs.find(output);
       if (it == run_state->pending_outputs.end()) {
+        return errors::InvalidArgument(
+            "The fetch ", output, " was not specified in partial_run_setup.");
+      } else if (it->second) {
         return errors::InvalidArgument("The fetch ", output,
-                                       " has already been fetched or was not "
-                                       "specified in partial_run_setup.");
+                                       " has already been fetched.");
       }
     }
   }
@@ -764,14 +790,15 @@ Status DirectSession::PRun(const string& handle, const NamedTensorList& inputs,
                        << run_state->status;
         }
       }
-      for (const auto& it : inputs) {
-        run_state->pending_inputs.erase(it.first);
+      for (const auto& input : inputs) {
+        auto it = run_state->pending_inputs.find(input.first);
+        it->second = true;
       }
       for (const auto& name : output_names) {
-        run_state->pending_outputs.erase(name);
+        auto it = run_state->pending_outputs.find(name);
+        it->second = true;
       }
-      done = (run_state->pending_inputs.size() == 0 &&
-              run_state->pending_outputs.size() == 0);
+      done = run_state->PendingDone();
     }
     if (done) {
       WaitForNotification(run_state, cancellation_manager_,
@@ -900,11 +927,13 @@ Status DirectSession::CheckFetch(const NamedTensorList& feeds,
   std::unordered_set<TensorId, TensorId::Hasher> pending_feeds;
   {
     mutex_lock l(executor_lock_);
-    for (const string& feed : run_state->pending_inputs) {
-      TensorId id(ParseTensorName(feed));
+    for (const auto& input : run_state->pending_inputs) {
+      // Skip if the feed has already been fed.
+      if (input.second) continue;
+      TensorId id(ParseTensorName(input.first));
       auto it = name_to_node->find(id.first);
       if (it == name_to_node->end()) {
-        return errors::NotFound("Feed ", feed, ": not found");
+        return errors::NotFound("Feed ", input.first, ": not found");
       }
       pending_feeds.insert(id);
     }
@@ -952,14 +981,15 @@ Status DirectSession::GetOrCreateExecutors(
     thread::ThreadPool* pool, gtl::ArraySlice<string> inputs,
     gtl::ArraySlice<string> outputs, gtl::ArraySlice<string> target_nodes,
     ExecutorsAndKeys** executors_and_keys, RunStateArgs* run_state_args) {
-  string debug_tensor_watches_summary;
   int64 handle_name_counter_value = -1;
   if (LogMemory::IsEnabled() || run_state_args->is_partial_run) {
     handle_name_counter_value = handle_name_counter_.fetch_add(1);
   }
-  if (run_state_args->debugger_state) {
-    debug_tensor_watches_summary =
-        run_state_args->debugger_state->SummarizeDebugTensorWatches();
+
+  string debug_tensor_watches_summary;
+  if (!run_state_args->debug_options.debug_tensor_watch_opts().empty()) {
+    debug_tensor_watches_summary = SummarizeDebugTensorWatches(
+        run_state_args->debug_options.debug_tensor_watch_opts());
   }
 
   // Fast lookup path, no sorting.
@@ -1024,6 +1054,9 @@ Status DirectSession::GetOrCreateExecutors(
   options.fetch_endpoints = outputs_sorted;
   options.target_nodes = tn_sorted;
   options.use_function_convention = !run_state_args->is_partial_run;
+  if (!run_state_args->debug_options.debug_tensor_watch_opts().empty()) {
+    options.debug_options = run_state_args->debug_options;
+  }
 
   std::shared_ptr<ExecutorsAndKeys> ek(new ExecutorsAndKeys);
 
@@ -1099,10 +1132,10 @@ Status DirectSession::GetOrCreateExecutors(
 
     optimizer.Optimize(lib, options_.env, device, &iter->second);
 
-    // EXPERIMENTAL: tfdbg inserts debug nodes (i.e., probes) to the graph
-    if (run_state_args->debugger_state) {
-      TF_RETURN_IF_ERROR(run_state_args->debugger_state->DecorateGraphForDebug(
-          partition_graph.get(), params.device));
+    // EXPERIMENTAL: tfdbg inserts debug nodes in the graph.
+    if (!options.debug_options.debug_tensor_watch_opts().empty()) {
+      TF_RETURN_IF_ERROR(DecorateAndPublishGraphForDebug(
+          options.debug_options, partition_graph.get(), params.device));
     }
 
     TF_RETURN_IF_ERROR(EnsureMemoryTypes(DeviceType(device->device_type()),
@@ -1351,10 +1384,10 @@ DirectSession::RunState::RunState(
       }) {
   // Initially all the feeds and fetches are pending.
   for (auto& name : pending_input_names) {
-    pending_inputs.emplace(name);
+    pending_inputs[name] = false;
   }
   for (auto& name : pending_output_names) {
-    pending_outputs.emplace(name);
+    pending_outputs[name] = false;
   }
 }
 
@@ -1372,6 +1405,16 @@ DirectSession::RunState::~RunState() {
   }
 }
 
+bool DirectSession::RunState::PendingDone() const {
+  for (const auto& it : pending_inputs) {
+    if (!it.second) return false;
+  }
+  for (const auto& it : pending_outputs) {
+    if (!it.second) return false;
+  }
+  return true;
+}
+
 void DirectSession::WaitForNotification(RunState* run_state,
                                         CancellationManager* cm,
                                         int64 timeout_in_ms) {
diff --git a/tensorflow/core/common_runtime/direct_session.h b/tensorflow/core/common_runtime/direct_session.h
index 848ef3bc62d1e328ccd30599c7e029b20f516b21..cc298b3e57dbd662046c7e63de9b9a9d9ae1dcf7 100644
--- a/tensorflow/core/common_runtime/direct_session.h
+++ b/tensorflow/core/common_runtime/direct_session.h
@@ -151,8 +151,8 @@ class DirectSession : public Session {
     IntraProcessRendezvous* rendez = nullptr;
     std::unique_ptr<StepStatsCollector> collector;
     Notification executors_done;
-    std::unordered_set<string> pending_inputs;
-    std::unordered_set<string> pending_outputs;
+    std::unordered_map<string, bool> pending_inputs;   // true if fed
+    std::unordered_map<string, bool> pending_outputs;  // true if fetched
     TensorStore tensor_store;
     ScopedStepContainer step_container;
 
@@ -162,14 +162,19 @@ class DirectSession : public Session {
              const std::vector<string>& pending_output_names, int64 step_id,
              const std::vector<Device*>* devices);
 
+    // Returns true if all pending inputs and outputs have been completed.
+    bool PendingDone() const;
+
     ~RunState();
   };
 
   struct RunStateArgs {
+    RunStateArgs(const DebugOptions& options) : debug_options(options) {}
+
     bool is_partial_run = false;
     string handle;
     std::unique_ptr<Graph> graph;
-    std::unique_ptr<DebuggerStateInterface> debugger_state;
+    const DebugOptions& debug_options;
   };
 
   // Initializes the base execution state given the 'graph',
@@ -236,6 +241,16 @@ class DirectSession : public Session {
     return ::tensorflow::Status::OK();
   }
 
+  ::tensorflow::Status CreateDebuggerState(
+      const DebugOptions& debug_options, int64 session_run_count,
+      int64 executor_step_count, const std::vector<string>& input_names,
+      const std::vector<string>& output_names,
+      const std::vector<string>& target_names,
+      std::unique_ptr<DebuggerStateInterface>* debugger_state);
+
+  ::tensorflow::Status DecorateAndPublishGraphForDebug(
+      const DebugOptions& debug_options, Graph* graph, Device* device);
+
   const SessionOptions options_;
 
   // Device structures.
diff --git a/tensorflow/core/common_runtime/executor.cc b/tensorflow/core/common_runtime/executor.cc
index ed5b87f2f2203081178cf749575dc18382edc37b..9e18547af5cf9d8107585d33a391ec21decd9adf 100644
--- a/tensorflow/core/common_runtime/executor.cc
+++ b/tensorflow/core/common_runtime/executor.cc
@@ -92,31 +92,28 @@ bool SetTimelineLabel(const Node* node, NodeExecStats* node_stats) {
       }
     }
   }
-  const NodeDef& def = node->def();
-  string text = "";
+  const AttrSlice attrs = node->attrs();
+  string text;
   if (IsSend(node)) {
     string tensor_name;
-    TF_CHECK_OK(GetNodeAttr(def, "tensor_name", &tensor_name));
+    TF_CHECK_OK(GetNodeAttr(attrs, "tensor_name", &tensor_name));
     string recv_device;
-    TF_CHECK_OK(GetNodeAttr(def, "recv_device", &recv_device));
-    text = strings::StrCat(memory, def.name(), " = ", def.op(), "(",
-                           tensor_name, " @", recv_device);
+    TF_CHECK_OK(GetNodeAttr(attrs, "recv_device", &recv_device));
+    text = strings::StrCat(memory, node->name(), " = ", node->type_string(),
+                           "(", tensor_name, " @", recv_device);
     is_transfer_node = true;
   } else if (IsRecv(node)) {
     string tensor_name;
-    TF_CHECK_OK(GetNodeAttr(def, "tensor_name", &tensor_name));
+    TF_CHECK_OK(GetNodeAttr(attrs, "tensor_name", &tensor_name));
     string send_device;
-    TF_CHECK_OK(GetNodeAttr(def, "send_device", &send_device));
-    text = strings::StrCat(memory, def.name(), " = ", def.op(), "(",
-                           tensor_name, " @", send_device);
+    TF_CHECK_OK(GetNodeAttr(attrs, "send_device", &send_device));
+    text = strings::StrCat(memory, node->name(), " = ", node->type_string(),
+                           "(", tensor_name, " @", send_device);
     is_transfer_node = true;
   } else {
-    text = strings::StrCat(
-        memory, def.name(), " = ", def.op(), "(",
-        str_util::Join(
-            std::vector<StringPiece>(def.input().begin(), def.input().end()),
-            ", "),
-        ")");
+    text =
+        strings::StrCat(memory, node->name(), " = ", node->type_string(), "(",
+                        str_util::Join(node->requested_inputs(), ", "), ")");
   }
   node_stats->set_timeline_label(text);
   return is_transfer_node;
@@ -522,7 +519,7 @@ char* GraphView::InitializeNode(char* ptr, const Node* n) {
   EdgeInfo* dst_edge = item->output_edge_base();
   for (auto e : n->out_edges()) {
     dst_edge->dst_id = e->dst()->id();
-    CHECK_LE(e->src_output(), ((int32)0x3FFFFFFF));  // Must fit in 31 bits
+    CHECK_LE(e->src_output(), 0x3FFFFFFF);  // Must fit in 31 bits
     dst_edge->output_slot = e->src_output();
     dst_edge->is_last = false;
     const int output_slot = dst_edge->output_slot;
@@ -640,7 +637,7 @@ Status ExecutorImpl::Initialize() {
     Status s = params_.create_kernel(n->def(), &item->kernel);
     if (!s.ok()) {
       item->kernel = nullptr;
-      s = AttachDef(s, n->def());
+      s = AttachDef(s, *n);
       LOG(ERROR) << "Executor failed to create kernel. " << s;
       return s;
     }
@@ -668,7 +665,7 @@ Status ExecutorImpl::Initialize() {
     frame_info->nodes->push_back(n);
     if (IsEnter(n)) {
       string enter_name;
-      TF_RETURN_IF_ERROR(GetNodeAttr(n->def(), "frame_name", &enter_name));
+      TF_RETURN_IF_ERROR(GetNodeAttr(n->attrs(), "frame_name", &enter_name));
       EnsureFrameInfo(enter_name)->input_count++;
     }
   }
@@ -723,7 +720,7 @@ Status InferAllocAttr(const Node* n, const Node* dst,
   // so these two cases are not mutually exclusive.
   if (IsRecv(n)) {
     string src_name;
-    s = GetNodeAttr(n->def(), "send_device", &src_name);
+    s = GetNodeAttr(n->attrs(), "send_device", &src_name);
     if (!s.ok()) return s;
     DeviceNameUtils::ParsedName parsed_src_name;
     if (!DeviceNameUtils::ParseFullName(src_name, &parsed_src_name)) {
@@ -748,7 +745,7 @@ Status InferAllocAttr(const Node* n, const Node* dst,
   }
   if (IsSend(dst)) {
     string dst_name;
-    s = GetNodeAttr(dst->def(), "recv_device", &dst_name);
+    s = GetNodeAttr(dst->attrs(), "recv_device", &dst_name);
     if (!s.ok()) return s;
     DeviceNameUtils::ParsedName parsed_dst_name;
     if (!DeviceNameUtils::ParseFullName(dst_name, &parsed_dst_name)) {
@@ -1213,7 +1210,8 @@ class ExecutorState {
       GUARDED_BY(mu_);
 
   // The unique name of a frame.
-  inline string MakeFrameName(FrameState* frame, int64 iter_id, string name) {
+  inline string MakeFrameName(FrameState* frame, int64 iter_id,
+                              const string& name) {
     return strings::StrCat(frame->frame_name, ";", iter_id, ";", name);
   }
 
@@ -1360,7 +1358,7 @@ Status ExecutorImpl::BuildControlFlowInfo(const Graph* g,
     if (IsEnter(curr_node)) {
       // Enter a child frame.
       TF_RETURN_IF_ERROR(
-          GetNodeAttr(curr_node->def(), "frame_name", &frame_name));
+          GetNodeAttr(curr_node->attrs(), "frame_name", &frame_name));
       parent = curr_node;
     } else if (IsExit(curr_node)) {
       // Exit to the parent frame.
@@ -1554,8 +1552,7 @@ void ExecutorState::Process(TaggedNode tagged_node, int64 scheduled_usec) {
 
     if (vlog_) {
       VLOG(1) << "Process node: " << id << " step " << params.step_id << " "
-              << SummarizeNodeDef(node->def())
-              << " is dead: " << tagged_node.is_dead;
+              << SummarizeNode(*node) << " is dead: " << tagged_node.is_dead;
     }
 
     Entry* input_tensors = GetInputTensors(input_frame, input_iter);
@@ -1609,7 +1606,7 @@ void ExecutorState::Process(TaggedNode tagged_node, int64 scheduled_usec) {
 
           if (vlog_) {
             VLOG(2) << this << " Async kernel done: "
-                    << SummarizeNodeDef(state->item->node->def());
+                    << SummarizeNode(*state->item->node);
           }
           if (stats) nodestats::SetOpEnd(stats);
           EntryVector outputs;
@@ -1810,7 +1807,7 @@ Status ExecutorState::ProcessOutputs(const NodeItem& item, OpKernelContext* ctx,
       // tensor value at i-th output.
       if (!IsSwitch(node) && !IsRecv(node)) {
         s.Update(errors::Internal("Missing ", i, "-th output from ",
-                                  SummarizeNodeDef(node->def())));
+                                  SummarizeNode(*node)));
       }
     } else {
       Entry* out = &((*outputs)[i]);
@@ -1877,7 +1874,7 @@ Status ExecutorState::ProcessOutputs(const NodeItem& item, OpKernelContext* ctx,
                                   DataTypeString(dtype),
                                   " does not match declared output type ",
                                   DataTypeString(item.output_type(i)),
-                                  " for node ", SummarizeNodeDef(node->def())));
+                                  " for node ", SummarizeNode(*node)));
       }
     }
     if (!val.is_ref()) {
@@ -1914,7 +1911,7 @@ void ExecutorState::PropagateOutputs(const TaggedNode& tagged_node,
         &impl_->gview_, input_iter, ready);
   } else if (item->is_enter) {
     bool is_constant;
-    Status s = GetNodeAttr(node->def(), "is_constant", &is_constant);
+    Status s = GetNodeAttr(node->attrs(), "is_constant", &is_constant);
     DCHECK(s.ok()) << s;
     FindOrCreateChildFrame(input_frame, input_iter, node, &output_frame);
     output_iter = 0;
@@ -2240,7 +2237,7 @@ void ExecutorState::FindOrCreateChildFrame(FrameState* frame, int64 iter,
                                            FrameState** child) {
   // Get the child frame name.
   string enter_name;
-  Status s = GetNodeAttr(node->def(), "frame_name", &enter_name);
+  Status s = GetNodeAttr(node->attrs(), "frame_name", &enter_name);
   DCHECK(s.ok()) << s;
   const string child_name = MakeFrameName(frame, iter, enter_name);
 
@@ -2258,7 +2255,7 @@ void ExecutorState::FindOrCreateChildFrame(FrameState* frame, int64 iter,
   if (vlog_) VLOG(2) << "Create frame: " << child_name;
 
   int parallel_iters;
-  s = GetNodeAttr(node->def(), "parallel_iterations", &parallel_iters);
+  s = GetNodeAttr(node->attrs(), "parallel_iterations", &parallel_iters);
   DCHECK(s.ok()) << s;
   FrameState* temp = new FrameState(impl_, parallel_iters);
   temp->frame_name = child_name;
diff --git a/tensorflow/core/common_runtime/function.cc b/tensorflow/core/common_runtime/function.cc
index 0f2e24690f3aba56c474bbf8db3e3fce97b51844..996a8a9b3d4c0e035fef1a64bfec074c4bc2eb29 100644
--- a/tensorflow/core/common_runtime/function.cc
+++ b/tensorflow/core/common_runtime/function.cc
@@ -150,8 +150,7 @@ class FunctionLibraryRuntimeImpl : public FunctionLibraryRuntime {
 
   ~FunctionLibraryRuntimeImpl() override;
 
-  Status Instantiate(const string& function_name,
-                     const InstantiateAttrValueMap& attrs,
+  Status Instantiate(const string& function_name, AttrSlice attrs,
                      Handle* handle) override;
 
   const FunctionBody* GetFunctionBody(Handle handle) override;
@@ -208,8 +207,7 @@ class FunctionLibraryRuntimeImpl : public FunctionLibraryRuntime {
   };
   std::vector<Item*> items_;
 
-  Status FunctionDefToBody(const FunctionDef& fdef,
-                           const InstantiateAttrValueMap& attrs,
+  Status FunctionDefToBody(const FunctionDef& fdef, AttrSlice attrs,
                            FunctionBody** fbody);
   Status CreateItem(Handle handle, Item** item);
   Status GetOrCreateItem(Handle handle, Item** item);
@@ -324,7 +322,7 @@ Status FunctionLibraryRuntimeImpl::CreateKernel(const NodeDef& ndef,
   // Try to instantiate this function for the func/attr. Maybe its
   // cached already.
   Handle handle;
-  TF_RETURN_IF_ERROR(Instantiate(ndef.op(), ndef.attr(), &handle));
+  TF_RETURN_IF_ERROR(Instantiate(ndef.op(), AttrSlice(&ndef.attr()), &handle));
 
   const FunctionBody* fbody = GetFunctionBody(handle);
   CHECK_NOTNULL(fbody);
@@ -355,9 +353,9 @@ Status FunctionLibraryRuntimeImpl::CreateKernel(const NodeDef& ndef,
   return s;
 }
 
-Status FunctionLibraryRuntimeImpl::FunctionDefToBody(
-    const FunctionDef& fdef, const InstantiateAttrValueMap& attrs,
-    FunctionBody** fbody) {
+Status FunctionLibraryRuntimeImpl::FunctionDefToBody(const FunctionDef& fdef,
+                                                     AttrSlice attrs,
+                                                     FunctionBody** fbody) {
   // Instantiates the function template into a graph def.
   InstantiationResult result;
   TF_RETURN_IF_ERROR(InstantiateFunction(fdef, attrs, get_func_sig_, &result));
@@ -390,11 +388,13 @@ Status FunctionLibraryRuntimeImpl::InstantiateSymbolicGradient(
     // TODO(josh11b): Should filter out the attrs from func that aren't used
     // by the gradient function.
     TF_RETURN_IF_ERROR(creator(AttrSlice(&func.attr()), &grad_fdef));
-    TF_RETURN_IF_ERROR(FunctionDefToBody(grad_fdef, func.attr(), g_body));
+    TF_RETURN_IF_ERROR(
+        FunctionDefToBody(grad_fdef, AttrSlice(&func.attr()), g_body));
   } else {
     // f is a user-defined function.
     Handle f_handle;
-    TF_RETURN_IF_ERROR(Instantiate(func.name(), func.attr(), &f_handle));
+    TF_RETURN_IF_ERROR(
+        Instantiate(func.name(), AttrSlice(&func.attr()), &f_handle));
     const FunctionBody* f_body = GetFunctionBody(f_handle);
     CHECK_NOTNULL(f_body);
     *g_body = SymbolicGradient(*f_body);
@@ -402,9 +402,9 @@ Status FunctionLibraryRuntimeImpl::InstantiateSymbolicGradient(
   return Status::OK();
 }
 
-Status FunctionLibraryRuntimeImpl::Instantiate(
-    const string& function_name, const InstantiateAttrValueMap& attrs,
-    Handle* handle) {
+Status FunctionLibraryRuntimeImpl::Instantiate(const string& function_name,
+                                               AttrSlice attrs,
+                                               Handle* handle) {
   const string key = Canonicalize(function_name, attrs);
   {
     mutex_lock l(mu_);
@@ -417,7 +417,7 @@ Status FunctionLibraryRuntimeImpl::Instantiate(
   Status s;
   FunctionBody* fbody = nullptr;
   if (function_name == kGradientOp) {
-    const AttrValue* f = gtl::FindOrNull(attrs, kFuncAttr);
+    const AttrValue* f = attrs.Find(kFuncAttr);
     if (f == nullptr) {
       return errors::InvalidArgument("SymbolicGradient is missing attr: f");
     }
@@ -427,7 +427,7 @@ Status FunctionLibraryRuntimeImpl::Instantiate(
     }
     const string grad = lib_def_->FindGradient(func.name());
     if (!grad.empty()) {
-      return Instantiate(grad, func.attr(), handle);
+      return Instantiate(grad, AttrSlice(&func.attr()), handle);
     }
     TF_RETURN_IF_ERROR(InstantiateSymbolicGradient(func, &fbody));
   } else {
@@ -456,7 +456,7 @@ Status FunctionLibraryRuntimeImpl::Instantiate(
 void DumpGraph(StringPiece label, const Graph* g) {
   // TODO(zhifengc): Change Graph to record #nodes.
   VLOG(1) << "Graph " << label << " #nodes " << g->num_nodes() << " #edges "
-          << g->edges().size();
+          << g->num_edges();
   if (VLOG_IS_ON(2)) {
     for (const auto& line : str_util::Split(DebugString(g), '\n')) {
       VLOG(2) << "|| " << line;
@@ -829,7 +829,8 @@ static bool ValidateInlining(const Node* node, const FunctionBody* fbody) {
 // Given a "caller" in "graph", which is a function call of a function
 // to "fbody". Replaces the "caller" with fbody->graph and connects
 // edges properly.
-static void InlineFunctionBody(Graph* g, Node* caller,
+static void InlineFunctionBody(const FunctionLibraryDefinition& flib_def,
+                               Graph* g, Node* caller,
                                const FunctionBody* fbody) {
   if (!ValidateInlining(caller, fbody)) {
     LOG(WARNING) << "Inlining mismatch: " << caller->DebugString() << " vs. "
@@ -837,6 +838,23 @@ static void InlineFunctionBody(Graph* g, Node* caller,
     return;
   }
 
+  // Input edges. For data edges coming into "caller", we first compute the
+  // <src>:<src_output> for the i-th input in "inputs".
+  // If "caller" has any input control dependencies, we add a NoOp
+  // node "input_control_node", which depends on "caller"'s control inputs.
+  std::vector<Endpoint> inputs(caller->num_inputs());
+  Node* input_control_node = nullptr;
+  for (const Edge* e : caller->in_edges()) {
+    if (e->IsControlEdge()) {
+      if (input_control_node == nullptr) {
+        input_control_node = AddNoOp(g);
+      }
+      g->AddControlEdge(e->src(), input_control_node);
+    } else {
+      inputs[e->dst_input()] = {e->src(), e->src_output()};
+    }
+  }
+
   // Duplicate fbody->graph into 'g'.  First, we copy the nodes of
   // fbody->graph into 'g' except the source and sink nodes.  We copy
   // edges among nodes in 'fbody->graph'.
@@ -850,8 +868,35 @@ static void InlineFunctionBody(Graph* g, Node* caller,
     CHECK(n->IsOp());
     NodeDef ndef = n->def();
     ndef.set_name(strings::StrCat(caller->name(), "/", ndef.name()));
-    node_map[n->id()] = g->AddNode(ndef, &s);
+    Node* clone = g->AddNode(ndef, &s);
     TF_CHECK_OK(s);
+    node_map[n->id()] = clone;
+
+    // If there is an input control node, and one of:
+    // a) the node has no data or control inputs, or
+    // b) the node is a function call or SymbolicGradient,
+    // then add a control edge from the input control node to the clone.
+    //
+    // We must not execute any nodes if the original function call would not
+    // have executed. This is especially critical when the function call is
+    // inside a control-flow construct like tf.cond(). Case (a) ensures that
+    // such nodes do not run.
+    //
+    // The purpose of case (b) is to ensure that instances of case (a) created
+    // by further inlining steps also receive the control dependency.
+    if (input_control_node) {
+      bool has_inputs = false;
+      for (const Edge* e : n->in_edges()) {
+        if (!e->src()->IsSource()) {
+          has_inputs = true;
+          break;
+        }
+      }
+      if (!has_inputs || flib_def.Find(clone->type_string()) != nullptr ||
+          clone->type_string() == "SymbolicGradient") {
+        g->AddControlEdge(input_control_node, clone);
+      }
+    }
   }
   for (const Edge* e : fbody->graph->edges()) {
     if (e->src()->IsSource() || e->src()->IsSink() || e->dst()->IsSource() ||
@@ -865,29 +910,12 @@ static void InlineFunctionBody(Graph* g, Node* caller,
 
   // Connect input edges.
   //
-  // For data edges coming into "caller", we first compute the
-  // <src>:<src_output> for the i-th input in "inputs". We create one
-  // Identity node for each input. Then, we connect inputs[i] to to
-  // the i-th identity node added. The nodes that previously connects
-  // to the j-th output of i-th arg node are reconnected to th i-th
+  // We create one Identity node for each input. Then, we connect inputs[i] to
+  // the i-th identity node added. The nodes that previously connected
+  // to the j-th output of i-th arg node are reconnected to the i-th
   // identity node.
   //
-  // If "caller" has any input control dependencies, we add a NoOp
-  // node "input_control_node". This "input_control_node" depends on
-  // what "caller" depends on, and the added identity nodes depend on
-  // "input_control_node".
-  std::vector<Endpoint> inputs(caller->num_inputs());
-  Node* input_control_node = nullptr;
-  for (const Edge* e : caller->in_edges()) {
-    if (e->IsControlEdge()) {
-      if (input_control_node == nullptr) {
-        input_control_node = AddNoOp(g);
-      }
-      g->AddControlEdge(e->src(), input_control_node);
-    } else {
-      inputs[e->dst_input()] = {e->src(), e->src_output()};
-    }
-  }
+  // The added identity nodes depend on "input_control_node".
   for (std::size_t i = 0; i < fbody->arg_nodes.size(); ++i) {
     Node* arg = node_map[fbody->arg_nodes[i]->id()];
     Node* n = AddIdentity(g, inputs[i]);
@@ -961,13 +989,12 @@ bool ExpandInlineFunctions(FunctionLibraryRuntime* lib, Graph* graph) {
   for (Node* node : graph->nodes()) {
     VLOG(3) << "Expanding " << node->DebugString();
     bool noinline;
-    if (fld->GetAttr(node->def(), kNoInlineAttr, &noinline).ok() && noinline) {
+    if (fld->GetAttr(*node, kNoInlineAttr, &noinline).ok() && noinline) {
       VLOG(3) << "noinline: " << node->DebugString();
       continue;
     }
     FunctionLibraryRuntime::Handle handle;
-    Status s =
-        lib->Instantiate(node->type_string(), node->def().attr(), &handle);
+    Status s = lib->Instantiate(node->type_string(), node->attrs(), &handle);
     if (!s.ok()) {
       // Either "node" is a primitive op, or the instantiation failed.
       if (errors::IsNotFound(s)) {
@@ -982,7 +1009,7 @@ bool ExpandInlineFunctions(FunctionLibraryRuntime* lib, Graph* graph) {
     candidates.push_back({node, fbody});
   }
   for (const auto& p : candidates) {
-    InlineFunctionBody(graph, p.first, p.second);
+    InlineFunctionBody(*fld, graph, p.first, p.second);
   }
   return !candidates.empty();
 }
@@ -1001,25 +1028,19 @@ string NewName(const Node* n, bool pretty) {
 void ToGraphDef(const Graph* g, GraphDef* gdef, bool pretty) {
   // We visit nodes in forward topological sort order, which is a
   // possible execution order of the graph.
-  std::vector<size_t> pending(g->num_node_ids());
-  std::deque<const Node*> ready;
-  for (const Node* n : g->nodes()) {
-    pending[n->id()] = n->in_edges().size();
-    if (pending[n->id()] == 0) ready.push_back(n);
-  }
   gtl::InlinedVector<const Edge*, 4> inputs;
   gdef->Clear();
   gdef->mutable_versions()->CopyFrom(g->versions());
-  while (!ready.empty()) {
-    const Node* n = ready.front();
-    ready.pop_front();
-    for (const Edge* e : n->out_edges()) {
-      const Node* next = e->dst();
-      if (--pending[next->id()] == 0) {
-        ready.push_back(next);
-      }
+
+  std::vector<Node*> start_nodes;
+  for (Node* n : g->nodes()) {
+    if (n->out_edges().empty()) {
+      start_nodes.push_back(n);
     }
-    if (!n->IsOp()) continue;
+  }
+
+  ReverseDFSFrom(*g, start_nodes, nullptr, [gdef, pretty, &inputs](Node* n) {
+    if (!n->IsOp()) return;
     NodeDef* ndef = gdef->add_node();
     ndef->set_name(NewName(n, pretty));
     ndef->set_op(n->type_string());
@@ -1054,7 +1075,7 @@ void ToGraphDef(const Graph* g, GraphDef* gdef, bool pretty) {
         ndef->add_input(strings::StrCat(srcname, ":", e->src_output()));
       }
     }
-  }
+  });
 }
 
 string DebugString(const Graph* g) {
@@ -1081,7 +1102,7 @@ FunctionBody::FunctionBody(const FunctionDef& f, DataTypeSlice arg_t,
       continue;
     }
     int index;
-    TF_CHECK_OK(GetNodeAttr(n->def(), "index", &index));
+    TF_CHECK_OK(GetNodeAttr(n->attrs(), "index", &index));
     CHECK_LE(0, index);
     CHECK_LT(index, node_vec->size());
     (*node_vec)[index] = n;
diff --git a/tensorflow/core/common_runtime/function_test.cc b/tensorflow/core/common_runtime/function_test.cc
index bbf35590eb6f082e1246ff9eed9502846669766a..e27fc3898dc9c16482ea6c45edf7c06090bf79f2 100644
--- a/tensorflow/core/common_runtime/function_test.cc
+++ b/tensorflow/core/common_runtime/function_test.cc
@@ -17,6 +17,10 @@ limitations under the License.
 
 #include <atomic>
 
+#include "tensorflow/cc/ops/array_ops_internal.h"
+#include "tensorflow/cc/ops/function_ops.h"
+#include "tensorflow/cc/ops/functional_ops.h"
+#include "tensorflow/cc/ops/standard_ops.h"
 #include "tensorflow/core/common_runtime/device.h"
 #include "tensorflow/core/common_runtime/device_factory.h"
 #include "tensorflow/core/common_runtime/executor.h"
@@ -28,12 +32,15 @@ limitations under the License.
 #include "tensorflow/core/graph/graph_constructor.h"
 #include "tensorflow/core/lib/core/notification.h"
 #include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/lib/core/status_test_util.h"
 #include "tensorflow/core/lib/core/threadpool.h"
 #include "tensorflow/core/platform/test.h"
 #include "tensorflow/core/public/session_options.h"
 #include "tensorflow/core/public/version.h"
+#include "tensorflow/core/util/equal_graph_def.h"
 
 namespace tensorflow {
+namespace {
 
 typedef FunctionDefHelper FDH;
 
@@ -52,19 +59,30 @@ void HasError(const Status& s, const string& substr) {
       << s << ", expected substring " << substr;
 }
 
+// A helper class to make AttrSlice from initializer lists
+class Attrs {
+ public:
+  Attrs(const std::initializer_list<  // NOLINT(runtime/explicit)
+        std::pair<string, FunctionDefHelper::AttrValueWrapper>>& attrs) {
+    for (const auto& aval : attrs) {
+      map_.insert({aval.first, aval.second.proto});
+    }
+  }
+
+  operator AttrSlice() { return AttrSlice(&map_); }  // NOLINT(runtime/explicit)
+
+ private:
+  AttrValueMap map_;
+};
+
 class FunctionTest : public ::testing::Test {
  protected:
   FunctionTest()
       : device_(DeviceFactory::NewDevice("CPU", {},
                                          "/job:localhost/replica:0/task:0")) {}
 
-  ~FunctionTest() override {
-    delete exec_;
-    delete device_;
-  }
-
-  void Create(const FunctionDef& fdef, InstantiateAttrValueSlice attrs) {
-    delete exec_;
+  void Create(const FunctionDef& fdef, Attrs attrs) {
+    exec_ = nullptr;
     InstantiationResult result;
     TF_CHECK_OK(InstantiateFunction(fdef, attrs, GetOpSig, &result));
 
@@ -79,15 +97,18 @@ class FunctionTest : public ::testing::Test {
 
     const int version = g->versions().producer();
     LocalExecutorParams params;
-    params.device = device_;
+    params.device = device_.get();
     params.create_kernel = [this, version](const NodeDef& ndef,
                                            OpKernel** kernel) {
-      return CreateNonCachedKernel(device_, nullptr, ndef, version, kernel);
+      return CreateNonCachedKernel(device_.get(), nullptr, ndef, version,
+                                   kernel);
     };
     params.delete_kernel = [](OpKernel* kernel) {
       DeleteNonCachedKernel(kernel);
     };
-    TF_CHECK_OK(NewLocalExecutor(params, g, &exec_));
+    Executor* exec;
+    TF_CHECK_OK(NewLocalExecutor(params, g, &exec));
+    exec_.reset(exec);
   }
 
   void Run(const std::vector<Tensor>& args, std::vector<Tensor*> rets) {
@@ -105,8 +126,8 @@ class FunctionTest : public ::testing::Test {
     }
   }
 
-  Device* device_ = nullptr;
-  Executor* exec_ = nullptr;
+  std::unique_ptr<Device> device_;
+  std::unique_ptr<Executor> exec_;
   DataTypeVector arg_types_;
   DataTypeVector ret_types_;
 };
@@ -136,25 +157,19 @@ class FunctionLibraryRuntimeTest : public ::testing::Test {
       : device_(DeviceFactory::NewDevice("CPU", {},
                                          "/job:localhost/replica:0/task:0")) {}
 
-  ~FunctionLibraryRuntimeTest() override {
-    delete lib_;
-    delete lib_def_;
-    delete device_;
-  }
-
   void Init(const std::vector<FunctionDef>& flib) {
     FunctionDefLibrary proto;
     for (const auto& fdef : flib) *(proto.add_function()) = fdef;
-    delete lib_def_;
-    lib_def_ = new FunctionLibraryDefinition(OpRegistry::Global(), proto);
-    delete lib_;
+    lib_def_.reset(new FunctionLibraryDefinition(OpRegistry::Global(), proto));
     OptimizerOptions opts;
-    lib_ = NewFunctionLibraryRuntime(nullptr, Env::Default(), device_,
-                                     TF_GRAPH_DEF_VERSION, lib_def_, opts);
+    lib_.reset(NewFunctionLibraryRuntime(nullptr, Env::Default(), device_.get(),
+                                         TF_GRAPH_DEF_VERSION, lib_def_.get(),
+                                         opts));
+    fdef_lib_ = lib_def_->ToProto();
   }
 
-  Status Run(const string& name, InstantiateAttrValueSlice attrs,
-             const std::vector<Tensor>& args, std::vector<Tensor*> rets) {
+  Status Run(const string& name, Attrs attrs, const std::vector<Tensor>& args,
+             std::vector<Tensor*> rets) {
     FunctionLibraryRuntime::Handle handle;
     Status status = lib_->Instantiate(name, attrs, &handle);
     if (!status.ok()) {
@@ -190,7 +205,7 @@ class FunctionLibraryRuntimeTest : public ::testing::Test {
     return Status::OK();
   }
 
-  Graph* GetFuncBody(const string& name, InstantiateAttrValueSlice attrs) {
+  std::unique_ptr<Graph> GetFuncBody(const string& name, Attrs attrs) {
     FunctionLibraryRuntime::Handle handle;
     Status status = lib_->Instantiate(name, attrs, &handle);
     if (!status.ok()) {
@@ -199,12 +214,12 @@ class FunctionLibraryRuntimeTest : public ::testing::Test {
     }
     const FunctionBody* fbody = lib_->GetFunctionBody(handle);
     CHECK_NOTNULL(fbody);
-    Graph* ret = new Graph(lib_def_);
-    CopyGraph(*fbody->graph, ret);
+    std::unique_ptr<Graph> ret(new Graph(lib_def_.get()));
+    CopyGraph(*fbody->graph, ret.get());
     return ret;
   }
 
-  Graph* GetGradBody(const string& func, InstantiateAttrValueSlice attrs) {
+  std::unique_ptr<Graph> GetGradBody(const string& func, Attrs attrs) {
     FunctionLibraryRuntime::Handle handle;
     Status status = lib_->Instantiate(func, attrs, &handle);
     if (!status.ok()) {
@@ -213,17 +228,17 @@ class FunctionLibraryRuntimeTest : public ::testing::Test {
     }
     const FunctionBody* fbody = lib_->GetFunctionBody(handle);
     CHECK_NOTNULL(fbody);
-    FunctionBody* gbody = SymbolicGradient(*fbody);
+    std::unique_ptr<FunctionBody> gbody(SymbolicGradient(*fbody));
     CHECK_NOTNULL(gbody);
-    Graph* ret = new Graph(lib_def_);
-    CopyGraph(*gbody->graph, ret);
-    delete gbody;
+    std::unique_ptr<Graph> ret(new Graph(lib_def_.get()));
+    CopyGraph(*gbody->graph, ret.get());
     return ret;
   }
 
-  Device* device_ = nullptr;
-  FunctionLibraryDefinition* lib_def_ = nullptr;
-  FunctionLibraryRuntime* lib_ = nullptr;
+  std::unique_ptr<Device> device_;
+  std::unique_ptr<FunctionLibraryDefinition> lib_def_;
+  std::unique_ptr<FunctionLibraryRuntime> lib_;
+  FunctionDefLibrary fdef_lib_;
 };
 
 TEST_F(FunctionLibraryRuntimeTest, IsStateful) {
@@ -254,113 +269,258 @@ TEST_F(FunctionLibraryRuntimeTest, XTimesN) {
   test::ExpectTensorEqual<float>(y, test::AsTensor<float>({16, 32, 48, 64}));
 }
 
+// Adds a function call to 'scope.
+// TODO(phawkins): replace with C++ API for calling functions, when that exists.
+Output Call(Scope* scope, const string& op_name, const string& fn_name,
+            gtl::ArraySlice<Input> inputs) {
+  NodeDef def;
+  NodeDefBuilder builder(op_name, fn_name, scope->graph()->op_registry());
+  for (const Input& input : inputs) {
+    builder.Input(input.node()->name(), input.index(),
+                  input.node()->output_type(input.index()));
+  }
+  TF_CHECK_OK(builder.Finalize(&def));
+  Status status;
+  Node* n = scope->graph()->AddNode(def, &status);
+  TF_CHECK_OK(status);
+  for (int i = 0; i < inputs.size(); ++i) {
+    scope->graph()->AddEdge(inputs[i].node(), inputs[i].index(), n, i);
+  }
+  return Output(n);
+}
+
 TEST_F(FunctionLibraryRuntimeTest, ExpandInlineFunctions) {
   Init({test::function::XTimesTwo(), test::function::XTimesFour(),
         test::function::XTimes16()});
-  Graph* g = GetFuncBody("XTimes16", {{"T", DT_FLOAT}});
+  std::unique_ptr<Graph> g = GetFuncBody("XTimes16", {{"T", DT_FLOAT}});
   ASSERT_TRUE(g != nullptr);
-  const char* e0 = R"P(
-(n2:float) -> (n4:float) {
-  n3 = XTimesFour[T=float](n2)
-  n4 = XTimesFour[T=float](n3)
-}
-)P";
-  EXPECT_EQ(e0, DebugString(g));
-
-  ExpandInlineFunctions(lib_, g);
-  const char* e1 = R"P(
-(n2:float) -> (n17:float) {
-  n10 = Identity[T=float](n2)
-  n7 = XTimesTwo[T=float](n10)
-  n8 = XTimesTwo[T=float](n7)
-  n11 = Identity[T=float](n8)
-  n16 = Identity[T=float](n11)
-  n13 = XTimesTwo[T=float](n16)
-  n14 = XTimesTwo[T=float](n13)
-  n17 = Identity[T=float](n14)
-}
-)P";
-  EXPECT_EQ(e1, DebugString(g));
-
-  ExpandInlineFunctions(lib_, g);
-  const char* e2 = R"P(
-(n2:float) -> (n17:float) {
-  n18 = Const[dtype=int64, value=Tensor<type: int64 shape: [] values: 2>]()
-  n25 = Const[dtype=int64, value=Tensor<type: int64 shape: [] values: 2>]()
-  n32 = Const[dtype=int64, value=Tensor<type: int64 shape: [] values: 2>]()
-  n39 = Const[dtype=int64, value=Tensor<type: int64 shape: [] values: 2>]()
-  n19 = Cast[DstT=float, SrcT=int64](n18)
-  n26 = Cast[DstT=float, SrcT=int64](n25)
-  n33 = Cast[DstT=float, SrcT=int64](n32)
-  n40 = Cast[DstT=float, SrcT=int64](n39)
-  n10 = Identity[T=float](n2)
-  n23 = Identity[T=float](n10)
-  n21 = Mul[T=float](n23, n19)
-  n24 = Identity[T=float](n21)
-  n30 = Identity[T=float](n24)
-  n28 = Mul[T=float](n30, n26)
-  n31 = Identity[T=float](n28)
-  n11 = Identity[T=float](n31)
-  n16 = Identity[T=float](n11)
-  n37 = Identity[T=float](n16)
-  n35 = Mul[T=float](n37, n33)
-  n38 = Identity[T=float](n35)
-  n44 = Identity[T=float](n38)
-  n42 = Mul[T=float](n44, n40)
-  n45 = Identity[T=float](n42)
-  n17 = Identity[T=float](n45)
-}
-)P";
-  EXPECT_EQ(e2, DebugString(g));
+
+  {
+    Scope s = Scope::NewRootScope();
+    TF_ASSERT_OK(s.graph()->AddFunctionLibrary(fdef_lib_));
+    auto arg = ops::_Arg(s.WithOpName("x"), DT_FLOAT, 0);
+    auto a = Call(&s, "x4", "XTimesFour", {arg});
+    auto b = Call(&s, "y", "XTimesFour", {a});
+    auto ret = ops::_Retval(s.WithOpName("y_RetVal"), b, 0);
+    GraphDef expected;
+    TF_ASSERT_OK(s.ToGraphDef(&expected));
+
+    GraphDef actual;
+    g->ToGraphDef(&actual);
+    TF_EXPECT_GRAPH_EQ(expected, actual);
+  }
+
+  ExpandInlineFunctions(lib_.get(), g.get());
+  {
+    Scope s = Scope::NewRootScope();
+    TF_ASSERT_OK(s.graph()->AddFunctionLibrary(fdef_lib_));
+    auto x = ops::_Arg(s.WithOpName("x"), DT_FLOAT, 0);
+    auto func0 = ops::Identity(s.WithOpName("Func/_0"), x);
+    auto x4_x2 = Call(&s, "x4/x2", "XTimesTwo", {func0});
+    auto x4_y = Call(&s, "x4/y", "XTimesTwo", {x4_x2});
+    auto func1 = ops::Identity(s.WithOpName("Func/_1"), x4_y);
+    auto func2 = ops::Identity(s.WithOpName("Func/_2"), func1);
+    auto y_x2 = Call(&s, "y/x2", "XTimesTwo", {func2});
+    auto y_y = Call(&s, "y/y", "XTimesTwo", {y_x2});
+    auto func3 = ops::Identity(s.WithOpName("Func/_3"), y_y);
+    auto ret = ops::_Retval(s.WithOpName("y_RetVal"), func3, 0);
+    GraphDef expected;
+    TF_ASSERT_OK(s.ToGraphDef(&expected));
+
+    GraphDef actual;
+    g->ToGraphDef(&actual);
+    TF_EXPECT_GRAPH_EQ(expected, actual);
+  }
+
+  ExpandInlineFunctions(lib_.get(), g.get());
+  GraphDef e2;
+  {
+    Scope s = Scope::NewRootScope();
+    auto x = ops::_Arg(s.WithOpName("x"), DT_FLOAT, 0);
+    auto x4_x2_two = ops::Const<int64>(s.WithOpName("x4/x2/two"), 2LL);
+    auto x4_y_two = ops::Const<int64>(s.WithOpName("x4/y/two"), 2LL);
+    auto y_x2_two = ops::Const<int64>(s.WithOpName("y/x2/two"), 2LL);
+    auto y_y_two = ops::Const<int64>(s.WithOpName("y/y/two"), 2LL);
+    auto x4_x2_scale =
+        ops::Cast(s.WithOpName("x4/x2/scale"), x4_x2_two, DT_FLOAT);
+    auto x4_y_scale = ops::Cast(s.WithOpName("x4/y/scale"), x4_y_two, DT_FLOAT);
+    auto y_x2_scale = ops::Cast(s.WithOpName("y/x2/scale"), y_x2_two, DT_FLOAT);
+    auto y_y_scale = ops::Cast(s.WithOpName("y/y/scale"), y_y_two, DT_FLOAT);
+    auto func0 = ops::Identity(s.WithOpName("Func/_0"), x);
+    auto func4 = ops::Identity(s.WithOpName("Func/_4"), func0);
+    auto x4_x2_y = ops::Mul(s.WithOpName("x4/x2/y"), func4, x4_x2_scale);
+    auto func5 = ops::Identity(s.WithOpName("Func/_5"), x4_x2_y);
+    auto func6 = ops::Identity(s.WithOpName("Func/_6"), func5);
+    auto x4_y_y = ops::Mul(s.WithOpName("x4/y/y"), func6, x4_y_scale);
+    auto func7 = ops::Identity(s.WithOpName("Func/_7"), x4_y_y);
+    auto func1 = ops::Identity(s.WithOpName("Func/_1"), func7);
+    auto func2 = ops::Identity(s.WithOpName("Func/_2"), func1);
+    auto func8 = ops::Identity(s.WithOpName("Func/_8"), func2);
+    auto y_x2_y = ops::Mul(s.WithOpName("y/x2/y"), func8, y_x2_scale);
+    auto func9 = ops::Identity(s.WithOpName("Func/_9"), y_x2_y);
+    auto func10 = ops::Identity(s.WithOpName("Func/_10"), func9);
+    auto y_y_y = ops::Mul(s.WithOpName("y/y/y"), func10, y_y_scale);
+    auto func11 = ops::Identity(s.WithOpName("Func/_11"), y_y_y);
+    auto func3 = ops::Identity(s.WithOpName("Func/_3"), func11);
+    auto ret = ops::_Retval(s.WithOpName("y_RetVal"), func3, 0);
+    TF_ASSERT_OK(s.ToGraphDef(&e2));
+
+    GraphDef actual;
+    g->ToGraphDef(&actual);
+    TF_EXPECT_GRAPH_EQ(e2, actual);
+  }
 
   // No further inlining.
-  ExpandInlineFunctions(lib_, g);
-  EXPECT_EQ(e2, DebugString(g));
+  ExpandInlineFunctions(lib_.get(), g.get());
+  {
+    GraphDef actual;
+    g->ToGraphDef(&actual);
+    TF_EXPECT_GRAPH_EQ(e2, actual);
+  }
 
   // Get rid of redundant Identity nodes.
-  RemoveIdentityNodes(g);
-  const char* e3 = R"P(
-(n2:float) -> (n42:float) {
-  n18 = Const[dtype=int64, value=Tensor<type: int64 shape: [] values: 2>]()
-  n25 = Const[dtype=int64, value=Tensor<type: int64 shape: [] values: 2>]()
-  n32 = Const[dtype=int64, value=Tensor<type: int64 shape: [] values: 2>]()
-  n39 = Const[dtype=int64, value=Tensor<type: int64 shape: [] values: 2>]()
-  n19 = Cast[DstT=float, SrcT=int64](n18)
-  n26 = Cast[DstT=float, SrcT=int64](n25)
-  n33 = Cast[DstT=float, SrcT=int64](n32)
-  n40 = Cast[DstT=float, SrcT=int64](n39)
-  n21 = Mul[T=float](n2, n19)
-  n28 = Mul[T=float](n21, n26)
-  n35 = Mul[T=float](n28, n33)
-  n42 = Mul[T=float](n35, n40)
+  RemoveIdentityNodes(g.get());
+  {
+    Scope s = Scope::NewRootScope();
+    auto x = ops::_Arg(s.WithOpName("x"), DT_FLOAT, 0);
+    auto x4_x2_two = ops::Const<int64>(s.WithOpName("x4/x2/two"), 2LL);
+    auto x4_y_two = ops::Const<int64>(s.WithOpName("x4/y/two"), 2LL);
+    auto y_x2_two = ops::Const<int64>(s.WithOpName("y/x2/two"), 2LL);
+    auto y_y_two = ops::Const<int64>(s.WithOpName("y/y/two"), 2LL);
+    auto x4_x2_scale =
+        ops::Cast(s.WithOpName("x4/x2/scale"), x4_x2_two, DT_FLOAT);
+    auto x4_y_scale = ops::Cast(s.WithOpName("x4/y/scale"), x4_y_two, DT_FLOAT);
+    auto y_x2_scale = ops::Cast(s.WithOpName("y/x2/scale"), y_x2_two, DT_FLOAT);
+    auto y_y_scale = ops::Cast(s.WithOpName("y/y/scale"), y_y_two, DT_FLOAT);
+    auto x4_x2_y = ops::Mul(s.WithOpName("x4/x2/y"), x, x4_x2_scale);
+    auto x4_y_y = ops::Mul(s.WithOpName("x4/y/y"), x4_x2_y, x4_y_scale);
+    auto y_x2_y = ops::Mul(s.WithOpName("y/x2/y"), x4_y_y, y_x2_scale);
+    auto y_y_y = ops::Mul(s.WithOpName("y/y/y"), y_x2_y, y_y_scale);
+    auto ret = ops::_Retval(s.WithOpName("y_RetVal"), y_y_y, 0);
+    GraphDef expected;
+    TF_ASSERT_OK(s.ToGraphDef(&expected));
+
+    GraphDef actual;
+    g->ToGraphDef(&actual);
+    TF_EXPECT_GRAPH_EQ(expected, actual);
+  }
 }
-)P";
-  EXPECT_EQ(e3, DebugString(g));
-  delete g;
+
+// Verifies that control dependencies on the caller are added as control
+// dependencies on any function calls created by inlining.
+TEST_F(FunctionLibraryRuntimeTest, ExpandInlineFunctionsWithControlDeps) {
+  Init({test::function::XTimesTwo(), test::function::XTimesFour()});
+
+  std::unique_ptr<Graph> g(new Graph(OpRegistry::Global()));
+  {
+    Scope s = Scope::NewRootScope();
+    TF_ASSERT_OK(s.graph()->AddFunctionLibrary(fdef_lib_));
+    auto a = ops::_Arg(s.WithOpName("a"), DT_FLOAT, 0);
+    auto c = ops::NoOp(s.WithOpName("c"));
+    auto b = Call(&s, "b", "XTimesFour", {a});
+    s.graph()->AddControlEdge(c.operation.node(), b.node());
+    auto ret = ops::_Retval(s.WithOpName("b_RetVal"), b, 0);
+    TF_ASSERT_OK(s.ToGraph(g.get()));
+  }
+
+  ExpandInlineFunctions(lib_.get(), g.get());
+  {
+    Scope s = Scope::NewRootScope();
+    TF_ASSERT_OK(s.graph()->AddFunctionLibrary(fdef_lib_));
+    auto a = ops::_Arg(s.WithOpName("a"), DT_FLOAT, 0);
+    auto c = ops::NoOp(s.WithOpName("c"));
+    auto func0 =
+        ops::NoOp(s.WithOpName("Func/_0").WithControlDependencies({c}));
+    auto func1 = ops::Identity(
+        s.WithOpName("Func/_1").WithControlDependencies({func0}), a);
+    auto b_x2 = Call(&s, "b/x2", "XTimesTwo", {func1});
+    s.graph()->AddControlEdge(func0.operation.node(), b_x2.node());
+    auto b_y = Call(&s, "b/y", "XTimesTwo", {b_x2});
+    s.graph()->AddControlEdge(func0.operation.node(), b_y.node());
+    auto func2 = ops::Identity(s.WithOpName("Func/_2"), b_y);
+    auto ret = ops::_Retval(s.WithOpName("b_RetVal"), func2, 0);
+    GraphDef expected;
+    TF_ASSERT_OK(s.ToGraphDef(&expected));
+
+    GraphDef actual;
+    g->ToGraphDef(&actual);
+    TF_EXPECT_GRAPH_EQ(expected, actual);
+  }
+
+  ExpandInlineFunctions(lib_.get(), g.get());
+  {
+    Scope s = Scope::NewRootScope();
+    TF_ASSERT_OK(s.graph()->AddFunctionLibrary(fdef_lib_));
+    auto a = ops::_Arg(s.WithOpName("a"), DT_FLOAT, 0);
+    auto c = ops::NoOp(s.WithOpName("c"));
+    auto func0 =
+        ops::NoOp(s.WithOpName("Func/_0").WithControlDependencies({c}));
+    auto func1 = ops::Identity(
+        s.WithOpName("Func/_1").WithControlDependencies({func0}), a);
+
+    auto func3 =
+        ops::NoOp(s.WithOpName("Func/_3").WithControlDependencies({func0}));
+    auto func4 = ops::Identity(
+        s.WithOpName("Func/_4").WithControlDependencies({func3}), func1);
+    auto b_x2_two = ops::Const(
+        s.WithOpName("b/x2/two").WithControlDependencies({func3}), 2LL);
+    auto b_x2_scale = ops::Cast(s.WithOpName("b/x2/scale"), b_x2_two, DT_FLOAT);
+    auto b_x2_y = ops::Mul(s.WithOpName("b/x2/y"), func4, b_x2_scale);
+    auto func5 = ops::Identity(s.WithOpName("Func/_5"), b_x2_y);
+
+    auto func6 =
+        ops::NoOp(s.WithOpName("Func/_6").WithControlDependencies({func0}));
+    auto func7 = ops::Identity(
+        s.WithOpName("Func/_7").WithControlDependencies({func6}), func5);
+    auto b_y_two = ops::Const(
+        s.WithOpName("b/y/two").WithControlDependencies({func6}), 2LL);
+    auto b_y_scale = ops::Cast(s.WithOpName("b/y/scale"), b_y_two, DT_FLOAT);
+    auto b_y_y = ops::Mul(s.WithOpName("b/y/y"), func7, b_y_scale);
+    auto func8 = ops::Identity(s.WithOpName("Func/_8"), b_y_y);
+
+    auto func2 = ops::Identity(s.WithOpName("Func/_2"), func8);
+    auto ret = ops::_Retval(s.WithOpName("b_RetVal"), func2, 0);
+
+    GraphDef expected;
+    TF_ASSERT_OK(s.ToGraphDef(&expected));
+
+    GraphDef actual;
+    g->ToGraphDef(&actual);
+    TF_EXPECT_GRAPH_EQ(expected, actual);
+  }
 }
 
 TEST_F(FunctionLibraryRuntimeTest, OptimizeGraph) {
   Init({test::function::XTimesTwo(), test::function::XTimesFour(),
         test::function::XTimes16()});
-  std::unique_ptr<Graph> g(GetFuncBody("XTimes16", {{"T", DT_FLOAT}}));
+  std::unique_ptr<Graph> g = GetFuncBody("XTimes16", {{"T", DT_FLOAT}});
   ASSERT_TRUE(g != nullptr);
-  ExpandInlineFunctions(lib_, g.get());
-  OptimizeGraph(lib_, &g);
-  const char* e0 = R"P(
-(n2:float) -> (n7:float) {
-  n8 = Const[dtype=float, value=Tensor<type: float shape: [] values: 2>]()
-  n4 = Mul[T=float](n2, n8)
-  n5 = Mul[T=float](n4, n8)
-  n6 = Mul[T=float](n5, n8)
-  n7 = Mul[T=float](n6, n8)
-}
-)P";
-  EXPECT_EQ(e0, DebugString(g.get()));
+  ExpandInlineFunctions(lib_.get(), g.get());
+  OptimizeGraph(lib_.get(), &g);
+  {
+    Scope s = Scope::NewRootScope();
+    auto x = ops::_Arg(s.WithOpName("x"), DT_FLOAT, 0);
+    auto x4_x2_scale = ops::Const<float>(
+        s.WithOpName("x4/x2/scale/_12__cf__2")
+            .WithDevice("/job:localhost/replica:0/task:0/cpu:0"),
+        2.0f);
+    auto x4_x2_y = ops::Mul(s.WithOpName("x4/x2/y"), x, x4_x2_scale);
+    auto x4_y_y = ops::Mul(s.WithOpName("x4/y/y"), x4_x2_y, x4_x2_scale);
+    auto y_x2_y = ops::Mul(s.WithOpName("y/x2/y"), x4_y_y, x4_x2_scale);
+    auto y_y_y = ops::Mul(s.WithOpName("y/y/y"), y_x2_y, x4_x2_scale);
+    auto ret = ops::_Retval(s.WithOpName("y_RetVal"), y_y_y, 0);
+    GraphDef expected;
+    TF_ASSERT_OK(s.ToGraphDef(&expected));
+
+    GraphDef actual;
+    g->ToGraphDef(&actual);
+    TF_EXPECT_GRAPH_EQ(expected, actual);
+  }
 }
 
 TEST_F(FunctionLibraryRuntimeTest, ManySwapsNodeDef) {
   auto func = FDH::Create(  // Creates a FunctionDef using NodeDefs
-      // Name
+                            // Name
       "ManySwapsNodeDef",
       // Input
       {"x: float", "y: float"},
@@ -379,9 +539,9 @@ TEST_F(FunctionLibraryRuntimeTest, ManySwapsNodeDef) {
       // Return
       {{"o", "g:output"}});
   Init({test::function::Swap(), func});
-  std::unique_ptr<Graph> g(GetFuncBody("ManySwapsNodeDef", {}));
+  std::unique_ptr<Graph> g = GetFuncBody("ManySwapsNodeDef", {});
   ASSERT_TRUE(g != nullptr);
-  OptimizeGraph(lib_, &g);
+  OptimizeGraph(lib_.get(), &g);
   const char* e0 = R"P(
 (n3:float, n2:float) -> (n3:float) {
 }
@@ -412,24 +572,35 @@ TEST_F(FunctionLibraryRuntimeTest, ControlDeps) {
        {{"o"}, "Add", {"x2:z:0", "y2:z:0"}, {{"T", DT_FLOAT}}}},
       {{"o", "o:z:0"}});
   Init({test::function::Swap(), func});
-  std::unique_ptr<Graph> g(GetFuncBody("ManySwapsFirst", {}));
+  std::unique_ptr<Graph> g = GetFuncBody("ManySwapsFirst", {});
   ASSERT_TRUE(g != nullptr);
-  OptimizeGraph(lib_, &g);
+  OptimizeGraph(lib_.get(), &g);
 
-  // NOTE: We can remove n8, n9, n10, n11 with a control edge n8->n5.
+  // NOTE: We can remove func0, func1, func2, func9 with a control edge n8->n5.
   // But we don't have a pass doing that.
-  const char* e0 = R"P(
-(n3:float, n2:float) -> (n6:float) {
-  n4 = Mul[T=float](n3, n3)
-  n8 = NoOp() @ n4
-  n9 = Identity[T=float](n3) @ n8
-  n10 = Identity[T=float](n2) @ n8
-  n11 = NoOp() @ n10, n9
-  n5 = Mul[T=float](n2, n2) @ n11
-  n6 = Add[T=float](n4, n5)
-}
-)P";
-  EXPECT_EQ(e0, DebugString(g.get()));
+  {
+    Scope s = Scope::NewRootScope();
+    auto x = ops::_Arg(s.WithOpName("x"), DT_FLOAT, 0);
+    auto y = ops::_Arg(s.WithOpName("y"), DT_FLOAT, 1);
+    auto x2 = ops::Mul(s.WithOpName("x2"), x, x);
+    auto func0 = ops::NoOp(s.WithOpName("Func/_0").WithControlDependencies(x2));
+    auto func1 = ops::Identity(
+        s.WithOpName("Func/_1").WithControlDependencies({func0}), x);
+    auto func2 = ops::Identity(
+        s.WithOpName("Func/_2").WithControlDependencies({func0}), y);
+    auto func9 = ops::NoOp(s.WithOpName("Func/_9").WithControlDependencies(
+        {func1.output.op(), func2.output.op()}));
+    auto y2 =
+        ops::Mul(s.WithOpName("y2").WithControlDependencies({func9}), y, y);
+    auto o = ops::Add(s.WithOpName("o"), x2, y2);
+    auto ret = ops::_Retval(s.WithOpName("o_RetVal"), o, 0);
+    GraphDef expected;
+    TF_ASSERT_OK(s.ToGraphDef(&expected));
+
+    GraphDef actual;
+    g->ToGraphDef(&actual);
+    TF_EXPECT_GRAPH_EQ(expected, actual);
+  }
 }
 
 TEST_F(FunctionLibraryRuntimeTest, Error_NotFound) {
@@ -459,13 +630,14 @@ TEST_F(FunctionLibraryRuntimeTest, Error_InstantiaionError) {
 
   // Instantiating "XTimesTwo" should fail.
   FunctionLibraryRuntime::Handle handle;
-  HasError(lib_->Instantiate("XTimesTwo", {{"T", DT_FLOAT}}, &handle),
+  HasError(lib_->Instantiate("XTimesTwo", Attrs({{"T", DT_FLOAT}}), &handle),
            "Not found: type attr not found");
 
   // But XTimesFour and XTimes16 instantiation should succeed. Only
   // when they run, they fail because XTimesTwo is bad.
-  TF_CHECK_OK(lib_->Instantiate("XTimesFour", {{"T", DT_FLOAT}}, &handle));
-  TF_CHECK_OK(lib_->Instantiate("XTimes16", {{"T", DT_FLOAT}}, &handle));
+  TF_CHECK_OK(
+      lib_->Instantiate("XTimesFour", Attrs({{"T", DT_FLOAT}}), &handle));
+  TF_CHECK_OK(lib_->Instantiate("XTimes16", Attrs({{"T", DT_FLOAT}}), &handle));
 
   auto x = test::AsTensor<float>({1, 2, 3, 4});
   Tensor y;
@@ -476,84 +648,136 @@ TEST_F(FunctionLibraryRuntimeTest, Error_InstantiaionError) {
 TEST_F(FunctionLibraryRuntimeTest, Gradient_XTimesTwo) {
   Init({test::function::XTimesTwo(), test::function::XTimesFour(),
         test::function::XTimes16()});
-  auto f = GetFuncBody("XTimesTwo", {{"T", DT_FLOAT}});
-  const char* e0 = R"P(
-(n4:float) -> (n5:float) {
-  n2 = Const[dtype=int64, value=Tensor<type: int64 shape: [] values: 2>]()
-  n3 = Cast[DstT=float, SrcT=int64](n2)
-  n5 = Mul[T=float](n4, n3)
-}
-)P";
-  EXPECT_EQ(e0, DebugString(f));
-  delete f;
-  std::unique_ptr<Graph> g(GetGradBody("XTimesTwo", {{"T", DT_FLOAT}}));
-  const char* e1 = R"P(
-(n4:float, n6:float) -> (n7:float) {
-  n2 = Const[dtype=int64, value=Tensor<type: int64 shape: [] values: 2>]()
-  n3 = Cast[DstT=float, SrcT=int64](n2)
-  n5 = Mul[T=float](n4, n3)
-  n7 = SymbolicGradient[Tin={float, float, float}, Tout={float, float}, f=Mul[T=float]](n4, n3, n6)
-}
-)P";
-  EXPECT_EQ(e1, DebugString(g.get()));
-
-  OptimizeGraph(lib_, &g);
-  const char* e2 = R"P(
-(n2:float, n3:float) -> (n9:float) {
-  n11 = Const[dtype=int32, value=Tensor<type: int32 shape: [0] values: >]()
-  n10 = Const[dtype=float, value=Tensor<type: float shape: [] values: 2>]()
-  n6 = Shape[T=float, out_type=int32](n2)
-  n5 = Mul[T=float](n3, n10)
-  n7 = BroadcastGradientArgs[T=int32](n6, n11)
-  n8 = Sum[T=float, Tidx=int32, keep_dims=false](n5, n7)
-  n9 = Reshape[T=float, Tshape=int32](n8, n6)
-}
-)P";
-  EXPECT_EQ(e2, DebugString(g.get()));
+  std::unique_ptr<Graph> f = GetFuncBody("XTimesTwo", {{"T", DT_FLOAT}});
+  {
+    Scope s = Scope::NewRootScope();
+    auto x = ops::_Arg(s.WithOpName("x"), DT_FLOAT, 0);
+    auto two = ops::Const(s.WithOpName("two"), 2LL);
+    auto scale = ops::Cast(s.WithOpName("scale"), two, DT_FLOAT);
+    auto y = ops::Mul(s.WithOpName("y"), x, scale);
+    auto ret = ops::_Retval(s.WithOpName("y_RetVal"), y, 0);
+    GraphDef expected;
+    TF_ASSERT_OK(s.ToGraphDef(&expected));
+
+    GraphDef actual;
+    f->ToGraphDef(&actual);
+    TF_EXPECT_GRAPH_EQ(expected, actual);
+  }
+
+  std::unique_ptr<Graph> g = GetGradBody("XTimesTwo", {{"T", DT_FLOAT}});
+
+  {
+    Scope s = Scope::NewRootScope();
+    auto x = ops::_Arg(s.WithOpName("x"), DT_FLOAT, 0);
+    auto func0 = ops::_Arg(s.WithOpName("Func/_0"), DT_FLOAT, 1);
+    auto two = ops::Const(s.WithOpName("two"), 2LL);
+    auto scale = ops::Cast(s.WithOpName("scale"), two, DT_FLOAT);
+    auto y = ops::Mul(s.WithOpName("y"), x, scale);
+    NameAttrList fn;
+    fn.set_name("Mul");
+    (*fn.mutable_attr())["T"].set_type(DT_FLOAT);
+    auto func1 = ops::SymbolicGradient(
+        s.WithOpName("Func/_1"), std::initializer_list<Input>{x, scale, func0},
+        {DT_FLOAT, DT_FLOAT}, fn);
+    auto func2 = ops::_Retval(s.WithOpName("Func/_2"), func1[0], 0);
+    GraphDef expected;
+    TF_ASSERT_OK(s.ToGraphDef(&expected));
+
+    GraphDef actual;
+    g->ToGraphDef(&actual);
+    TF_EXPECT_GRAPH_EQ(expected, actual);
+  }
+
+  OptimizeGraph(lib_.get(), &g);
+
+  {
+    Scope s = Scope::NewRootScope();
+    auto x = ops::_Arg(s.WithOpName("x"), DT_FLOAT, 0);
+    auto func0 = ops::_Arg(s.WithOpName("Func/_0"), DT_FLOAT, 1);
+    auto scale =
+        ops::Const(s.WithOpName("scale/_5__cf__6")
+                       .WithDevice("/job:localhost/replica:0/task:0/cpu:0"),
+                   2.0f);
+    auto func1_gx = ops::Mul(s.WithOpName("Func/_1/gx"), func0, scale);
+    auto func1_sx = ops::Shape(s.WithOpName("Func/_1/sx"), x);
+    auto const0 =
+        ops::Const(s.WithOpName("Func/_1/sy/_6__cf__7")
+                       .WithDevice("/job:localhost/replica:0/task:0/cpu:0"),
+                   0, {0});
+    auto func1_rx = ops::internal::BroadcastGradientArgs(
+        s.WithOpName("Func/_1/rx"), func1_sx, const0);
+    auto func1_sum_gx =
+        ops::Sum(s.WithOpName("Func/_1/sum_gx"), func1_gx, func1_rx.r0);
+    auto func1_dx =
+        ops::Reshape(s.WithOpName("Func/_1/dx"), func1_sum_gx, func1_sx);
+    auto func2 = ops::_Retval(s.WithOpName("Func/_2"), func1_dx, 0);
+    GraphDef expected;
+    TF_ASSERT_OK(s.ToGraphDef(&expected));
+
+    GraphDef actual;
+    g->ToGraphDef(&actual);
+    TF_EXPECT_GRAPH_EQ(expected, actual);
+  }
 }
 
 TEST_F(FunctionLibraryRuntimeTest, Gradient_Add) {
   Init({});
   auto T = DT_FLOAT;
-  auto g = GetFuncBody("SymbolicGradient",
-                       {{"f", FDH::FunctionRef("Add", {{"T", T}})}});
-  const char* e0 = R"P(
-(n7:float, n5:float, n2:float) -> (n14:float, n11:float) {
-  n3 = Identity[T=float](n2)
-  n4 = Identity[T=float](n2)
-  n6 = Shape[T=float, out_type=int32](n5)
-  n8 = Shape[T=float, out_type=int32](n7)
-  n9 = BroadcastGradientArgs[T=int32](n8, n6)
-  n10 = Sum[T=float, Tidx=int32, keep_dims=false](n3, n9:1)
-  n13 = Sum[T=float, Tidx=int32, keep_dims=false](n4, n9)
-  n11 = Reshape[T=float, Tshape=int32](n10, n6)
-  n14 = Reshape[T=float, Tshape=int32](n13, n8)
-}
-)P";
-  EXPECT_EQ(e0, DebugString(g));
-  delete g;
+  std::unique_ptr<Graph> g = GetFuncBody(
+      "SymbolicGradient", {{"f", FDH::FunctionRef("Add", {{"T", T}})}});
+  {
+    Scope s = Scope::NewRootScope();
+    auto x = ops::_Arg(s.WithOpName("x"), DT_FLOAT, 0);
+    auto y = ops::_Arg(s.WithOpName("y"), DT_FLOAT, 1);
+    auto dz = ops::_Arg(s.WithOpName("dz"), DT_FLOAT, 2);
+    auto gx = ops::Identity(s.WithOpName("gx"), dz);
+    auto gy = ops::Identity(s.WithOpName("gy"), dz);
+    auto sx = ops::Shape(s.WithOpName("sx"), x);
+    auto sy = ops::Shape(s.WithOpName("sy"), y);
+    auto rx = ops::internal::BroadcastGradientArgs(s.WithOpName("rx"), sx, sy);
+    auto sum_gx = ops::Sum(s.WithOpName("sum_gx"), gx, rx.r0);
+    auto sum_gy = ops::Sum(s.WithOpName("sum_gy"), gy, rx.r1);
+    auto dx = ops::Reshape(s.WithOpName("dx"), sum_gx, sx);
+    auto dy = ops::Reshape(s.WithOpName("dy"), sum_gy, sy);
+    auto dx_ret = ops::_Retval(s.WithOpName("dx_RetVal"), dx, 0);
+    auto dy_ret = ops::_Retval(s.WithOpName("dy_RetVal"), dy, 1);
+    GraphDef expected;
+    TF_ASSERT_OK(s.ToGraphDef(&expected));
+
+    GraphDef actual;
+    g->ToGraphDef(&actual);
+    TF_EXPECT_GRAPH_EQ(expected, actual);
+  }
 }
 
 TEST_F(FunctionLibraryRuntimeTest, Gradient_Mul) {
   Init({});
   auto T = DT_FLOAT;
-  auto g = GetFuncBody("SymbolicGradient",
-                       {{"f", FDH::FunctionRef("Mul", {{"T", T}})}});
-  const char* e0 = R"P(
-(n6:float, n3:float, n2:float) -> (n14:float, n11:float) {
-  n4 = Mul[T=float](n2, n3)
-  n5 = Shape[T=float, out_type=int32](n3)
-  n7 = Mul[T=float](n6, n2)
-  n8 = Shape[T=float, out_type=int32](n6)
-  n9 = BroadcastGradientArgs[T=int32](n8, n5)
-  n10 = Sum[T=float, Tidx=int32, keep_dims=false](n7, n9:1)
-  n13 = Sum[T=float, Tidx=int32, keep_dims=false](n4, n9)
-  n11 = Reshape[T=float, Tshape=int32](n10, n5)
-  n14 = Reshape[T=float, Tshape=int32](n13, n8)
-}
-)P";
-  EXPECT_EQ(e0, DebugString(g));
-  delete g;
+  std::unique_ptr<Graph> g = GetFuncBody(
+      "SymbolicGradient", {{"f", FDH::FunctionRef("Mul", {{"T", T}})}});
+  {
+    Scope s = Scope::NewRootScope();
+    auto x = ops::_Arg(s.WithOpName("x"), DT_FLOAT, 0);
+    auto y = ops::_Arg(s.WithOpName("y"), DT_FLOAT, 1);
+    auto dz = ops::_Arg(s.WithOpName("dz"), DT_FLOAT, 2);
+    auto gx = ops::Mul(s.WithOpName("gx"), dz, y);
+    auto sx = ops::Shape(s.WithOpName("sx"), x);
+    auto gy = ops::Mul(s.WithOpName("gy"), x, dz);
+    auto sy = ops::Shape(s.WithOpName("sy"), y);
+    auto rx = ops::internal::BroadcastGradientArgs(s.WithOpName("rx"), sx, sy);
+    auto sum_gx = ops::Sum(s.WithOpName("sum_gx"), gx, rx.r0);
+    auto sum_gy = ops::Sum(s.WithOpName("sum_gy"), gy, rx.r1);
+    auto dx = ops::Reshape(s.WithOpName("dx"), sum_gx, sx);
+    auto dy = ops::Reshape(s.WithOpName("dy"), sum_gy, sy);
+    auto dx_ret = ops::_Retval(s.WithOpName("dx_RetVal"), dx, 0);
+    auto dy_ret = ops::_Retval(s.WithOpName("dy_RetVal"), dy, 1);
+    GraphDef expected;
+    TF_ASSERT_OK(s.ToGraphDef(&expected));
+
+    GraphDef actual;
+    g->ToGraphDef(&actual);
+    TF_EXPECT_GRAPH_EQ(expected, actual);
+  }
 }
 
 TEST_F(FunctionLibraryRuntimeTest, Gradient_AddSum) {
@@ -570,108 +794,169 @@ TEST_F(FunctionLibraryRuntimeTest, Gradient_AddSum) {
                           });
 
   // TestGrad = Test'(x, y)
-  auto grad =
-      FDH::Define("TestGrad", {"x:float", "y:float"}, {"dx:float", "dy:float"},
-                  {}, {FDH::Const<float>("dz", 1),
-                       {{"grad0", "grad1"},
-                        "SymbolicGradient",
-                        {"x", "y", "dz"},
-                        {
-                            {"f", FDH::FunctionRef("Test")},
-                            {"Tin", DataTypeSlice{T, T, T}},
-                            {"Tout", DataTypeSlice{T, T}},
-                        }},
-                       {{"dx"}, "Identity", {"grad0"}, {{"T", DT_FLOAT}}},
-                       {{"dy"}, "Identity", {"grad1"}, {{"T", DT_FLOAT}}}});
+  auto grad = FDH::Define("TestGrad", {"x:float", "y:float"},
+                          {"dx:float", "dy:float"}, {},
+                          {FDH::Const<float>("dz", 1),
+                           {{"grad0", "grad1"},
+                            "SymbolicGradient",
+                            {"x", "y", "dz"},
+                            {
+                                {"f", FDH::FunctionRef("Test")},
+                                {"Tin", DataTypeSlice{T, T, T}},
+                                {"Tout", DataTypeSlice{T, T}},
+                            }},
+                           {{"dx"}, "Identity", {"grad0"}, {{"T", DT_FLOAT}}},
+                           {{"dy"}, "Identity", {"grad1"}, {{"T", DT_FLOAT}}}});
 
   Init({test, grad});
 
-  std::unique_ptr<Graph> g(GetFuncBody("TestGrad", {}));
+  std::unique_ptr<Graph> g = GetFuncBody("TestGrad", {});
   ASSERT_TRUE(g != nullptr);
-  const char* e0 = R"P(
-(n4:float, n3:float) -> (n8:float, n6:float) {
-  n2 = Const[dtype=float, value=Tensor<type: float shape: [] values: 1>]()
-  n5 = SymbolicGradient[Tin={float, float, float}, Tout={float, float}, f=Test](n4, n3, n2)
-  n6 = Identity[T=float](n5:1)
-  n8 = Identity[T=float](n5)
-}
-)P";
-  EXPECT_EQ(e0, DebugString(g.get()));
+  {
+    Scope s = Scope::NewRootScope();
+    auto x = ops::_Arg(s.WithOpName("x"), DT_FLOAT, 0);
+    auto y = ops::_Arg(s.WithOpName("y"), DT_FLOAT, 1);
+    auto dz = ops::Const(s.WithOpName("dz"), 1.0f);
+    NameAttrList fn;
+    fn.set_name("Test");
+    auto grad0 = ops::SymbolicGradient(s.WithOpName("grad0"),
+                                       std::initializer_list<Input>{x, y, dz},
+                                       {DT_FLOAT, DT_FLOAT}, fn);
+    auto dx = ops::Identity(s.WithOpName("dx"), grad0[0]);
+    auto dy = ops::Identity(s.WithOpName("dy"), grad0[1]);
+    auto dx_retval = ops::_Retval(s.WithOpName("dx_RetVal"), dx, 0);
+    auto dy_retval = ops::_Retval(s.WithOpName("dy_RetVal"), dy, 1);
+    GraphDef expected;
+    TF_ASSERT_OK(s.ToGraphDef(&expected));
+
+    GraphDef actual;
+    g->ToGraphDef(&actual);
+    TF_EXPECT_GRAPH_EQ(expected, actual);
+  }
 
-  ExpandInlineFunctions(lib_, g.get());
-  const char* e1 = R"P(
-(n4:float, n3:float) -> (n8:float, n6:float) {
-  n10 = Const[dtype=int32, value=Tensor<type: int32 shape: [] values: 1>]()
-  n11 = Const[dtype=int32, value=Tensor<type: int32 shape: [] values: 0>]()
-  n2 = Const[dtype=float, value=Tensor<type: float shape: [] values: 1>]()
-  n26 = Identity[T=float](n2)
-  n25 = Identity[T=float](n3)
-  n24 = Identity[T=float](n4)
-  n14 = Add[T=float](n24, n25)
-  n15 = Rank[T=float](n14)
-  n16 = Range[Tidx=int32](n11, n15, n10)
-  n20 = ZerosLike[T=int32](n15)
-  n17 = Sum[T=float, Tidx=int32, keep_dims=false](n14, n16)
-  n19 = SymbolicGradient[Tin={float, int32, float}, Tout={float, int32}, f=Sum[T=float, Tidx=int32, keep_dims=false]](n14, n16, n26)
-  n21 = SymbolicGradient[Tin={float, float, float}, Tout={float, float}, f=Add[T=float]](n24, n25, n19)
-  n28 = Identity[T=float](n21:1)
-  n27 = Identity[T=float](n21)
-  n6 = Identity[T=float](n28)
-  n8 = Identity[T=float](n27)
-}
-)P";
-  EXPECT_EQ(e1, DebugString(g.get()));
-
-  OptimizeGraph(lib_, &g);
-  const char* e2 = R"P(
-(n4:float, n3:float) -> (n25:float, n23:float) {
-  n2 = Const[dtype=float, value=Tensor<type: float shape: [] values: 1>]()
-  n8 = Const[dtype=int32, value=Tensor<type: int32 shape: [] values: 0>]()
-  n7 = Const[dtype=int32, value=Tensor<type: int32 shape: [] values: 1>]()
-  n19 = Shape[T=float, out_type=int32](n3)
-  n9 = Add[T=float](n4, n3)
-  n20 = Shape[T=float, out_type=int32](n4)
-  n10 = Rank[T=float](n9)
-  n14 = Shape[T=float, out_type=int32](n9)
-  n21 = BroadcastGradientArgs[T=int32](n20, n19)
-  n11 = Range[Tidx=int32](n8, n10, n7)
-  n12 = Shape[T=int32, out_type=int32](n11)
-  n13 = Fill[T=int32](n12, n7)
-  n15 = DynamicStitch[N=2, T=int32](n11, n11, n14, n13)
-  n16 = Reshape[T=float, Tshape=int32](n2, n15)
-  n17 = Div[T=int32](n14, n15)
-  n18 = Tile[T=float, Tmultiples=int32](n16, n17)
-  n24 = Sum[T=float, Tidx=int32, keep_dims=false](n18, n21)
-  n22 = Sum[T=float, Tidx=int32, keep_dims=false](n18, n21:1)
-  n25 = Reshape[T=float, Tshape=int32](n24, n20)
-  n23 = Reshape[T=float, Tshape=int32](n22, n19)
-}
-)P";
-  EXPECT_EQ(e2, DebugString(g.get()));
+  ExpandInlineFunctions(lib_.get(), g.get());
+  {
+    Scope s = Scope::NewRootScope();
+    auto x = ops::_Arg(s.WithOpName("x"), DT_FLOAT, 0);
+    auto y = ops::_Arg(s.WithOpName("y"), DT_FLOAT, 1);
+    auto dz = ops::Const(s.WithOpName("dz"), 1.0f);
+    auto grad0_zero = ops::Const(s.WithOpName("grad0/zero"), 0);
+    auto grad0_one = ops::Const(s.WithOpName("grad0/one"), 1);
+    auto func0 = ops::Identity(s.WithOpName("Func/_0"), x);
+    auto func1 = ops::Identity(s.WithOpName("Func/_1"), y);
+    auto func2 = ops::Identity(s.WithOpName("Func/_2"), dz);
+    auto grad0_z = ops::Add(s.WithOpName("grad0/z"), func0, func1);
+    auto grad0_r = ops::Rank(s.WithOpName("grad0/r"), grad0_z);
+    auto grad0_indices = ops::Range(s.WithOpName("grad0/indices"), grad0_zero,
+                                    grad0_r, grad0_one);
+    auto grad0_l = ops::Sum(s.WithOpName("grad0/l"), grad0_z, grad0_indices);
+
+    NameAttrList sum;
+    sum.set_name("Sum");
+    (*sum.mutable_attr())["T"].set_type(DT_FLOAT);
+    (*sum.mutable_attr())["Tidx"].set_type(DT_INT32);
+    (*sum.mutable_attr())["keep_dims"].set_b(false);
+    auto grad0_func1 = ops::SymbolicGradient(
+        s.WithOpName("grad0/Func/_1"),
+        std::initializer_list<Input>{grad0_z, grad0_indices, func2},
+        {DT_FLOAT, DT_INT32}, sum);
+
+    auto grad0_func2 = ops::ZerosLike(s.WithOpName("grad0/Func/_2"), grad0_r);
+
+    NameAttrList add;
+    add.set_name("Add");
+    (*add.mutable_attr())["T"].set_type(DT_FLOAT);
+    auto grad0_func3 = ops::SymbolicGradient(
+        s.WithOpName("grad0/Func/_3"),
+        std::initializer_list<Input>{func0, func1, grad0_func1[0]},
+        {DT_FLOAT, DT_FLOAT}, add);
+
+    auto func3 = ops::Identity(s.WithOpName("Func/_3"), grad0_func3[0]);
+    auto func4 = ops::Identity(s.WithOpName("Func/_4"), grad0_func3[1]);
+    auto dx = ops::Identity(s.WithOpName("dx"), func3);
+    auto dy = ops::Identity(s.WithOpName("dy"), func4);
+    auto dx_retval = ops::_Retval(s.WithOpName("dx_RetVal"), dx, 0);
+    auto dy_retval = ops::_Retval(s.WithOpName("dy_RetVal"), dy, 1);
+
+    GraphDef expected;
+    TF_ASSERT_OK(s.ToGraphDef(&expected));
+
+    GraphDef actual;
+    g->ToGraphDef(&actual);
+    TF_EXPECT_GRAPH_EQ(expected, actual);
+  }
+
+  OptimizeGraph(lib_.get(), &g);
+  {
+    Scope s = Scope::NewRootScope();
+    auto x = ops::_Arg(s.WithOpName("x"), DT_FLOAT, 0);
+    auto y = ops::_Arg(s.WithOpName("y"), DT_FLOAT, 1);
+    auto dz = ops::Const(s.WithOpName("dz"), 1.0f);
+    auto grad0_zero = ops::Const(s.WithOpName("grad0/zero"), 0);
+    auto grad0_one = ops::Const(s.WithOpName("grad0/one"), 1);
+    auto grad0_z = ops::Add(s.WithOpName("grad0/z"), x, y);
+    auto grad0_r = ops::Rank(s.WithOpName("grad0/r"), grad0_z);
+    auto grad0_indices = ops::Range(s.WithOpName("grad0/indices"), grad0_zero,
+                                    grad0_r, grad0_one);
+    auto i_shape =
+        ops::Shape(s.WithOpName("grad0/Func/_1/i_shape"), grad0_indices);
+    auto stitch_val = ops::Fill(s.WithOpName("grad0/Func/_1/stitch_val1"),
+                                i_shape, grad0_one);
+    auto x_shape = ops::Shape(s.WithOpName("grad0/Func/_1/x_shape"), grad0_z);
+    auto y_shape = ops::DynamicStitch(
+        s.WithOpName("grad0/Func/_1/y_shape"),
+        std::initializer_list<Input>{grad0_indices, grad0_indices},
+        std::initializer_list<Input>{x_shape, stitch_val});
+    auto dy_reshaped =
+        ops::Reshape(s.WithOpName("grad0/Func/_1/dy_reshaped"), dz, y_shape);
+    auto tile_scaling =
+        ops::Div(s.WithOpName("grad0/Func/_1/tile_scaling"), x_shape, y_shape);
+    auto func1_dx =
+        ops::Tile(s.WithOpName("grad0/Func/_1/dx"), dy_reshaped, tile_scaling);
+
+    auto sx = ops::Shape(s.WithOpName("grad0/Func/_3/sx"), x);
+    auto sy = ops::Shape(s.WithOpName("grad0/Func/_3/sy"), y);
+    auto rx = ops::internal::BroadcastGradientArgs(
+        s.WithOpName("grad0/Func/_3/rx"), sx, sy);
+    auto sum_gx =
+        ops::Sum(s.WithOpName("grad0/Func/_3/sum_gx"), func1_dx, rx.r0);
+    auto sum_gy =
+        ops::Sum(s.WithOpName("grad0/Func/_3/sum_gy"), func1_dx, rx.r1);
+    auto dx = ops::Reshape(s.WithOpName("grad0/Func/_3/dx"), sum_gx, sx);
+    auto dy = ops::Reshape(s.WithOpName("grad0/Func/_3/dy"), sum_gy, sy);
+
+    auto dx_retval = ops::_Retval(s.WithOpName("dx_RetVal"), dx, 0);
+    auto dy_retval = ops::_Retval(s.WithOpName("dy_RetVal"), dy, 1);
+
+    GraphDef expected;
+    TF_ASSERT_OK(s.ToGraphDef(&expected));
+
+    GraphDef actual;
+    g->ToGraphDef(&actual);
+    TF_EXPECT_GRAPH_EQ(expected, actual);
+  }
 }
 
 namespace {
 
 bool DoNothing(Graph* g) { return false; }
 
-string Optimize(const std::function<bool(Graph* g)>& pass,
-                const FunctionDef& fdef) {
+GraphDef Optimize(const std::function<bool(Graph* g)>& pass,
+                  const FunctionDef& fdef) {
   InstantiationResult result;
-  InstantiateAttrValueMap empty;
-  TF_CHECK_OK(InstantiateFunction(fdef, empty, GetOpSig, &result));
-  Graph* g = new Graph(OpRegistry::Global());
+  TF_CHECK_OK(InstantiateFunction(fdef, AttrSlice(), GetOpSig, &result));
+  std::unique_ptr<Graph> g(new Graph(OpRegistry::Global()));
   GraphConstructorOptions opts;
   opts.allow_internal_ops = true;
   opts.expect_device_spec = false;
-  TF_CHECK_OK(ConvertGraphDefToGraph(opts, result.gdef, g));
-  pass(g);
-  Graph* g1 = new Graph(OpRegistry::Global());
-  CopyGraph(*g, g1);
-  delete g;
+  TF_CHECK_OK(ConvertGraphDefToGraph(opts, result.gdef, g.get()));
+  pass(g.get());
+  std::unique_ptr<Graph> g1(new Graph(OpRegistry::Global()));
+  CopyGraph(*g, g1.get());
+  g = nullptr;
   GraphDef gdef;
   g1->ToGraphDef(&gdef);
-  delete g1;
-  return DebugString(gdef);
+  return gdef;
 }
 
 }  // end namespace
@@ -700,21 +985,25 @@ TEST(OptimizationTest, RemoveDeadNodes) {
        {{"keep_me"}, "RandomUniform", {"o"}, {{"T", T}, {"dtype", DT_FLOAT}}},
        // y = Add<T>(a, o)
        {{"y"}, "Add", {"a", "o"}, {{"T", T}}}});
-  const char* e0 = R"S(
-(x:int32) -> (y:int32) {
-  o = Const[dtype=int32, value=Tensor<type: int32 shape: [] values: 1>]()
-  keep_me = RandomUniform[T=int32, dtype=float, seed2=0, seed=0](o)
-  x1 = Add[T=int32](o, o)
-  a = Square[T=int32](x)
-  y = Add[T=int32](a, o)
-  x2 = Mul[T=int32](a, x1)
-  x3 = Mul[T=int32](x1, x2)
-}
-)S";
-  EXPECT_EQ(Optimize(DoNothing, func), e0);
+
+  GraphDef expected;
+  {
+    Scope s = Scope::NewRootScope();
+    auto x = ops::_Arg(s.WithOpName("x"), DT_INT32, 0);
+    auto o = ops::Const(s.WithOpName("o"), 1);
+    auto keep_me = ops::RandomUniform(s.WithOpName("keep_me"), {o}, DT_FLOAT);
+    auto x1 = ops::Add(s.WithOpName("x1"), o, o);
+    auto a = ops::Square(s.WithOpName("a"), x);
+    auto y = ops::Add(s.WithOpName("y"), a, o);
+    auto x2 = ops::Mul(s.WithOpName("x2"), a, x1);
+    auto x3 = ops::Mul(s.WithOpName("x3"), x1, x2);
+    auto ret = ops::_Retval(s.WithOpName("y_RetVal"), y, 0);
+    TF_ASSERT_OK(s.ToGraphDef(&expected));
+  }
+  TF_EXPECT_GRAPH_EQ(expected, Optimize(DoNothing, func));
 
   // TODO(zhifengc): Comes up another test case.
-  EXPECT_EQ(Optimize(::tensorflow::RemoveDeadNodes, func), e0);
+  TF_EXPECT_GRAPH_EQ(expected, Optimize(::tensorflow::RemoveDeadNodes, func));
 }
 
 TEST(OptimizationTest, RemoveIdentityNodes_Ref) {
@@ -735,23 +1024,19 @@ TEST(OptimizationTest, RemoveIdentityNodes_Ref) {
        {{"v_read"}, "Identity", {"v"}, {{"T", T}}},
        // returns v + v
        {{"ret"}, "Add", {"v_read", "v_read"}, {{"T", T}}}});
-  const char* e0 = R"S(
-() -> (ret:float) {
-  v = VariableV2[container="", dtype=float, shape=[], shared_name=""]()
-  v_read = Identity[T=float](v)
-  ret = Add[T=float](v_read, v_read)
-}
-)S";
-  EXPECT_EQ(Optimize(DoNothing, func), e0);
-
-  const char* e1 = R"S(
-() -> (ret:float) {
-  v = VariableV2[container="", dtype=float, shape=[], shared_name=""]()
-  v_read = Identity[T=float](v)
-  ret = Add[T=float](v_read, v_read)
-}
-)S";
-  EXPECT_EQ(Optimize(::tensorflow::RemoveIdentityNodes, func), e1);
+
+  GraphDef expected;
+  {
+    Scope s = Scope::NewRootScope();
+    auto v = ops::Variable(s.WithOpName("v"), PartialTensorShape({}), DT_FLOAT);
+    auto v_read = ops::Identity(s.WithOpName("v_read"), v);
+    auto ret = ops::Add(s.WithOpName("ret"), v_read, v_read);
+    auto ret_retval = ops::_Retval(s.WithOpName("ret_RetVal"), ret, 0);
+    TF_ASSERT_OK(s.ToGraphDef(&expected));
+  }
+  TF_EXPECT_GRAPH_EQ(expected, Optimize(DoNothing, func));
+  TF_EXPECT_GRAPH_EQ(expected,
+                     Optimize(::tensorflow::RemoveIdentityNodes, func));
 }
 
 TEST(OptimizationTest, RemoveIdentityNodes) {
@@ -782,28 +1067,38 @@ TEST(OptimizationTest, RemoveIdentityNodes) {
         {"x3"}},
        // y = Add<T>(a, o)
        {{"y"}, "Add", {"a", "o"}, {{"T", T}}}});
-  const char* e0 = R"S(
-(x:int32) -> (y:int32) {
-  o = Const[dtype=int32, value=Tensor<type: int32 shape: [] values: 1>]()
-  a = Square[T=int32](x)
-  y = Add[T=int32](a, o)
-  x1 = Identity[T=int32](a)
-  x2 = Identity[T=int32](x1)
-  x3 = Identity[T=int32](x2)
-  keep_me = RandomUniform[T=int32, dtype=float, seed2=0, seed=0](o) @ x3
-}
-)S";
-  EXPECT_EQ(Optimize(DoNothing, func), e0);
-
-  const char* e1 = R"S(
-(x:int32) -> (y:int32) {
-  o = Const[dtype=int32, value=Tensor<type: int32 shape: [] values: 1>]()
-  a = Square[T=int32](x)
-  y = Add[T=int32](a, o)
-  keep_me = RandomUniform[T=int32, dtype=float, seed2=0, seed=0](o) @ a
-}
-)S";
-  EXPECT_EQ(Optimize(::tensorflow::RemoveIdentityNodes, func), e1);
+
+  {
+    Scope s = Scope::NewRootScope();
+    auto x = ops::_Arg(s.WithOpName("x"), DT_INT32, 0);
+    auto o = ops::Const(s.WithOpName("o"), 1);
+    auto a = ops::Square(s.WithOpName("a"), x);
+    auto y = ops::Add(s.WithOpName("y"), a, o);
+    auto x1 = ops::Identity(s.WithOpName("x1"), a);
+    auto x2 = ops::Identity(s.WithOpName("x2"), x1);
+    auto x3 = ops::Identity(s.WithOpName("x3"), x2);
+    auto keep_me = ops::RandomUniform(
+        s.WithOpName("keep_me").WithControlDependencies(x3), {o}, DT_FLOAT);
+    auto ret = ops::_Retval(s.WithOpName("y_RetVal"), y, 0);
+    GraphDef expected;
+    TF_ASSERT_OK(s.ToGraphDef(&expected));
+    TF_EXPECT_GRAPH_EQ(expected, Optimize(DoNothing, func));
+  }
+
+  {
+    Scope s = Scope::NewRootScope();
+    auto x = ops::_Arg(s.WithOpName("x"), DT_INT32, 0);
+    auto o = ops::Const(s.WithOpName("o"), 1);
+    auto a = ops::Square(s.WithOpName("a"), x);
+    auto y = ops::Add(s.WithOpName("y"), a, o);
+    auto keep_me = ops::RandomUniform(
+        s.WithOpName("keep_me").WithControlDependencies(a), {o}, DT_FLOAT);
+    auto ret = ops::_Retval(s.WithOpName("y_RetVal"), y, 0);
+    GraphDef expected;
+    TF_ASSERT_OK(s.ToGraphDef(&expected));
+    TF_EXPECT_GRAPH_EQ(expected,
+                       Optimize(::tensorflow::RemoveIdentityNodes, func));
+  }
 }
 
 TEST(OptimizationTest, RemoveListArrayConverter) {
@@ -840,49 +1135,63 @@ TEST(OptimizationTest, RemoveListArrayConverter) {
       // Return values
       {{"o", "o:sum"}});
 
-  const char* e0 = R"P(
-(i:float) -> (o:float) {
-  zero = Const[dtype=int32, value=Tensor<type: int32 shape: [] values: 0>]()
-  s = Split[T=float, num_split=4](zero, i)
-  a = _ArrayToList[N=4, T=float, out_types={float, float, float, float}](s, s:1, s:2, s:3)
-  r = Mul[T=float](a:2, a:3)
-  l = Mul[T=float](a, a:1)
-  x = _ListToArray[N=2, T=float, Tin={float, float}](l, r)
-  o = AddN[N=2, T=float](x, x:1)
-}
-)P";
-  EXPECT_EQ(Optimize(DoNothing, func), e0);
-
-  const char* e1 = R"P(
-(i:float) -> (o:float) {
-  zero = Const[dtype=int32, value=Tensor<type: int32 shape: [] values: 0>]()
-  s = Split[T=float, num_split=4](zero, i)
-  r = Mul[T=float](Func/_2, Func/_3)
-  l = Mul[T=float](Func/_0, Func/_1)
-  o = AddN[N=2, T=float](Func/_4, Func/_5)
-  Func/_0 = Identity[T=float](s)
-  Func/_1 = Identity[T=float](s:1)
-  Func/_2 = Identity[T=float](s:2)
-  Func/_3 = Identity[T=float](s:3)
-  Func/_4 = Identity[T=float](l)
-  Func/_5 = Identity[T=float](r)
-}
-)P";
-  EXPECT_EQ(Optimize(RemoveListArrayConverter, func), e1);
-
-  const char* e2 = R"P(
-(i:float) -> (o:float) {
-  zero = Const[dtype=int32, value=Tensor<type: int32 shape: [] values: 0>]()
-  s = Split[T=float, num_split=4](zero, i)
-  r = Mul[T=float](s:2, s:3)
-  l = Mul[T=float](s, s:1)
-  o = AddN[N=2, T=float](l, r)
-}
-)P";
-  auto remove_listarray_and_identity = [](Graph* g) {
-    return RemoveListArrayConverter(g) && RemoveIdentityNodes(g);
-  };
-  EXPECT_EQ(Optimize(remove_listarray_and_identity, func), e2);
+  {
+    Scope scope = Scope::NewRootScope();
+    auto i = ops::_Arg(scope.WithOpName("i"), DT_FLOAT, 0);
+    auto zero = ops::Const(scope.WithOpName("zero"), 0);
+    auto s = ops::Split(scope.WithOpName("s"), zero, i, 4);
+    auto a = ops::_ArrayToList(scope.WithOpName("a"), s.output,
+                               {DT_FLOAT, DT_FLOAT, DT_FLOAT, DT_FLOAT});
+    auto r = ops::Mul(scope.WithOpName("r"), a[2], a[3]);
+    auto l = ops::Mul(scope.WithOpName("l"), a[0], a[1]);
+    auto x = ops::_ListToArray(scope.WithOpName("x"),
+                               std::initializer_list<Input>{l, r}, DT_FLOAT, 2);
+    auto o = ops::AddN(scope.WithOpName("o"), x.output);
+    auto o_ret = ops::_Retval(scope.WithOpName("o_RetVal"), o, 0);
+    GraphDef expected;
+    TF_ASSERT_OK(scope.ToGraphDef(&expected));
+    TF_EXPECT_GRAPH_EQ(expected, Optimize(DoNothing, func));
+  }
+
+  {
+    Scope scope = Scope::NewRootScope();
+    auto i = ops::_Arg(scope.WithOpName("i"), DT_FLOAT, 0);
+    auto zero = ops::Const(scope.WithOpName("zero"), 0);
+    auto s = ops::Split(scope.WithOpName("s"), zero, i, 4);
+    auto func_0 = ops::Identity(scope.WithOpName("Func/_0"), s[0]);
+    auto func_1 = ops::Identity(scope.WithOpName("Func/_1"), s[1]);
+    auto func_2 = ops::Identity(scope.WithOpName("Func/_2"), s[2]);
+    auto func_3 = ops::Identity(scope.WithOpName("Func/_3"), s[3]);
+    auto r = ops::Mul(scope.WithOpName("r"), func_2, func_3);
+    auto l = ops::Mul(scope.WithOpName("l"), func_0, func_1);
+    auto func_4 = ops::Identity(scope.WithOpName("Func/_4"), l);
+    auto func_5 = ops::Identity(scope.WithOpName("Func/_5"), r);
+    auto o = ops::AddN(scope.WithOpName("o"),
+                       std::initializer_list<Input>{func_4, func_5});
+    auto o_ret = ops::_Retval(scope.WithOpName("o_RetVal"), o, 0);
+    GraphDef expected;
+    TF_ASSERT_OK(scope.ToGraphDef(&expected));
+    TF_EXPECT_GRAPH_EQ(expected, Optimize(RemoveListArrayConverter, func));
+  }
+
+  {
+    Scope scope = Scope::NewRootScope();
+    auto i = ops::_Arg(scope.WithOpName("i"), DT_FLOAT, 0);
+    auto zero = ops::Const(scope.WithOpName("zero"), 0);
+    auto s = ops::Split(scope.WithOpName("s"), zero, i, 4);
+    auto r = ops::Mul(scope.WithOpName("r"), s[2], s[3]);
+    auto l = ops::Mul(scope.WithOpName("l"), s[0], s[1]);
+    auto o =
+        ops::AddN(scope.WithOpName("o"), std::initializer_list<Input>{l, r});
+    auto o_ret = ops::_Retval(scope.WithOpName("o_RetVal"), o, 0);
+    GraphDef expected;
+    TF_ASSERT_OK(scope.ToGraphDef(&expected));
+
+    auto remove_listarray_and_identity = [](Graph* g) {
+      return RemoveListArrayConverter(g) && RemoveIdentityNodes(g);
+    };
+    TF_EXPECT_GRAPH_EQ(expected, Optimize(remove_listarray_and_identity, func));
+  }
 }
 
 TEST(OptimizationTest, RemoveListArrayConverter_WithContolDeps) {
@@ -911,33 +1220,48 @@ TEST(OptimizationTest, RemoveListArrayConverter_WithContolDeps) {
         {"x"}}},
       {{"o", "o:sum"}});
 
-  const char* e0 = R"P(
-(i:float) -> (o:float) {
-  dummy = Const[dtype=int32, value=Tensor<type: int32 shape: [] values: 0>]()
-  x = _ListToArray[N=2, T=float, Tin={float, float}](i, i) @ dummy
-  o = AddN[N=2, T=float](x, x:1) @ x
-}
-)P";
-  EXPECT_EQ(Optimize(DoNothing, func), e0);
-
-  const char* e1 = R"P(
-(i:float) -> (o:float) {
-  dummy = Const[dtype=int32, value=Tensor<type: int32 shape: [] values: 0>]()
-  o = AddN[N=2, T=float](Func/_0, Func/_1) @ Func/_3
-  Func/_0 = Identity[T=float](i) @ Func/_2
-  Func/_1 = Identity[T=float](i) @ Func/_2
-  Func/_2 = NoOp() @ dummy
-  Func/_3 = NoOp() @ Func/_0, Func/_1
-}
-)P";
-  EXPECT_EQ(Optimize(RemoveListArrayConverter, func), e1);
+  {
+    Scope s = Scope::NewRootScope();
+    auto i = ops::_Arg(s.WithOpName("i"), DT_FLOAT, 0);
+    auto dummy = ops::Const(s.WithOpName("dummy"), 0);
+    auto x = ops::_ListToArray(s.WithOpName("x").WithControlDependencies(dummy),
+                               std::initializer_list<Input>{i, i}, DT_FLOAT, 2);
+    auto o =
+        ops::AddN(s.WithOpName("o").WithControlDependencies({x.output[0].op()}),
+                  x.output);
+    auto o_ret = ops::_Retval(s.WithOpName("o_RetVal"), o, 0);
+    GraphDef expected;
+    TF_ASSERT_OK(s.ToGraphDef(&expected));
+    TF_EXPECT_GRAPH_EQ(expected, Optimize(DoNothing, func));
+  }
+
+  GraphDef expected;
+  {
+    Scope s = Scope::NewRootScope();
+    auto i = ops::_Arg(s.WithOpName("i"), DT_FLOAT, 0);
+    auto dummy = ops::Const(s.WithOpName("dummy"), 0);
+    auto func_2 =
+        ops::NoOp(s.WithOpName("Func/_2").WithControlDependencies(dummy));
+    auto func_0 = ops::Identity(
+        s.WithOpName("Func/_0").WithControlDependencies({func_2}), i);
+    auto func_1 = ops::Identity(
+        s.WithOpName("Func/_1").WithControlDependencies({func_2}), i);
+    auto func_3 = ops::NoOp(s.WithOpName("Func/_3").WithControlDependencies(
+        {func_0.output.op(), func_1.output.op()}));
+    auto o = ops::AddN(s.WithOpName("o").WithControlDependencies({func_3}),
+                       std::initializer_list<Input>{func_0, func_1});
+    auto o_ret = ops::_Retval(s.WithOpName("o_RetVal"), o, 0);
+    TF_ASSERT_OK(s.ToGraphDef(&expected));
+  }
+  TF_EXPECT_GRAPH_EQ(expected, Optimize(RemoveListArrayConverter, func));
 
   auto remove_listarray_and_identity = [](Graph* g) {
     return RemoveListArrayConverter(g) && RemoveIdentityNodes(g);
   };
   // NOTE: We are not removing Identity nodes with any control
   // dependencies yet.
-  EXPECT_EQ(Optimize(remove_listarray_and_identity, func), e1);
+  TF_EXPECT_GRAPH_EQ(expected, Optimize(remove_listarray_and_identity, func));
 }
 
+}  // end namespace
 }  // end namespace tensorflow
diff --git a/tensorflow/core/common_runtime/gpu/gpu_device.cc b/tensorflow/core/common_runtime/gpu/gpu_device.cc
index ce2cc604e4417e0d9378838883ec1b3a94dcb04f..e2ad18f33bdc452279aaec48898744b9bc151ceb 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_device.cc
+++ b/tensorflow/core/common_runtime/gpu/gpu_device.cc
@@ -179,10 +179,9 @@ BaseGPUDevice::BaseGPUDevice(const SessionOptions& options, const string& name,
                              int gpu_id, const string& physical_device_desc,
                              Allocator* gpu_allocator, Allocator* cpu_allocator,
                              bool sync_every_op, int32 max_streams)
-    : LocalDevice(options,
-                  Device::BuildDeviceAttributes(name, DEVICE_GPU, memory_limit,
-                                                locality, physical_device_desc),
-                  gpu_allocator),
+    : LocalDevice(options, Device::BuildDeviceAttributes(name, DEVICE_GPU,
+                                                         memory_limit, locality,
+                                                         physical_device_desc)),
       gpu_allocator_(gpu_allocator),
       cpu_allocator_(cpu_allocator),
       gpu_id_(gpu_id),
@@ -465,6 +464,14 @@ Status BaseGPUDevice::MakeTensorFromProto(const TensorProto& tensor_proto,
                               DataTypeString(parsed.dtype()), " tensor");
     }
     Tensor copy(GetAllocator(alloc_attrs), parsed.dtype(), parsed.shape());
+
+    // If the tensor is not initialized, we likely ran out of memory.
+    if (!copy.IsInitialized()) {
+      return errors::ResourceExhausted(
+          "OOM when allocating tensor of shape ", parsed.shape().DebugString(),
+          " and type ", DataTypeString(parsed.dtype()));
+    }
+
     port::Tracing::ScopedAnnotation annotation("MakeTensorFromProto");
     Notification n;
     device_contexts_[0]->CopyCPUTensorToDevice(&parsed, this, &copy,
@@ -560,15 +567,14 @@ int64 MinSystemMemory(int64 available_memory) {
   // We use the following heuristic for now:
   //
   // If the available_memory is < 2GiB, we allocate 200MiB to system memory.
-  // Otherwise, allocate 300MiB to system memory.
+  // Otherwise, allocate max(300MiB, 0.05 * available_memory) to system memory.
   //
-  // In the future we could be more sophisticated by using a table of
-  // devices.
+  // In the future we could be more sophisticated by using a table of devices.
   if (available_memory < (1LL << 31)) {
     // 200MiB
     return 209715200LL;
   } else {
-    // max(300 MiB, 0.95 * available_memory)
+    // max(300 MiB, 0.05 * available_memory)
     return std::max(314572800LL, static_cast<int64>(available_memory * 0.05));
   }
 }
diff --git a/tensorflow/core/common_runtime/gpu/gpu_tracer_test.cc b/tensorflow/core/common_runtime/gpu/gpu_tracer_test.cc
index b1be278ab4ff57b8163e3f0a8e056278a097215c..aaa25ad345e4eea508c163f738189e0215968db7 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_tracer_test.cc
+++ b/tensorflow/core/common_runtime/gpu/gpu_tracer_test.cc
@@ -82,7 +82,7 @@ class GPUTracerTest : public ::testing::Test {
   }
 
  protected:
-  void ExpectFailure(Status status, error::Code code) {
+  void ExpectFailure(const Status& status, error::Code code) {
     EXPECT_FALSE(status.ok());
     if (!status.ok()) {
       LOG(INFO) << "Status message: " << status.error_message();
diff --git a/tensorflow/core/common_runtime/gpu/gpu_util.cc b/tensorflow/core/common_runtime/gpu/gpu_util.cc
index 522db80d7fac25868785a9b3a77696964217245d..71f82ec9a1bc0d13cb72c63f08d0c6cb9c125f38 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_util.cc
+++ b/tensorflow/core/common_runtime/gpu/gpu_util.cc
@@ -227,7 +227,7 @@ void GPUUtil::DeviceToDeviceCopy(DeviceContext* send_dev_context,
     }
     // Since we want to use the memory from recv_stream in the
     // send_device_to_device_stream, add a dependency to make sure the memory is
-    // truely free.
+    // truly free.
     // TODO(zhengxq): remove this dependency when we switch to a better way
     // to make sure the memory is free.
     send_device_to_device_stream->ThenWaitFor(recv_stream);
@@ -322,7 +322,7 @@ void GPUUtil::CopyCPUTensorToGPU(const Tensor* cpu_tensor,
     done(errors::Internal("No send gpu copy-out-stream is available."));
     return;
   }
-  // Wait for the recv-stream to make sure the buffer is truely available.
+  // Wait for the recv-stream to make sure the buffer is truly available.
   recv_host_to_device_stream->ThenWaitFor(recv_stream);
 
   const int64 total_bytes = cpu_tensor->TotalBytes();
diff --git a/tensorflow/core/common_runtime/gpu/process_state.cc b/tensorflow/core/common_runtime/gpu/process_state.cc
index cee7b6d78adf6da38ae6cb3c655113a16b0b73f8..0e21e37fd3e720135b0be64971b9138314a8b04b 100644
--- a/tensorflow/core/common_runtime/gpu/process_state.cc
+++ b/tensorflow/core/common_runtime/gpu/process_state.cc
@@ -159,9 +159,36 @@ Allocator* ProcessState::GetCPUAllocator(int numa_node) {
   numa_node = 0;
   mutex_lock lock(mu_);
   while (cpu_allocators_.size() <= static_cast<size_t>(numa_node)) {
-    Allocator* allocator =
-        new PoolAllocator(100 /*pool_size_limit*/, true /*auto_resize*/,
-                          new BasicCPUAllocator(), new NoopRounder, "cpu_pool");
+    bool use_bfc_allocator = false;
+    // TODO(reedwm): Switch default to BGFAllocator if it's at least as fast and
+    // efficient.
+    Status status = ReadBoolFromEnvVar("TF_CPU_ALLOCATOR_USE_BFC", false,
+                                       &use_bfc_allocator);
+    if (!status.ok()) {
+      LOG(ERROR) << "GetCPUAllocator: " << status.error_message();
+    }
+    Allocator* allocator;
+    if (use_bfc_allocator) {
+      // TODO(reedwm): evaluate whether 64GB by default is the best choice.
+      int64 cpu_mem_limit_in_mb = -1;
+      Status status = ReadInt64FromEnvVar("TF_CPU_BFC_MEM_LIMIT_IN_MB",
+                                          1LL << 16 /*64GB max by default*/,
+                                          &cpu_mem_limit_in_mb);
+      if (!status.ok()) {
+        LOG(ERROR) << "GetCPUAllocator: " << status.error_message();
+      }
+      int64 cpu_mem_limit = cpu_mem_limit_in_mb * (1LL << 20);
+      allocator = new BFCAllocator(new BasicCPUAllocator(), cpu_mem_limit,
+                                   true /*allow_growth*/,
+                                   "bfc_cpu_allocator_for_gpu" /*name*/);
+      VLOG(2) << "Using BFCAllocator with memory limit of "
+              << cpu_mem_limit_in_mb << " MB for ProcessState CPU allocator";
+    } else {
+      allocator = new PoolAllocator(
+          100 /*pool_size_limit*/, true /*auto_resize*/,
+          new BasicCPUAllocator(), new NoopRounder, "cpu_pool");
+      VLOG(2) << "Using PoolAllocator for ProcessState CPU allocator";
+    }
     if (LogMemory::IsEnabled()) {
       // Wrap the allocator to track allocation ids for better logging
       // at the cost of performance.
diff --git a/tensorflow/core/common_runtime/graph_optimizer.cc b/tensorflow/core/common_runtime/graph_optimizer.cc
index 68496cb729249f82106c0b114bd5ef8f6d643ba7..edfecfae06e0bd02ce6b241b11786fb13c61a067 100644
--- a/tensorflow/core/common_runtime/graph_optimizer.cc
+++ b/tensorflow/core/common_runtime/graph_optimizer.cc
@@ -18,6 +18,7 @@ limitations under the License.
 #include "tensorflow/core/common_runtime/constant_folding.h"
 #include "tensorflow/core/common_runtime/function.h"
 #include "tensorflow/core/graph/algorithm.h"
+#include "tensorflow/core/graph/graph_constructor.h"
 #include "tensorflow/core/graph/node_builder.h"
 #include "tensorflow/core/graph/optimizer_cse.h"
 
@@ -56,7 +57,10 @@ void GraphOptimizer::Optimize(FunctionLibraryRuntime* runtime, Env* env,
 
     if (opts_.do_constant_folding()) {
       ConstantFoldingOptions cf_opts;
-      if (DoConstantFolding(cf_opts, runtime, env, device, g)) {
+      bool was_mutated;
+      ConstantFold(cf_opts, runtime, env, device, g, &was_mutated)
+          .IgnoreError();
+      if (was_mutated) {
         RemoveDeadNodes(g);
         DumpGraph("ConstFolding", g);
         changed = true;
diff --git a/tensorflow/core/common_runtime/graph_runner.cc b/tensorflow/core/common_runtime/graph_runner.cc
index a85fbbf88ffe59439fdc9b68bb2afe4582e94d9a..74b2252c7c6a4530cce3ecf59294d1f2b8798933 100644
--- a/tensorflow/core/common_runtime/graph_runner.cc
+++ b/tensorflow/core/common_runtime/graph_runner.cc
@@ -15,6 +15,7 @@ limitations under the License.
 
 #include "tensorflow/core/common_runtime/graph_runner.h"
 
+#include "tensorflow/core/common_runtime/device_factory.h"
 #include "tensorflow/core/common_runtime/executor.h"
 #include "tensorflow/core/common_runtime/function.h"
 #include "tensorflow/core/common_runtime/memory_types.h"
diff --git a/tensorflow/core/common_runtime/graph_runner.h b/tensorflow/core/common_runtime/graph_runner.h
index 24e8b04c4635520e2fef383953e6aae60c926aad..1e4ae7722794ca527bcea023d992d92839ee46c9 100644
--- a/tensorflow/core/common_runtime/graph_runner.h
+++ b/tensorflow/core/common_runtime/graph_runner.h
@@ -20,17 +20,12 @@ limitations under the License.
 #include <string>
 #include <vector>
 
-#include "tensorflow/core/common_runtime/device_factory.h"
+#include "tensorflow/core/common_runtime/device.h"
 #include "tensorflow/core/framework/function.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/graph/graph.h"
-#include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/core/status.h"
-#include "tensorflow/core/platform/macros.h"
-#include "tensorflow/core/platform/mutex.h"
-#include "tensorflow/core/platform/thread_annotations.h"
-#include "tensorflow/core/platform/types.h"
-#include "tensorflow/core/public/session.h"
+#include "tensorflow/core/platform/env.h"
 
 namespace tensorflow {
 
diff --git a/tensorflow/core/common_runtime/kernel_benchmark_testlib.cc b/tensorflow/core/common_runtime/kernel_benchmark_testlib.cc
index 72bc37d43507ce52d1ed62d61c8cbf7b139752c1..4e14e6fe1a6204dd2e2dc63d28e5e1ca1de9c4d2 100644
--- a/tensorflow/core/common_runtime/kernel_benchmark_testlib.cc
+++ b/tensorflow/core/common_runtime/kernel_benchmark_testlib.cc
@@ -103,13 +103,13 @@ void Benchmark::Run(int iters) { RunWithArgs({}, {}, iters); }
 
 string GetRendezvousKey(const Node* node) {
   string send_device;
-  TF_CHECK_OK(GetNodeAttr(node->def(), "send_device", &send_device));
+  TF_CHECK_OK(GetNodeAttr(node->attrs(), "send_device", &send_device));
   string recv_device;
-  TF_CHECK_OK(GetNodeAttr(node->def(), "recv_device", &recv_device));
+  TF_CHECK_OK(GetNodeAttr(node->attrs(), "recv_device", &recv_device));
   string tensor_name;
-  TF_CHECK_OK(GetNodeAttr(node->def(), "tensor_name", &tensor_name));
+  TF_CHECK_OK(GetNodeAttr(node->attrs(), "tensor_name", &tensor_name));
   uint64 send_device_incarnation;
-  TF_CHECK_OK(GetNodeAttr(node->def(), "send_device_incarnation",
+  TF_CHECK_OK(GetNodeAttr(node->attrs(), "send_device_incarnation",
                           reinterpret_cast<int64*>(&send_device_incarnation)));
   return Rendezvous::CreateKey(send_device, send_device_incarnation,
                                recv_device, tensor_name, FrameAndIter(0, 0));
diff --git a/tensorflow/core/common_runtime/local_device.cc b/tensorflow/core/common_runtime/local_device.cc
index 0a6342ed736d285a80db2ebef8fcf5e541000b6a..3f7c9f68dba6aa9a60edd145be064a626ff7a5bb 100644
--- a/tensorflow/core/common_runtime/local_device.cc
+++ b/tensorflow/core/common_runtime/local_device.cc
@@ -60,10 +60,8 @@ struct LocalDevice::EigenThreadPoolInfo {
 };
 
 LocalDevice::LocalDevice(const SessionOptions& options,
-                         const DeviceAttributes& attributes,
-                         Allocator* device_allocator)
-    : Device(options.env, attributes, device_allocator),
-      owned_tp_info_(nullptr) {
+                         const DeviceAttributes& attributes)
+    : Device(options.env, attributes), owned_tp_info_(nullptr) {
   // If we're running on the CPU, log warnings if we're not compiled using the
   // best flags for performance.
   port::WarnAboutUnusedCPUFeatures();
diff --git a/tensorflow/core/common_runtime/local_device.h b/tensorflow/core/common_runtime/local_device.h
index d1c27c6248143063e0204f98569da9f1b71042c5..84a4f66db4a2e749d78e97758739f95f5bddb14e 100644
--- a/tensorflow/core/common_runtime/local_device.h
+++ b/tensorflow/core/common_runtime/local_device.h
@@ -33,8 +33,8 @@ struct SessionOptions;
 // GPUDevice into more 'process-wide' abstractions.
 class LocalDevice : public Device {
  public:
-  LocalDevice(const SessionOptions& options, const DeviceAttributes& attributes,
-              Allocator* device_allocator);
+  LocalDevice(const SessionOptions& options,
+              const DeviceAttributes& attributes);
   ~LocalDevice() override;
 
  private:
diff --git a/tensorflow/core/common_runtime/memory_types.cc b/tensorflow/core/common_runtime/memory_types.cc
index 80c483e70b0592041d510aeccbd799e81fd2e5c7..db053dd2fa0724f4377f20fe1616fcb31f3478cb 100644
--- a/tensorflow/core/common_runtime/memory_types.cc
+++ b/tensorflow/core/common_runtime/memory_types.cc
@@ -14,6 +14,8 @@ limitations under the License.
 ==============================================================================*/
 #include "tensorflow/core/common_runtime/memory_types.h"
 
+#include <utility>
+
 #include "tensorflow/core/framework/memory_types.h"
 #include "tensorflow/core/framework/node_def_builder.h"
 #include "tensorflow/core/graph/node_builder.h"
@@ -43,8 +45,8 @@ struct EndpointEq {
 };
 
 static Status ProcessMemoryTypes(
-    DeviceType device_type, const Graph* g,
-    std::function<Status(const Edge*, MemoryType, MemoryType)> fn) {
+    const DeviceType& device_type, const Graph* g,
+    const std::function<Status(const Edge*, MemoryType, MemoryType)>& fn) {
   if (device_type != DEVICE_GPU) {
     // On non-GPU devices, HOST_MEMORY and DEVICE_MEMORY are always
     // compatible.
@@ -88,17 +90,18 @@ static Status ProcessMemoryTypes(
   return Status::OK();
 }
 
-Status ValidateMemoryTypes(DeviceType device_type, const Graph* g) {
-  return ProcessMemoryTypes(device_type, g, [g](const Edge* e, MemoryType sm,
-                                                MemoryType dm) {
-    if (sm == dm) {
-      return Status::OK();
-    }
-    return errors::Internal(
-        "Memory type mismatch (", sm, " ", dm, ") between :", e->src()->id(),
-        ":", e->src_output(), " and ", e->dst()->id(), ":", e->dst_input(),
-        " : from ", e->src()->DebugString(), " to ", e->dst()->DebugString());
-  });
+Status ValidateMemoryTypes(const DeviceType& device_type, const Graph* g) {
+  return ProcessMemoryTypes(
+      device_type, g, [g](const Edge* e, MemoryType sm, MemoryType dm) {
+        if (sm == dm) {
+          return Status::OK();
+        }
+        return errors::Internal(
+            "Memory type mismatch (", sm, " ", dm,
+            ") between :", e->src()->id(), ":", e->src_output(), " and ",
+            e->dst()->id(), ":", e->dst_input(), " : from ",
+            e->src()->DebugString(), " to ", e->dst()->DebugString());
+      });
 }
 
 static Node* Send(Graph* g, const string& device_name, bool host,
@@ -132,8 +135,8 @@ static Node* Recv(Graph* g, const string& device_name, bool host,
   return ret;
 }
 
-Status EnsureMemoryTypes(DeviceType device_type, const string& device_name,
-                         Graph* g) {
+Status EnsureMemoryTypes(const DeviceType& device_type,
+                         const string& device_name, Graph* g) {
   struct Item {
     const Edge* edge;
     MemoryType sm;
@@ -185,7 +188,7 @@ Status EnsureMemoryTypes(DeviceType device_type, const string& device_name,
   return ValidateMemoryTypes(device_type, g);
 }
 
-Status MemoryTypeForOutput(DeviceType device_type, const Graph* g,
+Status MemoryTypeForOutput(const DeviceType& device_type, const Graph* g,
                            const Node* n, int index, MemoryType* memory_type) {
   MemoryTypeVector inp_mvec;
   MemoryTypeVector out_mvec;
diff --git a/tensorflow/core/common_runtime/memory_types.h b/tensorflow/core/common_runtime/memory_types.h
index ccbb8cffb17d99802df85fb502c5ac7c6ca604d4..fa0a7595f32ac8bb43010dcd3a407825ef79f618 100644
--- a/tensorflow/core/common_runtime/memory_types.h
+++ b/tensorflow/core/common_runtime/memory_types.h
@@ -24,7 +24,7 @@ namespace tensorflow {
 
 // Returns an error iff *g running on a single device of 'device_type'
 // has memory type mismatch for any edge's source and destination.
-Status ValidateMemoryTypes(DeviceType device_type, const Graph* g);
+Status ValidateMemoryTypes(const DeviceType& device_type, const Graph* g);
 
 // Updates '*g' so that every edge's source and destination has
 // compatible memory types by inserting proper HostSend/Recv and
@@ -35,12 +35,12 @@ Status ValidateMemoryTypes(DeviceType device_type, const Graph* g);
 // Returns OK if '*g' is updated properly (ValidateMemoryTypes(g) must
 // be OK). Otherwise, returns an error and '*g' may be in an
 // invalidate state and the caller should discard it.
-Status EnsureMemoryTypes(DeviceType device_type, const string& device_name,
-                         Graph* g);
+Status EnsureMemoryTypes(const DeviceType& device_type,
+                         const string& device_name, Graph* g);
 
 // Get the memory type for 'index'th output of node 'n' in graph 'g', when
 // running on 'device_type'.
-Status MemoryTypeForOutput(DeviceType device_type, const Graph* g,
+Status MemoryTypeForOutput(const DeviceType& device_type, const Graph* g,
                            const Node* n, int index, MemoryType* memory_type);
 
 }  // end namespace tensorflow
diff --git a/tensorflow/core/common_runtime/parallel_concat_optimizer.cc b/tensorflow/core/common_runtime/parallel_concat_optimizer.cc
index ffbfbc74f16f5f0fb74c5a32ca23e57c84857022..bbd38a2e0775857f0a1b652c8814ee6d8e3b0821 100644
--- a/tensorflow/core/common_runtime/parallel_concat_optimizer.cc
+++ b/tensorflow/core/common_runtime/parallel_concat_optimizer.cc
@@ -49,11 +49,11 @@ class ParallelConcatRemovePass : public GraphOptimizationPass {
       }
     }
     for (Node* n : matches) {
-      AttrSlice n_attrs(n->def());
+      AttrSlice n_attrs = n->attrs();
       auto base_make_node = [n, g, &n_attrs](const string& op,
                                              const string& name) {
         NodeBuilder node_builder(name, op);
-        node_builder.Device(n->def().device());
+        node_builder.Device(n->requested_device());
         string colo;
         if (GetNodeAttr(n_attrs, "_class", &colo).ok()) {
           node_builder.Attr("_class", colo);
diff --git a/tensorflow/core/common_runtime/renamed_device.cc b/tensorflow/core/common_runtime/renamed_device.cc
new file mode 100644
index 0000000000000000000000000000000000000000..fa9713735edd05c36e1787be0e8c89e69c043fb2
--- /dev/null
+++ b/tensorflow/core/common_runtime/renamed_device.cc
@@ -0,0 +1,54 @@
+/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/common_runtime/renamed_device.h"
+
+namespace tensorflow {
+
+// TODO(saeta): Convert to returning a std::unique_ptr?
+/* static */
+Device* RenamedDevice::NewRenamedDevice(const string& new_base,
+                                        Device* underlying,
+                                        bool owns_underlying) {
+  DeviceNameUtils::ParsedName parsed_name;
+  CHECK(DeviceNameUtils::ParseFullName(new_base, &parsed_name));
+  DeviceNameUtils::ParsedName underlying_parsed_name =
+      underlying->parsed_name();
+  CHECK(underlying_parsed_name.has_type);
+  CHECK(underlying_parsed_name.has_id);
+  parsed_name.type = underlying_parsed_name.type;
+  parsed_name.id = underlying_parsed_name.id;
+  string name = DeviceNameUtils::FullName(parsed_name.job, parsed_name.replica,
+                                          parsed_name.task, parsed_name.type,
+                                          parsed_name.id);
+  DeviceAttributes attributes(underlying->attributes());
+  attributes.set_name(name);
+  return new RenamedDevice(underlying, attributes, owns_underlying);
+}
+
+RenamedDevice::RenamedDevice(Device* underlying,
+                             const DeviceAttributes& attributes,
+                             bool owns_underlying)
+    : Device(underlying->env(), attributes),
+      underlying_(underlying),
+      owns_underlying_(owns_underlying) {}
+
+RenamedDevice::~RenamedDevice() {
+  if (owns_underlying_) {
+    delete underlying_;
+  }
+}
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/common_runtime/renamed_device.h b/tensorflow/core/common_runtime/renamed_device.h
new file mode 100644
index 0000000000000000000000000000000000000000..0158e18cedc3b9b136258085641492c94de9e612
--- /dev/null
+++ b/tensorflow/core/common_runtime/renamed_device.h
@@ -0,0 +1,119 @@
+/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef THIRD_PARTY_TENSORFLOW_CORE_COMMON_RUNTIME_RENAMED_DEVICE_H_
+#define THIRD_PARTY_TENSORFLOW_CORE_COMMON_RUNTIME_RENAMED_DEVICE_H_
+
+#include "tensorflow/core/common_runtime/device.h"
+#include "tensorflow/core/util/device_name_utils.h"
+
+namespace tensorflow {
+
+// Wraps a device with a new name, delegating work to the wrapped device.
+//
+// This class is used to wrap local devices when using clusterspec propagation
+// where the name of a particular device may change in the context of a given
+// session.
+class RenamedDevice : public Device {
+ public:
+  static Device* NewRenamedDevice(const string& new_base, Device* underlying,
+                                  bool owns_underlying);
+  ~RenamedDevice() override;
+
+  // Below are virtual methods defined on DeviceBase
+  bool RequiresRecordingAccessedTensors() const override {
+    return underlying_->RequiresRecordingAccessedTensors();
+  }
+
+  const CpuWorkerThreads* tensorflow_cpu_worker_threads() const override {
+    return underlying_->tensorflow_cpu_worker_threads();
+  }
+
+  const GpuDeviceInfo* tensorflow_gpu_device_info() const override {
+    return underlying_->tensorflow_gpu_device_info();
+  }
+
+  Allocator* GetAllocator(AllocatorAttributes attr) override {
+    return underlying_->GetAllocator(attr);
+  }
+
+  Allocator* GetStepAllocator(AllocatorAttributes attr,
+                              ResourceMgr* step_resource_manager) override {
+    return underlying_->GetStepAllocator(attr, step_resource_manager);
+  }
+
+  const Eigen::ThreadPoolDevice* eigen_cpu_device() override {
+    return underlying_->eigen_cpu_device();
+  }
+
+#ifdef TENSORFLOW_USE_SYCL
+  const Eigen::SyclDevice* eigen_sycl_device() const override {
+    return underlying_->eigen_sycl_device();
+  }
+#endif
+
+  PerOpGpuDevice* MakeGpuDevice() override {
+    return underlying_->MakeGpuDevice();
+  }
+
+  void ReinitializeGpuDevice(OpKernelContext* context, PerOpGpuDevice* device,
+                             DeviceContext* dc, Allocator* allocator) override {
+    underlying_->ReinitializeGpuDevice(context, device, dc, allocator);
+  }
+
+  Status MakeTensorFromProto(const TensorProto& tensor_proto,
+                             const AllocatorAttributes alloc_attrs,
+                             Tensor* tensor) override {
+    return underlying_->MakeTensorFromProto(tensor_proto, alloc_attrs, tensor);
+  }
+
+  // Below are virtual methods defined on Device
+
+  void Compute(OpKernel* op_kernel, OpKernelContext* context) override {
+    underlying_->Compute(op_kernel, context);
+  }
+
+  void ComputeAsync(AsyncOpKernel* op_kernel, OpKernelContext* context,
+                    AsyncOpKernel::DoneCallback done) override {
+    underlying_->ComputeAsync(op_kernel, context, std::move(done));
+  }
+
+  void ConsumeListOfAccessedTensors(
+      DeviceContext* context, const TensorReferenceVector& tensors) override {
+    underlying_->ConsumeListOfAccessedTensors(context, tensors);
+  }
+
+  Status Sync() override { return underlying_->Sync(); }
+
+  Status MaybeRewriteGraph(const FunctionDefLibrary& library,
+                           std::unique_ptr<Graph>* graph) override {
+    return underlying_->MaybeRewriteGraph(library, graph);
+  }
+
+  Status FillContextMap(const Graph* graph,
+                        DeviceContextMap* device_context_map) override {
+    return underlying_->FillContextMap(graph, device_context_map);
+  }
+
+ private:
+  RenamedDevice(Device* underlying, const DeviceAttributes& attributes,
+                bool owns_underlying);
+  Device* const underlying_;
+  const bool owns_underlying_;
+};
+
+}  // namespace tensorflow
+
+#endif  // THIRD_PARTY_TENSORFLOW_CORE_COMMON_RUNTIME_RENAMED_DEVICE_H_
diff --git a/tensorflow/core/common_runtime/resource_variable_read_optimizer.cc b/tensorflow/core/common_runtime/resource_variable_read_optimizer.cc
index c179e94c36b2fd387c5d581748e336a4fa9959f4..b40924ef3a8618f1b132a243136653005eb6a93c 100644
--- a/tensorflow/core/common_runtime/resource_variable_read_optimizer.cc
+++ b/tensorflow/core/common_runtime/resource_variable_read_optimizer.cc
@@ -55,7 +55,7 @@ class ResourceVariableReadPass : public GraphOptimizationPass {
     }
     for (Node* read : matches) {
       DataType dtype;
-      TF_RETURN_IF_ERROR(GetNodeAttr(AttrSlice(read->def()), "dtype", &dtype));
+      TF_RETURN_IF_ERROR(GetNodeAttr(read->attrs(), "dtype", &dtype));
       std::vector<Node*> in_control_edges;
       std::vector<std::pair<Node*, int>> in_edges;
       for (const Edge* edge : read->in_edges()) {
diff --git a/tensorflow/core/common_runtime/shape_refiner.cc b/tensorflow/core/common_runtime/shape_refiner.cc
index f2dff0bf75c71d80ab6fa536e3313e710055afe1..876f34b99118d2793acca12536fd2c2c6a0b328e 100644
--- a/tensorflow/core/common_runtime/shape_refiner.cc
+++ b/tensorflow/core/common_runtime/shape_refiner.cc
@@ -19,6 +19,7 @@ limitations under the License.
 #include <unordered_set>
 #include <vector>
 
+#include "tensorflow/core/framework/common_shape_fns.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/gtl/stl_util.h"
@@ -79,17 +80,15 @@ Status ShapeRefiner::AddNode(const Node* node) {
   // Get the shape function for this node
   const OpRegistrationData* op_reg_data;
   TF_RETURN_IF_ERROR(ops_registry_->LookUp(node->type_string(), &op_reg_data));
-  if (op_reg_data->shape_inference_fn == nullptr) {
+  if (op_reg_data->shape_inference_fn == nullptr &&
+      require_shape_inference_fns_) {
     return errors::InvalidArgument(
         "No shape inference function exists for op '", node->type_string(),
         "', did you forget to define it?");
   }
 
   // This needs to be filled in with real data in a second pass.
-  std::vector<const Tensor*> input_tensors(node->num_inputs());
-  std::vector<Tensor> real_tensors(node->num_inputs());
-  std::vector<bool> attempted_materialization(node->num_inputs());
-  std::vector<bool> attempted_tensor_as_shape_conversion(node->num_inputs());
+  std::vector<const Tensor*> input_tensors(node->num_inputs(), nullptr);
   std::vector<ShapeHandle> input_tensors_as_shapes;
 
   // Create the inference context for this node with the existing input shapes.
@@ -102,70 +101,7 @@ Status ShapeRefiner::AddNode(const Node* node) {
   }
 
   // Run the shape inference function, and return if there was an error.
-  TF_RETURN_IF_ERROR(c->Run(op_reg_data->shape_inference_fn));
-
-  // We must run the shape function repeatedly, in case users write
-  // shape functions where they only conditionally call input_tensor()
-  // based on the values of another input tensor.
-  bool rerun_shape_fn;
-  do {
-    // If the result of running shape inference would have benefitted
-    // from knowing the values of input tensors, try to materialize
-    // the results of those tensors, and then run the shape inference
-    // function again using those known tensors.
-    rerun_shape_fn = false;
-
-    // NOTE: It is possible to batch the extraction and
-    // materialization of inputs, instead of materializing one input
-    // at a time like we do below.  If input-at-a-time computation
-    // becomes a bottleneck, we could separate ExtractConstantSubgraph
-    // into two functions: one that returns true if an input is
-    // derivable from constants, and another function that extracts
-    // the subgraph for multiple target nodes and executes the whole
-    // subgraph once.
-
-    for (int i = 0; i < c->num_inputs(); ++i) {
-      if (!c->requested_input_tensor(i)) {
-        continue;
-      }
-      // Check if we have not already filled in the requested input,
-      // and if not, try to materialize the tensors.
-      if (!attempted_materialization[i]) {
-        attempted_materialization[i] = true;
-
-        Tensor result;
-        bool evaluated = false;
-        TF_RETURN_IF_ERROR(
-            EvaluateConstantTensorForEdge(node, i, &evaluated, &result));
-        if (evaluated) {
-          real_tensors[i] = result;
-          input_tensors[i] = &real_tensors[i];
-          // We have more concrete information about a shape,
-          // so re-run shape inference.
-          rerun_shape_fn = true;
-        }
-      }
-      if (c->requested_input_tensor_as_partial_shape(i) &&
-          !attempted_tensor_as_shape_conversion[i]) {
-        attempted_tensor_as_shape_conversion[i] = true;
-        if (i >= input_tensors_as_shapes.size()) {
-          input_tensors_as_shapes.resize(i + 1);
-        }
-        ShapeHandle s;
-        TF_RETURN_IF_ERROR(ConstantPartialShape(c.get(), node, i, &s));
-        input_tensors_as_shapes[i] = s;
-        rerun_shape_fn = true;
-      }
-    }
-
-    if (rerun_shape_fn) {
-      // We have more information about the shapes on this pass,
-      // so re-run shape inference.
-      c->set_input_tensors(input_tensors);
-      c->set_input_tensors_as_shapes(input_tensors_as_shapes);
-      TF_RETURN_IF_ERROR(op_reg_data->shape_inference_fn(c.get()));
-    }
-  } while (rerun_shape_fn);
+  TF_RETURN_IF_ERROR(RunShapeFn(node, op_reg_data, c.get()));
 
   // Store the resulting InferenceContext object in the map.
   node_to_context_[node].swap(c);
@@ -201,6 +137,74 @@ Status ShapeRefiner::SetShape(const Node* node, int output_port,
   return Status::OK();
 }
 
+Status ShapeRefiner::UpdateNode(const Node* node, bool* refined) {
+  auto it = node_to_context_.find(node);
+  if (it == node_to_context_.end()) {
+    *refined = true;
+    return AddNode(node);
+  }
+  InferenceContext* node_context = it->second.get();
+
+  // Give up if the context wasn't successfully built by the AddNode() method.
+  TF_RETURN_IF_ERROR(node_context->construction_status());
+
+  // Check if the shapes of the nodes in the fan-in of this node have changed,
+  // and if they have update the node input shapes.
+  for (const Edge* e : node->in_edges()) {
+    if (e->IsControlEdge()) continue;
+
+    Node* input = e->src();
+    auto iter = node_to_context_.find(input);
+    if (iter == node_to_context_.end()) {
+      return errors::FailedPrecondition(
+          "Input ", e->dst_input(), " ('", input->name(), "') for '",
+          node->name(), "' was not previously added to ShapeRefiner.");
+    }
+
+    InferenceContext* c = iter->second.get();
+    DCHECK_GE(e->dst_input(), 0);
+    if (node_context->MergeInput(e->dst_input(), c->output(e->src_output()))) {
+      *refined = true;
+    }
+
+    // Also propagate handle shape and dtype of edges which are carrying
+    // resource handles.
+    if (e->src()->output_type(e->src_output()) == DT_RESOURCE) {
+      if (node_context->set_input_handle_dtype(
+              e->dst_input(), c->output_handle_dtype(e->src_output()))) {
+        *refined = true;
+      }
+      if (node_context->MergeInputHandleShape(
+              e->dst_input(), c->output_handle_shape(e->src_output()))) {
+        *refined = true;
+      }
+    }
+  }
+
+  if (!*refined) {
+    // No input shape has changed, we're done
+    return Status::OK();
+  }
+
+  // Get and run the shape function for this node to update the shapes of the
+  // outputs.
+  const OpRegistrationData* op_reg_data;
+  TF_RETURN_IF_ERROR(ops_registry_->LookUp(node->type_string(), &op_reg_data));
+  if (op_reg_data->shape_inference_fn == nullptr &&
+      require_shape_inference_fns_) {
+    return errors::InvalidArgument(
+        "No shape inference function exists for op '", node->type_string(),
+        "', did you forget to define it?");
+  }
+
+  if (!op_reg_data->shape_inference_fn) {
+    // There is nothing more we can infer
+    return Status::OK();
+  }
+
+  return RunShapeFn(node, op_reg_data, node_context);
+}
+
 Status ShapeRefiner::EvaluateConstantTensorForEdge(const Node* node,
                                                    int dst_idx, bool* evaluated,
                                                    Tensor* result) {
@@ -453,4 +457,93 @@ Status ShapeRefiner::ConstantPartialShape(InferenceContext* target_context,
   return Status::OK();
 }
 
+Status ShapeRefiner::RunShapeFn(const Node* node,
+                                const OpRegistrationData* op_reg_data,
+                                shape_inference::InferenceContext* c) {
+  // This will be filled in with real data in a second pass.
+  std::vector<const Tensor*> input_tensors(node->num_inputs(), nullptr);
+  std::vector<Tensor> real_tensors(node->num_inputs());
+  std::vector<bool> attempted_materialization(node->num_inputs());
+  std::vector<bool> attempted_tensor_as_shape_conversion(node->num_inputs());
+  std::vector<ShapeHandle> input_tensors_as_shapes;
+
+  // Run the shape inference function, and return if there was an error.
+  c->set_input_tensors(input_tensors);
+  c->set_input_tensors_as_shapes(input_tensors_as_shapes);
+  if (op_reg_data->shape_inference_fn) {
+    TF_RETURN_IF_ERROR(c->Run(op_reg_data->shape_inference_fn));
+  } else {
+    TF_RETURN_IF_ERROR(c->Run(shape_inference::UnknownShape));
+  }
+
+  // We must run the shape function repeatedly, in case users write
+  // shape functions where they only conditionally call input_tensor()
+  // based on the values of another input tensor.
+  bool rerun_shape_fn;
+  do {
+    // If the result of running shape inference would have benefitted
+    // from knowing the values of input tensors, try to materialize
+    // the results of those tensors, and then run the shape inference
+    // function again using those known tensors.
+    rerun_shape_fn = false;
+
+    // NOTE: It is possible to batch the extraction and
+    // materialization of inputs, instead of materializing one input
+    // at a time like we do below.  If input-at-a-time computation
+    // becomes a bottleneck, we could separate ExtractConstantSubgraph
+    // into two functions: one that returns true if an input is
+    // derivable from constants, and another function that extracts
+    // the subgraph for multiple target nodes and executes the whole
+    // subgraph once.
+
+    for (int i = 0; i < c->num_inputs(); ++i) {
+      if (!c->requested_input_tensor(i)) {
+        continue;
+      }
+      // Check if we have not already filled in the requested input,
+      // and if not, try to materialize the tensors.
+      if (!attempted_materialization[i]) {
+        attempted_materialization[i] = true;
+
+        Tensor result;
+        bool evaluated = false;
+        TF_RETURN_IF_ERROR(
+            EvaluateConstantTensorForEdge(node, i, &evaluated, &result));
+        if (evaluated) {
+          real_tensors[i] = result;
+          input_tensors[i] = &real_tensors[i];
+          // We have more concrete information about a shape,
+          // so re-run shape inference.
+          rerun_shape_fn = true;
+        }
+      }
+      if (c->requested_input_tensor_as_partial_shape(i) &&
+          !attempted_tensor_as_shape_conversion[i]) {
+        attempted_tensor_as_shape_conversion[i] = true;
+        if (i >= input_tensors_as_shapes.size()) {
+          input_tensors_as_shapes.resize(i + 1);
+        }
+        ShapeHandle s;
+        TF_RETURN_IF_ERROR(ConstantPartialShape(c, node, i, &s));
+        input_tensors_as_shapes[i] = s;
+        rerun_shape_fn = true;
+      }
+    }
+
+    if (rerun_shape_fn) {
+      // We have more information about the shapes on this pass,
+      // so re-run shape inference.
+      c->set_input_tensors(input_tensors);
+      c->set_input_tensors_as_shapes(input_tensors_as_shapes);
+      if (op_reg_data->shape_inference_fn) {
+        TF_RETURN_IF_ERROR(op_reg_data->shape_inference_fn(c));
+      } else {
+        TF_RETURN_IF_ERROR(shape_inference::UnknownShape(c));
+      }
+    }
+  } while (rerun_shape_fn);
+
+  return Status::OK();
+}
+
 }  // namespace tensorflow
diff --git a/tensorflow/core/common_runtime/shape_refiner.h b/tensorflow/core/common_runtime/shape_refiner.h
index f23f9361eb0b988a9e0715c09f335e52d326343b..75eb81c346f4f6087a12383a0111d4fd38eee9dc 100644
--- a/tensorflow/core/common_runtime/shape_refiner.h
+++ b/tensorflow/core/common_runtime/shape_refiner.h
@@ -55,6 +55,11 @@ class ShapeRefiner {
   Status SetShape(const Node* node, int output_port,
                   shape_inference::ShapeHandle shape);
 
+  // Update the input shapes of node in case the shapes of the fan-ins of 'node'
+  // have themselves been modified (For example, in case of incremental shape
+  // refinement). Sets refined to true if any of the node shape has changed.
+  Status UpdateNode(const Node* node, bool* refined);
+
   // Returns the InferenceContext for 'node', if present.
   shape_inference::InferenceContext* GetContext(const Node* node) const {
     auto it = node_to_context_.find(node);
@@ -65,9 +70,13 @@ class ShapeRefiner {
   }
 
   // Getters and setters for graph_def_version_.
-  int32 graph_def_version() { return graph_def_version_; }
+  int32 graph_def_version() const { return graph_def_version_; }
   void set_graph_def_version(int32 version) { graph_def_version_ = version; }
 
+  void set_require_shape_inference_fns(bool require_shape_inference_fns) {
+    require_shape_inference_fns_ = require_shape_inference_fns;
+  }
+
  private:
   // Extracts the subgraph ending at 'node' that is statically
   // computable and inserts into 'out_graph'. If statically computable,
@@ -104,6 +113,9 @@ class ShapeRefiner {
                               const Node* node, int dst_idx,
                               shape_inference::ShapeHandle* result);
 
+  Status RunShapeFn(const Node* node, const OpRegistrationData* op_reg_data,
+                    shape_inference::InferenceContext* c);
+
   int32 graph_def_version_;
   const OpRegistryInterface* const ops_registry_;
 
@@ -129,6 +141,8 @@ class ShapeRefiner {
   static constexpr int64 kMaxTensorSize = 1024;
   std::unordered_map<string, Tensor> const_tensor_map_;
 
+  bool require_shape_inference_fns_ = true;
+
   TF_DISALLOW_COPY_AND_ASSIGN(ShapeRefiner);
 };
 
diff --git a/tensorflow/core/common_runtime/shape_refiner_test.cc b/tensorflow/core/common_runtime/shape_refiner_test.cc
index 05274ff311233b3eaf5ae86dd386298b89f7cd08..b8df6dd4f6203624eab6f14f89f9845f148be99e 100644
--- a/tensorflow/core/common_runtime/shape_refiner_test.cc
+++ b/tensorflow/core/common_runtime/shape_refiner_test.cc
@@ -126,6 +126,27 @@ TEST(ShapeRefinerTest, SetShape) {
   ASSERT_FALSE(m.SetShape(a.node(), 0, h).ok());
 }
 
+namespace {
+
+// An op with no shape function.
+REGISTER_OP("TestOpWithNoShapeFn").Input("a: int32").Output("o: int32");
+
+}  // namespace
+
+TEST(ShapeRefinerTest, MissingShapeInferenceFns) {
+  Scope root = Scope::NewRootScope();
+  auto a = ops::Const(root, 42);
+  Node* b;
+  TF_ASSERT_OK(NodeBuilder("b", "TestOpWithNoShapeFn")
+                   .Input(a.node())
+                   .Finalize(root.graph(), &b));
+  ShapeRefiner m(TF_GRAPH_DEF_VERSION, OpRegistry::Global());
+  TF_ASSERT_OK(m.AddNode(a.node()));
+  EXPECT_FALSE(m.AddNode(b).ok());
+  m.set_require_shape_inference_fns(false);
+  TF_EXPECT_OK(m.AddNode(b));
+}
+
 TEST(ShapeRefinerTest, PropagateConstants) {
   // Reduction dimension is a variable, so we don't know its value.
   // So the output shape value is unknown (though its rank is known).
@@ -747,5 +768,38 @@ TEST(ShapeRefinerTest, ConstantValueAsShape_ConcatInvalidDimValue) {
             m.AddNode(result).error_message());
 }
 
+TEST(ShapeRefinerTest, IncrementalUpdates) {
+  Scope root = Scope::NewRootScope();
+  Graph* g = root.graph();
+  Node* queue;
+  TF_CHECK_OK(NodeBuilder("queue", "FIFOQueueV2")
+                  .Attr("component_types", {DT_FLOAT})
+                  .Finalize(g, &queue));
+  Node* dequeue;
+  TF_CHECK_OK(NodeBuilder("dequeue", "QueueDequeueV2")
+                  .Attr("component_types", {DT_FLOAT})
+                  .Input(queue)
+                  .Finalize(g, &dequeue));
+  ShapeRefiner m(TF_GRAPH_DEF_VERSION, OpRegistry::Global());
+  TF_ASSERT_OK(m.AddNode(queue));
+  TF_ASSERT_OK(m.AddNode(dequeue));
+
+  // At this point, the shapes of the dequeued tensor are unknown.
+  shape_inference::InferenceContext* ctx = m.GetContext(dequeue);
+  EXPECT_EQ("?", ctx->DebugString(ctx->output(0)));
+
+  // Inject a shape, and incrementally propagate it to the dequeue op.
+  ctx = m.GetContext(queue);
+  shape_inference::ShapeHandle shp = ctx->MakeShape({3, 7});
+  ctx->set_output_handle_shape(0, shp);
+  ctx->set_output_handle_dtype(0, DT_FLOAT);
+
+  bool refined = false;
+  TF_ASSERT_OK(m.UpdateNode(dequeue, &refined));
+  EXPECT_TRUE(refined);
+  ctx = m.GetContext(dequeue);
+  EXPECT_EQ("[3,7]", ctx->DebugString(ctx->output(0)));
+}
+
 }  // namespace
 }  // namespace tensorflow
diff --git a/tensorflow/core/common_runtime/simple_graph_execution_state.cc b/tensorflow/core/common_runtime/simple_graph_execution_state.cc
index 31e63a9ef7502b24315834ecfed48c8a952546a0..3806f9f47f58ca60df812426ebbbc52da56a2070 100644
--- a/tensorflow/core/common_runtime/simple_graph_execution_state.cc
+++ b/tensorflow/core/common_runtime/simple_graph_execution_state.cc
@@ -29,8 +29,6 @@ limitations under the License.
 #include "tensorflow/core/graph/graph_constructor.h"
 #include "tensorflow/core/graph/subgraph.h"
 #include "tensorflow/core/graph/validate.h"
-#include "tensorflow/core/grappler/grappler_item.h"
-#include "tensorflow/core/grappler/optimizers/meta_optimizer.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/lib/strings/stringprintf.h"
@@ -39,6 +37,13 @@ limitations under the License.
 #include "tensorflow/core/platform/types.h"
 #include "tensorflow/core/util/util.h"
 
+#ifndef IS_MOBILE_PLATFORM
+#include "tensorflow/core/grappler/clusters/utils.h"
+#include "tensorflow/core/grappler/clusters/virtual_cluster.h"
+#include "tensorflow/core/grappler/grappler_item.h"
+#include "tensorflow/core/grappler/optimizers/meta_optimizer.h"
+#endif  // IS_MOBILE_PLATFORM
+
 namespace tensorflow {
 
 SimpleGraphExecutionState::SimpleGraphExecutionState(
@@ -231,9 +236,12 @@ Status SimpleGraphExecutionState::InitBaseGraph(
     const BuildGraphOptions& options) {
   const GraphDef* graph_def = &original_graph_def_;
 
+#ifndef IS_MOBILE_PLATFORM
   GraphDef optimized_graph;
+
   const RewriterConfig& rewrite_options =
       session_options_->config.graph_options().rewrite_options();
+
   if (grappler::MetaOptimizerEnabled(rewrite_options)) {
     // Adding this functionalty in steps. The first step is to make sure
     // we don't break dependencies. The second step will be to turn the
@@ -267,12 +275,20 @@ Status SimpleGraphExecutionState::InitBaseGraph(
     }
 
     if (s.ok()) {
-      s = grappler::RunMetaOptimizer(item, rewrite_options, &optimized_graph);
+      std::unordered_map<string, DeviceProperties> device_map;
+      for (const auto& device : device_set_->devices()) {
+        device_map[device->name()] =
+            grappler::GetDeviceInfo(device->parsed_name());
+      }
+      grappler::VirtualCluster cluster(device_map);
+      s = grappler::RunMetaOptimizer(item, rewrite_options, &cluster,
+                                     &optimized_graph);
     }
     if (s.ok()) {
       graph_def = &optimized_graph;
     }
   }
+#endif  // IS_MOBILE_PLATFORM
 
   std::unique_ptr<Graph> new_graph(new Graph(OpRegistry::Global()));
   GraphConstructorOptions opts;
diff --git a/tensorflow/core/common_runtime/simple_placer.cc b/tensorflow/core/common_runtime/simple_placer.cc
index f6e6bf069257031c84fe2106340d9375652b98a6..ae225e8b35d462b2b41806d0896f83fe4c046f6e 100644
--- a/tensorflow/core/common_runtime/simple_placer.cc
+++ b/tensorflow/core/common_runtime/simple_placer.cc
@@ -34,6 +34,11 @@ namespace tensorflow {
 
 namespace {
 
+// We hoist the conversion from C-style string literal to StringPiece here,
+// so that we can avoid the many repeated calls to strlen().
+const StringPiece kColocationAttrNameStringPiece(kColocationAttrName);
+const StringPiece kColocationGroupPrefixStringPiece(kColocationGroupPrefix);
+
 // Returns a list of devices sorted by preferred type and then name
 // from 'devices' whose type is in 'supported_device_types'.  This
 // function searches the device types in 'supported_device_types' and
@@ -71,24 +76,26 @@ void ColocationGroups(const Node& node,
   std::vector<string> class_specs;
   // TODO(vrv): We should consider adding a GetNodeAttr that returns a
   // StringPiece, to avoid a copy.
-  Status s = GetNodeAttr(node.def(), kColocationAttrName, &class_specs);
-  if (!s.ok()) {
+  if (!GetNodeAttrSimple(node.attrs(), kColocationAttrNameStringPiece,
+                         &class_specs)) {
     // No attribute value is equivalent to the empty colocation_group.
-    *colocation_groups = {strings::StrCat(kColocationGroupPrefix, node.name())};
+    *colocation_groups = {
+        strings::StrCat(kColocationGroupPrefixStringPiece, node.name())};
     return;
   }
 
   bool found_spec = false;
   for (const string& class_spec : class_specs) {
     StringPiece spec(class_spec);
-    if (spec.Consume(kColocationGroupPrefix)) {
+    if (spec.Consume(kColocationGroupPrefixStringPiece)) {
       found_spec = true;
       colocation_groups->emplace_back(class_spec);
     }
   }
 
   if (!found_spec) {
-    *colocation_groups = {strings::StrCat(kColocationGroupPrefix, node.name())};
+    *colocation_groups = {
+        strings::StrCat(kColocationGroupPrefixStringPiece, node.name())};
   }
 }
 
@@ -322,7 +329,7 @@ class ColocationGraph {
         AddDebugInfo(node_root, &debug_info);
 
         DeviceNameUtils::ParsedName specified_device_name;
-        if (DeviceNameUtils::ParseFullName(node->def().device(),
+        if (DeviceNameUtils::ParseFullName(node->requested_device(),
                                            &specified_device_name) &&
             specified_device_name == members_[node_root].device_name) {
           // The specified device and merged set device match, and
@@ -341,28 +348,27 @@ class ColocationGraph {
             std::sort(device_names.begin(), device_names.end());
 
             return errors::InvalidArgument(
-                "Could not satisfy explicit device specification '",
-                node->def().device(),
-                "' because no devices matching that specification "
-                "are registered in this process; available devices: ",
-                str_util::Join(device_names, ", "), debug_info);
+                "Operation was explicitly assigned to ",
+                node->requested_device(), " but available devices are [ ",
+                str_util::Join(device_names, ", "), " ]. Make sure ",
+                "the device specification refers to a valid device.");
           } else if (specified_device_name.has_type) {
             return errors::InvalidArgument(
                 "Could not satisfy explicit device specification '",
-                node->def().device(), "' because no supported kernel for ",
+                node->requested_device(), "' because no supported kernel for ",
                 specified_device_name.type, " devices is available.",
                 debug_info);
           } else {
             return errors::InvalidArgument(
                 "Could not satisfy explicit device specification '",
-                node->def().device(), debug_info);
+                node->requested_device(), debug_info);
           }
         } else {
           // The specified device may be a valid device but the
           // merged set device is different, so print both.
           return errors::InvalidArgument(
               "Could not satisfy explicit device specification '",
-              node->def().device(),
+              node->requested_device(),
               "' because the node was colocated with a group of nodes that "
               "required incompatible device '",
               DeviceNameUtils::ParsedNameToString(
@@ -507,7 +513,7 @@ class ColocationGraph {
       return errors::Internal("Assigned device '", node.assigned_device_name(),
                               "' does not have registered OpKernel support "
                               "for ",
-                              node.def().op());
+                              node.type_string());
     } else {
       // This node has not yet been assigned to a device, so we
       // calculate any constraints due to the set of registered
@@ -521,25 +527,25 @@ class ColocationGraph {
           registered_device_types.insert(d->device_type());
         }
         return errors::InvalidArgument(
-            "No OpKernel was registered to support Op '", node.def().op(),
+            "No OpKernel was registered to support Op '", node.type_string(),
             "' with these attrs.  Registered devices: [",
             str_util::Join(registered_device_types, ","),
             "], Registered kernels:\n",
-            KernelsRegisteredForOp(node.def().op()));
+            KernelsRegisteredForOp(node.type_string()));
       }
 
       // If the NodeDef contains a device, then we interpret it as a
       // (partial) device specification.
-      if (!node.def().device().empty()) {
+      if (!node.requested_device().empty()) {
         // The user has specified a device in the NodeDef, try to find a
         // valid device matching their specification in the set of
         // devices.
         // NOTE: The full name may specify a device that is not in
         // n.supported_device_types(), but we check that in AssignDevice().
-        if (!DeviceNameUtils::ParseFullName(node.def().device(),
+        if (!DeviceNameUtils::ParseFullName(node.requested_device(),
                                             &member->device_name)) {
           return errors::InvalidArgument("Malformed device specification '",
-                                         node.def().device(), "'");
+                                         node.requested_device(), "'");
         }
       }
     }
@@ -638,7 +644,7 @@ Status SimplePlacer::Run() {
       continue;
     }
     status = colocation_graph.AddNode(*node);
-    if (!status.ok()) return AttachDef(status, node->def());
+    if (!status.ok()) return AttachDef(status, *node);
   }
 
   // 2. Enumerate the constraint edges, and use them to update the disjoint
@@ -701,7 +707,7 @@ Status SimplePlacer::Run() {
                                "be on the same device), but the two nodes "
                                "were assigned two different devices: ",
                                status.error_message()),
-                           node->def());
+                           *node);
         }
       }
     }
@@ -741,9 +747,9 @@ Status SimplePlacer::Run() {
     status = colocation_graph.GetDevicesForNode(node, &devices);
     if (!status.ok()) {
       return AttachDef(
-          errors::InvalidArgument("Cannot assign a device to node '",
+          errors::InvalidArgument("Cannot assign a device for operation '",
                                   node->name(), "': ", status.error_message()),
-          node->def());
+          *node);
     }
 
     // Returns the first device in sorted devices list so we will always
@@ -783,9 +789,9 @@ Status SimplePlacer::Run() {
     status = colocation_graph.GetDevicesForNode(node, &devices);
     if (!status.ok()) {
       return AttachDef(
-          errors::InvalidArgument("Cannot assign a device to node '",
+          errors::InvalidArgument("Cannot assign a device for operation '",
                                   node->name(), "': ", status.error_message()),
-          node->def());
+          *node);
     }
 
     string assigned_device = devices[0]->name();
@@ -801,7 +807,7 @@ Status SimplePlacer::Run() {
             return e->dst()->assigned_device_name() == output_device_name;
           });
 
-      if (consumers_on_same_device && 
+      if (consumers_on_same_device &&
           CanAssignToDevice(output_device_name, devices)) {
         assigned_device = output_device_name;
       }
diff --git a/tensorflow/core/common_runtime/simple_placer_test.cc b/tensorflow/core/common_runtime/simple_placer_test.cc
index c73ed041ed19aa34325c1c35c0c1647f86c87179..69ed58b33c10e860dcfc16e488115fc5cec47aac 100644
--- a/tensorflow/core/common_runtime/simple_placer_test.cc
+++ b/tensorflow/core/common_runtime/simple_placer_test.cc
@@ -66,7 +66,7 @@ class DummyOp : public OpKernel {
 class FakeDevice : public Device {
  private:
   explicit FakeDevice(const DeviceAttributes& device_attributes)
-      : Device(nullptr, device_attributes, nullptr) {}
+      : Device(nullptr, device_attributes) {}
 
  public:
   Status Sync() override { return errors::Unimplemented("FakeDevice::Sync()"); }
@@ -237,7 +237,7 @@ class SimplePlacerTest : public ::testing::Test {
 
   Status ReferenceTestHelper(const string& variable_op_type,
                              const string& assign_op_type,
-                             DeviceType expected_device_type);
+                             const DeviceType& expected_device_type);
 };
 
 #define EXPECT_COLOCATED(g, name_a, name_b)                         \
@@ -500,9 +500,9 @@ TEST_F(SimplePlacerTest, TestAssignedGpuDeviceToCpuDevice) {
 // Build a graph containing a Variable op of "variable_op_type" and an
 // Assign op of "assign_op_type", and expect all of the ops to be
 // placed on a device of type "expected_device_type".
-Status SimplePlacerTest::ReferenceTestHelper(const string& variable_op_type,
-                                             const string& assign_op_type,
-                                             DeviceType expected_device_type) {
+Status SimplePlacerTest::ReferenceTestHelper(
+    const string& variable_op_type, const string& assign_op_type,
+    const DeviceType& expected_device_type) {
   Graph g(OpRegistry::Global());
   {  // Scope for temporary variables used to construct g.
     GraphDefBuilder b(GraphDefBuilder::kFailImmediately);
@@ -939,10 +939,7 @@ TEST_F(SimplePlacerTest, TestUnknownDevice) {
 
   Status s = Place(&g);
   EXPECT_EQ(error::INVALID_ARGUMENT, s.code());
-  EXPECT_TRUE(
-      StringPiece(s.error_message())
-          .contains(
-              "Could not satisfy explicit device specification '/job:foo'"));
+  EXPECT_TRUE(StringPiece(s.error_message()).contains("/job:foo"));
 }
 
 // Test that placement fails when the combination of partial
@@ -957,10 +954,7 @@ TEST_F(SimplePlacerTest, TestUnknownMergedDevice) {
 
   Status s = Place(&g);
   EXPECT_EQ(error::INVALID_ARGUMENT, s.code());
-  EXPECT_TRUE(
-      StringPiece(s.error_message())
-          .contains(
-              "Could not satisfy explicit device specification '/job:foo'"));
+  EXPECT_TRUE(StringPiece(s.error_message()).contains("/job:foo"));
 }
 
 // Test that placement fails when the previously-assigned device for a
@@ -1107,10 +1101,7 @@ TEST_F(SimplePlacerTest, TestNonexistentGpuNoAllowSoftPlacement) {
   SessionOptions options;
   Status s = Place(&g, &options);
   EXPECT_EQ(error::INVALID_ARGUMENT, s.code());
-  EXPECT_TRUE(StringPiece(s.error_message())
-                  .contains("Could not satisfy explicit "
-                            "device specification "
-                            "'/device:fakegpu:11'"));
+  EXPECT_TRUE(StringPiece(s.error_message()).contains("/device:fakegpu:11"));
 }
 
 // Test that placement fails when a node requests an explicit device that is not
@@ -1127,10 +1118,7 @@ TEST_F(SimplePlacerTest, TestUnsupportedDeviceNoAllowSoftPlacement) {
   SessionOptions options;
   Status s = Place(&g, &options);
   EXPECT_EQ(error::INVALID_ARGUMENT, s.code());
-  EXPECT_TRUE(StringPiece(s.error_message())
-                  .contains("Could not satisfy explicit "
-                            "device specification "
-                            "'/device:fakecpu:0'"));
+  EXPECT_TRUE(StringPiece(s.error_message()).contains("/device:fakecpu:0"));
   EXPECT_TRUE(
       StringPiece(s.error_message())
           .contains("no supported kernel for fakecpu devices is available"));
@@ -1151,12 +1139,9 @@ TEST_F(SimplePlacerTest, TestNonExistentDevice) {
   Status s = Place(&g, &options);
   EXPECT_EQ(error::INVALID_ARGUMENT, s.code());
   LOG(WARNING) << s.error_message();
-  EXPECT_TRUE(
-      StringPiece(s.error_message())
-          .contains("Could not satisfy explicit device specification "
-                    "'/job:foo/replica:17' "
-                    "because no devices matching that specification are "
-                    "registered in this process"));
+  EXPECT_TRUE(StringPiece(s.error_message())
+                  .contains("was explicitly assigned to /job:foo/replica:17 "
+                            "but available devices"));
 }
 
 TEST_F(SimplePlacerTest, TestUnsupportedDeviceAllowSoftPlacement) {
diff --git a/tensorflow/core/common_runtime/threadpool_device.cc b/tensorflow/core/common_runtime/threadpool_device.cc
index 60348e885f585ab84c6ba85dce262a5d1b924726..f5f8aab694698dab6151fcce7ed5c28da43ec36a 100644
--- a/tensorflow/core/common_runtime/threadpool_device.cc
+++ b/tensorflow/core/common_runtime/threadpool_device.cc
@@ -38,10 +38,8 @@ ThreadPoolDevice::ThreadPoolDevice(const SessionOptions& options,
                                    const string& name, Bytes memory_limit,
                                    const DeviceLocality& locality,
                                    Allocator* allocator)
-    : LocalDevice(options,
-                  Device::BuildDeviceAttributes(name, DEVICE_CPU, memory_limit,
-                                                locality),
-                  allocator),
+    : LocalDevice(options, Device::BuildDeviceAttributes(
+                               name, DEVICE_CPU, memory_limit, locality)),
       allocator_(allocator) {}
 
 ThreadPoolDevice::~ThreadPoolDevice() {}
diff --git a/tensorflow/core/common_runtime/visitable_allocator.h b/tensorflow/core/common_runtime/visitable_allocator.h
index c83e4a4e3a1913c04257ee6036d447a9833b0d2f..8edf922d11ee1662b78771bfdc7c38e0144aee19 100644
--- a/tensorflow/core/common_runtime/visitable_allocator.h
+++ b/tensorflow/core/common_runtime/visitable_allocator.h
@@ -44,7 +44,7 @@ class VisitableAllocator : public Allocator {
 };
 
 // Needed for cases when a VisitableAllocator gets wrapped for tracking.
-// Multiple-inheritance is considered acceptible in this case because
+// Multiple-inheritance is considered acceptable in this case because
 // VisitableAllocator is a pure virtual interface and only TrackingAllocator
 // has default implementation.
 class TrackingVisitableAllocator : public TrackingAllocator,
diff --git a/tensorflow/core/debug/BUILD b/tensorflow/core/debug/BUILD
index 372ddb168a5cc76d5021ed78e3a6fce8421ff440..2fc49d4412e9cf75dfa7a1e6c4c57cc0cade874a 100644
--- a/tensorflow/core/debug/BUILD
+++ b/tensorflow/core/debug/BUILD
@@ -58,7 +58,7 @@ cc_library(
     linkstatic = 1,
     visibility = ["//visibility:public"],
     deps = [
-        ":debug_graph_utils",
+        ":debugger_state_impl",
         "//tensorflow/core:core_cpu_internal",
         "//tensorflow/core:debug_ops_op_lib",
     ],
@@ -85,6 +85,19 @@ tf_cuda_library(
     alwayslink = 1,
 )
 
+tf_cuda_library(
+    name = "debugger_state_impl",
+    srcs = ["debugger_state_impl.cc"],
+    hdrs = ["debugger_state_impl.h"],
+    copts = tf_copts(),
+    linkstatic = 1,
+    deps = [
+        ":debug_graph_utils",
+        ":debug_io_utils",
+    ],
+    alwayslink = 1,
+)
+
 tf_cuda_library(
     name = "debug_graph_utils",
     srcs = ["debug_graph_utils.cc"],
@@ -92,7 +105,6 @@ tf_cuda_library(
     copts = tf_copts(),
     linkstatic = 1,
     deps = [
-        ":debug_io_utils",
         "//tensorflow/core:core_cpu_internal",
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
@@ -135,6 +147,7 @@ tf_cuda_library(
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
+        "//tensorflow/core:protos_all_cc",
         "@grpc//:grpc++_unsecure",
     ],
     alwayslink = 1,
@@ -209,6 +222,31 @@ tf_cc_test(
     ],
 )
 
+tf_cc_test(
+    name = "grpc_session_debug_test",
+    size = "medium",
+    srcs = ["grpc_session_debug_test.cc"],
+    linkstatic = tf_kernel_tests_linkstatic(),
+    tags = ["nomac"],  # b/38276817
+    deps = [
+        ":debug_grpc_testlib",
+        ":debug_io_utils",
+        "//tensorflow/core:core_cpu",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:master_proto_cc",
+        "//tensorflow/core:protos_all_cc",
+        "//tensorflow/core:test",
+        "//tensorflow/core:test_main",
+        "//tensorflow/core:testlib",
+        "//tensorflow/core/distributed_runtime/rpc:grpc_server_lib",
+        "//tensorflow/core/distributed_runtime/rpc:grpc_session",
+        "//tensorflow/core/distributed_runtime/rpc:grpc_testlib",
+        "//tensorflow/core/kernels:constant_op",
+        "//tensorflow/core/kernels:matmul_op",
+    ],
+)
+
 # TODO(cais): Add the following back in when tfdbg is supported on Android.
 # filegroup(
 #     name = "android_srcs",
diff --git a/tensorflow/core/debug/debug.cc b/tensorflow/core/debug/debug.cc
index c293b285c353cacedbf6264715d871ba553db59f..1aedfc2710e2024fa86abcaf9d33712bd516c847 100644
--- a/tensorflow/core/debug/debug.cc
+++ b/tensorflow/core/debug/debug.cc
@@ -16,7 +16,7 @@ limitations under the License.
 #include <memory>
 
 #include "tensorflow/core/common_runtime/debugger_state_interface.h"
-#include "tensorflow/core/debug/debug_graph_utils.h"
+#include "tensorflow/core/debug/debugger_state_impl.h"
 
 namespace tensorflow {
 namespace {
@@ -30,10 +30,18 @@ class DebuggerStateRegistration {
     return std::unique_ptr<DebuggerStateInterface>(new DebuggerState(options));
   }
 
+  static std::unique_ptr<DebugGraphDecoratorInterface>
+  CreateDebugGraphDecorator(const DebugOptions& options) {
+    return std::unique_ptr<DebugGraphDecoratorInterface>(
+        new DebugGraphDecorator(options));
+  }
+
   DebuggerStateRegistration() {
     DebuggerStateRegistry::RegisterFactory(CreateDebuggerState);
+    DebugGraphDecoratorRegistry::RegisterFactory(CreateDebugGraphDecorator);
   }
 };
+
 static DebuggerStateRegistration register_debugger_state_implementation;
 
 }  // end namespace
diff --git a/tensorflow/core/debug/debug_gateway.cc b/tensorflow/core/debug/debug_gateway.cc
index 24b9dd799aa168e5f1a7e29cd793ce6c16579f46..1031ea843ed7874e2490714714cc2ce6abe09a66 100644
--- a/tensorflow/core/debug/debug_gateway.cc
+++ b/tensorflow/core/debug/debug_gateway.cc
@@ -15,6 +15,8 @@ limitations under the License.
 
 #include "tensorflow/core/debug/debug_gateway.h"
 
+#include <utility>
+
 #include "tensorflow/core/common_runtime/device_factory.h"
 #include "tensorflow/core/common_runtime/session_factory.h"
 #include "tensorflow/core/framework/tensor.h"
@@ -56,11 +58,11 @@ DebugGateway::~DebugGateway() {
 }
 
 void DebugGateway::SetNodeCompletionCallback(NodeCompletionCallback callback) {
-  comp_cb_ = callback;
+  comp_cb_ = std::move(callback);
 }
 
 void DebugGateway::SetNodeValueCallback(NodeValueCallback callback) {
-  val_cb_ = callback;
+  val_cb_ = std::move(callback);
 }
 
 void DebugGateway::CopyTensor(const string& node_name, const int output_slot,
diff --git a/tensorflow/core/debug/debug_graph_utils.cc b/tensorflow/core/debug/debug_graph_utils.cc
index 9061fd39f5b3b74ed66b323e2a0e9a0dcc466339..f8f3d2ae506064aef109c344f726609f295c94c4 100644
--- a/tensorflow/core/debug/debug_graph_utils.cc
+++ b/tensorflow/core/debug/debug_graph_utils.cc
@@ -16,7 +16,6 @@ limitations under the License.
 #include "tensorflow/core/debug/debug_graph_utils.h"
 
 #include "tensorflow/core/common_runtime/memory_types.h"
-#include "tensorflow/core/debug/debug_io_utils.h"
 #include "tensorflow/core/framework/kernel_def.pb.h"
 #include "tensorflow/core/framework/node_def_builder.h"
 #include "tensorflow/core/framework/op_kernel.h"
@@ -45,69 +44,6 @@ Status ParseBoolString(const string& bool_str, bool* bool_val) {
 
 }  // namespace
 
-DebuggerState::DebuggerState(const DebugOptions& debug_options)
-    : watches(debug_options.debug_tensor_watch_opts()), debug_urls_() {
-  for (const DebugTensorWatch& watch : watches) {
-    for (const string& url : watch.debug_urls()) {
-      debug_urls_.insert(url);
-    }
-  }
-}
-
-DebuggerState::~DebuggerState() {
-  for (const string& debug_url : debug_urls_) {
-    DebugIO::CloseDebugURL(debug_url).IgnoreError();
-  }
-}
-
-const string DebuggerState::SummarizeDebugTensorWatches() {
-  std::ostringstream oss;
-
-  for (const DebugTensorWatch& watch : watches) {
-    string tensor_name =
-        strings::StrCat(watch.node_name(), ":", watch.output_slot());
-    if (watch.tolerate_debug_op_creation_failures()) {
-      oss << "(TOL)";  // Shorthand for "tolerate".
-    }
-    oss << tensor_name << "|";
-
-    for (const string& debug_op : watch.debug_ops()) {
-      oss << debug_op << ",";
-    }
-
-    oss << "@";
-    for (const string& debug_url : watch.debug_urls()) {
-      oss << debug_url << ",";
-    }
-
-    oss << ";";
-  }
-
-  return oss.str();
-}
-
-Status DebuggerState::DecorateGraphForDebug(Graph* graph, Device* device) {
-  Status status;
-
-  DebugNodeInserter::DeparallelizeWhileLoops(graph, device);
-  status.Update(DebugNodeInserter::InsertNodes(watches, graph, device));
-  if (status.ok()) {
-    status.Update(DebugIO::PublishGraph(*graph, debug_urls_));
-  }
-
-  return status;
-}
-
-Status DebuggerState::PublishDebugMetadata(
-    const int64 global_step, const int64 session_run_count,
-    const int64 executor_step_count, const std::vector<string>& input_names,
-    const std::vector<string>& output_names,
-    const std::vector<string>& target_nodes) {
-  return DebugIO::PublishDebugMetadata(global_step, session_run_count,
-                                       executor_step_count, input_names,
-                                       output_names, target_nodes, debug_urls_);
-}
-
 // static
 Status DebugNodeInserter::InsertNodes(
     const protobuf::RepeatedPtrField<DebugTensorWatch>& watches, Graph* graph,
@@ -287,19 +223,16 @@ Status DebugNodeInserter::InsertNodes(
 void DebugNodeInserter::DeparallelizeWhileLoops(Graph* graph, Device* device) {
   for (Node* node : graph->nodes()) {
     if (node->IsEnter()) {
-      for (const auto& attr : node->def().attr()) {
-        if (attr.first == "parallel_iterations") {
-          if (attr.second.i() > 1) {
-            LOG(INFO) << "For debugging, tfdbg is changing the "
-                      << "parallel_iterations attribute of the Enter/RefEnter "
-                      << "node \"" << node->name() << "\" on device \""
-                      << device->name() << "\" from " << attr.second.i()
-                      << " to 1. (This does not affect subsequent non-debug "
-                      << "runs.)";
-            node->AddAttr<int64>("parallel_iterations", 1);
-          }
-          break;
-        }
+      const AttrValue* parallel_iterations =
+          node->attrs().Find("parallel_iterations");
+      if (parallel_iterations && parallel_iterations->i() > 1) {
+        LOG(INFO) << "For debugging, tfdbg is changing the "
+                  << "parallel_iterations attribute of the Enter/RefEnter "
+                  << "node \"" << node->name() << "\" on device \""
+                  << device->name() << "\" from " << parallel_iterations->i()
+                  << " to 1. (This does not affect subsequent non-debug "
+                  << "runs.)";
+        node->AddAttr<int64>("parallel_iterations", 1);
       }
     }
   }
diff --git a/tensorflow/core/debug/debug_graph_utils.h b/tensorflow/core/debug/debug_graph_utils.h
index ac97856443df1b3609db41e97dadd8a8d105ae73..fa8b33b98ab03b4c30b574962306844d7ee945e7 100644
--- a/tensorflow/core/debug/debug_graph_utils.h
+++ b/tensorflow/core/debug/debug_graph_utils.h
@@ -17,7 +17,6 @@ limitations under the License.
 #define TENSORFLOW_DEBUG_NODE_INSERTER_H_
 
 #include <unordered_map>
-#include <unordered_set>
 #include <vector>
 
 #include "tensorflow/core/common_runtime/debugger_state_interface.h"
@@ -29,35 +28,6 @@ limitations under the License.
 
 namespace tensorflow {
 
-class DebuggerState : public DebuggerStateInterface {
- public:
-  DebuggerState(const DebugOptions& debug_options);
-  virtual ~DebuggerState();
-
-  // Returns a summary string for RepeatedPtrFields of DebugTensorWatches.
-  const string SummarizeDebugTensorWatches() override;
-
-  // Insert special-purpose debug nodes to graph. See the documentation of
-  // DebugNodeInserter::InsertNodes() for details.
-  Status DecorateGraphForDebug(Graph* graph, Device* device) override;
-
-  const protobuf::RepeatedPtrField<DebugTensorWatch>& watches;
-
-  // Publish metadata about the debugged Session::Run() call.
-  //
-  // See the doc string of DebuggerStateInterface::PublishDebugMetadata() for
-  // details.
-  Status PublishDebugMetadata(const int64 global_step,
-                              const int64 session_run_count,
-                              const int64 executor_step_count,
-                              const std::vector<string>& input_names,
-                              const std::vector<string>& output_names,
-                              const std::vector<string>& target_names) override;
-
- private:
-  std::unordered_set<string> debug_urls_;
-};
-
 class DebugNodeInserter {
  public:
   // EXPERIMENTAL: Insert special debug ops (e.g., DebugIdentity) to graph for
diff --git a/tensorflow/core/debug/debug_grpc_testlib.cc b/tensorflow/core/debug/debug_grpc_testlib.cc
index 8dfa90448998be3410a558f49cfae43173f6ecda..d9fab87aed1a8fe24bbf6237374afe7fa1a26282 100644
--- a/tensorflow/core/debug/debug_grpc_testlib.cc
+++ b/tensorflow/core/debug/debug_grpc_testlib.cc
@@ -17,6 +17,7 @@ limitations under the License.
 
 #include "tensorflow/core/debug/debug_graph_utils.h"
 #include "tensorflow/core/debug/debug_io_utils.h"
+#include "tensorflow/core/framework/summary.pb.h"
 #include "tensorflow/core/lib/io/path.h"
 #include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/platform/env.h"
@@ -33,26 +34,32 @@ namespace test {
   Event event;
 
   while (stream->Read(&event)) {
-    const Summary::Value& val = event.summary().value(0);
-
-    std::vector<string> name_items =
-        tensorflow::str_util::Split(val.node_name(), ':');
-
-    const string node_name = name_items[0];
-    int32 output_slot = 0;
-    tensorflow::strings::safe_strto32(name_items[1], &output_slot);
-    const string debug_op = name_items[2];
-
-    const TensorProto& tensor_proto = val.tensor();
-    Tensor tensor(tensor_proto.dtype());
-    if (!tensor.FromProto(tensor_proto)) {
-      return ::grpc::Status::CANCELLED;
+    if (event.has_log_message()) {
+      debug_metadata_strings.push_back(event.log_message().message());
+    } else if (!event.graph_def().empty()) {
+      encoded_graph_defs.push_back(event.graph_def());
+    } else if (event.has_summary()) {
+      const Summary::Value& val = event.summary().value(0);
+
+      std::vector<string> name_items =
+          tensorflow::str_util::Split(val.node_name(), ':');
+
+      const string node_name = name_items[0];
+      int32 output_slot = 0;
+      tensorflow::strings::safe_strto32(name_items[1], &output_slot);
+      const string debug_op = name_items[2];
+
+      const TensorProto& tensor_proto = val.tensor();
+      Tensor tensor(tensor_proto.dtype());
+      if (!tensor.FromProto(tensor_proto)) {
+        return ::grpc::Status::CANCELLED;
+      }
+
+      node_names.push_back(node_name);
+      output_slots.push_back(output_slot);
+      debug_ops.push_back(debug_op);
+      debug_tensors.push_back(tensor);
     }
-
-    node_names.push_back(node_name);
-    output_slots.push_back(output_slot);
-    debug_ops.push_back(debug_op);
-    debug_tensors.push_back(tensor);
   }
 
   {
@@ -79,6 +86,8 @@ namespace test {
 }
 
 void TestEventListenerImpl::ClearReceivedDebugData() {
+  debug_metadata_strings.clear();
+  encoded_graph_defs.clear();
   node_names.clear();
   output_slots.clear();
   debug_ops.clear();
diff --git a/tensorflow/core/debug/debug_grpc_testlib.h b/tensorflow/core/debug/debug_grpc_testlib.h
index 0e3223dbfe17cb84b3239a0d1611fd1a83944f0a..c2b96e78c5648a1c32f341ffbf6829c85a46f88a 100644
--- a/tensorflow/core/debug/debug_grpc_testlib.h
+++ b/tensorflow/core/debug/debug_grpc_testlib.h
@@ -47,6 +47,8 @@ class TestEventListenerImpl final : public EventListener::Service {
                                              const int32 output_slot,
                                              const string& debug_op);
 
+  std::vector<string> debug_metadata_strings;
+  std::vector<string> encoded_graph_defs;
   std::vector<string> node_names;
   std::vector<int32> output_slots;
   std::vector<string> debug_ops;
diff --git a/tensorflow/core/debug/debug_io_utils.cc b/tensorflow/core/debug/debug_io_utils.cc
index a79ea1b45d724732e4629ee43c39a8c4870b37ef..c8282b83d6a2dfcf81ce0d01c971f3b4a53525a5 100644
--- a/tensorflow/core/debug/debug_io_utils.cc
+++ b/tensorflow/core/debug/debug_io_utils.cc
@@ -233,7 +233,7 @@ Status DebugIO::PublishDebugTensor(const string& tensor_name,
         strings::StrCat("Failed to parse tensor name: \"", tensor_name, "\""));
   }
 
-  int num_failed_urls = 0;
+  int32 num_failed_urls = 0;
   std::vector<Status> fail_statuses;
   for (const string& url : debug_urls) {
     if (str_util::Lowercase(url).find(kFileURLScheme) == 0) {
@@ -497,18 +497,26 @@ Status DebugFileIO::RecursiveCreateDir(Env* env, const string& dir) {
 
 #if defined(PLATFORM_GOOGLE)
 DebugGrpcChannel::DebugGrpcChannel(const string& server_stream_addr)
-    : url_(strings::StrCat(DebugIO::kGrpcURLScheme, server_stream_addr)),
-      ctx_(),
-      channel_(::grpc::CreateCustomChannel(server_stream_addr,
-                                           ::grpc::InsecureChannelCredentials(),
-                                           ::grpc::ChannelArguments())),
-      stub_(EventListener::NewStub(channel_)),
-      reader_writer_(stub_->SendEvents(&ctx_)),
-      mu_() {}
-// TODO(cais): Set GRPC_ARG_MAX_MESSAGE_LENGTH to max if necessary.
-
-bool DebugGrpcChannel::is_channel_ready() {
-  return channel_->GetState(false) == GRPC_CHANNEL_READY;
+    : server_stream_addr_(server_stream_addr),
+      url_(strings::StrCat(DebugIO::kGrpcURLScheme, server_stream_addr)) {}
+
+Status DebugGrpcChannel::Connect(const int64 timeout_micros) {
+  ::grpc::ChannelArguments args;
+  args.SetInt(GRPC_ARG_MAX_MESSAGE_LENGTH, std::numeric_limits<int32>::max());
+  // Avoid problems where default reconnect backoff is too long (e.g., 20 s).
+  args.SetInt("grpc.testing.fixed_reconnect_backoff_ms", 1000);
+  channel_ = ::grpc::CreateCustomChannel(
+      server_stream_addr_, ::grpc::InsecureChannelCredentials(), args);
+  if (!channel_->WaitForConnected(
+          gpr_time_add(gpr_now(GPR_CLOCK_REALTIME),
+                       gpr_time_from_micros(timeout_micros, GPR_TIMESPAN)))) {
+    return errors::FailedPrecondition(
+        "Failed to connect to gRPC channel at ", server_stream_addr_,
+        " within a timeout of ", timeout_micros / 1e6, " s.");
+  }
+  stub_ = EventListener::NewStub(channel_);
+  reader_writer_ = stub_->SendEvents(&ctx_);
+  return Status::OK();
 }
 
 bool DebugGrpcChannel::WriteEvent(const Event& event) {
@@ -551,7 +559,11 @@ Status DebugGrpcChannel::ReceiveServerRepliesAndClose() {
 // static
 mutex DebugGrpcIO::streams_mu;
 
-// Static
+// static
+int64 DebugGrpcIO::channel_connection_timeout_micros = 900 * 1000 * 1000;
+// TODO(cais): Make this configurable?
+
+// static
 std::unordered_map<string, std::shared_ptr<DebugGrpcChannel>>*
 DebugGrpcIO::GetStreamChannels() {
   static std::unordered_map<string, std::shared_ptr<DebugGrpcChannel>>*
@@ -592,12 +604,8 @@ Status DebugGrpcIO::SendEventProtoThroughGrpcStream(
         stream_channels = GetStreamChannels();
     if (stream_channels->find(grpc_stream_url) == stream_channels->end()) {
       debug_grpc_channel.reset(new DebugGrpcChannel(server_stream_addr));
-
-      if (!debug_grpc_channel->is_channel_ready()) {
-        return errors::FailedPrecondition(
-            strings::StrCat("Channel at the following gRPC stream URL is ",
-                            "not ready: ", grpc_stream_url));
-      }
+      TF_RETURN_IF_ERROR(
+          debug_grpc_channel->Connect(channel_connection_timeout_micros));
 
       (*stream_channels)[grpc_stream_url] = debug_grpc_channel;
       CreateEmptyEnabledSet(grpc_stream_url);
@@ -609,7 +617,7 @@ Status DebugGrpcIO::SendEventProtoThroughGrpcStream(
   bool write_ok = debug_grpc_channel->WriteEvent(event_proto);
   if (!write_ok) {
     return errors::Cancelled(strings::StrCat("Write event to stream URL ",
-                                             grpc_stream_url, "failed."));
+                                             grpc_stream_url, " failed."));
   }
 
   return Status::OK();
diff --git a/tensorflow/core/debug/debug_io_utils.h b/tensorflow/core/debug/debug_io_utils.h
index 7e0c3b1945704b312f277a4a7e3e67001d536225..2785f5bcc1b88b793b1edd7bb39f7307f9fa4c2b 100644
--- a/tensorflow/core/debug/debug_io_utils.h
+++ b/tensorflow/core/debug/debug_io_utils.h
@@ -215,8 +215,16 @@ class DebugGrpcChannel {
 
   virtual ~DebugGrpcChannel() {}
 
-  // Query whether the gRPC channel is ready for use.
-  bool is_channel_ready();
+  // Attempt to establish connection with server.
+  //
+  // Args:
+  //   timeout_micros: Timeout (in microseconds) for the attempt to establish
+  //     the connection.
+  //
+  // Returns:
+  //   OK Status iff connection is successfully established before timeout,
+  //   otherwise return an error Status.
+  Status Connect(const int64 timeout_micros);
 
   // Write an Event proto to the debug gRPC stream.
   //
@@ -234,6 +242,7 @@ class DebugGrpcChannel {
   Status ReceiveServerRepliesAndClose();
 
  private:
+  string server_stream_addr_;
   string url_;
   ::grpc::ClientContext ctx_;
   std::shared_ptr<::grpc::Channel> channel_;
@@ -302,6 +311,7 @@ class DebugGrpcIO {
   static void CreateEmptyEnabledSet(const string& grpc_debug_url);
 
   static mutex streams_mu;
+  static int64 channel_connection_timeout_micros;
 
   friend class GrpcDebugTest;
   friend class DebugNumericSummaryOpTest;
diff --git a/tensorflow/core/debug/debugger_state_impl.cc b/tensorflow/core/debug/debugger_state_impl.cc
new file mode 100644
index 0000000000000000000000000000000000000000..d5752c000215d4fb0ce140a6a130bcc6543d3584
--- /dev/null
+++ b/tensorflow/core/debug/debugger_state_impl.cc
@@ -0,0 +1,66 @@
+/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/debug/debugger_state_impl.h"
+
+#include "tensorflow/core/debug/debug_graph_utils.h"
+#include "tensorflow/core/debug/debug_io_utils.h"
+
+namespace tensorflow {
+
+DebuggerState::DebuggerState(const DebugOptions& debug_options) {
+  for (const DebugTensorWatch& watch :
+       debug_options.debug_tensor_watch_opts()) {
+    for (const string& url : watch.debug_urls()) {
+      debug_urls_.insert(url);
+    }
+  }
+}
+
+DebuggerState::~DebuggerState() {
+  for (const string& debug_url : debug_urls_) {
+    DebugIO::CloseDebugURL(debug_url).IgnoreError();
+  }
+}
+
+Status DebuggerState::PublishDebugMetadata(
+    const int64 global_step, const int64 session_run_count,
+    const int64 executor_step_count, const std::vector<string>& input_names,
+    const std::vector<string>& output_names,
+    const std::vector<string>& target_names) {
+  return DebugIO::PublishDebugMetadata(global_step, session_run_count,
+                                       executor_step_count, input_names,
+                                       output_names, target_names, debug_urls_);
+}
+
+Status DebugGraphDecorator::DecorateGraph(Graph* graph, Device* device) {
+  DebugNodeInserter::DeparallelizeWhileLoops(graph, device);
+  return DebugNodeInserter::InsertNodes(
+      debug_options_.debug_tensor_watch_opts(), graph, device);
+}
+
+Status DebugGraphDecorator::PublishGraph(const Graph& graph) {
+  std::unordered_set<string> debug_urls;
+  for (const DebugTensorWatch& watch :
+       debug_options_.debug_tensor_watch_opts()) {
+    for (const string& url : watch.debug_urls()) {
+      debug_urls.insert(url);
+    }
+  }
+
+  return DebugIO::PublishGraph(graph, debug_urls);
+}
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/debug/debugger_state_impl.h b/tensorflow/core/debug/debugger_state_impl.h
new file mode 100644
index 0000000000000000000000000000000000000000..d91aa03426af66b493b6f7f4e304c4168be8ec7a
--- /dev/null
+++ b/tensorflow/core/debug/debugger_state_impl.h
@@ -0,0 +1,61 @@
+/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_DEBUGGER_STATE_IMPL_H_
+#define TENSORFLOW_DEBUGGER_STATE_IMPL_H_
+
+#include "tensorflow/core/common_runtime/debugger_state_interface.h"
+
+#include <unordered_set>
+#include <vector>
+
+namespace tensorflow {
+
+class DebuggerState : public DebuggerStateInterface {
+ public:
+  DebuggerState(const DebugOptions& debug_options);
+  virtual ~DebuggerState();
+
+  // Publish metadata about the debugged Session::Run() call.
+  //
+  // See the doc string of DebuggerStateInterface::PublishDebugMetadata() for
+  // details.
+  Status PublishDebugMetadata(const int64 global_step,
+                              const int64 session_run_count,
+                              const int64 executor_step_count,
+                              const std::vector<string>& input_names,
+                              const std::vector<string>& output_names,
+                              const std::vector<string>& target_names) override;
+
+ private:
+  std::unordered_set<string> debug_urls_;
+};
+
+class DebugGraphDecorator : public DebugGraphDecoratorInterface {
+ public:
+  DebugGraphDecorator(const DebugOptions& debug_options)
+      : debug_options_(debug_options) {}
+  virtual ~DebugGraphDecorator() {}
+
+  Status DecorateGraph(Graph* graph, Device* device) override;
+  Status PublishGraph(const Graph& graph) override;
+
+ private:
+  DebugOptions debug_options_;
+};
+
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_DEBUGGER_STATE_IMPL_H_
diff --git a/tensorflow/core/debug/grpc_session_debug_test.cc b/tensorflow/core/debug/grpc_session_debug_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..6c68729410cea69d36015bb86ee7558af1e3d520
--- /dev/null
+++ b/tensorflow/core/debug/grpc_session_debug_test.cc
@@ -0,0 +1,288 @@
+/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/distributed_runtime/rpc/grpc_session.h"
+
+#include "tensorflow/core/common_runtime/device.h"
+#include "tensorflow/core/debug/debug_io_utils.h"
+#include "tensorflow/core/distributed_runtime/rpc/grpc_testlib.h"
+#include "tensorflow/core/framework/graph.pb.h"
+#include "tensorflow/core/framework/op.h"
+#include "tensorflow/core/framework/summary.pb.h"
+#include "tensorflow/core/framework/tensor_testutil.h"
+#include "tensorflow/core/graph/default_device.h"
+#include "tensorflow/core/graph/graph.h"
+#include "tensorflow/core/graph/testlib.h"
+#include "tensorflow/core/lib/core/error_codes.pb.h"
+#include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/lib/io/path.h"
+#include "tensorflow/core/lib/strings/strcat.h"
+#include "tensorflow/core/platform/env.h"
+#include "tensorflow/core/platform/init_main.h"
+#include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/platform/test.h"
+#include "tensorflow/core/protobuf/debug.pb.h"
+#include "tensorflow/core/protobuf/master.pb.h"
+#include "tensorflow/core/public/session.h"
+#include "tensorflow/core/util/port.h"
+
+namespace tensorflow {
+
+static SessionOptions Devices(int num_cpus, int num_gpus) {
+  SessionOptions result;
+  (*result.config.mutable_device_count())["CPU"] = num_cpus;
+  (*result.config.mutable_device_count())["GPU"] = num_gpus;
+  return result;
+}
+
+void CreateGraphDef(GraphDef* graph_def, string node_names[3]) {
+  Graph graph(OpRegistry::Global());
+
+  Tensor a_tensor(DT_FLOAT, TensorShape({1, 2}));
+  test::FillValues<float>(&a_tensor, {1.0, 2.0});
+  Node* a = test::graph::Constant(&graph, a_tensor);
+  node_names[0] = a->name();
+
+  Tensor b_tensor(DT_FLOAT, TensorShape({2, 1}));
+  test::FillValues<float>(&b_tensor, {2.0, 1.0});
+  Node* b = test::graph::Constant(&graph, b_tensor);
+  node_names[1] = b->name();
+
+  // c = a * b
+  Node* c = test::graph::Matmul(&graph, a, b, false, false);
+  node_names[2] = c->name();
+
+  test::graph::ToGraphDef(&graph, graph_def);
+}
+
+// Asserts that "val" is a single float tensor. The only float is
+// "expected_val".
+static void IsSingleFloatValue(const Tensor& val, float expected_val) {
+  ASSERT_EQ(val.dtype(), DT_FLOAT);
+  ASSERT_EQ(val.NumElements(), 1);
+  ASSERT_EQ(val.flat<float>()(0), expected_val);
+}
+
+static SessionOptions Options(const string& target, int placement_period) {
+  SessionOptions options;
+  // NOTE(mrry): GrpcSession requires a grpc:// scheme prefix in the target
+  // string.
+  options.target = strings::StrCat("grpc://", target);
+  options.config.set_placement_period(placement_period);
+  options.config.mutable_graph_options()
+      ->mutable_optimizer_options()
+      ->set_opt_level(OptimizerOptions::L0);
+  return options;
+}
+
+static Session* NewRemote(const SessionOptions& options) {
+  return CHECK_NOTNULL(NewSession(options));
+}
+
+class GrpcSessionDebugTest : public ::testing::Test {
+ protected:
+  void SetUp() override { CreateDumpDir(); }
+
+  void TearDown() override { DeleteDumpDir(); }
+
+  void DeleteDumpDir() {
+    if (Env::Default()->IsDirectory(dump_dir_).ok()) {
+      int64 undeleted_files = 0;
+      int64 undeleted_dirs = 0;
+      ASSERT_TRUE(
+          Env::Default()
+              ->DeleteRecursively(dump_dir_, &undeleted_files, &undeleted_dirs)
+              .ok());
+      ASSERT_EQ(0, undeleted_files);
+      ASSERT_EQ(0, undeleted_dirs);
+    }
+  }
+
+  const string GetDebugURL() { return debug_url_; }
+
+  void LoadTensorDumps(const string& subdir, std::vector<Tensor>* tensors) {
+    const string dirpath = io::JoinPath(dump_dir_, subdir);
+    if (!(Env::Default()->IsDirectory(dirpath).ok())) {
+      return;
+    }
+
+    std::vector<string> filenames;
+    TF_ASSERT_OK(Env::Default()->GetChildren(dirpath, &filenames));
+
+    for (const string& filename : filenames) {
+      Event event;
+      TF_ASSERT_OK(ReadEventFromFile(io::JoinPath(dirpath, filename), &event));
+      if (event.summary().value().size() == 1) {
+        Tensor tensor;
+        ASSERT_TRUE(tensor.FromProto(event.summary().value(0).tensor()));
+        tensors->push_back(tensor);
+      }
+    }
+  }
+
+ private:
+  void CreateDumpDir() {
+    char dir_template[] = "/tmp/tfdbg_grpc_sessions_XXXXXX";
+    dump_dir_ = mkdtemp(dir_template);
+    debug_url_ = strings::StrCat("file://", dump_dir_);
+  }
+
+  string dump_dir_;
+  string debug_url_;
+};
+
+TEST_F(GrpcSessionDebugTest, FileDebugURL) {
+  GraphDef graph;
+  string node_names[3];
+  CreateGraphDef(&graph, node_names);
+
+  std::unique_ptr<test::TestCluster> cluster;
+  TF_CHECK_OK(test::TestCluster::MakeTestCluster(Devices(1, 0), 2, &cluster));
+
+  std::unique_ptr<Session> session(
+      NewRemote(Options(cluster->targets()[0], 1)));
+  ASSERT_TRUE(session != nullptr);
+  TF_CHECK_OK(session->Create(graph));
+
+  // Iteration 0: No watch.
+  // Iterations 1 and 2: Watch one Tensor.
+  // Iterations 3 and 4: Watch two Tensors.
+  // Iteration 5: No watch.
+  for (size_t i = 0; i < 6; ++i) {
+    RunOptions options;
+    if (i >= 1 && i < 5) {
+      DebugOptions* debug_options = options.mutable_debug_options();
+      DebugTensorWatch* watch = debug_options->add_debug_tensor_watch_opts();
+      watch->set_node_name(node_names[0]);
+      watch->set_output_slot(0);
+      watch->add_debug_ops("DebugIdentity");
+      watch->add_debug_urls(GetDebugURL());
+
+      if (i >= 3) {
+        watch = debug_options->add_debug_tensor_watch_opts();
+        watch->set_node_name(node_names[1]);
+        watch->set_output_slot(0);
+        watch->add_debug_ops("DebugIdentity");
+        watch->add_debug_urls(GetDebugURL());
+      }
+    }
+
+    RunMetadata metadata;
+    std::vector<Tensor> outputs;
+    TF_CHECK_OK(
+        session->Run(options, {}, {node_names[2]}, {}, &outputs, &metadata));
+    ASSERT_EQ(1, outputs.size());
+    IsSingleFloatValue(outputs[0], 4.0);
+
+    std::vector<Tensor> dumped_tensors;
+    LoadTensorDumps("n", &dumped_tensors);
+
+    if (i == 0 || i == 5) {
+      ASSERT_EQ(0, dumped_tensors.size());
+    } else {
+      if (i == 1 || i == 2) {
+        ASSERT_EQ(1, dumped_tensors.size());
+        ASSERT_EQ(TensorShape({1, 2}), dumped_tensors[0].shape());
+        ASSERT_EQ(1.0, dumped_tensors[0].flat<float>()(0));
+        ASSERT_EQ(2.0, dumped_tensors[0].flat<float>()(1));
+      } else {
+        ASSERT_EQ(2, dumped_tensors.size());
+      }
+      DeleteDumpDir();
+    }
+  }
+  TF_CHECK_OK(session->Close());
+}
+
+void SetDevice(GraphDef* graph, const string& name, const string& dev) {
+  for (size_t i = 0; i < graph->node_size(); ++i) {
+    if (graph->node(i).name() == name) {
+      graph->mutable_node(i)->set_device(dev);
+      return;
+    }
+  }
+  LOG(FATAL) << "Name '" << name << "' not found.";
+}
+
+TEST_F(GrpcSessionDebugTest, MultiDevices_String) {
+  std::unique_ptr<test::TestCluster> cluster;
+  TF_CHECK_OK(test::TestCluster::MakeTestCluster(Devices(1, 1), 2, &cluster));
+  std::unique_ptr<Session> session(
+      NewRemote(Options(cluster->targets()[0], 1000)));
+  ASSERT_TRUE(session != nullptr);
+
+  // b = a
+  Graph graph(OpRegistry::Global());
+  Tensor a_tensor(DT_STRING, TensorShape({2, 2}));
+  for (size_t i = 0; i < 4; ++i) {
+    a_tensor.flat<string>()(i) = "hello, world";
+  }
+  Node* a = test::graph::Constant(&graph, a_tensor);
+  Node* b = test::graph::Identity(&graph, a);
+
+  GraphDef def;
+  test::graph::ToGraphDef(&graph, &def);
+
+  // In this test, we force each node (a, b) on every possible device.
+  // We test all possible cases.
+  for (const auto& a_dev : cluster->devices()) {
+    for (const auto& b_dev : cluster->devices()) {
+      LOG(INFO) << "a: " << a_dev.name() << " b: " << b_dev.name();
+      SetDevice(&def, a->name(), a_dev.name());
+      SetDevice(&def, b->name(), b_dev.name());
+
+      Status s = session->Create(def);
+      if (s.ok()) {
+        std::vector<Tensor> outputs;
+
+        RunOptions options;
+        DebugOptions* debug_options = options.mutable_debug_options();
+        DebugTensorWatch* watch = debug_options->add_debug_tensor_watch_opts();
+        watch->set_node_name(a->name());
+        watch->set_output_slot(0);
+        watch->add_debug_ops("DebugIdentity");
+        watch->add_debug_urls(GetDebugURL());
+
+        RunMetadata metadata;
+        TF_CHECK_OK(
+            session->Run(options, {}, {b->name()}, {}, &outputs, &metadata));
+        ASSERT_EQ(1, outputs.size());
+        ASSERT_EQ(outputs[0].dtype(), DT_STRING);
+        ASSERT_EQ(outputs[0].NumElements(), 4);
+        for (size_t i = 0; i < outputs[0].NumElements(); ++i) {
+          EXPECT_EQ(outputs[0].flat<string>()(i), "hello, world");
+        }
+        TF_CHECK_OK(session->Close());
+
+        std::vector<Tensor> dumped_tensors;
+        LoadTensorDumps("n", &dumped_tensors);
+        ASSERT_EQ(1, dumped_tensors.size());
+        ASSERT_EQ(TensorShape({2, 2}), dumped_tensors[0].shape());
+        for (size_t i = 0; i < 4; ++i) {
+          ASSERT_EQ("hello, world", dumped_tensors[0].flat<string>()(i));
+        }
+
+        DeleteDumpDir();
+      } else {
+        LOG(ERROR) << "Error: " << s;
+        ASSERT_TRUE((a_dev.device_type() == DEVICE_GPU) ||
+                    (b_dev.device_type() == DEVICE_GPU));
+        ASSERT_FALSE(s.ok());
+      }
+    }
+  }
+}
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/distributed_runtime/BUILD b/tensorflow/core/distributed_runtime/BUILD
index 0c2d2b5d5d966be6beb9eb367386a5dcc26cfcf5..d2a828f39f2387d21f99417567a6c467e9651fdf 100644
--- a/tensorflow/core/distributed_runtime/BUILD
+++ b/tensorflow/core/distributed_runtime/BUILD
@@ -77,7 +77,6 @@ cc_library(
     ],
     deps = [
         ":graph_mgr",
-        ":rendezvous_mgr_interface",
         ":worker_cache",
         "//tensorflow/core:master_proto_cc",
         "//tensorflow/core:protos_all_cc",
@@ -92,9 +91,9 @@ cc_library(
     deps = [
         ":graph_mgr",
         ":worker_session",
+        "//tensorflow/core:core_cpu_internal",
         "//tensorflow/core:lib",
         "//tensorflow/core:protos_all_cc",
-        "//tensorflow/core/distributed_runtime/rpc:rpc_rendezvous_mgr",
     ],
 )
 
@@ -237,6 +236,7 @@ cc_library(
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
         "//tensorflow/core:master_proto_cc",
+        "//tensorflow/core:protos_all_cc",
         "//tensorflow/core:worker_proto_cc",
     ],
 )
@@ -259,6 +259,7 @@ cc_library(
         "//tensorflow/core:lib_internal",
         "//tensorflow/core:master_proto_cc",
         "//tensorflow/core:protos_all_cc",
+        "//tensorflow/core/debug:debug_graph_utils",
     ],
 )
 
@@ -329,6 +330,7 @@ cc_library(
         "//tensorflow/core:lib_internal",
         "//tensorflow/core:protos_all_cc",
         "//tensorflow/core:worker_proto_cc",
+        "//tensorflow/core/debug",
     ],
 )
 
diff --git a/tensorflow/core/distributed_runtime/README.md b/tensorflow/core/distributed_runtime/README.md
index ab1771e29426605eb225e7a1d5df32f82c806649..d22cd2a45bc68ee8ff5015327f5e56c24879b8f9 100644
--- a/tensorflow/core/distributed_runtime/README.md
+++ b/tensorflow/core/distributed_runtime/README.md
@@ -5,6 +5,4 @@ distributed TensorFlow runtime, using [gRPC](http://grpc.io) for inter-process
 communication.
 
 To learn how to use the distributed runtime to create a TensorFlow cluster,
-see the "Distributed TensorFlow" How To, which is available [in this
-repository](../../g3doc/how_tos/distributed/index.md), and will be available
-on the TensorFlow website after the next version is released.
+see the [Distributed TensorFlow](https://www.tensorflow.org/deploy/distributed) How-To.
diff --git a/tensorflow/core/distributed_runtime/base_rendezvous_mgr.cc b/tensorflow/core/distributed_runtime/base_rendezvous_mgr.cc
index 5863727f19b6f5410c10f699d84389ce95d221f1..e68aea46ecd436d557d8394c3544684965a81878 100644
--- a/tensorflow/core/distributed_runtime/base_rendezvous_mgr.cc
+++ b/tensorflow/core/distributed_runtime/base_rendezvous_mgr.cc
@@ -35,9 +35,8 @@ limitations under the License.
 
 namespace tensorflow {
 
-BaseRendezvousMgr::BaseRendezvousMgr(const WorkerEnv* worker_env,
-                                     const string& worker_name)
-    : worker_env_(worker_env), worker_name_(worker_name) {}
+BaseRendezvousMgr::BaseRendezvousMgr(const WorkerEnv* worker_env)
+    : worker_env_(worker_env) {}
 
 BaseRendezvousMgr::~BaseRendezvousMgr() {
   for (auto& p : table_) {
@@ -47,7 +46,7 @@ BaseRendezvousMgr::~BaseRendezvousMgr() {
   }
 }
 
-Rendezvous* BaseRendezvousMgr::Find(int64 step_id) {
+RemoteRendezvous* BaseRendezvousMgr::Find(int64 step_id) {
   return FindOrCreate(step_id);
 }
 
@@ -55,7 +54,7 @@ BaseRemoteRendezvous* BaseRendezvousMgr::FindOrCreate(int64 step_id) {
   mutex_lock l(mu_);
   Table::iterator iter = table_.find(step_id);
   if (iter == table_.end()) {
-    auto rr = Create(step_id, worker_env_, worker_name_);
+    auto rr = Create(step_id, worker_env_);
     iter = table_.insert({step_id, rr}).first;
   }
   iter->second->Ref();
@@ -128,14 +127,12 @@ void BaseRendezvousMgr::CleanupAll() {
   }
 }
 
-BaseRemoteRendezvous::BaseRemoteRendezvous(const WorkerEnv* env,
-                                           const string& worker_name,
-                                           int64 step_id,
+BaseRemoteRendezvous::BaseRemoteRendezvous(const WorkerEnv* env, int64 step_id,
                                            bool tolerate_dup_recv)
     : env_(env),
-      worker_name_(worker_name),
       step_id_(step_id),
-      local_(NewLocalRendezvous(tolerate_dup_recv)) {}
+      local_(NewLocalRendezvous(tolerate_dup_recv)),
+      session_(nullptr) {}
 
 BaseRemoteRendezvous::~BaseRemoteRendezvous() {
   CHECK(active_.empty());
@@ -150,6 +147,41 @@ static bool IsLocalDevice(const string& worker_name,
   return device_name.starts_with(worker_name);
 }
 
+Status BaseRemoteRendezvous::Initialize(WorkerSession* session) {
+  CHECK_NE(session, nullptr) << "session must not be null!";
+  std::vector<DeferredCall> deferred_calls;
+  {
+    mutex_lock l(mu_);
+    if (session_ != nullptr) {
+      if (session_->worker_name == session->worker_name) {
+        LOG(INFO) << "Skipping rendezvous re-initialization.";
+        return Status::OK();
+      }
+      Status s = errors::Internal(
+          "Double init! Worker names would have changed from: ",
+          session_->worker_name, " -> ", session->worker_name);
+      LOG(WARNING) << s;
+      return s;
+    }
+    session_ = session;
+    std::swap(deferred_calls, deferred_calls_);
+  }
+  for (DeferredCall& call : deferred_calls) {
+    RecvLocalAsyncInternal(call.parsed, std::move(call.done));
+  }
+  return Status::OK();
+}
+
+WorkerSession* BaseRemoteRendezvous::session() {
+  mutex_lock l(mu_);
+  return session_;
+}
+
+bool BaseRemoteRendezvous::is_initialized() {
+  mutex_lock l(mu_);
+  return is_initialized_locked();
+}
+
 Status BaseRemoteRendezvous::Send(const Rendezvous::ParsedKey& parsed,
                                   const Rendezvous::Args& args,
                                   const Tensor& val, const bool is_dead) {
@@ -157,10 +189,12 @@ Status BaseRemoteRendezvous::Send(const Rendezvous::ParsedKey& parsed,
   {
     mutex_lock l(mu_);
     if (!status_.ok()) return status_;
-  }
-  if (!IsLocalDevice(worker_name_, parsed.src_device)) {
-    return errors::InvalidArgument("Invalid rendezvous key (src): ",
-                                   parsed.FullKey(), " @ ", worker_name_);
+    DCHECK(is_initialized_locked());
+    if (!IsLocalDevice(session_->worker_name, parsed.src_device)) {
+      return errors::InvalidArgument(
+          "Invalid rendezvous key (src): ", parsed.FullKey(), " @ ",
+          session_->worker_name);
+    }
   }
   // Buffers "val" and "device_context" in local_.
   return local_->Send(parsed, args, val, is_dead);
@@ -168,17 +202,24 @@ Status BaseRemoteRendezvous::Send(const Rendezvous::ParsedKey& parsed,
 
 Status BaseRemoteRendezvous::ValidateDevices(const ParsedKey& parsed,
                                              bool is_src) {
+  // Cache session pointer to avoid repeatedly taking & releasing the lock
+  // (e.g. calling session())
+  WorkerSession* sess = nullptr;
   {
     mutex_lock l(mu_);
     if (!status_.ok()) return status_;
+    if (!is_initialized_locked()) {
+      return errors::Internal("ValidateDevices called before initialization.");
+    }
+    sess = session_;
   }
-  if (is_src && !IsLocalDevice(worker_name_, parsed.src_device)) {
+  if (is_src && !IsLocalDevice(sess->worker_name, parsed.src_device)) {
     return errors::InvalidArgument("Invalid rendezvous key (src): ",
-                                   parsed.FullKey(), " @ ", worker_name_);
+                                   parsed.FullKey(), " @ ", sess->worker_name);
   }
-  if (!is_src && !IsLocalDevice(worker_name_, parsed.dst_device)) {
+  if (!is_src && !IsLocalDevice(sess->worker_name, parsed.dst_device)) {
     return errors::InvalidArgument("Invalid rendezvous key (dst): ",
-                                   parsed.FullKey(), " @ ", worker_name_);
+                                   parsed.FullKey(), " @ ", sess->worker_name);
   }
   return Status::OK();
 }
@@ -244,6 +285,7 @@ void BaseRemoteRendezvous::RecvAsync(const ParsedKey& parsed,
                                      const Rendezvous::Args& recv_args,
                                      DoneCallback done) {
   VLOG(1) << "RemoteRendezvous Recv " << this << " " << parsed.FullKey();
+  CHECK(is_initialized()) << "RecvAsync called when uninitialized.";
   Status s = ValidateDevices(parsed, false /*!is_src*/);
   if (!s.ok()) {
     done(s, Args(), recv_args, Tensor(), false);
@@ -280,6 +322,26 @@ void BaseRemoteRendezvous::RecvAsync(const ParsedKey& parsed,
 
 void BaseRemoteRendezvous::RecvLocalAsync(const ParsedKey& parsed,
                                           DoneCallback done) {
+  {
+    mutex_lock l(mu_);
+    if (!is_initialized_locked()) {
+      // RecvLocalAsync can be called (due to an incoming RecvTensor RPC from a
+      // remote worker) before the RunStep (or PartialRunStep) RPC from the
+      // master arrives. RecvLocalAsync thus buffers the arguments until after
+      // the RemoteRendezvous is Initialize()'d, when it completes the
+      // rendezvous logic. At some point after Initialize() is called, a Tensor
+      // is produced locally that will then be sent in response to the incoming
+      // RPC.
+      DeferredCall call(parsed, std::move(done));
+      deferred_calls_.push_back(call);
+      return;
+    }
+  }
+  RecvLocalAsyncInternal(parsed, std::move(done));
+}
+
+void BaseRemoteRendezvous::RecvLocalAsyncInternal(const ParsedKey& parsed,
+                                                  DoneCallback done) {
   Status s = ValidateDevices(parsed, true /* is_src */);
   if (!s.ok()) {
     done(s, Args(), Args(), Tensor(), false);
@@ -318,4 +380,8 @@ void BaseRemoteRendezvous::DeregisterCall(BaseRecvTensorCall* call) {
   active_.erase(call);
 }
 
+BaseRemoteRendezvous::DeferredCall::DeferredCall(const ParsedKey& parsed,
+                                                 DoneCallback done)
+    : parsed(parsed), done(std::move(done)) {}
+
 }  // end namespace tensorflow
diff --git a/tensorflow/core/distributed_runtime/base_rendezvous_mgr.h b/tensorflow/core/distributed_runtime/base_rendezvous_mgr.h
index 447a75913d64bb58aea51de0c571cab4897dc448..b252f45fe96354f8e2a91a5aa3a05f1a937e3939 100644
--- a/tensorflow/core/distributed_runtime/base_rendezvous_mgr.h
+++ b/tensorflow/core/distributed_runtime/base_rendezvous_mgr.h
@@ -59,15 +59,17 @@ class BaseRecvTensorCall;
 // RendezvousMgr must have keys generated by Rendezvous::CreateKey().
 class BaseRendezvousMgr : public RendezvousMgrInterface {
  public:
-  explicit BaseRendezvousMgr(const WorkerEnv* worker_env,
-                             const string& worker_name);
+  explicit BaseRendezvousMgr(const WorkerEnv* worker_env);
 
   ~BaseRendezvousMgr() override;
 
   // Returns Rendezvous supporting send and recv among workers in the
   // "step_id".  The caller takes ownership of one reference on the
   // returned Rendezvous instance.
-  Rendezvous* Find(int64 step_id) override;
+  //
+  // Note: the caller must guarantee to eventually call Initialize on the
+  // returned RemoteRendezvous
+  RemoteRendezvous* Find(int64 step_id) override;
 
   // Finds the local rendezvous instance for the "step_id".  Runs
   // "done" when the tensor for "key" is produced or an error occurs.
@@ -91,8 +93,7 @@ class BaseRendezvousMgr : public RendezvousMgrInterface {
 
  protected:
   virtual BaseRemoteRendezvous* Create(int64 step_id,
-                                       const WorkerEnv* worker_env,
-                                       const string& worker_name) = 0;
+                                       const WorkerEnv* worker_env) = 0;
 
  private:
   // Maps step_id to rendezvous.
@@ -100,7 +101,6 @@ class BaseRendezvousMgr : public RendezvousMgrInterface {
 
   // Not owned.
   const WorkerEnv* const worker_env_;
-  const string worker_name_;
 
   mutex mu_;
   Table table_ GUARDED_BY(mu_);
@@ -116,10 +116,13 @@ class BaseRendezvousMgr : public RendezvousMgrInterface {
 // Buffering of Tensor values is delegated to a "local" Rendezvous
 // obtained from NewLocalRendezvous().  This class just adds
 // functionality to coordinate with remote workers.
-class BaseRemoteRendezvous : public Rendezvous {
+class BaseRemoteRendezvous : public RemoteRendezvous {
  public:
-  BaseRemoteRendezvous(const WorkerEnv* env, const string& worker_name,
-                       int64 step_id, bool tolerate_dup_recv);
+  BaseRemoteRendezvous(const WorkerEnv* env, int64 step_id,
+                       bool tolerate_dup_recv);
+
+  // Upgrades the BaseRemoteRendezvous to full initialization.
+  Status Initialize(WorkerSession* session) override;
 
   // Forwards to local_, where the Tensor "val" will be buffered and
   // any waiting callback stored.
@@ -163,10 +166,13 @@ class BaseRemoteRendezvous : public Rendezvous {
   // Removes "call" from active_ if "call" is in active_.
   void DeregisterCall(BaseRecvTensorCall* call);
 
+  WorkerSession* session();
+
+  bool is_initialized();
+
   ~BaseRemoteRendezvous() override;
 
   const WorkerEnv* const env_;  // Not owned.
-  const string worker_name_;
   const int64 step_id_;
 
  private:
@@ -176,10 +182,24 @@ class BaseRemoteRendezvous : public Rendezvous {
 
   // Status given by StartAbort() if any.
   Status status_ GUARDED_BY(mu_);
+  WorkerSession* session_ GUARDED_BY(mu_);  // Not owned.
+
+  // Data structures to handle calls when partially initialized.
+  struct DeferredCall {
+    const ParsedKey parsed;
+    DoneCallback done;
+
+    DeferredCall(const ParsedKey& parsed, DoneCallback done);
+  };
+  std::vector<DeferredCall> deferred_calls_ GUARDED_BY(mu_);
 
   // Active outstanding RecvTensor calls.
   gtl::FlatSet<BaseRecvTensorCall*> active_ GUARDED_BY(mu_);
 
+  bool is_initialized_locked() EXCLUSIVE_LOCKS_REQUIRED(mu_) {
+    return session_ != nullptr;
+  }
+
   // If "is_src" is true, checks that the rendezvous key "parsed"'s
   // source is in this process. If "is_src" is false, checks that the
   // rendezvous key "parsed"'s destination is in this process.
@@ -194,6 +214,9 @@ class BaseRemoteRendezvous : public Rendezvous {
                           const Rendezvous::Args& out_args, const Tensor& in,
                           Tensor* out, StatusCallback done);
 
+  // Must be called only if fully initialized.
+  void RecvLocalAsyncInternal(const ParsedKey& parsed, DoneCallback done);
+
   TF_DISALLOW_COPY_AND_ASSIGN(BaseRemoteRendezvous);
 };
 
diff --git a/tensorflow/core/distributed_runtime/graph_mgr.cc b/tensorflow/core/distributed_runtime/graph_mgr.cc
index 36b7b5b6281e04ee1118d92d64578d21604d0aa6..5bde771e8deba892288740583851111de37405d0 100644
--- a/tensorflow/core/distributed_runtime/graph_mgr.cc
+++ b/tensorflow/core/distributed_runtime/graph_mgr.cc
@@ -18,6 +18,7 @@ limitations under the License.
 #include <vector>
 
 #include "tensorflow/core/common_runtime/constant_folding.h"
+#include "tensorflow/core/common_runtime/debugger_state_interface.h"
 #include "tensorflow/core/common_runtime/device.h"
 #include "tensorflow/core/common_runtime/device_mgr.h"
 #include "tensorflow/core/common_runtime/function.h"
@@ -45,10 +46,8 @@ limitations under the License.
 
 namespace tensorflow {
 
-GraphMgr::GraphMgr(const WorkerEnv* worker_env,
-                   RendezvousMgrInterface* rendezvous_mgr)
-    : worker_env_(worker_env), rendezvous_mgr_(rendezvous_mgr), table_(5) {
-  CHECK(rendezvous_mgr) << "Rendezvous mgr was null";
+GraphMgr::GraphMgr(const WorkerEnv* worker_env, DeviceMgr* device_mgr)
+    : worker_env_(worker_env), device_mgr_(device_mgr), table_(5) {
   // The default value of sync_on_finish will be flipped soon and this
   // environment variable will be removed as well.
   Status status =
@@ -94,6 +93,16 @@ static Status ValidateGraphDefForDevices(const GraphDef& gdef) {
   return Status::OK();
 }
 
+Status GraphMgr::DecorateAndPublishGraphForDebug(
+    const DebugOptions& debug_options, Graph* graph, Device* device) {
+  std::unique_ptr<DebugGraphDecoratorInterface> decorator;
+  TF_RETURN_IF_ERROR(
+      DebugGraphDecoratorRegistry::CreateDecorator(debug_options, &decorator));
+  TF_RETURN_IF_ERROR(decorator->DecorateGraph(graph, device));
+  TF_RETURN_IF_ERROR(decorator->PublishGraph(*graph));
+  return Status::OK();
+}
+
 // Creates executors given a graph definition "gdef" of a "session".
 // If a node in "gdef" is shared by other graphs in "session", the
 // same op kernel is reused. E.g., typically a params node is shared
@@ -106,7 +115,8 @@ static Status ValidateGraphDefForDevices(const GraphDef& gdef) {
 // "executors" are filled with one executor per device if success and
 // the caller takes the ownership of returned executors.
 Status GraphMgr::InitItem(const string& session, const GraphDef& gdef,
-                          const GraphOptions& graph_options, Item* item) {
+                          const GraphOptions& graph_options,
+                          const DebugOptions& debug_options, Item* item) {
   item->session = session;
   item->lib_def =
       new FunctionLibraryDefinition(OpRegistry::Global(), gdef.library());
@@ -136,7 +146,7 @@ Status GraphMgr::InitItem(const string& session, const GraphDef& gdef,
   };
   popts.get_incarnation = [this](const string& name) -> int64 {
     Device* device = nullptr;
-    Status s = worker_env_->device_mgr->LookupDevice(name, &device);
+    Status s = device_mgr_->LookupDevice(name, &device);
     if (s.ok()) {
       return device->attributes().incarnation();
     } else {
@@ -181,8 +191,7 @@ Status GraphMgr::InitItem(const string& session, const GraphDef& gdef,
     ExecutionUnit* unit = &(item->units.back());
 
     // Find the device.
-    Status s =
-        worker_env_->device_mgr->LookupDevice(device_name, &unit->device);
+    Status s = device_mgr_->LookupDevice(device_name, &unit->device);
     if (!s.ok()) {
       // Remove the empty unit from the item as the item destructor wants all
       // units to have valid devices.
@@ -202,7 +211,7 @@ Status GraphMgr::InitItem(const string& session, const GraphDef& gdef,
 
     // Function library runtime.
     unit->lib = NewFunctionLibraryRuntime(
-        worker_env_->device_mgr, worker_env_->env, unit->device,
+        device_mgr_, worker_env_->env, unit->device,
         subgraph->versions().producer(), item->lib_def,
         graph_options.optimizer_options());
 
@@ -232,6 +241,13 @@ Status GraphMgr::InitItem(const string& session, const GraphDef& gdef,
     };
 
     optimizer.Optimize(lib, worker_env_->env, params.device, &subgraph);
+
+    // EXPERIMENTAL: tfdbg inserts debug nodes (i.e., probes) to the graph.
+    if (!debug_options.debug_tensor_watch_opts().empty()) {
+      TF_RETURN_IF_ERROR(DecorateAndPublishGraphForDebug(
+          debug_options, subgraph.get(), params.device));
+    }
+
     TF_RETURN_IF_ERROR(
         EnsureMemoryTypes(DeviceType(unit->device->device_type()),
                           unit->device->name(), subgraph.get()));
@@ -247,9 +263,10 @@ Status GraphMgr::InitItem(const string& session, const GraphDef& gdef,
 }
 
 Status GraphMgr::Register(const string& session, const GraphDef& gdef,
-                          const GraphOptions& graph_options, string* handle) {
+                          const GraphOptions& graph_options,
+                          const DebugOptions& debug_options, string* handle) {
   Item* item = new Item;
-  Status s = InitItem(session, gdef, graph_options, item);
+  Status s = InitItem(session, gdef, graph_options, debug_options, item);
   if (!s.ok()) {
     item->Unref();
     return s;
@@ -399,14 +416,14 @@ void GraphMgr::RecvOutputsFromRendezvousAsync(Rendezvous* rendezvous,
 }
 
 Status GraphMgr::SendInputs(const int64 step_id, const NamedTensors& in) {
-  Rendezvous* rendezvous = rendezvous_mgr_->Find(step_id);
+  Rendezvous* rendezvous = worker_env_->rendezvous_mgr->Find(step_id);
   Status s = SendInputsToRendezvous(rendezvous, in);
   rendezvous->Unref();
   return s;
 }
 
 Status GraphMgr::RecvOutputs(const int64 step_id, NamedTensors* out) {
-  Rendezvous* rendezvous = rendezvous_mgr_->Find(step_id);
+  Rendezvous* rendezvous = worker_env_->rendezvous_mgr->Find(step_id);
   Status s = RecvOutputsFromRendezvous(rendezvous, out);
   rendezvous->Unref();
   return s;
@@ -414,7 +431,7 @@ Status GraphMgr::RecvOutputs(const int64 step_id, NamedTensors* out) {
 
 void GraphMgr::RecvOutputsAsync(const int64 step_id, NamedTensors* out,
                                 StatusCallback done) {
-  Rendezvous* rendezvous = rendezvous_mgr_->Find(step_id);
+  Rendezvous* rendezvous = worker_env_->rendezvous_mgr->Find(step_id);
   RecvOutputsFromRendezvousAsync(rendezvous, out,
                                  [done, rendezvous](const Status s) {
                                    rendezvous->Unref();
@@ -423,7 +440,8 @@ void GraphMgr::RecvOutputsAsync(const int64 step_id, NamedTensors* out,
 }
 
 void GraphMgr::ExecuteAsync(const string& handle, const int64 step_id,
-                            const ExecutorOpts& opts,
+                            WorkerSession* session,
+                            const ExecutorOpts& /*opts*/,
                             StepStatsCollector* collector,
                             CostGraphDef* cost_graph,
                             CancellationManager* cancellation_manager,
@@ -444,10 +462,14 @@ void GraphMgr::ExecuteAsync(const string& handle, const int64 step_id,
     return;
   }
 
-  Rendezvous* rendezvous = rendezvous_mgr_->Find(step_id);
+  RemoteRendezvous* rendezvous = worker_env_->rendezvous_mgr->Find(step_id);
+  Status s = rendezvous->Initialize(session);
 
   // Sends values specified by the caller.
-  Status s = SendInputsToRendezvous(rendezvous, in);
+  if (s.ok()) {
+    s = SendInputsToRendezvous(rendezvous, in);
+  }
+
   if (!s.ok()) {
     done(s);
     item->Unref();
@@ -472,10 +494,9 @@ void GraphMgr::StartParallelExecutors(const string& handle, int64 step_id,
                                       StatusCallback done) {
   const int num_units = item->units.size();
   CHECK_GE(num_units, 1);
-  ScopedStepContainer* step_container =
-      new ScopedStepContainer(step_id, [this](const string& name) {
-        worker_env_->device_mgr->ClearContainers({name});
-      });
+  ScopedStepContainer* step_container = new ScopedStepContainer(
+      step_id,
+      [this](const string& name) { device_mgr_->ClearContainers({name}); });
   // NOTE: Transfer one ref of rendezvous and item.
   ExecutorBarrier* barrier =
       new ExecutorBarrier(num_units, rendezvous,
diff --git a/tensorflow/core/distributed_runtime/graph_mgr.h b/tensorflow/core/distributed_runtime/graph_mgr.h
index 5f51d6385782feb5d801c3c73fd32effa8c52397..50391f47e4d46b885b320803f76bb0da5015ed56 100644
--- a/tensorflow/core/distributed_runtime/graph_mgr.h
+++ b/tensorflow/core/distributed_runtime/graph_mgr.h
@@ -30,12 +30,15 @@ limitations under the License.
 #include "tensorflow/core/platform/mutex.h"
 #include "tensorflow/core/platform/types.h"
 #include "tensorflow/core/protobuf/config.pb.h"
+#include "tensorflow/core/protobuf/debug.pb.h"
 
 namespace tensorflow {
 
 class ExecutorOpts;
 class StepStatsCollector;
 class RendezvousMgrInterface;
+class DeviceMgr;
+struct WorkerSession;
 
 // GraphMgr keeps track of a set of graphs that are registered with a
 // TensorFlow worker. Each registered graph is identified by a handle
@@ -61,13 +64,13 @@ class RendezvousMgrInterface;
 //   EXPECT_EQ(out["c"], Tensor({4, 6}));
 class GraphMgr {
  public:
-  explicit GraphMgr(const WorkerEnv* worker_env,
-                    RendezvousMgrInterface* rendezvous_mgr);
+  explicit GraphMgr(const WorkerEnv* worker_env, DeviceMgr* device_mgr);
   ~GraphMgr();
 
   // Registers a graph. Fills in "handle"
   Status Register(const string& session, const GraphDef& gdef,
-                  const GraphOptions& graph_options, string* handle);
+                  const GraphOptions& graph_options,
+                  const DebugOptions& debug_options, string* handle);
 
   // Executes one step of a registered graph "handle".
   //
@@ -76,8 +79,8 @@ class GraphMgr {
   typedef std::map<string, Tensor> NamedTensors;
   typedef std::function<void(const Status&)> StatusCallback;
   void ExecuteAsync(const string& handle, const int64 step_id,
-                    const ExecutorOpts& opts, StepStatsCollector* collector,
-                    CostGraphDef* cost_graph,
+                    WorkerSession* session, const ExecutorOpts& opts,
+                    StepStatsCollector* collector, CostGraphDef* cost_graph,
                     CancellationManager* cancellation_manager,
                     const NamedTensors& in, StatusCallback done);
 
@@ -129,7 +132,7 @@ class GraphMgr {
   };
 
   const WorkerEnv* worker_env_;             // Not owned.
-  RendezvousMgrInterface* rendezvous_mgr_;  // Not owned.
+  DeviceMgr* device_mgr_;
 
   CostModelManager cost_model_manager_;
 
@@ -167,7 +170,11 @@ class GraphMgr {
                                       const StatusCallback& done);
 
   Status InitItem(const string& session, const GraphDef& gdef,
-                  const GraphOptions& graph_options, Item* item);
+                  const GraphOptions& graph_options,
+                  const DebugOptions& debug_options, Item* item);
+
+  Status DecorateAndPublishGraphForDebug(const DebugOptions& debug_options,
+                                         Graph* graph, Device* device);
 
   TF_DISALLOW_COPY_AND_ASSIGN(GraphMgr);
 };
diff --git a/tensorflow/core/distributed_runtime/master.cc b/tensorflow/core/distributed_runtime/master.cc
index b4adee3bf6cc45e7a6b1060e3a013004bd22faae..e860c99d95326a320224a924e4cc571a10475202 100644
--- a/tensorflow/core/distributed_runtime/master.cc
+++ b/tensorflow/core/distributed_runtime/master.cc
@@ -34,6 +34,7 @@ limitations under the License.
 #include <unordered_set>
 #include <vector>
 
+#include "tensorflow/core/common_runtime/device_set.h"
 #include "tensorflow/core/common_runtime/process_util.h"
 #include "tensorflow/core/distributed_runtime/remote_device.h"
 #include "tensorflow/core/distributed_runtime/worker_cache.h"
@@ -48,12 +49,17 @@ limitations under the License.
 #include "tensorflow/core/platform/macros.h"
 #include "tensorflow/core/platform/mutex.h"
 #include "tensorflow/core/platform/types.h"
+#include "tensorflow/core/protobuf/cluster.pb.h"
 #include "tensorflow/core/protobuf/master.pb.h"
 #include "tensorflow/core/protobuf/worker.pb.h"
 #include "tensorflow/core/public/session_options.h"
 
 namespace tensorflow {
 
+namespace {
+const char* const kGrpcProtocol = "grpc://";
+}  // namespace
+
 Master::Master(MasterEnv* env, double session_gc_seconds)
     : env_(env),
       last_1000_steps_(1000),
@@ -290,25 +296,122 @@ void Master::CreateSession(const CreateSessionRequest* req,
                            CreateSessionResponse* resp, MyClosure done) {
   SchedClosure([this, req, resp, done]() {
     Status status;
+    WorkerCacheFactoryOptions worker_cache_factory_options;
+    string grpc_protocol("grpc");
+    worker_cache_factory_options.protocol = &grpc_protocol;
     auto call_done = gtl::MakeCleanup([&status, &done] { done(status); });
     status = ValidateExternalGraphDefSyntax(req->graph_def());
     if (!status.ok()) return;
-    // Ping all the workers and build the list of devices that the
-    // session will use.
+
+    // The following 4 variables are set differently, depending on whether this
+    // session uses a client-provided clusterspec or not.
+    WorkerCacheInterface* worker_cache = nullptr;
+    // Note: worker_cache_ptr will be null except if this session is using a
+    // client-supplied ClusterDef (ClusterSpec propagation).
+    std::unique_ptr<WorkerCacheInterface> worker_cache_ptr;
+    std::unique_ptr<DeviceSet> device_set;
     // TODO(saeta): Convert to std::make_unique when available.
     std::unique_ptr<std::vector<std::unique_ptr<Device>>> remote_devices(
         new std::vector<std::unique_ptr<Device>>());
-    status = DeviceFinder::GetRemoteDevices(req->config().device_filters(),
-                                            env_, env_->worker_cache,
-                                            remote_devices.get());
-    if (!status.ok()) return;
+
+    if (req->config().has_cluster_def()) {
+      worker_cache_factory_options.cluster_def = &req->config().cluster_def();
+
+      // Set the server_def's job_name and task_index fields.
+      string normalized_string;
+      string grpc_protocol(kGrpcProtocol);
+      if (req->target().compare(0, grpc_protocol.length(), grpc_protocol) ==
+          0) {
+        normalized_string =
+            req->target().substr(grpc_protocol.length(), string::npos);
+      } else {
+        normalized_string = req->target();
+      }
+      for (auto&& job : req->config().cluster_def().job()) {
+        for (auto&& task : job.tasks()) {
+          if (task.second == normalized_string) {
+            if (worker_cache_factory_options.job_name != nullptr) {
+              status = errors::InvalidArgument(
+                  "Found multiple matching tasks that correspond to "
+                  "to the master. Master target: '",
+                  req->target(), "'. ClusterDef: ",
+                  req->config().cluster_def().ShortDebugString());
+              LOG(ERROR) << status;
+              return;
+            }
+            if (env_->local_devices[0]->parsed_name().job == job.name() &&
+                env_->local_devices[0]->parsed_name().task == task.first) {
+              // TODO(b/37868888): Remove this limitation when resolved
+              status = errors::InvalidArgument(
+                  "The ClusterSpec names the job and task index to be the same "
+                  "names that were provided when the server booted. This is "
+                  "currently not allowed. Job: ",
+                  job.name(), ", task index: ", task.first);
+              return;
+            }
+            worker_cache_factory_options.job_name = &job.name();
+            worker_cache_factory_options.task_index = task.first;
+          }
+        }
+      }
+
+      // Create the worker cache from the computed server_def.
+      status = env_->worker_cache_factory(worker_cache_factory_options,
+                                          &worker_cache);
+      if (!status.ok()) return;
+      worker_cache_ptr = std::unique_ptr<WorkerCacheInterface>(worker_cache);
+      // Ping all the workers and build the list of devices that the
+      // session will use.
+      status =
+          DeviceFinder::GetRemoteDevices(req->config().device_filters(), env_,
+                                         worker_cache, remote_devices.get());
+      if (!status.ok()) return;
+      device_set.reset(new DeviceSet);
+      for (auto&& d : *remote_devices) {
+        device_set->AddDevice(d.get());
+        DeviceNameUtils::ParsedName name = d->parsed_name();
+        if (name.job == *worker_cache_factory_options.job_name &&
+            name.task == worker_cache_factory_options.task_index &&
+            name.type == "CPU") {
+          device_set->set_client_device(d.get());
+        }
+      }
+    } else {
+      worker_cache = env_->worker_cache;
+      // Ping all the workers and build the list of devices that the
+      // session will use.
+      status =
+          DeviceFinder::GetRemoteDevices(req->config().device_filters(), env_,
+                                         worker_cache, remote_devices.get());
+      if (!status.ok()) return;
+      device_set.reset(new DeviceSet);
+      for (auto&& d : *remote_devices) {
+        device_set->AddDevice(d.get());
+      }
+      int num_local_devices = 0;
+      for (Device* d : env_->local_devices) {
+        device_set->AddDevice(d);
+        if (num_local_devices == 0) {
+          // Uses the first local device as the client device.
+          device_set->set_client_device(d);
+        }
+        num_local_devices++;
+      }
+    }
+
+    CHECK(device_set->client_device());
+
     SessionOptions options;
     options.config = req->config();
-    MasterSession* session =
-        env_->master_session_factory(options, env_, std::move(remote_devices));
+
+    MasterSession* session = env_->master_session_factory(
+        options, env_, std::move(remote_devices), std::move(worker_cache_ptr),
+        std::move(device_set));
+
     GraphDef* gdef =
         const_cast<CreateSessionRequest*>(req)->mutable_graph_def();
-    status = session->Create(gdef);
+
+    status = session->Create(gdef, worker_cache_factory_options);
     if (!status.ok()) {
       session->Close().IgnoreError();
       session->Unref();
diff --git a/tensorflow/core/distributed_runtime/master_env.h b/tensorflow/core/distributed_runtime/master_env.h
index a155bd384d84d10c79fab70b8c8e5225f89eb962..bb548adda1586a65f1914f322ce800ebb84a474f 100644
--- a/tensorflow/core/distributed_runtime/master_env.h
+++ b/tensorflow/core/distributed_runtime/master_env.h
@@ -19,17 +19,41 @@ limitations under the License.
 #include <functional>
 #include <vector>
 
-#include "tensorflow/core/distributed_runtime/master_session.h"
+#include "tensorflow/core/protobuf/cluster.pb.h"
+#include "tensorflow/core/protobuf/tensorflow_server.pb.h"
 #include "tensorflow/core/public/session_options.h"
 
 namespace tensorflow {
 
 class Device;
+class DeviceSet;
 class Env;
 class MasterSession;
 class OpRegistryInterface;
 class WorkerCacheInterface;
 
+// Options passed to the worker_cache_factory function.
+struct WorkerCacheFactoryOptions {
+  const ClusterDef* cluster_def = nullptr;
+  const string* job_name = nullptr;
+  int task_index;
+  const string* protocol = nullptr;
+
+  WorkerCacheFactoryOptions() {}
+
+  // Construct from a ServerDef proto.
+  //
+  // Note: server_def must outlive WorkerCacheFactoryOptions!
+  WorkerCacheFactoryOptions(const ServerDef& server_def) {
+    if (server_def.has_cluster() && !server_def.job_name().empty()) {
+      cluster_def = &server_def.cluster();
+      job_name = &server_def.job_name();
+      task_index = server_def.task_index();
+      protocol = &server_def.protocol();
+    }
+  }
+};
+
 // The master environment class, which holds a bag of pointers to
 // per-master state.
 //
@@ -57,8 +81,14 @@ struct MasterEnv {
   // `MasterEnv*` is retained by the caller.
   std::function<MasterSession*(
       SessionOptions, MasterEnv*,
-      std::unique_ptr<std::vector<std::unique_ptr<Device>>>)>
+      std::unique_ptr<std::vector<std::unique_ptr<Device>>>,
+      std::unique_ptr<WorkerCacheInterface>,
+      std::unique_ptr<DeviceSet> device_set)>
       master_session_factory;
+
+  std::function<Status(const WorkerCacheFactoryOptions&,
+                       WorkerCacheInterface**)>
+      worker_cache_factory;
 };
 
 }  // end namespace tensorflow
diff --git a/tensorflow/core/distributed_runtime/master_session.cc b/tensorflow/core/distributed_runtime/master_session.cc
index cec956ba4947fec091bd3c7bc99220461bbf7201..dddff4dce486b6fa6d6dea29620d7802badcdf3a 100644
--- a/tensorflow/core/distributed_runtime/master_session.cc
+++ b/tensorflow/core/distributed_runtime/master_session.cc
@@ -22,6 +22,7 @@ limitations under the License.
 #include "tensorflow/core/common_runtime/process_util.h"
 #include "tensorflow/core/common_runtime/profile_handler.h"
 #include "tensorflow/core/common_runtime/stats_publisher_interface.h"
+#include "tensorflow/core/debug/debug_graph_utils.h"
 #include "tensorflow/core/distributed_runtime/scheduler.h"
 #include "tensorflow/core/distributed_runtime/worker_cache.h"
 #include "tensorflow/core/distributed_runtime/worker_interface.h"
@@ -35,11 +36,13 @@ limitations under the License.
 #include "tensorflow/core/lib/core/notification.h"
 #include "tensorflow/core/lib/core/refcount.h"
 #include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/lib/gtl/cleanup.h"
 #include "tensorflow/core/lib/gtl/inlined_vector.h"
 #include "tensorflow/core/lib/gtl/map_util.h"
 #include "tensorflow/core/lib/random/random.h"
 #include "tensorflow/core/lib/strings/numbers.h"
 #include "tensorflow/core/lib/strings/str_util.h"
+#include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/lib/strings/stringprintf.h"
 #include "tensorflow/core/platform/env.h"
 #include "tensorflow/core/platform/logging.h"
@@ -60,13 +63,14 @@ class MasterSession::ReffedClientGraph : public core::RefCounted {
   ReffedClientGraph(const string& handle, const BuildGraphOptions& bopts,
                     std::unique_ptr<SimpleClientGraph> cg,
                     const SessionOptions& session_opts,
-                    StatsPublisherFactory stats_publisher_factory,
+                    const StatsPublisherFactory& stats_publisher_factory,
                     SimpleGraphExecutionState* execution_state, bool is_partial,
                     WorkerCacheInterface* worker_cache)
       : session_handle_(handle),
         client_graph_(std::move(cg)),
         session_opts_(session_opts),
         is_partial_(is_partial),
+        debug_opts_(bopts.debug_options),
         worker_cache_(worker_cache) {
     VLOG(1) << "Created ReffedClientGraph for node with "
             << client_graph_->graph.num_node_ids();
@@ -160,7 +164,7 @@ class MasterSession::ReffedClientGraph : public core::RefCounted {
   // Partitions the graph into subgraphs and registers them on
   // workers.
   Status RegisterPartitions(const PartitionOptions& popts,
-                            const FunctionDefLibrary& func_def_lib);
+                            const FunctionLibraryDefinition& flib_def);
 
   // Runs one step of all partitions.
   Status RunPartitions(const MasterEnv* env, int64 step_id,
@@ -184,7 +188,7 @@ class MasterSession::ReffedClientGraph : public core::RefCounted {
                       const RunState* run_state,
                       SimpleGraphExecutionState* execution_state);
 
-  string DetailText(const NodeDef& def, const NodeExecStats& ns) {
+  string DetailText(const Node& node, const NodeExecStats& ns) {
     int64 tot = 0;
     for (auto& no : ns.output()) {
       tot += no.tensor_description().allocation_description().requested_bytes();
@@ -193,12 +197,8 @@ class MasterSession::ReffedClientGraph : public core::RefCounted {
     if (tot >= 0.1 * 1048576.0) {
       bytes = strings::Printf("[%.1fMB] ", tot / 1048576.0);
     }
-    return strings::StrCat(
-        bytes, def.name(), " = ", def.op(), "(",
-        str_util::Join(
-            std::vector<StringPiece>(def.input().begin(), def.input().end()),
-            ", "),
-        ")");
+    return strings::StrCat(bytes, node.name(), " = ", node.type_string(), "(",
+                           str_util::Join(node.requested_inputs(), ", "), ")");
   }
 
  private:
@@ -206,6 +206,7 @@ class MasterSession::ReffedClientGraph : public core::RefCounted {
   const std::unique_ptr<SimpleClientGraph> client_graph_;
   const SessionOptions session_opts_;
   const bool is_partial_;
+  const DebugOptions& debug_opts_;
   WorkerCacheInterface* const worker_cache_;  // Not owned.
   std::unordered_map<StringPiece, Node*, StringPiece::Hasher> name_to_node_;
 
@@ -270,7 +271,7 @@ class MasterSession::ReffedClientGraph : public core::RefCounted {
 };
 
 Status MasterSession::ReffedClientGraph::RegisterPartitions(
-    const PartitionOptions& popts, const FunctionDefLibrary& func_def_lib) {
+    const PartitionOptions& popts, const FunctionLibraryDefinition& flib_def) {
   {  // Ensure register once.
     mu_.lock();
     if (!init_started_) {
@@ -289,7 +290,8 @@ Status MasterSession::ReffedClientGraph::RegisterPartitions(
           graph_defs_for_publishing.push_back(&name_def.second);
         }
         stats_publisher_->PublishGraphProto(graph_defs_for_publishing);
-        s = DoRegisterPartitions(popts, func_def_lib, std::move(graph_defs));
+        s = DoRegisterPartitions(popts, flib_def.ToProto(),
+                                 std::move(graph_defs));
       }
       mu_.lock();
       init_result_ = s;
@@ -406,6 +408,7 @@ Status MasterSession::ReffedClientGraph::DoRegisterPartitions(
     // For simplicity, we ship the library completely to every worker.
     *c->req.mutable_graph_def()->mutable_library() = func_def_lib;
     *c->req.mutable_graph_options() = session_opts_.config.graph_options();
+    *c->req.mutable_debug_options() = debug_opts_;
     VLOG(2) << "Register " << c->req.graph_def().DebugString();
     auto cb = [c, &done](const Status& s) {
       c->status = s;
@@ -523,6 +526,7 @@ Status MasterSession::ReffedClientGraph::RunPartitions(
       c->req->set_is_partial(is_partial_);
       c->req->set_is_last_partial_run(is_last_partial_run);
     }
+    c->req->set_session_handle(session_handle_);
     c->req->set_graph_handle(part.graph_handle);
     c->req->set_step_id(step_id);
     *c->req->mutable_exec_opts() = exec_opts;
@@ -782,7 +786,7 @@ void MasterSession::ReffedClientGraph::ProcessDeviceStats(
       if (!ns.timeline_label().empty()) {
         details = ns.timeline_label();
       } else if (found_node_in_graph) {
-        details = DetailText(node->def(), ns);
+        details = DetailText(*node, ns);
       } else {
         // Leave details string empty
       }
@@ -803,11 +807,13 @@ Status MasterSession::ReffedClientGraph::CheckFetches(
     SimpleGraphExecutionState* execution_state) {
   // Build the set of pending feeds that we haven't seen.
   std::unordered_set<TensorId, TensorId::Hasher> pending_feeds;
-  for (const string& feed : run_state->pending_inputs) {
-    TensorId id(ParseTensorName(feed));
+  for (const auto& input : run_state->pending_inputs) {
+    // Skip if already fed.
+    if (input.second) continue;
+    TensorId id(ParseTensorName(input.first));
     auto it = name_to_node_.find(id.first);
     if (it == name_to_node_.end()) {
-      return errors::NotFound("Feed ", feed, ": not found");
+      return errors::NotFound("Feed ", input.first, ": not found");
     }
     pending_feeds.insert(id);
   }
@@ -864,6 +870,7 @@ void MasterSession::ReffedClientGraph::DeregisterPartitions() {
     // The graph handle may be empty if we failed during partition registration.
     if (!part.graph_handle.empty()) {
       Call* c = new Call;
+      c->req.set_session_handle(session_handle_);
       c->req.set_graph_handle(part.graph_handle);
       // NOTE(mrry): We must capture `worker_cache_` since `this`
       // could be deleted before the callback is called.
@@ -897,6 +904,10 @@ void BuildBuildGraphOptions(const RunStepRequestWrapper& req,
     opts->target_nodes.push_back(req.target_name(i));
   }
 
+  if (!req.options().debug_options().debug_tensor_watch_opts().empty()) {
+    opts->debug_options = req.options().debug_options();
+  }
+
   std::sort(opts->feed_endpoints.begin(), opts->feed_endpoints.end());
   std::sort(opts->target_nodes.begin(), opts->target_nodes.end());
   std::sort(opts->fetch_endpoints.begin(), opts->fetch_endpoints.end());
@@ -914,6 +925,8 @@ void BuildBuildGraphOptions(const PartialRunSetupRequest& req,
     opts->target_nodes.push_back(target);
   }
 
+  // TODO(cais): Add TFDBG support to partial runs.
+
   std::sort(opts->feed_endpoints.begin(), opts->feed_endpoints.end());
   std::sort(opts->target_nodes.begin(), opts->target_nodes.end());
   std::sort(opts->fetch_endpoints.begin(), opts->fetch_endpoints.end());
@@ -930,6 +943,13 @@ uint64 HashBuildGraphOptions(const BuildGraphOptions& opts) {
   for (const string& name : opts.fetch_endpoints) {
     h = Hash64(name.c_str(), name.size(), h);
   }
+
+  if (!opts.debug_options.debug_tensor_watch_opts().empty()) {
+    const string watch_summary = SummarizeDebugTensorWatches(
+        opts.debug_options.debug_tensor_watch_opts());
+    h = Hash64(watch_summary.c_str(), watch_summary.size(), h);
+  }
+
   return h;
 }
 
@@ -953,31 +973,25 @@ string BuildGraphOptionsString(const BuildGraphOptions& opts) {
 MasterSession::MasterSession(
     const SessionOptions& opt, const MasterEnv* env,
     std::unique_ptr<std::vector<std::unique_ptr<Device>>> remote_devs,
+    std::unique_ptr<WorkerCacheInterface> worker_cache,
+    std::unique_ptr<DeviceSet> device_set,
     StatsPublisherFactory stats_publisher_factory)
     : session_opts_(opt),
       env_(env),
       handle_(strings::FpToString(random::New64())),
       remote_devs_(std::move(remote_devs)),
+      worker_cache_(std::move(worker_cache)),
+      devices_(std::move(device_set)),
       stats_publisher_factory_(std::move(stats_publisher_factory)),
       graph_version_(0),
       run_graphs_(5),
       partial_run_graphs_(5) {
   UpdateLastAccessTime();
+  CHECK(devices_) << "device_set was null!";
 
   VLOG(1) << "Session " << handle_ << " #local " << env->local_devices.size()
           << " #remote " << remote_devs_->size();
-  for (auto&& d : *remote_devs_) {
-    devices_.AddDevice(d.get());
-  }
-  int num_local_devices = 0;
-  for (Device* d : env->local_devices) {
-    devices_.AddDevice(d);
-    if (num_local_devices == 0) {
-      // Uses the first local device as the client device.
-      devices_.set_client_device(d);
-    }
-    num_local_devices++;
-  }
+
   LOG(INFO) << "Start master session " << handle_
             << " with config: " << std::endl
             << session_opts_.config.DebugString();
@@ -992,7 +1006,8 @@ void MasterSession::UpdateLastAccessTime() {
   last_access_time_usec_.store(Env::Default()->NowMicros());
 }
 
-Status MasterSession::Create(GraphDef* graph_def) {
+Status MasterSession::Create(GraphDef* graph_def,
+                             const WorkerCacheFactoryOptions& options) {
   if (session_opts_.config.graph_options().place_pruned_graph()) {
     // TODO(b/29900832): Fix this or remove the option.
     LOG(WARNING) << "Distributed session does not support the "
@@ -1000,17 +1015,93 @@ Status MasterSession::Create(GraphDef* graph_def) {
     session_opts_.config.mutable_graph_options()->set_place_pruned_graph(false);
   }
 
-  SimpleGraphExecutionStateOptions options;
-  options.device_set = &devices_;
-  options.session_options = &session_opts_;
+  SimpleGraphExecutionStateOptions execution_options;
+  execution_options.device_set = devices_.get();
+  execution_options.session_options = &session_opts_;
   {
     mutex_lock l(mu_);
     TF_RETURN_IF_ERROR(SimpleGraphExecutionState::MakeForBaseGraph(
-        graph_def, options, &execution_state_));
+        graph_def, execution_options, &execution_state_));
+  }
+  if (options.cluster_def != nullptr) {
+    return CreateWorkerSessions(options);
   }
   return Status::OK();
 }
 
+Status MasterSession::CreateWorkerSessions(
+    const WorkerCacheFactoryOptions& options) {
+  CHECK(worker_cache_) << "CreateWorkerSessions should be called only with "
+                       << "dynamic cluster membership.";
+  std::vector<string> worker_names;
+  worker_cache_->ListWorkers(&worker_names);
+
+  struct WorkerGroup {
+    // The worker name. (Not owned.)
+    const string* name;
+
+    // The worker referenced by name. (Not owned.)
+    WorkerInterface* worker = nullptr;
+
+    // Request and responses used for a given worker.
+    CreateWorkerSessionRequest request;
+    CreateWorkerSessionResponse response;
+    Status status = Status::OK();
+  };
+  BlockingCounter done(worker_names.size());
+  std::vector<WorkerGroup> workers(worker_names.size());
+
+  // Release the workers.
+  auto cleanup = gtl::MakeCleanup([this, &workers] {
+    for (auto&& worker_group : workers) {
+      if (worker_group.worker != nullptr) {
+        worker_cache_->ReleaseWorker(*worker_group.name, worker_group.worker);
+      }
+    }
+  });
+
+  Status status = Status::OK();
+  // Create all the workers & kick off the computations.
+  for (size_t i = 0; i < worker_names.size(); ++i) {
+    workers[i].name = &worker_names[i];
+    workers[i].worker = worker_cache_->CreateWorker(worker_names[i]);
+    workers[i].request.set_session_handle(handle_);
+    *workers[i].request.mutable_server_def()->mutable_cluster() =
+        *options.cluster_def;
+    workers[i].request.mutable_server_def()->set_protocol(*options.protocol);
+
+    DeviceNameUtils::ParsedName name;
+    if (!DeviceNameUtils::ParseFullName(worker_names[i], &name)) {
+      status = errors::Internal("Could not parse name ", worker_names[i]);
+      LOG(WARNING) << status;
+      return status;
+    }
+    if (!name.has_job || !name.has_task) {
+      status = errors::Internal("Incomplete worker name ", worker_names[i]);
+      LOG(WARNING) << status;
+      return status;
+    }
+
+    workers[i].request.mutable_server_def()->set_job_name(name.job);
+    workers[i].request.mutable_server_def()->set_task_index(name.task);
+  }
+
+  for (size_t i = 0; i < worker_names.size(); ++i) {
+    auto cb = [i, &workers, &done](const Status& s) {
+      workers[i].status = s;
+      done.DecrementCount();
+    };
+    workers[i].worker->CreateWorkerSessionAsync(&workers[i].request,
+                                                &workers[i].response, cb);
+  }
+
+  done.Wait();
+  for (size_t i = 0; i < workers.size(); ++i) {
+    status.Update(workers[i].status);
+  }
+  return status;
+}
+
 Status MasterSession::Extend(const ExtendSessionRequest* req,
                              ExtendSessionResponse* resp) {
   UpdateLastAccessTime();
@@ -1040,6 +1131,13 @@ Status MasterSession::Extend(const ExtendSessionRequest* req,
   return Status::OK();
 }
 
+WorkerCacheInterface* MasterSession::get_worker_cache() const {
+  if (worker_cache_) {
+    return worker_cache_.get();
+  }
+  return env_->worker_cache;
+}
+
 Status MasterSession::StartStep(const BuildGraphOptions& opts, int64* count,
                                 ReffedClientGraph** rcg, bool is_partial) {
   const uint64 hash = HashBuildGraphOptions(opts);
@@ -1063,10 +1161,11 @@ Status MasterSession::StartStep(const BuildGraphOptions& opts, int64* count,
               << "\n";
       std::unique_ptr<SimpleClientGraph> client_graph;
       TF_RETURN_IF_ERROR(execution_state_->BuildGraph(opts, &client_graph));
+      WorkerCacheInterface* worker_cache = get_worker_cache();
       auto entry = new ReffedClientGraph(
           handle_, opts, std::move(client_graph), session_opts_,
           stats_publisher_factory_, execution_state_.get(), is_partial,
-          env_->worker_cache);
+          worker_cache);
       iter = m->insert({hash, entry}).first;
       VLOG(1) << "Preparing to execute new graph";
     }
@@ -1141,6 +1240,8 @@ Status MasterSession::Run(CallOptions* opts, const RunStepRequestWrapper& req,
       return errors::FailedPrecondition("Session is closed.");
     }
     ++num_running_;
+    // Note: all code paths must eventually call MarkRunCompletion()
+    // in order to appropriate decrement the num_running_ counter.
   }
   Status status;
   if (!req.partial_run_handle().empty()) {
@@ -1148,16 +1249,18 @@ Status MasterSession::Run(CallOptions* opts, const RunStepRequestWrapper& req,
   } else {
     status = DoRunWithLocalExecution(opts, req, resp);
   }
-  {
-    mutex_lock l(mu_);
-    --num_running_;
-    if (num_running_ == 0) {
-      num_running_is_zero_.notify_all();
-    }
-  }
   return status;
 }
 
+// Decrements num_running_ and broadcasts if num_running_ is zero.
+void MasterSession::MarkRunCompletion() {
+  mutex_lock l(mu_);
+  --num_running_;
+  if (num_running_ == 0) {
+    num_running_is_zero_.notify_all();
+  }
+}
+
 Status MasterSession::BuildAndRegisterPartitions(ReffedClientGraph* rcg) {
   // Registers subgraphs if haven't done so.
   PartitionOptions popts;
@@ -1167,7 +1270,7 @@ Status MasterSession::BuildAndRegisterPartitions(ReffedClientGraph* rcg) {
     return strings::StrCat(prefix, "_S", next_node_id_++);
   };
   popts.get_incarnation = [this](const string& name) -> int64 {
-    Device* d = devices_.FindDeviceByName(name);
+    Device* d = devices_->FindDeviceByName(name);
     if (d == nullptr) {
       return PartitionOptions::kIllegalIncarnation;
     } else {
@@ -1194,7 +1297,7 @@ Status MasterSession::BuildAndRegisterPartitions(ReffedClientGraph* rcg) {
   }
 
   TF_RETURN_IF_ERROR(
-      rcg->RegisterPartitions(popts, rcg->client_graph()->flib_def->ToProto()));
+      rcg->RegisterPartitions(popts, *rcg->client_graph()->flib_def));
 
   return Status::OK();
 }
@@ -1202,6 +1305,7 @@ Status MasterSession::BuildAndRegisterPartitions(ReffedClientGraph* rcg) {
 Status MasterSession::DoPartialRun(CallOptions* opts,
                                    const RunStepRequestWrapper& req,
                                    MutableRunStepResponseWrapper* resp) {
+  auto cleanup = gtl::MakeCleanup([this] { MarkRunCompletion(); });
   const string& prun_handle = req.partial_run_handle();
   RunState* run_state = nullptr;
   {
@@ -1247,11 +1351,14 @@ Status MasterSession::DoPartialRun(CallOptions* opts,
 
   // Make sure that this is a new set of feeds that are still pending.
   for (size_t i = 0; i < req.num_feeds(); ++i) {
-    auto it = run_state->pending_inputs.find(req.feed_name(i));
+    const string& feed = req.feed_name(i);
+    auto it = run_state->pending_inputs.find(feed);
     if (it == run_state->pending_inputs.end()) {
       return errors::InvalidArgument(
-          "The feed ", req.feed_name(i),
-          " has already been fed or was not specified in partial_run_setup.");
+          "The feed ", feed, " was not specified in partial_run_setup.");
+    } else if (it->second) {
+      return errors::InvalidArgument("The feed ", feed,
+                                     " has already been fed.");
     }
   }
   // Check that this is a new set of fetches that are still pending.
@@ -1259,9 +1366,11 @@ Status MasterSession::DoPartialRun(CallOptions* opts,
     const string& fetch = req.fetch_name(i);
     auto it = run_state->pending_outputs.find(fetch);
     if (it == run_state->pending_outputs.end()) {
+      return errors::InvalidArgument(
+          "The fetch ", fetch, " was not specified in partial_run_setup.");
+    } else if (it->second) {
       return errors::InvalidArgument("The fetch ", fetch,
-                                     " had already been fetched or was not "
-                                     "specified in partial_run_setup.");
+                                     " has already been fetched.");
     }
   }
 
@@ -1274,13 +1383,14 @@ Status MasterSession::DoPartialRun(CallOptions* opts,
 
   // Determine if this partial run satisfies all the pending inputs and ouputs.
   for (size_t i = 0; i < req.num_feeds(); ++i) {
-    run_state->pending_inputs.erase(req.feed_name(i));
+    auto it = run_state->pending_inputs.find(req.feed_name(i));
+    it->second = true;
   }
   for (size_t i = 0; i < req.num_fetches(); ++i) {
-    run_state->pending_outputs.erase(req.fetch_name(i));
+    auto it = run_state->pending_outputs.find(req.fetch_name(i));
+    it->second = true;
   }
-  bool is_last_partial_run =
-      (run_state->pending_inputs.empty() && run_state->pending_outputs.empty());
+  bool is_last_partial_run = run_state->PendingDone();
 
   Status s = run_state->rcg->RunPartitions(
       env_, run_state->step_id, run_state->count, &run_state->pss, opts, req,
@@ -1294,12 +1404,14 @@ Status MasterSession::DoPartialRun(CallOptions* opts,
     rcg->Ref();
     rcg->ProcessStats(run_state->step_id, &run_state->pss, run_state->ph.get(),
                       req.options(), resp->mutable_metadata());
+    cleanup.release();  // MarkRunCompletion called in done closure.
     rcg->CleanupPartitionsAsync(
         run_state->step_id, [this, rcg, prun_handle](const Status& s) {
           if (!s.ok()) {
             LOG(ERROR) << "Cleanup partition error: " << s;
           }
           rcg->Unref();
+          MarkRunCompletion();
         });
     mutex_lock l(mu_);
     partial_runs_.erase(prun_handle);
@@ -1307,13 +1419,44 @@ Status MasterSession::DoPartialRun(CallOptions* opts,
   return s;
 }
 
+Status MasterSession::CreateDebuggerState(
+    const DebugOptions& debug_options, const RunStepRequestWrapper& req,
+    int64 rcg_execution_count,
+    std::unique_ptr<DebuggerStateInterface>* debugger_state) {
+  TF_RETURN_IF_ERROR(
+      DebuggerStateRegistry::CreateState(debug_options, debugger_state));
+
+  std::vector<string> input_names;
+  for (size_t i = 0; i < req.num_feeds(); ++i) {
+    input_names.push_back(req.feed_name(i));
+  }
+  std::vector<string> output_names;
+  for (size_t i = 0; i < req.num_fetches(); ++i) {
+    output_names.push_back(req.fetch_name(i));
+  }
+  std::vector<string> target_names;
+  for (size_t i = 0; i < req.num_targets(); ++i) {
+    target_names.push_back(req.target_name(i));
+  }
+
+  // TODO(cais): We currently use -1 as a dummy value for session run count.
+  // While this counter value is straightforward to define and obtain for
+  // DirectSessions, it is less so for non-direct Sessions. Devise a better
+  // way to get its value when the need arises.
+  TF_RETURN_IF_ERROR(debugger_state->get()->PublishDebugMetadata(
+      debug_options.global_step(), -1, rcg_execution_count, input_names,
+      output_names, target_names));
+
+  return Status::OK();
+}
+
 Status MasterSession::DoRunWithLocalExecution(
     CallOptions* opts, const RunStepRequestWrapper& req,
     MutableRunStepResponseWrapper* resp) {
-  VLOG(2) << "DoRunWithLocalExecution "
-          << "req: " << req.DebugString();
+  VLOG(2) << "DoRunWithLocalExecution req: " << req.DebugString();
   PerStepState pss;
   pss.start_micros = Env::Default()->NowMicros();
+  auto cleanup = gtl::MakeCleanup([this] { MarkRunCompletion(); });
 
   // Prepare.
   BuildGraphOptions bgopts;
@@ -1325,6 +1468,13 @@ Status MasterSession::DoRunWithLocalExecution(
   // Unref "rcg" when out of scope.
   core::ScopedUnref unref(rcg);
 
+  std::unique_ptr<DebuggerStateInterface> debugger_state;
+  const DebugOptions& debug_options = req.options().debug_options();
+
+  if (!debug_options.debug_tensor_watch_opts().empty()) {
+    TF_RETURN_IF_ERROR(
+        CreateDebuggerState(debug_options, req, count, &debugger_state));
+  }
   TF_RETURN_IF_ERROR(BuildAndRegisterPartitions(rcg));
 
   // Keeps the highest 8 bits 0x01: we reserve some bits of the
@@ -1373,11 +1523,13 @@ Status MasterSession::DoRunWithLocalExecution(
     }
   }
   rcg->Ref();
-  rcg->CleanupPartitionsAsync(step_id, [rcg](const Status& s) {
+  cleanup.release();  // MarkRunCompletion called in done closure.
+  rcg->CleanupPartitionsAsync(step_id, [this, rcg](const Status& s) {
     if (!s.ok()) {
       LOG(ERROR) << "Cleanup partition error: " << s;
     }
     rcg->Unref();
+    MarkRunCompletion();
   });
   return s;
 }
@@ -1418,10 +1570,10 @@ MasterSession::RunState::RunState(const std::vector<string>& input_names,
     : rcg(rcg), step_id(step_id), count(count) {
   // Initially all the feeds and fetches are pending.
   for (auto& name : input_names) {
-    pending_inputs.emplace(name);
+    pending_inputs[name] = false;
   }
   for (auto& name : output_names) {
-    pending_outputs.emplace(name);
+    pending_outputs[name] = false;
   }
 }
 
@@ -1429,4 +1581,14 @@ MasterSession::RunState::~RunState() {
   if (rcg) rcg->Unref();
 }
 
+bool MasterSession::RunState::PendingDone() const {
+  for (const auto& it : pending_inputs) {
+    if (!it.second) return false;
+  }
+  for (const auto& it : pending_outputs) {
+    if (!it.second) return false;
+  }
+  return true;
+}
+
 }  // end namespace tensorflow
diff --git a/tensorflow/core/distributed_runtime/master_session.h b/tensorflow/core/distributed_runtime/master_session.h
index c593f84f0327b0cdb7b7bd51b102dfeacf781c88..3acc5bc5f0ae79a6bcdccd45bb20d14d5f49451f 100644
--- a/tensorflow/core/distributed_runtime/master_session.h
+++ b/tensorflow/core/distributed_runtime/master_session.h
@@ -19,12 +19,14 @@ limitations under the License.
 #include <atomic>
 #include <vector>
 
+#include "tensorflow/core/common_runtime/debugger_state_interface.h"
 #include "tensorflow/core/common_runtime/device_set.h"
 #include "tensorflow/core/common_runtime/simple_graph_execution_state.h"
 #include "tensorflow/core/common_runtime/stats_publisher_interface.h"
 #include "tensorflow/core/distributed_runtime/call_options.h"
 #include "tensorflow/core/distributed_runtime/master_env.h"
 #include "tensorflow/core/distributed_runtime/message_wrappers.h"
+#include "tensorflow/core/distributed_runtime/worker_cache.h"
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/platform/types.h"
 #include "tensorflow/core/protobuf/master.pb.h"
@@ -48,13 +50,15 @@ class MasterSession : public core::RefCounted {
   MasterSession(
       const SessionOptions& options, const MasterEnv* env,
       std::unique_ptr<std::vector<std::unique_ptr<Device>>> remote_devs,
+      std::unique_ptr<WorkerCacheInterface> worker_cache,
+      std::unique_ptr<DeviceSet> device_set,
       StatsPublisherFactory stats_publisher_factory);
 
   // Initialize the MasterSession for "def".  Must be called before Extend(),
   // Run(), or Close().
   //
   // After this method returns, `def` will no longer be valid.
-  Status Create(GraphDef* def);
+  Status Create(GraphDef* def, const WorkerCacheFactoryOptions& options);
 
   // Returns the session handle.
   const string& handle() const { return handle_; }
@@ -106,8 +110,14 @@ class MasterSession : public core::RefCounted {
 
   std::unique_ptr<std::vector<std::unique_ptr<Device>>> remote_devs_;
 
+  // The optional session-specific worker cluster.
+  // TODO(saeta): Convert to std::optional when available.
+  std::unique_ptr<WorkerCacheInterface> worker_cache_;
+  // Retrieves either worker_cache_ or the env_->worker_cache as appropriate.
+  WorkerCacheInterface* get_worker_cache() const;
+
   // The device set used by this session.
-  DeviceSet devices_;
+  std::unique_ptr<DeviceSet> devices_;
 
   StatsPublisherFactory stats_publisher_factory_;
 
@@ -141,8 +151,8 @@ class MasterSession : public core::RefCounted {
   };
 
   struct RunState {
-    std::unordered_set<string> pending_inputs;
-    std::unordered_set<string> pending_outputs;
+    std::unordered_map<string, bool> pending_inputs;   // true if fed
+    std::unordered_map<string, bool> pending_outputs;  // true if fetched
     ReffedClientGraph* rcg = nullptr;
     uint64 step_id;
     int64 count = 0;
@@ -154,6 +164,8 @@ class MasterSession : public core::RefCounted {
              const std::vector<string>& output_names, ReffedClientGraph* rcg,
              const uint64 step_id, const int64 count);
 
+    bool PendingDone() const;
+
     ~RunState();
   };
   std::unordered_map<string, std::unique_ptr<RunState>> partial_runs_
@@ -178,6 +190,13 @@ class MasterSession : public core::RefCounted {
   // Private dtor. The client must call Close().
   virtual ~MasterSession();
 
+  // Creates sessions on all workers.
+  //
+  // If this session is operating using the new ClusterSpec propagation behavior
+  // call this method in order to propagate the cluster membership to all
+  // workers.
+  Status CreateWorkerSessions(const WorkerCacheFactoryOptions& server_def);
+
   Status StartStep(const BuildGraphOptions& opts, int64* count,
                    ReffedClientGraph** graph, bool is_partial);
   void ClearRunsTable(std::vector<ReffedClientGraph*>* to_unref,
@@ -187,10 +206,16 @@ class MasterSession : public core::RefCounted {
                                  MutableRunStepResponseWrapper* resp);
   Status DoPartialRun(CallOptions* opts, const RunStepRequestWrapper& req,
                       MutableRunStepResponseWrapper* resp);
+  void MarkRunCompletion();
   void UpdateLastAccessTime();
 
   Status BuildAndRegisterPartitions(ReffedClientGraph* rcg);
 
+  Status CreateDebuggerState(
+      const DebugOptions& debug_options, const RunStepRequestWrapper& req,
+      int64 rcg_execution_count,
+      std::unique_ptr<DebuggerStateInterface>* debugger_state);
+
   TF_DISALLOW_COPY_AND_ASSIGN(MasterSession);
 };
 
diff --git a/tensorflow/core/distributed_runtime/message_wrappers.cc b/tensorflow/core/distributed_runtime/message_wrappers.cc
index 7b58feb93cc0c39badb03cd20bcdb0d7811ee0f0..f3bab589a19f44cd976b9acf2f1fa3eba8cae8ee 100644
--- a/tensorflow/core/distributed_runtime/message_wrappers.cc
+++ b/tensorflow/core/distributed_runtime/message_wrappers.cc
@@ -252,6 +252,14 @@ string ProtoRunStepRequest::DebugString() const {
 
 const RunStepRequest& ProtoRunStepRequest::ToProto() const { return *request_; }
 
+const string& InMemoryRunGraphRequest::session_handle() const {
+  return session_handle_;
+}
+
+void InMemoryRunGraphRequest::set_session_handle(const string& handle) {
+  session_handle_ = handle;
+}
+
 const string& InMemoryRunGraphRequest::graph_handle() const {
   return graph_handle_;
 }
@@ -320,6 +328,7 @@ void InMemoryRunGraphRequest::set_is_last_partial_run(
 const RunGraphRequest& InMemoryRunGraphRequest::ToProto() const {
   if (!proto_version_) {
     proto_version_.reset(new RunGraphRequest);
+    proto_version_->set_session_handle(session_handle());
     proto_version_->set_graph_handle(graph_handle());
     proto_version_->set_step_id(step_id());
     *proto_version_->mutable_exec_opts() = exec_opts();
@@ -337,6 +346,14 @@ const RunGraphRequest& InMemoryRunGraphRequest::ToProto() const {
   return *proto_version_;
 }
 
+const string& MutableProtoRunGraphRequest::session_handle() const {
+  return request_.session_handle();
+}
+
+void MutableProtoRunGraphRequest::set_session_handle(const string& handle) {
+  request_.set_session_handle(handle);
+}
+
 const string& MutableProtoRunGraphRequest::graph_handle() const {
   return request_.graph_handle();
 }
@@ -423,6 +440,10 @@ const RunGraphRequest& MutableProtoRunGraphRequest::ToProto() const {
 ProtoRunGraphRequest::ProtoRunGraphRequest(const RunGraphRequest* request)
     : request_(request) {}
 
+const string& ProtoRunGraphRequest::session_handle() const {
+  return request_->session_handle();
+}
+
 const string& ProtoRunGraphRequest::graph_handle() const {
   return request_->graph_handle();
 }
@@ -495,6 +516,7 @@ CostGraphDef* InMemoryRunGraphResponse::mutable_cost_graph() {
 
 RunGraphResponse* InMemoryRunGraphResponse::get_proto() {
   LOG(FATAL) << "Cannot get a mutable protobuf for an InMemoryRunGraphResponse";
+  return NULL;
 }
 
 size_t OwnedProtoRunGraphResponse::num_recvs() const {
@@ -613,6 +635,7 @@ RunMetadata* InMemoryRunStepResponse::mutable_metadata() { return &metadata_; }
 
 RunStepResponse* InMemoryRunStepResponse::get_proto() {
   LOG(FATAL) << "Cannot get a mutable protobuf for an InMemoryRunStepResponse";
+  return NULL;
 }
 
 size_t OwnedProtoRunStepResponse::num_tensors() const {
diff --git a/tensorflow/core/distributed_runtime/message_wrappers.h b/tensorflow/core/distributed_runtime/message_wrappers.h
index 02516eabb4a990a53563d63a4c297fb958b482e8..795a6add0e794ccaa902195828c75cf653565eb9 100644
--- a/tensorflow/core/distributed_runtime/message_wrappers.h
+++ b/tensorflow/core/distributed_runtime/message_wrappers.h
@@ -223,6 +223,10 @@ class RunGraphRequestWrapper {
  public:
   virtual ~RunGraphRequestWrapper() {}
 
+  // The session handle used to register the graph. If empty, a single global
+  // namespace is used.
+  virtual const string& session_handle() const = 0;
+
   // REQUIRED: graph_handle must be returned by a RegisterGraph call
   // to the same WorkerService.
   virtual const string& graph_handle() const = 0;
@@ -262,6 +266,7 @@ class RunGraphRequestWrapper {
 // See `RunGraphRequestWrapper` above for a description of the fields.
 class MutableRunGraphRequestWrapper : public RunGraphRequestWrapper {
  public:
+  virtual void set_session_handle(const string& handle) = 0;
   virtual void set_graph_handle(const string& handle) = 0;
   virtual void set_step_id(int64 step_id) = 0;
   virtual ExecutorOpts* mutable_exec_opts() = 0;
@@ -280,6 +285,7 @@ class MutableRunGraphRequestWrapper : public RunGraphRequestWrapper {
 class InMemoryRunGraphRequest : public MutableRunGraphRequestWrapper {
  public:
   // RunGraphRequestWrapper methods.
+  const string& session_handle() const override;
   const string& graph_handle() const override;
   int64 step_id() const override;
   const ExecutorOpts& exec_opts() const override;
@@ -293,6 +299,7 @@ class InMemoryRunGraphRequest : public MutableRunGraphRequestWrapper {
   const RunGraphRequest& ToProto() const override;
 
   // MutableRunGraphRequestWrapper methods.
+  void set_session_handle(const string& handle) override;
   void set_graph_handle(const string& handle) override;
   void set_step_id(int64 step_id) override;
   ExecutorOpts* mutable_exec_opts() override;
@@ -304,6 +311,7 @@ class InMemoryRunGraphRequest : public MutableRunGraphRequestWrapper {
   void set_is_last_partial_run(bool is_last_partial_run) override;
 
  private:
+  string session_handle_;
   string graph_handle_;
   int64 step_id_;
   ExecutorOpts exec_opts_;
@@ -325,6 +333,7 @@ class InMemoryRunGraphRequest : public MutableRunGraphRequestWrapper {
 class MutableProtoRunGraphRequest : public MutableRunGraphRequestWrapper {
  public:
   // RunGraphRequestWrapper methods.
+  const string& session_handle() const override;
   const string& graph_handle() const override;
   int64 step_id() const override;
   const ExecutorOpts& exec_opts() const override;
@@ -338,6 +347,7 @@ class MutableProtoRunGraphRequest : public MutableRunGraphRequestWrapper {
   const RunGraphRequest& ToProto() const override;
 
   // MutableRunGraphRequestWrapper methods.
+  void set_session_handle(const string& handle) override;
   void set_graph_handle(const string& handle) override;
   void set_step_id(int64 step_id) override;
   ExecutorOpts* mutable_exec_opts() override;
@@ -357,6 +367,7 @@ class ProtoRunGraphRequest : public RunGraphRequestWrapper {
   ProtoRunGraphRequest(const RunGraphRequest* request);
 
   // RunGraphRequestWrapper methods.
+  const string& session_handle() const override;
   const string& graph_handle() const override;
   int64 step_id() const override;
   const ExecutorOpts& exec_opts() const override;
diff --git a/tensorflow/core/distributed_runtime/remote_device.cc b/tensorflow/core/distributed_runtime/remote_device.cc
index 9632e9c439879b3258e3899141b3da5f5c83c07c..91c1fb99fef91c9fd484ddf5fa68476f5d54d523 100644
--- a/tensorflow/core/distributed_runtime/remote_device.cc
+++ b/tensorflow/core/distributed_runtime/remote_device.cc
@@ -16,11 +16,13 @@ limitations under the License.
 #include "tensorflow/core/distributed_runtime/remote_device.h"
 
 #include <vector>
+
 #include "tensorflow/core/common_runtime/device.h"
 #include "tensorflow/core/common_runtime/process_util.h"
 #include "tensorflow/core/distributed_runtime/worker_cache.h"
 #include "tensorflow/core/distributed_runtime/worker_interface.h"
 #include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/lib/gtl/cleanup.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/macros.h"
 #include "tensorflow/core/protobuf/worker.pb.h"
@@ -43,8 +45,7 @@ string GetLocalDeviceName(StringPiece fullname) {
 class RemoteDevice : public Device {
  public:
   RemoteDevice(Env* env, const DeviceAttributes& da)
-      : Device(env, da, nullptr),
-        local_dev_name_(GetLocalDeviceName(da.name())) {}
+      : Device(env, da), local_dev_name_(GetLocalDeviceName(da.name())) {}
 
   Status Sync() override { return Status::OK(); }
   Allocator* GetAllocator(AllocatorAttributes attr) override { return nullptr; }
@@ -68,18 +69,50 @@ void NewRemoteDevices(Env* env, WorkerCacheInterface* worker_cache,
     GetStatusResponse resp;
   };
   Call* call = new Call;
-  auto cb = [env, worker_cache, worker_name, done, wi, call](const Status& s) {
+  auto cb = [env, worker_cache, worker_name, done, wi,
+             call](const Status& status) {
+    Status s = status;
     std::vector<Device*> remote_devices;
+    auto cleanup = gtl::MakeCleanup(
+        [&worker_cache, &worker_name, &wi, &done, &remote_devices, &s, call] {
+          worker_cache->ReleaseWorker(worker_name, wi);
+          done(s, &remote_devices);
+          delete call;
+        });
     if (s.ok()) {
+      DeviceNameUtils::ParsedName worker_name_parsed;
+      if (!DeviceNameUtils::ParseFullName(worker_name, &worker_name_parsed) ||
+          !worker_name_parsed.has_job || !worker_name_parsed.has_replica ||
+          !worker_name_parsed.has_task) {
+        s = errors::InvalidArgument("Could not parse worker name: ",
+                                    worker_name);
+        LOG(WARNING) << s;
+        return;
+      }
       remote_devices.reserve(call->resp.device_attributes_size());
       for (const DeviceAttributes& da : call->resp.device_attributes()) {
-        auto d = new RemoteDevice(env, da);
-        remote_devices.push_back(d);
+        DeviceNameUtils::ParsedName device_name_parsed;
+        CHECK(DeviceNameUtils::ParseFullName(da.name(), &device_name_parsed))
+            << "Device attribute name '" << da.name() << "' could not be "
+            << "parsed. Device Attribute: " << da.DebugString();
+        // Preserve the exact name, if possible.
+        // TODO(b/37868888): Simplify when legacy device name formats removed.
+        if (device_name_parsed.job == worker_name_parsed.job &&
+            device_name_parsed.replica == worker_name_parsed.replica &&
+            device_name_parsed.task == worker_name_parsed.task) {
+          auto d = new RemoteDevice(env, da);
+          remote_devices.push_back(d);
+        } else {
+          DeviceAttributes da_rewritten = da;
+          da_rewritten.set_name(DeviceNameUtils::FullName(
+              worker_name_parsed.job, worker_name_parsed.replica,
+              worker_name_parsed.task, device_name_parsed.type,
+              device_name_parsed.id));
+          auto d = new RemoteDevice(env, da_rewritten);
+          remote_devices.push_back(d);
+        }
       }
     }
-    worker_cache->ReleaseWorker(worker_name, wi);
-    done(s, &remote_devices);
-    delete call;
   };
   wi->GetStatusAsync(&call->req, &call->resp, cb);
 }
diff --git a/tensorflow/core/distributed_runtime/rendezvous_mgr_interface.h b/tensorflow/core/distributed_runtime/rendezvous_mgr_interface.h
index 04c1fc248ef2a1c76319093fa1f147c139b711dc..43267d4362fac45624962229753ceb766c88eb95 100644
--- a/tensorflow/core/distributed_runtime/rendezvous_mgr_interface.h
+++ b/tensorflow/core/distributed_runtime/rendezvous_mgr_interface.h
@@ -25,6 +25,23 @@ limitations under the License.
 
 namespace tensorflow {
 
+struct WorkerSession;
+
+// RemoteRendezvous follow a 2-part initialization. First the objects are
+// constructed. Eventually, they will be initialized. Clients of the
+// RendezvousMgrInterface must guarantee to call Initialize on the returned
+// RemoteRendezvous eventually.
+//
+// Partially initialized RemoteRendezvous must respect the Rendezvous interface
+// (i.e. Send() must never block), however implementations are not expected to
+// actually perform the underlying operations until after the RemoteRendezvous
+// has been Initialize'd.
+class RemoteRendezvous : public Rendezvous {
+ public:
+  // Fully construct the RemoteRendezvous.
+  virtual Status Initialize(WorkerSession* session) = 0;
+};
+
 // RendezvousMgr keeps track of a set of local rendezvous instances.
 // All tensors sent by this worker are buffered in a RendezvousMgr
 // until the tensor is received.  Each global unique "step_id"
@@ -51,7 +68,10 @@ class RendezvousMgrInterface {
   // Returns Rendezvous supporting send and recv among workers in the
   // "step_id".  The caller takes ownership of one reference on the
   // returned Rendezvous instance.
-  virtual Rendezvous* Find(int64 step_id) = 0;
+  //
+  // Note: the caller must guarantee to eventually call Initialize on the
+  // returned RemoteRendezvous
+  virtual RemoteRendezvous* Find(int64 step_id) = 0;
 
   // Finds the local rendezvous instance for the "step_id".  Runs
   // "done" when the tensor for "key" is produced or an error occurs.
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_remote_master.cc b/tensorflow/core/distributed_runtime/rpc/grpc_remote_master.cc
index c3b76ed31bcf841b90b3afa8e215db36a40b2c5b..bf72d9a7fcdb5e027be968e94c85970b6b127c14 100644
--- a/tensorflow/core/distributed_runtime/rpc/grpc_remote_master.cc
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_remote_master.cc
@@ -15,6 +15,8 @@ limitations under the License.
 
 #include "tensorflow/core/distributed_runtime/rpc/grpc_remote_master.h"
 
+#include <utility>
+
 #include "tensorflow/core/distributed_runtime/call_options.h"
 #include "tensorflow/core/distributed_runtime/master_interface.h"
 #include "tensorflow/core/distributed_runtime/rpc/grpc_master_service_impl.h"
@@ -29,7 +31,7 @@ namespace tensorflow {
 // that uses gRPC to talk to the Master service.
 class GrpcRemoteMaster : public MasterInterface {
  public:
-  explicit GrpcRemoteMaster(SharedGrpcChannelPtr client_channel)
+  explicit GrpcRemoteMaster(const SharedGrpcChannelPtr& client_channel)
       : stub_(grpc::MasterService::NewStub(client_channel)) {}
 
   ~GrpcRemoteMaster() override {}
@@ -106,7 +108,7 @@ class GrpcRemoteMaster : public MasterInterface {
   }
 };
 
-MasterInterface* NewGrpcMaster(SharedGrpcChannelPtr channel) {
+MasterInterface* NewGrpcMaster(const SharedGrpcChannelPtr& channel) {
   return new GrpcRemoteMaster(channel);
 }
 
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_remote_master.h b/tensorflow/core/distributed_runtime/rpc/grpc_remote_master.h
index 881a6b10e30eb570d8fe3ae790873bf9c87e37e7..d661caaa6029dc29c9eb8983c009f232fb2b3cbf 100644
--- a/tensorflow/core/distributed_runtime/rpc/grpc_remote_master.h
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_remote_master.h
@@ -21,7 +21,7 @@ limitations under the License.
 
 namespace tensorflow {
 // Returns a MasterInterface wrapped around the gRPC channel `channel`.
-MasterInterface* NewGrpcMaster(SharedGrpcChannelPtr channel);
+MasterInterface* NewGrpcMaster(const SharedGrpcChannelPtr& channel);
 }  // namespace tensorflow
 
 #endif  // THIRD_PARTY_TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RPC_GRPC_REMOTE_MASTER_H_
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_remote_worker.cc b/tensorflow/core/distributed_runtime/rpc/grpc_remote_worker.cc
index 36626e1a33f03e17a7e34b63993f25eece2b647c..2b1a47a93f906c3341c535105ad97578b45d209c 100644
--- a/tensorflow/core/distributed_runtime/rpc/grpc_remote_worker.cc
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_remote_worker.cc
@@ -15,6 +15,8 @@ limitations under the License.
 
 #include "tensorflow/core/distributed_runtime/rpc/grpc_remote_worker.h"
 
+#include <utility>
+
 #include "grpc++/grpc++.h"
 
 #include "tensorflow/core/common_runtime/process_util.h"
@@ -37,7 +39,7 @@ class GrpcRemoteWorker : public WorkerInterface {
   explicit GrpcRemoteWorker(SharedGrpcChannelPtr channel,
                             ::grpc::CompletionQueue* completion_queue,
                             WorkerCacheLogger* logger)
-      : channel_(channel),
+      : channel_(std::move(channel)),
         cq_(completion_queue),
         getstatus_(Method(GrpcWorkerMethod::kGetStatus)),
         createworkersession_(Method(GrpcWorkerMethod::kCreateWorkerSession)),
@@ -272,7 +274,7 @@ class GrpcRemoteWorker : public WorkerInterface {
 WorkerInterface* NewGrpcRemoteWorker(SharedGrpcChannelPtr channel,
                                      ::grpc::CompletionQueue* completion_queue,
                                      WorkerCacheLogger* logger) {
-  return new GrpcRemoteWorker(channel, completion_queue, logger);
+  return new GrpcRemoteWorker(std::move(channel), completion_queue, logger);
 }
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_server_lib.cc b/tensorflow/core/distributed_runtime/rpc/grpc_server_lib.cc
index 7160962b168a392796a9f2b57583a3e750384e9e..3867dd1f4d025ac2ae4529aae48afb6aedd36a1f 100644
--- a/tensorflow/core/distributed_runtime/rpc/grpc_server_lib.cc
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_server_lib.cc
@@ -63,10 +63,8 @@ class NoReusePortOption : public ::grpc::ServerBuilderOption {
 };
 
 // static utility function
-RendezvousMgrInterface* NewRpcRendezvousMgr(
-    const WorkerEnv* env, const string& worker_name,
-    WorkerCacheInterface* worker_cache) {
-  return new RpcRendezvousMgr(env, worker_name, worker_cache);
+RendezvousMgrInterface* NewRpcRendezvousMgr(const WorkerEnv* env) {
+  return new RpcRendezvousMgr(env);
 }
 
 }  // namespace
@@ -84,6 +82,9 @@ GrpcServer::~GrpcServer() {
   // TODO(mrry): Refactor the *Env classes so that it is less fiddly
   // to destroy them.
 
+  // Shut down all outstanding rendezvous.
+  delete worker_env_.rendezvous_mgr;
+
   // We must delete graph_mgr before device_mgr, due to shared
   // ownership of OpKernels in the executors. (The graph_mgr will
   // free all stateless OpKernels, and pass over borrowed stateful
@@ -91,8 +92,10 @@ GrpcServer::~GrpcServer() {
   // OpSegments.)
   if (worker_env_.session_mgr != nullptr) {
     delete worker_env_.session_mgr;  // Deletes graph_mgr's.
+  } else {
+    // Note: session_mgr's legacy_session_ deletes device_mgr now.
+    delete worker_env_.device_mgr;
   }
-  delete worker_env_.device_mgr;
 
   // Do not delete (as these are not owned by the server):
   // - master_env_.env
@@ -100,8 +103,9 @@ GrpcServer::~GrpcServer() {
   // - worker_env_.compute_pool
 }
 
-Status GrpcServer::Init(ServiceInitFunction service_func,
-                        RendezvousMgrCreationFunction rendevous_mgr_func) {
+Status GrpcServer::Init(
+    ServiceInitFunction service_func,
+    const RendezvousMgrCreationFunction& rendezvous_mgr_func) {
   mutex_lock l(mu_);
   CHECK_EQ(state_, NEW);
   master_env_.env = env_;
@@ -117,7 +121,11 @@ Status GrpcServer::Init(ServiceInitFunction service_func,
                       "/task:", server_def_.task_index());
   TF_RETURN_IF_ERROR(DeviceFactory::AddDevices(sess_opts, name_prefix,
                                                &master_env_.local_devices));
-  worker_env_.device_mgr = new DeviceMgr(master_env_.local_devices);
+  worker_env_.local_devices = master_env_.local_devices;
+  worker_env_.device_mgr = new DeviceMgr(worker_env_.local_devices);
+  worker_env_.rendezvous_mgr = rendezvous_mgr_func == nullptr
+                                   ? new RpcRendezvousMgr(&worker_env_)
+                                   : rendezvous_mgr_func(&worker_env_);
   string unused;
   string default_worker_name;
   if (!DeviceNameUtils::SplitDeviceName(master_env_.local_devices[0]->name(),
@@ -189,20 +197,18 @@ Status GrpcServer::Init(ServiceInitFunction service_func,
   }
 
   WorkerCacheInterface* worker_cache;
-  TF_RETURN_IF_ERROR(WorkerCacheFactory(server_def_, &worker_cache));
+  WorkerCacheFactoryOptions worker_cache_factory_options(server_def_);
+  TF_RETURN_IF_ERROR(
+      WorkerCacheFactory(worker_cache_factory_options, &worker_cache));
   CHECK_NE(nullptr, worker_cache);
 
   // Set up worker environment.
-  std::unique_ptr<RendezvousMgrInterface> rendezvous_mgr(
-      rendevous_mgr_func == nullptr ?
-      new RpcRendezvousMgr(&worker_env_, name_prefix, worker_cache) :
-      rendevous_mgr_func(&worker_env_, name_prefix, worker_cache));
   worker_env_.session_mgr = new SessionMgr(
       &worker_env_, SessionMgr::WorkerNameFromServerDef(server_def_),
       std::unique_ptr<WorkerCacheInterface>(worker_cache),
-      std::move(rendezvous_mgr),
       [this](const ServerDef& server_def, WorkerCacheInterface** worker_cache) {
-        return WorkerCacheFactory(server_def, worker_cache);
+        WorkerCacheFactoryOptions options(server_def);
+        return WorkerCacheFactory(options, worker_cache);
       });
   worker_env_.compute_pool = ComputePool(sess_opts);
 
@@ -212,11 +218,19 @@ Status GrpcServer::Init(ServiceInitFunction service_func,
   master_env_.master_session_factory =
       [config](
           SessionOptions options, const MasterEnv* env,
-          std::unique_ptr<std::vector<std::unique_ptr<Device>>> remote_devs) {
+          std::unique_ptr<std::vector<std::unique_ptr<Device>>> remote_devs,
+          std::unique_ptr<WorkerCacheInterface> worker_cache,
+          std::unique_ptr<DeviceSet> device_set) {
         options.config.MergeFrom(config);
         return new MasterSession(options, env, std::move(remote_devs),
+                                 std::move(worker_cache), std::move(device_set),
                                  CreateNoOpStatsPublisher);
       };
+  master_env_.worker_cache_factory =
+      [this](const WorkerCacheFactoryOptions& options,
+             WorkerCacheInterface** worker_cache) {
+        return WorkerCacheFactory(options, worker_cache);
+      };
 
   // Provide direct access to the master from in-process clients.
   LocalMaster::Register(target(), master_impl_.get(),
@@ -225,13 +239,11 @@ Status GrpcServer::Init(ServiceInitFunction service_func,
   return Status::OK();
 }
 
-Status GrpcServer::Init() {
-  return Init(nullptr, nullptr);
-}
+Status GrpcServer::Init() { return Init(nullptr, nullptr); }
 
-Status GrpcServer::ParseChannelSpec(const ServerDef& server_def,
+Status GrpcServer::ParseChannelSpec(const WorkerCacheFactoryOptions& options,
                                     GrpcChannelSpec* channel_spec) {
-  for (const auto& job : server_def.cluster().job()) {
+  for (const auto& job : options.cluster_def->job()) {
     std::map<int, string> host_ports;
     for (const auto& task : job.tasks()) {
       string& host_port = host_ports[task.first];
@@ -241,8 +253,7 @@ Status GrpcServer::ParseChannelSpec(const ServerDef& server_def,
                                        task.first, "\": ", host_port, " and ",
                                        task.second);
       }
-      if (job.name() == server_def.job_name() &&
-          task.first == server_def.task_index()) {
+      if (job.name() == *options.job_name && task.first == options.task_index) {
         host_port = strings::StrCat("localhost:", bound_port_);
       } else {
         host_port = task.second;
@@ -253,17 +264,26 @@ Status GrpcServer::ParseChannelSpec(const ServerDef& server_def,
   return Status::OK();
 }
 
-Status GrpcServer::WorkerCacheFactory(const ServerDef& server_def,
+Status GrpcServer::WorkerCacheFactory(const WorkerCacheFactoryOptions& options,
                                       WorkerCacheInterface** worker_cache) {
-  string name_prefix =
-      strings::StrCat("/job:", server_def.job_name(), "/replica:0",
-                      "/task:", server_def.task_index());
+  if (options.job_name == nullptr || options.job_name->empty()) {
+    Status s = errors::InvalidArgument(
+        "The master (current machine) is not included in the provided "
+        "cluster_def. ",
+        options.cluster_def->DebugString());
+    LOG(WARNING) << s;
+    return s;
+  }
 
   GrpcChannelSpec channel_spec;
-  TF_RETURN_IF_ERROR(ParseChannelSpec(server_def, &channel_spec));
+  TF_RETURN_IF_ERROR(ParseChannelSpec(options, &channel_spec));
+
+  std::unique_ptr<GrpcChannelCache> channel_cache(
+      NewGrpcChannelCache(channel_spec, GetChannelCreationFunction()));
+
+  string name_prefix = strings::StrCat("/job:", *options.job_name, "/replica:0",
+                                       "/task:", options.task_index);
 
-  std::unique_ptr<GrpcChannelCache> channel_cache(NewGrpcChannelCache(
-      channel_spec, GetChannelCreationFunction(server_def)));
   const string host_port = channel_cache->TranslateTask(name_prefix);
   int requested_port;
 
@@ -349,8 +369,7 @@ std::shared_ptr<::grpc::ServerCredentials> GrpcServer::GetServerCredentials(
   return ::grpc::InsecureServerCredentials();
 }
 
-ChannelCreationFunction GrpcServer::GetChannelCreationFunction(
-    const ServerDef& server_def) const {
+ChannelCreationFunction GrpcServer::GetChannelCreationFunction() const {
   // We can do this because SparseGrpcChannelCache is robust to nullptr being
   // returned by the channel creation function
   return ConvertToChannelCreationFunction(NewHostPortGrpcChannel);
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_server_lib.h b/tensorflow/core/distributed_runtime/rpc/grpc_server_lib.h
index 3b66291a9ab5fb6974fcb17b0d5413de5944e3a6..7b54bb84c88fe0f6669a5fd63744722c0d5231b7 100644
--- a/tensorflow/core/distributed_runtime/rpc/grpc_server_lib.h
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_server_lib.h
@@ -37,9 +37,7 @@ class GrpcWorker;
 class Master;
 
 // function that creates a RendezvousMgr.
-typedef std::function<RendezvousMgrInterface*(
-    const WorkerEnv*, const std::string& worker_name,
-    WorkerCacheInterface* worker_cache)>
+typedef std::function<RendezvousMgrInterface*(const WorkerEnv*)>
     RendezvousMgrCreationFunction;
 
 // function that registers a service to the server. The service needs to
@@ -67,7 +65,7 @@ class GrpcServer : public ServerInterface {
 
  protected:
   Status Init(ServiceInitFunction service_func,
-              RendezvousMgrCreationFunction rendezvous_mgr_func);
+              const RendezvousMgrCreationFunction& rendezvous_mgr_func);
 
   Status Init();
 
@@ -75,17 +73,16 @@ class GrpcServer : public ServerInterface {
   virtual std::shared_ptr<::grpc::ServerCredentials> GetServerCredentials(
       const ServerDef& server_def) const;
 
-  virtual ChannelCreationFunction GetChannelCreationFunction(
-      const ServerDef& server_def) const;
+  virtual ChannelCreationFunction GetChannelCreationFunction() const;
 
   virtual std::unique_ptr<Master> CreateMaster(MasterEnv* master_env);
 
   // Creates a WorkerCacheInterface for a session.
-  Status WorkerCacheFactory(const ServerDef& server_def,
+  Status WorkerCacheFactory(const WorkerCacheFactoryOptions& options,
                             WorkerCacheInterface** worker_cache);
 
-  // Parses a ServerDef into a GrpcChannelSpec.
-  Status ParseChannelSpec(const ServerDef& server_def,
+  // Parses a WorkerCacheFactoryOptions into a GrpcChannelSpec.
+  Status ParseChannelSpec(const WorkerCacheFactoryOptions& options,
                           GrpcChannelSpec* channel_spec);
 
   // Returns the port to which this server is bound.
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_session.cc b/tensorflow/core/distributed_runtime/rpc/grpc_session.cc
index 1aacef8a26ade2f48f74687992221e1004ba74b3..38d59d5bb59978be6160dd9dcdf9225fd2588d3f 100644
--- a/tensorflow/core/distributed_runtime/rpc/grpc_session.cc
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_session.cc
@@ -43,7 +43,7 @@ const size_t kSchemePrefixLength = strlen(kSchemePrefix);
 /* static */
 Status GrpcSession::Create(const SessionOptions& options,
                            std::unique_ptr<GrpcSession>* out_session) {
-  std::unique_ptr<GrpcSession> ret(new GrpcSession(options));
+  std::unique_ptr<GrpcSession> session(new GrpcSession(options));
   std::unique_ptr<MasterInterface> master;
   // For testing, we enable the client to disable the use of the local
   // master registry, so that the RPC stack is exercised.
@@ -56,8 +56,8 @@ Status GrpcSession::Create(const SessionOptions& options,
         options.target.substr(kSchemePrefixLength), &master_channel));
     master.reset(NewGrpcMaster(master_channel));
   }
-  ret->SetRemoteMaster(std::move(master));
-  *out_session = std::move(ret);
+  session->SetRemoteMaster(std::move(master));
+  *out_session = std::move(session);
   return Status::OK();
 }
 
@@ -102,6 +102,7 @@ Status GrpcSession::CreateImpl(CallOptions* call_options,
   CreateSessionRequest req;
   *req.mutable_config() = options_.config;
   *req.mutable_graph_def() = graph;
+  req.set_target(options_.target);
   ReEncodeConsts(req.mutable_graph_def());
   CreateSessionResponse resp;
   Status s = master_->CreateSession(call_options, &req, &resp);
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_tensor_coding.cc b/tensorflow/core/distributed_runtime/rpc/grpc_tensor_coding.cc
index c6260afa20e7b0a91597fb35e071b9d12a7ed404..90e311a493079526c10c12d44cbeac609bfa6847 100644
--- a/tensorflow/core/distributed_runtime/rpc/grpc_tensor_coding.cc
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_tensor_coding.cc
@@ -16,6 +16,7 @@ limitations under the License.
 #include "tensorflow/core/distributed_runtime/rpc/grpc_tensor_coding.h"
 #include "grpc++/support/byte_buffer.h"
 #include "grpc++/support/slice.h"
+#include "tensorflow/core/common_runtime/dma_helper.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/tensor_reference.h"
 #include "tensorflow/core/lib/gtl/inlined_vector.h"
@@ -27,10 +28,9 @@ namespace tensorflow {
 namespace grpc {
 
 static void do_nothing(void* raw) {}
-static void unref_tensorreference(void* raw) {
-  TensorReference* ref = static_cast<TensorReference*>(raw);
-  ref->Unref();
-  delete ref;
+static void unref_tensorbuffer(void* raw) {
+  TensorBuffer* buf = static_cast<TensorBuffer*>(raw);
+  buf->Unref();
 }
 
 void EncodeRecvTensorResponseToByteBuffer(const RecvTensorResponse& proto,
@@ -166,7 +166,7 @@ void EncodeTensorToByteBuffer(bool is_dead, const Tensor& val,
         (e_skeleton.size() +
          VarLengthEncodingSize(TensorProto::kTensorContentFieldNumber,
                                tdata.size()));
-    string header;  // All of RecvTensorRequest except the tensor() field
+    string header;  // All of RecvTensorResponse except the tensor() field
     response.AppendToString(&header);
 
     size_t expected_size =
@@ -219,8 +219,8 @@ void EncodeTensorToByteBuffer(bool is_dead, const Tensor& val,
 
     if (tensor_data_is_large) {
       // Encode the actual tensor data by pointing to the backing store,
-      // and add a special zero-length slice that is really a TensorReference
-      // object that we will destroy when we are done.
+      // and add a special zero-length slice that is really a TensorBuffer
+      // reference that we will unref when we are done.
       //
       // TODO(jeff): Note that this approach relies on the fact that
       // slices are destroyed in the order in which they are added to
@@ -241,17 +241,15 @@ void EncodeTensorToByteBuffer(bool is_dead, const Tensor& val,
 
       // (E) Encode tensor data, but by sharing backing store
 
-      // TODO(jeff,sanjay): It'd be nice to avoid this TensorReference
-      // allocation, and instead get our hands on the underlying
-      // TensorBuffer object and just directly ref it here and unref
-      // it in unref_tensorreference.
-      TensorReference* ref = new TensorReference(val);
+      const TensorBuffer* buf = DMAHelper::buffer(&val);
+      buf->Ref();
       gpr_slice s1 = gpr_slice_new(
           const_cast<void*>(static_cast<const void*>(tdata.data())),
           tdata.size(), do_nothing);
       slices[1] = ::grpc::Slice(s1, ::grpc::Slice::STEAL_REF);
 
-      gpr_slice s2 = gpr_slice_new(ref, 0, unref_tensorreference);
+      gpr_slice s2 =
+          gpr_slice_new(const_cast<TensorBuffer*>(buf), 0, unref_tensorbuffer);
       slices[2] = ::grpc::Slice(s2, ::grpc::Slice::STEAL_REF);
       num_slices += 2;
     }
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_worker_service.cc b/tensorflow/core/distributed_runtime/rpc/grpc_worker_service.cc
index c11266587d8cba97b742818c85d2ae54f7e32f26..873ef8588f4ffee07df9f8e33a4d6fd8884f36a8 100644
--- a/tensorflow/core/distributed_runtime/rpc/grpc_worker_service.cc
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_worker_service.cc
@@ -113,6 +113,7 @@ class GrpcWorkerService : public AsyncServiceInterface {
     // completes, and we may decide to bound some of the request
     // types.
     ENQUEUE_REQUEST(GetStatus, false);
+    ENQUEUE_REQUEST(CreateWorkerSession, false);
     ENQUEUE_REQUEST(CleanupAll, false);
     ENQUEUE_REQUEST(RegisterGraph, false);
     ENQUEUE_REQUEST(DeregisterGraph, false);
@@ -181,6 +182,16 @@ class GrpcWorkerService : public AsyncServiceInterface {
     ENQUEUE_REQUEST(GetStatus, false);
   }
 
+  void CreateWorkerSessionHandler(
+      WorkerCall<CreateWorkerSessionRequest, CreateWorkerSessionResponse>*
+          call) {
+    Schedule([this, call]() {
+      Status s = worker_->CreateWorkerSession(&call->request, &call->response);
+      call->SendResponse(ToGrpcStatus(s));
+    });
+    ENQUEUE_REQUEST(CreateWorkerSession, false);
+  }
+
   void CleanupAllHandler(
       WorkerCall<CleanupAllRequest, CleanupAllResponse>* call) {
     Schedule([this, call]() {
@@ -298,7 +309,6 @@ void GrpcWorker::RecvTensorAsync(CallOptions* opts,
                                  ::grpc::ByteBuffer* response,
                                  StatusCallback done) {
   const int64 step_id = request->step_id();
-  WorkerSession* session = env_->session_mgr->WorkerSessionForStepId(step_id);
   const string& key = request->rendezvous_key();
   TRACEPRINTF("RecvTensor: %lld %s", step_id, key.c_str());
   Rendezvous::ParsedKey parsed;
@@ -317,7 +327,7 @@ void GrpcWorker::RecvTensorAsync(CallOptions* opts,
   // of execution of the callback lambda body below, an RPC
   // cancellation should abort the rendezvous.
   opts->SetCancelCallback([this, step_id]() { AbortStep(step_id); });
-  session->rendezvous_mgr->RecvLocalAsync(
+  env_->rendezvous_mgr->RecvLocalAsync(
       step_id, parsed,
       [opts, response, done, src_dev](const Status& status,
                                       const Rendezvous::Args& send_args,
diff --git a/tensorflow/core/distributed_runtime/rpc/rpc_rendezvous_mgr.cc b/tensorflow/core/distributed_runtime/rpc/rpc_rendezvous_mgr.cc
index 7518a289fdb1855900a5a3bb594a0135b3e959cc..8265100061e4cb0a1a3ea1da96abb5b563f010c8 100644
--- a/tensorflow/core/distributed_runtime/rpc/rpc_rendezvous_mgr.cc
+++ b/tensorflow/core/distributed_runtime/rpc/rpc_rendezvous_mgr.cc
@@ -38,9 +38,8 @@ namespace {
 
 class RpcRemoteRendezvous : public BaseRemoteRendezvous {
  public:
-  RpcRemoteRendezvous(const WorkerEnv* env, const string& worker_name,
-                      WorkerCacheInterface* cache, int64 step_id)
-      : BaseRemoteRendezvous(env, worker_name, step_id, false), cache_(cache) {}
+  RpcRemoteRendezvous(const WorkerEnv* env, int64 step_id)
+      : BaseRemoteRendezvous(env, step_id, false) {}
 
  protected:
   void RecvFromRemoteAsync(const Rendezvous::ParsedKey& parsed,
@@ -50,7 +49,6 @@ class RpcRemoteRendezvous : public BaseRemoteRendezvous {
  private:
   ~RpcRemoteRendezvous() override {}
 
-  WorkerCacheInterface* const cache_;  // Not owned.
   TF_DISALLOW_COPY_AND_ASSIGN(RpcRemoteRendezvous);
 };
 
@@ -204,75 +202,10 @@ static RpcRecvTensorFreeList* get_call_freelist() {
   return call_freelist;
 }
 
-// A private cache that wraps worker_cache and allows reuse of
-// WorkerInterface objects.
-class WorkerFreeListCache : public WorkerCacheInterface {
- public:
-  explicit WorkerFreeListCache(WorkerCacheInterface* w) : wrapped_(w) {}
-
-  ~WorkerFreeListCache() {
-    for (auto p : workers_) {
-      wrapped_->ReleaseWorker(p.first, p.second.worker);
-    }
-  }
-
-  void ListWorkers(std::vector<string>* workers) const override {
-    wrapped_->ListWorkers(workers);
-  }
-
-  WorkerInterface* CreateWorker(const string& target) override {
-    mutex_lock l(mu_);
-    auto p = workers_.find(target);
-    if (p != workers_.end()) {
-      return p->second.worker;
-    }
-    WorkerState state;
-    state.worker = wrapped_->CreateWorker(target);
-    if (state.worker != nullptr) {
-      workers_.insert(std::make_pair(target, state));
-    }
-    return state.worker;
-  }
-
-  void ReleaseWorker(const string& target, WorkerInterface* worker) override {
-    // TODO(jeff,sanjay): Should decrement ref-count when we implement eviction.
-  }
-
-  bool GetDeviceLocalityNonBlocking(const string& device,
-                                    DeviceLocality* locality) override {
-    return wrapped_->GetDeviceLocalityNonBlocking(device, locality);
-  }
-
-  void GetDeviceLocalityAsync(const string& device, DeviceLocality* locality,
-                              StatusCallback done) override {
-    wrapped_->GetDeviceLocalityAsync(device, locality, done);
-  }
-
-  void SetLogging(bool active) override { wrapped_->SetLogging(active); }
-
-  void ClearLogs() override { wrapped_->ClearLogs(); }
-
-  bool RetrieveLogs(int64 step_id, StepStats* ss) override {
-    return wrapped_->RetrieveLogs(step_id, ss);
-  }
-
- private:
-  WorkerCacheInterface* wrapped_;
-
-  // Information kept per created WorkerInterface.
-  struct WorkerState {
-    WorkerInterface* worker;
-    // TODO(jeff,sanjay): Add reference count if we support eviction.
-  };
-
-  // TODO(jeff,sanjay): Eviction when the map becomes too big.
-  mutex mu_;
-  std::unordered_map<string, WorkerState> workers_ GUARDED_BY(mu_);
-};
-
 void RpcRemoteRendezvous::RecvFromRemoteAsync(
     const Rendezvous::ParsedKey& parsed, const Rendezvous::Args& recv_args,
     DoneCallback done) {
+  CHECK(is_initialized());
   Status s;
 
   // Prepare a RecvTensor call that can handle being aborted.
@@ -284,17 +217,21 @@ void RpcRemoteRendezvous::RecvFromRemoteAsync(
     s = errors::Internal(parsed.src_device,
                          " is invalid remote source device.");
   }
-  WorkerInterface* rwi = cache_->CreateWorker(call->src_worker_);
+  WorkerSession* sess = session();
+  WorkerInterface* rwi = sess->worker_cache->CreateWorker(call->src_worker_);
   if (s.ok() && rwi == nullptr) {
     s = errors::Internal("No worker known as ", call->src_worker_);
   }
 
   Device* dst_device;
   if (s.ok()) {
-    s = env_->device_mgr->LookupDevice(parsed.dst_device, &dst_device);
+    s = sess->device_mgr->LookupDevice(parsed.dst_device, &dst_device);
   }
   if (!s.ok()) {
-    get_call_freelist()->Release(call, cache_);
+    if (rwi != nullptr) {
+      sess->worker_cache->ReleaseWorker(call->src_worker_, rwi);
+    }
+    get_call_freelist()->Release(call, sess->worker_cache.get());
     done(s, Args(), recv_args, Tensor{}, false);
     return;
   }
@@ -314,26 +251,21 @@ void RpcRemoteRendezvous::RecvFromRemoteAsync(
     // current status should be bad.
     Status s = call->status();
     call->done()(s, Args(), call->recv_args(), call->tensor(), call->is_dead());
-    cache_->ReleaseWorker(call->src_worker_, call->wi_);
+    session()->worker_cache->ReleaseWorker(call->src_worker_, call->wi_);
     call->wi_ = nullptr;
-    get_call_freelist()->Release(call, cache_);
+    get_call_freelist()->Release(call, session()->worker_cache.get());
     Unref();
   });
 }
 
 }  // namespace
 
-RpcRendezvousMgr::RpcRendezvousMgr(const WorkerEnv* env,
-                                   const string& worker_name,
-                                   WorkerCacheInterface* worker_cache)
-    : BaseRendezvousMgr(env, worker_name),
-      cache_(new WorkerFreeListCache(worker_cache)) {}
+RpcRendezvousMgr::RpcRendezvousMgr(const WorkerEnv* env)
+    : BaseRendezvousMgr(env) {}
 
 BaseRemoteRendezvous* RpcRendezvousMgr::Create(int64 step_id,
-                                               const WorkerEnv* worker_env,
-                                               const string& worker_name) {
-  return new RpcRemoteRendezvous(worker_env, worker_name, cache_.get(),
-                                 step_id);
+                                               const WorkerEnv* worker_env) {
+  return new RpcRemoteRendezvous(worker_env, step_id);
 }
 
 }  // end namespace tensorflow
diff --git a/tensorflow/core/distributed_runtime/rpc/rpc_rendezvous_mgr.h b/tensorflow/core/distributed_runtime/rpc/rpc_rendezvous_mgr.h
index 75dc62d98fd635cdc797c593a2cd848e5319da57..34c48a79177618679b99ba2b2476b05b3954bffd 100644
--- a/tensorflow/core/distributed_runtime/rpc/rpc_rendezvous_mgr.h
+++ b/tensorflow/core/distributed_runtime/rpc/rpc_rendezvous_mgr.h
@@ -17,13 +17,13 @@ limitations under the License.
 #define TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RPC_RPC_RENDEZVOUS_MGR_H_
 
 #include "tensorflow/core/distributed_runtime/base_rendezvous_mgr.h"
-#include "tensorflow/core/distributed_runtime/worker_cache.h"
 #include "tensorflow/core/distributed_runtime/worker_env.h"
-#include "tensorflow/core/distributed_runtime/worker_session.h"
 #include "tensorflow/core/platform/macros.h"
 
 namespace tensorflow {
 
+class DeviceMgr;
+
 // RendezvousMgr keeps track of a set of local rendezvous instances.
 // All tensors sent by this worker are buffered in a RendezvousMgr
 // until the tensor is received.  Each global unique "step_id"
@@ -44,17 +44,12 @@ namespace tensorflow {
 // RendezvousMgr must have keys generated by Rendezvous::CreateKey.
 class RpcRendezvousMgr : public BaseRendezvousMgr {
  public:
-  explicit RpcRendezvousMgr(const WorkerEnv* env, const string& worker_name,
-                            WorkerCacheInterface* worker_cache);
+  explicit RpcRendezvousMgr(const WorkerEnv* env);
 
  protected:
-  BaseRemoteRendezvous* Create(int64 step_id, const WorkerEnv* worker_env,
-                               const string& session_name) override;
+  BaseRemoteRendezvous* Create(int64 step_id, const WorkerEnv* worker_env);
 
  private:
-  // Private cache_ that allows us to reuse WorkerInterface objects.
-  std::unique_ptr<WorkerCacheInterface> cache_;
-
   TF_DISALLOW_COPY_AND_ASSIGN(RpcRendezvousMgr);
 };
 
diff --git a/tensorflow/core/distributed_runtime/rpc/rpc_rendezvous_mgr_test.cc b/tensorflow/core/distributed_runtime/rpc/rpc_rendezvous_mgr_test.cc
index 9b778eab3a593fe219602ba5d4cf0b04565f6ce4..2d0d76623d4e9b83d101b362b7a2316bc7a8084f 100644
--- a/tensorflow/core/distributed_runtime/rpc/rpc_rendezvous_mgr_test.cc
+++ b/tensorflow/core/distributed_runtime/rpc/rpc_rendezvous_mgr_test.cc
@@ -68,9 +68,9 @@ class RpcRendezvousMgrTest : public ::testing::Test {
       : cache_(new DummyWorkerCache),
         worker_session_("/job:mnist/replica:1/task:2",
                         std::unique_ptr<WorkerCacheInterface>(cache_),
-                        std::unique_ptr<RendezvousMgrInterface>(),
+                        std::unique_ptr<DeviceMgr>(),
                         std::unique_ptr<GraphMgr>()),
-        rmgr_(&env, worker_session_.worker_name, cache_) {
+        rmgr_(&env) {
     env.env = Env::Default();
   }
 
@@ -87,7 +87,8 @@ TEST_F(RpcRendezvousMgrTest, LocalSendRecv) {
       "/job:mnist/replica:1/task:2/cpu:0", 7890,
       "/job:mnist/replica:1/task:2/cpu:1", "foo", FrameAndIter(0, 0)));
   {
-    Rendezvous* rendez = rmgr_.Find(step_id);
+    RemoteRendezvous* rendez = rmgr_.Find(step_id);
+    TF_ASSERT_OK(rendez->Initialize(&worker_session_));
     core::ScopedUnref unref(rendez);
     Rendezvous::Args args;
     TF_ASSERT_OK(rendez->Send(key, args, V("peach"), false));
@@ -107,7 +108,7 @@ TEST_F(RpcRendezvousMgrTest, LocalAbort) {
       "/job:mnist/replica:1/task:2/cpu:1", "foo", FrameAndIter(0, 0)));
   {  // Explicit Abort().
     const int64 step_id = 123;
-    Rendezvous* rendez = rmgr_.Find(step_id);
+    RemoteRendezvous* rendez = rmgr_.Find(step_id);
     core::ScopedUnref unref(rendez);
     SchedClosure([this, rendez]() {
       env.env->SleepForMicroseconds(100 * 1000);
@@ -116,11 +117,12 @@ TEST_F(RpcRendezvousMgrTest, LocalAbort) {
     Tensor val(DT_STRING);
     bool val_dead = false;
     Rendezvous::Args args;
+    TF_ASSERT_OK(rendez->Initialize(&worker_session_));
     EXPECT_TRUE(errors::IsAborted(rendez->Recv(key, args, &val, &val_dead)));
   }
   {  // Cleanup causes Abort().
     const int64 step_id = 321;
-    Rendezvous* rendez = rmgr_.Find(step_id);
+    RemoteRendezvous* rendez = rmgr_.Find(step_id);
     core::ScopedUnref unref(rendez);
     SchedClosure([this, step_id]() {
       env.env->SleepForMicroseconds(100 * 1000);
@@ -129,6 +131,7 @@ TEST_F(RpcRendezvousMgrTest, LocalAbort) {
     Tensor val(DT_STRING);
     bool val_dead = false;
     Rendezvous::Args args;
+    TF_ASSERT_OK(rendez->Initialize(&worker_session_));
     EXPECT_TRUE(errors::IsAborted(rendez->Recv(key, args, &val, &val_dead)));
   }
 }
@@ -139,7 +142,8 @@ TEST_F(RpcRendezvousMgrTest, CleanupAll) {
       "/job:mnist/replica:1/task:2/cpu:1", "foo", FrameAndIter(0, 0)));
   {
     const int64 step_id = 123;
-    Rendezvous* rendez = rmgr_.Find(step_id);
+    RemoteRendezvous* rendez = rmgr_.Find(step_id);
+    TF_ASSERT_OK(rendez->Initialize(&worker_session_));
     core::ScopedUnref unref(rendez);
     Rendezvous::Args args;
     TF_ASSERT_OK(rendez->Send(key, args, V("peach"), false));
@@ -168,10 +172,11 @@ TEST_F(RpcRendezvousMgrTest, TransferDummyDeviceContext) {
       "/job:mnist/replica:1/task:2/cpu:0", 7890,
       "/job:mnist/replica:1/task:2/cpu:1", "foo", FrameAndIter(0, 0)));
   {
-    Rendezvous* rendez = rmgr_.Find(step_id);
+    RemoteRendezvous* rendez = rmgr_.Find(step_id);
     core::ScopedUnref unref(rendez);
     Rendezvous::Args args;
     args.device_context = dc;
+    TF_ASSERT_OK(rendez->Initialize(&worker_session_));
     TF_ASSERT_OK(rendez->Send(key, args, V("peach"), false));
   }
   {
diff --git a/tensorflow/core/distributed_runtime/session_mgr.cc b/tensorflow/core/distributed_runtime/session_mgr.cc
index e2be62f816c35a97c669e85310a1845202bf7e7c..22551d54821b0ef34f4e535ee5923d6d695cfdc1 100644
--- a/tensorflow/core/distributed_runtime/session_mgr.cc
+++ b/tensorflow/core/distributed_runtime/session_mgr.cc
@@ -17,8 +17,9 @@ limitations under the License.
 
 #include <utility>
 
+#include "tensorflow/core/common_runtime/device_mgr.h"
+#include "tensorflow/core/common_runtime/renamed_device.h"
 #include "tensorflow/core/distributed_runtime/graph_mgr.h"
-#include "tensorflow/core/distributed_runtime/rpc/rpc_rendezvous_mgr.h"
 #include "tensorflow/core/lib/strings/strcat.h"
 
 namespace tensorflow {
@@ -26,23 +27,12 @@ namespace tensorflow {
 SessionMgr::SessionMgr(
     WorkerEnv* worker_env, const string& default_worker_name,
     std::unique_ptr<WorkerCacheInterface> default_worker_cache,
-    std::unique_ptr<RendezvousMgrInterface> default_rendezvous_mgr,
-    WorkerCacheFactory worker_cache_factory)
-    : SessionMgr(
-          worker_env, default_worker_name, std::move(default_worker_cache),
-          default_rendezvous_mgr.release(), std::move(worker_cache_factory)) {}
-
-SessionMgr::SessionMgr(
-    WorkerEnv* worker_env, const string& default_worker_name,
-    std::unique_ptr<WorkerCacheInterface> default_worker_cache,
-    RendezvousMgrInterface* default_rendezvous_mgr,
     WorkerCacheFactory worker_cache_factory)
     : worker_env_(worker_env),
-      legacy_session_(
-          default_worker_name, std::move(default_worker_cache),
-          std::unique_ptr<RendezvousMgrInterface>(default_rendezvous_mgr),
-          std::unique_ptr<GraphMgr>(
-              new GraphMgr(worker_env, default_rendezvous_mgr))),
+      legacy_session_(default_worker_name, std::move(default_worker_cache),
+                      std::unique_ptr<DeviceMgr>(worker_env->device_mgr),
+                      std::unique_ptr<GraphMgr>(
+                          new GraphMgr(worker_env, worker_env->device_mgr))),
       worker_cache_factory_(std::move(worker_cache_factory)) {}
 
 string SessionMgr::WorkerNameFromServerDef(const ServerDef& server_def) {
@@ -53,20 +43,28 @@ string SessionMgr::WorkerNameFromServerDef(const ServerDef& server_def) {
 Status SessionMgr::CreateSession(const string& session,
                                  const ServerDef& server_def) {
   mutex_lock l(mu_);
+  if (session.empty()) {
+    return errors::InvalidArgument("Session must be non-empty.");
+  }
+
   const string worker_name = WorkerNameFromServerDef(server_def);
 
   WorkerCacheInterface* worker_cache = nullptr;
   TF_RETURN_IF_ERROR(worker_cache_factory_(server_def, &worker_cache));
 
-  std::unique_ptr<RendezvousMgrInterface> rendezvous_mgr(
-      new RpcRendezvousMgr(worker_env_, worker_name, worker_cache));
+  std::vector<Device*> renamed_devices;
+  for (Device* d : worker_env_->local_devices) {
+    renamed_devices.push_back(
+        RenamedDevice::NewRenamedDevice(worker_name, d, false));
+  }
+  std::unique_ptr<DeviceMgr> device_mgr(new DeviceMgr(renamed_devices));
 
   std::unique_ptr<GraphMgr> graph_mgr(
-      new GraphMgr(worker_env_, rendezvous_mgr.get()));
+      new GraphMgr(worker_env_, device_mgr.get()));
 
   std::unique_ptr<WorkerSession> worker_session(new WorkerSession(
       worker_name, std::unique_ptr<WorkerCacheInterface>(worker_cache),
-      std::move(rendezvous_mgr), std::move(graph_mgr)));
+      std::move(device_mgr), std::move(graph_mgr)));
 
   sessions_.insert(std::make_pair(session, std::move(worker_session)));
   return Status::OK();
@@ -78,22 +76,6 @@ Status SessionMgr::DeleteSession(const string& session) {
   if (it != sessions_.end()) {
     sessions_.erase(it);
   }
-  std::set<string> graph_handles;
-  for (auto graph_handle_it = sessions_by_graph_handle_.begin();
-       graph_handle_it != sessions_by_graph_handle_.end(); ++graph_handle_it) {
-    if (graph_handle_it->second == session) {
-      graph_handles.insert(graph_handle_it->first);
-      graph_handle_it = sessions_by_graph_handle_.erase(graph_handle_it);
-      if (graph_handle_it == sessions_by_graph_handle_.end()) break;
-    }
-  }
-  for (auto step_id_it = graphs_by_step_id_.begin();
-       step_id_it != graphs_by_step_id_.end(); ++step_id_it) {
-    if (graph_handles.find(step_id_it->second) != graph_handles.end()) {
-      step_id_it = graphs_by_step_id_.erase(step_id_it);
-      if (step_id_it == graphs_by_step_id_.end()) break;
-    }
-  }
   return Status::OK();
 }
 
@@ -114,58 +96,4 @@ WorkerSession* SessionMgr::WorkerSessionForSession(const string& session) {
 
 WorkerSession* SessionMgr::LegacySession() { return &legacy_session_; }
 
-WorkerSession* SessionMgr::WorkerSessionForGraphHandleUnlocked(
-    const string& graph_handle) {
-  auto it = sessions_by_graph_handle_.find(graph_handle);
-  if (it == sessions_by_graph_handle_.end()) {
-    return &legacy_session_;
-  } else {
-    return WorkerSessionForSessionUnlocked(it->second);
-  }
-}
-
-WorkerSession* SessionMgr::WorkerSessionForGraphHandle(
-    const string& graph_handle) {
-  mutex_lock l(mu_);
-  return WorkerSessionForGraphHandleUnlocked(graph_handle);
-}
-
-WorkerSession* SessionMgr::WorkerSessionForStepId(const int64 step_id) {
-  mutex_lock l(mu_);
-  auto it = graphs_by_step_id_.find(step_id);
-  if (it == graphs_by_step_id_.end()) {
-    return &legacy_session_;
-  } else {
-    return WorkerSessionForGraphHandleUnlocked(it->second);
-  }
-}
-
-void SessionMgr::AssociateGraphWithSession(const string& session,
-                                           const string& graph_handle) {
-  mutex_lock l(mu_);
-  sessions_by_graph_handle_[graph_handle] = session;
-}
-
-void SessionMgr::DisassociateGraphFromSession(const string& graph_handle) {
-  mutex_lock l(mu_);
-  auto it = sessions_by_graph_handle_.find(graph_handle);
-  if (it != sessions_by_graph_handle_.end()) {
-    sessions_by_graph_handle_.erase(it);
-  }
-}
-
-void SessionMgr::AssociateStepIdWithGraph(const string& graph_handle,
-                                          const int64 step_id) {
-  mutex_lock l(mu_);
-  graphs_by_step_id_[step_id] = graph_handle;
-}
-
-void SessionMgr::DisassociateStepIdFromGraph(const int64 step_id) {
-  mutex_lock l(mu_);
-  auto it = graphs_by_step_id_.find(step_id);
-  if (it != graphs_by_step_id_.end()) {
-    graphs_by_step_id_.erase(it);
-  }
-}
-
 }  // namespace tensorflow
diff --git a/tensorflow/core/distributed_runtime/session_mgr.h b/tensorflow/core/distributed_runtime/session_mgr.h
index 455b5c8d9d9e07f1743dc3f7695e3e70cbc078f0..c44bca7b7a407957b1a36d7659f2b35ea0b30d07 100644
--- a/tensorflow/core/distributed_runtime/session_mgr.h
+++ b/tensorflow/core/distributed_runtime/session_mgr.h
@@ -30,6 +30,8 @@ struct WorkerEnv;
 
 // SessionMgr keeps track of information related to a given session.
 //
+// SessionMgr runs on the workers.
+//
 // SessionMgr is threadsafe.
 class SessionMgr {
  public:
@@ -39,7 +41,6 @@ class SessionMgr {
   explicit SessionMgr(
       WorkerEnv* worker_env, const string& default_worker_name,
       std::unique_ptr<WorkerCacheInterface> default_worker_cache,
-      std::unique_ptr<RendezvousMgrInterface> default_rendezvous_mgr,
       WorkerCacheFactory worker_cache_factory);
   ~SessionMgr() {}
 
@@ -50,49 +51,36 @@ class SessionMgr {
   WorkerSession* WorkerSessionForSession(const string& session);
   WorkerSession* LegacySession();
 
-  // Locates the worker session for a given graph handle
-  WorkerSession* WorkerSessionForGraphHandle(const string& graph_handle);
-  void AssociateGraphWithSession(const string& session,
-                                 const string& graph_handle);
-  void DisassociateGraphFromSession(const string& graph_handle);
-
-  // Locates a worker session for a given step id
-  WorkerSession* WorkerSessionForStepId(const int64 step_id);
-  void AssociateStepIdWithGraph(const string& graph_handle,
-                                const int64 step_id);
-  void DisassociateStepIdFromGraph(const int64 step_id);
-
   Status DeleteSession(const string& session);
 
   static string WorkerNameFromServerDef(const ServerDef& server_def);
 
  private:
-  // Private constructor to work around std::unique_ptr ownership issues.
-  explicit SessionMgr(
-      WorkerEnv* worker_env, const string& default_worker_name,
-      std::unique_ptr<WorkerCacheInterface> default_worker_cache,
-      RendezvousMgrInterface* default_rendezvous_mgr,
-      WorkerCacheFactory worker_cache_factory);
-
   const WorkerEnv* const worker_env_;  // Not owned.
+
+  // A note about destruction:
+  // We must delete graph_mgr before device_mgr, due to shared
+  // ownership of OpKernels in the executors. (The graph_mgr will
+  // free all stateless OpKernels, and pass over borrowed stateful
+  // OpKernels, which are also held in their respective devices'
+  // OpSegments.)
+  //
+  // legacy_session_ owns the worker_env_.device_mgr, and so we must ensure
+  // that sessions_'s WorkerSessions are deleted (which do not own the
+  // underlying devices, but instead own RenamedDevices) before
+  // legacy_session_ is deleted. Further, we must ensure that WorkerSession's
+  // device_mgr is deleted after WorkerSession's graph_mgr.
+
   WorkerSession legacy_session_;
 
   const WorkerCacheFactory worker_cache_factory_;
 
   WorkerSession* WorkerSessionForSessionUnlocked(const string& session)
       EXCLUSIVE_LOCKS_REQUIRED(mu_);
-  WorkerSession* WorkerSessionForGraphHandleUnlocked(const string& graph_handle)
-      EXCLUSIVE_LOCKS_REQUIRED(mu_);
 
   mutex mu_;
   // A map from session identifier to internal session structure.
   std::map<string, std::unique_ptr<WorkerSession>> sessions_ GUARDED_BY(mu_);
-
-  // A map from graph handles to the session that they belong to.
-  std::map<string, string> sessions_by_graph_handle_ GUARDED_BY(mu_);
-
-  // A map from globally-unique step id's to the corresponding graph handles.
-  std::map<int64, string> graphs_by_step_id_ GUARDED_BY(mu_);
 };
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/distributed_runtime/session_mgr_test.cc b/tensorflow/core/distributed_runtime/session_mgr_test.cc
index d3f3fa83958750f57ccfd61aef5b2f516c582b1d..7132f123a5943d0680743f3cc3bc17470f49d65d 100644
--- a/tensorflow/core/distributed_runtime/session_mgr_test.cc
+++ b/tensorflow/core/distributed_runtime/session_mgr_test.cc
@@ -27,8 +27,6 @@ class SessionMgrTest : public ::testing::Test {
   SessionMgrTest()
       : mgr_(&env_, "/job:mnist/replica:0/task:0",
              std::unique_ptr<WorkerCacheInterface>(),
-             std::unique_ptr<RendezvousMgrInterface>(new RpcRendezvousMgr(
-                 &env_, "/job:mnist/replica:0/task:0", nullptr)),
              factory_),
         legacy_session_(mgr_.WorkerSessionForSession("novel_session_id")) {}
 
@@ -48,90 +46,19 @@ TEST_F(SessionMgrTest, CreateSessionSimple) {
   TF_EXPECT_OK(mgr_.CreateSession(session_handle, server_def));
   WorkerSession* session = mgr_.WorkerSessionForSession(session_handle);
   EXPECT_NE(nullptr, session) << "Session for " << session_handle << "was null";
-
-  TF_EXPECT_OK(mgr_.DeleteSession(session_handle));
-}
-
-TEST_F(SessionMgrTest, AssociateGraphWithSession) {
-  ServerDef server_def;
-  string session_handle = "test_session_handle";
-  TF_EXPECT_OK(mgr_.CreateSession(session_handle, server_def));
-  WorkerSession* session = mgr_.WorkerSessionForSession(session_handle);
-  ASSERT_NE(nullptr, session) << "Session for " << session_handle << "was null";
-
-  string graph_handle = "test_graph_handle";
-  mgr_.AssociateGraphWithSession(session_handle, graph_handle);
-  WorkerSession* graph_session = mgr_.WorkerSessionForGraphHandle(graph_handle);
-  ASSERT_EQ(session, graph_session);
-
+  EXPECT_NE(mgr_.LegacySession(), session);
   TF_EXPECT_OK(mgr_.DeleteSession(session_handle));
 }
 
-TEST_F(SessionMgrTest, AssociateStepWithGraph) {
+TEST_F(SessionMgrTest, LegacySession) {
   ServerDef server_def;
-  string session_handle = "test_session_handle";
-  TF_EXPECT_OK(mgr_.CreateSession(session_handle, server_def));
+  string session_handle = "";
   WorkerSession* session = mgr_.WorkerSessionForSession(session_handle);
-  ASSERT_NE(nullptr, session) << "Session for " << session_handle << "was null";
-
-  string graph_handle = "test_graph_handle";
-  mgr_.AssociateGraphWithSession(session_handle, graph_handle);
-  WorkerSession* graph_session = mgr_.WorkerSessionForGraphHandle(graph_handle);
-  ASSERT_EQ(session, graph_session);
-
-  int64 step_id = 1234567890L;
-  mgr_.AssociateStepIdWithGraph(graph_handle, step_id);
-  WorkerSession* step_session = mgr_.WorkerSessionForStepId(step_id);
-  ASSERT_EQ(session, step_session);
-  ASSERT_EQ(graph_session, step_session);
+  EXPECT_EQ(mgr_.LegacySession(), session);
 
   TF_EXPECT_OK(mgr_.DeleteSession(session_handle));
 }
 
-TEST_F(SessionMgrTest, AssociateGraphWithSession_MissingSession) {
-  string session_handle = "test_session_handle";
-  string graph_handle = "test_graph_handle";
-  mgr_.AssociateGraphWithSession(session_handle, graph_handle);
-  WorkerSession* graph_session = mgr_.WorkerSessionForGraphHandle(graph_handle);
-  ASSERT_EQ(legacy_session_, graph_session);
-}
-
-TEST_F(SessionMgrTest, AssociateStepWithGraph_MissingGraph) {
-  ServerDef server_def;
-  string session_handle = "test_session_handle";
-  TF_EXPECT_OK(mgr_.CreateSession(session_handle, server_def));
-  WorkerSession* session = mgr_.WorkerSessionForSession(session_handle);
-  ASSERT_NE(nullptr, session) << "Session for " << session_handle << "was null";
-
-  string graph_handle = "test_graph_handle";
-  int64 step_id = 1234567890L;
-  mgr_.AssociateStepIdWithGraph(graph_handle, step_id);
-  WorkerSession* step_session = mgr_.WorkerSessionForStepId(step_id);
-  ASSERT_EQ(legacy_session_, step_session);
-}
-
-TEST_F(SessionMgrTest, AssociateStepWithGraph_MissingSession) {
-  string session_handle = "test_session_handle";
-  string graph_handle = "test_graph_handle";
-  mgr_.AssociateGraphWithSession(session_handle, graph_handle);
-  WorkerSession* graph_session = mgr_.WorkerSessionForGraphHandle(graph_handle);
-  ASSERT_EQ(legacy_session_, graph_session);
-
-  int64 step_id = 1234567890L;
-  mgr_.AssociateStepIdWithGraph(graph_handle, step_id);
-  WorkerSession* step_session = mgr_.WorkerSessionForStepId(step_id);
-  ASSERT_EQ(legacy_session_, step_session);
-}
-
-TEST_F(SessionMgrTest, AssociateStepWithGraph_MissingSessionAndGraph) {
-  string session_handle = "test_session_handle";
-  string graph_handle = "test_graph_handle";
-  int64 step_id = 1234567890L;
-  mgr_.AssociateStepIdWithGraph(graph_handle, step_id);
-  WorkerSession* step_session = mgr_.WorkerSessionForStepId(step_id);
-  ASSERT_EQ(legacy_session_, step_session);
-}
-
 TEST_F(SessionMgrTest, WorkerNameFromServerDef) {
   ServerDef server_def;
   server_def.set_job_name("worker");
diff --git a/tensorflow/core/distributed_runtime/worker.cc b/tensorflow/core/distributed_runtime/worker.cc
index 1aced4443f8058f7376569670af81a5515606c65..07bb17981d32a0a246c7a8c4c7017b966e47df53 100644
--- a/tensorflow/core/distributed_runtime/worker.cc
+++ b/tensorflow/core/distributed_runtime/worker.cc
@@ -55,11 +55,7 @@ void Worker::RegisterGraphAsync(const RegisterGraphRequest* request,
       env_->session_mgr->WorkerSessionForSession(request->session_handle());
   Status s = session->graph_mgr->Register(
       request->session_handle(), request->graph_def(), request->graph_options(),
-      response->mutable_graph_handle());
-  if (s.ok()) {
-    env_->session_mgr->AssociateGraphWithSession(request->session_handle(),
-                                                 response->graph_handle());
-  }
+      request->debug_options(), response->mutable_graph_handle());
   done(s);
 }
 
@@ -67,9 +63,8 @@ void Worker::DeregisterGraphAsync(const DeregisterGraphRequest* request,
                                   DeregisterGraphResponse* response,
                                   StatusCallback done) {
   WorkerSession* session =
-      env_->session_mgr->WorkerSessionForGraphHandle(request->graph_handle());
+      env_->session_mgr->WorkerSessionForSession(request->session_handle());
   Status s = session->graph_mgr->Deregister(request->graph_handle());
-  env_->session_mgr->DisassociateGraphFromSession(request->graph_handle());
 
   done(s);
 }
@@ -141,8 +136,7 @@ void Worker::SetOrCallFinalCallback(const string& graph_handle, int step_id,
 }
 
 void Worker::AbortStep(int64 step_id) {
-  WorkerSession* session = env_->session_mgr->WorkerSessionForStepId(step_id);
-  Rendezvous* rendez = session->rendezvous_mgr->Find(step_id);
+  Rendezvous* rendez = env_->rendezvous_mgr->Find(step_id);
   SchedNonBlockingClosureAfter(1000000, [rendez, step_id]() {
     // Delay a bit before aborting the step. This way, the root
     // cause may return first back to the client instead of this
@@ -193,8 +187,7 @@ void Worker::DoRunGraph(CallOptions* opts, RunGraphRequestWrapper* request,
   const int64 step_id = request->step_id();
   TRACEPRINTF("RunGraph: %lld", step_id);
   WorkerSession* session =
-      env_->session_mgr->WorkerSessionForGraphHandle(request->graph_handle());
-  env_->session_mgr->AssociateStepIdWithGraph(request->graph_handle(), step_id);
+      env_->session_mgr->WorkerSessionForSession(request->session_handle());
   GraphMgr::NamedTensors in;
   GraphMgr::NamedTensors* out = new GraphMgr::NamedTensors;
   Status s = PrepareRunGraph(request, &in, out);
@@ -231,8 +224,8 @@ void Worker::DoRunGraph(CallOptions* opts, RunGraphRequestWrapper* request,
   }
   CostGraphDef* cost_graph = response->mutable_cost_graph();
   session->graph_mgr->ExecuteAsync(
-      request->graph_handle(), step_id, request->exec_opts(), collector,
-      cost_graph, cm, in,
+      request->graph_handle(), step_id, session, request->exec_opts(),
+      collector, cost_graph, cm, in,
       [this, step_id, response, session, cm, out, token, collector, opts,
        done](Status s) {
         if (s.ok()) {
@@ -267,8 +260,8 @@ void Worker::DoPartialRunGraph(CallOptions* opts,
   const string& graph_handle = request->graph_handle();
   TRACEPRINTF("PartialRunGraph: %lld", step_id);
   WorkerSession* session =
-      env_->session_mgr->WorkerSessionForGraphHandle(graph_handle);
-  env_->session_mgr->AssociateStepIdWithGraph(graph_handle, step_id);
+      env_->session_mgr->WorkerSessionForSession(request->session_handle());
+
   GraphMgr::NamedTensors in;
   GraphMgr::NamedTensors* out = new GraphMgr::NamedTensors;
   Status s = PrepareRunGraph(request, &in, out);
@@ -315,8 +308,8 @@ void Worker::DoPartialRunGraph(CallOptions* opts,
                                               [cm]() { cm->StartCancel(); });
     }
     session->graph_mgr->ExecuteAsync(
-        graph_handle, step_id, request->exec_opts(), nullptr /* collector */,
-        nullptr /* cost_graph */, cm, in,
+        graph_handle, step_id, session, request->exec_opts(),
+        nullptr /* collector */, nullptr /* cost_graph */, cm, in,
         [this, token, graph_handle, step_id, cm](Status s) {
           {
             mutex_lock l(mu_);
@@ -365,8 +358,7 @@ void Worker::CleanupGraphAsync(const CleanupGraphRequest* request,
                                CleanupGraphResponse* response,
                                StatusCallback done) {
   const int64 step_id = request->step_id();
-  WorkerSession* session = env_->session_mgr->WorkerSessionForStepId(step_id);
-  session->rendezvous_mgr->Cleanup(step_id);
+  env_->rendezvous_mgr->Cleanup(step_id);
   done(Status::OK());
 }
 
@@ -394,8 +386,8 @@ void Worker::TracingAsync(const TracingRequest* request,
 Status Worker::PrepareRecvTensor(const Rendezvous::ParsedKey& parsed,
                                  Device** src_dev) {
   // Figures out which device the tensor is hosted on.
-  TF_RETURN_IF_ERROR(
-      env_->device_mgr->LookupDevice(parsed.src_device, src_dev));
+  string local_name = DeviceNameUtils::LocalName(parsed.src_device);
+  TF_RETURN_IF_ERROR(env_->device_mgr->LookupDevice(local_name, src_dev));
 
   // Does the device have the right incarnation number we expect?
   if ((*src_dev)->attributes().incarnation() != parsed.src_incarnation) {
diff --git a/tensorflow/core/distributed_runtime/worker_env.h b/tensorflow/core/distributed_runtime/worker_env.h
index 24fb5948a710df68e45a253dc9d614de43b4a889..f09bea328fd99426d07a853791df46cf579d93fd 100644
--- a/tensorflow/core/distributed_runtime/worker_env.h
+++ b/tensorflow/core/distributed_runtime/worker_env.h
@@ -16,6 +16,7 @@ limitations under the License.
 #ifndef TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_WORKER_ENV_H_
 #define TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_WORKER_ENV_H_
 
+#include <vector>
 #include "tensorflow/core/platform/types.h"
 
 namespace tensorflow {
@@ -24,8 +25,10 @@ namespace thread {
 class ThreadPool;
 }  // namespace thread
 
+class Device;
 class DeviceMgr;
 class Env;
+class RendezvousMgrInterface;
 class SessionMgr;
 
 // The worker environment class, which holds a bag of pointers to
@@ -38,10 +41,18 @@ struct WorkerEnv {
   // session_mgr encapsulates state for each session.
   SessionMgr* session_mgr = nullptr;
 
+  // The local devices of this worker. Devices are owned by the device_mgr.
+  //
+  // REQUIRES: !local_devices.empty().
+  std::vector<Device*> local_devices;
+
   // device_mgr manages local devices (cpu and gpu). The WorkerService
   // is the network interface for managed devices.
   DeviceMgr* device_mgr = nullptr;
 
+  // A set of rendezvous keyed by step ids.
+  RendezvousMgrInterface* rendezvous_mgr = nullptr;
+
   // A pool of threads for scheduling compute work.
   thread::ThreadPool* compute_pool = nullptr;
 };
diff --git a/tensorflow/core/distributed_runtime/worker_interface.h b/tensorflow/core/distributed_runtime/worker_interface.h
index 508bc7f46803d61c1e915b5e0167f773403f92fb..c9db28ec67f86d469c16427aa9343a2a1d36c0e7 100644
--- a/tensorflow/core/distributed_runtime/worker_interface.h
+++ b/tensorflow/core/distributed_runtime/worker_interface.h
@@ -113,6 +113,11 @@ class WorkerInterface {
     return CallAndWait(&ME::GetStatusAsync, request, response);
   }
 
+  Status CreateWorkerSession(const CreateWorkerSessionRequest* request,
+                             CreateWorkerSessionResponse* response) {
+    return CallAndWait(&ME::CreateWorkerSessionAsync, request, response);
+  }
+
   Status RegisterGraph(const RegisterGraphRequest* request,
                        RegisterGraphResponse* response) {
     return CallAndWait(&ME::RegisterGraphAsync, request, response);
diff --git a/tensorflow/core/distributed_runtime/worker_session.cc b/tensorflow/core/distributed_runtime/worker_session.cc
index 8298e169595f0f4b4c89641c661c3a7882d97616..8691450e9bc42fe6ddae30e74c2b81ed85cab273 100644
--- a/tensorflow/core/distributed_runtime/worker_session.cc
+++ b/tensorflow/core/distributed_runtime/worker_session.cc
@@ -17,14 +17,84 @@ limitations under the License.
 
 namespace tensorflow {
 
-WorkerSession::WorkerSession(
-    const string& worker_name,
-    std::unique_ptr<WorkerCacheInterface> worker_cache,
-    std::unique_ptr<RendezvousMgrInterface> rendezvous_mgr,
-    std::unique_ptr<GraphMgr> graph_mgr)
+namespace {
+
+// A private cache that wraps worker_cache and allows reuse of
+// WorkerInterface objects.
+class WorkerFreeListCache : public WorkerCacheInterface {
+ public:
+  explicit WorkerFreeListCache(std::unique_ptr<WorkerCacheInterface> w)
+      : wrapped_(std::move(w)) {}
+
+  ~WorkerFreeListCache() final {
+    for (auto p : workers_) {
+      wrapped_->ReleaseWorker(p.first, p.second.worker);
+    }
+  }
+
+  void ListWorkers(std::vector<string>* workers) const override {
+    wrapped_->ListWorkers(workers);
+  }
+
+  WorkerInterface* CreateWorker(const string& target) override {
+    mutex_lock l(mu_);
+    auto p = workers_.find(target);
+    if (p != workers_.end()) {
+      return p->second.worker;
+    }
+    WorkerState state;
+    state.worker = wrapped_->CreateWorker(target);
+    if (state.worker != nullptr) {
+      workers_.insert(std::make_pair(target, state));
+    }
+    return state.worker;
+  }
+
+  void ReleaseWorker(const string& target, WorkerInterface* worker) override {
+    // TODO(jeff,sanjay): Should decrement ref-count when we implement eviction.
+  }
+
+  bool GetDeviceLocalityNonBlocking(const string& device,
+                                    DeviceLocality* locality) override {
+    return wrapped_->GetDeviceLocalityNonBlocking(device, locality);
+  }
+
+  void GetDeviceLocalityAsync(const string& device, DeviceLocality* locality,
+                              StatusCallback done) override {
+    wrapped_->GetDeviceLocalityAsync(device, locality, done);
+  }
+
+  void SetLogging(bool active) override { wrapped_->SetLogging(active); }
+
+  void ClearLogs() override { wrapped_->ClearLogs(); }
+
+  bool RetrieveLogs(int64 step_id, StepStats* ss) override {
+    return wrapped_->RetrieveLogs(step_id, ss);
+  }
+
+ private:
+  std::unique_ptr<WorkerCacheInterface> wrapped_;
+
+  // Information kept per created WorkerInterface.
+  struct WorkerState {
+    WorkerInterface* worker;
+    // TODO(jeff,sanjay): Add reference count if we support eviction.
+  };
+
+  // TODO(jeff,sanjay): Eviction when the map becomes too big.
+  mutex mu_;
+  std::unordered_map<string, WorkerState> workers_ GUARDED_BY(mu_);
+};
+
+}  // namespace
+
+WorkerSession::WorkerSession(const string& worker_name,
+                             std::unique_ptr<WorkerCacheInterface> worker_cache,
+                             std::unique_ptr<DeviceMgr> device_mgr,
+                             std::unique_ptr<GraphMgr> graph_mgr)
     : worker_name(worker_name),
-      worker_cache(std::move(worker_cache)),
-      rendezvous_mgr(std::move(rendezvous_mgr)),
+      worker_cache(new WorkerFreeListCache(std::move(worker_cache))),
+      device_mgr(std::move(device_mgr)),
       graph_mgr(std::move(graph_mgr)) {}
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/distributed_runtime/worker_session.h b/tensorflow/core/distributed_runtime/worker_session.h
index e6ebe88329822569b866a38dc2c79fa11aac105a..77cf4de8f7455f1f5b9553890922e2c310018b6b 100644
--- a/tensorflow/core/distributed_runtime/worker_session.h
+++ b/tensorflow/core/distributed_runtime/worker_session.h
@@ -18,14 +18,13 @@ limitations under the License.
 
 #include <string>
 
+#include "tensorflow/core/common_runtime/device_mgr.h"
 #include "tensorflow/core/distributed_runtime/graph_mgr.h"
-#include "tensorflow/core/distributed_runtime/rendezvous_mgr_interface.h"
 #include "tensorflow/core/distributed_runtime/worker_cache.h"
 
 namespace tensorflow {
 
 class GraphMgr;
-class RendezvousMgrInterface;
 class WorkerCacheInterface;
 
 // WorkerSession encapsulates all of the state relating to a given session.
@@ -36,17 +35,20 @@ struct WorkerSession {
   // Object from which WorkerInterface instances can be obtained.
   const std::unique_ptr<WorkerCacheInterface> worker_cache;
 
-  // A set of rendezvous keyed by step ids.
-  const std::unique_ptr<RendezvousMgrInterface> rendezvous_mgr;
+  // Collection of local devices. These devices are typically RenamedDevices
+  // in all except the SessionMgr.legacy_session_. legacy_session_.device_mgr
+  // == worker_env_.device_mgr, which holds the true devices.
+  const std::unique_ptr<DeviceMgr> device_mgr;
 
   // graph_mgr keeps track of the registered graphs of this session.
   //
   // Note: graph_mgr must be deleted before rendezvous_mgr!
+  // Note: graph_mgr must be deleted before device_mgr!
   const std::unique_ptr<GraphMgr> graph_mgr;
 
   WorkerSession(const string& worker_name,
                 std::unique_ptr<WorkerCacheInterface> worker_cache,
-                std::unique_ptr<RendezvousMgrInterface> rendezvous_mgr,
+                std::unique_ptr<DeviceMgr> device_mgr,
                 std::unique_ptr<GraphMgr> graph_mgr);
 };
 
diff --git a/tensorflow/core/framework/attr_value_util.cc b/tensorflow/core/framework/attr_value_util.cc
index 452cfdda9e69ea9464e0f0db2af50470b600d4eb..3573cc6ec21d3b9503eff2616efdf884215cca05 100644
--- a/tensorflow/core/framework/attr_value_util.cc
+++ b/tensorflow/core/framework/attr_value_util.cc
@@ -400,16 +400,33 @@ void SetAttrValue(gtl::ArraySlice<NameAttrList> value, AttrValue* out) {
   }
 }
 
+// Wrapper around protocol buffer serialization that requests deterministic
+// serialization, in particular for Map fields, which serialize in a random
+// order by default. Returns true on success.
+template <typename T>
+static bool DeterministicSerialization(const T& t, string* result) {
+  const int size = t.ByteSize();
+  *result = string(size, '\0');
+  ::tensorflow::protobuf::io::ArrayOutputStream array_stream(&(*result)[0],
+                                                             size);
+  ::tensorflow::protobuf::io::CodedOutputStream output_stream(&array_stream);
+  output_stream.SetSerializationDeterministic(true);
+  t.SerializeWithCachedSizes(&output_stream);
+  return !output_stream.HadError() && size == output_stream.ByteCount();
+}
+
 bool AreAttrValuesEqual(const AttrValue& a, const AttrValue& b) {
   string a_str, b_str;
-  a.SerializeToString(&a_str);
-  b.SerializeToString(&b_str);
+  DeterministicSerialization(a, &a_str);
+  DeterministicSerialization(b, &b_str);
   // Note: it should be safe to compare proto serializations of the attr
   // values since at most one field should be set in each (indeed, it
   // must be the same field if they are to compare equal).
   // Exception: there are multiple equivalent representations of
   // TensorProtos.  So a return value of true implies a == b, but not the
   // converse.
+  // TODO(phawkins): this is incorrect for NameAttrList attributes that may
+  // contain nested AttrValue maps.
   return a_str == b_str;
 }
 
diff --git a/tensorflow/core/framework/device_base.h b/tensorflow/core/framework/device_base.h
index 8894671fdf3a22098cb2e6eca23d1adeb38a5f18..27fe28fe60a9bd020f9db16c49506741336c9863 100644
--- a/tensorflow/core/framework/device_base.h
+++ b/tensorflow/core/framework/device_base.h
@@ -115,7 +115,7 @@ class DeviceBase {
     cpu_worker_threads_ = t;
   }
 
-  const CpuWorkerThreads* tensorflow_cpu_worker_threads() const {
+  virtual const CpuWorkerThreads* tensorflow_cpu_worker_threads() const {
     CHECK(cpu_worker_threads_ != nullptr);
     return cpu_worker_threads_;
   }
@@ -140,7 +140,7 @@ class DeviceBase {
     gpu_device_info_ = g;
   }
 
-  const GpuDeviceInfo* tensorflow_gpu_device_info() const {
+  virtual const GpuDeviceInfo* tensorflow_gpu_device_info() const {
     return gpu_device_info_;
   }
 
@@ -170,13 +170,13 @@ class DeviceBase {
     return GetAllocator(attr);
   }
 
-  const Eigen::ThreadPoolDevice* eigen_cpu_device() {
+  virtual const Eigen::ThreadPoolDevice* eigen_cpu_device() {
     CHECK(eigen_cpu_device_ != nullptr);
     return eigen_cpu_device_;
   }
 
 #ifdef TENSORFLOW_USE_SYCL
-  const Eigen::SyclDevice* eigen_sycl_device() const {
+  virtual const Eigen::SyclDevice* eigen_sycl_device() const {
     CHECK(eigen_sycl_device_ != nullptr);
     return eigen_sycl_device_;
   }
diff --git a/tensorflow/core/framework/function.cc b/tensorflow/core/framework/function.cc
index 8a7d96c38a91f2fe271fececb7ab289fe0450331..186095201d1efb4898595e15ec2145ee37fa9f07 100644
--- a/tensorflow/core/framework/function.cc
+++ b/tensorflow/core/framework/function.cc
@@ -16,11 +16,13 @@ limitations under the License.
 #include "tensorflow/core/framework/function.h"
 
 #include <unordered_map>
+#include <utility>
 #include <vector>
 
 #include "tensorflow/core/framework/function.pb_text.h"
 #include "tensorflow/core/framework/node_def_util.h"
 #include "tensorflow/core/framework/op.h"
+#include "tensorflow/core/graph/graph.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/gtl/inlined_vector.h"
 #include "tensorflow/core/lib/gtl/map_util.h"
@@ -43,12 +45,11 @@ namespace {
 // Otherwise (arg_def is a simple type T), *is_type_list is set to
 // false, and *dtypes is set to a single element vector, whose only
 // element is T.
-Status ArgNumType(const InstantiateAttrValueMap& attrs,
-                  const OpDef::ArgDef& arg_def, bool* is_type_list,
-                  DataTypeVector* dtypes) {
+Status ArgNumType(AttrSlice attrs, const OpDef::ArgDef& arg_def,
+                  bool* is_type_list, DataTypeVector* dtypes) {
   dtypes->clear();
   if (!arg_def.type_list_attr().empty()) {
-    const AttrValue* v = gtl::FindOrNull(attrs, arg_def.type_list_attr());
+    const AttrValue* v = attrs.Find(arg_def.type_list_attr());
     if (v == nullptr) {
       return errors::NotFound("type attr not found: ",
                               arg_def.type_list_attr());
@@ -63,7 +64,7 @@ Status ArgNumType(const InstantiateAttrValueMap& attrs,
   *is_type_list = false;
   int num = 1;
   if (!arg_def.number_attr().empty()) {
-    const AttrValue* v = gtl::FindOrNull(attrs, arg_def.number_attr());
+    const AttrValue* v = attrs.Find(arg_def.number_attr());
     if (v == nullptr) {
       return errors::NotFound("type attr not found: ", arg_def.type_attr());
     }
@@ -76,7 +77,7 @@ Status ArgNumType(const InstantiateAttrValueMap& attrs,
   } else if (arg_def.type_attr().empty()) {
     dtype = DT_INVALID;
   } else {
-    const AttrValue* v = gtl::FindOrNull(attrs, arg_def.type_attr());
+    const AttrValue* v = attrs.Find(arg_def.type_attr());
     if (v == nullptr) {
       return errors::NotFound("type attr not found: ", arg_def.type_attr());
     }
@@ -91,18 +92,17 @@ void AddAttr(const string& name, const T& val, NodeDef* ndef) {
   SetAttrValue(val, &((*ndef->mutable_attr())[name]));
 }
 
-Status ValidateSignatureWithAttrs(const OpDef& sig,
-                                  const InstantiateAttrValueMap& attr_values) {
+Status ValidateSignatureWithAttrs(const OpDef& sig, AttrSlice attr_values) {
   // attr_values should specify all attrs defined in fdef.
   for (const auto& a : sig.attr()) {
-    auto const iter = attr_values.find(a.name());
-    if (iter == attr_values.end()) {
+    const AttrValue* v = attr_values.Find(a.name());
+    if (!v) {
       return errors::NotFound("Attr ", a.name(), " is not found from ",
                               SummarizeOpDef(sig));
     }
-    Status status = AttrValueHasType(iter->second, a.type());
+    Status status = AttrValueHasType(*v, a.type());
     if (!status.ok()) {
-      errors::AppendToMessage(&status, "for attr '", iter->first, "'");
+      errors::AppendToMessage(&status, "for attr '", a.name(), "'");
       return status;
     }
   }
@@ -145,7 +145,7 @@ class FunctionInstantiationHelper {
 
   // Builds index for nodes that can be used as node's input arguments.
   Status BuildInputArgIndex(const OpDef::ArgDef& arg_def,
-                            const InstantiateAttrValueMap& attr_values) {
+                            AttrSlice attr_values) {
     bool is_type_list;
     DataTypeVector dtypes;
     TF_RETURN_IF_ERROR(
@@ -174,8 +174,7 @@ class FunctionInstantiationHelper {
     return Status::OK();
   }
 
-  Status BuildNodeOutputIndex(const NodeDef& node,
-                              const InstantiateAttrValueMap& attrs,
+  Status BuildNodeOutputIndex(const NodeDef& node, AttrSlice attrs,
                               const int arg_index) {
     const OpDef* node_sig = nullptr;
     TF_RETURN_IF_ERROR(get_function_(node.op(), &node_sig));
@@ -205,8 +204,7 @@ class FunctionInstantiationHelper {
     return Status::OK();
   }
 
-  Status InstantiateNode(const NodeDef& fnode,
-                         const InstantiateAttrValueMap& attrs) {
+  Status InstantiateNode(const NodeDef& fnode, AttrSlice attrs) {
     const OpDef* fnode_sig = nullptr;
     TF_CHECK_OK(get_function_(fnode.op(), &fnode_sig));
     NodeDef* gnode = AddNode(fnode.name());
@@ -294,7 +292,7 @@ class FunctionInstantiationHelper {
   }
 
   Status AddReturnNode(
-      const OpDef::ArgDef& ret_def, const InstantiateAttrValueMap& attrs,
+      const OpDef::ArgDef& ret_def, AttrSlice attrs,
       const ::tensorflow::protobuf::Map<string, string>& ret_map,
       int* ret_index) {
     auto ret_iter = ret_map.find(ret_def.name());
@@ -582,14 +580,14 @@ string Print(const GraphDef& gdef) {
   for (size_t i = 0; i < arg.size(); ++i) {
     const NodeDef* n = arg[i];
     if (i > 0) strings::StrAppend(&out, ", ");
-    CHECK_EQ(2, n->attr_size());
+    CHECK_GE(n->attr_size(), 2);
     strings::StrAppend(&out, n->name(), ":", get_type(*n));
   }
   strings::StrAppend(&out, ") -> (");
   for (size_t i = 0; i < ret.size(); ++i) {
     const NodeDef* n = ret[i];
     if (i > 0) strings::StrAppend(&out, ", ");
-    CHECK_EQ(2, n->attr_size());
+    CHECK_LE(2, n->attr_size());
     CHECK_EQ(1, n->input_size());
     strings::StrAppend(&out, n->input(0), ":", get_type(*n));
   }
@@ -601,8 +599,9 @@ string Print(const GraphDef& gdef) {
   return out;
 }
 
-Status AddDefaultAttrs(const string& op, GetFunctionSignature get_function,
-                       InstantiateAttrValueMap* attrs) {
+Status AddDefaultAttrs(const string& op,
+                       const GetFunctionSignature& get_function,
+                       AttrValueMap* attrs) {
   const OpDef* op_def = nullptr;
   TF_RETURN_IF_ERROR(get_function(op, &op_def));
   AttrSlice attr_slice(attrs);
@@ -618,8 +617,7 @@ Status AddDefaultAttrs(const string& op, GetFunctionSignature get_function,
 
 }  // end namespace
 
-Status InstantiateFunction(const FunctionDef& fdef,
-                           const InstantiateAttrValueMap& attr_values,
+Status InstantiateFunction(const FunctionDef& fdef, AttrSlice attr_values,
                            GetFunctionSignature get_function,
                            InstantiationResult* result) {
   VLOG(3) << "Instantiation Function: " << Print(fdef);
@@ -637,19 +635,17 @@ Status InstantiateFunction(const FunctionDef& fdef,
     }
   }
 
-  auto substitute = [&attr_values](const string& name, AttrValue* val) {
-    auto iter = attr_values.find(name);
-    if (iter == attr_values.end()) {
-      return false;
-    } else {
-      *val = iter->second;
+  auto substitute = [attr_values](StringPiece name, AttrValue* val) {
+    if (const AttrValue* v = attr_values.Find(name)) {
+      *val = *v;
       return true;
     }
+    return false;
   };
 
   // Makes a copy of all attrs in fdef and substitutes placeholders.
   // After this step, every attr is bound to a concrete value.
-  std::vector<InstantiateAttrValueMap> node_attrs;
+  std::vector<AttrValueMap> node_attrs;
   node_attrs.resize(fdef.node_def_size());
   for (int i = 0; i < fdef.node_def_size(); ++i) {
     for (auto attr : fdef.node_def(i).attr()) {
@@ -666,7 +662,7 @@ Status InstantiateFunction(const FunctionDef& fdef,
   }
 
   for (int i = 0; i < fdef.node_def_size(); ++i) {
-    s = helper.BuildNodeOutputIndex(fdef.node_def(i), node_attrs[i],
+    s = helper.BuildNodeOutputIndex(fdef.node_def(i), AttrSlice(&node_attrs[i]),
                                     result->gdef.node_size() + i);
     if (!s.ok()) {
       errors::AppendToMessage(&s, "In ", SummarizeNodeDef(fdef.node_def(i)));
@@ -675,7 +671,7 @@ Status InstantiateFunction(const FunctionDef& fdef,
   }
   // Emits one gdef.node for each fdef.node_def.
   for (int i = 0; i < fdef.node_def_size(); ++i) {
-    s = helper.InstantiateNode(fdef.node_def(i), node_attrs[i]);
+    s = helper.InstantiateNode(fdef.node_def(i), AttrSlice(&node_attrs[i]));
     if (!s.ok()) {
       errors::AppendToMessage(&s, "In ", SummarizeNodeDef(fdef.node_def(i)));
       return s;
@@ -746,8 +742,7 @@ bool FunctionDefsEqual(const FunctionDef& f1, const FunctionDef& f2) {
   return true;
 }
 
-string Canonicalize(const string& funcname,
-                    const InstantiateAttrValueMap& attrs) {
+string Canonicalize(const string& funcname, AttrSlice attrs) {
   std::vector<string> entries;
   entries.reserve(attrs.size());
   for (auto p : attrs) {
@@ -882,6 +877,12 @@ Status FunctionLibraryDefinition::AddFunctionDef(const FunctionDef& fdef) {
                                    fdef.signature().name(),
                                    " already exists in function library.");
   }
+  const OpDef* op_def;
+  if (default_registry_->LookUpOpDef(fdef.signature().name(), &op_def).ok()) {
+    return errors::InvalidArgument(
+        "Cannot add function '", fdef.signature().name(),
+        "' because an op with the same name already exists.");
+  }
   ptr.reset(new FunctionDefAndOpRegistration(fdef));
   return Status::OK();
 }
@@ -945,8 +946,7 @@ const FunctionDef* FunctionLibraryDefinition::GetAttrImpl(
   // If ndef is SymbolicGradient[f=Foo], we use Foo's gradient or
   // Foo's attributes.
   const NameAttrList* forward_func_attrs;
-  if (!GetNodeAttr(AttrSlice(&ndef.attr()), kFuncAttr, &forward_func_attrs)
-           .ok()) {
+  if (!GetNodeAttr(ndef, kFuncAttr, &forward_func_attrs).ok()) {
     return nullptr;
   }
   const string& func_name = forward_func_attrs->name();
@@ -973,34 +973,30 @@ FunctionDefLibrary FunctionLibraryDefinition::ToProto() const {
   return lib;
 }
 
-Status InstantiateFunction(const FunctionDef& fdef,
-                           InstantiateAttrValueSlice attr_values,
-                           GetFunctionSignature get_function,
-                           InstantiationResult* result) {
-  InstantiateAttrValueMap m;
-  for (const auto& aval : attr_values) {
-    m.insert({aval.first, aval.second.proto});
+template <typename T>
+Status FunctionLibraryDefinition::GetAttr(const NodeDef& ndef,
+                                          const string& attr, T* value) const {
+  const FunctionDef* fdef = GetAttrImpl(ndef);
+  if (fdef && GetNodeAttr(AttrSlice(&fdef->attr()), attr, value).ok()) {
+    return Status::OK();
   }
-  return InstantiateFunction(fdef, m, get_function, result);
+  return errors::InvalidArgument("Attr ", attr, " is not defined.");
 }
 
-string Canonicalize(const string& funcname, InstantiateAttrValueSlice attrs) {
-  InstantiateAttrValueMap m;
-  for (const auto& aval : attrs) {
-    m.insert({aval.first, aval.second.proto});
-  }
-  return Canonicalize(funcname, m);
+template <typename T>
+Status FunctionLibraryDefinition::GetAttr(const Node& node, const string& attr,
+                                          T* value) const {
+  return GetAttr(node.def(), attr, value);
 }
 
-Status FunctionLibraryRuntime::Instantiate(const string& function_name,
-                                           InstantiateAttrValueSlice attrs,
-                                           Handle* handle) {
-  InstantiateAttrValueMap m;
-  for (const auto& aval : attrs) {
-    m.insert({aval.first, aval.second.proto});
-  }
-  return Instantiate(function_name, m, handle);
-}
+#define GET_ATTR(T)                                                            \
+  template Status FunctionLibraryDefinition::GetAttr(const Node&,              \
+                                                     const string&, T*) const; \
+  template Status FunctionLibraryDefinition::GetAttr(const NodeDef&,           \
+                                                     const string&, T*) const;
+GET_ATTR(string)
+GET_ATTR(bool)
+#undef GET_ATTR
 
 void FunctionDefHelper::AttrValueWrapper::InitFromString(StringPiece val) {
   if (val.size() >= 2 && val[0] == '$') {
diff --git a/tensorflow/core/framework/function.h b/tensorflow/core/framework/function.h
index 210e5b949a5d7f35ee379d0573ad89a2237b0164..188c3855c6e6243b9d07c65ba9da5c2eec0bced9 100644
--- a/tensorflow/core/framework/function.h
+++ b/tensorflow/core/framework/function.h
@@ -36,6 +36,7 @@ class CancellationManager;
 class OpKernel;
 class ResourceMgr;
 class ScopedStepContainer;
+class Node;
 
 // FunctionDefHelper::Create is a convenient helper to construct a
 // FunctionDef proto.
@@ -190,11 +191,6 @@ inline FunctionDefHelper::AttrValueWrapper::AttrValueWrapper(StringPiece val) {
 // InstantiateFunction calls "get_function" to find signatures of other
 // functions and primitive ops.
 
-// Placeholders in "fdef" is substituted based on "attr_values" here.
-typedef ::tensorflow::protobuf::Map<string, AttrValue> InstantiateAttrValueMap;
-typedef gtl::ArraySlice<std::pair<string, FunctionDefHelper::AttrValueWrapper>>
-    InstantiateAttrValueSlice;
-
 // GetFunctionSignature(func name, opdef) returns OK if the func name is found
 // and opdef is filled with a pointer to the corresponding signature
 // (a OpDef proto). Otherwise, returns an error.
@@ -206,12 +202,7 @@ struct InstantiationResult {
   DataTypeVector ret_types;
   GraphDef gdef;
 };
-Status InstantiateFunction(const FunctionDef& fdef,
-                           const InstantiateAttrValueMap& attr_values,
-                           GetFunctionSignature get_function,
-                           InstantiationResult* result);
-Status InstantiateFunction(const FunctionDef& fdef,
-                           InstantiateAttrValueSlice attr_values,
+Status InstantiateFunction(const FunctionDef& fdef, AttrSlice attr_values,
                            GetFunctionSignature get_function,
                            InstantiationResult* result);
 
@@ -241,9 +232,7 @@ bool FunctionDefsEqual(const FunctionDef& f1, const FunctionDef& f2);
 // space. But it may be change as the implementation
 // evolves. Therefore, it should not be persisted or compared across
 // address spaces.
-string Canonicalize(const string& funcname,
-                    const InstantiateAttrValueMap& attrs);
-string Canonicalize(const string& funcname, InstantiateAttrValueSlice attrs);
+string Canonicalize(const string& funcname, AttrSlice attrs);
 
 // Represents a function call frame. I.e., the data structure used to
 // pass arguments to a function and retrieve its results.
@@ -330,9 +319,16 @@ class FunctionLibraryDefinition : public OpRegistryInterface {
   // Given a node def 'ndef', inspects attributes of the callee
   // function to derive the attribute 'value' for 'attr'. Returns OK
   // iff the attribute is given by the function's definition.
+  // TODO(irving): Remove; keep only the const Node& version.
   template <typename T>
   Status GetAttr(const NodeDef& ndef, const string& attr, T* value) const;
 
+  // Given a node, inspects attributes of the callee function to derive the
+  // attribute 'value' for 'attr'. Returns OK iff the attribute is given by the
+  // function's definition.
+  template <typename T>
+  Status GetAttr(const Node& node, const string& attr, T* value) const;
+
   // Returns a proto representation of the state of this function library.
   FunctionDefLibrary ToProto() const;
 
@@ -375,11 +371,8 @@ class FunctionLibraryRuntime {
   // Returns OK and fills in "handle" if the instantiation succeeds.
   // Otherwise returns an error and "handle" is undefined.
   typedef uint64 Handle;
-  virtual Status Instantiate(const string& function_name,
-                             const InstantiateAttrValueMap& attrs,
+  virtual Status Instantiate(const string& function_name, AttrSlice attrs,
                              Handle* handle) = 0;
-  Status Instantiate(const string& function_name,
-                     InstantiateAttrValueSlice attrs, Handle* handle);
 
   // Returns the function body for the instantiated function given its
   // handle 'h'. Returns nullptr if "h" is not found.
@@ -506,17 +499,15 @@ bool RegisterOp(const string& op, Creator func);
 Status GetOpGradientCreator(const string& op, Creator* creator);
 };
 
-// Implementation details.
-
-template <typename T>
-Status FunctionLibraryDefinition::GetAttr(const NodeDef& ndef,
-                                          const string& attr, T* value) const {
-  const FunctionDef* fdef = GetAttrImpl(ndef);
-  if (fdef && GetNodeAttr(AttrSlice(&fdef->attr()), attr, value).ok()) {
-    return Status::OK();
-  }
-  return errors::InvalidArgument("Attr ", attr, " is not defined.");
-}
+// Declare explicit instantiations of GetAttr
+#define GET_ATTR(T)                                          \
+  extern template Status FunctionLibraryDefinition::GetAttr( \
+      const Node&, const string&, T*) const;                 \
+  extern template Status FunctionLibraryDefinition::GetAttr( \
+      const NodeDef&, const string&, T*) const;
+GET_ATTR(string)
+GET_ATTR(bool)
+#undef GET_ATTR
 
 }  // end namespace tensorflow
 
diff --git a/tensorflow/core/framework/function_test.cc b/tensorflow/core/framework/function_test.cc
index efc6a2edcc5ef612c4b3f8ea68ae3a592e0ccdeb..c83ecf4e5e8e7cf78ec50073baedd032a07545a8 100644
--- a/tensorflow/core/framework/function_test.cc
+++ b/tensorflow/core/framework/function_test.cc
@@ -29,6 +29,24 @@ limitations under the License.
 #include "tensorflow/core/platform/types.h"
 
 namespace tensorflow {
+namespace {
+
+// A helper class to make AttrSlice from initializer lists
+class Attrs {
+ public:
+  Attrs(const std::initializer_list<  // NOLINT(runtime/explicit)
+        std::pair<string, FunctionDefHelper::AttrValueWrapper>>
+            attrs) {
+    for (const auto& aval : attrs) {
+      map_.insert({aval.first, aval.second.proto});
+    }
+  }
+
+  operator AttrSlice() { return AttrSlice(&map_); }  // NOLINT(runtime/explicit)
+
+ private:
+  AttrValueMap map_;
+};
 
 typedef FunctionDefHelper FDH;
 
@@ -46,8 +64,6 @@ y: A scalar in type T.
 
 )doc");
 
-static InstantiateAttrValueMap kNoAttrs;
-
 TEST(TFunc, SquarePlusOne) {
   auto fdef = FDH::Create(
       // Name
@@ -81,7 +97,8 @@ SquarePlusOne[T:{float, double, int32, int64}](x:T) -> (y:T) {
 
   // Instantiate one with T=float
   InstantiationResult result;
-  TF_ASSERT_OK(InstantiateFunction(fdef, {{"T", DT_FLOAT}}, GetOpSig, &result));
+  TF_ASSERT_OK(
+      InstantiateFunction(fdef, Attrs({{"T", DT_FLOAT}}), GetOpSig, &result));
   const char* e2 = R"P(
 (x:float) -> (y:float) {
   a = Square[T=float](x)
@@ -126,7 +143,8 @@ ControlDep(x:int32) -> (y:int32) {
 
   // Instantiate one with T=float
   InstantiationResult result;
-  TF_ASSERT_OK(InstantiateFunction(fdef, {{"T", DT_FLOAT}}, GetOpSig, &result));
+  TF_ASSERT_OK(
+      InstantiateFunction(fdef, Attrs({{"T", DT_FLOAT}}), GetOpSig, &result));
   const char* e2 = R"P(
 (x:int32) -> (y:int32) {
   a = Identity[T=int32](x)
@@ -171,8 +189,7 @@ BackCompat() -> (y:float) {
   EXPECT_EQ(DebugString(fdef), e);
 
   InstantiationResult result;
-  TF_ASSERT_OK(
-      InstantiateFunction(fdef, InstantiateAttrValueMap{}, GetOpSig, &result));
+  TF_ASSERT_OK(InstantiateFunction(fdef, AttrSlice(), GetOpSig, &result));
   // Should get T=float from Op's default.
   const char* e2 = R"P(
 () -> (a:float) {
@@ -209,7 +226,7 @@ NTimesT(x:float, y:float) -> (z:float) {
   EXPECT_EQ(DebugString(fdef), e);
 
   InstantiationResult result;
-  TF_ASSERT_OK(InstantiateFunction(fdef, kNoAttrs, GetOpSig, &result));
+  TF_ASSERT_OK(InstantiateFunction(fdef, AttrSlice(), GetOpSig, &result));
   const char* e2 = R"P(
 (x:float, y:float) -> (a:float) {
   a = AddN[N=2, T=float](x, y)
@@ -272,8 +289,8 @@ AddSquared[N:int, T:{float, double, int32, int64}](x:N*T) -> (y:T) {
 
   // Instantiate one with T=float
   InstantiationResult result;
-  TF_ASSERT_OK(InstantiateFunction(fdef, {{"N", 3}, {"T", DT_FLOAT}}, GetOpSig,
-                                   &result));
+  TF_ASSERT_OK(InstantiateFunction(fdef, Attrs({{"N", 3}, {"T", DT_FLOAT}}),
+                                   GetOpSig, &result));
   const char* e2 = R"P(
 (x_0:float, x_1:float, x_2:float) -> (y:float) {
   a = Map[N=3, T=float, U=float, func=Square[T=float]](x_0, x_1, x_2)
@@ -315,7 +332,7 @@ ControlDeps(x:float) -> () {
   EXPECT_EQ(DebugString(fdef), e);
 
   InstantiationResult result;
-  TF_ASSERT_OK(InstantiateFunction(fdef, kNoAttrs, GetOpSig, &result));
+  TF_ASSERT_OK(InstantiateFunction(fdef, AttrSlice(), GetOpSig, &result));
   const char* e2 = R"P(
 (x:float) -> () {
   a = One[T=float]() @ x
@@ -395,7 +412,7 @@ Test(i:float) -> (o:float) {
   EXPECT_EQ(DebugString(fdef), e);
 
   InstantiationResult result;
-  TF_ASSERT_OK(InstantiateFunction(fdef, kNoAttrs, GetOpSig, &result));
+  TF_ASSERT_OK(InstantiateFunction(fdef, AttrSlice(), GetOpSig, &result));
   const char* e2 = R"P(
 (i:float) -> (o:float) {
   zero = Const[dtype=int32, value=Tensor<type: int32 shape: [] values: 0>]()
@@ -467,7 +484,7 @@ MySelect(x:float) -> (z:float) {
   EXPECT_EQ(DebugString(fdef), e);
 
   InstantiationResult result;
-  TF_ASSERT_OK(InstantiateFunction(fdef, kNoAttrs, GetOpSig, &result));
+  TF_ASSERT_OK(InstantiateFunction(fdef, AttrSlice(), GetOpSig, &result));
   const char* e2 = R"P(
 (x:float) -> (z:float) {
   y = Cond[Tin={float}, cond=MyCond, else_branch=MyElse, out_types={float}, then_branch=MyThen](x)
@@ -488,8 +505,9 @@ TEST(InstantiateErrors, Not_Sufficient_Attrs) {
   auto fdef =
       FDH::Define("nop", {}, {}, {"T:{float, double, int32, int64}"}, {});
   InstantiationResult result;
-  HasError(InstantiateFunction(fdef, {{"U", DT_FLOAT}}, GetOpSig, &result),
-           "Attr T is not found from ");
+  HasError(
+      InstantiateFunction(fdef, Attrs({{"U", DT_FLOAT}}), GetOpSig, &result),
+      "Attr T is not found from ");
 }
 
 #if 0  // TODO(josh11b): Enable this test once having an extra attr is an error.
@@ -497,7 +515,7 @@ TEST(InstantiateErrors, Too_Many_Attrs) {
   auto fdef =
       FDH::Define("nop", {}, {}, {"T:{float, double, int32, int64}"}, {});
   InstantiationResult result;
-  HasError(InstantiateFunction(fdef, {{"T", DT_INT32}, {"U", DT_FLOAT}},
+  HasError(InstantiateFunction(fdef, Attrs({{"T", DT_INT32}, {"U", DT_FLOAT}}),
                                GetOpSig, &result),
            "Attr U is not found in ");
 }
@@ -508,7 +526,7 @@ TEST(InstantiateErrors, AttrValue_Value_Placeholder) {
       FDH::Define("nop", {}, {}, {"T:{float, double, int32, int64}"}, {});
   InstantiationResult result;
   HasError(
-      InstantiateFunction(fdef, {{"T", "$bad"}}, GetOpSig, &result),
+      InstantiateFunction(fdef, Attrs({{"T", "$bad"}}), GetOpSig, &result),
       "AttrValue had value with unexpected type 'placeholder'\n\tfor attr 'T'");
 }
 
@@ -518,14 +536,15 @@ TEST(InstantiateErrors, Unbounded_Attr) {
                               {{"a"}, "One", {}, {{"T", "$unknown"}}, {"x"}},
                           });
   InstantiationResult result;
-  HasError(InstantiateFunction(fdef, {{"T", DT_FLOAT}}, GetOpSig, &result),
-           "Failed to bind all placeholders");
+  HasError(
+      InstantiateFunction(fdef, Attrs({{"T", DT_FLOAT}}), GetOpSig, &result),
+      "Failed to bind all placeholders");
 }
 
 TEST(InstantiateErrors, DupArgs) {
   auto fdef = FDH::Define("test", {"x:float", "x:float"}, {}, {}, {});
   InstantiationResult result;
-  HasError(InstantiateFunction(fdef, kNoAttrs, GetOpSig, &result),
+  HasError(InstantiateFunction(fdef, AttrSlice(), GetOpSig, &result),
            "Duplicated arg name");
 }
 
@@ -536,7 +555,7 @@ TEST(InstantiateErrors, Dup_Node_Names) {
                               {{"y"}, "One", {}, {{"T", DT_FLOAT}}},
                           });
   InstantiationResult result;
-  HasError(InstantiateFunction(fdef, kNoAttrs, GetOpSig, &result),
+  HasError(InstantiateFunction(fdef, AttrSlice(), GetOpSig, &result),
            "Duplicated ret name");
 }
 
@@ -547,7 +566,7 @@ TEST(InstantiateErrors, Node_Arg_Notfound) {
                           },
                           {});
   InstantiationResult result;
-  HasError(InstantiateFunction(fdef, kNoAttrs, GetOpSig, &result),
+  HasError(InstantiateFunction(fdef, AttrSlice(), GetOpSig, &result),
            "input z is not found");
 }
 
@@ -557,7 +576,7 @@ TEST(InstantiateErrors, Node_Arg_TypeMismatch) {
                               {{"y"}, "Add", {"x", "x"}, {{"T", DT_INT32}}},
                           });
   InstantiationResult result;
-  HasError(InstantiateFunction(fdef, kNoAttrs, GetOpSig, &result),
+  HasError(InstantiateFunction(fdef, AttrSlice(), GetOpSig, &result),
            "input x[0] expected type int32 != float, the type of x[0]");
 }
 
@@ -568,7 +587,7 @@ TEST(InstantiateErrors, Node_Arg_ControlMissing) {
                       {{"y"}, "Add", {"x", "x"}, {{"T", DT_FLOAT}}, {"z"}},
                   });
   InstantiationResult result;
-  HasError(InstantiateFunction(fdef, kNoAttrs, GetOpSig, &result),
+  HasError(InstantiateFunction(fdef, AttrSlice(), GetOpSig, &result),
            "input[2] == '^z', is not found.");
 }
 
@@ -579,7 +598,7 @@ TEST(InstantiateErrors, FuncRet_Missing) {
                           },
                           {});
   InstantiationResult result;
-  HasError(InstantiateFunction(fdef, kNoAttrs, GetOpSig, &result),
+  HasError(InstantiateFunction(fdef, AttrSlice(), GetOpSig, &result),
            "Return y missing");
 }
 
@@ -590,7 +609,7 @@ TEST(InstantiateErrors, FuncRet_NotFound) {
                           },
                           {{"y", "z"}});
   InstantiationResult result;
-  HasError(InstantiateFunction(fdef, kNoAttrs, GetOpSig, &result),
+  HasError(InstantiateFunction(fdef, AttrSlice(), GetOpSig, &result),
            "Return y -> z is not found");
 }
 
@@ -601,7 +620,7 @@ TEST(InstantiateErrors, FuncRet_NameMismatch) {
                           },
                           {{"z", "x:y:0"}});
   InstantiationResult result;
-  HasError(InstantiateFunction(fdef, kNoAttrs, GetOpSig, &result),
+  HasError(InstantiateFunction(fdef, AttrSlice(), GetOpSig, &result),
            "Return y missing");
 }
 
@@ -613,7 +632,7 @@ TEST(InstantiateErrors, FuncRet_NameMismatch) {
 //                           },
 //                           {{"y", "x:y:0"}, {"z", "x:y:0"}});
 //   InstantiationResult result;
-//   HasError(InstantiateFunction(fdef, kNoAttrs, GetOpSig, &result),
+//   HasError(InstantiateFunction(fdef, AttrSlice(), GetOpSig, &result),
 //            "ret is not found");
 // }
 
@@ -623,7 +642,7 @@ TEST(InstantiateErrors, FuncRet_TypeMismatch) {
                               {{"y"}, "One", {}, {{"T", DT_DOUBLE}}},
                           });
   InstantiationResult result;
-  HasError(InstantiateFunction(fdef, kNoAttrs, GetOpSig, &result),
+  HasError(InstantiateFunction(fdef, AttrSlice(), GetOpSig, &result),
            "Invalid ret types y : float vs. double\n\tIn function output y");
 }
 
@@ -649,7 +668,7 @@ TEST(InstantiateErrors, TypeList_Missing_Retval_Attr) {
       },
       {{"y", "y:output"}});
   InstantiationResult result;
-  HasError(InstantiateFunction(fdef, kNoAttrs, GetOpSig, &result),
+  HasError(InstantiateFunction(fdef, AttrSlice(), GetOpSig, &result),
            "type attr not found: out_types");
 }
 
@@ -676,7 +695,7 @@ TEST(InstantiateErrors, TypeList_Num_Retval_Mismatch) {
       },
       {{"y", "y:output"}});
   InstantiationResult result;
-  HasError(InstantiateFunction(fdef, kNoAttrs, GetOpSig, &result),
+  HasError(InstantiateFunction(fdef, AttrSlice(), GetOpSig, &result),
            "Invalid ret types");
 }
 
@@ -703,7 +722,7 @@ TEST(InstantiateErrors, TypeList_Missing_Arg) {
       },
       {{"y", "y:output"}});
   InstantiationResult result;
-  HasError(InstantiateFunction(fdef, kNoAttrs, GetOpSig, &result),
+  HasError(InstantiateFunction(fdef, AttrSlice(), GetOpSig, &result),
            "input unknown is not found");
 }
 
@@ -724,7 +743,7 @@ TEST(InstantiateErrors, TooManyInputs) {
       {{"z", "a:sum:0"}});
 
   InstantiationResult result;
-  HasError(InstantiateFunction(fdef, kNoAttrs, GetOpSig, &result),
+  HasError(InstantiateFunction(fdef, AttrSlice(), GetOpSig, &result),
            "Expected input[2] == 'x' to be a control input.");
 }
 
@@ -745,7 +764,7 @@ TEST(InstantiateErrors, TooFewInputs) {
       {{"z", "a:sum:0"}});
 
   InstantiationResult result;
-  HasError(InstantiateFunction(fdef, kNoAttrs, GetOpSig, &result),
+  HasError(InstantiateFunction(fdef, AttrSlice(), GetOpSig, &result),
            "Attempt to access beyond input size: 2 >= 2");
 }
 
@@ -773,7 +792,7 @@ TEST(InstantiateErrors, TooManyInputsFromArray1) {
       {{"z", "a:sum:0"}});
 
   InstantiationResult result;
-  HasError(InstantiateFunction(fdef, kNoAttrs, GetOpSig, &result),
+  HasError(InstantiateFunction(fdef, AttrSlice(), GetOpSig, &result),
            "Expected input[1] == 'y' to be a control input.");
 }
 
@@ -801,7 +820,7 @@ TEST(InstantiateErrors, TooManyInputsFromArray2) {
       {{"z", "a:sum:0"}});
 
   InstantiationResult result;
-  HasError(InstantiateFunction(fdef, kNoAttrs, GetOpSig, &result),
+  HasError(InstantiateFunction(fdef, AttrSlice(), GetOpSig, &result),
            "Input a:output too long for inputs");
 }
 
@@ -822,7 +841,7 @@ TEST(InstantiateErrors, TypeMismatch) {
       {{"z", "a:sum:0"}});
 
   InstantiationResult result;
-  HasError(InstantiateFunction(fdef, kNoAttrs, GetOpSig, &result),
+  HasError(InstantiateFunction(fdef, AttrSlice(), GetOpSig, &result),
            "input inputs[1] expected type float != int32, the type of y[0]");
 }
 
@@ -874,17 +893,17 @@ TEST(FunctionCallFrame, Float_Float_Float) {
 }
 
 TEST(Canonicalize, Basic) {
-  EXPECT_EQ(Canonicalize("MatMul", {{"T", DT_FLOAT},
-                                    {"transpose_a", false},
-                                    {"transpose_b", false}}),
+  EXPECT_EQ(Canonicalize("MatMul", Attrs({{"T", DT_FLOAT},
+                                          {"transpose_a", false},
+                                          {"transpose_b", false}})),
             "MatMul[T=float,transpose_a=false,transpose_b=false]");
-  EXPECT_EQ(Canonicalize("MatMul", {{"T", DT_FLOAT},
-                                    {"transpose_b", false},
-                                    {"transpose_a", false}}),
+  EXPECT_EQ(Canonicalize("MatMul", Attrs({{"T", DT_FLOAT},
+                                          {"transpose_b", false},
+                                          {"transpose_a", false}})),
             "MatMul[T=float,transpose_a=false,transpose_b=false]");
-  EXPECT_EQ(Canonicalize("MatMul", {{"T", DT_DOUBLE},
-                                    {"transpose_b", true},
-                                    {"transpose_a", false}}),
+  EXPECT_EQ(Canonicalize("MatMul", Attrs({{"T", DT_DOUBLE},
+                                          {"transpose_b", true},
+                                          {"transpose_a", false}})),
             "MatMul[T=double,transpose_a=false,transpose_b=true]");
 }
 
@@ -944,6 +963,15 @@ TEST(FunctionLibraryDefinitionTest, AddFunctionDef) {
   ASSERT_NE(second, nullptr);
   EXPECT_EQ(second->DebugString(),
             test::function::WXPlusB().signature().DebugString());
+
+  // Can't add function with same name as existing op
+  FunctionDef fdef = test::function::XTimesTwo();
+  fdef.mutable_signature()->set_name("Add");
+  Status s = lib_def.AddFunctionDef(fdef);
+  EXPECT_FALSE(s.ok());
+  EXPECT_EQ(s.error_message(),
+            "Cannot add function 'Add' because an op with the same name "
+            "already exists.");
 }
 
 TEST(FunctionLibraryDefinitionTest, AddGradientDef) {
@@ -1139,4 +1167,5 @@ TEST(FunctionDefsEqualTest, TestFunctionDefsEqual) {
   EXPECT_FALSE(FunctionDefsEqual(fdef1, fdef2));
 }
 
+}  // end namespace
 }  // end namespace tensorflow
diff --git a/tensorflow/core/framework/graph_def_util_test.cc b/tensorflow/core/framework/graph_def_util_test.cc
index 8c76a74a4a524aae7cd50ed7b6a22ed852148d86..1ac322e48e2e6a9a572d8e85b01e166fc7e36f74 100644
--- a/tensorflow/core/framework/graph_def_util_test.cc
+++ b/tensorflow/core/framework/graph_def_util_test.cc
@@ -28,7 +28,7 @@ limitations under the License.
 namespace tensorflow {
 namespace {
 
-Status FinalizeOpDef(OpDefBuilder b, OpDef* op_def) {
+Status FinalizeOpDef(const OpDefBuilder& b, OpDef* op_def) {
   OpRegistrationData op_reg_data;
   const Status s = b.Finalize(&op_reg_data);
   *op_def = op_reg_data.op_def;
diff --git a/tensorflow/core/framework/memory_types.cc b/tensorflow/core/framework/memory_types.cc
index 14d8d91490e2e5e056f6023bbb517fd5d234f66b..c1dde1504a7cf647455c174b659bab3fb3792789 100644
--- a/tensorflow/core/framework/memory_types.cc
+++ b/tensorflow/core/framework/memory_types.cc
@@ -15,6 +15,8 @@ limitations under the License.
 
 #include "tensorflow/core/framework/memory_types.h"
 
+#include <utility>
+
 #include "tensorflow/core/framework/kernel_def.pb.h"
 #include "tensorflow/core/framework/node_def_util.h"
 #include "tensorflow/core/framework/op_kernel.h"
@@ -64,7 +66,7 @@ MemoryType MTypeFromDType(const DataType dtype) {
 }  // namespace
 
 Status MemoryTypesForNode(const OpRegistryInterface* op_registry,
-                          DeviceType device_type, const NodeDef& ndef,
+                          const DeviceType& device_type, const NodeDef& ndef,
                           MemoryTypeVector* inp_mtypes,
                           MemoryTypeVector* out_mtypes) {
   // Look up the Op registered for this op name.
diff --git a/tensorflow/core/framework/memory_types.h b/tensorflow/core/framework/memory_types.h
index 3d4ca7597a43b29f2f0f53e287ee4bd705edb0a7..e35e22f5907b099afa8722e291fe408cf9c96fc5 100644
--- a/tensorflow/core/framework/memory_types.h
+++ b/tensorflow/core/framework/memory_types.h
@@ -28,7 +28,7 @@ namespace tensorflow {
 // REQUIRES: * '*_memory_types' is not nullptr.
 //           * def has all attrs specified (e.g. using AddDefaultsToNodeDef()).
 Status MemoryTypesForNode(const OpRegistryInterface* op_registry,
-                          DeviceType device_type, const NodeDef& ndef,
+                          const DeviceType& device_type, const NodeDef& ndef,
                           MemoryTypeVector* input_memory_types,
                           MemoryTypeVector* output_memory_types);
 
diff --git a/tensorflow/core/framework/node_def_util.cc b/tensorflow/core/framework/node_def_util.cc
index 7e8ad507172b540cc009aa804485e2174fd0dd66..9b737e1f72d26f0c1db64553e24df65575d4b5b4 100644
--- a/tensorflow/core/framework/node_def_util.cc
+++ b/tensorflow/core/framework/node_def_util.cc
@@ -25,6 +25,7 @@ limitations under the License.
 #include "tensorflow/core/framework/op_def.pb_text.h"
 #include "tensorflow/core/framework/op_def_util.h"
 #include "tensorflow/core/framework/tensor.pb_text.h"
+#include "tensorflow/core/graph/graph.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/gtl/map_util.h"
 #include "tensorflow/core/lib/strings/scanner.h"
@@ -36,18 +37,23 @@ namespace tensorflow {
 const char* const kColocationAttrName = "_class";
 const char* const kColocationGroupPrefix = "loc:@";
 
+AttrSlice::AttrSlice() : ndef_(nullptr) {
+  static const AttrValueMap* const kEmptyAttrValueMap = new AttrValueMap;
+  attrs_ = kEmptyAttrValueMap;
+}
+
 AttrSlice::AttrSlice(const NodeDef& node_def)
     : ndef_(&node_def), attrs_(&ndef_->attr()) {}
 
 AttrSlice::AttrSlice(const AttrValueMap* a) : ndef_(nullptr), attrs_(a) {}
 
-string SummarizeNodeDef(const NodeDef& node_def) {
-  string ret = strings::StrCat(node_def.name(), " = ", node_def.op(), "[");
+static string SummarizeAttrsHelper(AttrSlice attrs, StringPiece device) {
+  string ret;
 
   // We sort the attrs so the output is deterministic.
   std::vector<string> attr_names;
-  attr_names.reserve(node_def.attr().size());
-  for (const auto& attr : node_def.attr()) {
+  attr_names.reserve(attrs.size());
+  for (const auto& attr : attrs) {
     attr_names.push_back(attr.first);
   }
   std::sort(attr_names.begin(), attr_names.end());
@@ -55,20 +61,34 @@ string SummarizeNodeDef(const NodeDef& node_def) {
   for (const string& attr_name : attr_names) {
     if (!first) strings::StrAppend(&ret, ", ");
     first = false;
-    auto iter = node_def.attr().find(attr_name);
-    strings::StrAppend(&ret, attr_name, "=", SummarizeAttrValue(iter->second));
+    strings::StrAppend(&ret, attr_name, "=",
+                       SummarizeAttrValue(*attrs.Find(attr_name)));
   }
 
   // Consider the device to be a final attr with name "_device".
-  if (!node_def.device().empty()) {
+  if (!device.empty()) {
     if (!first) strings::StrAppend(&ret, ", ");
     first = false;
-    strings::StrAppend(&ret, "_device=\"", node_def.device(), "\"");
+    strings::StrAppend(&ret, "_device=\"", device, "\"");
   }
+  return ret;
+}
+
+string AttrSlice::SummarizeNode() const {
+  return ndef_ ? SummarizeNodeDef(*ndef_)
+               : strings::StrCat(
+                     "[", SummarizeAttrsHelper(*this, StringPiece()), "]");
+}
+
+string SummarizeNode(const Node& node) { return SummarizeNodeDef(node.def()); }
+
+string SummarizeNodeDef(const NodeDef& node_def) {
+  string ret = strings::StrCat(node_def.name(), " = ", node_def.op(), "[");
+  strings::StrAppend(&ret, SummarizeAttrsHelper(node_def, node_def.device()));
   strings::StrAppend(&ret, "](");
 
   // Output inputs, including control inputs, verbatim.
-  first = true;
+  bool first = true;
   for (const string& input : node_def.input()) {
     if (!first) strings::StrAppend(&ret, ", ");
     first = false;
@@ -79,9 +99,24 @@ string SummarizeNodeDef(const NodeDef& node_def) {
 }
 
 const AttrValue* AttrSlice::Find(StringPiece attr_name) const {
-  auto iter = attrs_->find(attr_name.ToString());
-  if (iter == attrs_->end()) return nullptr;
-  return &iter->second;
+  // Currently, the collection used for NodeDef::attr() (google::protobuf::Map)
+  // requires that the keys used for lookups have type 'const string&'. Because
+  // this method takes a StringPiece, it is necessary to allocate a temporary
+  // string, copy attr_name to it, and then use that temporary string for the
+  // lookup. This causes an excessive number of short-lived allocations, and for
+  // large graphs, this can be a significant cost.
+  //
+  // Because most nodes have a small number of attributes, a simple linear scan
+  // is generally more efficient than a hashed lookup.  If google::protobuf::Map
+  // changes so that it supports efficient lookups using StringPiece instead of
+  // const string&, then this code could be changed to use attrs_->find() again.
+
+  for (const auto& attr : *attrs_) {
+    if (attr.first == attr_name) {
+      return &attr.second;
+    }
+  }
+  return nullptr;
 }
 
 Status AttrSlice::Find(StringPiece attr_name,
@@ -94,12 +129,28 @@ Status AttrSlice::Find(StringPiece attr_name,
   // Skip AttachDef for internal attrs since it is a little bit
   // expensive and it is common for them to correctly not be included
   // in a NodeDef.
-  if (!StringPiece(attr_name).starts_with("_") && ndef_) {
+  if (!attr_name.starts_with("_") && ndef_ != nullptr) {
     s = AttachDef(s, *ndef_);
   }
   return s;
 }
 
+bool AttrSlice::EqualAttrs(AttrSlice other, Scratch* scratch) const {
+  if (size() != other.size()) return false;
+
+  for (const auto& attr : *other.attrs_) {
+    auto iter = attrs_->find(attr.first);
+    if (iter == attrs_->end()) return false;
+    // TODO(irving): Comparing AttrValues by proto is slightly buggy, since
+    // TensorProto is a nonunique representation of Tensor.  This bug will go
+    // away once AttrSlice switches over to NodeInfo.
+    iter->second.SerializeToString(&scratch->a);
+    attr.second.SerializeToString(&scratch->b);
+    if (scratch->a != scratch->b) return false;
+  }
+  return true;
+}
+
 // The ... is to allow the caller to inject some value validation code.  Use
 // just ; if no additional validation code is needed.
 #define DEFINE_GET_ATTR(TYPE, FIELD, ATTR_TYPE, APPEND_OP, CAST, ...)         \
@@ -125,7 +176,41 @@ Status AttrSlice::Find(StringPiece attr_name,
     return Status::OK();                                                      \
   }
 
+#define DEFINE_GET_ATTR_SIMPLE(TYPE, FIELD, ATTR_TYPE, APPEND_OP, CAST, ...) \
+  bool GetNodeAttrSimple(const AttrSlice& attrs, StringPiece attr_name,      \
+                         TYPE* value) {                                      \
+    const AttrValue* attr_value = attrs.Find(attr_name);                     \
+    if (attr_value == nullptr) {                                             \
+      return false;                                                          \
+    }                                                                        \
+    Status s = AttrValueHasType(*attr_value, ATTR_TYPE);                     \
+    if (!s.ok()) {                                                           \
+      return false;                                                          \
+    }                                                                        \
+    const auto& v = attr_value->FIELD();                                     \
+    __VA_ARGS__;                                                             \
+    *value = CAST;                                                           \
+    return true;                                                             \
+  }                                                                          \
+  bool GetNodeAttrSimple(const AttrSlice& attrs, StringPiece attr_name,      \
+                         std::vector<TYPE>* value) {                         \
+    const AttrValue* attr_value = attrs.Find(attr_name);                     \
+    if (attr_value == nullptr) {                                             \
+      return false;                                                          \
+    }                                                                        \
+    Status s = AttrValueHasType(*attr_value, "list(" ATTR_TYPE ")");         \
+    if (!s.ok()) {                                                           \
+      return false;                                                          \
+    }                                                                        \
+    for (const auto& v : attr_value->list().FIELD()) {                       \
+      __VA_ARGS__;                                                           \
+      value->APPEND_OP(CAST);                                                \
+    }                                                                        \
+    return true;                                                             \
+  }
+
 DEFINE_GET_ATTR(string, s, "string", emplace_back, v, ;)
+DEFINE_GET_ATTR_SIMPLE(string, s, "string", emplace_back, v, ;)
 DEFINE_GET_ATTR(int64, i, "int", emplace_back, v, ;)
 DEFINE_GET_ATTR(int32, i, "int", emplace_back, static_cast<int32>(v),
                 if (static_cast<int64>(static_cast<int32>(v)) != v) {
@@ -156,6 +241,20 @@ DEFINE_GET_ATTR(Tensor, tensor, "tensor", emplace_back, t, Tensor t;
 
 #undef DEFINE_GET_ATTR
 
+static const string& kEmptyString = *new string();
+
+const string& GetNodeAttrString(const AttrSlice& attrs, StringPiece attr_name) {
+  const AttrValue* attr_value = attrs.Find(attr_name);
+  if (attr_value == nullptr) {
+    return kEmptyString;
+  }
+  Status s = AttrValueHasType(*attr_value, "string");
+  if (!s.ok()) {
+    return kEmptyString;
+  }
+  return attr_value->s();
+}
+
 Status GetNodeAttr(const AttrSlice& attrs, StringPiece attr_name,
                    DataTypeVector* value) {
   const AttrValue* attr_value;
@@ -278,14 +377,14 @@ Status ValidateNodeDef(const NodeDef& node_def, const OpDef& op_def) {
     if (StringPiece(input).starts_with("^")) {
       seen_control = true;
       if (input.find(':') != string::npos) {
-        return errors::InvalidArgument("Control input '", input,
-                                       "' must not have ':' in NodeDef: ",
-                                       SummarizeNodeDef(node_def));
+        return errors::InvalidArgument(
+            "Control input '", input,
+            "' must not have ':' in NodeDef: ", SummarizeNodeDef(node_def));
       }
     } else if (seen_control) {
-      return errors::InvalidArgument("Non-control input '", input,
-                                     "' after control input in NodeDef: ",
-                                     SummarizeNodeDef(node_def));
+      return errors::InvalidArgument(
+          "Non-control input '", input,
+          "' after control input in NodeDef: ", SummarizeNodeDef(node_def));
     } else {
       ++num_inputs;
     }
@@ -295,8 +394,8 @@ Status ValidateNodeDef(const NodeDef& node_def, const OpDef& op_def) {
   for (const auto& attr : op_def.attr()) {
     if (!gtl::InsertIfNotPresent(&op_attrs, attr.name(), &attr)) {
       return errors::InvalidArgument("OpDef has duplicate attr name '",
-                                     attr.name(), "': ",
-                                     SummarizeOpDef(op_def));
+                                     attr.name(),
+                                     "': ", SummarizeOpDef(op_def));
     }
   }
   for (const auto& attr : node_def.attr()) {
@@ -320,8 +419,9 @@ Status ValidateNodeDef(const NodeDef& node_def, const OpDef& op_def) {
           "with your GraphDef-generating binary.).");
     }
     TF_RETURN_WITH_CONTEXT_IF_ERROR(
-        ValidateAttrValue(attr.second, *iter->second), "; NodeDef: ",
-        SummarizeNodeDef(node_def), "; ", SummarizeOpDef(op_def));
+        ValidateAttrValue(attr.second, *iter->second),
+        "; NodeDef: ", SummarizeNodeDef(node_def), "; ",
+        SummarizeOpDef(op_def));
     // Keep track of which attr names have (not) been found in the NodeDef.
     op_attrs.erase(iter);
   }
@@ -368,9 +468,9 @@ Status ComputeArgRange(const NodeDef& node_def, const OpDef::ArgDef& arg_def,
   } else if (!arg_def.type_attr().empty() || arg_def.type() != DT_INVALID) {
     *num = 1;
   } else {
-    return errors::InvalidArgument("Argument '", arg_def.name(),
-                                   "' incorrectly specified in op definition: ",
-                                   SummarizeOpDef(op_def));
+    return errors::InvalidArgument(
+        "Argument '", arg_def.name(),
+        "' incorrectly specified in op definition: ", SummarizeOpDef(op_def));
   }
   return Status::OK();
 }
@@ -402,6 +502,11 @@ Status NameRangesForNode(const NodeDef& node_def, const OpDef& op_def,
   return Status::OK();
 }
 
+Status NameRangesForNode(const Node& node, const OpDef& op_def,
+                         NameRangeMap* inputs, NameRangeMap* outputs) {
+  return NameRangesForNode(node.def(), op_def, inputs, outputs);
+}
+
 void AddDefaultsToNodeDef(const OpDef& op_def, NodeDef* node_def) {
   for (const auto& attr_def : op_def.attr()) {
     AttrSlice attrs(*node_def);
@@ -502,4 +607,8 @@ Status AttachDef(const Status& status, const NodeDef& node_def) {
   return ret;
 }
 
+Status AttachDef(const Status& status, const Node& node) {
+  return AttachDef(status, node.def());
+}
+
 }  // namespace tensorflow
diff --git a/tensorflow/core/framework/node_def_util.h b/tensorflow/core/framework/node_def_util.h
index 5c4d2272682de59221002e1b863007efdff6a321..1438abdec606442246baba00cc6ca818c4cec7d5 100644
--- a/tensorflow/core/framework/node_def_util.h
+++ b/tensorflow/core/framework/node_def_util.h
@@ -29,6 +29,8 @@ limitations under the License.
 
 namespace tensorflow {
 
+class Node;
+
 // Name of the attribute used to encode node colocation constraints.
 //
 // Nodes can be co-located on the same device. Desire for explicit co-location
@@ -39,8 +41,9 @@ extern const char* const kColocationAttrName;
 // String prefix applied to the operation name for colocation constraints.
 extern const char* const kColocationGroupPrefix;
 
-// Produce a human-readable version of a NodeDef that is more concise
+// Produce a human-readable version of a Node or NodeDef that is more concise
 // than a text-format proto.
+string SummarizeNode(const Node& node);
 string SummarizeNodeDef(const NodeDef& node_def);
 
 typedef protobuf::Map<string, AttrValue> AttrValueMap;
@@ -78,8 +81,11 @@ class AttrSlice {
  public:
   AttrSlice(const NodeDef& node_def);  // NOLINT(runtime/explicit)
 
+  AttrSlice();  // Empty
   explicit AttrSlice(const AttrValueMap* a);
 
+  int size() const { return attrs_->size(); }
+
   // Returns the attr with attr_name if found.  Otherwise, returns
   // nullptr.
   const AttrValue* Find(StringPiece attr_name) const;
@@ -88,6 +94,33 @@ class AttrSlice {
   // NotFound status.
   Status Find(StringPiece attr_name, const AttrValue** attr_value) const;
 
+  // Helper class to avoid allocations in EqualAttrs.
+  // TODO(irving): Will go away once NodeInfo is used.
+  struct Scratch {
+    string a;
+    string b;
+  };
+
+  // Check if all attrs and attr values match.  Does not take defaults into
+  // account.
+  //
+  // TODO(irving): There is a bug in this routine inherited from its
+  // OptimizerCSE::EqualAttrs precedecessor.  The same tensor attr can be
+  // represented in more than one way as an AttrValue, since TensorProto is
+  // not 1-1.  This bug will go away once I replace everything with NodeInfo,
+  // which stores a Tensor object directly.  The Scratch object will also go
+  // away.
+  bool EqualAttrs(AttrSlice other, Scratch* scratch) const;
+
+  // If this AttrSlice has an attached NodeDef, summarize it.  This is for
+  // error messages only: we intentionally do not provide direct access to the
+  // NodeDef, since it is not always there.
+  string SummarizeNode() const;
+
+  // Iteration over all attrs
+  AttrValueMap::const_iterator begin() const { return attrs_->begin(); }
+  AttrValueMap::const_iterator end() const { return attrs_->end(); }
+
  private:
   const NodeDef* ndef_;
   const AttrValueMap* attrs_;
@@ -153,6 +186,20 @@ Status GetNodeAttr(const AttrSlice& attrs, StringPiece attr_name,
 Status GetNodeAttr(const AttrSlice& attrs, StringPiece attr_name,
                    std::vector<NameAttrList>* value);  // type: "list(func)"
 
+// Look up the attr with name attr_name and set *value to its value.  If no
+// attr with attr_name is found in node_def, or the attr does not have
+// a matching type, false is returned.
+bool GetNodeAttrSimple(const AttrSlice& attrs, StringPiece attr_name,
+                       string* value);  // type: "string"
+bool GetNodeAttrSimple(const AttrSlice& attrs, StringPiece attr_name,
+                       std::vector<string>* value);  // type: "string"
+
+// Look up the attr with name attr_name and return a reference to its value.
+// If no attr with attr_name is found in node_def, or the attr does not have
+// a matching type, a reference to an empty string is returned.
+// REQUIRES: Must not use the returned value beyond the lifetime of node_def.
+const string& GetNodeAttrString(const AttrSlice& attrs, StringPiece attr_name);
+
 // Computes the input and output types for a specific node.
 // REQUIRES: ValidateOpDef(op_def).ok()
 Status InOutTypesForNode(const NodeDef& node_def, const OpDef& op_def,
@@ -169,9 +216,12 @@ Status ValidateNodeDef(const NodeDef& node_def, const OpDef& op_def);
 // corresponding input/output index range.  For example,
 // input "foo" corresponds to input indices
 //   [ (*inputs)["foo"].first, (*inputs)["foo"].second ).
+// TODO(irving): Remove the NodeDef version; keep only the Node version.
 typedef std::unordered_map<string, std::pair<int, int>> NameRangeMap;
 Status NameRangesForNode(const NodeDef& node_def, const OpDef& op_def,
                          NameRangeMap* inputs, NameRangeMap* outputs);
+Status NameRangesForNode(const Node& node, const OpDef& op_def,
+                         NameRangeMap* inputs, NameRangeMap* outputs);
 
 // Adds default values to *node_def for unspecified attrs from op_def.
 void AddDefaultsToNodeDef(const OpDef& op_def, NodeDef* node_def);
@@ -192,6 +242,7 @@ Status ValidateExternalNodeDefSyntax(const NodeDef& node_def);
 // Returns "status" with kernel's NodeDef attached as additional text
 // in the error message.
 Status AttachDef(const Status& status, const NodeDef& node_def);
+Status AttachDef(const Status& status, const Node& node);
 
 }  // namespace tensorflow
 
diff --git a/tensorflow/core/framework/op_def_builder_test.cc b/tensorflow/core/framework/op_def_builder_test.cc
index a6ffd5c59618883874587fb608dd19ec9b714910..bde5bb2c397ed98c1c1ed5bf2178ecbdbc324e2e 100644
--- a/tensorflow/core/framework/op_def_builder_test.cc
+++ b/tensorflow/core/framework/op_def_builder_test.cc
@@ -73,7 +73,7 @@ class OpDefBuilderTest : public ::testing::Test {
     }
   }
 
-  void ExpectFailure(const OpDefBuilder& builder, string error) {
+  void ExpectFailure(const OpDefBuilder& builder, const string& error) {
     OpRegistrationData op_reg_data;
     Status status = builder.Finalize(&op_reg_data);
     EXPECT_FALSE(status.ok());
diff --git a/tensorflow/core/framework/op_kernel.cc b/tensorflow/core/framework/op_kernel.cc
index 3d913cdaf0c7be83883c1c0960fa9eb177307cb1..6c3917c6869e654cfa550fb2f0e845dc4967d803 100644
--- a/tensorflow/core/framework/op_kernel.cc
+++ b/tensorflow/core/framework/op_kernel.cc
@@ -16,6 +16,7 @@ limitations under the License.
 #include "tensorflow/core/framework/op_kernel.h"
 
 #include <unordered_map>
+#include <utility>
 #include <vector>
 
 #include "tensorflow/core/framework/attr_value_util.h"
@@ -26,6 +27,7 @@ limitations under the License.
 #include "tensorflow/core/framework/node_def_util.h"
 #include "tensorflow/core/framework/op_def_util.h"
 #include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/graph/graph.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/core/notification.h"
 #include "tensorflow/core/lib/core/stringpiece.h"
@@ -89,9 +91,9 @@ OpKernel::OpKernel(OpKernelConstruction* context)
       input_name_map_(context->num_inputs()),
       output_name_map_(context->num_outputs()) {
   OP_REQUIRES_OK(context,
-                 NameRangesForNode(def_, context->op_def(), &input_name_map_,
+                 NameRangesForNode(def_, *context->op_def_, &input_name_map_,
                                    &output_name_map_));
-  OP_REQUIRES_OK(context, CheckOpDeprecation(context->op_def(),
+  OP_REQUIRES_OK(context, CheckOpDeprecation(*context->op_def_,
                                              context->graph_def_version()));
 
   // Kernels executing on GPU tie very few resources on the CPU where the
@@ -656,22 +658,6 @@ Status OpKernelContext::allocate_persistent(DataType type,
       *out_tensor = out_persistent->AccessTensor(this);
     }
   }
-  if (track_allocations() && persistent.TotalBytes() > 0) {
-    // TODO(yuefengz): some allocators allocate memory even if the requested
-    // size is 0.
-    Allocator* a = get_allocator(attr);
-    if (a->TracksAllocationSizes()) {
-      int64 alloc_size =
-          a->AllocatedSize(const_cast<char*>(persistent.tensor_data().data()));
-      int64 alloc_id =
-          a->AllocationId(const_cast<char*>(persistent.tensor_data().data()));
-      if (allocate_on_host(attr)) {
-        record_host_persistent_memory_allocation(alloc_size, alloc_id);
-      } else {
-        record_device_persistent_memory_allocation(alloc_size, alloc_id);
-      }
-    }
-  }
   return s;
 }
 
@@ -823,7 +809,7 @@ static KernelRegistry* GlobalKernelRegistryTyped() {
   return reinterpret_cast<KernelRegistry*>(GlobalKernelRegistry());
 }
 
-static string Key(StringPiece op_type, DeviceType device_type,
+static string Key(StringPiece op_type, const DeviceType& device_type,
                   StringPiece label) {
   return strings::StrCat(op_type, ":", DeviceTypeString(device_type), ":",
                          label);
@@ -857,13 +843,10 @@ bool InTypeList(DataType dt, const AttrValue& type_list) {
   return false;
 }
 
-// Returns whether the attrs in the NodeDef satisfy the constraints in
-// the kernel_def.  Returns an error if attrs in kernel_def are not
-// found, or have a mismatching type.
-Status AttrsMatch(const NodeDef& node_def, const KernelDef& kernel_def,
-                  bool* match) {
+// Returns whether the attrs satisfy the constraints in the kernel_def.  Returns
+// an error if attrs in kernel_def are not found, or have a mismatching type.
+Status AttrsMatch(AttrSlice attrs, const KernelDef& kernel_def, bool* match) {
   *match = false;
-  AttrSlice attrs(node_def);
   for (const auto& constraint : kernel_def.constraint()) {
     if (constraint.allowed_values().list().type_size() == 0) {
       return errors::Unimplemented(
@@ -887,7 +870,7 @@ Status AttrsMatch(const NodeDef& node_def, const KernelDef& kernel_def,
               "' that has value '", SummarizeAttrValue(*found),
               "' that does not have type 'type' or 'list(type)' in NodeDef "
               "'",
-              SummarizeNodeDef(node_def), "'");
+              attrs.SummarizeNode(), "'");
         }
 
         for (int t : found->list().type()) {
@@ -900,7 +883,7 @@ Status AttrsMatch(const NodeDef& node_def, const KernelDef& kernel_def,
     } else {
       return errors::InvalidArgument(
           "OpKernel '", kernel_def.op(), "' has constraint on attr '",
-          constraint.name(), "' not in NodeDef '", SummarizeNodeDef(node_def),
+          constraint.name(), "' not in NodeDef '", attrs.SummarizeNode(),
           "', KernelDef: '", ProtoShortDebugString(kernel_def), "'");
     }
   }
@@ -908,13 +891,18 @@ Status AttrsMatch(const NodeDef& node_def, const KernelDef& kernel_def,
   return Status::OK();
 }
 
-Status FindKernelRegistration(DeviceType device_type, const NodeDef& node_def,
+static const StringPiece kKernelAttr("_kernel");
+
+// TODO(irving): Replace with const Node& version below.
+Status FindKernelRegistration(const DeviceType& device_type,
+                              const NodeDef& node_def,
                               const KernelRegistration** reg,
                               bool* was_attr_mismatch) {
   *reg = nullptr;
   *was_attr_mismatch = false;
-  string label;  // Label defaults to empty if not found in NodeDef.
-  GetNodeAttr(node_def, "_kernel", &label).IgnoreError();
+  // Label defaults to empty if not found in NodeDef.
+  const string& label = GetNodeAttrString(node_def, kKernelAttr);
+
   const string key = Key(node_def.op(), device_type, label);
   auto regs = GlobalKernelRegistryTyped()->equal_range(key);
   for (auto iter = regs.first; iter != regs.second; ++iter) {
@@ -938,9 +926,17 @@ Status FindKernelRegistration(DeviceType device_type, const NodeDef& node_def,
   return Status::OK();
 }
 
+Status FindKernelRegistration(const DeviceType& device_type, const Node& node,
+                              const KernelRegistration** reg,
+                              bool* was_attr_mismatch) {
+  return FindKernelRegistration(device_type, node.def(), reg,
+                                was_attr_mismatch);
+}
+
 }  // namespace
 
-Status FindKernelDef(DeviceType device_type, const NodeDef& node_def,
+// TODO(irving): Change const NodeDef& to const Node&
+Status FindKernelDef(const DeviceType& device_type, const NodeDef& node_def,
                      const KernelDef** def, string* kernel_class_name) {
   const KernelRegistration* reg = nullptr;
   bool was_attr_mismatch;
@@ -1022,8 +1018,8 @@ std::unique_ptr<OpKernel> CreateOpKernel(
     DeviceType device_type, DeviceBase* device, Allocator* allocator,
     const NodeDef& node_def, int graph_def_version, Status* status) {
   OpKernel* kernel = nullptr;
-  *status = CreateOpKernel(device_type, device, allocator, nullptr, node_def,
-                           graph_def_version, &kernel);
+  *status = CreateOpKernel(std::move(device_type), device, allocator, nullptr,
+                           node_def, graph_def_version, &kernel);
   return std::unique_ptr<OpKernel>(kernel);
 }
 
diff --git a/tensorflow/core/framework/op_kernel.h b/tensorflow/core/framework/op_kernel.h
index 48bb69cb4e4f7dbe18cb0a172d2f189ff28f6198..465395d858c612e0c2eb12ae21a9558e9ca60c08 100644
--- a/tensorflow/core/framework/op_kernel.h
+++ b/tensorflow/core/framework/op_kernel.h
@@ -18,6 +18,7 @@ limitations under the License.
 
 #include <functional>
 
+#include <utility>
 #include <vector>
 #include "tensorflow/core/framework/allocator.h"
 #include "tensorflow/core/framework/cancellation.h"
@@ -227,7 +228,7 @@ class OpKernelConstruction {
                        const DataTypeSlice& output_types,
                        const MemoryTypeSlice& output_memory_types,
                        int graph_def_version, Status* status)
-      : device_type_(device_type),
+      : device_type_(std::move(device_type)),
         device_(device),
         allocator_(allocator),
         def_(node_def),
@@ -277,9 +278,6 @@ class OpKernelConstruction {
   // User-supplied configuration of this operation.
   const NodeDef& def() const { return *def_; }
 
-  // Op registered for this op type.
-  const OpDef& op_def() const { return *op_def_; }
-
   // For inspecting the inputs to this operation.
   int num_inputs() const { return input_types_.size(); }
   DataType input_type(int i) const { return input_types_[i]; }
@@ -353,6 +351,10 @@ class OpKernelConstruction {
   const int graph_def_version_;
   Status* status_;
 
+  // Allow op_def_ across from OpKernel, but not from subclasses.
+  // TODO(irving): Remove protos from this header entirely.
+  friend class OpKernel;
+
   TF_DISALLOW_COPY_AND_ASSIGN(OpKernelConstruction);
 };
 
@@ -432,6 +434,7 @@ class OpOutputList {
   OpOutputList& operator=(const OpOutputList& other) = default;
   Tensor* operator[](int i);
   bool required(int i) const;
+  DataType expected_output_dtype(int i) const;
   Status allocate(int i, const TensorShape& shape, Tensor** output);
   void set(int i, const Tensor& tensor);
   void set_ref(int i, mutex* mu, Tensor* tensor_for_ref);
@@ -1252,7 +1255,7 @@ void* GlobalKernelRegistry();
 // If node_def has a corresponding kernel registered on device_type,
 // returns OK and fill in the kernel def and kernel_class_name. <def> and
 // <kernel_class_name> may be null.
-Status FindKernelDef(DeviceType device_type, const NodeDef& node_def,
+Status FindKernelDef(const DeviceType& device_type, const NodeDef& node_def,
                      const KernelDef** def, string* kernel_class_name);
 
 // Writes a list of all registered kernels to LOG(INFO), to help users debug
@@ -1452,6 +1455,12 @@ inline bool OpOutputList::required(int i) const {
   return ctx_->output_required(start_ + i);
 }
 
+inline DataType OpOutputList::expected_output_dtype(int i) const {
+  DCHECK_GE(i, 0);
+  DCHECK_LT(i, stop_ - start_);
+  return ctx_->expected_output_dtype(start_ + i);
+}
+
 inline Status OpOutputList::allocate(int i, const TensorShape& shape,
                                      Tensor** output) {
   DCHECK_GE(i, 0);
diff --git a/tensorflow/core/framework/op_kernel_test.cc b/tensorflow/core/framework/op_kernel_test.cc
index 1c561899159e42269b55c81f3838325207623f03..e8e931b52e40f8440145856345c9bb7e314551d9 100644
--- a/tensorflow/core/framework/op_kernel_test.cc
+++ b/tensorflow/core/framework/op_kernel_test.cc
@@ -16,6 +16,7 @@ limitations under the License.
 #include "tensorflow/core/framework/op_kernel.h"
 
 #include <memory>
+#include <utility>
 #include <vector>
 #include "tensorflow/core/framework/allocator.h"
 #include "tensorflow/core/framework/attr_value_util.h"
@@ -133,8 +134,8 @@ class OpKernelTest : public ::testing::Test {
                      const DataTypeVector& outputs) {
     Status status;
     std::unique_ptr<OpKernel> op(CreateOpKernel(
-        device_type, &device_, cpu_allocator(), CreateNodeDef(op_type, inputs),
-        TF_GRAPH_DEF_VERSION, &status));
+        std::move(device_type), &device_, cpu_allocator(),
+        CreateNodeDef(op_type, inputs), TF_GRAPH_DEF_VERSION, &status));
     EXPECT_TRUE(status.ok()) << status;
     EXPECT_TRUE(op != nullptr);
     if (op != nullptr) {
@@ -148,9 +149,9 @@ class OpKernelTest : public ::testing::Test {
     NodeDef node_def;
     protobuf::TextFormat::ParseFromString(ascii_node_def, &node_def);
     Status status;
-    std::unique_ptr<OpKernel> op(CreateOpKernel(device_type, &device_,
-                                                cpu_allocator(), node_def,
-                                                TF_GRAPH_DEF_VERSION, &status));
+    std::unique_ptr<OpKernel> op(
+        CreateOpKernel(std::move(device_type), &device_, cpu_allocator(),
+                       node_def, TF_GRAPH_DEF_VERSION, &status));
     EXPECT_TRUE(op == nullptr);
     EXPECT_FALSE(status.ok());
     if (!status.ok()) {
@@ -384,7 +385,7 @@ class OpKernelBuilderTest : public ::testing::Test {
   }
 
   std::unique_ptr<OpKernel> ExpectSuccess(const string& op_type,
-                                          DeviceType device_type,
+                                          const DeviceType& device_type,
                                           const std::vector<string>& attrs,
                                           DataTypeSlice input_types = {}) {
     Status status;
@@ -423,7 +424,7 @@ class OpKernelBuilderTest : public ::testing::Test {
     return op;
   }
 
-  void ExpectFailure(const string& op_type, DeviceType device_type,
+  void ExpectFailure(const string& op_type, const DeviceType& device_type,
                      const std::vector<string>& attrs, error::Code code) {
     Status status;
     const NodeDef def = CreateNodeDef(op_type, attrs);
@@ -613,6 +614,36 @@ TEST_F(OpKernelBuilderTest, BadConstraint) {
                 error::INVALID_ARGUMENT);
 }
 
+REGISTER_OP("ListOut").Output("a: int32").Output("b: T").Attr("T: list(type)");
+REGISTER_KERNEL_BUILDER(Name("ListOut").Device(tensorflow::DEVICE_CPU),
+                        DummyKernel);
+
+TEST_F(OpKernelBuilderTest, OpOutputList) {
+  Env* env = Env::Default();
+  OpKernelContext::Params params;
+  params.record_tensor_accesses = false;
+  std::unique_ptr<DummyDevice> device(
+      new DummyDevice(env, params.record_tensor_accesses));
+  params.device = device.get();
+  Status status;
+  std::unique_ptr<OpKernel> op(CreateOpKernel(
+      DEVICE_CPU, params.device, cpu_allocator(),
+      CreateNodeDef("ListOut", {"T|list(type)|[DT_FLOAT, DT_INT32]"}),
+      TF_GRAPH_DEF_VERSION, &status));
+  EXPECT_TRUE(status.ok()) << status.ToString();
+  params.op_kernel = op.get();
+  gtl::InlinedVector<TensorValue, 4> inputs{};
+  params.inputs = &inputs;
+  std::unique_ptr<OpKernelContext> ctx(new OpKernelContext(&params));
+
+  EXPECT_EQ(DT_INT32, ctx->expected_output_dtype(0));
+  OpOutputList out_list;
+  EXPECT_FALSE(ctx->output_list("non_existent_output", &out_list).ok());
+  ASSERT_TRUE(ctx->output_list("b", &out_list).ok());
+  EXPECT_EQ(DT_FLOAT, out_list.expected_output_dtype(0));
+  EXPECT_EQ(DT_INT32, out_list.expected_output_dtype(1));
+}
+
 class GetAttrKernel : public ::tensorflow::OpKernel {
  public:
   explicit GetAttrKernel(OpKernelConstruction* context) : OpKernel(context) {
diff --git a/tensorflow/core/framework/resource_mgr.cc b/tensorflow/core/framework/resource_mgr.cc
index 7f9fe084ba4b3f1fb9ef6052f0ba7adbdcdd913f..ab7dd0c547545c546bdb313927d9aea27422af3a 100644
--- a/tensorflow/core/framework/resource_mgr.cc
+++ b/tensorflow/core/framework/resource_mgr.cc
@@ -246,6 +246,14 @@ ResourceHandle HandleFromInput(OpKernelContext* ctx, int input) {
   return ctx->input(input).flat<ResourceHandle>()(0);
 }
 
+Status HandleFromInput(OpKernelContext* ctx, StringPiece input,
+                       ResourceHandle* handle) {
+  const Tensor* tensor;
+  TF_RETURN_IF_ERROR(ctx->input(input, &tensor));
+  *handle = tensor->flat<ResourceHandle>()(0);
+  return Status::OK();
+}
+
 Status DeleteResource(OpKernelContext* ctx, const ResourceHandle& p) {
   TF_RETURN_IF_ERROR(internal::ValidateDevice(ctx, p));
   return ctx->resource_manager()->Delete(p);
diff --git a/tensorflow/core/framework/resource_mgr.h b/tensorflow/core/framework/resource_mgr.h
index fe6e09378fd1fa2dbdb52c138a54be7d8fb7b86b..26a5766569f608a1edb38a88e9e0096858a390bb 100644
--- a/tensorflow/core/framework/resource_mgr.h
+++ b/tensorflow/core/framework/resource_mgr.h
@@ -211,6 +211,8 @@ ResourceHandle MakePerStepResourceHandle(OpKernelContext* ctx,
 
 // Returns a resource handle from a numbered op input.
 ResourceHandle HandleFromInput(OpKernelContext* ctx, int input);
+Status HandleFromInput(OpKernelContext* ctx, StringPiece input,
+                       ResourceHandle* handle);
 
 // Create a resource pointed by a given resource handle.
 template <typename T>
diff --git a/tensorflow/core/framework/shape_inference.h b/tensorflow/core/framework/shape_inference.h
index e88f6dbb042ca656414d60f09b8581b0cc1a4470..d064a8ec4dc1b418822fc866b4f72d86cd82830f 100644
--- a/tensorflow/core/framework/shape_inference.h
+++ b/tensorflow/core/framework/shape_inference.h
@@ -184,13 +184,26 @@ class InferenceContext {
     }
 #ifndef NDEBUG
     for (int i = 0; i < num_outputs(); ++i) {
-      DCHECK(output(i).IsSet()) << i << " for " << node_def().name()
-                                << " of type " << node_def().op();
+      DCHECK(output(i).IsSet())
+          << i << " for " << node_def_.name() << " of type " << node_def_.op();
     }
 #endif  // NDEBUG
     return s;
   }
 
+  // Merge the stored shape of the input in position idx with the specified
+  // shape. This requires idx to be in the [0, num_inputs) range. If the merge
+  // is successful and the new shape differs from the old one, store the new
+  // shape and return true. Return false otherwise.
+  bool MergeInput(int idx, ShapeHandle shape) {
+    ShapeHandle new_shape;
+    if (!Merge(inputs_[idx], shape, &new_shape).ok() ||
+        inputs_[idx].SameHandle(new_shape)) {
+      return false;
+    }
+    inputs_[idx] = new_shape;
+    return true;
+  }
   ShapeHandle input(int64 idx) const { return inputs_[idx]; }
   Status input(StringPiece input_name, std::vector<ShapeHandle>* output) const;
   int num_inputs() const { return inputs_.size(); }
@@ -381,11 +394,6 @@ class InferenceContext {
   // the value.
   Status MakeDimForScalarInput(int idx, DimensionHandle* out);
 
-  // Returns the NodeDef. The returned reference does not outlive the
-  // InferenceContext, and it should not be used after InferenceContext is
-  // destroyed.
-  const NodeDef& node_def() { return node_def_; }
-
   // Look up the attr for the NodeDef being evaluated with name attr_name and
   // set *value to its value.  If no attr with attr_name is found in def(), or
   // the attr does not have a matching type, a non-ok status will be returned.
@@ -430,15 +438,65 @@ class InferenceContext {
   // and dtypes of tensors which can be accessed via the handle. These methods
   // propagate that information. Output handle dtypes and shapes are ignored if
   // the output tensor is not of type DT_RESOURCE.
+
+  // Merge the stored shape corresponding to the input handle in position idx
+  // with the specified shape. This requires idx to be in the [0, num_inputs)
+  // range. If the merge is successful and the new shape differs from the old
+  // one, store the new shape and return true. Return false otherwise.
+  bool MergeInputHandleShape(int idx, ShapeHandle shape) {
+    ShapeHandle new_shape;
+    if (!Merge(input_handle_shape_[idx], shape, &new_shape).ok() ||
+        input_handle_shape_[idx].SameHandle(new_shape)) {
+      return false;
+    }
+    input_handle_shape_[idx] = shape;
+    return true;
+  }
+
+  // Set the type corresponding to the resource in position idx. This requires
+  // idx to be in the [0, num_inputs) range. Returns true iff the stored type
+  // has been updated.
+  bool set_input_handle_dtype(int idx, DataType dtype) {
+    if (input_handle_dtype_[idx] != dtype) {
+      input_handle_dtype_[idx] = dtype;
+      return true;
+    }
+    return false;
+  }
   ShapeHandle input_handle_shape(int idx);
   DataType input_handle_dtype(int idx) const {
     return input_handle_dtype_[idx];
   }
+
+  // Merge the stored shape corresponding to the output handle in position idx
+  // with the specified shape. This requires idx to be in the [0, num_outputs)
+  // range. If the merge is successful and the new shape differs from the old
+  // one, store the new shape and return true. Return false otherwise.
+
+  bool MergeOutputHandleShape(int idx, ShapeHandle shape) {
+    ShapeHandle new_shape;
+    if (!Merge(output_handle_shape_[idx], shape, &new_shape).ok() ||
+        output_handle_shape_[idx].SameHandle(new_shape)) {
+      return false;
+    }
+    output_handle_shape_[idx] = shape;
+    return true;
+  }
+  // Overwrite the shape corresponding to the output handle in position idx with
+  // the specified shape.
   void set_output_handle_shape(int idx, ShapeHandle shape) {
     output_handle_shape_[idx] = shape;
   }
-  void set_output_handle_dtype(int idx, DataType dtype) {
-    output_handle_dtype_[idx] = dtype;
+
+  // Set the type corresponding to the resource in position idx. This requires
+  // idx to be in the [0, num_outputs) range. Returns true iff the stored type
+  // has been updated.
+  bool set_output_handle_dtype(int idx, DataType dtype) {
+    if (output_handle_dtype_[idx] != dtype) {
+      output_handle_dtype_[idx] = dtype;
+      return true;
+    }
+    return false;
   }
   ShapeHandle output_handle_shape(int idx) const {
     return output_handle_shape_[idx];
diff --git a/tensorflow/core/framework/shape_inference_test.cc b/tensorflow/core/framework/shape_inference_test.cc
index c82b506e4b939c42117b56d520484d4edaf24534..78d1fc0fc5e8bfa7235fb9b90cbaa39270219fd3 100644
--- a/tensorflow/core/framework/shape_inference_test.cc
+++ b/tensorflow/core/framework/shape_inference_test.cc
@@ -558,6 +558,11 @@ TEST_F(ShapeInferenceTest, MergeShape) {
   EXPECT_TRUE(SameHandle(c.Dim(s_1_u, 0), c.Dim(out, 0)));
   EXPECT_TRUE(SameHandle(c.Dim(s_u_2, 1), c.Dim(out, 1)));
 
+  auto s_u1 = c.UnknownShapeOfRank(1);
+  auto s_u2 = c.UnknownShapeOfRank(1);
+  TF_EXPECT_OK(c.Merge(s_u1, s_u2, &out));
+  EXPECT_TRUE(SameHandle(s_u1, out));
+
   // Incompatible merges give errors and set out to nullptr.
   out = s_unknown;
   EXPECT_TRUE(
diff --git a/tensorflow/core/framework/tensor.cc b/tensorflow/core/framework/tensor.cc
index ecb9810d83c9577ca89e12e30167b0d8f4c78f5b..d049da1c9d5ce16506527388d9c42086db9dcec2 100644
--- a/tensorflow/core/framework/tensor.cc
+++ b/tensorflow/core/framework/tensor.cc
@@ -902,42 +902,27 @@ void Tensor::FillDescription(TensorDescription* description) const {
 }
 
 gtl::InlinedVector<int64, 4> Tensor::ComputeFlatInnerDims(
-    int64 num_out_dims) const {
-  if (num_out_dims == dims()) {
-    return shape_.dim_sizes();
-  }
+    gtl::ArraySlice<int64> orig, int64 num_out_dims) {
   gtl::InlinedVector<int64, 4> out_dims(num_out_dims, 0);
-  const int64 num_elements = NumElements();
-  int64 prod_out_dims = 1;
-  for (int64 out_dim = num_out_dims - 1; out_dim > 0; --out_dim) {
-    const int64 in_dim = out_dim + (dims() - num_out_dims);
-    out_dims[out_dim] = (in_dim >= dims() || in_dim < 0) ? 1 : dim_size(in_dim);
-    prod_out_dims *= out_dims[out_dim];
-  }
-  if (prod_out_dims != 0) {
-    out_dims[0] = num_elements / prod_out_dims;
-  } else {
-    out_dims[0] = 0;
+  int64 offset = orig.size() - num_out_dims;
+  for (int64 out_dim = num_out_dims - 1; out_dim >= 0; --out_dim) {
+    const int64 in_dim = out_dim + offset;
+    out_dims[out_dim] = in_dim < 0 ? 1 : orig[in_dim];
+  }
+  for (int64 in_dim = 0; in_dim < offset; ++in_dim) {
+    out_dims[0] *= orig[in_dim];
   }
   return out_dims;
 }
 
 gtl::InlinedVector<int64, 4> Tensor::ComputeFlatOuterDims(
-    int64 num_out_dims) const {
-  if (num_out_dims == dims()) {
-    return shape_.dim_sizes();
-  }
+    gtl::ArraySlice<int64> orig, int64 num_out_dims) {
   gtl::InlinedVector<int64, 4> out_dims(num_out_dims, 0);
-  const int64 num_elements = NumElements();
-  int64 prod_out_dims = 1;
-  for (int64 out_dim = 0; out_dim < num_out_dims - 1; ++out_dim) {
-    out_dims[out_dim] = out_dim >= dims() ? 1 : dim_size(out_dim);
-    prod_out_dims *= out_dims[out_dim];
-  }
-  if (prod_out_dims != 0) {
-    out_dims[num_out_dims - 1] = num_elements / prod_out_dims;
-  } else {
-    out_dims[num_out_dims - 1] = 0;
+  for (int64 out_dim = 0; out_dim <= num_out_dims - 1; ++out_dim) {
+    out_dims[out_dim] = out_dim >= orig.size() ? 1 : orig[out_dim];
+  }
+  for (int64 in_dim = num_out_dims; in_dim < orig.size(); ++in_dim) {
+    out_dims[num_out_dims - 1] *= orig[in_dim];
   }
   return out_dims;
 }
diff --git a/tensorflow/core/framework/tensor.h b/tensorflow/core/framework/tensor.h
index 103da4c1b373076d35189f9462171f3345b82e1a..5810970a38adf2786d27eab34adb94fbd77735d8 100644
--- a/tensorflow/core/framework/tensor.h
+++ b/tensorflow/core/framework/tensor.h
@@ -103,9 +103,9 @@ class Tensor {
   /// Copy constructor.
   Tensor(const Tensor& other);
 
-  /// \brief Move constructor. After this call, <other> is safely destructible and can
-  /// be assigned to, but other calls on it (e.g. shape manipulation) are not
-  /// valid.
+  /// \brief Move constructor. After this call, <other> is safely destructible
+  /// and can be assigned to, but other calls on it (e.g. shape manipulation)
+  /// are not valid.
   Tensor(Tensor&& other);
 
   ~Tensor();
@@ -304,6 +304,15 @@ class Tensor {
   template <typename T, size_t NDIMS = 2>
   typename TTypes<T, NDIMS>::Tensor flat_outer_dims();
 
+  /// Returns the data as an Eigen::Tensor with NDIMS dimensions, collapsing the
+  /// first 'begin' Tensor dimensions into the first dimension of the result and
+  /// the Tensor dimensions of the last dims() - 'begin' - NDIMS into the last
+  /// dimension of the result. If 'begin' < 0 then the the |'begin'| leading
+  /// dimensions of size 1 will be added. If 'begin' + NDIMS > dims() then
+  /// 'begin' + NDIMS - dims() trailing dimensions of size 1 will be added.
+  template <typename T, size_t NDIMS = 3>
+  typename TTypes<T, NDIMS>::Tensor flat_inner_outer_dims(int64 begin);
+
   template <typename T, size_t NDIMS>
   typename TTypes<T, NDIMS>::Tensor shaped(gtl::ArraySlice<int64> new_sizes);
 
@@ -386,6 +395,9 @@ class Tensor {
   template <typename T, size_t NDIMS = 2>
   typename TTypes<T, NDIMS>::ConstTensor flat_outer_dims() const;
 
+  template <typename T, size_t NDIMS = 3>
+  typename TTypes<T, NDIMS>::Tensor flat_inner_outer_dims(int64 begin) const;
+
   /// Render the first `max_entries` values in `*this` into a string.
   string SummarizeValue(int64 max_entries) const;
 
@@ -429,10 +441,11 @@ class Tensor {
       gtl::ArraySlice<int64> new_sizes,
       Eigen::array<Eigen::DenseIndex, NDIMS>* dims) const;
 
-  // TODO(rmlarsen): These shouldn't hardcode '4' so that it lines up with
   // TensorShape's InlineVector.
-  gtl::InlinedVector<int64, 4> ComputeFlatInnerDims(int64 num_out_dims) const;
-  gtl::InlinedVector<int64, 4> ComputeFlatOuterDims(int64 num_out_dims) const;
+  static gtl::InlinedVector<int64, 4> ComputeFlatInnerDims(
+      gtl::ArraySlice<int64> orig, int64 num_out_dims);
+  static gtl::InlinedVector<int64, 4> ComputeFlatOuterDims(
+      gtl::ArraySlice<int64> orig, int64 num_out_dims);
 
   TensorShape shape_;
   TensorBuffer* buf_;
@@ -529,7 +542,6 @@ typename TTypes<T, NDIMS>::ConstTensor Tensor::tensor() const {
 template <typename T, size_t NDIMS>
 typename TTypes<T, NDIMS>::Tensor Tensor::bit_casted_tensor() {
   CHECK(IsAligned());
-  ;
   return typename TTypes<T, NDIMS>::Tensor(base<T>(),
                                            shape().AsEigenDSizes<NDIMS>());
 }
@@ -537,7 +549,6 @@ typename TTypes<T, NDIMS>::Tensor Tensor::bit_casted_tensor() {
 template <typename T, size_t NDIMS>
 typename TTypes<T, NDIMS>::ConstTensor Tensor::bit_casted_tensor() const {
   CHECK(IsAligned());
-  ;
   return typename TTypes<T, NDIMS>::ConstTensor(base<const T>(),
                                                 shape().AsEigenDSizes<NDIMS>());
 }
@@ -568,7 +579,6 @@ template <typename T, size_t NDIMS>
 typename TTypes<T, NDIMS>::Tensor Tensor::bit_casted_shaped(
     gtl::ArraySlice<int64> new_sizes) {
   CHECK(IsAligned());
-  ;
   Eigen::array<Eigen::DenseIndex, NDIMS> dims;
   FillDimsAndValidateCompatibleShape<NDIMS>(new_sizes, &dims);
   return typename TTypes<T, NDIMS>::Tensor(base<T>(), dims);
@@ -609,7 +619,6 @@ template <typename T, size_t NDIMS>
 typename TTypes<T, NDIMS>::ConstTensor Tensor::bit_casted_shaped(
     gtl::ArraySlice<int64> new_sizes) const {
   CHECK(IsAligned());
-  ;
   Eigen::array<Eigen::DenseIndex, NDIMS> dims;
   FillDimsAndValidateCompatibleShape(&dims, new_sizes);
   return typename TTypes<T, NDIMS>::ConstTensor(base<T>(), dims);
@@ -638,22 +647,36 @@ typename TTypes<T>::ConstScalar Tensor::scalar() const {
 
 template <typename T, size_t NDIMS>
 typename TTypes<T, NDIMS>::Tensor Tensor::flat_inner_dims() {
-  return shaped<T, NDIMS>(ComputeFlatInnerDims(NDIMS));
+  return shaped<T, NDIMS>(ComputeFlatInnerDims(shape_.dim_sizes(), NDIMS));
 }
 
 template <typename T, size_t NDIMS>
 typename TTypes<T, NDIMS>::Tensor Tensor::flat_outer_dims() {
-  return shaped<T, NDIMS>(ComputeFlatOuterDims(NDIMS));
+  return shaped<T, NDIMS>(ComputeFlatOuterDims(shape_.dim_sizes(), NDIMS));
+}
+
+template <typename T, size_t NDIMS>
+typename TTypes<T, NDIMS>::Tensor Tensor::flat_inner_outer_dims(int64 begin) {
+  gtl::InlinedVector<int64,4> flat_outer = ComputeFlatOuterDims(
+      shape_.dim_sizes(), begin + NDIMS);
+  return shaped<T, NDIMS>(ComputeFlatInnerDims(flat_outer, NDIMS));
 }
 
 template <typename T, size_t NDIMS>
 typename TTypes<T, NDIMS>::ConstTensor Tensor::flat_inner_dims() const {
-  return shaped<T, NDIMS>(ComputeFlatInnerDims(NDIMS));
+  return shaped<T, NDIMS>(ComputeFlatInnerDims(shape_.dim_sizes(), NDIMS));
 }
 
 template <typename T, size_t NDIMS>
 typename TTypes<T, NDIMS>::ConstTensor Tensor::flat_outer_dims() const {
-  return shaped<T, NDIMS>(ComputeFlatOuterDims(NDIMS));
+  return shaped<T, NDIMS>(ComputeFlatOuterDims(shape_.dim_sizes(), NDIMS));
+}
+
+template <typename T, size_t NDIMS>
+typename TTypes<T, NDIMS>::Tensor Tensor::flat_inner_outer_dims(int64 begin) const {
+  gtl::InlinedVector<int64,4> flat_outer = ComputeFlatOuterDims(
+      shape_.dim_sizes(), begin + NDIMS);
+  return shaped<T, NDIMS>(ComputeFlatInnerDims(flat_outer, NDIMS));
 }
 
 inline Tensor::Tensor(const Tensor& other)
diff --git a/tensorflow/core/framework/tensor_test.cc b/tensorflow/core/framework/tensor_test.cc
index c907bbb69fe418d898b8404e18582b0620c8f540..2626402ccd5b8fb563630c282a64ab26bef8afd7 100644
--- a/tensorflow/core/framework/tensor_test.cc
+++ b/tensorflow/core/framework/tensor_test.cc
@@ -202,11 +202,19 @@ TEST(Tensor_QInt32, Simple) {
   TestCopies<qint32>(t);
 }
 
-TEST(Tensor_Float, Reshape) {
-  Tensor t(DT_FLOAT, TensorShape({2, 3, 4, 5}));
-  EXPECT_TRUE(t.shape().IsSameSize(TensorShape({2, 3, 4, 5})));
+class TensorReshapeTest : public ::testing::Test {
+ protected:
+  Tensor t;
+  Tensor zero_t;
+
+  TensorReshapeTest()
+      : t(DT_FLOAT, TensorShape({2, 3, 4, 5})),
+        zero_t(DT_FLOAT, TensorShape({3, 0, 2, 0, 5})) {}
+
+  virtual void SetUp() {
+    EXPECT_TRUE(t.shape().IsSameSize(TensorShape({2, 3, 4, 5})));
+    EXPECT_TRUE(zero_t.shape().IsSameSize(TensorShape({3, 0, 2, 0, 5})));
 
-  {
     auto tensor = t.tensor<float, 4>();
     EXPECT_EQ(2, tensor.dimension(0));
     EXPECT_EQ(3, tensor.dimension(1));
@@ -217,6 +225,10 @@ TEST(Tensor_Float, Reshape) {
     tensor(0, 0, 0, 0) = 0.01f;
     tensor(1, 2, 3, 4) = 0.02f;
   }
+};
+
+TEST_F(TensorReshapeTest, Reshape) {
+  LOG(INFO) << "shaped";
   {
     auto shaped = t.shaped<float, 1>({120});
     EXPECT_EQ(120, shaped.dimension(0));
@@ -248,6 +260,10 @@ TEST(Tensor_Float, Reshape) {
     EXPECT_EQ(shaped(0, 0, 0, 0), 0.01f);
     EXPECT_EQ(shaped(1, 2, 3, 4), 0.02f);
   }
+}
+
+TEST_F(TensorReshapeTest, Flat) {
+  LOG(INFO) << "flat";
   {
     auto flat = t.flat<float>();
     EXPECT_EQ(flat(0), 0.01f);
@@ -255,6 +271,10 @@ TEST(Tensor_Float, Reshape) {
     EXPECT_EQ(flat(0), 0.01f);
     EXPECT_EQ(flat(119), 0.02f);
   }
+}
+
+TEST_F(TensorReshapeTest, FlatInnerDims) {
+  LOG(INFO) << "flat_inner_dims";
   {
     auto flat_inner_dims = t.flat_inner_dims<float>();
     EXPECT_EQ(24, flat_inner_dims.dimension(0));
@@ -262,13 +282,6 @@ TEST(Tensor_Float, Reshape) {
     EXPECT_EQ(flat_inner_dims(0, 0), 0.01f);
     EXPECT_EQ(flat_inner_dims(23, 4), 0.02f);
   }
-  {
-    auto flat_outer_dims = t.flat_outer_dims<float>();
-    EXPECT_EQ(2, flat_outer_dims.dimension(0));
-    EXPECT_EQ(60, flat_outer_dims.dimension(1));
-    EXPECT_EQ(flat_outer_dims(0, 0), 0.01f);
-    EXPECT_EQ(flat_outer_dims(1, 59), 0.02f);
-  }
   {
     auto flat_inner_dims = t.flat_inner_dims<float, 3>();
     EXPECT_EQ(6, flat_inner_dims.dimension(0));
@@ -277,14 +290,6 @@ TEST(Tensor_Float, Reshape) {
     EXPECT_EQ(flat_inner_dims(0, 0, 0), 0.01f);
     EXPECT_EQ(flat_inner_dims(5, 3, 4), 0.02f);
   }
-  {
-    auto flat_outer_dims = t.flat_outer_dims<float, 3>();
-    EXPECT_EQ(2, flat_outer_dims.dimension(0));
-    EXPECT_EQ(3, flat_outer_dims.dimension(1));
-    EXPECT_EQ(20, flat_outer_dims.dimension(2));
-    EXPECT_EQ(flat_outer_dims(0, 0, 0), 0.01f);
-    EXPECT_EQ(flat_outer_dims(1, 2, 19), 0.02f);
-  }
   {
     auto flat_inner_dims = t.flat_inner_dims<float, 5>();
     EXPECT_EQ(1, flat_inner_dims.dimension(0));
@@ -295,6 +300,44 @@ TEST(Tensor_Float, Reshape) {
     EXPECT_EQ(flat_inner_dims(0, 0, 0, 0, 0), 0.01f);
     EXPECT_EQ(flat_inner_dims(0, 1, 2, 3, 4), 0.02f);
   }
+  {
+    auto flat_inner_dims = zero_t.flat_inner_dims<float>();
+    EXPECT_EQ(0, flat_inner_dims.dimension(0));
+    EXPECT_EQ(5, flat_inner_dims.dimension(1));
+  }
+  {
+    auto flat_inner_dims = zero_t.flat_inner_dims<float, 3>();
+    EXPECT_EQ(0, flat_inner_dims.dimension(0));
+    EXPECT_EQ(0, flat_inner_dims.dimension(1));
+    EXPECT_EQ(5, flat_inner_dims.dimension(2));
+  }
+  {
+    auto flat_inner_dims = zero_t.flat_inner_dims<float, 5>();
+    EXPECT_EQ(3, flat_inner_dims.dimension(0));
+    EXPECT_EQ(0, flat_inner_dims.dimension(1));
+    EXPECT_EQ(2, flat_inner_dims.dimension(2));
+    EXPECT_EQ(0, flat_inner_dims.dimension(3));
+    EXPECT_EQ(5, flat_inner_dims.dimension(4));
+  }
+}
+
+TEST_F(TensorReshapeTest, FlatOuterDims) {
+  LOG(INFO) << "flat_outer_dims";
+  {
+    auto flat_outer_dims = t.flat_outer_dims<float>();
+    EXPECT_EQ(2, flat_outer_dims.dimension(0));
+    EXPECT_EQ(60, flat_outer_dims.dimension(1));
+    EXPECT_EQ(flat_outer_dims(0, 0), 0.01f);
+    EXPECT_EQ(flat_outer_dims(1, 59), 0.02f);
+  }
+  {
+    auto flat_outer_dims = t.flat_outer_dims<float, 3>();
+    EXPECT_EQ(2, flat_outer_dims.dimension(0));
+    EXPECT_EQ(3, flat_outer_dims.dimension(1));
+    EXPECT_EQ(20, flat_outer_dims.dimension(2));
+    EXPECT_EQ(flat_outer_dims(0, 0, 0), 0.01f);
+    EXPECT_EQ(flat_outer_dims(1, 2, 19), 0.02f);
+  }
   {
     auto flat_outer_dims = t.flat_outer_dims<float, 5>();
     EXPECT_EQ(2, flat_outer_dims.dimension(0));
@@ -305,8 +348,6 @@ TEST(Tensor_Float, Reshape) {
     EXPECT_EQ(flat_outer_dims(0, 0, 0, 0, 0), 0.01f);
     EXPECT_EQ(flat_outer_dims(1, 2, 3, 4, 0), 0.02f);
   }
-
-  Tensor zero_t(DT_FLOAT, TensorShape({3, 0, 2, 0, 5}));
   {
     auto flat_outer_dims = zero_t.flat_outer_dims<float>();
     EXPECT_EQ(3, flat_outer_dims.dimension(0));
@@ -326,24 +367,132 @@ TEST(Tensor_Float, Reshape) {
     EXPECT_EQ(0, flat_outer_dims.dimension(3));
     EXPECT_EQ(5, flat_outer_dims.dimension(4));
   }
+}
+
+TEST_F(TensorReshapeTest, FlatInnerOuterDims) {
+  LOG(INFO) << "flat_inner_outer_dims";
   {
-    auto flat_inner_dims = zero_t.flat_inner_dims<float>();
-    EXPECT_EQ(0, flat_inner_dims.dimension(0));
-    EXPECT_EQ(5, flat_inner_dims.dimension(1));
+    auto flat_inner_outer_dims = t.flat_inner_outer_dims<float, 4>(0);
+    EXPECT_EQ(2, flat_inner_outer_dims.dimension(0));
+    EXPECT_EQ(3, flat_inner_outer_dims.dimension(1));
+    EXPECT_EQ(4, flat_inner_outer_dims.dimension(2));
+    EXPECT_EQ(5, flat_inner_outer_dims.dimension(3));
+    EXPECT_EQ(flat_inner_outer_dims(0, 0, 0, 0), 0.01f);
+    EXPECT_EQ(flat_inner_outer_dims(1, 2, 3, 4), 0.02f);
   }
   {
-    auto flat_inner_dims = zero_t.flat_inner_dims<float, 3>();
-    EXPECT_EQ(0, flat_inner_dims.dimension(0));
-    EXPECT_EQ(0, flat_inner_dims.dimension(1));
-    EXPECT_EQ(5, flat_inner_dims.dimension(2));
+    auto flat_inner_outer_dims = t.flat_inner_outer_dims<float, 6>(-2);
+    EXPECT_EQ(1, flat_inner_outer_dims.dimension(0));
+    EXPECT_EQ(1, flat_inner_outer_dims.dimension(1));
+    EXPECT_EQ(2, flat_inner_outer_dims.dimension(2));
+    EXPECT_EQ(3, flat_inner_outer_dims.dimension(3));
+    EXPECT_EQ(4, flat_inner_outer_dims.dimension(4));
+    EXPECT_EQ(5, flat_inner_outer_dims.dimension(5));
+    EXPECT_EQ(flat_inner_outer_dims(0, 0, 0, 0, 0, 0), 0.01f);
+    EXPECT_EQ(flat_inner_outer_dims(0, 0, 1, 2, 3, 4), 0.02f);
   }
   {
-    auto flat_inner_dims = zero_t.flat_inner_dims<float, 5>();
-    EXPECT_EQ(3, flat_inner_dims.dimension(0));
-    EXPECT_EQ(0, flat_inner_dims.dimension(1));
-    EXPECT_EQ(2, flat_inner_dims.dimension(2));
-    EXPECT_EQ(0, flat_inner_dims.dimension(3));
-    EXPECT_EQ(5, flat_inner_dims.dimension(4));
+    auto flat_inner_outer_dims = t.flat_inner_outer_dims<float, 6>(0);
+    EXPECT_EQ(2, flat_inner_outer_dims.dimension(0));
+    EXPECT_EQ(3, flat_inner_outer_dims.dimension(1));
+    EXPECT_EQ(4, flat_inner_outer_dims.dimension(2));
+    EXPECT_EQ(5, flat_inner_outer_dims.dimension(3));
+    EXPECT_EQ(1, flat_inner_outer_dims.dimension(4));
+    EXPECT_EQ(1, flat_inner_outer_dims.dimension(5));
+    EXPECT_EQ(flat_inner_outer_dims(0, 0, 0, 0, 0, 0), 0.01f);
+    EXPECT_EQ(flat_inner_outer_dims(1, 2, 3, 4, 0, 0), 0.02f);
+  }
+  {
+    auto flat_inner_outer_dims = t.flat_inner_outer_dims<float, 8>(-2);
+    EXPECT_EQ(1, flat_inner_outer_dims.dimension(0));
+    EXPECT_EQ(1, flat_inner_outer_dims.dimension(1));
+    EXPECT_EQ(2, flat_inner_outer_dims.dimension(2));
+    EXPECT_EQ(3, flat_inner_outer_dims.dimension(3));
+    EXPECT_EQ(4, flat_inner_outer_dims.dimension(4));
+    EXPECT_EQ(5, flat_inner_outer_dims.dimension(5));
+    EXPECT_EQ(1, flat_inner_outer_dims.dimension(6));
+    EXPECT_EQ(1, flat_inner_outer_dims.dimension(7));
+    EXPECT_EQ(flat_inner_outer_dims(0, 0, 0, 0, 0, 0, 0, 0), 0.01f);
+    EXPECT_EQ(flat_inner_outer_dims(0, 0, 1, 2, 3, 4, 0, 0), 0.02f);
+  }
+  {
+    auto flat_inner_outer_dims = t.flat_inner_outer_dims<float, 3>(1);
+    EXPECT_EQ(6, flat_inner_outer_dims.dimension(0));
+    EXPECT_EQ(4, flat_inner_outer_dims.dimension(1));
+    EXPECT_EQ(5, flat_inner_outer_dims.dimension(2));
+    EXPECT_EQ(flat_inner_outer_dims(0, 0, 0), 0.01f);
+    EXPECT_EQ(flat_inner_outer_dims(5, 3, 4), 0.02f);
+  }
+  {
+    auto flat_inner_outer_dims = t.flat_inner_outer_dims<float, 5>(1);
+    EXPECT_EQ(6, flat_inner_outer_dims.dimension(0));
+    EXPECT_EQ(4, flat_inner_outer_dims.dimension(1));
+    EXPECT_EQ(5, flat_inner_outer_dims.dimension(2));
+    EXPECT_EQ(1, flat_inner_outer_dims.dimension(3));
+    EXPECT_EQ(1, flat_inner_outer_dims.dimension(4));
+    EXPECT_EQ(flat_inner_outer_dims(0, 0, 0, 0, 0), 0.01f);
+    EXPECT_EQ(flat_inner_outer_dims(5, 3, 4, 0, 0), 0.02f);
+  }
+  {
+    auto flat_inner_outer_dims = t.flat_inner_outer_dims<float, 3>(0);
+    EXPECT_EQ(2, flat_inner_outer_dims.dimension(0));
+    EXPECT_EQ(3, flat_inner_outer_dims.dimension(1));
+    EXPECT_EQ(20, flat_inner_outer_dims.dimension(2));
+    EXPECT_EQ(flat_inner_outer_dims(0, 0, 0), 0.01f);
+    EXPECT_EQ(flat_inner_outer_dims(1, 2, 19), 0.02f);
+  }
+  {
+    auto flat_inner_outer_dims = t.flat_inner_outer_dims<float, 5>(-2);
+    EXPECT_EQ(1, flat_inner_outer_dims.dimension(0));
+    EXPECT_EQ(1, flat_inner_outer_dims.dimension(1));
+    EXPECT_EQ(2, flat_inner_outer_dims.dimension(2));
+    EXPECT_EQ(3, flat_inner_outer_dims.dimension(3));
+    EXPECT_EQ(20, flat_inner_outer_dims.dimension(4));
+    EXPECT_EQ(flat_inner_outer_dims(0, 0, 0, 0, 0), 0.01f);
+    EXPECT_EQ(flat_inner_outer_dims(0, 0, 1, 2, 19), 0.02f);
+  }
+  {
+    auto flat_inner_outer_dims = t.flat_inner_outer_dims<float, 2>(1);
+    EXPECT_EQ(6, flat_inner_outer_dims.dimension(0));
+    EXPECT_EQ(20, flat_inner_outer_dims.dimension(1));
+    EXPECT_EQ(flat_inner_outer_dims(0, 0), 0.01f);
+    EXPECT_EQ(flat_inner_outer_dims(5, 19), 0.02f);
+  }
+  {
+    auto flat_inner_outer_dims = zero_t.flat_inner_outer_dims<float, 2>(0);
+    EXPECT_EQ(3, flat_inner_outer_dims.dimension(0));
+    EXPECT_EQ(0, flat_inner_outer_dims.dimension(1));
+  }
+  {
+    auto flat_inner_outer_dims = zero_t.flat_inner_outer_dims<float, 3>(0);
+    EXPECT_EQ(3, flat_inner_outer_dims.dimension(0));
+    EXPECT_EQ(0, flat_inner_outer_dims.dimension(1));
+    EXPECT_EQ(0, flat_inner_outer_dims.dimension(2));
+  }
+  {
+    auto flat_inner_outer_dims = zero_t.flat_inner_outer_dims<float, 5>(0);
+    EXPECT_EQ(3, flat_inner_outer_dims.dimension(0));
+    EXPECT_EQ(0, flat_inner_outer_dims.dimension(1));
+    EXPECT_EQ(2, flat_inner_outer_dims.dimension(2));
+    EXPECT_EQ(0, flat_inner_outer_dims.dimension(3));
+    EXPECT_EQ(5, flat_inner_outer_dims.dimension(4));
+  }
+  {
+    auto flat_inner_outer_dims = zero_t.flat_inner_outer_dims<float, 2>(3);
+    EXPECT_EQ(0, flat_inner_outer_dims.dimension(0));
+    EXPECT_EQ(5, flat_inner_outer_dims.dimension(1));
+  }
+  {
+    auto flat_inner_outer_dims = zero_t.flat_inner_outer_dims<float, 3>(2);
+    EXPECT_EQ(0, flat_inner_outer_dims.dimension(0));
+    EXPECT_EQ(0, flat_inner_outer_dims.dimension(1));
+    EXPECT_EQ(5, flat_inner_outer_dims.dimension(2));
+  }
+  {
+    auto flat_inner_outer_dims = zero_t.flat_inner_outer_dims<float, 3>(1);
+    EXPECT_EQ(0, flat_inner_outer_dims.dimension(0));
+    EXPECT_EQ(2, flat_inner_outer_dims.dimension(1));
+    EXPECT_EQ(0, flat_inner_outer_dims.dimension(2));
   }
 }
 
@@ -810,7 +959,8 @@ TEST(Tensor, Slice_Basic) {
 
 namespace {
 template <typename T>
-Tensor MkTensor(DataType dt, TensorShape shape, std::vector<T> init_values) {
+Tensor MkTensor(DataType dt, const TensorShape& shape,
+                std::vector<T> init_values) {
   Tensor x(dt, shape);
   const int limit = x.NumElements();
   int vi = 0;
diff --git a/tensorflow/core/framework/types.cc b/tensorflow/core/framework/types.cc
index a374f848a1789ba90bca5bb54a437202a15c3089..dc396e468ae8ebfc357b95ff6419b20d3ac3b5ff 100644
--- a/tensorflow/core/framework/types.cc
+++ b/tensorflow/core/framework/types.cc
@@ -169,7 +169,9 @@ bool DataTypeFromString(StringPiece sp, DataType* dt) {
   return false;
 }
 
-string DeviceTypeString(DeviceType device_type) { return device_type.type(); }
+string DeviceTypeString(const DeviceType& device_type) {
+  return device_type.type();
+}
 
 string DataTypeSliceString(const DataTypeSlice types) {
   string out;
diff --git a/tensorflow/core/framework/types.h b/tensorflow/core/framework/types.h
index 932d788f230bccd434316146fb8d9ce69cd0eb62..0a81b1cb9f300a1734ddb7cd2e80fb1077d45d52 100644
--- a/tensorflow/core/framework/types.h
+++ b/tensorflow/core/framework/types.h
@@ -82,7 +82,7 @@ typedef gtl::InlinedVector<DeviceType, 4> DeviceTypeVector;
 
 // Convert the enums to strings for errors:
 string DataTypeString(DataType dtype);
-string DeviceTypeString(DeviceType device_type);
+string DeviceTypeString(const DeviceType& device_type);
 string DataTypeSliceString(const DataTypeSlice dtypes);
 inline string DataTypeVectorString(const DataTypeVector& dtypes) {
   return DataTypeSliceString(dtypes);
diff --git a/tensorflow/core/graph/algorithm.cc b/tensorflow/core/graph/algorithm.cc
index 38f011ecaf1308f88d89a294bac456309e0614b3..3bfba3fc4ee8fd02abf3adacfbbe81f437cfc443 100644
--- a/tensorflow/core/graph/algorithm.cc
+++ b/tensorflow/core/graph/algorithm.cc
@@ -23,8 +23,8 @@ limitations under the License.
 
 namespace tensorflow {
 
-void DFS(const Graph& g, std::function<void(Node*)> enter,
-         std::function<void(Node*)> leave) {
+void DFS(const Graph& g, const std::function<void(Node*)>& enter,
+         const std::function<void(Node*)>& leave) {
   // Stack of work to do.
   struct Work {
     Node* node;
@@ -61,15 +61,23 @@ void DFS(const Graph& g, std::function<void(Node*)> enter,
   }
 }
 
-void ReverseDFS(const Graph& g, std::function<void(Node*)> enter,
-                std::function<void(Node*)> leave) {
+void ReverseDFS(const Graph& g, const std::function<void(Node*)>& enter,
+                const std::function<void(Node*)>& leave) {
+  ReverseDFSFrom(g, {g.sink_node()}, enter, leave);
+}
+
+void ReverseDFSFrom(const Graph& g, gtl::ArraySlice<Node*> start,
+                    const std::function<void(Node*)>& enter,
+                    const std::function<void(Node*)>& leave) {
   // Stack of work to do.
   struct Work {
     Node* node;
     bool leave;  // Are we entering or leaving n?
   };
-  std::vector<Work> stack;
-  stack.push_back(Work{g.sink_node(), false});
+  std::vector<Work> stack(start.size());
+  for (int i = 0; i < start.size(); ++i) {
+    stack[i] = Work{start[i], false};
+  }
 
   std::vector<bool> visited(g.num_node_ids(), false);
   while (!stack.empty()) {
diff --git a/tensorflow/core/graph/algorithm.h b/tensorflow/core/graph/algorithm.h
index 74aace8072270f3daa6fa03dac0bad5d5c86fd62..01d36e0a12403c6fc9b3db0d2c73205d0c002197 100644
--- a/tensorflow/core/graph/algorithm.h
+++ b/tensorflow/core/graph/algorithm.h
@@ -21,20 +21,28 @@ limitations under the License.
 #include <vector>
 
 #include "tensorflow/core/graph/graph.h"
+#include "tensorflow/core/lib/gtl/array_slice.h"
 
 namespace tensorflow {
 
 // Perform a depth-first-search on g starting at the source node.
 // If enter is not empty, calls enter(n) before visiting any children of n.
 // If leave is not empty, calls leave(n) after visiting all children of n.
-extern void DFS(const Graph& g, std::function<void(Node*)> enter,
-                std::function<void(Node*)> leave);
+extern void DFS(const Graph& g, const std::function<void(Node*)>& enter,
+                const std::function<void(Node*)>& leave);
 
 // Perform a reverse depth-first-search on g starting at the sink node.
 // If enter is not empty, calls enter(n) before visiting any parents of n.
 // If leave is not empty, calls leave(n) after visiting all parents of n.
-extern void ReverseDFS(const Graph& g, std::function<void(Node*)> enter,
-                       std::function<void(Node*)> leave);
+extern void ReverseDFS(const Graph& g, const std::function<void(Node*)>& enter,
+                       const std::function<void(Node*)>& leave);
+
+// Perform a reverse depth-first-search on g starting at the 'start' nodes.
+// If enter is not empty, calls enter(n) before visiting any parents of n.
+// If leave is not empty, calls leave(n) after visiting all parents of n.
+extern void ReverseDFSFrom(const Graph& g, gtl::ArraySlice<Node*> start,
+                           const std::function<void(Node*)>& enter,
+                           const std::function<void(Node*)>& leave);
 
 // Stores in *order the post-order numbering of all nodes
 // in graph found via a depth first search starting at the source node.
diff --git a/tensorflow/core/graph/control_flow.cc b/tensorflow/core/graph/control_flow.cc
index 8409fb4cd0b41b3f36dc5b4ba7a3abd0478a7994..db6683d1e74512e37a40773b7642cf33eb888782 100644
--- a/tensorflow/core/graph/control_flow.cc
+++ b/tensorflow/core/graph/control_flow.cc
@@ -88,7 +88,7 @@ Status BuildControlFlowInfo(Graph* g, std::vector<ControlFlowInfo>* info) {
           out_info->frame = out;
           out_info->parent_frame = frame;
           TF_RETURN_IF_ERROR(
-              GetNodeAttr(out->def(), "frame_name", &out_info->frame_name));
+              GetNodeAttr(out->attrs(), "frame_name", &out_info->frame_name));
           if (out_info->frame_name.empty()) {
             return errors::InvalidArgument("The Enter node ", out->name(),
                                            " must have a frame name.");
diff --git a/tensorflow/core/graph/graph.cc b/tensorflow/core/graph/graph.cc
index fae9f26f7628080fc9ab219ed0e62e2cadbf824e..9066de5668076687d11cd938d4fce38dab9c248b 100644
--- a/tensorflow/core/graph/graph.cc
+++ b/tensorflow/core/graph/graph.cc
@@ -30,6 +30,45 @@ const int Graph::kControlSlot = -1;
 
 // Node
 
+#define REF_CLASS(key, value) \
+  {key, value}, { "Ref" key, value }
+
+const std::unordered_map<string, Node::NodeClass>& Node::kNodeClassTable =
+    *new std::unordered_map<string, Node::NodeClass>({
+        // Keep in same order as NodeClass values
+        REF_CLASS("Switch", NC_SWITCH),
+        REF_CLASS("Merge", NC_MERGE),
+        REF_CLASS("Enter", NC_ENTER),
+        REF_CLASS("Exit", NC_EXIT),
+        REF_CLASS("NextIteration", NC_NEXT_ITERATION),
+        {"LoopCond", NC_LOOP_COND},
+        {"ControlTrigger", NC_CONTROL_TRIGGER},
+        {"_Send", NC_SEND},
+        {"_HostSend", NC_HOST_SEND},
+        {"_Recv", NC_RECV},
+        {"_HostRecv", NC_HOST_RECV},
+        {"Const", NC_CONSTANT},
+        {"HostConst", NC_CONSTANT},
+        {"Variable", NC_VARIABLE},
+        {"VariableV2", NC_VARIABLE},
+        REF_CLASS("Identity", NC_IDENTITY),
+        {"GetSessionHandle", NC_GET_SESSION_HANDLE},
+        {"GetSessionHandleV2", NC_GET_SESSION_HANDLE},
+        {"GetSessionTensor", NC_GET_SESSION_TENSOR},
+        {"DeleteSessionTensor", NC_DELETE_SESSION_TENSOR},
+    });
+
+#undef REF_CLASS
+
+Node::NodeClass Node::GetNodeClassForOp(const string& ts) {
+  auto it = kNodeClassTable.find(ts);
+  if (it != kNodeClassTable.end()) {
+    return it->second;
+  } else {
+    return NC_OTHER;
+  }
+}
+
 string Node::DebugString() const {
   string ret = strings::StrCat("{name:'", name(), "' id:", id_);
   if (IsSource()) {
@@ -39,7 +78,7 @@ string Node::DebugString() const {
   } else {
     strings::StrAppend(&ret, " op device:");
     strings::StrAppend(&ret, "{", assigned_device_name_, "}");
-    strings::StrAppend(&ret, " def:{", SummarizeNodeDef(def()), "}}");
+    strings::StrAppend(&ret, " def:{", SummarizeNode(*this), "}}");
   }
   return ret;
 }
@@ -70,41 +109,7 @@ void Node::Initialize(int id, int cost_id, Properties* props) {
   }
   props_ = props;
   // Initialize the class_ based on the type string
-  const string& ts = this->type_string();
-  class_ = NC_UNINITIALIZED;
-
-#define SET_CLASS(enum_val, ts, str1, str2)        \
-  do {                                             \
-    if ((((ts) == (str1)) || ((ts) == (str2)))) {  \
-      /* Cannot be member of more than one class*/ \
-      CHECK(class_ == NC_UNINITIALIZED);           \
-      class_ = (enum_val);                         \
-    }                                              \
-  } while (0)
-
-  SET_CLASS(NC_SWITCH, ts, "Switch", "RefSwitch");
-  SET_CLASS(NC_MERGE, ts, "Merge", "RefMerge");
-  SET_CLASS(NC_ENTER, ts, "Enter", "RefEnter");
-  SET_CLASS(NC_EXIT, ts, "Exit", "RefExit");
-  SET_CLASS(NC_NEXT_ITERATION, ts, "NextIteration", "RefNextIteration");
-  SET_CLASS(NC_LOOP_COND, ts, "LoopCond", "");
-  SET_CLASS(NC_CONTROL_TRIGGER, ts, "ControlTrigger", "");
-  SET_CLASS(NC_SEND, ts, "_Send", "");
-  SET_CLASS(NC_HOST_SEND, ts, "_HostSend", "");
-  SET_CLASS(NC_RECV, ts, "_Recv", "");
-  SET_CLASS(NC_HOST_RECV, ts, "_HostRecv", "");
-  SET_CLASS(NC_CONSTANT, ts, "Const", "HostConst");
-  SET_CLASS(NC_VARIABLE, ts, "Variable", "");
-  SET_CLASS(NC_VARIABLE, ts, "VariableV2", "");
-  SET_CLASS(NC_IDENTITY, ts, "Identity", "RefIdentity");
-  SET_CLASS(NC_GET_SESSION_HANDLE, ts, "GetSessionHandle", "");
-  SET_CLASS(NC_GET_SESSION_HANDLE, ts, "GetSessionHandleV2", "");
-  SET_CLASS(NC_GET_SESSION_TENSOR, ts, "GetSessionTensor", "");
-  SET_CLASS(NC_DELETE_SESSION_TENSOR, ts, "DeleteSessionTensor", "");
-  if (class_ == NC_UNINITIALIZED) {
-    class_ = NC_OTHER;  // Catch all
-  }
-#undef SET_CLASS
+  class_ = GetNodeClassForOp(props->node_def_.op());
 }
 
 void Node::Clear() {
@@ -299,6 +304,17 @@ Node* Graph::CopyNode(Node* node) {
   props->Ref();
   Node* copy = AllocateNode(props, node);
   copy->set_assigned_device_name(node->assigned_device_name());
+
+  // Since the OpDef of a function may be owned by the Graph that owns 'node',
+  // relookup the OpDef in the target graph. If it differs, then clone the
+  // node properties with the updated OpDef.
+  const OpDef* op_def;
+  TF_CHECK_OK(ops_.LookUpOpDef(node->type_string(), &op_def));
+  if (op_def != props->op_def_) {
+    copy->MaybeCopyOnWrite();
+    copy->props_->op_def_ = op_def;
+  }
+
   return copy;
 }
 
@@ -344,7 +360,7 @@ const Edge* Graph::AddEdge(Node* source, int x, Node* dest, int y) {
   CHECK(source->out_edges_.insert(e).second);
   CHECK(dest->in_edges_.insert(e).second);
   edges_.push_back(e);
-  edge_set_.insert(e);
+  ++num_edges_;
   return e;
 }
 
@@ -354,8 +370,8 @@ void Graph::RemoveEdge(const Edge* e) {
   CHECK_EQ(e->src_->out_edges_.erase(e), size_t{1});
   CHECK_EQ(e->dst_->in_edges_.erase(e), size_t{1});
   CHECK_EQ(e, edges_[e->id_]);
+  CHECK_GT(num_edges_, 0);
 
-  CHECK_EQ(edge_set_.erase(e), size_t{1});
   edges_[e->id_] = nullptr;
 
   Edge* del = const_cast<Edge*>(e);
@@ -365,6 +381,7 @@ void Graph::RemoveEdge(const Edge* e) {
   del->src_output_ = kControlSlot - 1;
   del->dst_input_ = kControlSlot - 1;
   free_edges_.push_back(del);
+  --num_edges_;
 }
 
 Status Graph::AddFunctionLibrary(const FunctionDefLibrary& fdef_lib) {
@@ -380,13 +397,6 @@ Status Graph::AddFunctionLibrary(const FunctionDefLibrary& fdef_lib) {
       // Ignore duplicate FunctionDefs
       continue;
     }
-    // TODO(skyewm): fix test breakages and reenable this check
-    // const OpDef* op_def;
-    // if (ops_.LookUpOpDef(fdef.signature().name(), &op_def).ok()) {
-    //   return errors::InvalidArgument(
-    //       "Cannot add function '", fdef.signature().name(),
-    //       "' because an op with the same name already exists.");
-    // }
     TF_RETURN_IF_ERROR(ops_.AddFunctionDef(fdef));
   }
   for (const GradientDef& grad : fdef_lib.gradient()) {
@@ -464,7 +474,7 @@ void Graph::ToGraphDefSubRange(GraphDef* graph_def, int from_node_id) const {
     for (size_t i = 0; i < inputs.size(); ++i) {
       const Edge* edge = inputs[i];
       if (edge == nullptr) {
-        node_def->add_input(node->def().input(i));
+        node_def->add_input(node->requested_inputs()[i]);
       } else {
         const Node* src = edge->src();
         if (!src->IsOp()) continue;
diff --git a/tensorflow/core/graph/graph.h b/tensorflow/core/graph/graph.h
index 11a49ec3b3d77fce58ba1e2f729a8577148e1d13..8554cb2f4b7aa8b58fbc2b4218877fe88499d193 100644
--- a/tensorflow/core/graph/graph.h
+++ b/tensorflow/core/graph/graph.h
@@ -71,6 +71,7 @@ class Node {
   int cost_id() const { return cost_id_; }
   const string& name() const { return props_->node_def_.name(); }
   const string& type_string() const { return props_->node_def_.op(); }
+
   // def() provides the NodeDef the user supplied, but the specifics
   // of this Node may have changed due to placement, optimization, etc.
   // In particular:
@@ -80,6 +81,7 @@ class Node {
   // * def().device() is the "user's requested device" and may not match
   //   the actual assigned device, see assigned_device_name() below;
   // * def().attr() is authoritative.
+  // TODO(irving): Replace with NodeInfo.
   const NodeDef& def() const { return props_->node_def_; }
   const OpDef& op_def() const { return *props_->op_def_; }
 
@@ -92,6 +94,10 @@ class Node {
   DataType output_type(int32 o) const { return props_->output_types_[o]; }
   const DataTypeVector& output_types() const { return props_->output_types_; }
 
+  // The device requested by the user.  For the actual assigned device,
+  // use assigned_device_name() below.
+  const string& requested_device() const { return def().device(); }
+
   // This gives the device the runtime has assigned this node to.  If
   // you want the device the user requested, use def().device() instead.
   // TODO(josh11b): Validate that the assigned_device, if not empty:
@@ -103,6 +109,14 @@ class Node {
     assigned_device_name_ = device_name;
   }
 
+  // Read only access to attributes
+  AttrSlice attrs() const { return AttrSlice(def()); }
+
+  // Inputs requested by the NodeDef.  For the actual inputs, use in_edges.
+  const protobuf::RepeatedPtrField<string>& requested_inputs() const {
+    return def().input();
+  }
+
   // Get the neighboring nodes via edges either in or out of this node.
   gtl::iterator_range<NeighborIter> in_nodes() const;
   gtl::iterator_range<NeighborIter> out_nodes() const;
@@ -221,6 +235,10 @@ class Node {
     NC_OTHER  // Not a special kind of node
   };
 
+  static const std::unordered_map<string, NodeClass>& kNodeClassTable;
+
+  static NodeClass GetNodeClassForOp(const string& ts);
+
   int id_;       // -1 until Initialize() is called
   int cost_id_;  // -1 if there is no corresponding cost accounting node
   NodeClass class_;
@@ -268,6 +286,66 @@ class Edge {
   int dst_input_;
 };
 
+// Allows for iteration of the edges of a Graph, by iterating the underlying
+// Graph.edges_ vector while skipping over null entries.
+class GraphEdgesIterable {
+ private:
+  const std::vector<Edge*>& edges_;
+
+ public:
+  explicit GraphEdgesIterable(const std::vector<Edge*>& edges)
+      : edges_(edges) {}
+
+  typedef Edge* value_type;
+
+  class const_iterator {
+   private:
+    // The underlying iterator.
+    std::vector<value_type>::const_iterator iter_;
+
+    // The end of the underlying iterator.
+    std::vector<value_type>::const_iterator end_;
+
+    // Advances iter_ until it reaches a non-null item, or reaches the end.
+    void apply_filter() {
+      while (iter_ != end_ && *iter_ == nullptr) {
+        ++iter_;
+      }
+    }
+
+   public:
+    const_iterator(std::vector<value_type>::const_iterator iter,
+                   std::vector<value_type>::const_iterator end)
+        : iter_(iter), end_(end) {
+      apply_filter();
+    }
+
+    bool operator==(const const_iterator& other) const {
+      return iter_ == other.iter_;
+    }
+
+    bool operator!=(const const_iterator& other) const {
+      return iter_ != other.iter_;
+    }
+
+    // This is the prefix increment operator (++x), which is the operator
+    // used by C++ range iteration (for (x : y) ...).  We intentionally do not
+    // provide a postfix increment operator.
+    const_iterator& operator++() {
+      ++iter_;
+      apply_filter();
+      return *this;
+    }
+
+    value_type operator*() { return *iter_; }
+  };
+
+  const_iterator begin() {
+    return const_iterator(edges_.begin(), edges_.end());
+  }
+  const_iterator end() { return const_iterator(edges_.end(), edges_.end()); }
+};
+
 // Thread compatible but not thread safe.
 class Graph {
  public:
@@ -345,7 +423,7 @@ class Graph {
   // smaller than num_edge_ids(). If one needs to create an array of
   // edges indexed by edge ids, num_edge_ids() should be used as the
   // array's size.
-  int num_edges() const { return edges().size(); }
+  int num_edges() const { return num_edges_; }
 
   // Serialize the nodes starting at `from_node_id` to a GraphDef.
   void ToGraphDefSubRange(GraphDef* graph_def, int from_node_id) const;
@@ -381,7 +459,7 @@ class Graph {
 
   // Access to the set of all edges.  Example usage:
   //   for (const Edge* e : graph.edges()) { ... }
-  const EdgeSet& edges() const { return edge_set_; }
+  GraphEdgesIterable edges() const { return GraphEdgesIterable(edges_); }
 
   // The pre-defined nodes.
   enum { kSourceId = 0, kSinkId = 1 };
@@ -421,9 +499,8 @@ class Graph {
   // the edge with that id was removed from the graph.
   std::vector<Edge*> edges_;
 
-  // For ease of iteration, we currently just keep a set of all live
-  // edges.  May want to optimize by removing this copy.
-  EdgeSet edge_set_;
+  // The number of entries in edges_ that are not nullptr.
+  int num_edges_ = 0;
 
   // Allocated but free nodes and edges.
   std::vector<Node*> free_nodes_;
diff --git a/tensorflow/core/graph/graph_constructor.cc b/tensorflow/core/graph/graph_constructor.cc
index 9d4a0a52f75b3a3d303435a88924070419c5841d..70087b8fe1590f2849d949cdc233e1eae309f18d 100644
--- a/tensorflow/core/graph/graph_constructor.cc
+++ b/tensorflow/core/graph/graph_constructor.cc
@@ -424,7 +424,7 @@ Status GraphConstructor::ValidateShape(Node* node) {
   // For nodes with the _output_shapes atttribute, override the shape.
   std::vector<TensorShapeProto> shape_attrs;
   const char* kAttrName = "_output_shapes";
-  if (!GetNodeAttr(node->def(), kAttrName, &shape_attrs).ok()) {
+  if (!GetNodeAttr(node->attrs(), kAttrName, &shape_attrs).ok()) {
     // No _output_shapes attribute, the AddNode call above was sufficient.
     return Status::OK();
   }
@@ -458,7 +458,7 @@ Status GraphConstructor::ValidateShape(Node* node) {
       // functions that are not critical to correct execution but
       // would cause graphs to fail if imported after correcting.
       //
-      const string& op = node->def().op();
+      const string& op = node->type_string();
       const std::vector<string> whitelist = {
           // To be removed after 2017/03/08.
           "RandomShuffleQueue", "PaddingFIFOQueue", "FIFOQueue",
diff --git a/tensorflow/core/graph/graph_constructor.h b/tensorflow/core/graph/graph_constructor.h
index 186859d132af72d1afc417311db6f5a2d497bbd3..9b80f211fc6522ed8f134a530fb7897648bfbaad 100644
--- a/tensorflow/core/graph/graph_constructor.h
+++ b/tensorflow/core/graph/graph_constructor.h
@@ -24,15 +24,6 @@ limitations under the License.
 namespace tensorflow {
 class ShapeRefiner;
 
-// Options specific to constant folding optimizations.
-//
-// TODO(ashankar,vrv): This should move to where constant folding is done.
-struct ConstantFoldingOptions {
-  // If "consider" is not a nullptr, then only constant fold a node "n" if
-  // consider(n) returns true.
-  std::function<bool(const Node*)> consider = nullptr;
-};
-
 // Construct a Graph *g out of a GraphDef gdef. Returns non-OK on
 // error, in which case *g is left in an incomplete state.
 //
diff --git a/tensorflow/core/graph/graph_constructor_test.cc b/tensorflow/core/graph/graph_constructor_test.cc
index e3b7f322cb6e98953fc9545e07646d5326e264c6..6013b2ff512e74febead624731d59fabc65c13ee 100644
--- a/tensorflow/core/graph/graph_constructor_test.cc
+++ b/tensorflow/core/graph/graph_constructor_test.cc
@@ -146,7 +146,7 @@ class GraphConstructorTest : public ::testing::Test {
       return "";
     }
     std::vector<string> value;
-    Status s = GetNodeAttr(n->def(), kColocationAttrName, &value);
+    Status s = GetNodeAttr(n->attrs(), kColocationAttrName, &value);
     if (!s.ok()) {
       return "";
     }
@@ -997,7 +997,7 @@ TEST_F(GraphConstructorTest, ImportGraphDef_DefaultAttrs) {
   }
   ASSERT_TRUE(a != nullptr);
   int value = 0;
-  s = GetNodeAttr(a->def(), "default_int", &value);
+  s = GetNodeAttr(a->attrs(), "default_int", &value);
   ASSERT_EQ(Status::OK(), s) << s << " -- " << a->def().DebugString();
   EXPECT_EQ(31415, value);
 }
@@ -1201,9 +1201,9 @@ TEST_F(GraphConstructorTest, ImportGraphDef_InputMap) {
 
   // Check that t1's NodeDef is consistent with graph
   Node* t1 = FindNode("t1");
-  ASSERT_EQ(t1->def().input_size(), 2);
-  ASSERT_EQ(t1->def().input(0), "input:1");
-  ASSERT_EQ(t1->def().input(1), "input:0");
+  ASSERT_EQ(t1->requested_inputs().size(), 2);
+  ASSERT_EQ(t1->requested_inputs()[0], "input:1");
+  ASSERT_EQ(t1->requested_inputs()[1], "input:0");
 }
 
 TEST_F(GraphConstructorTest, ImportGraphDef_InputMapWithPrefix) {
@@ -1254,19 +1254,19 @@ TEST_F(GraphConstructorTest, ImportGraphDef_InputMapWithPrefix) {
 
   // Check that NodeDefs are consistent with graph
   Node* t1 = FindNode("import/t1");
-  ASSERT_EQ(t1->def().input_size(), 2);
-  EXPECT_EQ(t1->def().input(0), "input:0");
-  EXPECT_EQ(t1->def().input(1), "input:0");
+  ASSERT_EQ(t1->requested_inputs().size(), 2);
+  EXPECT_EQ(t1->requested_inputs()[0], "input:0");
+  EXPECT_EQ(t1->requested_inputs()[1], "input:0");
 
   Node* t2 = FindNode("import/t2");
-  ASSERT_EQ(t2->def().input_size(), 2);
-  EXPECT_EQ(t2->def().input(0), "import/t1:0");
-  EXPECT_EQ(t2->def().input(1), "import/t1:0");
+  ASSERT_EQ(t2->requested_inputs().size(), 2);
+  EXPECT_EQ(t2->requested_inputs()[0], "import/t1:0");
+  EXPECT_EQ(t2->requested_inputs()[1], "import/t1:0");
 
   Node* t3 = FindNode("import/t3");
-  ASSERT_EQ(t3->def().input_size(), 2);
-  EXPECT_EQ(t3->def().input(0), "import/unmapped_input:0");
-  EXPECT_EQ(t3->def().input(1), "import/unmapped_input:1");
+  ASSERT_EQ(t3->requested_inputs().size(), 2);
+  EXPECT_EQ(t3->requested_inputs()[0], "import/unmapped_input:0");
+  EXPECT_EQ(t3->requested_inputs()[1], "import/unmapped_input:1");
 }
 
 TEST_F(GraphConstructorTest, ImportGraphDef_InputMapWithControlEdges) {
@@ -1795,24 +1795,24 @@ TEST_F(GraphConstructorTest, ImportGraphDef_ControlDeps) {
 
   // Test that node defs are consistent with graph
   Node* w1 = FindNode("import/W1");
-  ASSERT_EQ(w1->def().input_size(), 2);
-  EXPECT_EQ(w1->def().input(0), "^W1");
-  EXPECT_EQ(w1->def().input(1), "^W2");
+  ASSERT_EQ(w1->requested_inputs().size(), 2);
+  EXPECT_EQ(w1->requested_inputs()[0], "^W1");
+  EXPECT_EQ(w1->requested_inputs()[1], "^W2");
 
   Node* input = FindNode("import/input");
-  ASSERT_EQ(input->def().input_size(), 2);
-  EXPECT_EQ(input->def().input(0), "^W1");
-  EXPECT_EQ(input->def().input(1), "^W2");
+  ASSERT_EQ(input->requested_inputs().size(), 2);
+  EXPECT_EQ(input->requested_inputs()[0], "^W1");
+  EXPECT_EQ(input->requested_inputs()[1], "^W2");
 
   Node* input2 = FindNode("import/input2");
-  ASSERT_EQ(input2->def().input_size(), 2);
-  EXPECT_EQ(input2->def().input(0), "^W1");
-  EXPECT_EQ(input2->def().input(1), "^W2");
+  ASSERT_EQ(input2->requested_inputs().size(), 2);
+  EXPECT_EQ(input2->requested_inputs()[0], "^W1");
+  EXPECT_EQ(input2->requested_inputs()[1], "^W2");
 
   Node* t1 = FindNode("import/t1");
-  ASSERT_EQ(t1->def().input_size(), 2);
-  EXPECT_EQ(t1->def().input(0), "import/input:0");
-  EXPECT_EQ(t1->def().input(1), "import/input:1");
+  ASSERT_EQ(t1->requested_inputs().size(), 2);
+  EXPECT_EQ(t1->requested_inputs()[0], "import/input:0");
+  EXPECT_EQ(t1->requested_inputs()[1], "import/input:1");
 }
 
 TEST_F(GraphConstructorTest, ImportGraphDef_ControlDepsWithCycle) {
@@ -1856,15 +1856,15 @@ TEST_F(GraphConstructorTest, ImportGraphDef_ControlDepsWithCycle) {
 
   // Test that node defs are consistent with graph
   Node* merge = FindNode("merge");
-  ASSERT_EQ(merge->def().input_size(), 3);
-  EXPECT_EQ(merge->def().input(0), "input:0");
-  EXPECT_EQ(merge->def().input(1), "t1:0");
-  EXPECT_EQ(merge->def().input(2), "^W1");
+  ASSERT_EQ(merge->requested_inputs().size(), 3);
+  EXPECT_EQ(merge->requested_inputs()[0], "input:0");
+  EXPECT_EQ(merge->requested_inputs()[1], "t1:0");
+  EXPECT_EQ(merge->requested_inputs()[2], "^W1");
 
   Node* t1 = FindNode("t1");
-  ASSERT_EQ(t1->def().input_size(), 2);
-  EXPECT_EQ(t1->def().input(0), "merge:0");
-  EXPECT_EQ(t1->def().input(1), "merge:0");
+  ASSERT_EQ(t1->requested_inputs().size(), 2);
+  EXPECT_EQ(t1->requested_inputs()[0], "merge:0");
+  EXPECT_EQ(t1->requested_inputs()[1], "merge:0");
 }
 
 TEST_F(GraphConstructorTest, ImportGraphDef_ControlDepsErrors) {
diff --git a/tensorflow/core/graph/graph_partition.cc b/tensorflow/core/graph/graph_partition.cc
index c7ad6a1e77b29e4b839409223468ec88e3476921..57a2f399e0e074ff0a2c671e8d12e0090006cfd3 100644
--- a/tensorflow/core/graph/graph_partition.cc
+++ b/tensorflow/core/graph/graph_partition.cc
@@ -356,7 +356,7 @@ string ControlLoopName(const string& name) {
 }
 
 bool IsControlLoop(const Node* node) {
-  const string& name = node->def().name();
+  const string& name = node->name();
   return StringPiece(name).starts_with("_cloop");
 }
 
@@ -468,7 +468,7 @@ Status AddControlLoop(const PartitionOptions& opts, Graph* g, const Node* src,
   const string& device_name = edge->dst()->assigned_device_name();
   const string& frame_name = src_info.frame_name;
   int parallel_iterations;
-  status = GetNodeAttr(src_info.frame->def(), "parallel_iterations",
+  status = GetNodeAttr(src_info.frame->attrs(), "parallel_iterations",
                        &parallel_iterations);
   if (!status.ok()) return status;
 
@@ -903,11 +903,11 @@ Status Partition(const PartitionOptions& opts, Graph* g,
           send_start_time = opts.start_times[src->id()].value();
           recv_start_time = opts.start_times[dst->id()].value();
         } else {
-          status = GetNodeAttr(src->def(), "_start_time", &send_start_time);
+          status = GetNodeAttr(src->attrs(), "_start_time", &send_start_time);
           if (!status.ok()) {
             return status;
           }
-          status = GetNodeAttr(dst->def(), "_start_time", &recv_start_time);
+          status = GetNodeAttr(dst->attrs(), "_start_time", &recv_start_time);
           if (!status.ok()) {
             return status;
           }
diff --git a/tensorflow/core/graph/graph_test.cc b/tensorflow/core/graph/graph_test.cc
index 739ad90efd2bf025b00c0cbdf227a7f50a6958fd..89784c631f002528db5b9d58dab40c68c9fcf173 100644
--- a/tensorflow/core/graph/graph_test.cc
+++ b/tensorflow/core/graph/graph_test.cc
@@ -51,8 +51,8 @@ class GraphTest : public ::testing::Test {
   GraphTest() : graph_(OpRegistry::Global()) {}
   ~GraphTest() override {}
 
-  static void VerifyNodes(Node* node, std::vector<Node*> expected_in,
-                          std::vector<Node*> expected_out) {
+  static void VerifyNodes(Node* node, const std::vector<Node*>& expected_in,
+                          const std::vector<Node*>& expected_out) {
     std::vector<Node*> in;
     for (const Edge* e : node->in_edges()) {
       in.push_back(e->src());
@@ -318,21 +318,21 @@ TEST_F(GraphTest, AddAttr) {
   n1->AddAttr("_a", "new_attr");
 
   string attr;
-  EXPECT_EQ(Status::OK(), GetNodeAttr(n1->def(), "_a", &attr));
+  EXPECT_EQ(Status::OK(), GetNodeAttr(n1->attrs(), "_a", &attr));
   EXPECT_EQ("new_attr", attr);
 
   Node* n2 = graph_.CopyNode(n1);
 
   n1->AddAttr("_b", "new_attr_2");
 
-  EXPECT_EQ(Status::OK(), GetNodeAttr(n1->def(), "_a", &attr));
+  EXPECT_EQ(Status::OK(), GetNodeAttr(n1->attrs(), "_a", &attr));
   EXPECT_EQ("new_attr", attr);
-  EXPECT_EQ(Status::OK(), GetNodeAttr(n1->def(), "_b", &attr));
+  EXPECT_EQ(Status::OK(), GetNodeAttr(n1->attrs(), "_b", &attr));
   EXPECT_EQ("new_attr_2", attr);
 
-  EXPECT_EQ(Status::OK(), GetNodeAttr(n2->def(), "_a", &attr));
+  EXPECT_EQ(Status::OK(), GetNodeAttr(n2->attrs(), "_a", &attr));
   EXPECT_EQ("new_attr", attr);
-  EXPECT_NE(Status::OK(), GetNodeAttr(n2->def(), "_b", &attr));
+  EXPECT_NE(Status::OK(), GetNodeAttr(n2->attrs(), "_b", &attr));
 }
 
 // Convert edge iteration results into a sorted string.
@@ -412,15 +412,14 @@ TEST_F(GraphTest, AddFunctionLibrary) {
             "Cannot add function 'XTimesTwo' because a different function with "
             "the same name already exists.");
 
-  // TODO(skyewm): reenable along with duplicate op check
   // Function with same name as an existing op triggers an error
-  // error_proto = proto;
-  // error_proto.mutable_function(0)->mutable_signature()->set_name("Add");
-  // s = graph_.AddFunctionLibrary(error_proto);
-  // EXPECT_FALSE(s.ok());
-  // EXPECT_EQ(s.error_message(),
-  //           "Cannot add function 'Add' because an op with the same name "
-  //           "already exists.");
+  error_proto = proto;
+  error_proto.mutable_function(0)->mutable_signature()->set_name("Add");
+  s = graph_.AddFunctionLibrary(error_proto);
+  EXPECT_FALSE(s.ok());
+  EXPECT_EQ(s.error_message(),
+            "Cannot add function 'Add' because an op with the same name "
+            "already exists.");
 
   // Adding a gradient function to an existing function is ok
   GradientDef* grad = proto.add_gradient();
diff --git a/tensorflow/core/graph/mkl_layout_pass.cc b/tensorflow/core/graph/mkl_layout_pass.cc
index 09b632a1650db25b29dcdbae63a9f88e7c564bd3..94741a11ffa0ca5eb00ff2e9e5834e153f25b4b4 100644
--- a/tensorflow/core/graph/mkl_layout_pass.cc
+++ b/tensorflow/core/graph/mkl_layout_pass.cc
@@ -35,6 +35,7 @@ limitations under the License.
 #include "tensorflow/core/lib/gtl/map_util.h"
 #include "tensorflow/core/lib/hash/hash.h"
 #include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/util/tensor_format.h"
 
 #include "tensorflow/core/graph/mkl_layout_pass.h"
 #include "tensorflow/core/util/mkl_util.h"
@@ -272,6 +273,7 @@ class MklLayoutRewritePass : public GraphOptimizationPass {
     csinfo_.conv2d_grad_filter = "Conv2DBackpropFilter";
     csinfo_.fused_batch_norm = "FusedBatchNorm";
     csinfo_.fused_batch_norm_grad = "FusedBatchNormGrad";
+    csinfo_.identity = "Identity";
     csinfo_.lrn = "LRN";
     csinfo_.lrn_grad = "LRNGrad";
     csinfo_.matmul = "MatMul";
@@ -280,51 +282,75 @@ class MklLayoutRewritePass : public GraphOptimizationPass {
     csinfo_.mkl_conv2d = "_MklConv2D";
     csinfo_.mkl_conv2d_with_bias = "_MklConv2DWithBias";
     csinfo_.mkl_conv2d_with_bias_backprop_bias =
-        "_MklConv2DWithBiasBackpropBias";
-    csinfo_.relu = "Relu";
-    csinfo_.reshape = "Reshape";
-    csinfo_.relu_grad = "ReluGrad";
-    csinfo_.split = "Split";
+                                   "_MklConv2DWithBiasBackpropBias";
+    csinfo_.relu                  = "Relu";
+    csinfo_.relu_grad             = "ReluGrad";
+    csinfo_.reshape               = "Reshape";
+    csinfo_.split                 = "Split";
 
     // NOTE: names are alphabetically sorted.
-    rinfo_.push_back({csinfo_.avg_pool, GetMklOpName(csinfo_.avg_pool), 1,
-                      CopyAttrsPooling, AlwaysRewrite});
+    rinfo_.push_back({csinfo_.avg_pool,
+                      GetMklOpName(csinfo_.avg_pool),
+                      CopyAttrsPooling, AlwaysRewrite, nullptr});
     rinfo_.push_back({csinfo_.avg_pool_grad,
-                      GetMklOpName(csinfo_.avg_pool_grad), 2, CopyAttrsPooling,
-                      AlwaysRewrite});
-    rinfo_.push_back({csinfo_.concat, GetMklOpName(csinfo_.concat), 0,
-                      CopyAttrsConcat, AlwaysRewrite});
-    rinfo_.push_back({csinfo_.concatv2, GetMklOpName(csinfo_.concatv2), 0,
-                      CopyAttrsConcatV2, AlwaysRewrite});
-    rinfo_.push_back({csinfo_.conv2d, GetMklOpName(csinfo_.conv2d), 2,
-                      CopyAttrsConv2D, AlwaysRewrite});
+                      GetMklOpName(csinfo_.avg_pool_grad),
+                      CopyAttrsPooling, AlwaysRewrite, nullptr});
+    // BiasAddGrad gets written into Conv2DWithBiasBackpropBias depending
+    // on if context contains Conv2D.
+    rinfo_.push_back({csinfo_.bias_add_grad,
+                      csinfo_.mkl_conv2d_with_bias_backprop_bias,
+                      CopyAttrsBiasAddGrad, ContextMatchRewrite,
+                      &biasaddgrad_conv2dwithbias_context_});
+    // BiasAddGrad gets written into BiasAddGrad depending on if context
+    // contains MatMul.
+    rinfo_.push_back({csinfo_.bias_add_grad, csinfo_.matmul,
+                      CopyAttrsBiasAddGrad, ContextMatchRewrite,
+                      &biasaddgrad_matmul_context_});
+    rinfo_.push_back({csinfo_.concat,
+                      GetMklOpName(csinfo_.concat),
+                      CopyAttrsConcat, AlwaysRewrite, nullptr});
+    rinfo_.push_back({csinfo_.concatv2,
+                      GetMklOpName(csinfo_.concatv2),
+                      CopyAttrsConcatV2, AlwaysRewrite, nullptr});
+    rinfo_.push_back({csinfo_.conv2d,
+                      GetMklOpName(csinfo_.conv2d),
+                      CopyAttrsConv2D, AlwaysRewrite, nullptr});
     rinfo_.push_back({csinfo_.conv2d_grad_filter,
-                      GetMklOpName(csinfo_.conv2d_grad_filter), 3,
-                      CopyAttrsConv2D, AlwaysRewrite});
+                      GetMklOpName(csinfo_.conv2d_grad_filter),
+                      CopyAttrsConv2D, AlwaysRewrite, nullptr});
     rinfo_.push_back({csinfo_.conv2d_grad_input,
-                      GetMklOpName(csinfo_.conv2d_grad_input), 3,
-                      CopyAttrsConv2D, AlwaysRewrite});
+                      GetMklOpName(csinfo_.conv2d_grad_input),
+                      CopyAttrsConv2D, AlwaysRewrite, nullptr});
     rinfo_.push_back({csinfo_.fused_batch_norm,
-                      GetMklOpName(csinfo_.fused_batch_norm), 5,
-                      CopyAttrsFusedBatchNorm, AlwaysRewrite});
+                      GetMklOpName(csinfo_.fused_batch_norm),
+                      CopyAttrsFusedBatchNorm, AlwaysRewrite, nullptr});
     rinfo_.push_back({csinfo_.fused_batch_norm_grad,
-                      GetMklOpName(csinfo_.fused_batch_norm_grad), 5,
-                      CopyAttrsFusedBatchNorm, AlwaysRewrite});
-    rinfo_.push_back({csinfo_.lrn, GetMklOpName(csinfo_.lrn), 1, CopyAttrsLRN,
-                      AlwaysRewrite});
-    rinfo_.push_back({csinfo_.lrn_grad, GetMklOpName(csinfo_.lrn_grad), 3,
-                      CopyAttrsLRN, AlwaysRewrite});
-    rinfo_.push_back({csinfo_.max_pool, GetMklOpName(csinfo_.max_pool), 1,
-                      CopyAttrsPooling, AlwaysRewrite});
+                      GetMklOpName(csinfo_.fused_batch_norm_grad),
+                      CopyAttrsFusedBatchNorm, AlwaysRewrite, nullptr});
+    rinfo_.push_back({csinfo_.identity,
+                      GetMklOpName(csinfo_.identity),
+                      CopyAttrsIdentity, AlwaysRewrite, nullptr});
+    rinfo_.push_back({csinfo_.lrn,
+                      GetMklOpName(csinfo_.lrn),
+                      CopyAttrsLRN, AlwaysRewrite, nullptr});
+    rinfo_.push_back({csinfo_.lrn_grad,
+                      GetMklOpName(csinfo_.lrn_grad),
+                      CopyAttrsLRN, AlwaysRewrite, nullptr});
+    rinfo_.push_back({csinfo_.max_pool,
+                      GetMklOpName(csinfo_.max_pool),
+                      CopyAttrsPooling, NonDepthBatchWisePoolRewrite, nullptr});
     rinfo_.push_back({csinfo_.max_pool_grad,
-                      GetMklOpName(csinfo_.max_pool_grad), 3, CopyAttrsPooling,
-                      AlwaysRewrite});
-    rinfo_.push_back({csinfo_.relu, GetMklOpName(csinfo_.relu), 1,
-                      CopyAttrsRelu, AlwaysRewrite});
-    rinfo_.push_back({csinfo_.reshape, GetMklOpName(csinfo_.reshape), 2,
-                      CopyAttrsReshape, AlwaysRewrite});
-
-    // TODO(inteltf): we do not support ReluGrad and BiasAddGrad yet.
+                      GetMklOpName(csinfo_.max_pool_grad),
+                      CopyAttrsPooling, AlwaysRewrite, nullptr});
+    rinfo_.push_back({csinfo_.relu,
+                      GetMklOpName(csinfo_.relu),
+                      CopyAttrsRelu, AlwaysRewrite, nullptr});
+    rinfo_.push_back({csinfo_.relu_grad,
+                      GetMklOpName(csinfo_.relu_grad),
+                      CopyAttrsRelu, AlwaysRewrite, nullptr});
+    rinfo_.push_back({csinfo_.reshape,
+                      GetMklOpName(csinfo_.reshape),
+                      CopyAttrsReshape, AlwaysRewrite, nullptr});
 
     // Add info about which ops to add workspace edge to and the slots.
     wsinfo_.push_back({csinfo_.lrn, csinfo_.lrn_grad, 0, 2, 1, 3});
@@ -338,8 +364,15 @@ class MklLayoutRewritePass : public GraphOptimizationPass {
     // maxhops in backward data-flow graph. Since input of forward nodes
     // (Conv2D) directly goes to backward nodes, we do not expect the
     // hop-distance would be more than few nodes.
-    cinfo_.push_back({csinfo_.bias_add_grad, csinfo_.mkl_conv2d_with_bias,
-                      kNodeMergeContextMaxDepth});
+    biasaddgrad_matmul_context_ = {csinfo_.bias_add_grad, csinfo_.matmul,
+                                   kNodeMergeContextMaxDepth};
+
+    biasaddgrad_conv2dwithbias_context_ = {csinfo_.bias_add_grad,
+                                   csinfo_.mkl_conv2d_with_bias,
+                                   kNodeMergeContextMaxDepth};
+
+    cinfo_.push_back(&biasaddgrad_matmul_context_);
+    cinfo_.push_back(&biasaddgrad_conv2dwithbias_context_);
   }
 
   // Standard interface to run pass
@@ -354,7 +387,16 @@ class MklLayoutRewritePass : public GraphOptimizationPass {
   // @return true, if and only if graph is mutated; false otherwise.
   bool RunPass(std::unique_ptr<Graph>* g);
 
- private:
+  /// Structure to specify the context information used in a node rewrite rule
+  typedef struct {
+    string node;     // Name of the node to be rewritten
+    string fwd;      // Name of the node in the forward pass that this node
+                     // corresponds to
+    size_t max_hop;  // Maximum number of hops the fwd is located
+                     // from this node. If the fwd is farther than max_hop
+                     // then we do not rewrite the node.
+  } ContextInfo;
+
   /// Structure to specify the name of an original node, its new name after
   /// rewrite, the number of inputs to the original node, the function to
   /// be used to copy attributes for the op, and the rule (if any) which
@@ -362,11 +404,12 @@ class MklLayoutRewritePass : public GraphOptimizationPass {
   typedef struct {
     string name;      // Original name of op of the node in the graph
     string new_name;  // New name of the op of the node in the graph
-    int num_ins;      // The number of inputs to the original op type
     // A function handler to copy attributes from an old node to a new node.
     std::function<void(const Node*, NodeBuilder*)> copy_attrs;
-    std::function<bool(const Node*)> rewrite_rule;  // A rule under which to
-                                                    // rewrite this node.
+    // A rule under which to rewrite this node
+    std::function<bool(const Node*, const ContextInfo* c)> rewrite_rule;
+    // ContextInfo, if any, to be used for rewrite
+    ContextInfo* context;
   } RewriteInfo;
 
   /// Structure to specify a forward op, a backward op, and the slot numbers
@@ -393,16 +436,6 @@ class MklLayoutRewritePass : public GraphOptimizationPass {
     string new_node;  // Name of the node after merge
   } MergeInfo;
 
-  /// Structure to specify the context information used in a node rewrite rule
-  typedef struct {
-    string node;     // Name of the node to be rewritten
-    string fwd;      // Name of the node in the forward pass that this node
-                     // corresponds to
-    size_t max_hop;  // Maximum number of hops the fwd is located
-                     // from this node. If the fwd is farther than max_hop
-                     // then we do not rewrite the node.
-  } ContextInfo;
-
   /// Structure to store all constant strings
   /// NOTE: names are alphabetically sorted.
   struct {
@@ -417,6 +450,7 @@ class MklLayoutRewritePass : public GraphOptimizationPass {
     string conv2d_grad_filter;
     string fused_batch_norm;
     string fused_batch_norm_grad;
+    string identity;
     string lrn;
     string lrn_grad;
     string matmul;
@@ -427,10 +461,11 @@ class MklLayoutRewritePass : public GraphOptimizationPass {
     string mkl_conv2d_with_bias_backprop_bias;
     string relu;
     string relu_grad;
-    string split;
     string reshape;
+    string split;
   } csinfo_;
 
+ private:
   /// Maintain info about nodes to rewrite
   std::vector<RewriteInfo> rinfo_;
 
@@ -441,7 +476,11 @@ class MklLayoutRewritePass : public GraphOptimizationPass {
   std::vector<MergeInfo> minfo_;
 
   /// Maintain info about nodes to rewrite
-  static std::vector<ContextInfo> cinfo_;
+  static std::vector<ContextInfo*> cinfo_;
+
+  /// Context variables used in referencing rules
+  static ContextInfo biasaddgrad_matmul_context_;
+  static ContextInfo biasaddgrad_conv2dwithbias_context_;
 
   /// Hash table to maintain nodes visited in the graph.
   std::unordered_set<const Node*> visited_nodes_;
@@ -464,19 +503,6 @@ class MklLayoutRewritePass : public GraphOptimizationPass {
   // Clear all visited nodes
   inline void UnMarkRewrittenNodes() { visited_nodes_.clear(); }
 
-  // Is this a graph node that can accept variable number of inputs?
-  // Return true if yes, false otherwise.
-  //
-  // Concat, Split are vararg nodes.
-  inline bool IsVarArgNode(Node* n) {
-    if (n->type_string() == csinfo_.concat ||
-        n->type_string() == csinfo_.concatv2 ||
-        n->type_string() == csinfo_.split) {
-      return true;
-    }
-    return false;
-  }
-
   // Is OpDef::ArgDef a list type? It could be N * T or list(type).
   // Refer to opdef.proto for details of list type.
   inline bool ArgIsList(const OpDef::ArgDef& arg) const {
@@ -510,6 +536,39 @@ class MklLayoutRewritePass : public GraphOptimizationPass {
     return string(kMklOpPrefix) + name;
   }
 
+  // Can op represented by node 'n' run on DEVICE_CPU?
+  // Op can run on CPU with MKL if the runtime assigned device or the
+  // user requested device contains device CPU, or both are empty.
+  bool CanOpRunOnCPUDevice(const Node* n) {
+    bool result = true;
+    string reason;
+
+    // Substring that should be checked for in device name for CPU device.
+    const char* const kCPUDeviceSubStr = "cpu";
+
+    // If Op has been specifically assigned to a non-CPU device, then No.
+    if (!n->assigned_device_name().empty() &&
+        !StringPiece(n->assigned_device_name()).contains(kCPUDeviceSubStr)) {
+      result = false;
+      reason = "Op has been assigned a runtime device that is not CPU.";
+    }
+
+    // If user has specifically assigned this op to a non-CPU device, then No.
+    if (!n->def().device().empty() &&
+        !StringPiece(n->def().device()).contains(kCPUDeviceSubStr)) {
+      result = false;
+      reason = "User has assigned a device that is not CPU.";
+    }
+
+    if (result == false) {
+      VLOG(1) << "MklLayoutRewritePass: Skipping rewriting of the node "
+              << n->type_string() << ", reason: " << reason;
+    }
+
+    // Otherwise Yes.
+    return result;
+  }
+
   // Return a node that can be merged with input node 'n'
   //
   // @return pointer to the node if we can find such a
@@ -538,13 +597,46 @@ class MklLayoutRewritePass : public GraphOptimizationPass {
 
   // Default rewrite rule to be used in scenario 1 for rewrite.
   // @return - true (since we want to always rewrite)
-  static bool AlwaysRewrite(const Node* n) { return true; }
-  // Rewrite rule that uses context-information for matching
+  static bool AlwaysRewrite(const Node* n, const ContextInfo* c = nullptr) {
+    return true;
+  }
+
+  // Check if we are performing pooling on depth or batch. If it is, then we
+  // do not rewrite MaxPool node to Mkl version.
+  // @return - true (if it is not a depth/batch wise pooling case);
+  //           false otherwise.
+  static bool NonDepthBatchWisePoolRewrite(const Node* n,
+                                           const ContextInfo* c) {
+    CHECK_NOTNULL(n);
+
+    string data_format_str;
+    TensorFormat data_format;
+    std::vector<int32> ksize, strides;
+    CHECK_EQ(GetNodeAttr(n->def(), "ksize", &ksize).ok(), true);
+    CHECK_EQ(GetNodeAttr(n->def(), "strides", &strides).ok(), true);
+    CHECK_EQ(GetNodeAttr(n->def(), "data_format", &data_format_str).ok(),
+             true);
+    CHECK_EQ(FormatFromString(data_format_str, &data_format), true);
+
+    // Condition that specifies non-batch-wise and non-depth-wise pooling.
+    if (GetTensorDim(ksize,   data_format, 'N') == 1 &&
+        GetTensorDim(strides, data_format, 'N') == 1 &&
+        GetTensorDim(ksize,   data_format, 'C') == 1 &&
+        GetTensorDim(strides, data_format, 'C') == 1) {
+      return true;
+    }
+
+    return false;
+  }
+
+  // Rewrite rule that uses context-information for matching,
   // used in scenario 2.
   //
   // @input - Node 'n' for which to search for matching context
-  // @return - true if matching context is found; false otherwise.
-  static bool ContextMatchRewrite(const Node* n);
+  // @input - The context 'c' under which to rewrite
+  // @return - true if we can rewrite node under context 'c';
+  //           false otherwise.
+  static bool ContextMatchRewrite(const Node* n, const ContextInfo* c);
 
   // Helper function that searches the matching contextinfo for the node.
   // Implements depth-first search in the data dependence graph for the
@@ -598,6 +690,7 @@ class MklLayoutRewritePass : public GraphOptimizationPass {
   // node that we are constructing.
   //
   // @input g - input graph,
+  // @input orig_node - Original node that we are rewriting
   // @input inputs - inputs to old node that we are using for constructing
   //                 new inputs,
   // @input input_idx - the index in the 'inputs' vector pointing to the
@@ -608,11 +701,10 @@ class MklLayoutRewritePass : public GraphOptimizationPass {
   // @output output_nodes - the list of new nodes creating Mkl tensors
   //
   // @return None
-  void GetNodesProducingMklTensorList(
-      std::unique_ptr<Graph>* g,
-      const gtl::InlinedVector<std::pair<Node*, int>, 4>& inputs,
-      int* input_idx, int list_length,
-      std::vector<NodeBuilder::NodeOut>* output_nodes);
+  void GetNodesProducingMklTensorList(std::unique_ptr<Graph>* g,
+    Node* orig_node, const gtl::InlinedVector<std::pair<Node*, int>, 4>& inputs,
+    int* input_idx, int list_length,
+    std::vector<NodeBuilder::NodeOut>* output_nodes);
 
   // Get a node that will feed an Mkl tensor to the new
   // node that we are constructing. The output node could be (1) 'n'
@@ -620,6 +712,7 @@ class MklLayoutRewritePass : public GraphOptimizationPass {
   // if 'n' is not an Mkl layer.
   //
   // @input g - input graph,
+  // @input orig_node - Original node that we are rewriting,
   // @input n - Node based on which we are creating Mkl node,
   // @input n_output_slot - the output slot of node 'n'
   //            which is feeding to the node that we are constructing
@@ -627,9 +720,8 @@ class MklLayoutRewritePass : public GraphOptimizationPass {
   // @output mkl_node_output_slot - the slot number of mkl_node that
   //                                will feed the tensor
   // @return None
-  void GetNodeProducingMklTensor(std::unique_ptr<Graph>* g, Node* n,
-                                 int n_output_slot, Node** mkl_node,
-                                 int* mkl_node_output_slot);
+  void GetNodeProducingMklTensor(std::unique_ptr<Graph>* g, Node* orig_node,
+    Node* n, int n_output_slot, Node** mkl_node, int* mkl_node_output_slot);
 
   // Setup new inputs using old inputs 'inputs' for the rewritten node in 'nb'
   // in graph 'g'. Original node is input in 'old_node'. Inputs to 'nb' are
@@ -680,6 +772,7 @@ class MklLayoutRewritePass : public GraphOptimizationPass {
   static void CopyAttrsConcatV2(const Node* orig_node, NodeBuilder* nb);
   static void CopyAttrsConv2D(const Node* orig_node, NodeBuilder* nb);
   static void CopyAttrsFusedBatchNorm(const Node* orig_node, NodeBuilder* nb);
+  static void CopyAttrsIdentity(const Node* orig_node, NodeBuilder* nb);
   static void CopyAttrsLRN(const Node* orig_node, NodeBuilder* nb);
   static void CopyAttrsPooling(const Node* orig_node, NodeBuilder* nb);
   static void CopyAttrsRelu(const Node* orig_node, NodeBuilder* nb);
@@ -695,13 +788,18 @@ class MklLayoutRewritePass : public GraphOptimizationPass {
                                    Node* orig_node);
 };
 
-std::vector<MklLayoutRewritePass::ContextInfo> MklLayoutRewritePass::cinfo_;
+MklLayoutRewritePass::ContextInfo
+  MklLayoutRewritePass::biasaddgrad_conv2dwithbias_context_;
+MklLayoutRewritePass::ContextInfo
+  MklLayoutRewritePass::biasaddgrad_matmul_context_;
+std::vector<MklLayoutRewritePass::ContextInfo*> MklLayoutRewritePass::cinfo_;
 
-// We register Mkl rewrite pass for phase 1 in post rewrite group.
+// We register Mkl rewrite pass for phase 1 in post partitioning group.
 // We register it here so that we get a complete picture of all users of Mkl
 // nodes. Do not change the ordering of the Mkl passes.
-REGISTER_OPTIMIZATION(OptimizationPassRegistry::POST_REWRITE_FOR_EXEC, 1,
-                      MklLayoutRewritePass);
+const OptimizationPassRegistry::Grouping kMklLayoutRewritePassGroup =
+    OptimizationPassRegistry::POST_PARTITIONING;
+REGISTER_OPTIMIZATION(kMklLayoutRewritePassGroup, 1, MklLayoutRewritePass);
 
 //////////////////////////////////////////////////////////////////////////
 //           Helper functions for creating new node
@@ -737,27 +835,14 @@ void MklLayoutRewritePass::GetNodesProducingTFTensorList(
 
   while (list_length != 0) {
     CHECK_GT(list_length, 0);
-    CHECK_LE(*input_idx, inputs.size());
+    CHECK_LT(*input_idx, inputs.size());
     Node* n = inputs[*input_idx].first;
     int slot = inputs[*input_idx].second;
-    const OpDef::ArgDef& arg = n->op_def().output_arg(slot);
-    // If input node 'n' is producing a list/array output at output
-    // slot 'slot' then we need to find out the length of that list/array.
-    if (ArgIsList(arg)) {
-      int N = GetTensorListLength(arg, n);
-      CHECK_LE(N, list_length);
-      for (int j = 0; j < N; j++) {
-        output_nodes->push_back(NodeBuilder::NodeOut(n, slot));
-      }
-      (*input_idx)++;
-      list_length -= N;
-    } else {
-      // But if input node 'n' is just producing a single tensor at
-      // output slot 'slot' then we just add that single node.
-      output_nodes->push_back(NodeBuilder::NodeOut(n, slot));
-      (*input_idx)++;
-      list_length--;
-    }
+    // If input node 'n' is just producing a single tensor at
+    // output slot 'slot' then we just add that single node.
+    output_nodes->push_back(NodeBuilder::NodeOut(n, slot));
+    (*input_idx)++;
+    list_length--;
   }
 }
 
@@ -775,20 +860,39 @@ void MklLayoutRewritePass::GetDummyMklTensorNode(std::unique_ptr<Graph>* g,
   TensorShape dummy_shape({8});
   dummy_shape.AsProto(proto.mutable_tensor_shape());
   TF_CHECK_OK(NodeBuilder((*g)->NewName("DMT"), "Const")
-                  .Attr("value", proto)
-                  .Attr("dtype", dt)
-                  .Device(orig_node->def().device())  // We place this node on
-                                                      // the same device as the
-                                                      // device of the original
-                                                      // node.
-                  .Finalize(&**g, out));
+               .Attr("value", proto)
+               .Attr("dtype", dt)
+               .Device(orig_node->def().device())  // We place this node on
+                                                   // the same device as the
+                                                   // device of the original
+                                                   // node.
+               .Finalize(&**g, out));
+
+  // If number of inputs to the original node is > 0, then we add
+  // control dependency between 1st input (index 0) of the original node and
+  // the dummy Mkl node. This is needed because control-flow ops such as Enter,
+  // Merge, etc, require frame_name of the dummy Mkl node to be same as the
+  // rewritten node. Adding control edge between 1st input of the original node
+  // and the dummy Mkl node ensures that the dummy node is in the same frame
+  // as the original node. Choosing 1st input is not necessary - any input of
+  // the original node is fine because all the inputs of a node are always in
+  // the same frame.
+  if (orig_node->num_inputs() > 0) {
+    Node* orig_input0 = nullptr;
+    TF_CHECK_OK(orig_node->input_node(0,
+                                      const_cast<const Node**>(&orig_input0)));
+    CHECK_NOTNULL((*g)->AddControlEdge(orig_input0, *out));
+  }
+
   (*out)->set_assigned_device_name(orig_node->assigned_device_name());
 }
 
 void MklLayoutRewritePass::GetNodesProducingMklTensorList(
     std::unique_ptr<Graph>* g,
-    const gtl::InlinedVector<std::pair<Node*, int>, 4>& inputs, int* input_idx,
-    int list_length, std::vector<NodeBuilder::NodeOut>* output_nodes) {
+    Node* orig_node,
+    const gtl::InlinedVector<std::pair<Node*, int>, 4>& inputs,
+    int* input_idx, int list_length,
+    std::vector<NodeBuilder::NodeOut>* output_nodes) {
   CHECK_LT(*input_idx, inputs.size());
   CHECK_GT(list_length, 0);
   CHECK_NOTNULL(output_nodes);
@@ -796,38 +900,19 @@ void MklLayoutRewritePass::GetNodesProducingMklTensorList(
 
   while (list_length != 0) {
     CHECK_GT(list_length, 0);
-    CHECK_LE(*input_idx, inputs.size());
+    CHECK_LT(*input_idx, inputs.size());
     Node* n = inputs[*input_idx].first;
     int slot = inputs[*input_idx].second;
-    const OpDef::ArgDef& arg = n->op_def().output_arg(slot);
-    // We need to check first if the input edge is going to carry a
-    // single tensor or a list of tensors. If it is a list of tensors,
-    // then we need to create list of Mkl dummy nodes.
-    if (ArgIsList(arg)) {
-      // If input node 'n' is producing a list/array output at output
-      // slot 'slot' then we need to find out the length of that list/array.
-      int N = GetTensorListLength(arg, n);
-      CHECK_LE(N, list_length);
-      Node* mkl_node = nullptr;
-      int mkl_node_output_slot = 0;
-      // If it is a list, then create a list of Mkl dummy nodes.
-      for (int j = 0; j < N; j++) {
-        GetNodeProducingMklTensor(g, n, slot, &mkl_node, &mkl_node_output_slot);
-        output_nodes->push_back(
-            NodeBuilder::NodeOut(mkl_node, mkl_node_output_slot));
-      }
-      (*input_idx)++;
-      list_length -= N;
-    } else {
-      // If it is not a list, then create a single Mkl tensor node.
-      Node* mkl_node = nullptr;
-      int mkl_node_output_slot = 0;
-      GetNodeProducingMklTensor(g, n, slot, &mkl_node, &mkl_node_output_slot);
-      output_nodes->push_back(
-          NodeBuilder::NodeOut(mkl_node, mkl_node_output_slot));
-      (*input_idx)++;
-      list_length--;
-    }
+    // If 'n' is producing a single tensor, then create a single Mkl tensor
+    // node.
+    Node* mkl_node = nullptr;
+    int mkl_node_output_slot = 0;
+    GetNodeProducingMklTensor(g, orig_node, n, slot, &mkl_node,
+                              &mkl_node_output_slot);
+    output_nodes->push_back(NodeBuilder::NodeOut(mkl_node,
+                                                mkl_node_output_slot));
+    (*input_idx)++;
+    list_length--;
   }
 }
 
@@ -835,9 +920,9 @@ void MklLayoutRewritePass::GetNodesProducingMklTensorList(
 // node that we are constructing. An input node could be (1) 'n'
 // if it is Mkl layer, or (2) a dummy node producing dummy Mkl tensor
 // if 'n' is not an Mkl layer.
-void MklLayoutRewritePass::GetNodeProducingMklTensor(
-    std::unique_ptr<Graph>* g, Node* n, int n_output_slot, Node** mkl_node,
-    int* mkl_node_output_slot) {
+void MklLayoutRewritePass::GetNodeProducingMklTensor(std::unique_ptr<Graph>* g,
+    Node* orig_node, Node* n,
+    int n_output_slot, Node** mkl_node, int* mkl_node_output_slot) {
   CHECK_NOTNULL(n);
   CHECK_NOTNULL(mkl_node);
   CHECK_NOTNULL(mkl_node_output_slot);
@@ -860,7 +945,7 @@ void MklLayoutRewritePass::GetNodeProducingMklTensor(
     // to create a dummy node that will feed a dummy Mkl tensor to this node.
     // DummyMklTensor node has no input and generates only 1 output
     // (dummy Mkl tensor) as output slot number 0.
-    GetDummyMklTensorNode(g, mkl_node, n);
+    GetDummyMklTensorNode(g, mkl_node, orig_node);
     CHECK_NOTNULL(*mkl_node);
     *mkl_node_output_slot = 0;
   }
@@ -926,16 +1011,16 @@ int MklLayoutRewritePass::SetUpContiguousInputs(
     if (ArgIsList(arg)) {
       std::vector<NodeBuilder::NodeOut> new_node_inputs;
       int N = GetTensorListLength(arg, old_node);
-      GetNodesProducingMklTensorList(g, old_node_inputs, &iidx, N,
-                                     &new_node_inputs);
+      GetNodesProducingMklTensorList(g, old_node, old_node_inputs, &iidx,
+                                     N, &new_node_inputs);
       nb->Input(new_node_inputs);
       nn_slot_idx++;
     } else {
       Node* mkl_node = nullptr;
       int mkl_node_output_slot = 0;
-      GetNodeProducingMklTensor(g, old_node_inputs[iidx].first,
-                                old_node_inputs[iidx].second, &mkl_node,
-                                &mkl_node_output_slot);
+      GetNodeProducingMklTensor(g, old_node, old_node_inputs[iidx].first,
+                                old_node_inputs[iidx].second,
+                                &mkl_node, &mkl_node_output_slot);
       nb->Input(mkl_node, mkl_node_output_slot);
       iidx++;
       nn_slot_idx++;
@@ -1020,13 +1105,30 @@ void MklLayoutRewritePass::GetDummyWorkspaceTensorNode(
   TensorShape dummy_shape({1});
   dummy_shape.AsProto(proto.mutable_tensor_shape());
   TF_CHECK_OK(NodeBuilder((*g)->NewName("DMT"), "Const")
-                  .Attr("value", proto)
-                  .Attr("dtype", dt)
-                  .Device(orig_node->def().device())  // We place this node on
-                                                      // same the device as the
-                                                      // device of the original
-                                                      // node.
-                  .Finalize(&**g, out));
+                .Attr("value", proto)
+                .Attr("dtype", dt)
+                .Device(orig_node->def().device())  // We place this node on
+                                                    // same the device as the
+                                                    // device of the original
+                                                    // node.
+                .Finalize(&**g, out));
+
+  // If number of inputs to the original node is > 0, then we add
+  // control dependency between 1st input (index 0) of the original node and
+  // the dummy Mkl node. This is needed because control-flow ops such as Enter,
+  // Merge, etc, require frame_name of the dummy Mkl node to be same as the
+  // rewritten node. Adding control edge between 1st input of the original node
+  // and the dummy Mkl node ensures that the dummy node is in the same frame
+  // as the original node. Choosing 1st input is not necessary - any input of
+  // the original node is fine because all the inputs of a node are always in
+  // the same frame.
+  if (orig_node->num_inputs() > 0) {
+    Node* orig_input0 = nullptr;
+    TF_CHECK_OK(orig_node->input_node(0,
+                                      const_cast<const Node**>(&orig_input0)));
+    CHECK_NOTNULL((*g)->AddControlEdge(orig_input0, *out));
+  }
+
   (*out)->set_assigned_device_name(orig_node->assigned_device_name());
 }
 
@@ -1179,6 +1281,16 @@ void MklLayoutRewritePass::CopyAttrsBiasAddGrad(const Node* orig_node,
   nb->Attr("data_format", data_format);
 }
 
+void MklLayoutRewritePass::CopyAttrsIdentity(const Node* orig_node,
+                                             NodeBuilder* nb) {
+  DataType T;
+
+  // Get all attributes from old node.
+  TF_CHECK_OK(GetNodeAttr(orig_node->def(), "T", &T));
+  // Add attributes to new node.
+  nb->Attr("T", T);
+}
+
 void MklLayoutRewritePass::CopyAttrsLRN(const Node* orig_node,
                                         NodeBuilder* nb) {
   DataType T;
@@ -1235,6 +1347,19 @@ void MklLayoutRewritePass::CopyAttrsRelu(const Node* orig_node,
   nb->Attr("T", T);
 }
 
+void MklLayoutRewritePass::CopyAttrsReshape(const Node* orig_node,
+                                           NodeBuilder* nb) {
+  DataType T;
+  DataType Tshape;
+
+  // Get all attributes from old node.
+  TF_CHECK_OK(GetNodeAttr(orig_node->def(), "T", &T));
+  TF_CHECK_OK(GetNodeAttr(orig_node->def(), "Tshape", &Tshape));
+  // Add attributes to new node.
+  nb->Attr("T", T);
+  nb->Attr("Tshape", Tshape);
+}
+
 void MklLayoutRewritePass::CopyAttrsSplit(const Node* orig_node,
                                           NodeBuilder* nb) {
   DataType T;
@@ -1303,20 +1428,6 @@ void MklLayoutRewritePass::CopyAttrsFusedBatchNorm(const Node* orig_node,
   nb->Attr("is_training", is_training);
 }
 
-void MklLayoutRewritePass::CopyAttrsReshape(const Node* orig_node,
-                                            NodeBuilder* nb) {
-  DataType T;
-  DataType Tshape;
-
-  // Get all attributes from old node.
-  TF_CHECK_OK(GetNodeAttr(orig_node->def(), "T", &T));
-  TF_CHECK_OK(GetNodeAttr(orig_node->def(), "Tshape", &Tshape));
-
-  // Add attributes to new node.
-  nb->Attr("T", T);
-  nb->Attr("Tshape", Tshape);
-}
-
 //////////////////////////////////////////////////////////////////////////
 //           Helper functions related to node merge pass
 //////////////////////////////////////////////////////////////////////////
@@ -1353,8 +1464,9 @@ Node* MklLayoutRewritePass::CheckForNodeMerge(const Node* a) const {
       continue;
     }
 
+    const int B_in = b->num_inputs();
     gtl::InlinedVector<Node*, 4> b_control_edges;
-    gtl::InlinedVector<std::pair<Node*, int>, 4> b_in(N_in);
+    gtl::InlinedVector<std::pair<Node*, int>, 4> b_in(B_in);
     FillInputs(b, &b_control_edges, &b_in);
 
     // Shouldn't merge if a and b have different control edges.
@@ -1438,7 +1550,7 @@ Status MklLayoutRewritePass::MergeNode(std::unique_ptr<Graph>* g, Node* succ,
     CHECK_EQ(succ->in_edges().size(), 2);
     Node* oper3_mkl = nullptr;  // Mkl tensor corresponding to oper3
     int oper3_mkl_slot = 0;     // For dummy MKL tensor node, output slot is 0.
-    GetDummyMklTensorNode(g, &oper3_mkl, succ);  // Get dummy Mkl tensor node
+    GetDummyMklTensorNode(g, &oper3_mkl, pred);  // Get dummy Mkl tensor node
     // as BiasAdd does not have Mkl tensor as input.
     CHECK_NOTNULL(oper3_mkl);
 
@@ -1483,9 +1595,38 @@ Status MklLayoutRewritePass::MergeNode(std::unique_ptr<Graph>* g, Node* succ,
     // Set the Mkl layer label for this op.
     new_node->AddAttr("_kernel", mkl_op_registry::kMklOpLabel);
 
+    // Incoming data edges from 'pred' node and 'succ' node to new 'new_node'
+    // node are already copied in BuildNode. We handle control edges now.
+    for (const Edge* e : pred->in_edges()) {
+      if (e->IsControlEdge()) {
+        CHECK_NOTNULL((*g)->AddControlEdge(e->src(), new_node));
+      }
+    }
+    for (const Edge* e : succ->in_edges()) {
+      if (e->IsControlEdge()) {
+        CHECK_NOTNULL((*g)->AddControlEdge(e->src(), new_node));
+      }
+    }
+
     // Incoming edges are fixed, we will fix the outgoing edges now.
+    // First, we will fix outgoing control edges from 'pred' node.
+    // We don't need to handle outgoing data edges from 'pred' node
+    // because pred has only 1 output going to succ node (we enforced
+    // this check for merge already).
+    for (const Edge* e : pred->out_edges()) {
+      if (e->IsControlEdge()) {
+        CHECK_NOTNULL((*g)->AddControlEdge(new_node, e->dst()));
+      }
+    }
+
+    // Second, we will fix outgoing control and data edges from 'succ' node.
     for (const Edge* e : succ->out_edges()) {
-      (*g)->AddEdge(new_node, e->src_output(), e->dst(), e->dst_input());
+      if (e->IsControlEdge()) {
+        CHECK_NOTNULL((*g)->AddControlEdge(new_node, e->dst()));
+      } else {
+        CHECK_NOTNULL((*g)->AddEdge(new_node, e->src_output(), e->dst(),
+                                  e->dst_input()));
+      }
     }
 
     // Copy device assigned to old node to new node.
@@ -1550,18 +1691,22 @@ Status MklLayoutRewritePass::RewriteNode(std::unique_ptr<Graph>* g,
             "data_format or T attribute or devices of BiasAddGrad and "
             "Conv2D do not match. Will skip node rewrite optimization");
       }
+    } else if (orig_node->type_string() == csinfo_.bias_add_grad &&
+               ri->new_name == csinfo_.matmul) {
+      // When BiasAddGrad has MatMul in context, we do not do any rewrite
+      // and leave BiasAddGrad as it is. But we check for this condition
+      // when we check for node rewrite rule. So we should not even come
+      // here for MatMul. So we will fail now.
+        return Status(
+            error::Code::INVALID_ARGUMENT,
+            "No rewrite is required for BiasAddGrad for MatMul context.");
     }
   }
 
   // Get all inputs.
-  const int num = orig_node->in_edges().size();
-  // Check the number of inputs against the user-specified value for non-vararg
-  // nodes.
-  if (!IsVarArgNode(orig_node)) {
-    CHECK_EQ(num, ri->num_ins);
-  }
+  const int num_inputs = orig_node->in_edges().size();
   gtl::InlinedVector<Node*, 4> control_edges;
-  gtl::InlinedVector<std::pair<Node*, int>, 4> inputs(num);
+  gtl::InlinedVector<std::pair<Node*, int>, 4> inputs(num_inputs);
   FillInputs(orig_node, &control_edges, &inputs);
 
   // Build new node. We use same name as original node, but change the op name.
@@ -1596,8 +1741,15 @@ Status MklLayoutRewritePass::RewriteNode(std::unique_ptr<Graph>* g,
   TF_CHECK_OK(nb.Finalize(&**g, &new_node));
   CHECK_NOTNULL(new_node);
 
-  // Incoming edges from 'orig_node' node to new 'new_node' node are already
-  // copied in BuildNode. Copy outgoing edges from 'orig_node' node to new
+  // Incoming data edges from 'orig_node' node to new 'new_node' node are
+  // already copied in BuildNode. We need to handle control edges now.
+  for (const Edge* e : orig_node->in_edges()) {
+    if (e->IsControlEdge()) {
+      CHECK_NOTNULL((*g)->AddControlEdge(e->src(), new_node));
+    }
+  }
+
+  // Copy outgoing edges from 'orig_node' node to new
   // 'new_node' node, since the output also follows same ordering among
   // Tensorflow tensors and Mkl tensors. We need to connect Tensorflow
   // tensors appropriately. Specifically, nth output of the original node
@@ -1605,15 +1757,12 @@ Status MklLayoutRewritePass::RewriteNode(std::unique_ptr<Graph>* g,
   // of the tensors. For the contiguous ordering of the tensors, it will be n.
   // GetTensorDataIndex provides this mapping function.
   for (const Edge* e : orig_node->out_edges()) {
-    // We need to handle control-edges by using their original slot number.
-    // Generally, -1 is reserved for control slot.
-    if (e->src_output() < 0) {
-      (*g)->AddEdge(new_node, e->src_output(), e->dst(), e->dst_input());
+    if (e->IsControlEdge()) {
+      CHECK_NOTNULL((*g)->AddControlEdge(new_node, e->dst()));
     } else {
-      (*g)->AddEdge(
-          new_node,
-          GetTensorDataIndex(e->src_output(), e->src()->num_outputs()),
-          e->dst(), e->dst_input());
+      CHECK_NOTNULL((*g)->AddEdge(new_node, GetTensorDataIndex(e->src_output(),
+                            e->src()->num_outputs()),
+                    e->dst(), e->dst_input()));
     }
   }
 
@@ -1640,8 +1789,8 @@ MklLayoutRewritePass::SearchMatchingContext(const Node* n,
   bool is_matching_cinfo_found = false;
   std::vector<const ContextInfo*> mci;
   for (auto ci = cinfo_.cbegin(); ci != cinfo_.cend(); ++ci) {
-    if (n->type_string() == ci->node) {
-      mci.push_back(&*ci);
+    if (n->type_string() == (*ci)->node) {
+      mci.push_back(*ci);
       is_matching_cinfo_found = true;
     }
   }
@@ -1701,9 +1850,10 @@ MklLayoutRewritePass::SearchMatchingContext(const Node* n,
   return nullptr;
 }
 
-bool MklLayoutRewritePass::ContextMatchRewrite(const Node* n) {
+bool MklLayoutRewritePass::ContextMatchRewrite(const Node* n,
+                                               const ContextInfo* c) {
   const Node* fwd_node = nullptr;
-  return SearchMatchingContext(n, &fwd_node) != nullptr;
+  return SearchMatchingContext(n, &fwd_node) == c;
 }
 
 const MklLayoutRewritePass::RewriteInfo*
@@ -1719,18 +1869,29 @@ MklLayoutRewritePass::CheckForNodeRewrite(const Node* n) const {
     return nullptr;
   }
 
-  if (!mkl_op_registry::IsMklOp(GetMklOpName(n->type_string()), T)) {
-    return nullptr;
+  // BiasAddGrad is not an Mkl layer, so we make an exception for it.
+  if (n->type_string() != csinfo_.bias_add_grad) {
+    if (!mkl_op_registry::IsMklOp(GetMklOpName(n->type_string()), T)) {
+      return nullptr;
+    }
   }
 
   // We support 2 types of node rewrites:
-  // 1. Rewriting BiasAddGrad depending on its context.
+  // 1. Rewriting BiasAddGrad depending on its MklConv2DWithBias context.
   // 2. Rewriting an op to Mkl op always
   // We return true if any of these 2 conditions is met.
 
   // Find matching RewriteInfo and then check that rewrite rule applies.
   for (auto ri = rinfo_.cbegin(); ri != rinfo_.cend(); ++ri) {
-    if (n->type_string().compare(ri->name) == 0 && ri->rewrite_rule(n)) {
+    if (n->type_string().compare(ri->name) == 0 &&
+        ri->rewrite_rule(n, ri->context)) {
+      // If we are rewriting BiasAddGrad into BiasAddGrad for MatMul context,
+      // then we just return directly.
+      if (n->type_string() == csinfo_.bias_add_grad &&
+          ri->context->fwd == csinfo_.matmul &&
+          ri->new_name == csinfo_.bias_add_grad) {
+        return nullptr;
+      }
       return &*ri;
     }
   }
@@ -1753,7 +1914,8 @@ bool MklLayoutRewritePass::RunPass(std::unique_ptr<Graph>* g) {
   GetReversePostOrder(**g, &order);  // This will give us topological sort.
 
   for (Node* n : order) {
-    if (!n->IsOp()) {
+    // If node is not an op or it cannot run on CPU device, then skip.
+    if (!n->IsOp() || !CanOpRunOnCPUDevice(n)) {
       continue;
     }
 
@@ -1801,18 +1963,31 @@ bool RunMklLayoutRewritePass(std::unique_ptr<Graph>* g) {
   return MklLayoutRewritePass().RunPass(g);
 }
 
-Status MklLayoutRewritePass::Run(const GraphOptimizationPassOptions& options) {
-  if (options.graph == nullptr) {
+Status MklLayoutRewritePass::Run(
+  const GraphOptimizationPassOptions& options) {
+  if (options.graph == nullptr && options.partition_graphs == nullptr) {
     return Status::OK();
   }
 
-  // Get the ownership of graph
-  std::unique_ptr<Graph>* g = std::move(options.graph);
-
-  RunPass(g);
-
-  // Return the ownership of graph back
-  options.graph->reset(g->release());
+  auto process_graph = [&](std::unique_ptr<Graph>* g) {
+    // Get the ownership of a graph
+    std::unique_ptr<Graph>* ng = std::move(g);
+    RunPass(ng);
+    // Return the ownership of a graph back
+    g->reset(ng->release());
+  };
+
+  if (kMklLayoutRewritePassGroup !=
+      OptimizationPassRegistry::POST_PARTITIONING) {
+    // For any pre-partitioning phase, a graph is stored in options.graph.
+    process_graph(options.graph);
+  } else {
+    // For post partitioning phase, graphs are stored in
+    // options.partition_graphs.
+    for (auto& pg : *options.partition_graphs) {
+      process_graph(&pg.second);
+    }
+  }
 
   return Status::OK();
 }
diff --git a/tensorflow/core/graph/mkl_layout_pass_test.cc b/tensorflow/core/graph/mkl_layout_pass_test.cc
index 6e72baf84e2f931af6688f66e424f17b0e1eb251..3c4a5263afd3817907ede7f14c9b433de5fce83c 100644
--- a/tensorflow/core/graph/mkl_layout_pass_test.cc
+++ b/tensorflow/core/graph/mkl_layout_pass_test.cc
@@ -39,7 +39,11 @@ limitations under the License.
 namespace tensorflow {
 namespace {
 
-static void InitGraph(const string& s, Graph* graph) {
+const char kCPUDevice[] = "/job:a/replica:0/task:0/cpu:0";
+const char kGPUDevice[] = "/job:a/replica:0/task:0/gpu:0";
+
+static void InitGraph(const string& s, Graph* graph,
+                      const string& device = kCPUDevice) {
   GraphDef graph_def;
 
   auto parser = protobuf::TextFormat::Parser();
@@ -47,14 +51,18 @@ static void InitGraph(const string& s, Graph* graph) {
   CHECK(parser.MergeFromString(s, &graph_def)) << s;
   GraphConstructorOptions opts;
   TF_CHECK_OK(ConvertGraphDefToGraph(opts, graph_def, graph));
+
+  for (Node* node : graph->nodes()) {
+    node->set_assigned_device_name(device);
+  }
 }
 
 class MklLayoutPassTest : public ::testing::Test {
  public:
   MklLayoutPassTest() : graph_(OpRegistry::Global()) {}
 
-  void InitGraph(const string& s) {
-    ::tensorflow::InitGraph(s, &graph_);
+  void InitGraph(const string& s, const string& device = kCPUDevice) {
+    ::tensorflow::InitGraph(s, &graph_, device);
     original_ = CanonicalGraphString(&graph_);
   }
 
@@ -114,7 +122,8 @@ REGISTER_OP("InputList").Output("o: N * float").Attr("N: int").SetIsStateful();
 REGISTER_OP("HalfInput").Output("o: half").SetIsStateful();
 REGISTER_OP("Int32Input").Output("o: int32").SetIsStateful();
 REGISTER_OP("_MklInput").Output("o: uint8").SetIsStateful();
-REGISTER_OP("_MklInput2").Output("o: uint8").Output("o1: uint8").SetIsStateful();
+REGISTER_OP("_MklInput2").Output("o: uint8")
+                        .Output("o1: uint8").SetIsStateful();
 
 /////////////////////////////////////////////////////////////////////
 //  Unit tests related to node merge optiimization
@@ -162,8 +171,9 @@ TEST_F(MklLayoutPassTest, NodeMerge_Conv2DWithBias_Positive) {
       " input: ['E', 'Y']}");
   EXPECT_EQ(DoMklLayoutOptimizationPass(),
             "A(Input);B(Input);D(Input);DMT/_0(Const);E(_MklConv2DWithBias);"
-            "M(_MklInput);N(_MklInput);Y(Input);Z(Sub)|A->E;B->E:1;D->E:2;"
-            "DMT/_0->E:5;E->Z;M->E:3;N->E:4;Y->Z:1");
+            "M(_MklInput);N(_MklInput);Y(Input);Z(Sub)|A->E;"
+            "A:control->DMT/_0:control;B->E:1;D->E:2;DMT/_0->E:5;E->Z;M->E:3;"
+            "N->E:4;Y->Z:1");
 }
 
 // C=_MklConv2D(A,M:1,B,N:1); E=BiasAdd(C,D); Z=Sub(E,Y) (for interleaved)
@@ -194,8 +204,9 @@ TEST_F(MklLayoutPassTest, NodeMerge_Conv2DWithBias_Positive1) {
       " input: ['E', 'Y']}");
   EXPECT_EQ(DoMklLayoutOptimizationPass(),
             "A(Input);B(Input);D(Input);DMT/_0(Const);E(_MklConv2DWithBias);"
-            "M(_MklInput2);N(_MklInput2);Y(Input);Z(Sub)|A->E;B->E:1;D->E:2;"
-            "DMT/_0->E:5;E->Z;M:1->E:3;N:1->E:4;Y->Z:1");
+            "M(_MklInput2);N(_MklInput2);Y(Input);Z(Sub)|A->E;"
+            "A:control->DMT/_0:control;B->E:1;D->E:2;DMT/_0->E:5;E->Z;"
+            "M:1->E:3;N:1->E:4;Y->Z:1");
 }
 
 // C=Conv2D(A,B); E=BiasAdd(C,D); Z=Sub(E,Y);
@@ -226,8 +237,9 @@ TEST_F(MklLayoutPassTest, NodeMerge_Conv2DWithBias_Positive2) {
   EXPECT_EQ(DoMklLayoutOptimizationPass(),
             "A(Input);B(Input);D(Input);DMT/_0(Const);DMT/_1(Const);"
             "DMT/_2(Const);E(_MklConv2DWithBias);Y(Input);Z(Sub)|"
-            "A->E;B->E:1;D->E:2;DMT/_0->E:3;DMT/_1->E:4;DMT/_2->E:5;"
-            "E->Z;Y->Z:1");
+            "A->E;A:control->DMT/_0:control;A:control->DMT/_1:control;"
+            "A:control->DMT/_2:control;B->E:1;D->E:2;DMT/_0->E:3;DMT/_1->E:4;"
+            "DMT/_2->E:5;E->Z;Y->Z:1");
 }
 
 // Graph contains only _MklConv2D, no AddBias.
@@ -330,9 +342,6 @@ TEST_F(MklLayoutPassTest, NodeMerge_Conv2DWithBias_Negative_AttrMismatch) {
             "N(_MklInput)|A->C;B->C:1;C->E;D->E:1;M->C:2;N->C:3");
 }
 
-// Disabling Conv2DBackpropBias test for now as we have disabled rewrite
-// of BiasAddGrad into BackpropBias
-#if 0
 // Test set 2: _MklConv2D..BiasAddGrad -> _MklConv2DWithBiasBackpropBias
 // rewrite tests
 
@@ -361,18 +370,17 @@ TEST_F(MklLayoutPassTest, NodeMerge_Conv2DBackprop_Positive) {
       " input: ['E'] }");
   EXPECT_EQ(DoMklLayoutOptimizationPass(),
             "A(Input);B(Input);C(Input);D(_MklConv2DWithBias);DMT/_0(Const);"
-            "E(Sub);F(_MklConv2DWithBiasBackpropBias);M(_MklInput);N(_MklInput);"
-            "O(_MklInput)|A->D;A->E:1;B->D:1;C->D:2;D->E;DMT/_0->F:1;E->F;"
-            "M->D:3;N->D:4;O->D:5");
+            "E(Sub);F(_MklConv2DWithBiasBackpropBias);M(_MklInput);"
+            "N(_MklInput);O(_MklInput)|A->D;A->E:1;B->D:1;C->D:2;D->E;"
+            "DMT/_0->F:1;E->F;E:control->DMT/_0:control;M->D:3;N->D:4;"
+            "O->D:5");
 }
-#endif
 
-// No _MklConv2D in context, but Conv2D in context.
-// Only Conv2D would be rewritten to _MklConv2D, but no rewrite
-// for BiasAddGrad should happen.
+// No _MklConv2DWithBias in context, but _MklConv2D in context.
+// No rewrite for BiasAddGrad should happen.
 // C=_MklConv2D(A,M,B,N); D=Sub(C,A); E=BiasAddGrad(D) (for interleaved)
 // C=_MklConv2D(A,B,M,N); D=Sub(C,A); E=BiasAddGrad(D) (for contiguous)
-TEST_F(MklLayoutPassTest, NodeMerge_Conv2DBackprop_Neg_No_MklConv2DWithBias) {
+TEST_F(MklLayoutPassTest, NodeMerge_Conv2DBackprop_Neg_NoMklConv2DWithBias) {
   InitGraph(
       "node { name: 'A' op: 'Input'}"
       "node { name: 'B' op: 'Input'}"
@@ -507,8 +515,10 @@ TEST_F(MklLayoutPassTest, NodeRewrite_Conv2D_Basic) {
       "node { name: 'D' op: 'Mul' attr { key: 'T' value { type: DT_FLOAT } }"
       " input: ['B', 'C'] }");
   EXPECT_EQ(DoMklLayoutOptimizationPass(),
-            "A(Input);B(Input);C(_MklConv2D);D(Mul);DMT/_0(Const);DMT/_1(Const)|"
-            "A->C;B->C:1;B->D;C->D:1;DMT/_0->C:2;DMT/_1->C:3");
+            "A(Input);B(Input);C(_MklConv2D);D(Mul);DMT/_0(Const);"
+            "DMT/_1(Const)|A->C;A:control->DMT/_0:control;"
+            "A:control->DMT/_1:control;B->C:1;B->D;C->D:1;DMT/_0->C:2;"
+            "DMT/_1->C:3");
 }
 
 // 2 Conv2D Ops in sequence. Both should get transformed and 1st Conv2D will
@@ -535,7 +545,9 @@ TEST_F(MklLayoutPassTest, NodeRewrite_Conv2D_Positive1) {
       " input: ['C', 'D'] }");
   EXPECT_EQ(DoMklLayoutOptimizationPass(),
             "A(Input);B(Input);C(_MklConv2D);D(_MklConv2D);DMT/_0(Const);"
-            "DMT/_1(Const);DMT/_2(Const);E(Mul)|A->C;A->D;B->C:1;C->D:1;C->E;"
+            "DMT/_1(Const);DMT/_2(Const);E(Mul)|A->C;A->D;"
+            "A:control->DMT/_0:control;A:control->DMT/_1:control;"
+            "A:control->DMT/_2:control;B->C:1;C->D:1;C->E;"
             "C:1->D:3;D->E:1;DMT/_0->C:2;DMT/_1->C:3;DMT/_2->D:2");
 }
 
@@ -558,6 +570,50 @@ TEST_F(MklLayoutPassTest, NodeRewrite_Conv2D_Negative_UnsupportedType) {
             "A->C;B->C:1;B->D;C->D:1");
 }
 
+TEST_F(MklLayoutPassTest, NodeRewrite_Conv2DGradFilter_Positive) {
+  InitGraph(
+      "node { name: 'A' op: 'Input'}"
+      "node { name: 'B' op: 'Int32Input'}"
+      "node { name: 'C' op: 'Input'}"
+      "node { name: 'D' op: 'Conv2DBackpropFilter'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'data_format'      value { s: 'NCHW' } }"
+      " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
+      " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
+      " attr { key: 'padding'          value { s: 'SAME' } }"
+      " input: ['A', 'B', 'C']}"
+      "node { name: 'E' op: 'Mul' attr { key: 'T' value { type: DT_FLOAT } }"
+      " input: ['A', 'D'] }");
+  EXPECT_EQ(DoMklLayoutOptimizationPass(),
+            "A(Input);B(Int32Input);C(Input);D(_MklConv2DBackpropFilter);"
+            "DMT/_0(Const);DMT/_1(Const);DMT/_2(Const);E(Mul)|"
+            "A->D;A->E;A:control->DMT/_0:control;A:control->DMT/_1:control;"
+            "A:control->DMT/_2:control;B->D:1;C->D:2;D->E:1;DMT/_0->D:3;"
+            "DMT/_1->D:4;DMT/_2->D:5");
+}
+
+TEST_F(MklLayoutPassTest, NodeRewrite_Conv2DGradInput_Positive) {
+  InitGraph(
+      "node { name: 'A' op: 'Input'}"
+      "node { name: 'B' op: 'Int32Input'}"
+      "node { name: 'C' op: 'Input'}"
+      "node { name: 'D' op: 'Conv2DBackpropInput'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'data_format'      value { s: 'NCHW' } }"
+      " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
+      " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
+      " attr { key: 'padding'          value { s: 'SAME' } }"
+      " input: ['B', 'A', 'C']}"
+      "node { name: 'E' op: 'Mul' attr { key: 'T' value { type: DT_FLOAT } }"
+      " input: ['A', 'D'] }");
+  EXPECT_EQ(DoMklLayoutOptimizationPass(),
+            "A(Input);B(Int32Input);C(Input);D(_MklConv2DBackpropInput);"
+            "DMT/_0(Const);DMT/_1(Const);DMT/_2(Const);E(Mul)|"
+            "A->D:1;A->E;B->D;B:control->DMT/_0:control;"
+            "B:control->DMT/_1:control;B:control->DMT/_2:control;C->D:2;"
+            "D->E:1;DMT/_0->D:3;DMT/_1->D:4;DMT/_2->D:5");
+}
+
 // Concat Op test: Concat with no Mkl layer feeding it
 TEST_F(MklLayoutPassTest, NodeRewrite_Concat_Basic) {
   InitGraph(
@@ -572,13 +628,14 @@ TEST_F(MklLayoutPassTest, NodeRewrite_Concat_Basic) {
       "node { name: 'D' op: 'Concat'"
       " attr { key: 'T'                value { type: DT_FLOAT } }"
       " attr { key: 'N'                value { i: 2 } }"
-      " input: ['A', 'B']}"
+      " input: ['A', 'B:0', 'B:1']}"
       "node { name: 'E' op: 'Mul' attr { key: 'T' value { type: DT_FLOAT } }"
       " input: ['C', 'D'] }");
   EXPECT_EQ(DoMklLayoutOptimizationPass(),
             "A(Const);B(InputList);C(Input);D(_MklConcat);DMT/_0(Const);"
-            "DMT/_1(Const);DMT/_2(Const);E(Mul)|A->D;B->D:1;B->D:2;C->E;"
-            "D->E:1;DMT/_0->D:3;DMT/_1->D:4;DMT/_2->D:5");
+            "DMT/_1(Const);DMT/_2(Const);E(Mul)|A->D;A:control->DMT/_0:control;"
+            "A:control->DMT/_1:control;A:control->DMT/_2:control;B->D:1;"
+            "B:1->D:2;C->E;D->E:1;DMT/_0->D:3;DMT/_1->D:4;DMT/_2->D:5");
 }
 
 // Concat with 2 Mkl layers feeding it
@@ -616,9 +673,12 @@ TEST_F(MklLayoutPassTest, NodeRewrite_Concat_Input_Mkl) {
   EXPECT_EQ(DoMklLayoutOptimizationPass(),
             "A(Input);B(Input);C(Input);D(Input);DMT/_0(Const);DMT/_1(Const);"
             "DMT/_2(Const);DMT/_3(Const);DMT/_4(Const);E(_MklConv2D);"
-            "F(_MklConv2D);G(Const);H(_MklConcat);I(Mul)|A->E;A->I;B->E:1;C->F;"
+            "F(_MklConv2D);G(Const);H(_MklConcat);I(Mul)|A->E;A->I;"
+            "A:control->DMT/_2:control;A:control->DMT/_3:control;"
+            "B->E:1;C->F;C:control->DMT/_0:control;C:control->DMT/_1:control;"
             "D->F:1;DMT/_0->F:2;DMT/_1->F:3;DMT/_2->E:2;DMT/_3->E:3;"
-            "DMT/_4->H:3;E->H:1;E:1->H:4;F->H:2;F:1->H:5;G->H;H->I:1");
+            "DMT/_4->H:3;E->H:1;E:1->H:4;F->H:2;F:1->H:5;G->H;"
+            "G:control->DMT/_4:control;H->I:1");
 }
 
 // Concat with 1 Mkl and 1 non-Mkl layer feeding it
@@ -651,12 +711,12 @@ TEST_F(MklLayoutPassTest, NodeRewrite_Concat_Input_MixedMkl) {
   EXPECT_EQ(DoMklLayoutOptimizationPass(),
             "A(Input);B(Input);C(Input);D(Input);DMT/_0(Const);DMT/_1(Const);"
             "DMT/_2(Const);DMT/_3(Const);E(_MklConv2D);F(Mul);G(Const);"
-            "H(_MklConcat);I(Mul)|A->E;A->I;B->E:1;C->F;D->F:1;DMT/_0->E:2;"
+            "H(_MklConcat);I(Mul)|A->E;A->I;A:control->DMT/_0:control;"
+            "A:control->DMT/_1:control;B->E:1;C->F;D->F:1;DMT/_0->E:2;"
             "DMT/_1->E:3;DMT/_2->H:3;DMT/_3->H:5;E->H:1;E:1->H:4;F->H:2;"
-            "G->H;H->I:1");
+            "G->H;G:control->DMT/_2:control;G:control->DMT/_3:control;H->I:1");
 }
 
-#if 0
 // ConcatV2 Op test: ConcatV2 with no Mkl layer feeding it
 TEST_F(MklLayoutPassTest, NodeRewrite_ConcatV2_Basic) {
   InitGraph(
@@ -676,11 +736,12 @@ TEST_F(MklLayoutPassTest, NodeRewrite_ConcatV2_Basic) {
       "node { name: 'E' op: 'Mul' attr { key: 'T' value { type: DT_FLOAT } }"
       " input: ['C', 'D'] }");
   EXPECT_EQ(DoMklLayoutOptimizationPass(),
-            "A(Const);B(InputList);C(Input);D(_MklConcat);DMT/_0(Const);"
-            "DMT/_1(Const);DMT/_2(Const);E(Mul)|A->D:2;B->D;B:1->D:1;C->E;"
-            "D->E:1;DMT/_0->D:3;DMT/_1->D:4;DMT/_2->D:5");
+            "A(Const);B(InputList);C(Input);D(_MklConcatV2);DMT/_0(Const);"
+            "DMT/_1(Const);DMT/_2(Const);E(Mul)|A->D:2;B->D;B:1->D:1;"
+            "B:control->DMT/_0:control;B:control->DMT/_1:control;"
+            "B:control->DMT/_2:control;C->E;D->E:1;DMT/_0->D:3;"
+            "DMT/_1->D:4;DMT/_2->D:5");
 }
-#endif
 
 // ConcatV2 with 2 Mkl layers feeding it
 TEST_F(MklLayoutPassTest, NodeRewrite_ConcatV2_Input_Mkl) {
@@ -718,9 +779,12 @@ TEST_F(MklLayoutPassTest, NodeRewrite_ConcatV2_Input_Mkl) {
   EXPECT_EQ(DoMklLayoutOptimizationPass(),
             "A(Input);B(Input);C(Input);D(Input);DMT/_0(Const);DMT/_1(Const);"
             "DMT/_2(Const);DMT/_3(Const);DMT/_4(Const);E(_MklConv2D);"
-            "F(_MklConv2D);G(Const);H(_MklConcatV2);I(Mul)|A->E;A->I;B->E:1;C->F;"
+            "F(_MklConv2D);G(Const);H(_MklConcatV2);I(Mul)|A->E;A->I;"
+            "A:control->DMT/_2:control;A:control->DMT/_3:control;B->E:1;C->F;"
+            "C:control->DMT/_0:control;C:control->DMT/_1:control;"
             "D->F:1;DMT/_0->F:2;DMT/_1->F:3;DMT/_2->E:2;DMT/_3->E:3;"
-            "DMT/_4->H:5;E->H;E:1->H:3;F->H:1;F:1->H:4;G->H:2;H->I:1");
+            "DMT/_4->H:5;E->H;E:1->H:3;E:control->DMT/_4:control;F->H:1;"
+            "F:1->H:4;G->H:2;H->I:1");
 }
 
 // ConcatV2 with 1 Mkl and 1 non-Mkl layer feeding it
@@ -754,11 +818,175 @@ TEST_F(MklLayoutPassTest, NodeRewrite_ConcatV2_Input_MixedMkl) {
   EXPECT_EQ(DoMklLayoutOptimizationPass(),
             "A(Input);B(Input);C(Input);D(Input);DMT/_0(Const);DMT/_1(Const);"
             "DMT/_2(Const);DMT/_3(Const);E(_MklConv2D);F(Mul);G(Const);"
-            "H(_MklConcatV2);I(Mul)|A->E;A->I;B->E:1;C->F;D->F:1;DMT/_0->E:2;"
-            "DMT/_1->E:3;DMT/_2->H:4;DMT/_3->H:5;E->H;E:1->H:3;F->H:1;"
+            "H(_MklConcatV2);I(Mul)|A->E;A->I;A:control->DMT/_0:control;"
+            "A:control->DMT/_1:control;B->E:1;C->F;D->F:1;DMT/_0->E:2;"
+            "DMT/_1->E:3;DMT/_2->H:4;DMT/_3->H:5;E->H;E:1->H:3;"
+            "E:control->DMT/_2:control;E:control->DMT/_3:control;F->H:1;"
             "G->H:2;H->I:1");
 }
 
+TEST_F(MklLayoutPassTest, NodeRewrite_Relu_Positive) {
+  InitGraph(
+      "node { name: 'A' op: 'Input'}"
+      "node { name: 'B' op: 'Relu'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " input: ['A'] }"
+      "node { name: 'C' op: 'Mul' attr { key: 'T' value { type: DT_FLOAT } }"
+      " input: ['A', 'B'] }");
+  EXPECT_EQ(DoMklLayoutOptimizationPass(),
+            "A(Input);B(_MklRelu);C(Mul);DMT/_0(Const)|A->B;A->C;"
+            "A:control->DMT/_0:control;B->C:1;DMT/_0->B:1");
+}
+
+TEST_F(MklLayoutPassTest, NodeRewrite_ReluGrad_Positive) {
+  InitGraph(
+      "node { name: 'A' op: 'Input'}"
+      "node { name: 'B' op: 'Input'}"
+      "node { name: 'C' op: 'ReluGrad'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " input: ['A', 'B'] }"
+      "node { name: 'D' op: 'Mul' attr { key: 'T' value { type: DT_FLOAT } }"
+      " input: ['A', 'C'] }");
+  EXPECT_EQ(DoMklLayoutOptimizationPass(),
+            "A(Input);B(Input);C(_MklReluGrad);D(Mul);DMT/_0(Const);"
+            "DMT/_1(Const)|A->C;A->D;A:control->DMT/_0:control;"
+            "A:control->DMT/_1:control;B->C:1;C->D:1;DMT/_0->C:2;DMT/_1->C:3");
+}
+
+TEST_F(MklLayoutPassTest, NodeRewrite_ReluReluGrad_Positive) {
+  InitGraph(
+      "node { name: 'A' op: 'Input'}"
+      "node { name: 'B' op: 'Relu'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " input: ['A'] }"
+      "node { name: 'C' op: 'ReluGrad'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " input: ['A', 'B'] }"
+      "node { name: 'D' op: 'Mul' attr { key: 'T' value { type: DT_FLOAT } }"
+      " input: ['A', 'C'] }");
+  EXPECT_EQ(DoMklLayoutOptimizationPass(),
+            "A(Input);B(_MklRelu);C(_MklReluGrad);D(Mul);DMT/_0(Const);"
+            "DMT/_1(Const)|A->B;A->C;A->D;A:control->DMT/_0:control;"
+            "A:control->DMT/_1:control;B->C:1;B:1->C:3;C->D:1;DMT/_0->B:1;"
+            "DMT/_1->C:2");
+}
+
+TEST_F(MklLayoutPassTest, NodeRewrite_AvgPool_Positive) {
+  InitGraph(
+      "node { name: 'A' op: 'Input'}"
+      "node { name: 'B' op: 'AvgPool'"
+      " attr { key: 'T'            value { type: DT_FLOAT } }"
+      " attr { key: 'data_format'  value { s: 'NCHW' } }"
+      " attr { key: 'ksize'        value { list: {i: 1, i:1, i:3, i:3} } }"
+      " attr { key: 'padding'      value { s: 'VALID' } }"
+      " attr { key: 'strides'      value { list: {i: 1, i:1, i:2, i:2} } }"
+      " input: ['A'] }"
+      "node { name: 'C' op: 'Mul' attr { key: 'T' value { type: DT_FLOAT } }"
+      " input: ['A', 'B'] }");
+  EXPECT_EQ(DoMklLayoutOptimizationPass(),
+            "A(Input);B(_MklAvgPool);C(Mul);DMT/_0(Const)|A->B;A->C;"
+            "A:control->DMT/_0:control;B->C:1;DMT/_0->B:1");
+}
+
+TEST_F(MklLayoutPassTest, NodeRewrite_AvgPoolGrad_Positive) {
+  InitGraph(
+      "node { name: 'A' op: 'Int32Input'}"
+      "node { name: 'B' op: 'Input'}"
+      "node { name: 'C' op: 'AvgPoolGrad' "
+      " attr { key: 'T'            value { type: DT_FLOAT } }"
+      " attr { key: 'data_format'  value { s: 'NCHW' } }"
+      " attr { key: 'ksize'        value { list: {i: 1, i:1, i:3, i:3} } }"
+      " attr { key: 'padding'      value { s: 'VALID' } }"
+      " attr { key: 'strides'      value { list: {i: 1, i:1, i:2, i:2} } }"
+      " input: ['A', 'B'] }"
+      "node { name: 'D' op: 'Mul' attr { key: 'T' value { type: DT_FLOAT } }"
+      " input: ['B', 'C'] }");
+  EXPECT_EQ(DoMklLayoutOptimizationPass(),
+            "A(Int32Input);B(Input);C(_MklAvgPoolGrad);D(Mul);DMT/_0(Const);"
+            "DMT/_1(Const)|A->C;A:control->DMT/_0:control;"
+            "A:control->DMT/_1:control;B->C:1;B->D;C->D:1;DMT/_0->C:2;"
+            "DMT/_1->C:3");
+}
+
+TEST_F(MklLayoutPassTest, NodeRewrite_AvgPoolAvgPoolGrad_Positive) {
+  InitGraph(
+      "node { name: 'A' op: 'Input'}"
+      "node { name: 'I' op: 'Int32Input'}"
+      "node { name: 'B' op: 'AvgPool'"
+      " attr { key: 'T'            value { type: DT_FLOAT } }"
+      " attr { key: 'data_format'  value { s: 'NCHW' } }"
+      " attr { key: 'ksize'        value { list: {i: 1, i:1, i:3, i:3} } }"
+      " attr { key: 'padding'      value { s: 'VALID' } }"
+      " attr { key: 'strides'      value { list: {i: 1, i:1, i:2, i:2} } }"
+      " input: ['A'] }"
+      "node { name: 'C' op: 'AvgPoolGrad' "
+      " attr { key: 'T'            value { type: DT_FLOAT } }"
+      " attr { key: 'data_format'  value { s: 'NCHW' } }"
+      " attr { key: 'ksize'        value { list: {i: 1, i:1, i:3, i:3} } }"
+      " attr { key: 'padding'      value { s: 'VALID' } }"
+      " attr { key: 'strides'      value { list: {i: 1, i:1, i:2, i:2} } }"
+      " input: ['I', 'B'] }"
+      "node { name: 'D' op: 'Mul' attr { key: 'T' value { type: DT_FLOAT } }"
+      " input: ['A', 'C'] }");
+  EXPECT_EQ(DoMklLayoutOptimizationPass(),
+            "A(Input);B(_MklAvgPool);C(_MklAvgPoolGrad);D(Mul);DMT/_0(Const);"
+            "DMT/_1(Const);I(Int32Input)|A->B;A->D;A:control->DMT/_0:control;"
+            "B->C:1;B:1->C:3;C->D:1;DMT/_0->B:1;DMT/_1->C:2;I->C;"
+            "I:control->DMT/_1:control");
+}
+
+TEST_F(MklLayoutPassTest, NodeRewrite_FusedBatchNormGrad_Positive) {
+  InitGraph(
+      "node { name: 'A' op: 'Input'}"
+      "node { name: 'B' op: 'Input'}"
+      "node { name: 'C' op: 'Input'}"
+      "node { name: 'D' op: 'Input'}"
+      "node { name: 'E' op: 'Input'}"
+      "node { name: 'F' op: 'FusedBatchNormGrad'"
+      " attr { key: 'T'            value { type: DT_FLOAT } }"
+      " attr { key: 'data_format'  value { s: 'NCHW' } }"
+      " attr { key: 'epsilon'      value { f: 0.0001 } }"
+      " attr { key: 'is_training'  value { b: true } }"
+      " input: ['A', 'B', 'C', 'D', 'E'] }"
+      "node { name: 'G' op: 'Mul' attr { key: 'T' value { type: DT_FLOAT } }"
+      " input: ['A', 'F'] }");
+  EXPECT_EQ(DoMklLayoutOptimizationPass(),
+            "A(Input);B(Input);C(Input);D(Input);DMT/_0(Const);DMT/_1(Const);"
+            "DMT/_2(Const);DMT/_3(Const);DMT/_4(Const);E(Input);"
+            "F(_MklFusedBatchNormGrad);G(Mul)|A->F;A->G;"
+            "A:control->DMT/_0:control;A:control->DMT/_1:control;"
+            "A:control->DMT/_2:control;A:control->DMT/_3:control;"
+            "A:control->DMT/_4:control;B->F:1;C->F:2;D->F:3;"
+            "DMT/_0->F:5;DMT/_1->F:6;DMT/_2->F:7;DMT/_3->F:8;DMT/_4->F:9;"
+            "E->F:4;F->G:1");
+}
+
+TEST_F(MklLayoutPassTest, NodeRewrite_FusedBatchNorm_Positive) {
+  InitGraph(
+      "node { name: 'A' op: 'Input'}"
+      "node { name: 'B' op: 'Input'}"
+      "node { name: 'C' op: 'Input'}"
+      "node { name: 'D' op: 'Input'}"
+      "node { name: 'E' op: 'Input'}"
+      "node { name: 'F' op: 'FusedBatchNorm'"
+      " attr { key: 'T'            value { type: DT_FLOAT } }"
+      " attr { key: 'data_format'  value { s: 'NCHW' } }"
+      " attr { key: 'epsilon'      value { f: 0.0001 } }"
+      " attr { key: 'is_training'  value { b: true } }"
+      " input: ['A', 'B', 'C', 'D', 'E'] }"
+      "node { name: 'G' op: 'Mul' attr { key: 'T' value { type: DT_FLOAT } }"
+      " input: ['A', 'F'] }");
+  EXPECT_EQ(DoMklLayoutOptimizationPass(),
+            "A(Input);B(Input);C(Input);D(Input);DMT/_0(Const);DMT/_1(Const);"
+            "DMT/_2(Const);DMT/_3(Const);DMT/_4(Const);E(Input);"
+            "F(_MklFusedBatchNorm);G(Mul)|A->F;A->G;"
+            "A:control->DMT/_0:control;A:control->DMT/_1:control;"
+            "A:control->DMT/_2:control;A:control->DMT/_3:control;"
+            "A:control->DMT/_4:control;B->F:1;C->F:2;D->F:3;"
+            "DMT/_0->F:5;DMT/_1->F:6;DMT/_2->F:7;DMT/_3->F:8;DMT/_4->F:9;"
+            "E->F:4;F->G:1");
+}
+
 /////////////////////////////////////////////////////////////////////
 //  Unit tests related to rewriting node for workspace edges
 /////////////////////////////////////////////////////////////////////
@@ -802,13 +1030,13 @@ TEST_F(MklLayoutPassTest, MaxPoolLRN_Positive) {
       "node { name: 'H' op: 'Input'}"
       "node { name: 'I' op: 'Mul' attr { key: 'T' value { type: DT_FLOAT } }"
       " input: ['H', 'G'] }");
-  EXPECT_EQ(
-      DoMklLayoutOptimizationPass(),
+  EXPECT_EQ(DoMklLayoutOptimizationPass(),
       "A(Input);B(_MklLRN);C(_MklMaxPool);D(Input);DMT/_0(Const);DMT/_1(Const);"
-      "DMT/_2(Const);E(_MklMaxPoolGrad);F(Input);G(_MklLRNGrad);H(Input);I(Mul)|"
-      "A->B;B->C;B->E;B->G:2;B:1->G:3;B:2->C:1;B:2->E:4;B:2->G:6;B:3->G:7;"
-      "C->E:1;C:1->E:3;C:2->E:5;C:3->E:7;D->E:2;DMT/_0->B:1;DMT/_1->E:6;"
-      "DMT/_2->G:5;E->G;E:1->G:4;F->G:1;G->I:1;H->I");
+      "DMT/_2(Const);E(_MklMaxPoolGrad);F(Input);G(_MklLRNGrad);H(Input);"
+      "I(Mul)|A->B;A:control->DMT/_0:control;B->C;B->E;B->G:2;B:1->G:3;"
+      "B:2->C:1;B:2->E:4;B:2->G:6;B:3->G:7;B:control->DMT/_1:control;C->E:1;"
+      "C:1->E:3;C:2->E:5;C:3->E:7;D->E:2;DMT/_0->B:1;DMT/_1->E:6;DMT/_2->G:5;"
+      "E->G;E:1->G:4;E:control->DMT/_2:control;F->G:1;G->I:1;H->I");
 }
 
 /* Test LRN->LRNGrad replacement by workspace nodes. */
@@ -838,8 +1066,9 @@ TEST_F(MklLayoutPassTest, LRN_Positive) {
   EXPECT_EQ(DoMklLayoutOptimizationPass(),
             "A(Input);B(_MklLRN);C(Input);D(Input);DMT/_0(Const);DMT/_1(Const);"
             "DMT/_2(Const);E(_MklLRNGrad);F(Mul)|"
-            "A->B;B->E:2;B:1->E:3;B:2->E:6;B:3->E:7;C->E;C->F;D->E:1;"
-            "DMT/_0->B:1;DMT/_1->E:4;DMT/_2->E:5;E->F:1");
+            "A->B;A:control->DMT/_0:control;B->E:2;B:1->E:3;B:2->E:6;B:3->E:7;"
+            "C->E;C->F;C:control->DMT/_1:control;C:control->DMT/_2:control;"
+            "D->E:1;DMT/_0->B:1;DMT/_1->E:4;DMT/_2->E:5;E->F:1");
 }
 
 /* Test LRN->LRNGrad replacement when only one of them is present. */
@@ -858,7 +1087,7 @@ TEST_F(MklLayoutPassTest, LRN_Negative1) {
       " input: ['A', 'B'] }");
   EXPECT_EQ(DoMklLayoutOptimizationPass(),
             "A(Input);B(_MklLRN);C(Mul);DMT/_0(Const)|"
-            "A->B;A->C;B->C:1;DMT/_0->B:1");
+            "A->B;A->C;A:control->DMT/_0:control;B->C:1;DMT/_0->B:1");
 }
 
 /* Test LRN->LRNGrad replacement when only one of them is present. */
@@ -880,8 +1109,10 @@ TEST_F(MklLayoutPassTest, LRN_Negative2) {
   EXPECT_EQ(DoMklLayoutOptimizationPass(),
             "A(Input);B(Input);C(Input);D(_MklLRNGrad);DMT/_0(Const);"
             "DMT/_1(Const);DMT/_2(Const);DMT/_3(Const);DMT/_4(Const);E(Mul)|"
-            "A->D;A->E;B->D:1;C->D:2;D->E:1;DMT/_0->D:3;DMT/_1->D:7;"
-            "DMT/_2->D:4;DMT/_3->D:5;DMT/_4->D:6");
+            "A->D;A->E;A:control->DMT/_0:control;A:control->DMT/_1:control;"
+            "A:control->DMT/_2:control;A:control->DMT/_3:control;"
+            "A:control->DMT/_4:control;B->D:1;C->D:2;D->E:1;DMT/_0->D:3;"
+            "DMT/_1->D:7;DMT/_2->D:4;DMT/_3->D:5;DMT/_4->D:6");
 }
 
 /* Test LRN->LRNGrad negative case, where single LRN feeds
@@ -920,9 +1151,13 @@ TEST_F(MklLayoutPassTest, LRN_Negative3) {
   EXPECT_EQ(DoMklLayoutOptimizationPass(),
             "A(Input);B(_MklLRN);C(Input);D(Input);DMT/_0(Const);DMT/_1(Const);"
             "DMT/_2(Const);DMT/_3(Const);DMT/_4(Const);DMT/_5(Const);"
-            "DMT/_6(Const);E(_MklLRNGrad);F(_MklLRNGrad);G(Mul)|A->B;B->E:2;"
-            "B->F:1;B:1->E:3;B:2->E:6;B:2->F:5;B:3->E:7;C->E;C->F;D->E:1;"
-            "D->F:2;DMT/_0->B:1;DMT/_1->F:3;DMT/_2->F:7;DMT/_3->F:4;"
+            "DMT/_6(Const);E(_MklLRNGrad);F(_MklLRNGrad);G(Mul)|A->B;"
+            "A:control->DMT/_0:control;B->E:2;"
+            "B->F:1;B:1->E:3;B:2->E:6;B:2->F:5;B:3->E:7;C->E;C->F;"
+            "C:control->DMT/_1:control;C:control->DMT/_2:control;"
+            "C:control->DMT/_3:control;C:control->DMT/_4:control;"
+            "C:control->DMT/_5:control;C:control->DMT/_6:control;"
+            "D->E:1;D->F:2;DMT/_0->B:1;DMT/_1->F:3;DMT/_2->F:7;DMT/_3->F:4;"
             "DMT/_4->F:6;DMT/_5->E:4;DMT/_6->E:5;E->G;F->G:1");
 }
 
@@ -951,8 +1186,9 @@ TEST_F(MklLayoutPassTest, NodeWorkspace_MaxPool_Positive) {
   EXPECT_EQ(DoMklLayoutOptimizationPass(),
             "A(Input);B(_MklMaxPool);C(Input);D(Input);DMT/_0(Const);"
             "DMT/_1(Const);DMT/_2(Const);E(_MklMaxPoolGrad);F(Mul)|"
-            "A->B;B->E:1;B:1->E:3;B:2->E:5;B:3->E:7;C->E;C->F;D->E:2;"
-            "DMT/_0->B:1;DMT/_1->E:4;DMT/_2->E:6;E->F:1");
+            "A->B;A:control->DMT/_0:control;B->E:1;B:1->E:3;B:2->E:5;B:3->E:7;"
+            "C->E;C->F;C:control->DMT/_1:control;C:control->DMT/_2:control;"
+            "D->E:2;DMT/_0->B:1;DMT/_1->E:4;DMT/_2->E:6;E->F:1");
 }
 
 // Test MaxPool>MaxPoolGrad replacement when only one of them is present.
@@ -972,7 +1208,7 @@ TEST_F(MklLayoutPassTest, NodeWorkspace_MaxPool_Negative1) {
       " input: ['A', 'B'] }");
   EXPECT_EQ(DoMklLayoutOptimizationPass(),
             "A(Input);B(_MklMaxPool);C(Mul);DMT/_0(Const)|"
-            "A->B;A->C;B->C:1;DMT/_0->B:1");
+            "A->B;A->C;A:control->DMT/_0:control;B->C:1;DMT/_0->B:1");
 }
 
 // Test MaxPoolGrad replacement when only one of them is present.
@@ -995,8 +1231,374 @@ TEST_F(MklLayoutPassTest, NodeWorkspace_MaxPool_Negative2) {
   EXPECT_EQ(DoMklLayoutOptimizationPass(),
             "A(Input);B(Input);C(Input);D(_MklMaxPoolGrad);DMT/_0(Const);"
             "DMT/_1(Const);DMT/_2(Const);DMT/_3(Const);DMT/_4(Const);E(Mul)|"
-            "A->D;A->E;B->D:1;C->D:2;D->E:1;DMT/_0->D:3;DMT/_1->D:7;"
-            "DMT/_2->D:4;DMT/_3->D:5;DMT/_4->D:6");
+            "A->D;A->E;A:control->DMT/_0:control;A:control->DMT/_1:control;"
+            "A:control->DMT/_2:control;A:control->DMT/_3:control;"
+            "A:control->DMT/_4:control;B->D:1;C->D:2;D->E:1;DMT/_0->D:3;"
+            "DMT/_1->D:7;DMT/_2->D:4;DMT/_3->D:5;DMT/_4->D:6");
+}
+
+// Test MaxPool handling for batch-wise pooling (NCHW)
+// No rewrite should take place in such case
+TEST_F(MklLayoutPassTest, NodeWorkspace_MaxPool_Negative3) {
+  InitGraph(
+      "node { name: 'A' op: 'Input'}"
+      "node { name: 'B' op: 'MaxPool'"
+      " attr { key: 'T'            value { type: DT_FLOAT } }"
+      " attr { key: 'data_format'  value { s: 'NCHW' } }"
+      " attr { key: 'ksize'        value { list: {i: 2, i:1, i:1, i:1} } }"
+      " attr { key: 'padding'      value { s: 'VALID' } }"
+      " attr { key: 'strides'      value { list: {i: 1, i:1, i:1, i:1} } }"
+      " input: ['A'] }"
+      "node { name: 'C' op: 'Mul' attr { key: 'T' value { type: DT_FLOAT } }"
+      " input: ['A', 'B'] }");
+  EXPECT_EQ(DoMklLayoutOptimizationPass(),
+            "A(Input);B(MaxPool);C(Mul)|A->B;A->C;B->C:1");
+}
+
+// Test MaxPool handling for batch-wise pooling (NCHW)
+// No rewrite should take place in such case
+TEST_F(MklLayoutPassTest, NodeWorkspace_MaxPool_Negative4) {
+  InitGraph(
+      "node { name: 'A' op: 'Input'}"
+      "node { name: 'B' op: 'MaxPool'"
+      " attr { key: 'T'            value { type: DT_FLOAT } }"
+      " attr { key: 'data_format'  value { s: 'NCHW' } }"
+      " attr { key: 'ksize'        value { list: {i: 1, i:1, i:1, i:1} } }"
+      " attr { key: 'padding'      value { s: 'VALID' } }"
+      " attr { key: 'strides'      value { list: {i: 2, i:1, i:1, i:1} } }"
+      " input: ['A'] }"
+      "node { name: 'C' op: 'Mul' attr { key: 'T' value { type: DT_FLOAT } }"
+      " input: ['A', 'B'] }");
+  EXPECT_EQ(DoMklLayoutOptimizationPass(),
+            "A(Input);B(MaxPool);C(Mul)|A->B;A->C;B->C:1");
+}
+
+// Test MaxPool handling for depth-wise pooling (NHWC)
+// No rewrite should take place in such case
+TEST_F(MklLayoutPassTest, NodeWorkspace_MaxPool_Negative5) {
+  InitGraph(
+      "node { name: 'A' op: 'Input'}"
+      "node { name: 'B' op: 'MaxPool'"
+      " attr { key: 'T'            value { type: DT_FLOAT } }"
+      " attr { key: 'data_format'  value { s: 'NCHW' } }"
+      " attr { key: 'ksize'        value { list: {i: 1, i:2, i:1, i:1} } }"
+      " attr { key: 'padding'      value { s: 'VALID' } }"
+      " attr { key: 'strides'      value { list: {i: 1, i:1, i:1, i:1} } }"
+      " input: ['A'] }"
+      "node { name: 'C' op: 'Mul' attr { key: 'T' value { type: DT_FLOAT } }"
+      " input: ['A', 'B'] }");
+  EXPECT_EQ(DoMklLayoutOptimizationPass(),
+            "A(Input);B(MaxPool);C(Mul)|A->B;A->C;B->C:1");
+}
+
+// Test MaxPool handling for depth-wise pooling (NCHW)
+// No rewrite should take place in such case
+TEST_F(MklLayoutPassTest, NodeWorkspace_MaxPool_Negative6) {
+  InitGraph(
+      "node { name: 'A' op: 'Input'}"
+      "node { name: 'B' op: 'MaxPool'"
+      " attr { key: 'T'            value { type: DT_FLOAT } }"
+      " attr { key: 'data_format'  value { s: 'NCHW' } }"
+      " attr { key: 'ksize'        value { list: {i: 1, i:1, i:1, i:1} } }"
+      " attr { key: 'padding'      value { s: 'VALID' } }"
+      " attr { key: 'strides'      value { list: {i: 1, i:2, i:1, i:1} } }"
+      " input: ['A'] }"
+      "node { name: 'C' op: 'Mul' attr { key: 'T' value { type: DT_FLOAT } }"
+      " input: ['A', 'B'] }");
+  EXPECT_EQ(DoMklLayoutOptimizationPass(),
+            "A(Input);B(MaxPool);C(Mul)|A->B;A->C;B->C:1");
+}
+
+// Test MaxPool handling for batch-wise pooling (NHWC)
+// No rewrite should take place in such case
+TEST_F(MklLayoutPassTest, NodeWorkspace_MaxPool_Negative7) {
+  InitGraph(
+      "node { name: 'A' op: 'Input'}"
+      "node { name: 'B' op: 'MaxPool'"
+      " attr { key: 'T'            value { type: DT_FLOAT } }"
+      " attr { key: 'data_format'  value { s: 'NHWC' } }"
+      " attr { key: 'ksize'        value { list: {i: 2, i:1, i:1, i:1} } }"
+      " attr { key: 'padding'      value { s: 'VALID' } }"
+      " attr { key: 'strides'      value { list: {i: 1, i:1, i:1, i:1} } }"
+      " input: ['A'] }"
+      "node { name: 'C' op: 'Mul' attr { key: 'T' value { type: DT_FLOAT } }"
+      " input: ['A', 'B'] }");
+  EXPECT_EQ(DoMklLayoutOptimizationPass(),
+            "A(Input);B(MaxPool);C(Mul)|A->B;A->C;B->C:1");
+}
+
+// Test MaxPool handling for batch-wise pooling (NHWC)
+// No rewrite should take place in such case
+TEST_F(MklLayoutPassTest, NodeWorkspace_MaxPool_Negative8) {
+  InitGraph(
+      "node { name: 'A' op: 'Input'}"
+      "node { name: 'B' op: 'MaxPool'"
+      " attr { key: 'T'            value { type: DT_FLOAT } }"
+      " attr { key: 'data_format'  value { s: 'NHWC' } }"
+      " attr { key: 'ksize'        value { list: {i: 1, i:1, i:1, i:1} } }"
+      " attr { key: 'padding'      value { s: 'VALID' } }"
+      " attr { key: 'strides'      value { list: {i: 2, i:1, i:1, i:1} } }"
+      " input: ['A'] }"
+      "node { name: 'C' op: 'Mul' attr { key: 'T' value { type: DT_FLOAT } }"
+      " input: ['A', 'B'] }");
+  EXPECT_EQ(DoMklLayoutOptimizationPass(),
+            "A(Input);B(MaxPool);C(Mul)|A->B;A->C;B->C:1");
+}
+
+// Test MaxPool handling for depth-wise pooling (NHWC)
+// No rewrite should take place in such case
+TEST_F(MklLayoutPassTest, NodeWorkspace_MaxPool_Negative9) {
+  InitGraph(
+      "node { name: 'A' op: 'Input'}"
+      "node { name: 'B' op: 'MaxPool'"
+      " attr { key: 'T'            value { type: DT_FLOAT } }"
+      " attr { key: 'data_format'  value { s: 'NHWC' } }"
+      " attr { key: 'ksize'        value { list: {i: 1, i:1, i:1, i:2} } }"
+      " attr { key: 'padding'      value { s: 'VALID' } }"
+      " attr { key: 'strides'      value { list: {i: 1, i:1, i:1, i:1} } }"
+      " input: ['A'] }"
+      "node { name: 'C' op: 'Mul' attr { key: 'T' value { type: DT_FLOAT } }"
+      " input: ['A', 'B'] }");
+  EXPECT_EQ(DoMklLayoutOptimizationPass(),
+            "A(Input);B(MaxPool);C(Mul)|A->B;A->C;B->C:1");
+}
+
+// Test MaxPool handling for depth-wise pooling (NHWC)
+// No rewrite should take place in such case
+TEST_F(MklLayoutPassTest, NodeWorkspace_MaxPool_Negative10) {
+  InitGraph(
+      "node { name: 'A' op: 'Input'}"
+      "node { name: 'B' op: 'MaxPool'"
+      " attr { key: 'T'            value { type: DT_FLOAT } }"
+      " attr { key: 'data_format'  value { s: 'NHWC' } }"
+      " attr { key: 'ksize'        value { list: {i: 1, i:1, i:1, i:1} } }"
+      " attr { key: 'padding'      value { s: 'VALID' } }"
+      " attr { key: 'strides'      value { list: {i: 1, i:1, i:1, i:2} } }"
+      " input: ['A'] }"
+      "node { name: 'C' op: 'Mul' attr { key: 'T' value { type: DT_FLOAT } }"
+      " input: ['A', 'B'] }");
+  EXPECT_EQ(DoMklLayoutOptimizationPass(),
+            "A(Input);B(MaxPool);C(Mul)|A->B;A->C;B->C:1");
+}
+
+/////////////////////////////////////////////////////////////////////
+
+// Single Conv2D Op on GPU device
+// No rewrite should happen
+TEST_F(MklLayoutPassTest, NodeRewrite_Conv2D_DeviceTest) {
+  InitGraph(
+      "node { name: 'A' op: 'Input'}"
+      "node { name: 'B' op: 'Input'}"
+      "node { name: 'C' op: 'Conv2D'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'data_format'      value { s: 'NCHW' } }"
+      " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
+      " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
+      " attr { key: 'padding'          value { s: 'SAME' } }"
+      " input: ['A', 'B']}"
+      "node { name: 'D' op: 'Mul' attr { key: 'T' value { type: DT_FLOAT } }"
+      " input: ['B', 'C'] }", kGPUDevice);
+  EXPECT_EQ(DoMklLayoutOptimizationPass(),
+            "A(Input);B(Input);C(Conv2D);D(Mul)|A->C;B->C:1;B->D;C->D:1");
+}
+
+TEST_F(MklLayoutPassTest, NodeMerge_Conv2DBackprop_DeviceTest) {
+  InitGraph(
+      "node { name: 'A' op: 'Input'}"
+      "node { name: 'B' op: 'Input'}"
+      "node { name: 'C' op: 'Input'}"
+      "node { name: 'M' op: '_MklInput'}"
+      "node { name: 'N' op: '_MklInput'}"
+      "node { name: 'O' op: '_MklInput'}"
+      "node { name: 'D' op: '_MklConv2DWithBias'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'data_format'      value { s: 'NCHW' } }"
+      " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
+      " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
+      " attr { key: 'padding'          value { s: 'SAME' } }"
+      " input: ['A', 'B', 'C', 'M', 'N', 'O']}"
+      "node { name: 'E' op: 'Sub'"
+      " attr {key: 'T'                 value { type: DT_FLOAT } }"
+      " input: ['D', 'A']}"
+      "node { name: 'F' op: 'BiasAddGrad'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'data_format'      value { s: 'NCHW' } }"
+      " input: ['E'] }", kGPUDevice);
+  EXPECT_EQ(DoMklLayoutOptimizationPass(),
+            "A(Input);B(Input);C(Input);D(_MklConv2DWithBias);"
+            "E(Sub);F(BiasAddGrad);M(_MklInput);N(_MklInput);"
+            "O(_MklInput)|A->D;A->E:1;B->D:1;C->D:2;D->E;E->F;"
+            "M->D:3;N->D:4;O->D:5");
+}
+
+TEST_F(MklLayoutPassTest, NodeRewrite_Conv2DGradFilter_DeviceTest) {
+  InitGraph(
+      "node { name: 'A' op: 'Input'}"
+      "node { name: 'B' op: 'Int32Input'}"
+      "node { name: 'C' op: 'Input'}"
+      "node { name: 'D' op: 'Conv2DBackpropFilter'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'data_format'      value { s: 'NCHW' } }"
+      " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
+      " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
+      " attr { key: 'padding'          value { s: 'SAME' } }"
+      " input: ['A', 'B', 'C']}"
+      "node { name: 'E' op: 'Mul' attr { key: 'T' value { type: DT_FLOAT } }"
+      " input: ['A', 'D'] }", kGPUDevice);
+  EXPECT_EQ(DoMklLayoutOptimizationPass(),
+            "A(Input);B(Int32Input);C(Input);D(Conv2DBackpropFilter);E(Mul)|"
+            "A->D;A->E;B->D:1;C->D:2;D->E:1");
+}
+
+TEST_F(MklLayoutPassTest, NodeRewrite_Relu_DeviceTest) {
+  InitGraph(
+      "node { name: 'A' op: 'Input'}"
+      "node { name: 'B' op: 'Relu'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " input: ['A'] }"
+      "node { name: 'C' op: 'Mul' attr { key: 'T' value { type: DT_FLOAT } }"
+      " input: ['A', 'B'] }", kGPUDevice);
+  EXPECT_EQ(DoMklLayoutOptimizationPass(),
+            "A(Input);B(Relu);C(Mul)|A->B;A->C;B->C:1");
+}
+
+TEST_F(MklLayoutPassTest, NodeRewrite_ReluGrad_DeviceTest) {
+  InitGraph(
+      "node { name: 'A' op: 'Input'}"
+      "node { name: 'B' op: 'Input'}"
+      "node { name: 'C' op: 'ReluGrad'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " input: ['A', 'B'] }"
+      "node { name: 'D' op: 'Mul' attr { key: 'T' value { type: DT_FLOAT } }"
+      " input: ['A', 'C'] }", kGPUDevice);
+  EXPECT_EQ(DoMklLayoutOptimizationPass(),
+            "A(Input);B(Input);C(ReluGrad);D(Mul)|A->C;A->D;B->C:1;C->D:1");
+}
+
+TEST_F(MklLayoutPassTest, NodeRewrite_MaxPool_DeviceTest) {
+  InitGraph(
+      "node { name: 'A' op: 'Input'}"
+      "node { name: 'B' op: 'MaxPool'"
+      " attr { key: 'T'            value { type: DT_FLOAT } }"
+      " attr { key: 'data_format'  value { s: 'NHWC' } }"
+      " attr { key: 'ksize'        value { list: {i: 1, i:1, i:1, i:1} } }"
+      " attr { key: 'padding'      value { s: 'VALID' } }"
+      " attr { key: 'strides'      value { list: {i: 1, i:1, i:1, i:1} } }"
+      " input: ['A'] }"
+      "node { name: 'C' op: 'Mul' attr { key: 'T' value { type: DT_FLOAT } }"
+      " input: ['A', 'B'] }", kGPUDevice);
+  EXPECT_EQ(DoMklLayoutOptimizationPass(),
+            "A(Input);B(MaxPool);C(Mul)|A->B;A->C;B->C:1");
+}
+
+TEST_F(MklLayoutPassTest, NodeRewrite_AvgPool_DeviceTest) {
+  InitGraph(
+      "node { name: 'A' op: 'Input'}"
+      "node { name: 'B' op: 'AvgPool'"
+      " attr { key: 'T'            value { type: DT_FLOAT } }"
+      " attr { key: 'data_format'  value { s: 'NHWC' } }"
+      " attr { key: 'ksize'        value { list: {i: 1, i:1, i:1, i:1} } }"
+      " attr { key: 'padding'      value { s: 'VALID' } }"
+      " attr { key: 'strides'      value { list: {i: 1, i:1, i:1, i:1} } }"
+      " input: ['A'] }"
+      "node { name: 'C' op: 'Mul' attr { key: 'T' value { type: DT_FLOAT } }"
+      " input: ['A', 'B'] }", kGPUDevice);
+  EXPECT_EQ(DoMklLayoutOptimizationPass(),
+            "A(Input);B(AvgPool);C(Mul)|A->B;A->C;B->C:1");
+}
+
+// Concat Op test: Concat with no Mkl layer feeding it
+TEST_F(MklLayoutPassTest, NodeRewrite_Concat_DeviceTest) {
+  InitGraph(
+      "node { name: 'A' op: 'Const' "
+      " attr { key: 'dtype' value { type: DT_INT32 } }"
+      " attr { key: 'value' value { "
+      "    tensor { dtype: DT_INT32 tensor_shape { dim { size: 1 } } "
+      "    int_val: 0 } } } }"
+      "node { name: 'B' op: 'InputList'"
+      " attr { key: 'N'                value { i: 2 } }}"
+      "node { name: 'C' op: 'Input'}"
+      "node { name: 'D' op: 'Concat'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'N'                value { i: 2 } }"
+      " input: ['A', 'B:0', 'B:1']}"
+      "node { name: 'E' op: 'Mul' attr { key: 'T' value { type: DT_FLOAT } }"
+      " input: ['C', 'D'] }", kGPUDevice);
+  EXPECT_EQ(DoMklLayoutOptimizationPass(),
+            "A(Const);B(InputList);C(Input);D(Concat);E(Mul)|A->D;"
+            "B->D:1;B:1->D:2;C->E;D->E:1");
+}
+
+TEST_F(MklLayoutPassTest, NodeRewrite_ConcatV2_DeviceTest) {
+  InitGraph(
+      "node { name: 'A' op: 'Const' "
+      " attr { key: 'dtype' value { type: DT_INT32 } }"
+      " attr { key: 'value' value { "
+      "    tensor { dtype: DT_INT32 tensor_shape { dim { size: 1 } } "
+      "    int_val: 0 } } } }"
+      "node { name: 'B' op: 'InputList'"
+      " attr { key: 'N'                value { i: 2 } }}"
+      "node { name: 'C' op: 'Input'}"
+      "node { name: 'D' op: 'ConcatV2'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'Tidx'             value { type: DT_INT32 } }"
+      " attr { key: 'N'                value { i: 2 } }"
+      " input: ['B:0', 'B:1', 'A']}"
+      "node { name: 'E' op: 'Mul' attr { key: 'T' value { type: DT_FLOAT } }"
+      " input: ['C', 'D'] }", kGPUDevice);
+  EXPECT_EQ(DoMklLayoutOptimizationPass(),
+            "A(Const);B(InputList);C(Input);D(ConcatV2);E(Mul)|"
+            "A->D:2;B->D;B:1->D:1;C->E;D->E:1");
+}
+
+TEST_F(MklLayoutPassTest, NodeRewrite_FusedBatchNorm_DeviceTest) {
+  InitGraph(
+      "node { name: 'A' op: 'Input'}"
+      "node { name: 'B' op: 'Input'}"
+      "node { name: 'C' op: 'Input'}"
+      "node { name: 'D' op: 'Input'}"
+      "node { name: 'E' op: 'Input'}"
+      "node { name: 'F' op: 'FusedBatchNorm'"
+      " attr { key: 'T'            value { type: DT_FLOAT } }"
+      " attr { key: 'data_format'  value { s: 'NCHW' } }"
+      " attr { key: 'epsilon'      value { f: 0.0001 } }"
+      " attr { key: 'is_training'  value { b: true } }"
+      " input: ['A', 'B', 'C', 'D', 'E'] }"
+      "node { name: 'G' op: 'Mul' attr { key: 'T' value { type: DT_FLOAT } }"
+      " input: ['A', 'F'] }", kGPUDevice);
+  EXPECT_EQ(DoMklLayoutOptimizationPass(),
+            "A(Input);B(Input);C(Input);D(Input);E(Input);"
+            "F(FusedBatchNorm);G(Mul)|A->F;A->G;B->F:1;C->F:2;D->F:3;"
+            "E->F:4;F->G:1");
+}
+
+TEST_F(MklLayoutPassTest, NodeMerge_Conv2DWithBias_DeviceTest) {
+  CHECK_EQ(kTensorOrdering, MklTfTensorOrdering::TENSORS_CONTIGUOUS);
+  InitGraph(
+      "node { name: 'A' op: 'Input'}"
+      "node { name: 'B' op: 'Input'}"
+      "node { name: 'M' op: '_MklInput'}"
+      "node { name: 'N' op: '_MklInput'}"
+      "node { name: 'C' op: '_MklConv2D'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'data_format'      value { s: 'NCHW' } }"
+      " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
+      " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
+      " attr { key: 'padding'          value { s: 'SAME' } }"
+      " input: ['A', 'B', 'M', 'N']}"
+      "node { name: 'D' op: 'Input'}"
+      "node { name: 'E' op: 'BiasAdd'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'data_format'      value { s: 'NCHW' } }"
+      " input: ['C', 'D'] }"
+      "node { name: 'Y' op: 'Input'}"
+      "node { name: 'Z' op: 'Sub'"
+      " attr {key: 'T'                 value { type: DT_FLOAT } }"
+      " input: ['E', 'Y']}", kGPUDevice);
+  EXPECT_EQ(DoMklLayoutOptimizationPass(),
+            "A(Input);B(Input);C(_MklConv2D);D(Input);E(BiasAdd);"
+            "M(_MklInput);N(_MklInput);Y(Input);Z(Sub)|A->C;"
+            "B->C:1;C->E;D->E:1;E->Z;M->C:2;N->C:3;Y->Z:1");
 }
 
 /////////////////////////////////////////////////////////////////////
diff --git a/tensorflow/core/graph/mkl_tfconversion_pass.cc b/tensorflow/core/graph/mkl_tfconversion_pass.cc
index 55c280719c34529e9af1cd2898e33fb3383c7499..590b3d030fa212ec4f510ef35fb7a425f2aa2f9e 100644
--- a/tensorflow/core/graph/mkl_tfconversion_pass.cc
+++ b/tensorflow/core/graph/mkl_tfconversion_pass.cc
@@ -98,12 +98,13 @@ class MklToTfConversionPass : public GraphOptimizationPass {
   Status InsertConversionNodeOnEdge(std::unique_ptr<Graph>* g, Edge*);
 };
 
-// We register MklToTf insertion for phase 1 in post-partition grouping.
-// We register this pass after partitioning so that we get a complete
-// picture of inputs and outputs of the nodes in the graphs.
+// We register MklToTf insertion for phase 2 in post-partition grouping
+// because we register MklLayoutRewritePass for phase 1 in post-partition
+// grouping. We register this pass after partitioning so that we get a
+// complete picture of inputs and outputs of the nodes in the graphs.
 const OptimizationPassRegistry::Grouping kMklTfConvPassGroup =
     OptimizationPassRegistry::POST_PARTITIONING;
-REGISTER_OPTIMIZATION(kMklTfConvPassGroup, 1, MklToTfConversionPass);
+REGISTER_OPTIMIZATION(kMklTfConvPassGroup, 2, MklToTfConversionPass);
 
 Status MklToTfConversionPass::InsertConversionNodeOnEdge(
     std::unique_ptr<Graph>* g, Edge* e) {
@@ -121,10 +122,12 @@ Status MklToTfConversionPass::InsertConversionNodeOnEdge(
   string data_format;
 
   TF_CHECK_OK(GetNodeAttr(src->def(), "T", &src_datatype));
-  TF_CHECK_OK(GetNodeAttr(dst->def(), "T", &dst_datatype));
-  if (src_datatype != dst_datatype) {
-    string err_msg = "T attribute of " + src->name() + " and " + dst->name() +
-                     " do not match. Will not insert" +
+  bool dst_dtype_found = GetNodeAttr(dst->def(), "T", &dst_datatype) ==
+                          Status::OK();
+  // We compare source and destination datatypes only when both are found.
+  if (dst_dtype_found && (src_datatype != dst_datatype)) {
+    string err_msg = "T attribute of " + src->name() + " and " +
+                      dst->name() + " do not match. Will not insert" +
                      " MklToTf node in such case.";
     return Status(error::Code::INVALID_ARGUMENT, err_msg.c_str());
   }
@@ -202,18 +205,19 @@ bool MklToTfConversionPass::RunPass(std::unique_ptr<Graph>* g) {
             << src->type_string() << " and " << dst->type_string();
 
     // Let's get source and destination data type.
-    DataType src_datatype = DT_INVALID;
-    if (GetNodeAttr(src->def(), "T", &src_datatype) != Status::OK()) {
-      continue;
-    }
     // We cannot check datatype on destination node because destination node
     // may not be Mkl node.
-    DataType dst_datatype = DT_INVALID;
-    GetNodeAttr(dst->def(), "T", &dst_datatype);
+    DataType src_datatype;
+    DataType dst_datatype;
+    bool src_is_mkl_op = (GetNodeAttr(src->def(), "T", &src_datatype) ==
+                            Status::OK() &&
+                          IsMklSupportedOp(src->type_string(), src_datatype));
+    bool dst_is_mkl_op = (GetNodeAttr(dst->def(), "T", &dst_datatype) ==
+                            Status::OK() &&
+                          IsMklSupportedOp(dst->type_string(), dst_datatype));
 
     // Check if src with is Mkl-compliant, while dst is not Mkl-compliant.
-    if (IsMklSupportedOp(src->type_string(), src_datatype) &&
-        !IsMklSupportedOp(dst->type_string(), dst_datatype)) {
+    if (src_is_mkl_op && !dst_is_mkl_op) {
       VLOG(1) << "MklToTfConversionPass: Scheduled nodes " << src->name()
               << " and " << dst->name() << " for inserting conversion nodes";
       candidate_edges.push_back(const_cast<Edge*>(e));
diff --git a/tensorflow/core/graph/mkl_tfconversion_pass_test.cc b/tensorflow/core/graph/mkl_tfconversion_pass_test.cc
index bd2cb0989c1cd0dc576d3a443c6e7361e71ed665..90bef111648452f823a669cab3c063377ed7bdef 100644
--- a/tensorflow/core/graph/mkl_tfconversion_pass_test.cc
+++ b/tensorflow/core/graph/mkl_tfconversion_pass_test.cc
@@ -149,7 +149,7 @@ TEST_F(MklToTfConversionPass, Positive) {
         " input: ['C', 'D']}");
     EXPECT_EQ(DoRunMklToTfConversionPass(),
               "A(Input);B(Input);C(_MklConv2D);D(Input);E(Sub);M(_MklInput);"
-              "_Mkl2Tf/_0(_MklToTf);N(_MklInput)|A->C;B->C:2;C->Mkl2Tf/_0;"
+              "Mkl2Tf/_0(_MklToTf);N(_MklInput)|A->C;B->C:2;C->Mkl2Tf/_0;"
               "C:1->Mkl2Tf/_0:1;D->E:1;M->C:1;Mkl2Tf/_0->E;N->C:3");
   } else {
     CHECK_EQ(kTensorOrdering, MklTfTensorOrdering::TENSORS_CONTIGUOUS);
@@ -172,7 +172,7 @@ TEST_F(MklToTfConversionPass, Positive) {
         " input: ['C', 'D']}");
     EXPECT_EQ(DoRunMklToTfConversionPass(),
               "A(Input);B(Input);C(_MklConv2D);D(Input);E(Sub);M(_MklInput);"
-              "_Mkl2Tf/_0(_MklToTf);N(_MklInput)|A->C;B->C:1;C->Mkl2Tf/_0;"
+              "Mkl2Tf/_0(_MklToTf);N(_MklInput)|A->C;B->C:1;C->Mkl2Tf/_0;"
               "C:1->Mkl2Tf/_0:1;D->E:1;M->C:2;Mkl2Tf/_0->E;N->C:3");
   }
 }
diff --git a/tensorflow/core/graph/optimizer_cse.cc b/tensorflow/core/graph/optimizer_cse.cc
index 59dff60ea3bafeb7f747b0e5424f448565e7bc85..a22a9b3fa31ff45fa7372e9270ac4ef8968b8f66 100644
--- a/tensorflow/core/graph/optimizer_cse.cc
+++ b/tensorflow/core/graph/optimizer_cse.cc
@@ -39,6 +39,7 @@ limitations under the License.
 #include "tensorflow/core/graph/optimizer_cse.h"
 
 #include <unordered_map>
+#include <utility>
 #include <vector>
 
 #include "tensorflow/core/graph/algorithm.h"
@@ -52,14 +53,12 @@ class OptimizerCSE {
  public:
   explicit OptimizerCSE(Graph* g) : g_(g) {}
 
-  bool Optimize(std::function<bool(const Node*)> consider_fn);
+  bool Optimize(const std::function<bool(const Node*)>& consider_fn);
 
  private:
-  struct Scratch;
-
   static size_t NodeHash(const Node* n);
-  static bool Equivalent(const Node* a, const Node* b, Scratch* s);
-  static bool EqualAttrs(const Node* a, const Node* b, Scratch* s);
+  static bool Equivalent(const Node* a, const Node* b,
+                         AttrSlice::Scratch* scratch);
 
   Graph* g_;
 };
@@ -109,7 +108,7 @@ size_t OptimizerCSE::NodeHash(const Node* n) {
   // Hash the attrs.  For example, this makes sure different constants
   // end up in different hash buckets.
   string tmp;
-  for (const auto& attr : n->def().attr()) {
+  for (const auto& attr : n->attrs()) {
     tmp = attr.first;
     attr.second.AppendToString(&tmp);
     // Add hashes of attrs, so the order of attrs doesn't matter.
@@ -121,28 +120,6 @@ size_t OptimizerCSE::NodeHash(const Node* n) {
   return h;
 }
 
-struct OptimizerCSE::Scratch {
-  // For EqualAttrs():
-  string a;
-  string b;
-};
-
-bool OptimizerCSE::EqualAttrs(const Node* a, const Node* b, Scratch* scratch) {
-  if (a->def().attr_size() != b->def().attr_size()) return false;
-
-  for (const auto& attr : b->def().attr()) {
-    auto iter = a->def().attr().find(attr.first);
-    if (iter == a->def().attr().end()) return false;
-    // Note: it should be safe to compare proto serializations of the attr
-    // values since at most one field should be set in each (indeed, it
-    // should be the same field).
-    iter->second.SerializeToString(&scratch->a);
-    attr.second.SerializeToString(&scratch->b);
-    if (scratch->a != scratch->b) return false;
-  }
-  return true;
-}
-
 static bool HasRefInput(const Node* n) {
   for (auto dt : n->input_types()) {
     if (IsRefType(dt)) return true;
@@ -150,7 +127,8 @@ static bool HasRefInput(const Node* n) {
   return false;
 }
 
-bool OptimizerCSE::Equivalent(const Node* a, const Node* b, Scratch* scratch) {
+bool OptimizerCSE::Equivalent(const Node* a, const Node* b,
+                              AttrSlice::Scratch* scratch) {
   // Different op names are different
   if (a->type_string() != b->type_string()) return false;
 
@@ -163,7 +141,7 @@ bool OptimizerCSE::Equivalent(const Node* a, const Node* b, Scratch* scratch) {
 
   // Compare attrs.  Note that equal attrs implies equal input and
   // output types.
-  if (!EqualAttrs(a, b, scratch)) return false;
+  if (!a->attrs().EqualAttrs(b->attrs(), scratch)) return false;
 
   // Compare input sources
   if (a->num_inputs() != b->num_inputs()) return false;
@@ -180,7 +158,8 @@ bool OptimizerCSE::Equivalent(const Node* a, const Node* b, Scratch* scratch) {
   return true;
 }
 
-bool OptimizerCSE::Optimize(std::function<bool(const Node*)> consider_fn) {
+bool OptimizerCSE::Optimize(
+    const std::function<bool(const Node*)>& consider_fn) {
   // This very simple implementation works if the whole graph is one
   // giant basic block (because we just traverse nodes in a
   // topological order). This simple implementation works well
@@ -204,7 +183,7 @@ bool OptimizerCSE::Optimize(std::function<bool(const Node*)> consider_fn) {
   // Scratch space for Equivalent calls.  Allocated here and passed in to
   // Equivalent to avoid allocation inside the loop below.
   bool changed = false;
-  Scratch scratch;
+  AttrSlice::Scratch scratch;
   for (Node* n : order) {
     if (!n->IsOp()) continue;
 
@@ -232,7 +211,8 @@ bool OptimizerCSE::Optimize(std::function<bool(const Node*)> consider_fn) {
   return changed;
 }
 
-bool OptimizeCSE(Graph* g, std::function<bool(const Node*)> consider_fn) {
+bool OptimizeCSE(Graph* g,
+                 const std::function<bool(const Node*)>& consider_fn) {
   OptimizerCSE opt(g);
   return opt.Optimize(consider_fn);
 }
diff --git a/tensorflow/core/graph/optimizer_cse.h b/tensorflow/core/graph/optimizer_cse.h
index 24ec5658d86dab147d60ba9095138b7585ec4cb6..b8f3230c70c314f15cc2179c98d727902ef1ab9d 100644
--- a/tensorflow/core/graph/optimizer_cse.h
+++ b/tensorflow/core/graph/optimizer_cse.h
@@ -29,7 +29,8 @@ namespace tensorflow {
 // during the common subexpression elimination.
 //
 // Returns true if and only if 'g' is mutated.
-extern bool OptimizeCSE(Graph* g, std::function<bool(const Node*)> consider_fn);
+extern bool OptimizeCSE(Graph* g,
+                        const std::function<bool(const Node*)>& consider_fn);
 
 }  // namespace tensorflow
 
diff --git a/tensorflow/core/graph/optimizer_cse_test.cc b/tensorflow/core/graph/optimizer_cse_test.cc
index 1091af4e451d5d3481dc2ad2422483ac3b3791f8..94250240eb746a49be8f8a37e73b793e37e1832c 100644
--- a/tensorflow/core/graph/optimizer_cse_test.cc
+++ b/tensorflow/core/graph/optimizer_cse_test.cc
@@ -15,6 +15,7 @@ limitations under the License.
 
 #include "tensorflow/core/graph/optimizer_cse.h"
 
+#include <utility>
 #include <vector>
 #include "tensorflow/core/framework/op.h"
 #include "tensorflow/core/framework/tensor.h"
diff --git a/tensorflow/core/graph/quantize_training.cc b/tensorflow/core/graph/quantize_training.cc
index b241ab6ab32bc909337669807848914558982906..4a479d3258a962ab1c27dd001317971f7973dc31 100644
--- a/tensorflow/core/graph/quantize_training.cc
+++ b/tensorflow/core/graph/quantize_training.cc
@@ -192,9 +192,9 @@ Status ConnectVariablesToSaveOp(Graph* graph, Node* save_op,
   Tensor tensor_names;
   Tensor shape_and_slices;
   TF_RETURN_IF_ERROR(
-      GetNodeAttr(AttrSlice(tensor_names_op->def()), "value", &tensor_names));
-  TF_RETURN_IF_ERROR(GetNodeAttr(AttrSlice(shape_and_slices_op->def()), "value",
-                                 &shape_and_slices));
+      GetNodeAttr(tensor_names_op->attrs(), "value", &tensor_names));
+  TF_RETURN_IF_ERROR(
+      GetNodeAttr(shape_and_slices_op->attrs(), "value", &shape_and_slices));
 
   int tn_size = tensor_names.NumElements();
   int var_size = added_variables.size();
@@ -526,31 +526,42 @@ Status MakeInputMinMax(Graph* graph, const string& name_prefix,
   return Status::OK();
 }
 
-// Adds a QuantizeAndDequantizeV2Op (and required input nodes) based on edge.
+// Adds a QuantizeAndDequantizeV2 or FakeQuantizeWithMinMaxVars op
+// (and required input nodes) based on edge.
 // The result is stored in convert_node.
-Status MakeQuantizeAndDequantizeV2(Graph* graph, const string& name_prefix,
-                                   const EdgeToConvert& edge,
-                                   std::vector<Node*>* added_variables,
-                                   Node** convert_node) {
+Status MakeQuantizeOp(Graph* graph, const string& name_prefix,
+                      const string& quant_op_type, const EdgeToConvert& edge,
+                      std::vector<Node*>* added_variables,
+                      Node** convert_node) {
   Node* input_min;
   Node* input_max;
   TF_RETURN_IF_ERROR(MakeInputMinMax(graph, name_prefix, edge, added_variables,
                                      &input_min, &input_max));
-
-  string quant_name = strings::StrCat(name_prefix, "/QuantizeAndDequantizeV2");
-  TF_RETURN_IF_ERROR(NodeBuilder(quant_name, "QuantizeAndDequantizeV2")
-                         .Input(edge.edge->src())
-                         .Input(input_min)
-                         .Input(input_max)
-                         .Attr("signed_input", edge.signed_input)
-                         .Attr("num_bits", edge.num_bits)
-                         .Attr("range_given", true)
-                         .Finalize(graph, convert_node));
+  string quant_name = strings::StrCat(name_prefix, "/", quant_op_type);
+  if (quant_op_type == "QuantizeAndDequantizeV2") {
+    TF_RETURN_IF_ERROR(NodeBuilder(quant_name, quant_op_type)
+                           .Input(edge.edge->src())
+                           .Input(input_min)
+                           .Input(input_max)
+                           .Attr("signed_input", edge.signed_input)
+                           .Attr("num_bits", edge.num_bits)
+                           .Attr("range_given", true)
+                           .Finalize(graph, convert_node));
+  } else if (quant_op_type == "FakeQuantWithMinMaxVars") {
+    TF_RETURN_IF_ERROR(NodeBuilder(quant_name, quant_op_type)
+                           .Input(edge.edge->src())
+                           .Input(input_min)
+                           .Input(input_max)
+                           .Attr("num_bits", edge.num_bits)
+                           .Finalize(graph, convert_node));
+  } else {
+    return errors::InvalidArgument("Unknown quant op type: ", quant_op_type);
+  }
   return Status::OK();
 }
 
 // Insert conversion op, connect it to the graph and remove the old edge.
-Status ProcessTargetEdges(Graph* graph,
+Status ProcessTargetEdges(Graph* graph, const string& quant_op_type,
                           const std::vector<EdgeToConvert>& target_edges) {
   // Remember previously converted ops to avoid duplicated conversion on the
   // same input.
@@ -562,8 +573,8 @@ Status ProcessTargetEdges(Graph* graph,
 
     auto iter = name_index.find(name_prefix);
     if (iter == name_index.end()) {
-      TF_RETURN_IF_ERROR(MakeQuantizeAndDequantizeV2(
-          graph, name_prefix, edge, &added_variables, &convert_node));
+      TF_RETURN_IF_ERROR(MakeQuantizeOp(graph, name_prefix, quant_op_type, edge,
+                                        &added_variables, &convert_node));
       name_index[name_prefix] = convert_node;
     } else {
       convert_node = iter->second;
@@ -580,7 +591,8 @@ Status ProcessTargetEdges(Graph* graph,
 
 }  // namespace
 
-Status DoQuantizeTraining(int32 num_bits, Graph* graph) {
+Status DoQuantizeTraining(int32 num_bits, const string& quant_op_type,
+                          Graph* graph) {
   if (graph == nullptr) {
     return errors::InvalidArgument("Cannot accept empty graph pointer.");
   }
@@ -638,13 +650,14 @@ Status DoQuantizeTraining(int32 num_bits, Graph* graph) {
     }
   }
 
-  TF_RETURN_IF_ERROR(ProcessTargetEdges(graph, target_edges));
+  TF_RETURN_IF_ERROR(ProcessTargetEdges(graph, quant_op_type, target_edges));
 
   return Status::OK();
 }
 
 Status DoQuantizeTrainingOnSerializedGraphDef(const string& input_graph,
                                               int32 num_bits,
+                                              const string& quant_op_type,
                                               string* result_graph) {
   // First create the graph from the GraphDef.
   Graph graph(OpRegistry::Global());
@@ -656,7 +669,7 @@ Status DoQuantizeTrainingOnSerializedGraphDef(const string& input_graph,
   TF_RETURN_IF_ERROR(ConvertGraphDefToGraph(opts, input_graphdef, &graph));
 
   // Call the rewriter on the graph.
-  TF_RETURN_IF_ERROR(DoQuantizeTraining(num_bits, &graph));
+  TF_RETURN_IF_ERROR(DoQuantizeTraining(num_bits, quant_op_type, &graph));
 
   // Convert the result graph back to a GraphDef.
   GraphDef output_graphdef;
diff --git a/tensorflow/core/graph/quantize_training.h b/tensorflow/core/graph/quantize_training.h
index 66db0c5bf4b6f1546f38f23ebf26d79d83bbb3da..2c1a7e6ae3618904ef37b5ec0ed38c61c6180455 100644
--- a/tensorflow/core/graph/quantize_training.h
+++ b/tensorflow/core/graph/quantize_training.h
@@ -24,6 +24,10 @@ namespace tensorflow {
 // the model can learn to deal with such loss and achieve better accuracy when
 // it is quantized later for inference.
 // Note that the num_bits should be in [1, 63] and 'g' must be not null.
+// quant_op_type specifies which quantization op should be used.
+// Current ops supported:
+// - QuantizeAndDequantizeV2.
+// - FakeQuantWithMinMaxVars.
 //
 // On success, returns OK.
 //
@@ -31,12 +35,14 @@ namespace tensorflow {
 //    - num_bits out of range.
 //    - g is null.
 //    - More than 1 unknown ops encountered.
-Status DoQuantizeTraining(int32 num_bits, Graph* g);
+Status DoQuantizeTraining(int32 num_bits, const string& quant_op_type,
+                          Graph* g);
 
 // Converts a input GraphDef and returns a rewritten GraphDef with the
 // quantized training.
 Status DoQuantizeTrainingOnSerializedGraphDef(const string& input_graph,
                                               int32 num_bits,
+                                              const string& quant_op_type,
                                               string* result_graph);
 }  // namespace tensorflow
 
diff --git a/tensorflow/core/graph/quantize_training_test.cc b/tensorflow/core/graph/quantize_training_test.cc
index 867dd8161b02ab15b8551bcc2be336fea744c5cb..d817d980de90aad7df91eecbf92de50c3dd1b243 100644
--- a/tensorflow/core/graph/quantize_training_test.cc
+++ b/tensorflow/core/graph/quantize_training_test.cc
@@ -103,7 +103,7 @@ TEST_F(QuantizeTrainingTest, SignedInput) {
       a       b
   */
   const int num_bits = 8;
-  TF_ASSERT_OK(DoQuantizeTraining(num_bits, g));
+  TF_ASSERT_OK(DoQuantizeTraining(num_bits, "QuantizeAndDequantizeV2", g));
 
   EXPECT_EQ(63, g->num_nodes());
 
@@ -112,17 +112,15 @@ TEST_F(QuantizeTrainingTest, SignedInput) {
   TF_ASSERT_OK(
       FindNode(g, strings::StrCat(identity->name(), "/QuantizeAndDequantizeV2"),
                &identity_q_node));
-  NodeDef identity_q = identity_q_node->def();
   ASSERT_EQ("true",
-            SummarizeAttrValue(identity_q.attr().find("signed_input")->second));
+            SummarizeAttrValue(*identity_q_node->attrs().Find("signed_input")));
   // Quantize_and_dequantize node for relu should have signed_input==false.
   Node* relu_q_node;
   TF_ASSERT_OK(
       FindNode(g, strings::StrCat(relu->name(), "/QuantizeAndDequantizeV2"),
                &relu_q_node));
-  NodeDef relu_q = relu_q_node->def();
   ASSERT_EQ("false",
-            SummarizeAttrValue(relu_q.attr().find("signed_input")->second));
+            SummarizeAttrValue(*relu_q_node->attrs().Find("signed_input")));
 }
 
 TEST_F(QuantizeTrainingTest, RangeGivenTrue) {
@@ -156,7 +154,7 @@ TEST_F(QuantizeTrainingTest, RangeGivenTrue) {
       a       b
   */
   const int num_bits = 8;
-  TF_ASSERT_OK(DoQuantizeTraining(num_bits, g));
+  TF_ASSERT_OK(DoQuantizeTraining(num_bits, "QuantizeAndDequantizeV2", g));
 
   EXPECT_EQ(38, g->num_nodes());
 
@@ -165,20 +163,18 @@ TEST_F(QuantizeTrainingTest, RangeGivenTrue) {
   TF_ASSERT_OK(
       FindNode(g, strings::StrCat(relu6->name(), "/QuantizeAndDequantizeV2"),
                &relu6_q_node));
-  NodeDef identity_q = relu6_q_node->def();
   ASSERT_EQ("true",
-            SummarizeAttrValue(identity_q.attr().find("range_given")->second));
+            SummarizeAttrValue(*relu6_q_node->attrs().Find("range_given")));
   // Quantize_and_dequantize node for relu should have range_given==true.
   Node* relu_q_node;
   TF_ASSERT_OK(
       FindNode(g, strings::StrCat(relu->name(), "/QuantizeAndDequantizeV2"),
                &relu_q_node));
-  NodeDef relu_q = relu_q_node->def();
   ASSERT_EQ("true",
-            SummarizeAttrValue(relu_q.attr().find("range_given")->second));
+            SummarizeAttrValue(*relu_q_node->attrs().Find("range_given")));
 }
 
-TEST_F(QuantizeTrainingTest, WithBackwardNodes) {
+TEST_F(QuantizeTrainingTest, WithBackwardNodes_QuantizeAndDequantize) {
   // Construct a graph with an additional backward Matmul.
   Reset();
   Graph* g = g_.get();
@@ -211,11 +207,11 @@ TEST_F(QuantizeTrainingTest, WithBackwardNodes) {
   g->AddControlEdge(backward_m, g->sink_node());
 
   int num_bits = 8;
-  TF_ASSERT_OK(DoQuantizeTraining(num_bits, g));
+  TF_ASSERT_OK(DoQuantizeTraining(num_bits, "QuantizeAndDequantizeV2", g));
 
   EXPECT_EQ(95, g->num_nodes());
 
-  // Ensure that we the backwards matmul input was not quantized.
+  // Ensure that the backwards matmul input was not quantized.
   Node* found_node;
   Status s = FindNode(g, strings::StrCat(d->name(), "/QuantizeAndDequantizeV2"),
                       &found_node);
@@ -232,6 +228,60 @@ TEST_F(QuantizeTrainingTest, WithBackwardNodes) {
       g, strings::StrCat(c->name(), "/QuantizeAndDequantizeV2"), &found_node));
 }
 
+TEST_F(QuantizeTrainingTest, WithBackwardNodes_FakeQuant) {
+  // Construct a graph with an additional backward Matmul.
+  Reset();
+  Graph* g = g_.get();
+  Node* a = Constant<float>({1.0, 2.0, 3.0, 4.0}, {2, 2});
+  Node* b = Constant<float>({1.0, 2.0, 3.0, 4.0}, {2, 2});
+  Node* c = Constant<float>({0.0, 1.0, 1.0, 0.0}, {2, 2});
+  // We will use node d as input to the backwards matmul to ensure that it
+  // isn't quantized.
+  Node* d = Constant<float>({0.0, 1.0, 1.0, 0.0}, {2, 2});
+  g->AddControlEdge(g->source_node(), a);
+  g->AddControlEdge(g->source_node(), b);
+  g->AddControlEdge(g->source_node(), c);
+  g->AddControlEdge(g->source_node(), d);
+  Node* relu = test::graph::Relu(g, a);
+  Node* identity = test::graph::Identity(g, b);
+  Node* m1 = test::graph::Matmul(g, relu, identity, false, false);
+  Node* m2 = test::graph::Matmul(g, identity, c, false, false);
+  g->AddControlEdge(m1, g->sink_node());
+  g->AddControlEdge(m2, g->sink_node());
+
+  // Add a Matmul node with name starting with "gradients". We will check that
+  // its input d was not quantized.
+  Node* backward_m;
+  TF_ASSERT_OK(NodeBuilder(g->NewName("gradients/n"), "MatMul")
+                   .Input(d)
+                   .Input(m2)
+                   .Attr("transpose_a", true)
+                   .Attr("transpose_b", false)
+                   .Finalize(g, &backward_m));
+  g->AddControlEdge(backward_m, g->sink_node());
+
+  int num_bits = 8;
+  TF_ASSERT_OK(DoQuantizeTraining(num_bits, "FakeQuantWithMinMaxVars", g));
+
+  EXPECT_EQ(95, g->num_nodes());
+
+  // Ensure that the backwards matmul input was not quantized.
+  Node* found_node;
+  Status s = FindNode(g, strings::StrCat(d->name(), "/FakeQuantWithMinMaxVars"),
+                      &found_node);
+  EXPECT_TRUE(StringPiece(s.ToString()).contains("not found")) << s;
+
+  // Ensure that m1 and m2's inputs were quantized.
+  TF_ASSERT_OK(
+      FindNode(g, strings::StrCat(relu->name(), "/FakeQuantWithMinMaxVars"),
+               &found_node));
+  TF_ASSERT_OK(
+      FindNode(g, strings::StrCat(identity->name(), "/FakeQuantWithMinMaxVars"),
+               &found_node));
+  TF_ASSERT_OK(FindNode(
+      g, strings::StrCat(c->name(), "/FakeQuantWithMinMaxVars"), &found_node));
+}
+
 TEST_F(QuantizeTrainingTest, QuantizeGraphDef) {
   // Construct a simple graph with 5 nodes.
   Reset();
@@ -254,8 +304,8 @@ TEST_F(QuantizeTrainingTest, QuantizeGraphDef) {
   input_graph.SerializeToString(&input_string);
 
   string result_string;
-  TF_ASSERT_OK(DoQuantizeTrainingOnSerializedGraphDef(input_string, num_bits,
-                                                      &result_string));
+  TF_ASSERT_OK(DoQuantizeTrainingOnSerializedGraphDef(
+      input_string, num_bits, "QuantizeAndDequantizeV2", &result_string));
 
   GraphDef result_graphdef;
   EXPECT_TRUE(ParseProtoUnlimited(&result_graphdef, result_string));
@@ -265,11 +315,105 @@ TEST_F(QuantizeTrainingTest, QuantizeGraphDef) {
   GraphConstructorOptions opts;
   Graph result_graph(OpRegistry::Global());
   TF_ASSERT_OK(ConvertGraphDefToGraph(opts, result_graphdef, &result_graph));
-  TF_ASSERT_OK(DoQuantizeTraining(num_bits, graph));
+  TF_ASSERT_OK(DoQuantizeTraining(num_bits, "QuantizeAndDequantizeV2", graph));
   EXPECT_EQ(graph->num_nodes(), result_graph.num_nodes());
 }
 
-TEST_F(QuantizeTrainingTest, FixedRangeAndEMARange) {
+TEST_F(QuantizeTrainingTest, FixedRangeAndEMARange_QuantizeAndDequantize) {
+  // Construct the following graph
+  // Relu has an unknown range, so we will check if the EMA correctly estimates
+  // the range.
+  /*
+           m1
+        /      \
+      Relu    Relu6
+        |       |
+        a       c
+  */
+  Reset();
+  Graph* g = g_.get();
+  Node* a;
+  TF_ASSERT_OK(Placeholder(g, "a", {2, 2}, &a));
+  Node* c = Constant<float>({2.0, 3.0, 4.0, 5.0}, {2, 2});
+  g->AddControlEdge(g->source_node(), a);
+  g->AddControlEdge(g->source_node(), c);
+  Node* relu = test::graph::Relu(g, a);
+  Node* relu6 = test::graph::Relu6(g, c);
+  Node* m1 = test::graph::Matmul(g, relu, relu6, false, false);
+  g->AddControlEdge(m1, g->sink_node());
+
+  // This is rewritten into the following subgraph, where Q_a and Q_c are
+  // quantize and dequantize subgraphs.
+  // Since relu's range is unknown, we check that the exponential moving average
+  // works correctly.
+  /*
+         m1
+      /      \
+     Q_a     Q_c
+      |       |
+    Relu     Relu6
+      |       |
+      a       c
+  */
+  const int num_bits = 8;
+  TF_ASSERT_OK(DoQuantizeTraining(num_bits, "QuantizeAndDequantizeV2", g));
+
+  SessionOptions options;
+  Session* sess;
+  TF_ASSERT_OK(NewSession(options, &sess));
+  GraphDef gdef;
+  g->ToGraphDef(&gdef);
+  TF_ASSERT_OK(sess->Create(gdef));
+
+  // The min and max values of the relu6 quantization should be constant values
+  // of 0 and 6.
+  string min_const_name = strings::StrCat(relu6->name(), "/InputMin");
+  string max_const_name = strings::StrCat(relu6->name(), "/InputMax");
+  std::vector<Tensor> outputs;
+  TF_ASSERT_OK(sess->Run({}, {min_const_name, max_const_name}, {}, &outputs));
+  EXPECT_EQ(outputs[0].flat<float>()(0), 0.0);
+  EXPECT_EQ(outputs[1].flat<float>()(0), 6.0);
+
+  Tensor a1(DT_FLOAT, TensorShape({2, 2}));
+  test::FillValues<float>(&a1, {0.0, 1.0, 2.0, 3.0});
+  Tensor a2(DT_FLOAT, TensorShape({2, 2}));
+  test::FillValues<float>(&a2, {1.0, 2.0, 3.0, 4.0});
+
+  TF_ASSERT_OK(sess->Run({{"a", a1}}, {m1->name()}, {}, &outputs));
+
+  // The value of the min and max should be set to the min and max of a1 since
+  // this is the first run that initializes the EMA variables.
+  string min_var_name = strings::StrCat(relu->name(), "/Min/Variable");
+  string max_var_name = strings::StrCat(relu->name(), "/Max/Variable");
+  TF_ASSERT_OK(sess->Run({}, {min_var_name, max_var_name}, {}, &outputs));
+  EXPECT_EQ(outputs[0].flat<float>()(0), 0.0);
+  EXPECT_EQ(outputs[1].flat<float>()(0), 3.0);
+
+  // The relu6 quantization range should remain unchanged.
+  TF_ASSERT_OK(sess->Run({}, {min_const_name, max_const_name}, {}, &outputs));
+  EXPECT_EQ(outputs[0].flat<float>()(0), 0.0);
+  EXPECT_EQ(outputs[1].flat<float>()(0), 6.0);
+
+  // Now when we run with new inputs, we should get a moving average for the min
+  // and max variables. They should be equal to:
+  // min_var = old_min_var * decay + min(a2) * (1 - decay)
+  // max_var = old_max_var * decay + max(a2) * (1 - decay)
+  TF_ASSERT_OK(sess->Run({{"a", a2}}, {m1->name()}, {}, &outputs));
+
+  TF_ASSERT_OK(sess->Run({}, {min_var_name, max_var_name}, {}, &outputs));
+  const float decay = 0.999;
+  const float expected_min = 0.0 * decay + 1.0 * (1.0 - decay);
+  const float expected_max = 3.0 * decay + 4.0 * (1.0 - decay);
+  EXPECT_NEAR(outputs[0].flat<float>()(0), expected_min, 1e-4);
+  EXPECT_NEAR(outputs[1].flat<float>()(0), expected_max, 1e-4);
+
+  // The relu6 quantization range should remain unchanged.
+  TF_ASSERT_OK(sess->Run({}, {min_const_name, max_const_name}, {}, &outputs));
+  EXPECT_EQ(outputs[0].flat<float>()(0), 0.0);
+  EXPECT_EQ(outputs[1].flat<float>()(0), 6.0);
+}
+
+TEST_F(QuantizeTrainingTest, FixedRangeAndEMARange_FakeQuant) {
   // Construct the following graph
   // Relu has an unknown range, so we will check if the EMA correctly estimates
   // the range.
@@ -306,7 +450,7 @@ TEST_F(QuantizeTrainingTest, FixedRangeAndEMARange) {
       a       c
   */
   const int num_bits = 8;
-  TF_ASSERT_OK(DoQuantizeTraining(num_bits, g));
+  TF_ASSERT_OK(DoQuantizeTraining(num_bits, "FakeQuantWithMinMaxVars", g));
 
   SessionOptions options;
   Session* sess;
diff --git a/tensorflow/core/graph/subgraph.cc b/tensorflow/core/graph/subgraph.cc
index 9849d9a15969bc7edd12a19d3d2b74abb8b0874e..e10b6928898820d8e97b72b7f969419c1b54efc1 100644
--- a/tensorflow/core/graph/subgraph.cc
+++ b/tensorflow/core/graph/subgraph.cc
@@ -106,7 +106,7 @@ static Status FeedInputs(Graph* g, const DeviceAttributes& device_info,
     // Copy the _output_shapes from the original node to the feed node,
     // if any.
     std::vector<PartialTensorShape> output_shapes;
-    if (GetNodeAttr(n->def(), "_output_shapes", &output_shapes).ok()) {
+    if (GetNodeAttr(n->attrs(), "_output_shapes", &output_shapes).ok()) {
       if (n->num_outputs() != output_shapes.size()) {
         return errors::InvalidArgument(
             "FeedInputs: ", t,
@@ -129,8 +129,8 @@ static Status FeedInputs(Graph* g, const DeviceAttributes& device_info,
       if (e->src_output() == id.second) {
         to_remove.emplace_back(e);
       } else if (e->src_output() == Graph::kControlSlot &&
-                 (n->def().op() == "Placeholder" ||
-                  n->def().op() == "PlaceholderV2")) {
+                 (n->type_string() == "Placeholder" ||
+                  n->type_string() == "PlaceholderV2")) {
         // When feeding a Placeholder node, any outgoing control edges
         // will be replaced with a control edge from the replacement
         // recv_node.
diff --git a/tensorflow/core/graph/subgraph_test.cc b/tensorflow/core/graph/subgraph_test.cc
index 3dc11b7a1662b88b9648626557a43c69f7b11ac8..93dcfd5e33867c1fc1ef98b3f939089b50f248b2 100644
--- a/tensorflow/core/graph/subgraph_test.cc
+++ b/tensorflow/core/graph/subgraph_test.cc
@@ -81,7 +81,7 @@ class SubgraphTest : public ::testing::Test {
     for (const string& s : expected_nodes) {
       Node* n = FindNode(s);
       EXPECT_TRUE(n != nullptr) << s;
-      if (n->def().op() == "_Send" || n->def().op() == "_Recv") {
+      if (n->type_string() == "_Send" || n->type_string() == "_Recv") {
         EXPECT_EQ(device_info_.name(), n->assigned_device_name()) << s;
       }
     }
@@ -367,7 +367,7 @@ TEST_F(SubgraphTest, FedOutputsPreservesOutputShapes) {
   for (Node* node : graph()->nodes()) {
     if (node->name() == "_recv_input_1") {
       std::vector<PartialTensorShape> shapes;
-      TF_ASSERT_OK(GetNodeAttr(node->def(), "_output_shapes", &shapes));
+      TF_ASSERT_OK(GetNodeAttr(node->attrs(), "_output_shapes", &shapes));
       ASSERT_EQ(1, shapes.size());
       EXPECT_TRUE(PartialTensorShape({23}).IsIdenticalTo(shapes[0]));
       break;
diff --git a/tensorflow/core/graph/testlib.cc b/tensorflow/core/graph/testlib.cc
index f0ab5520f11fbc1e7c7d948ef4f72703716a7519..c495b2181207a0520bab8f33cdc28cd723b1ef40 100644
--- a/tensorflow/core/graph/testlib.cc
+++ b/tensorflow/core/graph/testlib.cc
@@ -416,24 +416,6 @@ Node* Cast(Graph* g, Node* in, DataType dst) {
   return ret;
 }
 
-Node* BroadcastArgs(Graph* g, Node* s0, Node* s1) {
-  Node* ret;
-  TF_CHECK_OK(NodeBuilder(g->NewName("n"), "BroadcastArgs")
-                  .Input(s0)
-                  .Input(s1)
-                  .Finalize(g, &ret));
-  return ret;
-}
-
-Node* BroadcastGradientArgs(Graph* g, Node* s0, Node* s1) {
-  Node* ret;
-  TF_CHECK_OK(NodeBuilder(g->NewName("n"), "BroadcastGradientArgs")
-                  .Input(s0)
-                  .Input(s1)
-                  .Finalize(g, &ret));
-  return ret;
-}
-
 Node* Gather(Graph* g, Node* in0, Node* in1) {
   Node* ret;
   TF_CHECK_OK(NodeBuilder(g->NewName("n"), "Gather")
diff --git a/tensorflow/core/graph/testlib.h b/tensorflow/core/graph/testlib.h
index d508f65ada5bef3392d6e002833ca734f7fa1160..48250fef0fa44ee9fe25d7751c067d3c1257d4b7 100644
--- a/tensorflow/core/graph/testlib.h
+++ b/tensorflow/core/graph/testlib.h
@@ -174,12 +174,6 @@ Node* Cast(Graph* g, Node* in, DataType dst);
 // Perform gather op on params "in0" with indices "in1".
 Node* Gather(Graph* g, Node* in0, Node* in1);
 
-// Computes broadcasted shape from the given input shapes.
-Node* BroadcastArgs(Graph* g, Node* s0, Node* s1);
-
-// Computes the args needed broadcast gradient function.
-Node* BroadcastGradientArgs(Graph* g, Node* s0, Node* s1);
-
 // Gets a tensor stored in the session state.
 Node* GetSessionTensor(Graph* g, Node* in);
 
diff --git a/tensorflow/core/grappler/BUILD b/tensorflow/core/grappler/BUILD
index 8836f2ffa7d797d60755a051757ca74227c1d7f4..57b7550ef05ff727f164d1e99c5b2fd6f7fd7d31 100644
--- a/tensorflow/core/grappler/BUILD
+++ b/tensorflow/core/grappler/BUILD
@@ -14,30 +14,6 @@ filegroup(
     visibility = ["//tensorflow:__subpackages__"],
 )
 
-filegroup(
-    name = "mobile_srcs",
-    srcs = [
-        "devices.cc",
-        "devices.h",
-        "grappler_item.cc",
-        "grappler_item.h",
-        "op_types.cc",
-        "op_types.h",
-        "utils.cc",
-        "utils.h",
-        "//tensorflow/core/grappler/clusters:android_srcs",
-        "//tensorflow/core/grappler/inputs:android_srcs",
-        "//tensorflow/core/grappler/optimizers:android_srcs",
-    ],
-    visibility = ["//tensorflow:__subpackages__"],
-)
-
-alias(
-    name = "android_srcs",
-    actual = ":mobile_srcs",
-    visibility = ["//tensorflow:__subpackages__"],
-)
-
 cc_library(
     name = "op_types",
     srcs = ["op_types.cc"],
diff --git a/tensorflow/core/grappler/clusters/BUILD b/tensorflow/core/grappler/clusters/BUILD
index 34ad4048562420c6a6ea12b56bbbca4d24880f5f..556eb1dbc5ceaaab390d91c805d5c7a7f1203b6a 100644
--- a/tensorflow/core/grappler/clusters/BUILD
+++ b/tensorflow/core/grappler/clusters/BUILD
@@ -12,20 +12,30 @@ filegroup(
     visibility = ["//tensorflow:__subpackages__"],
 )
 
-filegroup(
-    name = "mobile_srcs",
-    srcs = glob(
-        [
-            "cluster.*",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
+config_setting(
+    name = "xsmm",
+    licenses = ["notice"],
+    values = {
+        "define": "tensorflow_xsmm=1",
+    },
 )
 
-alias(
-    name = "android_srcs",
-    actual = ":mobile_srcs",
-    visibility = ["//tensorflow:__subpackages__"],
+cc_library(
+    name = "utils",
+    srcs = ["utils.cc"],
+    hdrs = [
+        "utils.h",
+    ],
+    visibility = ["//visibility:public"],
+    deps = [
+        "//third_party/eigen3",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:protos_all_cc",
+    ] + select({
+        ":xsmm": ["@libxsmm_archive//:xsmm_avx"],
+        "//conditions:default": [],
+    }),
 )
 
 cc_library(
@@ -44,6 +54,20 @@ cc_library(
     ],
 )
 
+cc_library(
+    name = "virtual_cluster",
+    srcs = ["virtual_cluster.cc"],
+    hdrs = [
+        "virtual_cluster.h",
+    ],
+    visibility = ["//visibility:public"],
+    deps = [
+        ":cluster",
+        ":utils",
+        "//tensorflow/core:protos_all_cc",
+    ],
+)
+
 cc_library(
     name = "single_machine",
     srcs = ["single_machine.cc"],
@@ -53,11 +77,13 @@ cc_library(
     visibility = ["//visibility:public"],
     deps = [
         ":cluster",
+        ":utils",
         "//tensorflow/cc:coordinator",
         "//tensorflow/cc:queue_runner",
         "//tensorflow/core:core_cpu",
         "//tensorflow/core:direct_session",
         "//tensorflow/core:lib",
+        "//tensorflow/core:protos_all_cc",
         "//tensorflow/core/grappler:utils",
         "//tensorflow/core/kernels:ops_util",
     ],
diff --git a/tensorflow/core/grappler/clusters/cluster.cc b/tensorflow/core/grappler/clusters/cluster.cc
index c93911c902e19b937e8c5080c2edfc9331d13b31..b2a326b3b0d03bd836bc23da2107301fedc1f5a1 100644
--- a/tensorflow/core/grappler/clusters/cluster.cc
+++ b/tensorflow/core/grappler/clusters/cluster.cc
@@ -56,5 +56,15 @@ void Cluster::DisableDetailedStats(bool disable) {
   }
 }
 
+const std::vector<string> Cluster::GetDeviceNames() const {
+  std::vector<string> device_names;
+  device_names.reserve(devices_.size());
+  for (const auto& device : devices_) {
+    device_names.push_back(device.first);
+  }
+  std::sort(device_names.begin(), device_names.end());
+  return device_names;
+}
+
 }  // end namespace grappler
 }  // end namespace tensorflow
diff --git a/tensorflow/core/grappler/clusters/cluster.h b/tensorflow/core/grappler/clusters/cluster.h
index 45821db1ee8378d84a5a8aa40abf2195fa78e1b0..403ded9a6e47b7950fceeecc62e6b8b25dcc96f1 100644
--- a/tensorflow/core/grappler/clusters/cluster.h
+++ b/tensorflow/core/grappler/clusters/cluster.h
@@ -17,13 +17,14 @@ limitations under the License.
 #define TENSORFLOW_GRAPPLER_CLUSTERS_CLUSTER_H_
 
 #include <string>
+#include <unordered_map>
 #include <utility>
 #include <vector>
 
-#include "tensorflow/core/framework/device_attributes.pb.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/grappler/grappler_item.h"
 #include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/protobuf/device_properties.pb.h"
 #include "tensorflow/core/public/session_options.h"
 
 namespace tensorflow {
@@ -62,18 +63,14 @@ class Cluster {
 
   // Return the list of TensorFlow devices that are available to execute a
   // graph. This is empty until provision() is called.
-  const std::vector<DeviceAttributes>& GetDevices() const { return devices_; }
-
-  // Convenience method that returns the set of device names.
-  const std::vector<string> GetDeviceNames() const {
-    std::vector<string> device_names;
-    device_names.reserve(devices_.size());
-    for (const auto& device : devices_) {
-      device_names.push_back(device.name());
-    }
-    return device_names;
+  const std::unordered_map<string, DeviceProperties>& GetDevices() const {
+    return devices_;
   }
 
+  // Convenience method that returns the set of device names. These names are
+  // sorted alphabetically.
+  const std::vector<string> GetDeviceNames() const;
+
   // Prepare the session to run the specified grappler item. This include
   // initializing all the model variables.
   virtual Status Initialize(const GrapplerItem& item) = 0;
@@ -85,7 +82,7 @@ class Cluster {
                      RunMetadata* metadata) = 0;
 
  protected:
-  std::vector<DeviceAttributes> devices_;
+  std::unordered_map<string, DeviceProperties> devices_;
   const int timeout_s_;
   SessionOptions options_;
   RunOptions run_options_;
diff --git a/tensorflow/core/grappler/clusters/single_machine.cc b/tensorflow/core/grappler/clusters/single_machine.cc
index 09c8d55efda2085af6b152548902b8b4ade97295..540819bc5929552fa205128584a74071aea8f572 100644
--- a/tensorflow/core/grappler/clusters/single_machine.cc
+++ b/tensorflow/core/grappler/clusters/single_machine.cc
@@ -18,6 +18,8 @@ limitations under the License.
 #include <memory>
 
 #include "tensorflow/cc/training/queue_runner.h"
+#include "tensorflow/core/framework/step_stats.pb.h"
+#include "tensorflow/core/grappler/clusters/utils.h"
 #include "tensorflow/core/grappler/utils.h"
 #include "tensorflow/core/kernels/ops_util.h"
 #include "tensorflow/core/lib/core/errors.h"
@@ -57,6 +59,8 @@ SingleMachine::~SingleMachine() {
   // Reset the thread-pool so that there are no outstanding Session::Run(...)s
   // when we delete the session.
   thread_pool_.reset();
+
+  Reset(options_, {}).IgnoreError();
 }
 
 Status SingleMachine::Provision() {
@@ -65,16 +69,12 @@ Status SingleMachine::Provision() {
     return status;
   }
 
-  DeviceAttributes attr;
-  attr.set_name("/job:localhost/replica:0/task:0/cpu:0");
-  attr.set_device_type("CPU");
-  devices_.push_back(attr);
+  DeviceProperties attr = GetLocalCPUInfo();
+  devices_["/job:localhost/replica:0/task:0/cpu:0"] = GetLocalCPUInfo();
 
   for (int i = 0; i < num_gpus_; ++i) {
-    DeviceAttributes attr;
-    attr.set_name(strings::StrCat("/job:localhost/replica:0/task:0/gpu:", i));
-    attr.set_device_type("GPU");
-    devices_.push_back(attr);
+    devices_[strings::StrCat("/job:localhost/replica:0/task:0/gpu:", i)] =
+        GetLocalGPUInfo(i);
   }
   return Status::OK();
 }
@@ -111,6 +111,8 @@ Status SingleMachine::Run(const GraphDef& graph_def,
         for (auto node : *init_metadata_.mutable_cost_graph()->mutable_node()) {
           node.clear_compute_cost();
         }
+        // Also clear the timeline to save memory
+        init_metadata_.clear_step_stats();
       }
       for (int i = 0; i < queue_runner_defs_.size(); ++i) {
         std::unique_ptr<QueueRunner> queue_runner;
@@ -133,15 +135,17 @@ Status SingleMachine::Run(const GraphDef& graph_def,
     }
   }
 
-  TF_RETURN_IF_ERROR(RunWithTimeout(feed, fetch, metadata));
-
   if (metadata) {
-    // Add the costs of initialization and the queue runners.
-    metadata->MergeFrom(init_metadata_);
-    return coordinator_->ExportCostGraph(metadata->mutable_cost_graph());
+    TF_RETURN_IF_ERROR(RunWithTimeout(feed, fetch, metadata));
+    // Merge the costs of the initialization and the queue runners.
+    CostGraphDef queue_costs;
+    TF_RETURN_IF_ERROR(coordinator_->ExportCostGraph(&queue_costs));
+    MergeCosts(metadata->mutable_cost_graph(), init_metadata_.cost_graph(),
+               queue_costs);
   } else {
-    return Status::OK();
+    return RunWithTimeout(feed, fetch, nullptr);
   }
+  return Status::OK();
 }
 
 Status SingleMachine::RunWithTimeout(
@@ -249,5 +253,36 @@ Status SingleMachine::ResetSession() {
   return Status::OK();
 }
 
+void SingleMachine::MergeCosts(CostGraphDef* graph_costs,
+                               const CostGraphDef& init_costs,
+                               const CostGraphDef& queue_costs) {
+  graph_costs->mutable_node()->Reserve(graph_costs->node_size() +
+                                       init_costs.node_size() +
+                                       queue_costs.node_size());
+  std::unordered_set<string> nodes_seen;
+  for (const auto& node : graph_costs->node()) {
+    nodes_seen.insert(node.name());
+  }
+
+  // The costs obtained by running the main graph could be more stable than
+  // the one we get from the queue runners since the queue runners run
+  // asynchronously.
+  for (const auto& node : queue_costs.node()) {
+    if (nodes_seen.find(node.name()) != nodes_seen.end()) {
+      continue;
+    }
+    graph_costs->add_node()->MergeFrom(node);
+  }
+
+  // Don't overwrite the costs with that generated during initialization since
+  // these are possibly outdated.
+  for (const auto& node : init_costs.node()) {
+    if (nodes_seen.find(node.name()) != nodes_seen.end()) {
+      continue;
+    }
+    graph_costs->add_node()->MergeFrom(node);
+  }
+}
+
 }  // namespace grappler
 }  // namespace tensorflow
diff --git a/tensorflow/core/grappler/clusters/single_machine.h b/tensorflow/core/grappler/clusters/single_machine.h
index f69b11df5dae1e2deefc5d921bf0cd4bba2d43bf..f2773376e41e6da819e8bc1135474517ddd72a6c 100644
--- a/tensorflow/core/grappler/clusters/single_machine.h
+++ b/tensorflow/core/grappler/clusters/single_machine.h
@@ -47,6 +47,8 @@ class SingleMachine : public Cluster {
                         RunMetadata* run_metadata, int64 timeout_s);
   Status ResetSession();
   Status CloseSession(bool use_timeout);
+  void MergeCosts(CostGraphDef* graph_costs, const CostGraphDef& init_costs,
+                  const CostGraphDef& queue_costs);
 
   const int num_gpus_;
   std::unique_ptr<Session> session_;
diff --git a/tensorflow/core/grappler/clusters/single_machine_test.cc b/tensorflow/core/grappler/clusters/single_machine_test.cc
index 0572aa04be7211e43faaf5475a7f81cda18bdf48..17db48817e5556ea899808a651496f4ad8a0f04d 100644
--- a/tensorflow/core/grappler/clusters/single_machine_test.cc
+++ b/tensorflow/core/grappler/clusters/single_machine_test.cc
@@ -159,6 +159,121 @@ TEST_F(SingleMachineTest, InitializationMemory) {
   EXPECT_TRUE(found);
 }
 
+namespace {
+template <class T>
+inline void SetNodeAttr(const string& key, const T& value, NodeDef* node) {
+  AttrValue attr_value;
+  SetAttrValue(value, &attr_value);
+  auto* attr_map = node->mutable_attr();
+  (*attr_map)[key] = attr_value;
+}
+template <>
+inline void SetNodeAttr(const string& key, const Tensor& tensor,
+                        NodeDef* node) {
+  TensorProto tensor_proto;
+  tensor.AsProtoTensorContent(&tensor_proto);
+  SetNodeAttr(key, tensor_proto, node);
+}
+
+}  // namespace
+
+TEST_F(SingleMachineTest, PersistentMemory) {
+  // Build a hashtable and its initialization graph.
+  GrapplerItem item;
+  const DataType key_dtype = DT_INT64;
+  const DataType data_dtype = DT_INT64;
+
+  NodeDef* hashtable_node = item.graph.add_node();
+  hashtable_node->set_op("HashTable");
+  hashtable_node->set_name("hash_table");
+  SetNodeAttr("key_dtype", key_dtype, hashtable_node);
+  SetNodeAttr("value_dtype", data_dtype, hashtable_node);
+
+  // Initial hashtable keys and values
+  NodeDef* keys_node = item.graph.add_node();
+  keys_node->set_op("Const");
+  keys_node->set_name("table_keys");
+  SetNodeAttr("dtype", key_dtype, keys_node);
+  Tensor keys(key_dtype, TensorShape{2});
+  keys.vec<int64>()(0) = 123;
+  keys.vec<int64>()(1) = 321;
+  SetNodeAttr("value", keys, keys_node);
+
+  NodeDef* values_node = item.graph.add_node();
+  values_node->set_op("Const");
+  values_node->set_name("table_values");
+  SetNodeAttr("dtype", data_dtype, values_node);
+  Tensor values(data_dtype, TensorShape{2});
+  values.vec<int64>()(0) = 789;
+  values.vec<int64>()(1) = 987;
+  SetNodeAttr("value", values, values_node);
+
+  // InitializeTable node
+  NodeDef* init_table_node = item.graph.add_node();
+  init_table_node->set_op("InitializeTable");
+  init_table_node->set_name("initialize_table");
+  SetNodeAttr("Tkey", key_dtype, init_table_node);
+  SetNodeAttr("Tval", data_dtype, init_table_node);
+  *init_table_node->add_input() = "hash_table";
+  *init_table_node->add_input() = "table_keys";
+  *init_table_node->add_input() = "table_values";
+  item.init_ops.push_back(init_table_node->name());
+
+  // Key to lookup
+  NodeDef* query_node = item.graph.add_node();
+  query_node->set_op("Const");
+  query_node->set_name("query");
+  SetNodeAttr("dtype", key_dtype, query_node);
+  Tensor query(key_dtype, TensorShape({}));
+  query.flat<int64>()(0) = 0;
+  SetNodeAttr("value", query, query_node);
+
+  // Default return value of hashtable lookup
+  NodeDef* default_value_node = item.graph.add_node();
+  default_value_node->set_op("Const");
+  default_value_node->set_name("default_table_value");
+  SetNodeAttr("dtype", data_dtype, default_value_node);
+  Tensor dflt(data_dtype, TensorShape({}));
+  dflt.flat<int64>()(0) = 456;
+  SetNodeAttr("value", dflt, default_value_node);
+
+  // HashTable lookup node
+  NodeDef* lookup_node = item.graph.add_node();
+  lookup_node->set_op("LookupTableFind");
+  lookup_node->set_name("table_lookup");
+  SetNodeAttr("Tin", key_dtype, lookup_node);
+  SetNodeAttr("Tout", data_dtype, lookup_node);
+  *lookup_node->add_input() = "hash_table";
+  *lookup_node->add_input() = "query";
+  *lookup_node->add_input() = "default_table_value";
+  item.fetch.push_back(lookup_node->name());
+
+  // Run the graph
+  TF_CHECK_OK(cluster_->Initialize(item));
+  RunMetadata metadata;
+  TF_CHECK_OK(cluster_->Run(item.graph, item.feed, item.fetch, &metadata));
+
+  // Check the cost model.
+  bool found_table_init = false;
+  bool found_hashtable = false;
+  for (const auto& node : metadata.cost_graph().node()) {
+    if (node.name() == "hash_table") {
+      found_hashtable = true;
+      // Persistent memory usage should be 0 since it's recorded as part of the
+      // initialize_table op.
+      EXPECT_EQ(0, node.host_persistent_memory_size());
+      EXPECT_EQ(0, node.device_persistent_memory_size());
+    } else if (node.name() == "initialize_table") {
+      found_table_init = true;
+      // Persistent memory should hold 2 keys and 2 values.
+      EXPECT_LE(4 * sizeof(int64), node.host_persistent_memory_size());
+      EXPECT_EQ(0, node.device_persistent_memory_size());
+    }
+  }
+  EXPECT_TRUE(found_table_init);
+  EXPECT_TRUE(found_hashtable);
+}
+
 }  // namespace
 }  // namespace grappler
 }  // namespace tensorflow
diff --git a/tensorflow/core/grappler/clusters/utils.cc b/tensorflow/core/grappler/clusters/utils.cc
new file mode 100644
index 0000000000000000000000000000000000000000..592e4b789d0dcb7369e2f0c6db447eb9daa92870
--- /dev/null
+++ b/tensorflow/core/grappler/clusters/utils.cc
@@ -0,0 +1,118 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/grappler/clusters/utils.h"
+
+#include "third_party/eigen3/Eigen/Core"
+
+#if GOOGLE_CUDA
+#include "cuda/include/cuda.h"
+#include "cuda/include/cuda_runtime_api.h"
+#include "cuda/include/cudnn.h"
+#endif
+
+#ifdef EIGEN_USE_LIBXSMM
+#include "include/libxsmm.h"
+#endif
+
+#include "tensorflow/core/lib/strings/numbers.h"
+#include "tensorflow/core/lib/strings/strcat.h"
+#include "tensorflow/core/platform/cpu_info.h"
+
+namespace tensorflow {
+namespace grappler {
+
+DeviceProperties GetLocalCPUInfo() {
+  DeviceProperties device;
+  device.set_type("CPU");
+
+  device.set_vendor(port::CPUVendorIDString());
+  // Combine cpu family and model into the model string.
+  device.set_model(
+      strings::StrCat((port::CPUFamily() << 4) + port::CPUModelNum()));
+  device.set_frequency(port::NominalCPUFrequency() * 1e-6);
+  device.set_num_cores(port::NumSchedulableCPUs());
+  device.set_l1_cache_size(Eigen::l1CacheSize());
+  device.set_l2_cache_size(Eigen::l2CacheSize());
+  device.set_l3_cache_size(Eigen::l3CacheSize());
+
+  (*device.mutable_environment())["cpu_instruction_set"] =
+      Eigen::SimdInstructionSetsInUse();
+
+  (*device.mutable_environment())["eigen"] = strings::StrCat(
+      EIGEN_WORLD_VERSION, ".", EIGEN_MAJOR_VERSION, ".", EIGEN_MINOR_VERSION);
+#ifdef EIGEN_USE_LIBXSMM
+  (*device.mutable_environment())["libxsmm"] = LIBXSMM_VERSION;
+#endif
+
+  return device;
+}
+
+DeviceProperties GetLocalGPUInfo(int gpu_id) {
+  DeviceProperties device;
+  device.set_type("GPU");
+
+#if GOOGLE_CUDA
+  cudaDeviceProp properties;
+  cudaError_t error = cudaGetDeviceProperties(&properties, gpu_id);
+  if (error == cudaSuccess) {
+    device.set_vendor("NVidia");
+    device.set_model(properties.name);
+    device.set_frequency(properties.clockRate * 1e-3);
+    device.set_num_cores(properties.multiProcessorCount);
+    device.set_num_registers(properties.regsPerMultiprocessor);
+    // For compute capability less than 5, l1 cache size is configurable to
+    // either 16 KB or 48 KB. We use the initial configuration 16 KB here. For
+    // compute capability larger or equal to 5, l1 cache (unified with texture
+    // cache) size is 24 KB. This number may need to be updated for future
+    // compute capabilities.
+    device.set_l1_cache_size((properties.major < 5) ? 16 * 1024 : 24 * 1024);
+    device.set_l2_cache_size(properties.l2CacheSize);
+    device.set_l3_cache_size(0);
+    device.set_shared_memory_size_per_multiprocessor(
+        properties.sharedMemPerMultiprocessor);
+    device.set_memory_size(properties.totalGlobalMem);
+    // 8 is the number of bits per byte. 2 is accounted for
+    // double data rate (DDR).
+    device.set_bandwidth(properties.memoryBusWidth / 8 *
+                         properties.memoryClockRate * 2);
+  }
+
+  (*device.mutable_environment())["architecture"] =
+      strings::StrCat(properties.major, ".", properties.minor);
+  (*device.mutable_environment())["cuda"] = strings::StrCat(CUDA_VERSION);
+  (*device.mutable_environment())["cudnn"] = strings::StrCat(CUDNN_VERSION);
+#endif
+
+  return device;
+}
+
+DeviceProperties GetDeviceInfo(const DeviceNameUtils::ParsedName& device) {
+  if (device.type == "CPU") {
+    return GetLocalCPUInfo();
+  } else if (device.type == "GPU") {
+    if (device.has_id) {
+      return GetLocalGPUInfo(device.id);
+    } else {
+      return GetLocalGPUInfo(0);
+    }
+  }
+  DeviceProperties result;
+  result.set_type("UNKNOWN");
+  return result;
+}
+
+}  // end namespace grappler
+}  // end namespace tensorflow
diff --git a/tensorflow/core/grappler/clusters/utils.h b/tensorflow/core/grappler/clusters/utils.h
new file mode 100644
index 0000000000000000000000000000000000000000..191942040a1fdd276bb50f799ce314389c2cb0fe
--- /dev/null
+++ b/tensorflow/core/grappler/clusters/utils.h
@@ -0,0 +1,38 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_GRAPPLER_CLUSTERS_UTILS_H_
+#define TENSORFLOW_GRAPPLER_CLUSTERS_UTILS_H_
+
+#include "tensorflow/core/protobuf/device_properties.pb.h"
+#include "tensorflow/core/util/device_name_utils.h"
+
+namespace tensorflow {
+namespace grappler {
+
+// Returns the DeviceProperties of the CPU on which grappler is running.
+DeviceProperties GetLocalCPUInfo();
+
+// Returns the DeviceProperties for the specified GPU attached to the server on
+// which grappler is running.
+DeviceProperties GetLocalGPUInfo(int gpu_id);
+
+// Returns the DeviceProperties of the specified device
+DeviceProperties GetDeviceInfo(const DeviceNameUtils::ParsedName& device);
+
+}  // end namespace grappler
+}  // end namespace tensorflow
+
+#endif  // TENSORFLOW_GRAPPLER_CLUSTERS_UTILS_H_
diff --git a/tensorflow/core/grappler/clusters/virtual_cluster.cc b/tensorflow/core/grappler/clusters/virtual_cluster.cc
new file mode 100644
index 0000000000000000000000000000000000000000..4ca4c03dbb6dd7c2c578b0d86de2ecbe16f8e652
--- /dev/null
+++ b/tensorflow/core/grappler/clusters/virtual_cluster.cc
@@ -0,0 +1,44 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/grappler/clusters/virtual_cluster.h"
+
+namespace tensorflow {
+namespace grappler {
+
+VirtualCluster::VirtualCluster(
+    const std::unordered_map<string, DeviceProperties>& devices)
+    : Cluster(0) {
+  devices_ = devices;
+}
+
+VirtualCluster::~VirtualCluster() {}
+
+Status VirtualCluster::Provision() { return Status::OK(); }
+
+Status VirtualCluster::Initialize(const GrapplerItem& item) {
+  return Status::OK();
+}
+
+Status VirtualCluster::Run(const GraphDef& item,
+                           const std::vector<std::pair<string, Tensor>>& feed,
+                           const std::vector<string>& fetch,
+                           RunMetadata* metadata) {
+  return Status::OK();
+
+}
+
+}  // namespace grappler
+}  // namespace tensorflow
diff --git a/tensorflow/core/grappler/clusters/virtual_cluster.h b/tensorflow/core/grappler/clusters/virtual_cluster.h
new file mode 100644
index 0000000000000000000000000000000000000000..cd8436a9870e97457b67474870ad6b46215cf9ee
--- /dev/null
+++ b/tensorflow/core/grappler/clusters/virtual_cluster.h
@@ -0,0 +1,46 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_GRAPPLER_CLUSTERS_VIRTUAL_CLUSTER_H_
+#define TENSORFLOW_GRAPPLER_CLUSTERS_VIRTUAL_CLUSTER_H_
+
+#include <unordered_map>
+#include "tensorflow/core/grappler/clusters/cluster.h"
+#include "tensorflow/core/protobuf/device_properties.pb.h"
+
+namespace tensorflow {
+namespace grappler {
+
+// Create a simple cluster that lists the devices (and their properties)
+// available in a TensorFlow session. This cluster doesn't allow running an
+// actual graph. It is useful however when used in conjusction with costs models
+// that aren't based on the execution of the graph.
+class VirtualCluster : public Cluster {
+ public:
+  VirtualCluster(const std::unordered_map<string, DeviceProperties>& devices);
+
+  ~VirtualCluster() override;
+
+  Status Provision() override;
+  Status Initialize(const GrapplerItem& item) override;
+  Status Run(const GraphDef& item,
+             const std::vector<std::pair<string, Tensor>>& feed,
+             const std::vector<string>& fetch, RunMetadata* metadata) override;
+};
+
+}  // end namespace grappler
+}  // end namespace tensorflow
+
+#endif  // TENSORFLOW_GRAPPLER_CLUSTERS_VIRTUAL_CLUSTER_H_
diff --git a/tensorflow/core/grappler/costs/BUILD b/tensorflow/core/grappler/costs/BUILD
index d078d9af09e72bda2e64877bf0179982670e5e4f..43c727f381099b3ef963ba275e06928845aea839 100644
--- a/tensorflow/core/grappler/costs/BUILD
+++ b/tensorflow/core/grappler/costs/BUILD
@@ -25,7 +25,9 @@ tf_proto_library(
     name = "op_performance_data",
     srcs = ["op_performance_data.proto"],
     cc_api_version = 2,
-    protodeps = ["//tensorflow/core:protos_all"],
+    protodeps = [
+        "//tensorflow/core:protos_all",
+    ],
     visibility = ["//visibility:public"],
 )
 
@@ -37,7 +39,7 @@ cc_library(
     deps = [
         ":op_performance_data_cc",
         ":utils",
-        "//tensorflow/core:core_cpu",
+        "//tensorflow/core:core_cpu_base",
         "//tensorflow/core:protos_all_cc",
         "//tensorflow/core/grappler:grappler_item",
         "//tensorflow/core/grappler/clusters:cluster",
@@ -50,11 +52,13 @@ cc_test(
     args = ["--heap_check=local"],  # The GPU tracer leaks memory
     deps = [
         ":graph_properties",
+        "//tensorflow/cc:cc_ops",
+        "//tensorflow/cc:scope",
+        "//tensorflow/core:framework",
         "//tensorflow/core:lib_proto_parsing",
         "//tensorflow/core:test",
         "//tensorflow/core:test_main",
         "//tensorflow/core/grappler:grappler_item",
-        "//tensorflow/core/grappler:grappler_item_builder",
         "//tensorflow/core/grappler/clusters:single_machine",
         "//tensorflow/core/grappler/inputs:trivial_test_graph_input_yielder",
     ],
@@ -88,15 +92,34 @@ cc_test(
     ],
 )
 
+cc_library(
+    name = "robust_stats",
+    srcs = ["robust_stats.cc"],
+    hdrs = ["robust_stats.h"],
+    visibility = ["//visibility:public"],
+)
+
+cc_test(
+    name = "robust_stats_test",
+    srcs = ["robust_stats_test.cc"],
+    deps = [
+        ":robust_stats",
+        "//tensorflow/core:test",
+        "//tensorflow/core:test_main",
+    ],
+)
+
 cc_library(
     name = "utils",
     srcs = ["utils.cc"],
     hdrs = ["utils.h"],
+    defines = if_cuda(["GOOGLE_CUDA=1"]),
     visibility = ["//visibility:public"],
     deps = [
         ":op_performance_data_cc",
         "//third_party/eigen3",
-        "//tensorflow/core:core_cpu_internal",
+        "//tensorflow/core/grappler/clusters:utils",
+        "//tensorflow/core:core_cpu_base",
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
         "//tensorflow/core:protos_all_cc",
@@ -114,3 +137,99 @@ cc_library(
         "//tensorflow/core:lib",
     ],
 )
+
+cc_library(
+    name = "virtual_placer",
+    srcs = ["virtual_placer.cc"],
+    hdrs = ["virtual_placer.h"],
+    visibility = ["//visibility:public"],
+    deps = [
+        ":utils",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:framework_lite",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:protos_all_cc",
+        "//tensorflow/core/grappler:devices",
+        "//tensorflow/core/grappler/clusters:cluster",
+    ],
+)
+
+cc_library(
+    name = "virtual_scheduler",
+    srcs = ["virtual_scheduler.cc"],
+    hdrs = ["virtual_scheduler.h"],
+    visibility = ["//visibility:public"],
+    deps = [
+        "//tensorflow/core:protos_all_cc",
+        "//tensorflow/core/grappler:grappler_item",
+        "//tensorflow/core/grappler:utils",
+        "//tensorflow/core/grappler/costs:cost_estimator",
+    ],
+)
+
+cc_library(
+    name = "measuring_cost_estimator",
+    srcs = ["measuring_cost_estimator.cc"],
+    hdrs = ["measuring_cost_estimator.h"],
+    visibility = ["//visibility:public"],
+    deps = [
+        ":robust_stats",
+        "//tensorflow/core:core_cpu",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:lib_internal",
+        "//tensorflow/core:lib_proto_parsing",
+        "//tensorflow/core:protos_all_cc",
+        "//tensorflow/core/grappler:grappler_item",
+        "//tensorflow/core/grappler:grappler_item_builder",
+        "//tensorflow/core/grappler/clusters:cluster",
+        "//tensorflow/core/grappler/costs:cost_estimator",
+        "//tensorflow/core/kernels:ops_util",
+    ],
+)
+
+cc_library(
+    name = "op_level_cost_estimator",
+    srcs = ["op_level_cost_estimator.cc"],
+    hdrs = ["op_level_cost_estimator.h"],
+    visibility = ["//visibility:public"],
+    deps = [
+        ":cost_estimator",
+        ":op_performance_data_cc",
+        "//tensorflow/core:framework",
+        "//tensorflow/core/grappler/clusters:utils",
+    ],
+)
+
+cc_test(
+    name = "op_level_cost_estimator_test",
+    srcs = ["op_level_cost_estimator_test.cc"],
+    deps = [
+        ":op_level_cost_estimator",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:protos_all_cc",
+        "//tensorflow/core:test",
+        "//tensorflow/core:test_main",
+    ],
+)
+
+cc_library(
+    name = "analytical_cost_estimator",
+    srcs = ["analytical_cost_estimator.cc"],
+    hdrs = ["analytical_cost_estimator.h"],
+    visibility = ["//visibility:public"],
+    deps = [
+        ":cost_estimator",
+        ":graph_properties",
+        ":op_level_cost_estimator",
+        ":op_performance_data_cc",
+        ":utils",
+        ":virtual_placer",
+        ":virtual_scheduler",
+        "//tensorflow/core:core_cpu",
+        "//tensorflow/core:core_cpu_base",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:protos_all_cc",
+        "//tensorflow/core/grappler:grappler_item",
+    ],
+)
diff --git a/tensorflow/core/grappler/costs/analytical_cost_estimator.cc b/tensorflow/core/grappler/costs/analytical_cost_estimator.cc
new file mode 100644
index 0000000000000000000000000000000000000000..04ffc58cdad564da4b58f6cf4ca34d458e47a36d
--- /dev/null
+++ b/tensorflow/core/grappler/costs/analytical_cost_estimator.cc
@@ -0,0 +1,128 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/grappler/costs/analytical_cost_estimator.h"
+
+#include <limits>
+#include <unordered_map>
+
+#include "tensorflow/core/framework/attr_value.pb.h"
+#include "tensorflow/core/graph/types.h"
+#include "tensorflow/core/grappler/costs/graph_properties.h"
+#include "tensorflow/core/grappler/costs/op_performance_data.pb.h"
+#include "tensorflow/core/grappler/costs/utils.h"
+#include "tensorflow/core/grappler/costs/virtual_placer.h"
+#include "tensorflow/core/grappler/costs/virtual_scheduler.h"
+#include "tensorflow/core/grappler/grappler_item.h"
+#include "tensorflow/core/public/session.h"
+
+namespace tensorflow {
+namespace grappler {
+
+AnalyticalCostEstimator::AnalyticalCostEstimator(Cluster* cluster,
+                                                 bool use_static_shapes)
+    : cluster_(cluster), use_static_shapes_(use_static_shapes) {}
+
+Status AnalyticalCostEstimator::Initialize(const GrapplerItem& item) {
+  item_ = item;
+  return Status::OK();
+}
+
+Status AnalyticalCostEstimator::PredictCosts(const GraphDef& optimized_graph,
+                                             CostGraphDef* cost_graph,
+                                             Costs* costs) const {
+  GrapplerItem item = item_;
+  item.graph = optimized_graph;
+  GraphProperties properties(item);
+  Status status;
+  if (use_static_shapes_) {
+    status = properties.InferStatically();
+  } else {
+    status = properties.InferDynamically(cluster_);
+  }
+
+  if (!status.ok()) {
+    costs->execution_time = Costs::Duration::max();
+    return status;
+  }
+
+  std::unordered_map<string, CostGraphDef::Node*> name_to_cost;
+  if (cost_graph) {
+    for (auto& node : *cost_graph->mutable_node()) {
+      name_to_cost[node.name()] = &node;
+    }
+  }
+  std::vector<string> inaccurate_nodes;
+  VirtualScheduler scheduler(optimized_graph, item_.fetch);
+  VirtualPlacer placer(cluster_);
+  Costs node_costs;
+  do {
+    const NodeDef* node = scheduler.GetCurrNode();
+    std::vector<OpInfo::TensorProperties> inputs =
+        properties.GetInputProperties(node->name());
+
+    DeviceProperties device = placer.get_device(*node);
+    OpInfo op_info;
+    op_info.set_op(node->op());
+    *op_info.mutable_attr() = node->attr();
+    for (auto& input : inputs) {
+      op_info.add_inputs()->Swap(&input);
+    }
+    op_info.mutable_device()->Swap(&device);
+
+    node_costs = node_estimator_.PredictCosts(op_info);
+    if (node_costs.inaccurate) {
+      inaccurate_nodes.push_back(node->name());
+    }
+    if (cost_graph) {
+      auto it = name_to_cost.find(node->name());
+      CostGraphDef::Node* cost_node;
+      if (it != name_to_cost.end()) {
+        cost_node = it->second;
+      } else {
+        cost_node = cost_graph->add_node();
+        cost_node->set_name(node->name());
+      }
+      string device_name = properties.GetDeviceName(node->name());
+      cost_node->set_device(device_name);
+      cost_node->set_compute_cost(
+          node_costs.execution_time.asMicroSeconds().count());
+      cost_node->set_compute_time(
+          node_costs.compute_time.asMicroSeconds().count());
+      cost_node->set_memory_time(
+          node_costs.memory_time.asMicroSeconds().count());
+      std::vector<OpInfo::TensorProperties> outputs =
+          properties.GetOutputProperties(node->name());
+      for (const auto& output : outputs) {
+        auto output_info = cost_node->add_output_info();
+        output_info->set_dtype(output.dtype());
+        auto shape = output_info->mutable_shape();
+        *shape = output.shape();
+      }
+    }
+  } while (scheduler.MarkCurrNodeExecuted(node_costs));
+
+  *costs = scheduler.Summary();
+  VLOG(1) << inaccurate_nodes.size() << " out of "
+          << optimized_graph.node_size()
+          << " nodes have inaccurate time estimation";
+  for (const auto& node : inaccurate_nodes) {
+    VLOG(2) << "Node with inaccurate time estimation: " << node;
+  }
+  return Status::OK();
+}
+
+}  // end namespace grappler
+}  // end namespace tensorflow
diff --git a/tensorflow/core/grappler/costs/analytical_cost_estimator.h b/tensorflow/core/grappler/costs/analytical_cost_estimator.h
new file mode 100644
index 0000000000000000000000000000000000000000..03e7faa4ff5c722c890f1318992174d98cfd246a
--- /dev/null
+++ b/tensorflow/core/grappler/costs/analytical_cost_estimator.h
@@ -0,0 +1,63 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CORE_GRAPPLER_COSTS_ANALYTICAL_COST_ESTIMATOR_H_
+#define TENSORFLOW_CORE_GRAPPLER_COSTS_ANALYTICAL_COST_ESTIMATOR_H_
+
+#include "tensorflow/core/grappler/costs/cost_estimator.h"
+#include "tensorflow/core/grappler/costs/op_level_cost_estimator.h"
+#include "tensorflow/core/grappler/grappler_item.h"
+#include "tensorflow/core/lib/core/status.h"
+
+namespace tensorflow {
+class CostGraphDef;
+class GraphDef;
+}  // namespace tensorflow
+
+namespace tensorflow {
+namespace grappler {
+
+class Cluster;
+struct GrapplerItem;
+
+// Estimate the cost of running a Grappler item based on the theoretical
+// performance of the hardware that will run the model.
+class AnalyticalCostEstimator : public CostEstimator {
+ public:
+  // Does not take ownership of cluster.
+  explicit AnalyticalCostEstimator(Cluster* cluster, bool use_static_shapes);
+  ~AnalyticalCostEstimator() override {}
+
+  // Initalizes the estimator for the specified grappler item.
+  // This implementation always returns OK.
+  Status Initialize(const GrapplerItem& item) override;
+
+  // Predict the performance of each node of the optimized graph and annotate
+  // the CostGraphDef with the corresponding estimates. Also returns the
+  // expected latency for the whole graph.
+  Status PredictCosts(const GraphDef& optimized_graph, CostGraphDef* cost_graph,
+                      Costs* overall_latency) const override;
+
+ private:
+  Cluster* cluster_;  // Not owned.
+  GrapplerItem item_;
+  OpLevelCostEstimator node_estimator_;
+  bool use_static_shapes_;
+};
+
+}  // end namespace grappler
+}  // end namespace tensorflow
+
+#endif  // TENSORFLOW_CORE_GRAPPLER_COSTS_ANALYTICAL_COST_ESTIMATOR_H_
diff --git a/tensorflow/core/grappler/costs/cost_estimator.h b/tensorflow/core/grappler/costs/cost_estimator.h
index 093b7e29dc812c03ecdb742aea50fb689b19b024..b3fb3522a39eadf08f9a0f18cb68bfc3bfbb0028 100644
--- a/tensorflow/core/grappler/costs/cost_estimator.h
+++ b/tensorflow/core/grappler/costs/cost_estimator.h
@@ -42,7 +42,8 @@ struct Costs {
   struct MicroSeconds : std::chrono::microseconds {
     MicroSeconds() : std::chrono::microseconds(0) {}
     MicroSeconds(double d) : std::chrono::microseconds(static_cast<int64>(d)) {}
-    MicroSeconds(std::chrono::microseconds& d) : std::chrono::microseconds(d) {}
+    MicroSeconds(const std::chrono::microseconds& d)
+        : std::chrono::microseconds(d) {}
     MicroSeconds& operator=(const std::chrono::microseconds& d) {
       std::chrono::microseconds::operator=(d);
       return *this;
@@ -51,7 +52,8 @@ struct Costs {
   struct NanoSeconds : std::chrono::nanoseconds {
     NanoSeconds() : std::chrono::nanoseconds(0) {}
     NanoSeconds(double d) : std::chrono::nanoseconds(static_cast<int64>(d)) {}
-    NanoSeconds(std::chrono::nanoseconds& d) : std::chrono::nanoseconds(d) {}
+    NanoSeconds(const std::chrono::nanoseconds& d)
+        : std::chrono::nanoseconds(d) {}
     NanoSeconds& operator=(const std::chrono::nanoseconds& d) {
       std::chrono::nanoseconds::operator=(d);
       return *this;
@@ -90,6 +92,8 @@ struct Costs {
   int64 max_per_op_buffers;    // Sum of all buffers used by the ops.
   int64 max_per_op_streaming;  // Ignore largest input buffer, assuming it
                                // streams from main memory.
+  // If the time estimation is inaccurate.
+  bool inaccurate = false;
 };
 
 inline std::ostream& operator<<(std::ostream& os, const Costs::MicroSeconds d) {
diff --git a/tensorflow/core/grappler/costs/graph_properties.cc b/tensorflow/core/grappler/costs/graph_properties.cc
index 4edcdccdfe1c3d70f4c915aff3a378f634b50834..b0e69d44edd129eaa29ac282540b7791cf377fd0 100644
--- a/tensorflow/core/grappler/costs/graph_properties.cc
+++ b/tensorflow/core/grappler/costs/graph_properties.cc
@@ -15,6 +15,9 @@ limitations under the License.
 
 #include "tensorflow/core/grappler/costs/graph_properties.h"
 
+#include <queue>
+#include <unordered_map>
+#include <unordered_set>
 #include "tensorflow/core/common_runtime/shape_refiner.h"
 #include "tensorflow/core/framework/tensor_shape.pb.h"
 #include "tensorflow/core/graph/graph_constructor.h"
@@ -26,10 +29,81 @@ namespace grappler {
 Status GraphProperties::InferStatically() {
   Graph graph(OpRegistry::Global());
   ShapeRefiner shape_refiner(graph.versions().producer(), graph.op_registry());
+  shape_refiner.set_require_shape_inference_fns(false);
   ImportGraphDefOptions options;
   Status s = ImportGraphDef(options, item_.graph, &graph, &shape_refiner);
   TF_RETURN_IF_ERROR(s);
 
+  // List the resources and the nodes using them
+  std::unordered_map<const Node*, std::unordered_set<const Node*>> resources;
+  for (const Node* const node : graph.nodes()) {
+    for (int i = 0; i < node->num_inputs(); ++i) {
+      if (node->input_type(i) == DataType::DT_RESOURCE) {
+        const Node* resource;
+        TF_CHECK_OK(node->input_node(i, &resource));
+        resources[resource].insert(node);
+      }
+    }
+  }
+
+  // If we found a resource, try to propagate the shapes through it.
+  bool done = true;
+  do {
+    std::queue<const Node*> new_shapes;
+    for (const auto& resource_data : resources) {
+      const Node* qnode = resource_data.first;
+      StringPiece type(qnode->type_string());
+      if (!type.ends_with("QueueV2")) {
+        continue;
+      }
+      auto qctx = shape_refiner.GetContext(qnode);
+      if (!qctx) {
+        continue;
+      }
+      DataType queue_type = qctx->output_handle_dtype(0);
+      shape_inference::ShapeHandle queue_shp = qctx->output_handle_shape(0);
+      if (qctx->FullyDefined(queue_shp) && queue_type != DT_INVALID) {
+        continue;
+      }
+
+      for (const auto& node : resource_data.second) {
+        auto ctx = shape_refiner.GetContext(node);
+        if (!ctx) {
+          continue;
+        }
+        if (node->type_string().find("Enqueue") != std::string::npos) {
+          if (ctx->num_inputs() == 2) {
+            const DataType dtype = node->input_type(1);
+            if (queue_type == DT_INVALID) {
+              queue_type = dtype;
+            } else {
+              CHECK_EQ(queue_type, dtype);
+            }
+            shape_inference::ShapeHandle shp = ctx->input(1);
+            TF_RETURN_IF_ERROR(qctx->Merge(queue_shp, shp, &queue_shp));
+          }
+        }
+      }
+      if (qctx->set_output_handle_dtype(0, queue_type) |
+          qctx->MergeOutputHandleShape(0, queue_shp)) {
+        new_shapes.push(qnode);
+      }
+    }
+    // Propagate the shapes in the transitive fan-out of the queue.
+    done = new_shapes.empty();
+    while (!new_shapes.empty()) {
+      const Node* n = new_shapes.front();
+      new_shapes.pop();
+      for (const Node* fanout : n->out_nodes()) {
+        bool updated = false;
+        TF_RETURN_IF_ERROR(shape_refiner.UpdateNode(fanout, &updated));
+        if (updated) {
+          new_shapes.push(fanout);
+        }
+      }
+    }
+  } while (!done);
+
   for (const Node* const node : graph.nodes()) {
     VLOG(1) << "<Node> " << node->name();
     auto ctx = shape_refiner.GetContext(node);
@@ -77,8 +151,8 @@ Status GraphProperties::InferStatically() {
 
     if (!node->assigned_device_name().empty()) {
       device_names_[node->name()] = node->assigned_device_name();
-    } else if (!node->def().device().empty()) {
-      device_names_[node->name()] = node->def().device();
+    } else if (!node->requested_device().empty()) {
+      device_names_[node->name()] = node->requested_device();
     } else {
       device_names_[node->name()] = "not set";
     }
diff --git a/tensorflow/core/grappler/costs/graph_properties_test.cc b/tensorflow/core/grappler/costs/graph_properties_test.cc
index 32683644fbbe7293a821605a8467aa428273a1fc..be5ae3c3646a6aec0bf177a0e7c666b9365023d1 100644
--- a/tensorflow/core/grappler/costs/graph_properties_test.cc
+++ b/tensorflow/core/grappler/costs/graph_properties_test.cc
@@ -14,6 +14,9 @@ limitations under the License.
 ==============================================================================*/
 
 #include "tensorflow/core/grappler/costs/graph_properties.h"
+#include "tensorflow/cc/framework/scope.h"
+#include "tensorflow/cc/ops/standard_ops.h"
+#include "tensorflow/core/framework/node_def_builder.h"
 #include "tensorflow/core/grappler/clusters/single_machine.h"
 #include "tensorflow/core/grappler/grappler_item.h"
 #include "tensorflow/core/grappler/inputs/trivial_test_graph_input_yielder.h"
@@ -129,6 +132,328 @@ TEST_F(GraphPropertiesTest, DynamicProperties) {
   }
 }
 
+TEST_F(GraphPropertiesTest, VarHandles) {
+  GrapplerItem item;
+  TF_CHECK_OK(NodeDefBuilder("Var", "VarHandleOp")
+                  .Attr("dtype", DT_FLOAT)
+                  .Attr("shape", TensorShape({3, 7}))
+                  .Finalize(item.graph.add_node()));
+
+  TF_CHECK_OK(NodeDefBuilder("VarRead", "ReadVariableOp")
+                  .Attr("dtype", DT_FLOAT)
+                  .Input("Var", 0, DT_RESOURCE)
+                  .Finalize(item.graph.add_node()));
+
+  GraphProperties properties(item);
+  TF_CHECK_OK(properties.InferStatically());
+
+  const auto props = properties.GetOutputProperties("VarRead");
+  EXPECT_EQ(1, props.size());
+  const OpInfo::TensorProperties& prop = props[0];
+  EXPECT_EQ(DT_FLOAT, prop.dtype());
+  EXPECT_FALSE(prop.shape().unknown_rank());
+  EXPECT_EQ(2, prop.shape().dim_size());
+  EXPECT_EQ(3, prop.shape().dim(0).size());
+  EXPECT_EQ(7, prop.shape().dim(1).size());
+}
+
+TEST_F(GraphPropertiesTest, Queues) {
+  // Create a graph with known input shapes, and propagate the shapes through a
+  // couple of queues.
+  tensorflow::Scope root = tensorflow::Scope::NewRootScope();
+
+  auto q1 = ops::FIFOQueue(root.WithOpName("Queue1"), {DataType::DT_FLOAT});
+  Output rnd =
+      ops::RandomNormal(root.WithOpName("rnd"), {3, 7}, DataType::DT_FLOAT);
+  Output square1 = ops::Square(root.WithOpName("Square1"), rnd);
+  auto enqueue1 = ops::QueueEnqueue(root.WithOpName("Enqueue1"), q1, {square1});
+  auto dequeue1 =
+      ops::QueueDequeue(root.WithOpName("Dequeue1"), q1, {DataType::DT_FLOAT});
+
+  auto q2 =
+      ops::RandomShuffleQueue(root.WithOpName("Queue2"), {DataType::DT_FLOAT});
+  Output square2 = ops::Square(root.WithOpName("Square2"), dequeue1[0]);
+  auto enqueue2 = ops::QueueEnqueue(root.WithOpName("Enqueue2"), q2, {square2});
+  auto dequeue2 =
+      ops::QueueDequeue(root.WithOpName("Dequeue2"), q2, {DataType::DT_FLOAT});
+
+  // Create a queue that feeds itself.
+  auto q3 =
+      ops::RandomShuffleQueue(root.WithOpName("Queue3"), {DataType::DT_FLOAT});
+  auto dequeue3 =
+      ops::QueueDequeue(root.WithOpName("Dequeue3"), q3, {DataType::DT_FLOAT});
+  auto merge3 = ops::Merge(root.WithOpName("Merge3"), {dequeue3[0], square2});
+  auto enqueue3 =
+      ops::QueueEnqueue(root.WithOpName("Enqueue3"), q3, {merge3.output});
+
+  auto q4 =
+      ops::RandomShuffleQueue(root.WithOpName("Queue4"), {DataType::DT_FLOAT});
+  auto enqueue4 = ops::QueueEnqueue(root.WithOpName("Enqueue4"), q4, {square2});
+  auto enqueue4_2 =
+      ops::QueueEnqueue(root.WithOpName("Enqueue4_2"), q4, {dequeue3[0]});
+  auto dequeue4 =
+      ops::QueueDequeue(root.WithOpName("Dequeue4"), q4, {DataType::DT_FLOAT});
+
+  GrapplerItem item;
+  TF_CHECK_OK(root.ToGraphDef(&item.graph));
+
+  GraphProperties properties(item);
+  TF_CHECK_OK(properties.InferStatically());
+
+  const auto props1 = properties.GetOutputProperties("Dequeue1");
+  EXPECT_EQ(1, props1.size());
+  const OpInfo::TensorProperties& prop1 = props1[0];
+  EXPECT_EQ(DT_FLOAT, prop1.dtype());
+  EXPECT_FALSE(prop1.shape().unknown_rank());
+  EXPECT_EQ(2, prop1.shape().dim_size());
+  EXPECT_EQ(3, prop1.shape().dim(0).size());
+  EXPECT_EQ(7, prop1.shape().dim(1).size());
+
+  const auto props2 = properties.GetOutputProperties("Dequeue2");
+  EXPECT_EQ(1, props2.size());
+  const OpInfo::TensorProperties& prop2 = props2[0];
+  EXPECT_EQ(DT_FLOAT, prop2.dtype());
+  EXPECT_FALSE(prop2.shape().unknown_rank());
+  EXPECT_EQ(2, prop2.shape().dim_size());
+  EXPECT_EQ(3, prop2.shape().dim(0).size());
+  EXPECT_EQ(7, prop2.shape().dim(1).size());
+
+  // The dequeue3 op shape is unknown. The square2 op shape is known. Verify
+  // that we merge the 2 properly to determine the shape of the data coming out
+  // of the queue.
+  const auto props4 = properties.GetOutputProperties("Dequeue4");
+  EXPECT_EQ(1, props4.size());
+  const OpInfo::TensorProperties& prop4 = props4[0];
+  EXPECT_EQ(DT_FLOAT, prop4.dtype());
+  EXPECT_FALSE(prop4.shape().unknown_rank());
+  EXPECT_EQ(2, prop4.shape().dim_size());
+  EXPECT_EQ(3, prop4.shape().dim(0).size());
+  EXPECT_EQ(7, prop4.shape().dim(1).size());
+}
+
+TEST_F(GraphPropertiesTest, Loops) {
+  // Test graph produced in python using:
+  /*
+     with tf.Graph().as_default():
+       i = tf.constant(0)
+       c = lambda i: tf.less(i, 10)
+       b = lambda i: tf.add(i, 1)
+       r = tf.while_loop(c, b, [i])
+       with open('/tmp/graph.txt', 'w') as f:
+         f.write(str(tf.get_default_graph().as_graph_def()))
+  */
+  const string gdef_ascii = R"EOF(
+node {
+  name: "Const"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "while/Enter"
+  op: "Enter"
+  input: "Const"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "frame_name"
+    value {
+      s: "while/while/"
+    }
+  }
+  attr {
+    key: "is_constant"
+    value {
+      b: false
+    }
+  }
+  attr {
+    key: "parallel_iterations"
+    value {
+      i: 10
+    }
+  }
+}
+node {
+  name: "while/Merge"
+  op: "Merge"
+  input: "while/Enter"
+  input: "while/NextIteration"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "while/Less/y"
+  op: "Const"
+  input: "^while/Merge"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 10
+      }
+    }
+  }
+}
+node {
+  name: "while/Less"
+  op: "Less"
+  input: "while/Merge"
+  input: "while/Less/y"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "while/LoopCond"
+  op: "LoopCond"
+  input: "while/Less"
+}
+node {
+  name: "while/Switch"
+  op: "Switch"
+  input: "while/Merge"
+  input: "while/LoopCond"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@while/Merge"
+      }
+    }
+  }
+}
+node {
+  name: "while/Identity"
+  op: "Identity"
+  input: "while/Switch:1"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "while/Add/y"
+  op: "Const"
+  input: "^while/Identity"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "while/Add"
+  op: "Add"
+  input: "while/Identity"
+  input: "while/Add/y"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "while/NextIteration"
+  op: "NextIteration"
+  input: "while/Add"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "while/Exit"
+  op: "Exit"
+  input: "while/Switch"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+versions {
+  producer: 11
+}
+  )EOF";
+
+  GrapplerItem item;
+  CHECK(protobuf::TextFormat::ParseFromString(gdef_ascii, &item.graph));
+  GraphProperties properties(item);
+  TF_CHECK_OK(properties.InferStatically());
+
+  const auto props = properties.GetOutputProperties("while/Exit");
+  EXPECT_EQ(1, props.size());
+  const OpInfo::TensorProperties& prop = props[0];
+  EXPECT_EQ(DT_INT32, prop.dtype());
+  EXPECT_TRUE(prop.shape().unknown_rank());
+}
+
 }  // namespace
 }  // namespace grappler
 }  // namespace tensorflow
diff --git a/tensorflow/core/grappler/costs/measuring_cost_estimator.cc b/tensorflow/core/grappler/costs/measuring_cost_estimator.cc
new file mode 100644
index 0000000000000000000000000000000000000000..9262883b2a7bae1ade9d8fffb6680e1808d7e53b
--- /dev/null
+++ b/tensorflow/core/grappler/costs/measuring_cost_estimator.cc
@@ -0,0 +1,133 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/grappler/costs/measuring_cost_estimator.h"
+
+#include <limits>
+
+#include "tensorflow/core/grappler/clusters/cluster.h"
+#include "tensorflow/core/grappler/costs/robust_stats.h"
+#include "tensorflow/core/grappler/grappler_item.h"
+#include "tensorflow/core/kernels/ops_util.h"
+#include "tensorflow/core/lib/core/blocking_counter.h"
+#include "tensorflow/core/platform/env.h"
+#include "tensorflow/core/public/session.h"
+
+namespace tensorflow {
+namespace grappler {
+
+MeasuringCostEstimator::MeasuringCostEstimator(Cluster* cluster,
+                                               int measurement_steps,
+                                               int measurement_threads)
+    : measurement_steps_(measurement_steps),
+      measurement_threads_(measurement_threads) {
+  CHECK_GE(measurement_steps, 1);
+  if (measurement_threads > 0) {
+    thread_pool_.reset(new thread::ThreadPool(
+        Env::Default(), SanitizeThreadSuffix("measurements"),
+        measurement_threads));
+  }
+  cluster_ = cluster;
+}
+
+Status MeasuringCostEstimator::Initialize(const GrapplerItem& item) {
+  feed_ = item.feed;
+  fetch_ = item.fetch;
+  return cluster_->Initialize(item);
+}
+
+Status MeasuringCostEstimator::PredictCosts(const GraphDef& optimized_graph,
+                                            CostGraphDef* cost_graph,
+                                            Costs* costs) const {
+  std::vector<double> times(measurement_steps_);
+  BlockingCounter barrier(measurement_steps_);
+
+  mutex status_mu;
+  Status status;
+
+  auto measurement_fn = [&](const int step) {
+    const Costs::MicroSeconds start = Env::Default()->NowMicros();
+
+    RunMetadata metadata;
+    const Status local_status =
+        cluster_->Run(optimized_graph, feed_, fetch_, &metadata);
+    {
+      mutex_lock lock(status_mu);
+      status.Update(local_status);
+    }
+    if (step < 0) {
+      // Discard the first iteration as it triggers the warmup, and therefore
+      // takes much longer than a normal step.
+      return;
+    }
+    if (!local_status.ok()) {
+      // Discard the data if the run wasn't successful.
+      barrier.DecrementCount();
+      return;
+    }
+
+    const Costs::MicroSeconds finish = Env::Default()->NowMicros();
+    const double time = (finish - start).count() * 1e3;
+    times[step] = time;
+
+    if (cost_graph && (step + 1 == measurement_steps_)) {
+      metadata.mutable_cost_graph()->Swap(cost_graph);
+    }
+
+    barrier.DecrementCount();
+  };
+
+  // Initialize the computation and warm up TensorFlow.
+  measurement_fn(-1);
+
+  if (!status.ok()) {
+    LOG(ERROR) << "Failed to run start measurements: "
+               << status.error_message();
+    costs->execution_time = Costs::Duration::max();
+    return status;
+  }
+
+  // Run "measurement_steps_" and measure the time.
+  if (measurement_threads_ > 0) {
+    for (int i = 0; i < measurement_steps_; ++i) {
+      thread_pool_->Schedule([i, &measurement_fn]() { measurement_fn(i); });
+    }
+    barrier.Wait();
+  } else {
+    for (int i = 0; i < measurement_steps_ && status.ok(); ++i) {
+      measurement_fn(i);
+    }
+  }
+
+  if (!status.ok()) {
+    LOG(ERROR) << "Failed to measure graph performance: "
+               << status.error_message();
+    costs->execution_time = Costs::Duration::max();
+    costs->max_execution_time = Costs::Duration::max();
+    costs->min_execution_time = 0;
+    return status;
+  }
+
+  // Compute the average time of the measure steps. Use Huber statistics
+  // to filter out outliers.
+  RobustStats stats(times);
+  costs->execution_time = Costs::Duration(stats.mean());
+  costs->max_execution_time = Costs::Duration(stats.hi());
+  costs->min_execution_time = Costs::Duration(stats.lo());
+
+  return Status::OK();
+}
+}  // end namespace grappler
+}  // end namespace tensorflow
diff --git a/tensorflow/core/grappler/costs/measuring_cost_estimator.h b/tensorflow/core/grappler/costs/measuring_cost_estimator.h
new file mode 100644
index 0000000000000000000000000000000000000000..a84853f6c7179cff7f0954b111f0ab187cd75a62
--- /dev/null
+++ b/tensorflow/core/grappler/costs/measuring_cost_estimator.h
@@ -0,0 +1,76 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_GRAPPLER_COSTS_MEASURING_COST_ESTIMATOR_H_
+#define TENSORFLOW_GRAPPLER_COSTS_MEASURING_COST_ESTIMATOR_H_
+
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/grappler/costs/cost_estimator.h"
+#include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/lib/core/threadpool.h"
+#include "tensorflow/core/platform/types.h"
+
+namespace tensorflow {
+class CostGraphDef;
+class GraphDef;
+}  // namespace tensorflow
+
+namespace tensorflow {
+namespace grappler {
+
+class Cluster;
+struct GrapplerItem;
+
+// Estimate the cost of running a Grappler item by actually running the
+// corresponding TensorFlow graph on the specified cluster and measuring the
+// runtimes.
+class MeasuringCostEstimator : public CostEstimator {
+ public:
+  // Run the model for measurement_steps to measure its average cost.
+  // When measurement_threads is greater than 0, use a threadpool of as many
+  // threads to run the measurements; otherwise, run them serially. Does not
+  // take ownership of cluster.
+  explicit MeasuringCostEstimator(Cluster* cluster, int measurement_steps,
+                                  int measurement_threads);
+  ~MeasuringCostEstimator() override {}
+
+  // Initalizes the estimator for the specified grappler item.
+  // This implementation always returns OK.
+  Status Initialize(const GrapplerItem& item) override;
+
+  // Runs the optimized version of the graph on the cluster, measure
+  // the runtimes of each operation, and annotated the CostGraphDef
+  // with the corresponding measurements.
+  // Returns the average latency for the whole graph.
+  Status PredictCosts(const GraphDef& optimized_graph, CostGraphDef* cost_graph,
+                      Costs* overall_cost) const override;
+
+ private:
+  Cluster* cluster_;  // Not owned.
+  int measurement_steps_;
+  int measurement_threads_;
+  std::vector<std::pair<string, Tensor>> feed_;
+  std::vector<string> fetch_;
+  std::unique_ptr<thread::ThreadPool> thread_pool_;
+};
+
+}  // end namespace grappler
+}  // end namespace tensorflow
+
+#endif  // TENSORFLOW_GRAPPLER_COSTS_MEASURING_COST_ESTIMATOR_H_
diff --git a/tensorflow/core/grappler/costs/op_level_cost_estimator.cc b/tensorflow/core/grappler/costs/op_level_cost_estimator.cc
new file mode 100644
index 0000000000000000000000000000000000000000..5298dc756567f5c5f4631f84606b2f9d8ddd3159
--- /dev/null
+++ b/tensorflow/core/grappler/costs/op_level_cost_estimator.cc
@@ -0,0 +1,561 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/grappler/costs/op_level_cost_estimator.h"
+#include "tensorflow/core/framework/attr_value_util.h"
+#include "tensorflow/core/grappler/clusters/utils.h"
+
+namespace tensorflow {
+namespace grappler {
+
+constexpr int kOpsPerMac = 2;
+constexpr char kConv2d[] = "Conv2D";
+constexpr char kConv2dBackPropFilter[] = "Conv2DBackpropFilter";
+constexpr char kConv2dBackPropInput[] = "Conv2DBackpropInput";
+constexpr char kMatMul[] = "MatMul";
+constexpr char kSparseMatMul[] = "SparseMatMul";
+constexpr char kIdentity[] = "Identity";
+constexpr char kNoOp[] = "NoOp";
+constexpr char kReshape[] = "Reshape";
+
+OpLevelCostEstimator::OpLevelCostEstimator() {
+  // Syntactic sugar to build and return a lambda that takes an OpInfo and
+  // returns a cost.
+  typedef Costs (OpLevelCostEstimator::*CostImpl)(const OpInfo& op_feature)
+      const;
+  auto wrap = [this](CostImpl impl) -> std::function<Costs(const OpInfo&)> {
+    return [this, impl](const OpInfo& op) { return (this->*impl)(op); };
+  };
+
+  device_cost_impl_ = {
+      {kConv2d, wrap(&OpLevelCostEstimator::PredictConv2D)},
+      {kConv2dBackPropFilter,
+       wrap(&OpLevelCostEstimator::PredictConv2DBackPropFilter)},
+      {kConv2dBackPropInput,
+       wrap(&OpLevelCostEstimator::PredictConv2DBackPropInput)},
+      {kMatMul, wrap(&OpLevelCostEstimator::PredictMatMul)},
+      {kSparseMatMul, wrap(&OpLevelCostEstimator::PredictMatMul)},
+      {kIdentity, wrap(&OpLevelCostEstimator::PredictNoOp)},
+      {kNoOp, wrap(&OpLevelCostEstimator::PredictNoOp)},
+      {kReshape, wrap(&OpLevelCostEstimator::PredictNoOp)}};
+}
+
+Costs OpLevelCostEstimator::PredictCosts(const OpInfo& op_features) const {
+  auto it = device_cost_impl_.find(op_features.op());
+  if (it == device_cost_impl_.end()) {
+    VLOG(1) << "Missing implementation for op: " << op_features.op();
+    Costs costs;
+    costs = DummyExecutionTime(op_features);
+    return costs;
+  }
+
+  std::function<Costs(const OpInfo&)> estimator = it->second;
+  Costs costs = estimator(op_features);
+  VLOG(1) << "Operation " << op_features.op() << " takes "
+          << costs.execution_time.count() << " ns.";
+  return costs;
+}
+
+std::pair<double, double> OpLevelCostEstimator::GetDeviceInfo(
+    const DeviceProperties& device) const {
+  double gflops = -1;
+  double bandwidth = -1;
+  if (device.bandwidth() > 0) {
+    bandwidth = device.bandwidth() / 1e6;
+  }
+
+  if (device.type() == "CPU") {
+    DeviceProperties local_cpu;
+    if (device.num_cores() <= 0 || device.frequency() <= 0) {
+      local_cpu = GetLocalCPUInfo();
+    } else {
+      local_cpu = device;
+    }
+
+    // Check if vector instructions are available, and refine performance
+    // prediction based on this.
+    // Frequencies are stored in MHz in the DeviceProperties.
+    gflops = local_cpu.num_cores() * local_cpu.frequency() * 1e-3;
+    if (bandwidth < 0) {
+      if (local_cpu.bandwidth() > 0) {
+        bandwidth = local_cpu.bandwidth() / 1e6;
+      } else {
+        bandwidth = 32;
+      }
+    }
+  } else if (device.type() == "GPU") {
+    const DeviceProperties local_gpu = GetLocalGPUInfo(0);
+    const string architecture = local_gpu.environment().at("architecture");
+    int cores_per_multiprocessor;
+    if (architecture < "3") {
+      // Fermi
+      cores_per_multiprocessor = 32;
+    } else if (architecture < "4") {
+      // Kepler
+      cores_per_multiprocessor = 192;
+    } else if (architecture < "6") {
+      //  Maxwell
+      cores_per_multiprocessor = 128;
+    } else {
+      // Pascal.
+      cores_per_multiprocessor = 64;
+    }
+    gflops = local_gpu.num_cores() * local_gpu.frequency() * 1e-3 *
+             cores_per_multiprocessor * kOpsPerMac;
+    if (bandwidth < 0) {
+      CHECK(local_gpu.bandwidth() > 0);
+      bandwidth = local_gpu.bandwidth() / 1e6;
+    }
+  }
+
+  return std::make_pair(gflops, bandwidth);
+}
+
+Costs OpLevelCostEstimator::DummyExecutionTime(
+    const OpInfo& op_features) const {
+  Costs costs = PredictOpCountBasedCost(0, op_features);
+  costs.inaccurate = true;
+  return costs;
+}
+
+Costs OpLevelCostEstimator::PredictOpCountBasedCost(
+    double operations, const OpInfo& op_features) const {
+  std::pair<double, double> device_perf = GetDeviceInfo(op_features.device());
+  Costs::NanoSeconds compute_cost(operations / device_perf.first);
+  VLOG(1) << "Op:" << op_features.op() << " GOps:" << operations / 1e9
+          << " Execution Time (ns):" << compute_cost.count();
+
+  bool found_unknown_shapes = false;
+  double total_input_size =
+      CalculateInputSize(op_features, &found_unknown_shapes);
+  double total_output_size =
+      CalculateOutputSize(op_features, &found_unknown_shapes);
+  double total_io_size = total_input_size + total_output_size;
+
+  Costs::NanoSeconds memory_cost(total_io_size / device_perf.second);
+  VLOG(1) << "Op:" << op_features.op() << " Size (KB):" << (total_io_size) / 1e3
+          << " Memory Time (ns):" << memory_cost.count();
+
+  Costs costs;
+  costs.compute_time = compute_cost;
+  costs.memory_time = memory_cost;
+  costs.execution_time = compute_cost + memory_cost;
+  costs.inaccurate = found_unknown_shapes;
+  return costs;
+}
+
+int64 OpLevelCostEstimator::CountConv2DOperations(
+    const OpInfo& op_features, bool* found_unknown_shapes) const {
+  return CountConv2DOperations(op_features, nullptr, found_unknown_shapes);
+}
+
+namespace {
+
+string GetDataFormat(const OpInfo& op_features) {
+  string data_format = "NHWC";  // Default format.
+  if (op_features.attr().find("data_format") != op_features.attr().end()) {
+    data_format = op_features.attr().at("data_format").s();
+  }
+  return data_format;
+}
+
+Padding GetPadding(const OpInfo& op_features) {
+  if (op_features.attr().find("padding") != op_features.attr().end() &&
+      op_features.attr().at("padding").s() == "VALID") {
+    return Padding::VALID;
+  }
+  return Padding::SAME;  // Default padding.
+}
+
+std::vector<int64> GetStrides(const OpInfo& op_features) {
+  if (op_features.attr().find("strides") != op_features.attr().end()) {
+    const auto strides = op_features.attr().at("strides").list().i();
+    return {strides[0], strides[1], strides[2], strides[3]};
+  }
+  return {1, 1, 1, 1};
+}
+
+int64 GetOutputSize(const int64 input, const int64 filter, const int64 stride,
+                    const Padding& padding) {
+  // Logic for calculating output shape is from GetWindowedOutputSizeVerbose()
+  // function in third_party/tensorflow/core/framework/common_shape_fns.cc.
+  if (padding == Padding::VALID) {
+    return (input - filter + stride) / stride;
+  } else {  // SAME.
+    return (input + stride - 1) / stride;
+  }
+}
+
+// Return a minimum shape if the shape is unknown. If known, return the original
+// shape.
+TensorShapeProto MaybeGetMinimumShape(const TensorShapeProto& original_shape,
+                                      int rank, bool* found_unknown_shapes) {
+  auto shape = original_shape;
+  if (shape.unknown_rank()) {
+    *found_unknown_shapes = true;
+  }
+  if (shape.unknown_rank() || shape.dim_size() == 0) {
+    TensorShapeProto::Dim dim;
+    VLOG(1) << "WARNING: Use minimum shape because the shape is unknown.";
+    // The size of each dimension is at least 1, if unknown.
+    dim.set_size(1);
+    for (int i = 0; i < rank; i++) {
+      *shape.add_dim() = dim;
+    }
+  } else {
+    CHECK_EQ(shape.dim_size(), rank);
+    for (int i = 0; i < rank; i++) {
+      if (shape.dim(i).size() == -1) {
+        *found_unknown_shapes = true;
+        VLOG(1)
+            << "WARNING: Use minimum dim size 1 because the shape is unknown.";
+        // The size of each dimension is at least 1, if unknown.
+        shape.mutable_dim(i)->set_size(1);
+      }
+    }
+  }
+  return shape;
+}
+}  // namespace
+
+// Helper to translate the positional arguments into named fields.
+OpLevelCostEstimator::ConvolutionDimensions
+OpLevelCostEstimator::ConvolutionDimensionsFromInputs(
+    const TensorShapeProto& original_image_shape,
+    const TensorShapeProto& original_filter_shape, const OpInfo& op_features,
+    bool* found_unknown_shapes) {
+  auto image_shape =
+      MaybeGetMinimumShape(original_image_shape, 4, found_unknown_shapes);
+  auto filter_shape =
+      MaybeGetMinimumShape(original_filter_shape, 4, found_unknown_shapes);
+
+  int x_index, y_index, channel_index;
+  const string& data_format = GetDataFormat(op_features);
+  if (data_format == "NCHW") {
+    x_index = 2;
+    y_index = 3;
+    channel_index = 1;
+  } else {
+    x_index = 1;
+    y_index = 2;
+    channel_index = 3;
+  }
+  int64 batch = image_shape.dim(0).size();
+  int64 ix = image_shape.dim(x_index).size();
+  int64 iy = image_shape.dim(y_index).size();
+  int64 iz = image_shape.dim(channel_index).size();
+  int64 kx = filter_shape.dim(0).size();
+  int64 ky = filter_shape.dim(1).size();
+  std::vector<int64> strides = GetStrides(op_features);
+  const auto padding = GetPadding(op_features);
+  int64 sx = strides[x_index];
+  int64 sy = strides[y_index];
+  int64 ox = GetOutputSize(ix, kx, sx, padding);
+  int64 oy = GetOutputSize(iy, ky, sy, padding);
+  int64 oz = filter_shape.dim(3).size();
+  // Only check equality when both sizes are known (in other words, when
+  // neither is set to a minimum dimension size of 1).
+  if (iz != 1 && filter_shape.dim(2).size() != 1) {
+    CHECK_EQ(iz, filter_shape.dim(2).size());
+  } else {
+    iz = std::max<int64>(iz, filter_shape.dim(2).size());
+  }
+  OpLevelCostEstimator::ConvolutionDimensions conv_dims = {
+      batch, ix, iy, iz, kx, ky, oz, ox, oy, sx, sy, padding};
+
+  VLOG(1) << "Batch Size:" << batch;
+  VLOG(1) << "Image Dims:" << ix << "," << iy;
+  VLOG(1) << "Input Features:" << iz;
+  VLOG(1) << "Kernel Dims:" << kx << "," << ky;
+  VLOG(1) << "Output Features:" << oz;
+  VLOG(1) << "Output Dims:" << ox << "," << oy;
+  VLOG(1) << "Strides:" << sx << "," << sy;
+  VLOG(1) << "Padding:" << (padding == Padding::VALID ? "VALID" : "SAME");
+  return conv_dims;
+}
+
+int64 OpLevelCostEstimator::CountConv2DOperations(
+    const OpInfo& op_features, ConvolutionDimensions* conv_info,
+    bool* found_unknown_shapes) const {
+  if (op_features.op() != kConv2d) {
+    LOG(ERROR) << "Invalid Operation";
+    return 0;
+  }
+  ConvolutionDimensions conv_dims = ConvolutionDimensionsFromInputs(
+      op_features.inputs(0).shape(), op_features.inputs(1).shape(), op_features,
+      found_unknown_shapes);
+
+  int64 ops = conv_dims.batch;
+  ops *= conv_dims.ox * conv_dims.oy;
+  ops *= conv_dims.kx * conv_dims.ky;
+  ops *= conv_dims.iz * conv_dims.oz;
+  ops *= kOpsPerMac;
+  VLOG(1) << "Operations for Conv2D" << ops;
+
+  if (conv_info != nullptr) {
+    *conv_info = conv_dims;
+  }
+  return ops;
+}
+
+int64 OpLevelCostEstimator::CountMatMulOperations(
+    const OpInfo& op_features, bool* found_unknown_shapes) const {
+  return CountMatMulOperations(op_features, nullptr, found_unknown_shapes);
+}
+
+int64 OpLevelCostEstimator::CountMatMulOperations(
+    const OpInfo& op_features, MatMulDimensions* mat_mul,
+    bool* found_unknown_shapes) const {
+  double ops = 0;
+
+  // TODO(nishantpatil): Create separate estimator for Sparse Matmul
+  if ((op_features.op() != kMatMul) && (op_features.op() != kSparseMatMul)) {
+    LOG(ERROR) << "Invalid Operation";
+    return ops;
+  }
+
+  // first matrix
+  auto& a_matrix = op_features.inputs(0);
+  auto& b_matrix = op_features.inputs(1);
+
+  bool transpose_a = false;
+  bool transpose_b = false;
+
+  double m_dim, n_dim, k_dim, k_dim_b = 0;
+
+  for (const auto& item : op_features.attr()) {
+    VLOG(1) << "Key:" << item.first
+            << " Value:" << SummarizeAttrValue(item.second);
+    if (item.first == "transpose_a" && item.second.b() == true)
+      transpose_a = true;
+    if (item.first == "transpose_b" && item.second.b() == true)
+      transpose_b = true;
+  }
+  VLOG(1) << "transpose_a:" << transpose_a;
+  VLOG(1) << "transpose_b:" << transpose_b;
+  auto a_matrix_shape =
+      MaybeGetMinimumShape(a_matrix.shape(), 2, found_unknown_shapes);
+  auto b_matrix_shape =
+      MaybeGetMinimumShape(b_matrix.shape(), 2, found_unknown_shapes);
+  if (transpose_a) {
+    m_dim = a_matrix_shape.dim(1).size();
+    k_dim = a_matrix_shape.dim(0).size();
+  } else {
+    m_dim = a_matrix_shape.dim(0).size();
+    k_dim = a_matrix_shape.dim(1).size();
+  }
+  if (transpose_b) {
+    k_dim_b = b_matrix_shape.dim(1).size();
+    n_dim = b_matrix_shape.dim(0).size();
+  } else {
+    k_dim_b = b_matrix_shape.dim(0).size();
+    n_dim = b_matrix_shape.dim(1).size();
+  }
+
+  VLOG(1) << "M, N, K: " << m_dim << "," << n_dim << "," << k_dim;
+  // Only check equality when both sizes are known (in other words, when
+  // neither is set to a minimum dimension size of 1).
+  if (k_dim_b != 1 && k_dim != 1 && k_dim_b != k_dim) {
+    LOG(ERROR) << "Incompatible Matrix dimensions";
+    return ops;
+  } else {
+    // One of k_dim and k_dim_b might be 1 (mininum dimension size).
+    k_dim = std::max(k_dim, k_dim_b);
+  }
+
+  ops = m_dim * n_dim * k_dim * 2;
+  VLOG(1) << "Operations for Matmul" << ops;
+
+  if (mat_mul != nullptr) {
+    mat_mul->m = m_dim;
+    mat_mul->n = n_dim;
+    mat_mul->k = k_dim;
+  }
+  return ops;
+}
+
+// TODO(cliffy): Dedup this method and CountConv2DBackPropFilterOperations.
+int64 OpLevelCostEstimator::CountConv2DBackPropInputOperations(
+    const OpInfo& op_features, ConvolutionDimensions* returned_conv_dims,
+    bool* found_unknown_shapes) const {
+  int64 ops = 0;
+
+  if (op_features.op() != kConv2dBackPropInput) {
+    LOG(ERROR) << "Invalid Operation";
+    return ops;
+  }
+
+  if (op_features.attr().find("_output_shapes") == op_features.attr().end()) {
+    // Need _output_shapes for input shape.
+    LOG(ERROR) << "No output shape in Conv2DBackPropInput op feaure.";
+    return ops;
+  }
+
+  const auto& input_shape =
+      op_features.attr().at("_output_shapes").list().shape(0);
+  ConvolutionDimensions conv_dims = ConvolutionDimensionsFromInputs(
+      input_shape, op_features.inputs(1).shape(), op_features,
+      found_unknown_shapes);
+
+  ops = conv_dims.batch;
+  ops *= conv_dims.ox * conv_dims.oy;
+  ops *= conv_dims.kx * conv_dims.ky;
+  ops *= conv_dims.iz * conv_dims.oz;
+  ops *= kOpsPerMac;
+
+  VLOG(1) << "Operations for Conv2DBackPropInput" << ops;
+
+  if (returned_conv_dims != nullptr) {
+    *returned_conv_dims = conv_dims;
+  }
+  return ops;
+}
+
+int64 OpLevelCostEstimator::CountConv2DBackPropFilterOperations(
+    const OpInfo& op_features, ConvolutionDimensions* returned_conv_dims,
+    bool* found_unknown_shapes) const {
+  int64 ops = 0;
+  if (op_features.op() != kConv2dBackPropFilter) {
+    LOG(ERROR) << "Invalid Operation";
+    return ops;
+  }
+
+  if (op_features.attr().find("_output_shapes") == op_features.attr().end()) {
+    // Need _output_shapes for filter shape.
+    LOG(ERROR) << "No output shape in Conv2DBackPropFilter op feaure.";
+    return ops;
+  }
+
+  const auto& filter_shape =
+      op_features.attr().at("_output_shapes").list().shape(0);
+  ConvolutionDimensions conv_dims = ConvolutionDimensionsFromInputs(
+      op_features.inputs(0).shape(), filter_shape, op_features,
+      found_unknown_shapes);
+
+  ops = conv_dims.batch;
+  ops *= conv_dims.ox * conv_dims.oy;
+  ops *= conv_dims.kx * conv_dims.ky;
+  ops *= conv_dims.iz * conv_dims.oz;
+  ops *= kOpsPerMac;
+
+  VLOG(1) << "Operations for Conv2DBackPropFilter" << ops;
+
+  if (returned_conv_dims != nullptr) {
+    *returned_conv_dims = conv_dims;
+  }
+  return ops;
+}
+
+int64 OpLevelCostEstimator::CalculateSingleInputSize(
+    const OpInfo::TensorProperties& input, bool* found_unknown_shapes) const {
+  VLOG(1) << "   with " << input.dtype() << " input of shape "
+          << input.shape().DebugString();
+  int64 input_size = 1;
+  int num_dims = std::max(1, input.shape().dim_size());
+  auto input_shape =
+      MaybeGetMinimumShape(input.shape(), num_dims, found_unknown_shapes);
+  for (const auto& dim : input_shape.dim()) {
+    input_size *= dim.size();
+  }
+  return input_size * DataTypeSize(input.dtype());
+}
+
+int64 OpLevelCostEstimator::CalculateInputSize(
+    const OpInfo& op_features, bool* found_unknown_shapes) const {
+  int64 total_input_size = 0;
+  for (auto& input : op_features.inputs()) {
+    int64 input_size = CalculateSingleInputSize(input, found_unknown_shapes);
+    total_input_size += input_size;
+    VLOG(1) << "Input Size: " << input_size
+            << " Total Input Size:" << total_input_size;
+  }
+  return total_input_size;
+}
+
+int64 OpLevelCostEstimator::CalculateOutputSize(
+    const OpInfo& op_features, bool* found_unknown_shapes) const {
+  int64 total_output_size = 0;
+  // use float as default for calculations
+  DataType dt = DT_FLOAT;
+  for (const auto& item : op_features.attr()) {
+    VLOG(1) << "Key:" << item.first
+            << " Value:" << SummarizeAttrValue(item.second);
+    if (item.first == "_output_shapes") {
+      for (const auto& original_output_shape : item.second.list().shape()) {
+        int64 output_size = 1;
+        int num_dims = std::max(1, original_output_shape.dim_size());
+        auto output_shape = MaybeGetMinimumShape(
+            original_output_shape, num_dims, found_unknown_shapes);
+        for (const auto& dim : output_shape.dim()) {
+          output_size *= dim.size();
+        }
+        output_size *= DataTypeSize(dt);
+        total_output_size += output_size;
+        VLOG(1) << "Output Size: " << output_size
+                << " Total Output Size:" << total_output_size;
+      }
+    }
+    if (item.first == "T") {
+      dt = item.second.type();
+    }
+  }
+  return total_output_size;
+}
+
+Costs OpLevelCostEstimator::PredictConv2D(const OpInfo& op_features) const {
+  bool found_unknown_shapes = false;
+  auto costs = PredictOpCountBasedCost(
+      CountConv2DOperations(op_features, &found_unknown_shapes), op_features);
+  costs.inaccurate = found_unknown_shapes;
+  return costs;
+}
+
+Costs OpLevelCostEstimator::PredictConv2DBackPropInput(
+    const OpInfo& op_features) const {
+  bool found_unknown_shapes = false;
+  auto costs =
+      PredictOpCountBasedCost(CountConv2DBackPropInputOperations(
+                                  op_features, nullptr, &found_unknown_shapes),
+                              op_features);
+  costs.inaccurate = found_unknown_shapes;
+  return costs;
+}
+
+Costs OpLevelCostEstimator::PredictConv2DBackPropFilter(
+    const OpInfo& op_features) const {
+  bool found_unknown_shapes = false;
+  auto costs =
+      PredictOpCountBasedCost(CountConv2DBackPropFilterOperations(
+                                  op_features, nullptr, &found_unknown_shapes),
+                              op_features);
+  costs.inaccurate = found_unknown_shapes;
+  return costs;
+}
+
+Costs OpLevelCostEstimator::PredictMatMul(const OpInfo& op_features) const {
+  bool found_unknown_shapes = false;
+  auto costs = PredictOpCountBasedCost(
+      CountMatMulOperations(op_features, &found_unknown_shapes), op_features);
+  costs.inaccurate = found_unknown_shapes;
+  return costs;
+}
+
+Costs OpLevelCostEstimator::PredictNoOp(const OpInfo& op_features) const {
+  VLOG(1) << "Op:" << op_features.op() << " Execution Time 0 (ns)";
+  return Costs::ZeroCosts();
+}
+
+}  // end namespace grappler
+}  // end namespace tensorflow
diff --git a/tensorflow/core/grappler/costs/op_level_cost_estimator.h b/tensorflow/core/grappler/costs/op_level_cost_estimator.h
new file mode 100644
index 0000000000000000000000000000000000000000..266b6339225156ce9619f1b85b576235200e8c52
--- /dev/null
+++ b/tensorflow/core/grappler/costs/op_level_cost_estimator.h
@@ -0,0 +1,142 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CORE_GRAPPLER_COSTS_OP_LEVEL_COST_ESTIMATOR_H_
+#define TENSORFLOW_CORE_GRAPPLER_COSTS_OP_LEVEL_COST_ESTIMATOR_H_
+
+#include <functional>
+#include <map>
+#include <string>
+
+#include "tensorflow/core/grappler/costs/cost_estimator.h"
+#include "tensorflow/core/grappler/costs/op_performance_data.pb.h"
+#include "tensorflow/core/util/padding.h"
+
+namespace tensorflow {
+namespace grappler {
+
+class OpLevelCostEstimator {
+ public:
+  OpLevelCostEstimator();
+  virtual ~OpLevelCostEstimator() {}
+
+  Costs PredictCosts(const OpInfo& op_features) const;
+
+ protected:
+  // Returns an estimate of device performance (in billions of operations
+  // executed per second) and memory bandwith (in GigaBytes/second) for the
+  // specified device.
+  virtual std::pair<double, double> GetDeviceInfo(
+      const DeviceProperties& device) const;
+
+  // For operations for which we haven't yet built estimates, returns a dummy
+  // value based on input size.
+  Costs DummyExecutionTime(const OpInfo& op_features) const;
+
+  // Naive cost estimate based on operations divided by device ops/sec.
+  Costs PredictOpCountBasedCost(double operations,
+                                const OpInfo& op_features) const;
+
+  // This family of routines counts the number of operations to perform the
+  // specified TensorFlow Op.
+  struct MatMulDimensions {
+    int m;
+    int n;
+    int k;
+  };
+  struct ConvolutionDimensions {
+    int64 batch;      // Batch size.
+    int64 ix;         // Input size x.
+    int64 iy;         // Input size y.
+    int64 iz;         // Input depth.
+    int64 kx;         // Kernel x.
+    int64 ky;         // Kernel y.
+    int64 oz;         // Output depth.
+    int64 ox;         // Output size x.
+    int64 oy;         // Output size y.
+    int64 sx;         // Stride x.
+    int64 sy;         // Stride y.
+    Padding padding;  // SAME or VALID.
+  };
+  int64 CountConv2DOperations(const OpInfo& op_features,
+                              bool* found_unknown_shapes) const;
+  int64 CountConv2DOperations(const OpInfo& op_features,
+                              ConvolutionDimensions* conv_info,
+                              bool* found_unknown_shapes) const;
+  int64 CountMatMulOperations(const OpInfo& op_features,
+                              bool* found_unknown_shapes) const;
+  int64 CountMatMulOperations(const OpInfo& op_features,
+                              MatMulDimensions* mat_mul,
+                              bool* found_unknown_shapes) const;
+  int64 CountConv2DBackPropInputOperations(const OpInfo& op_features,
+                                           ConvolutionDimensions* conv_info,
+                                           bool* found_unknown_shapes) const;
+  int64 CountConv2DBackPropFilterOperations(const OpInfo& op_features,
+                                            ConvolutionDimensions* conv_info,
+                                            bool* found_unknown_shapes) const;
+
+  // Calculate the total size in bytes of a single input to a TensorFlow op.
+  int64 CalculateSingleInputSize(const OpInfo::TensorProperties& input,
+                                 bool* found_unknown_shapes) const;
+
+  // Calculate the total size in bytes of the all
+  // the inputs of specified TensorFlow Op
+  int64 CalculateInputSize(const OpInfo& op_features,
+                           bool* found_unknown_shapes) const;
+
+  // Calculate the total size in bytes of the all
+  // the outputs of specified TensorFlow Op
+  int64 CalculateOutputSize(const OpInfo& op_features,
+                            bool* found_unknown_shapes) const;
+
+  // This family of routines predicts the costs to
+  // perform the specified TensorFlow Op on the
+  // device represented by a subclass. The default
+  // implementation just divides the operations to
+  // perform the op (from the "Count" routines,
+  // above) by the device peak operations per
+  // second. Override to supply a better estimate.
+  // Implementation of costs other than
+  // execution_time is optional, depending on the
+  // device.
+  Costs PredictConv2D(const OpInfo& op_features) const;
+  Costs PredictConv2DBackPropInput(const OpInfo& op_features) const;
+  Costs PredictConv2DBackPropFilter(const OpInfo& op_features) const;
+  Costs PredictMatMul(const OpInfo& op_features) const;
+  Costs PredictNoOp(const OpInfo& op_features) const;
+
+  // Utility function for safe division. Returns 0
+  // if rhs is 0 or negative.
+  static double SafeDiv(const double lhs, const double rhs) {
+    if (rhs > 0) {
+      return lhs / rhs;
+    } else {
+      return 0.0;
+    }
+  }
+
+  static ConvolutionDimensions ConvolutionDimensionsFromInputs(
+      const TensorShapeProto& original_image_shape,
+      const TensorShapeProto& original_filter_shape, const OpInfo& op_features,
+      bool* found_unknown_shapes);
+
+ protected:
+  typedef std::function<Costs(const OpInfo& op_feature)> CostImpl;
+  std::map<string, CostImpl> device_cost_impl_;
+};
+
+}  // end namespace grappler
+}  // end namespace tensorflow
+#endif  // TENSORFLOW_CORE_GRAPPLER_COSTS_OP_LEVEL_COST_ESTIMATOR_H_
diff --git a/tensorflow/core/grappler/costs/op_level_cost_estimator_test.cc b/tensorflow/core/grappler/costs/op_level_cost_estimator_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..ffd1eac687ec70b649a5d6faa8ead5fb89d46357
--- /dev/null
+++ b/tensorflow/core/grappler/costs/op_level_cost_estimator_test.cc
@@ -0,0 +1,114 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/grappler/costs/op_level_cost_estimator.h"
+#include "tensorflow/core/framework/tensor_shape.pb.h"
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/platform/test.h"
+#include "tensorflow/core/protobuf/device_properties.pb.h"
+
+namespace tensorflow {
+namespace grappler {
+
+namespace {
+// Wrangles the minimum number of proto fields to set up a matrix.
+void DescribeMatrix(int rows, int columns, OpInfo *op_features) {
+  auto input = op_features->add_inputs();
+  auto shape = input->mutable_shape();
+  auto shape_rows = shape->add_dim();
+  shape_rows->set_size(rows);
+  auto shape_columns = shape->add_dim();
+  shape_columns->set_size(columns);
+  input->set_dtype(DT_FLOAT);
+}
+
+// Returns an OpInfo for MatMul with the minimum set of fields set up.
+OpInfo DescribeMatMul(int m, int n, int l, int k) {
+  OpInfo op_features;
+  auto device = op_features.mutable_device();
+  device->set_type("CPU");
+  op_features.set_op("MatMul");
+
+  DescribeMatrix(m, l, &op_features);
+  DescribeMatrix(k, n, &op_features);
+  return op_features;
+}
+
+// Returns an OpInfo for MatMul with unknown input shapes.
+OpInfo DescribeMatMulUnknownShape() {
+  OpInfo op_features;
+  auto device = op_features.mutable_device();
+  device->set_type("CPU");
+  op_features.set_op("MatMul");
+
+  auto input = op_features.add_inputs();
+  auto shape = input->mutable_shape();
+  shape->set_unknown_rank(true);
+
+  input = op_features.add_inputs();
+  shape = input->mutable_shape();
+  shape->set_unknown_rank(true);
+
+  return op_features;
+}
+
+// Wrangles the minimum number of proto fields to set up a 4D Tensor for cost
+// estimation purposes.
+void DescribeTensor4D(int dim0, int dim1, int dim2, int dim3,
+                      OpInfo *op_features) {
+  auto input = op_features->add_inputs();
+  auto shape = input->mutable_shape();
+  shape->add_dim()->set_size(dim0);
+  shape->add_dim()->set_size(dim1);
+  shape->add_dim()->set_size(dim2);
+  shape->add_dim()->set_size(dim3);
+}
+
+// Returns an OpInfo for Conv2D with the minimum set of fields set up.
+OpInfo DescribeConvolution(int batch, int ix, int iy, int iz1, int iz2, int kx,
+                           int ky, int oz) {
+  OpInfo op_features;
+  auto device = op_features.mutable_device();
+  device->set_type("CPU");
+  op_features.set_op("Conv2D");
+
+  DescribeTensor4D(batch, ix, iy, iz1, &op_features);
+  DescribeTensor4D(kx, ky, iz2, oz, &op_features);
+  return op_features;
+}
+}  // namespace
+
+TEST(OpLevelCostEstimatorTest, UnknownOrPartialShape) {
+  OpLevelCostEstimator estimator;
+
+  EXPECT_EQ(false,
+            estimator.PredictCosts(DescribeMatMul(2, 4, 7, 7)).inaccurate);
+  EXPECT_EQ(true,
+            estimator.PredictCosts(DescribeMatMul(-1, 4, 7, 7)).inaccurate);
+  EXPECT_EQ(true,
+            estimator.PredictCosts(DescribeMatMul(2, 4, -1, 7)).inaccurate);
+
+  EXPECT_EQ(
+      false,
+      estimator.PredictCosts(DescribeConvolution(16, 19, 19, 48, 48, 5, 5, 256))
+          .inaccurate);
+  EXPECT_EQ(
+      true,
+      estimator.PredictCosts(DescribeConvolution(16, -1, 19, 48, 48, 5, 5, 256))
+          .inaccurate);
+}
+
+}  // end namespace grappler
+}  // end namespace tensorflow
diff --git a/tensorflow/core/grappler/costs/op_performance_data.proto b/tensorflow/core/grappler/costs/op_performance_data.proto
index a371868193f8b9fa9ab45198839e65e9aa552779..887a714c0f77cbbd43def8d4fd5a52b546375fbf 100644
--- a/tensorflow/core/grappler/costs/op_performance_data.proto
+++ b/tensorflow/core/grappler/costs/op_performance_data.proto
@@ -22,6 +22,7 @@ import "tensorflow/core/framework/tensor.proto";
 import "tensorflow/core/framework/tensor_shape.proto";
 import "tensorflow/core/framework/types.proto";
 import "tensorflow/core/framework/attr_value.proto";
+import "tensorflow/core/protobuf/device_properties.proto";
 
 // Description of an operation as well as the parameters expected to impact its
 // performance.
@@ -41,36 +42,6 @@ message OpInfo {
   repeated TensorProperties inputs = 3;
 
   // Device on which the operation is run.
-  message DeviceProperties {
-    // Device type (CPU, GPU, ...)
-    string type = 1;
-    // Vendor (Intel, nvidia, ...)
-    string vendor = 2;
-    // Model (Haswell, K40, ...)
-    string model = 3;
-    // Core Frequency in Mhz
-    int64 frequency = 4;
-    // Number of cores
-    int64 num_cores = 5;
-    // Version of the tools and libraries used with this device (e.g. gcc 4.9,
-    // cudnn 5.1)
-    map<string, string> environment = 6;
-    // Number of registers per core.
-    int64 num_registers = 7;
-    // L1 cache size in bytes
-    int64 l1_cache_size = 8;
-    // L2 cache size in bytes
-    int64 l2_cache_size = 9;
-    // L3 cache size in bytes
-    int64 l3_cache_size = 10;
-    // Shared memory size per multiprocessor in bytes. This field is
-    // applicable to GPUs only.
-    int64 shared_memory_size_per_multiprocessor = 11;
-    // Memory size in bytes
-    int64 memory_size = 12;
-    // Memory bandwidth in KB/s
-    int64 bandwidth = 13;
-  }
   DeviceProperties device = 4;
 }
 
diff --git a/tensorflow/core/grappler/costs/robust_stats.cc b/tensorflow/core/grappler/costs/robust_stats.cc
new file mode 100644
index 0000000000000000000000000000000000000000..9866bc86887e2fa1a1fcfe95e3e9673b7df1a8f3
--- /dev/null
+++ b/tensorflow/core/grappler/costs/robust_stats.cc
@@ -0,0 +1,152 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/grappler/costs/robust_stats.h"
+#include <algorithm>
+#include <cmath>
+
+namespace tensorflow {
+namespace grappler {
+
+// Given a sorted vector of values, calculate the median.
+// Returns 0 for an empty vector.  Does not verify sortedness.
+static double SortedMedian(const std::vector<double> &values) {
+  const int n = values.size();
+  if (n == 0) return 0.0;
+  if (n & 1) {
+    return values[n / 2];
+  } else {
+    return (values[n / 2] + values[n / 2 - 1]) / 2.0;
+  }
+}
+
+// Given a vector of values (sorted or not), calculate the median.
+static double Median(std::vector<double> &&values) {
+  const size_t n = values.size();
+  if (n == 0) return 0;
+  const auto middle = values.begin() + (n / 2);
+  // Put the middle value in its place.
+  std::nth_element(values.begin(), middle, values.end());
+  if (n & 1) {
+    return *middle;
+  }
+  // Return the average of the two elements, the max_element lower than
+  // *middle is found between begin and middle as a post-cond of
+  // nth_element.
+  const auto lower_middle = std::max_element(values.begin(), middle);
+  // Preventing overflow. We know that '*lower_middle <= *middle'.
+  // If both are on opposite sides of zero, the sum won't overflow, otherwise
+  // the difference won't overflow.
+  if (*lower_middle <= 0 && *middle >= 0) {
+    return (*lower_middle + *middle) / 2;
+  }
+  return *lower_middle + (*middle - *lower_middle) / 2;
+}
+
+// Given a set of values, calculates the scaled Median Absolute Deviation (a
+// robust approximation to the standard deviation).  This is calculated as the
+// median of the absolute deviations from the median, scaled by 1.4826.  Its
+// advantage over the standard deviation is that it is not (as) affected by
+// outlier values.  Returns a pair<median, mad>.
+static std::pair<double, double> ScaledMedianAbsoluteDeviation(
+    const std::vector<double> &sorted_values) {
+  double median = SortedMedian(sorted_values);
+
+  // Next, we calculate the absolute deviations from the median,
+  // find the median of the resulting data, and scale by 1.4826.
+  std::vector<double> deviations;
+  deviations.reserve(sorted_values.size());
+  for (double d : sorted_values) {
+    deviations.push_back(std::abs(d - median));
+  }
+  double mad = Median(std::move(deviations)) * 1.4826;
+  return std::pair<double, double>(median, mad);
+}
+
+RobustStats::RobustStats(const std::vector<double> &values)
+    : RobustStats(std::vector<double>(values)) {}
+
+RobustStats::RobustStats(std::vector<double> &&values) {
+  std::sort(values.begin(), values.end());
+  lo_ = values[0];
+  hi_ = values.back();
+  HuberMAD(values);
+}
+
+// Computes an updated mean using Huber's weighting function (values beyond
+// the margin are weighted by margin / abs(value - mean).
+double UpdateHuberMean(const std::vector<double> &sorted_values, double mean,
+                       double margin) {
+  int num_within = 0;
+  double sum = 0.0;
+
+  for (double d : sorted_values) {
+    if (d < mean - margin) {
+      sum -= margin;
+    } else if (d > mean + margin) {
+      sum += margin;
+    } else {
+      sum += d;
+      ++num_within;
+    }
+  }
+
+  // It is possible, for a set with an interquartile distance of 0, i.e., with
+  // more than half of the values at the median, to encounter the case where
+  // the Huber mean drifts slightly off the median and there are no values
+  // within the margin.  In that case, just return the old mean, and the caller
+  // will quit.
+  if (num_within > 0) {
+    return sum / num_within;
+  } else {
+    return mean;
+  }
+}
+
+// Given a list of values, this approximates the stddev using the MAD and then
+// uses it to compute a Huber robust mean (sandwich mean).  A margin of
+// c*stddev is defined around the current mean, and values are weighted by
+// margin / abs(value - mean) if outside the margin, or 1 if inside.  This
+// computes the mean iteratively, because each time it changes the margin
+// shifts a bit.  It typically settles very quickly, but it's possible for it
+// to be unstable.  We limit it to 10 iterations.
+//
+void RobustStats::HuberMAD(const std::vector<double> &sorted_values) {
+  const std::pair<double, double> median_mad =
+      ScaledMedianAbsoluteDeviation(sorted_values);
+  mean_ = median_mad.first;
+  stddev_ = median_mad.second;
+
+  // c = 1.345 is the commonly used cutoff with 95% efficiency at the normal.
+  // We're using c = 1.5 to be a little more conservative, and because that's
+  // the default in S-plus.
+  // TODO(dehnert): Specialize Stats for integral types so we don't implement
+  // methods that don't make sense.
+  const double c = 1.5;
+  const double margin = c * stddev_;
+
+  // Iterate 10 times, or until the Huber mean stabilizes.
+  // If the margin is zero, we don't want mean to drift from the median.
+  if (margin > 0.0) {
+    for (int k = 0; k < 10; ++k) {
+      double old_mean = mean_;
+      mean_ = UpdateHuberMean(sorted_values, mean_, margin);
+      if (mean_ == old_mean) break;
+    }
+  }
+}
+
+}  // namespace grappler
+}  // namespace tensorflow
diff --git a/tensorflow/compiler/xla/port/initialize.h b/tensorflow/core/grappler/costs/robust_stats.h
similarity index 51%
rename from tensorflow/compiler/xla/port/initialize.h
rename to tensorflow/core/grappler/costs/robust_stats.h
index 13d9632f97c72296e9a335c2a10edefa9abc0e17..9d8f5bc970ad9cde6e5c31ce0df72272e35d1662 100644
--- a/tensorflow/compiler/xla/port/initialize.h
+++ b/tensorflow/core/grappler/costs/robust_stats.h
@@ -13,27 +13,30 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_COMPILER_XLA_PORT_INITIALIZE_H_
-#define TENSORFLOW_COMPILER_XLA_PORT_INITIALIZE_H_
+#ifndef TENSORFLOW_GRAPPLER_COSTS_ROBUST_STATS_H_
+#define TENSORFLOW_GRAPPLER_COSTS_ROBUST_STATS_H_
 
-#undef REGISTER_MODULE_INITIALIZER
-
-namespace xla {
-
-class Initializer {
+#include <vector>
+namespace tensorflow {
+namespace grappler {
+class RobustStats {
  public:
-  typedef void (*InitializerFunc)();
-  explicit Initializer(InitializerFunc func) { func(); }
-};
+  RobustStats(const std::vector<double>& values);
+  RobustStats(std::vector<double>&& values);
 
-}  // namespace xla
+  double lo() const { return lo_; }
+  double hi() const { return hi_; }
+  double mean() const { return mean_; }
 
-#define REGISTER_INITIALIZER(type, name, body)         \
-  static void google_init_##type##_##name() { body; }  \
-  xla::Initializer google_initializer_##type##_##name( \
-      google_init_##type##_##name)
+ private:
+  void HuberMAD(const std::vector<double>& values);
 
-#define REGISTER_MODULE_INITIALIZER(name, body) \
-  REGISTER_INITIALIZER(module, name, body)
+  double lo_;
+  double hi_;
+  double mean_;
+  double stddev_;
+};
+}  // namespace grappler
+}  // namespace tensorflow
 
-#endif  // TENSORFLOW_COMPILER_XLA_PORT_INITIALIZE_H_
+#endif  // TENSORFLOW_GRAPPLER_COSTS_ROBUST_STATS_H_
diff --git a/tensorflow/core/grappler/costs/robust_stats_test.cc b/tensorflow/core/grappler/costs/robust_stats_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..924097b126d395f0d2bdb1285b49c9891d6c8c10
--- /dev/null
+++ b/tensorflow/core/grappler/costs/robust_stats_test.cc
@@ -0,0 +1,63 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/grappler/costs/robust_stats.h"
+#include "tensorflow/core/platform/test.h"
+
+namespace tensorflow {
+namespace grappler {
+namespace {
+
+class RobustStatsTest : public ::testing::Test {
+ public:
+  void SetUp() override {
+    for (double d = 1.0; d <= 5.0; d += 1.0) {
+      values1_.push_back(5.0 - d);
+      values1_.push_back(5.0 + d);
+      values2_.push_back(25.0 - 2 * d);
+      values2_.push_back(25.0 + 2 * d);
+      values3_.push_back(-3.0 - d);
+      values3_.push_back(-3.0 + d);
+    }
+    values1_.push_back(5.0);  // Odd # elements, mean is 5.0
+    values3_.push_back(197.0);
+    values3_.push_back(-203.0);  // Even # elements, mean is -3.0
+  }
+
+  std::vector<double> values1_;
+  std::vector<double> values2_;
+  std::vector<double> values3_;
+};
+
+TEST_F(RobustStatsTest, Simple) {
+  RobustStats s1(values1_);
+  EXPECT_EQ(5.0, s1.mean());
+  EXPECT_EQ(0.0, s1.lo());
+  EXPECT_EQ(10.0, s1.hi());
+
+  RobustStats s2(values2_);
+  EXPECT_EQ(25.0, s2.mean());
+  EXPECT_EQ(15.0, s2.lo());
+  EXPECT_EQ(35.0, s2.hi());
+
+  RobustStats s3(values3_);
+  EXPECT_EQ(-3.0, s3.mean());
+  EXPECT_EQ(-203.0, s3.lo());
+  EXPECT_EQ(197.0, s3.hi());
+}
+
+}  // namespace
+}  // namespace grappler
+}  // namespace tensorflow
diff --git a/tensorflow/core/grappler/costs/utils.cc b/tensorflow/core/grappler/costs/utils.cc
index 4e35de9d4a6b17ebb9d2eefda6bca2f769be5812..bdfb17a456b921916d5795b754560b86c1c03fb7 100644
--- a/tensorflow/core/grappler/costs/utils.cc
+++ b/tensorflow/core/grappler/costs/utils.cc
@@ -30,6 +30,7 @@ limitations under the License.
 #include "tensorflow/core/framework/types.pb.h"
 #include "tensorflow/core/graph/graph.h"
 #include "tensorflow/core/graph/tensor_id.h"
+#include "tensorflow/core/grappler/clusters/utils.h"
 #include "tensorflow/core/lib/strings/numbers.h"
 #include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/platform/cpu_info.h"
@@ -125,7 +126,7 @@ std::vector<OpInfo::TensorProperties> FindInputFeatures(
   return inputs;
 }
 
-OpInfo::DeviceProperties GetDeviceInfo(const CostGraphDef::Node& node) {
+DeviceProperties GetDeviceInfo(const CostGraphDef::Node& node) {
   DeviceNameUtils::ParsedName parsed;
   if (DeviceNameUtils::ParseFullName(node.device(), &parsed)) {
     if (parsed.type == "GPU") {
@@ -134,73 +135,10 @@ OpInfo::DeviceProperties GetDeviceInfo(const CostGraphDef::Node& node) {
       return GetLocalCPUInfo();
     }
   }
-  OpInfo::DeviceProperties device;
+  DeviceProperties device;
   device.set_type("UNKNOWN");
   return device;
 }
 
-OpInfo::DeviceProperties GetLocalCPUInfo() {
-  OpInfo::DeviceProperties device;
-  device.set_type("CPU");
-
-  device.set_vendor(port::CPUVendorIDString());
-  // Combine cpu family and model into the model string.
-  device.set_model(
-      strings::StrCat((port::CPUFamily() << 4) + port::CPUModelNum()));
-  device.set_frequency(port::NominalCPUFrequency());
-  device.set_num_cores(port::NumSchedulableCPUs());
-  device.set_l1_cache_size(Eigen::l1CacheSize());
-  device.set_l2_cache_size(Eigen::l2CacheSize());
-  device.set_l3_cache_size(Eigen::l3CacheSize());
-
-  (*device.mutable_environment())["cpu_instruction_set"] =
-      Eigen::SimdInstructionSetsInUse();
-
-  (*device.mutable_environment())["eigen"] = strings::StrCat(
-      EIGEN_WORLD_VERSION, ".", EIGEN_MAJOR_VERSION, ".", EIGEN_MINOR_VERSION);
-#ifdef EIGEN_USE_LIBXSMM
-  (*device.mutable_environment())["libxsmm"] = LIBXSMM_VERSION;
-#endif
-
-  return device;
-}
-
-OpInfo::DeviceProperties GetLocalGPUInfo(int gpu_id) {
-  OpInfo::DeviceProperties device;
-  device.set_type("GPU");
-
-#if GOOGLE_CUDA
-  cudaDeviceProp properties;
-  cudaError_t error = cudaGetDeviceProperties(&properties, gpu_id);
-  if (error == cudaSuccess) {
-    device.set_vendor("NVidia");
-    device.set_model(properties.name);
-    device.set_frequency(properties.clockRate / 1000);
-    device.set_num_cores(properties.multiProcessorCount);
-    device.set_num_registers(properties.regsPerMultiprocessor);
-    // For compute capability less than 5, l1 cache size is configurable to
-    // either 16 KB or 48 KB. We use the initial configuration 16 KB here. For
-    // compute capability larger or equal to 5, l1 cache (unified with texture
-    // cache) size is 24 KB. This number may need to be updated for future
-    // compute capabilities.
-    device.set_l1_cache_size((properties.major < 5) ? 16 * 1024 : 24 * 1024);
-    device.set_l2_cache_size(properties.l2CacheSize);
-    device.set_l3_cache_size(0);
-    device.set_shared_memory_size_per_multiprocessor(
-        properties.sharedMemPerMultiprocessor);
-    device.set_memory_size(properties.totalGlobalMem);
-    // 8 is the number of bits per byte. 2 is accounted for
-    // double data rate (DDR).
-    device.set_bandwidth(properties.memoryBusWidth / 8 *
-                         properties.memoryClockRate * 2);
-  }
-
-  (*device.mutable_environment())["cuda"] = strings::StrCat(CUDA_VERSION);
-  (*device.mutable_environment())["cudnn"] = strings::StrCat(CUDNN_VERSION);
-#endif
-
-  return device;
-}
-
 }  // end namespace grappler
 }  // end namespace tensorflow
diff --git a/tensorflow/core/grappler/costs/utils.h b/tensorflow/core/grappler/costs/utils.h
index 1193c0f5da07366a317b165e609c646d224b69df..32e32a09e1693c58a9cfd93a1f03939a44237b18 100644
--- a/tensorflow/core/grappler/costs/utils.h
+++ b/tensorflow/core/grappler/costs/utils.h
@@ -25,6 +25,7 @@ limitations under the License.
 #include "tensorflow/core/graph/types.h"
 #include "tensorflow/core/grappler/costs/op_performance_data.pb.h"
 #include "tensorflow/core/platform/types.h"
+#include "tensorflow/core/protobuf/device_properties.pb.h"
 
 namespace tensorflow {
 namespace grappler {
@@ -40,14 +41,7 @@ std::vector<OpInfo::TensorProperties> FindInputFeatures(
     const std::unordered_map<string, const NodeDef*>& name_to_node);
 
 // Returns the DeviceProperties of the device on which 'node' runs.
-OpInfo::DeviceProperties GetDeviceInfo(const CostGraphDef::Node& node);
-
-// Returns the DeviceProperties of the CPU on which grappler is running.
-OpInfo::DeviceProperties GetLocalCPUInfo();
-
-// Returns the DeviceProperties for the specified GPU attached to the server on
-// which grappler is running.
-OpInfo::DeviceProperties GetLocalGPUInfo(int gpu_id);
+DeviceProperties GetDeviceInfo(const CostGraphDef::Node& node);
 
 }  // end namespace grappler
 }  // end namespace tensorflow
diff --git a/tensorflow/core/grappler/costs/virtual_placer.cc b/tensorflow/core/grappler/costs/virtual_placer.cc
new file mode 100644
index 0000000000000000000000000000000000000000..eafa6789feb6e35983775bf70266d790ec992d4c
--- /dev/null
+++ b/tensorflow/core/grappler/costs/virtual_placer.cc
@@ -0,0 +1,71 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/grappler/costs/virtual_placer.h"
+#include "tensorflow/core/framework/node_def.pb.h"
+#include "tensorflow/core/grappler/clusters/cluster.h"
+#include "tensorflow/core/grappler/costs/utils.h"
+#include "tensorflow/core/grappler/devices.h"
+#include "tensorflow/core/lib/strings/str_util.h"
+#include "tensorflow/core/util/device_name_utils.h"
+
+namespace tensorflow {
+namespace grappler {
+
+VirtualPlacer::VirtualPlacer(const Cluster* cluster) : has_gpu_(false) {
+  CHECK(cluster);
+  devices_ = cluster->GetDevices();
+  for (const auto& device : devices_) {
+    if (str_util::Lowercase(device.first).find("gpu") != string::npos) {
+      has_gpu_ = true;
+    }
+  }
+
+  unknown_device_.set_type("UNKNOWN");
+}
+
+const DeviceProperties& VirtualPlacer::get_device(const NodeDef& node) const {
+  DeviceNameUtils::ParsedName parsed;
+  if (!node.device().empty()) {
+    auto it = devices_.find(node.device());
+    if (it != devices_.end()) {
+      return it->second;
+    }
+    if (DeviceNameUtils::ParseLocalName(node.device(), &parsed)) {
+      string device_name =
+          strings::StrCat("/job:localhost/replica:0/task:0/",
+                          str_util::Lowercase(parsed.type), ":", parsed.id);
+      it = devices_.find(device_name);
+      if (it != devices_.end()) {
+        return it->second;
+      }
+    }
+    return unknown_device_;
+  }
+  string device;
+  if (has_gpu_) {
+    device = "/job:localhost/replica:0/task:0/gpu:0";
+  } else {
+    device = "/job:localhost/replica:0/task:0/cpu:0";
+  }
+  auto it = devices_.find(device);
+  if (it == devices_.end()) {
+    return unknown_device_;
+  }
+  return it->second;
+}
+
+}  // end namespace grappler
+}  // end namespace tensorflow
diff --git a/tensorflow/core/grappler/costs/virtual_placer.h b/tensorflow/core/grappler/costs/virtual_placer.h
new file mode 100644
index 0000000000000000000000000000000000000000..40cd64e37c1f1df62956accabb56971c82a65dac
--- /dev/null
+++ b/tensorflow/core/grappler/costs/virtual_placer.h
@@ -0,0 +1,45 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CORE_GRAPPLER_COSTS_VIRTUAL_PLACER_H_
+#define TENSORFLOW_CORE_GRAPPLER_COSTS_VIRTUAL_PLACER_H_
+
+#include <unordered_map>
+#include "tensorflow/core/platform/types.h"
+#include "tensorflow/core/protobuf/device_properties.pb.h"
+
+namespace tensorflow {
+class NodeDef;
+
+namespace grappler {
+class Cluster;
+
+// The virtual placer emulates the behavior of the TF placer.
+class VirtualPlacer {
+ public:
+  VirtualPlacer(const Cluster* cluster);
+
+  const DeviceProperties& get_device(const NodeDef& node) const;
+
+ private:
+  std::unordered_map<string, DeviceProperties> devices_;
+  bool has_gpu_;
+  DeviceProperties unknown_device_;
+};
+
+}  // namespace grappler
+}  // end namespace tensorflow
+
+#endif  // TENSORFLOW_CORE_GRAPPLER_COSTS_VIRTUAL_PLACER_H_
diff --git a/tensorflow/core/grappler/costs/virtual_scheduler.cc b/tensorflow/core/grappler/costs/virtual_scheduler.cc
new file mode 100644
index 0000000000000000000000000000000000000000..8f77d7677ace0bc5b0ab885fce4643ff918a872a
--- /dev/null
+++ b/tensorflow/core/grappler/costs/virtual_scheduler.cc
@@ -0,0 +1,215 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/grappler/costs/virtual_scheduler.h"
+#include "tensorflow/core/framework/node_def.pb.h"
+#include "tensorflow/core/grappler/utils.h"
+
+namespace tensorflow {
+namespace grappler {
+namespace {
+
+Costs CombineCosts(const Costs& left, const Costs& right) {
+  CHECK_NE(left.max_memory, kMemoryUnknown);
+  CHECK_NE(left.max_per_op_buffers, kMemoryUnknown);
+  CHECK_NE(left.max_per_op_streaming, kMemoryUnknown);
+
+  Costs result = left;
+  result.execution_time += right.execution_time;
+  if (right.max_memory != kMemoryUnknown) {
+    result.max_memory += right.max_memory;
+  }
+  if (right.max_per_op_buffers != kMemoryUnknown) {
+    result.max_per_op_buffers =
+        std::max(left.max_per_op_buffers, right.max_per_op_buffers);
+  }
+  if (right.max_per_op_streaming != kMemoryUnknown) {
+    result.max_per_op_streaming =
+        std::max(left.max_per_op_streaming, right.max_per_op_streaming);
+  }
+  VLOG(2) << "costs execution_time=" << result.execution_time.count()
+          << " max_memory=" << result.max_memory
+          << " max_per_op_buffers=" << result.max_per_op_buffers
+          << " max_per_op_streaming=" << result.max_per_op_streaming;
+  return result;
+}
+}  // namespace
+
+VirtualScheduler::VirtualScheduler(const GraphDef& graph,
+                                   const std::vector<string>& fetch_nodes)
+    : graph_costs_(Costs::ZeroCosts()),
+      // TODO(dyoon): Use a better way than FIFO.
+      ready_nodes_(new FIFOManager()) {
+  // First, get the nodes that would run to output fetch_nodes.
+  std::vector<const NodeDef*> nodes =
+      ComputeTransitiveFanin(graph, fetch_nodes);
+
+  // TODO(dyoon): this is a bit inefficient as name_to_node is already built in
+  // ComputeTransitiveFanin().
+  std::unordered_map<string, const NodeDef*> name_to_node;
+  for (const auto& node : graph.node()) {
+    name_to_node[node.name()] = &node;
+  }
+
+  // Build node_map.
+  for (const auto* node : nodes) {
+    auto& node_state = GetNodeStateOrCreateIt(node);
+    // TODO(dyoon): add SendRecv considering devices and control dependency.
+    for (const string& input : node->input()) {
+      const NodeDef* in = name_to_node[NodeName(input)];
+      CHECK(in);
+      node_state.inputs.push_back(in);
+      auto& input_node_state = GetNodeStateOrCreateIt(in);
+      input_node_state.outputs.push_back(node);
+    }
+    if (node->input().empty()) {
+      node_state.time_ready =
+          Costs::Duration();  // Node without input: ready at time 0.
+      ready_nodes_->AddNode(node);
+    }
+  }
+}
+
+const NodeDef* VirtualScheduler::GetCurrNode() const {
+  return ready_nodes_->GetCurrNode();
+}
+
+NodeState& VirtualScheduler::GetNodeStateOrCreateIt(const NodeDef* node) {
+  auto it = node_map_.find(node);
+  if (it == node_map_.end()) {
+    it = node_map_.emplace(node, NodeState()).first;
+  }
+  return it->second;
+}
+
+bool VirtualScheduler::MarkCurrNodeExecuted(const Costs& node_costs) {
+  // Update graph_costs_ and per-op costs.
+  graph_costs_ = CombineCosts(graph_costs_, node_costs);
+  const auto* node = GetCurrNode();
+  const auto& op_name = node->op();
+
+  auto it = op_to_cost_.find(op_name);
+  if (it == op_to_cost_.end()) {
+    it = op_to_cost_.emplace(op_name, Costs::ZeroCosts()).first;
+  }
+  auto& op_cost = it->second;
+  op_cost = CombineCosts(op_cost, node_costs);
+
+  // Update node and device states.
+  auto& node_state = node_map_[node];
+  auto& device = device_[node->device()];
+  device.nodes_executed.push_back(node);
+  // Node is scheduled when the device is available AND all the inputs are
+  // ready; hence, time_scheduled is time_ready if time_ready > device curr
+  // time.
+  node_state.time_scheduled =
+      std::max(device.GetCurrTime(), node_state.time_ready);
+  // Override device curr time with the time_scheduled.
+  device.device_costs.execution_time = node_state.time_scheduled;
+  device.device_costs = CombineCosts(device.device_costs, node_costs);
+  auto curr_time = device.GetCurrTime();
+  node_state.time_finished = curr_time;
+
+  // Update device's per-op cost.
+  {
+    auto it = device.op_to_cost.find(op_name);
+    if (it == device.op_to_cost.end()) {
+      it = device.op_to_cost.emplace(op_name, Costs::ZeroCosts()).first;
+    }
+    auto& op_cost = it->second;
+    op_cost = CombineCosts(op_cost, node_costs);
+
+    VLOG(2) << "Op scheduled -- name: " << node->name()
+            << ", op: " << node->op() << ", device: " << node->device()
+            << ", ready: " << node_state.time_ready.count()
+            << ", scheduled: " << node_state.time_scheduled.count()
+            << ", finished: " << node_state.time_finished.count();
+
+    // Increment num_inputs_ready of the output nodes.
+    for (auto* output : node_state.outputs) {
+      auto& output_state = node_map_[output];
+      output_state.num_inputs_ready++;
+      if (output_state.num_inputs_ready == output_state.inputs.size()) {
+        // This output node is now ready.
+        output_state.time_ready = curr_time;
+        ready_nodes_->AddNode(output);
+      }
+    }
+
+    // Increment num_outputs_executed of the input nodes.
+    for (auto* input : node_state.inputs) {
+      auto& input_state = node_map_[input];
+      input_state.num_outputs_executed++;
+      if (input_state.num_outputs_executed == input_state.outputs.size()) {
+        // All the outputs are executed; no reference to this input nodel
+        input_state.time_no_reference = curr_time;
+        // TODO(dyoon): collect device memory usage; note that this input node
+        // use device memory between time_scheduled and time_no_reference.
+      }
+    }
+  }
+
+  // Remove the current node; assume FIFO.
+  ready_nodes_->RemoveCurrNode();
+  return !ready_nodes_->Empty();  // True if not empty.
+}
+
+Costs VirtualScheduler::Summary() const {
+  // Print out basic execution summary.
+  VLOG(1) << "Expected execution time: " << graph_costs_.execution_time.count();
+  VLOG(1) << "Expected max memory: " << graph_costs_.max_memory;
+  VLOG(1) << "Expected max per-op buffers: " << graph_costs_.max_per_op_buffers;
+  VLOG(1) << "Expected max per-op streaming buffers: "
+          << graph_costs_.max_per_op_streaming;
+
+  VLOG(1) << "Per-op execution time:";
+  for (const auto& op_cost_pair : op_to_cost_) {
+    const auto& op = op_cost_pair.first;
+    const auto& cost = op_cost_pair.second.execution_time.count();
+    if (cost) {  // Skip printing out zero-cost ops.
+      VLOG(1) << " + " << op << " : " << cost;
+    }
+  }
+
+  // Print per device summary
+  VLOG(1) << "Devices:";
+  Costs critical_path_costs = Costs::ZeroCosts();
+
+  for (const auto& device : device_) {
+    const auto& name = device.first;
+    const auto& state = device.second;
+    VLOG(1) << "Device = " << name
+            << ", num_nodes = " << state.nodes_executed.size()
+            << ", execution_time = " << state.GetCurrTime().count();
+    VLOG(1) << "Per-op execution time:";
+    for (const auto& op_cost_pair : state.op_to_cost) {
+      const auto& op = op_cost_pair.first;
+      const auto& cost = op_cost_pair.second.execution_time.count();
+      if (cost) {  // Skip printing out zero-cost ops.
+        VLOG(1) << " + " << op << " : " << cost;
+      }
+    }
+    if (critical_path_costs.execution_time <= state.GetCurrTime()) {
+      critical_path_costs = state.device_costs;
+    }
+  }
+
+  VLOG(1) << "Critical path execution time: "
+          << critical_path_costs.execution_time.count();
+  return critical_path_costs;
+}
+
+}  // end namespace grappler
+}  // end namespace tensorflow
diff --git a/tensorflow/core/grappler/costs/virtual_scheduler.h b/tensorflow/core/grappler/costs/virtual_scheduler.h
new file mode 100644
index 0000000000000000000000000000000000000000..5d437dff50ef37c13a7a210ffdd68ba5ccd57ef1
--- /dev/null
+++ b/tensorflow/core/grappler/costs/virtual_scheduler.h
@@ -0,0 +1,114 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef THIRD_PARTY_TENSORFLOW_CORE_GRAPPLER_COSTS_VIRTUAL_SCHEDULER_H_
+#define THIRD_PARTY_TENSORFLOW_CORE_GRAPPLER_COSTS_VIRTUAL_SCHEDULER_H_
+
+#include <list>
+#include <memory>
+#include <unordered_map>
+
+#include "tensorflow/core/grappler/costs/cost_estimator.h"
+#include "tensorflow/core/grappler/grappler_item.h"
+
+namespace tensorflow {
+namespace grappler {
+
+struct NodeState {
+  std::vector<const NodeDef*> inputs;
+  std::vector<const NodeDef*> outputs;
+  int num_inputs_ready;
+  int num_outputs_executed;
+  Costs::Duration time_ready;
+  Costs::Duration time_scheduled;
+  Costs::Duration time_finished;
+  Costs::Duration time_no_reference;
+
+  // Node will be ready to be executed at time_ready, scheduled at
+  // time_scheduled, and finishes execution at time_finished.
+  // Between time_scheduled and time_no_reference, the node's output tensor
+  // needs to be on the device, using up device memory.
+
+  NodeState() {
+    num_inputs_ready = 0;
+    num_outputs_executed = 0;
+    time_ready = Costs::Duration::max();
+    time_scheduled = Costs::Duration::max();
+    time_finished = Costs::Duration::max();
+    time_no_reference = Costs::Duration::max();
+  }
+};
+
+struct DeviceState {
+  std::vector<const NodeDef*> nodes_executed;
+  Costs device_costs;
+  std::map<string, Costs> op_to_cost;  // Per-op cost.
+
+  DeviceState() { device_costs = Costs::ZeroCosts(); }
+
+  Costs::Duration GetCurrTime() const { return device_costs.execution_time; }
+};
+
+// ReadyNodeManager (abstract class):
+// Keeps ready nodes and picks the best one to be scheduled.
+class ReadyNodeManager {
+ public:
+  ReadyNodeManager() {}
+  virtual ~ReadyNodeManager() {}
+  virtual void AddNode(const NodeDef* node) = 0;
+  virtual const NodeDef* GetCurrNode() const = 0;
+  virtual void RemoveCurrNode() = 0;
+  virtual bool Empty() const = 0;
+};
+
+class FIFOManager : public ReadyNodeManager {
+ public:
+  FIFOManager() : ReadyNodeManager() {}
+  ~FIFOManager() override {}
+  void AddNode(const NodeDef* node) override { nodes_.push_back(node); }
+  const NodeDef* GetCurrNode() const override { return nodes_.front(); }
+  void RemoveCurrNode() override { nodes_.pop_front(); }
+  bool Empty() const override { return nodes_.empty(); }
+
+ private:
+  std::list<const NodeDef*> nodes_;
+};
+
+// The virtual scheduler emulates execution of nodes in a graph, considering
+// dependencies, device, etc.
+class VirtualScheduler {
+ public:
+  VirtualScheduler(const GraphDef& graph,
+                   const std::vector<string>& fetch_nodes);
+
+  const NodeDef* GetCurrNode() const;
+  bool MarkCurrNodeExecuted(const Costs& node_costs);
+
+  Costs Summary() const;
+
+ private:
+  NodeState& GetNodeStateOrCreateIt(const NodeDef* node);
+
+  Costs graph_costs_;                   // Graph cost.
+  std::map<string, Costs> op_to_cost_;  // Per-op cost.
+  std::unique_ptr<ReadyNodeManager> ready_nodes_;
+  std::unordered_map<const NodeDef*, NodeState> node_map_;
+  std::unordered_map<string, DeviceState> device_;
+};
+
+}  // namespace grappler
+}  // end namespace tensorflow
+
+#endif  // THIRD_PARTY_TENSORFLOW_CORE_GRAPPLER_COSTS_VIRTUAL_SCHEDULER_H_
diff --git a/tensorflow/core/grappler/inputs/BUILD b/tensorflow/core/grappler/inputs/BUILD
index 0cbcf4827131a7a89234a58b495752c2fecf090f..37047b2b82ada0e6c42f32b814833b832d144c94 100644
--- a/tensorflow/core/grappler/inputs/BUILD
+++ b/tensorflow/core/grappler/inputs/BUILD
@@ -12,22 +12,6 @@ filegroup(
     visibility = ["//tensorflow:__subpackages__"],
 )
 
-filegroup(
-    name = "mobile_srcs",
-    srcs = glob(
-        [
-            "utils.*",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
-
-alias(
-    name = "android_srcs",
-    actual = ":mobile_srcs",
-    visibility = ["//tensorflow:__subpackages__"],
-)
-
 cc_library(
     name = "utils",
     srcs = [
diff --git a/tensorflow/core/grappler/op_types.cc b/tensorflow/core/grappler/op_types.cc
index 266d74976fea86642c212b200c70bfcfcba413df..543c884ee8d9a45ee3effbc8927ede386e08ae55 100644
--- a/tensorflow/core/grappler/op_types.cc
+++ b/tensorflow/core/grappler/op_types.cc
@@ -18,6 +18,11 @@ limitations under the License.
 namespace tensorflow {
 namespace grappler {
 
+bool IsConcat(const NodeDef& node) {
+  const auto op = node.op();
+  return op == "Concat" || op == "ConcatV2";
+}
+
 bool IsDequeueOp(const NodeDef& node) {
   static const std::set<std::string> dequeue_ops = {
       "QueueDequeueManyV2", "QueueDequeueMany", "QueueDequeueV2",
@@ -30,9 +35,20 @@ bool IsPlaceholder(const NodeDef& node) {
   return op == "Placeholder" || op == "PlaceholderV2";
 }
 
+bool IsTranspose(const NodeDef& node) {
+  const auto op = node.op();
+  return op == "Transpose";
+}
+
 bool IsVariable(const NodeDef& node) {
   const auto op = node.op();
-  return op == "Variable" || op == "VariableV2";
+  return op == "Variable" || op == "VariableV2" || op == "AutoReloadVariable" ||
+         op == "VarHandleOp";
+}
+
+bool IsMerge(const NodeDef& node) {
+  const auto op = node.op();
+  return op == "Merge";
 }
 
 }  // end namespace grappler
diff --git a/tensorflow/core/grappler/op_types.h b/tensorflow/core/grappler/op_types.h
index 2f58835628d4d4e1ee2d0d8a51fda3599bdc8356..ce9e4a062860c3feafd85dc1f00fabddffbd1230 100644
--- a/tensorflow/core/grappler/op_types.h
+++ b/tensorflow/core/grappler/op_types.h
@@ -21,9 +21,12 @@ limitations under the License.
 namespace tensorflow {
 namespace grappler {
 
+bool IsConcat(const NodeDef& node);
 bool IsDequeueOp(const NodeDef& node);
 bool IsPlaceholder(const NodeDef& node);
+bool IsTranspose(const NodeDef& node);
 bool IsVariable(const NodeDef& node);
+bool IsMerge(const NodeDef& node);
 
 }  // end namespace grappler
 }  // end namespace tensorflow
diff --git a/tensorflow/core/grappler/optimizers/BUILD b/tensorflow/core/grappler/optimizers/BUILD
index e3b36c84123ddb02d4ffba6bb65da15d31e2baca..f88b995c89fba21cb1fa2ad0381e82583a237451 100644
--- a/tensorflow/core/grappler/optimizers/BUILD
+++ b/tensorflow/core/grappler/optimizers/BUILD
@@ -12,24 +12,41 @@ filegroup(
     visibility = ["//tensorflow:__subpackages__"],
 )
 
-filegroup(
-    name = "mobile_srcs",
-    srcs = glob(
-        [
-            "*_optimizer.*",
-            "auto_parallel.*",
-            "constant_folding.*",
-            "model_pruner.*",
-            "graph_rewriter.*",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
+cc_library(
+    name = "static_schedule",
+    srcs = ["static_schedule.cc"],
+    hdrs = [
+        "static_schedule.h",
+    ],
+    visibility = ["//visibility:public"],
+    deps = [
+        "//tensorflow/core:lib",
+        "//tensorflow/core:protos_all_cc",
+        "//tensorflow/core/grappler:grappler_item",
+        "//tensorflow/core/grappler:op_types",
+        "//tensorflow/core/grappler:utils",
+        "//tensorflow/core/grappler/clusters:cluster",
+        "//tensorflow/core/grappler/costs:cost_estimator",
+        "//tensorflow/core/grappler/costs:graph_properties",
+        "//tensorflow/core/grappler/costs:op_level_cost_estimator",
+        "//tensorflow/core/grappler/costs:virtual_placer",
+    ],
 )
 
-alias(
-    name = "android_srcs",
-    actual = ":mobile_srcs",
-    visibility = ["//tensorflow:__subpackages__"],
+cc_test(
+    name = "static_schedule_test",
+    srcs = ["static_schedule_test.cc"],
+    deps = [
+        ":static_schedule",
+        "//tensorflow/cc:cc_ops",
+        "//tensorflow/core:protos_all_cc",
+        "//tensorflow/core:test",
+        "//tensorflow/core:test_main",
+        "//tensorflow/core/grappler:grappler_item",
+        "//tensorflow/core/grappler:utils",
+        "//tensorflow/core/grappler/clusters:virtual_cluster",
+        "//tensorflow/core/grappler/inputs:trivial_test_graph_input_yielder",
+    ],
 )
 
 cc_library(
@@ -170,9 +187,11 @@ cc_library(
     deps = [
         ":graph_optimizer",
         ":graph_rewriter",
+        ":static_schedule",
         "//tensorflow/core:protos_all_cc",
         "//tensorflow/core/grappler:grappler_item",
         "//tensorflow/core/grappler:utils",
+        "//tensorflow/core/grappler/costs:graph_properties",
     ],
 )
 
@@ -182,12 +201,13 @@ cc_test(
     deps = [
         ":memory_optimizer",
         "//tensorflow/cc:cc_ops",
+        "//tensorflow/core:ops",
         "//tensorflow/core:protos_all_cc",
         "//tensorflow/core:test",
         "//tensorflow/core:test_main",
         "//tensorflow/core/grappler:grappler_item",
         "//tensorflow/core/grappler:utils",
-        "//tensorflow/core/grappler/inputs:trivial_test_graph_input_yielder",
+        "//tensorflow/core/grappler/clusters:virtual_cluster",
     ],
 )
 
@@ -205,11 +225,28 @@ cc_library(
         "//tensorflow/core:protos_all_cc",
         "//tensorflow/core/grappler:devices",
         "//tensorflow/core/grappler:grappler_item",
+        "//tensorflow/core/grappler:op_types",
         "//tensorflow/core/grappler:utils",
         "//tensorflow/core/grappler/clusters:cluster",
     ],
 )
 
+cc_test(
+    name = "layout_optimizer_test",
+    srcs = ["layout_optimizer_test.cc"],
+    deps = [
+        ":layout_optimizer",
+        "//tensorflow/cc:cc_ops",
+        "//tensorflow/core:protos_all_cc",
+        "//tensorflow/core:test",
+        "//tensorflow/core:test_main",
+        "//tensorflow/core:testlib",
+        "//tensorflow/core/grappler:grappler_item",
+        "//tensorflow/core/grappler:utils",
+        "//tensorflow/core/grappler/inputs:trivial_test_graph_input_yielder",
+    ],
+)
+
 cc_library(
     name = "meta_optimizer",
     srcs = ["meta_optimizer.cc"],
@@ -227,5 +264,6 @@ cc_library(
         "//tensorflow/core:lib",
         "//tensorflow/core:protos_all_cc",
         "//tensorflow/core/grappler:grappler_item",
+        "//tensorflow/core/grappler/utils:topological_sort",
     ],
 )
diff --git a/tensorflow/core/grappler/optimizers/auto_parallel.cc b/tensorflow/core/grappler/optimizers/auto_parallel.cc
index 078fb10bc9598aa067db8847f1a3c8449c614a61..d4326a022f465d8e11503b7bbae61747f8b0bb21 100644
--- a/tensorflow/core/grappler/optimizers/auto_parallel.cc
+++ b/tensorflow/core/grappler/optimizers/auto_parallel.cc
@@ -108,13 +108,9 @@ Status AutoParallel::Initialize(const GrapplerItem& item) {
                                                 "ApplyAdam",
                                                 "ApplyRMSProp",
                                                 "ApplyCenteredRMSProp"};
-  const NodeDef* dequeue_node = nullptr;
   for (int i = 0; i < graph_.node_size(); i++) {
     all_nodes_.insert(
         std::make_pair(graph_.node(i).name(), graph_.mutable_node(i)));
-    if (IsDequeueOp(graph_.node(i))) {
-      dequeue_node = graph_.mutable_node(i);
-    }
     if (apply_gradients_ops.find(graph_.node(i).op()) !=
         apply_gradients_ops.end()) {
       apply_gradients_nodes_.insert(graph_.node(i).name());
@@ -152,6 +148,14 @@ Status AutoParallel::Initialize(const GrapplerItem& item) {
   auto train_nodes = ComputeTransitiveFanin(graph_, item.fetch);
   LOG(INFO) << "Number of training nodes: " << train_nodes.size();
 
+  const NodeDef* dequeue_node;
+  for (const auto& train_node : train_nodes) {
+    if (IsDequeueOp(*train_node)) {
+      dequeue_node = train_node;
+      break;
+    }
+  }
+
   std::vector<const NodeDef*> input_nodes;
   if (dequeue_node) {
     LOG(INFO) << "Dequeue node: " << dequeue_node->name();
diff --git a/tensorflow/core/grappler/optimizers/layout_optimizer.cc b/tensorflow/core/grappler/optimizers/layout_optimizer.cc
index 9570ec17d05bd8895a81086e92e00ecb7c5611b6..e37c4a5b36afc43e90b0b0dd6d0e07e6374b6a16 100644
--- a/tensorflow/core/grappler/optimizers/layout_optimizer.cc
+++ b/tensorflow/core/grappler/optimizers/layout_optimizer.cc
@@ -20,6 +20,7 @@ limitations under the License.
 #include "tensorflow/core/grappler/clusters/cluster.h"
 #include "tensorflow/core/grappler/devices.h"
 #include "tensorflow/core/grappler/grappler_item.h"
+#include "tensorflow/core/grappler/op_types.h"
 #include "tensorflow/core/grappler/optimizers/layout_optimizer.h"
 #include "tensorflow/core/grappler/utils.h"
 #include "tensorflow/core/lib/strings/numbers.h"
@@ -68,8 +69,7 @@ std::set<string> GetOpsFormatAgnostic() {
                                           "Slice",
                                           "SquaredDifference",
                                           "Squeeze",
-                                          "Sub",
-                                          "Sum"};
+                                          "Sub"};
   return ops_format_agnostic;
 }
 
@@ -110,9 +110,9 @@ class NodeProcessor {
   }
 
  protected:
-  bool IsDimsN(NodeDef* node, int n) const {
-    if (node->attr().find("_output_shapes") != node->attr().end()) {
-      auto shape = node->attr().at("_output_shapes").list().shape(0);
+  bool IsDimsN(const NodeDef& node, int n) const {
+    if (node.attr().find("_output_shapes") != node.attr().end()) {
+      auto shape = node.attr().at("_output_shapes").list().shape(0);
       if (shape.dim_size() == n) {
         return true;
       }
@@ -120,7 +120,7 @@ class NodeProcessor {
     return false;
   }
 
-  bool IsDimsFour(NodeDef* node) const { return IsDimsN(node, 4); }
+  bool IsDimsFour(const NodeDef& node) const { return IsDimsN(node, 4); }
 
   bool IsNHWC() const {
     if (node_->attr().find("data_format") != node_->attr().end()) {
@@ -145,7 +145,7 @@ class NodeProcessor {
   }
 
   virtual bool ShouldProcess() const {
-    return IsNHWC() && IsDimsFour(node_) && HasOutputs();
+    return IsNHWC() && IsDimsFour(*node_) && HasOutputs();
   }
 
   void UpdateAttrDataFormat() {
@@ -268,6 +268,8 @@ class NodeProcessor {
     for (const auto& output : outputs) {
       string node_name_NCHWToNHWC = strings::StrCat(
           kTransposeNCHWToNHWC, "-", node_->name(), "-", output->name());
+      // TODO (yaozhang): handle the rare case where node A is connected to more
+      // than one input of node B.
       auto it = std::find_if(output->mutable_input()->begin(),
                              output->mutable_input()->end(),
                              [this](const string& input) {
@@ -341,7 +343,7 @@ class BiasAddGradProcessor : public NodeProcessor {
   bool ShouldProcess() const override {
     auto input = node_map_->GetNode(node_->input(0));
     if (input) {
-      if ((IsNHWC() && IsDimsFour(input)) || IsNodeNCHWToNHWC(input->name())) {
+      if ((IsNHWC() && IsDimsFour(*input)) || IsNodeNCHWToNHWC(input->name())) {
         return true;
       }
     }
@@ -351,13 +353,89 @@ class BiasAddGradProcessor : public NodeProcessor {
   Status AddLayoutTransposeToOutputs() override { return Status::OK(); }
 };
 
-class Conv2DBackpropFilterProcessor : public NodeProcessor {
+class Conv2DProcessor : public NodeProcessor {
+ public:
+  Conv2DProcessor(GraphDef* graph, NodeDef* node, NodeMap* node_map,
+                  bool no_gemm)
+      : NodeProcessor(graph, node, node_map), no_gemm_(no_gemm) {}
+
+ protected:
+  bool ShouldProcess() const override {
+    return IsNHWC() && IsDimsFour(*node_) && HasOutputs() &&
+           (!IsGemmUsed() || no_gemm_);
+  }
+
+  TensorShapeProto GetShape(const string& input_name) const {
+    string node_name;
+    int output_pos;
+    node_name = ParseNodeName(input_name, &output_pos);
+    NodeDef* node = node_map_->GetNode(node_name);
+    if (node->attr().find("_output_shapes") != node->attr().end()) {
+      return node->attr().at("_output_shapes").list().shape(output_pos);
+    }
+    TensorShapeProto shape;
+    return shape;
+  }
+
+  bool IsStrideOne() const {
+    if (node_->attr().find("strides") != node_->attr().end()) {
+      auto list = node_->attr().at("strides").list();
+      return list.i(1) == 1 && list.i(2) == 1;
+    }
+    return false;
+  }
+
+  bool IsValidPadding() const {
+    if (node_->attr().find("padding") != node_->attr().end()) {
+      auto padding = node_->attr().at("padding").s();
+      return padding == "VALID";
+    }
+    return false;
+  }
+
+  // The logic inside this function is based on the internal implementation of
+  // Conv2D, Conv2DBackpropInput, and Conv2DBackpropFilter ops, and thus
+  // needs to be updated accordingly if the internal implementation changes.
+  bool IsGemmUsed(const TensorShapeProto& filter_shape,
+                  const TensorShapeProto& input_shape) const {
+    if (filter_shape.dim_size() == 4) {
+      if (filter_shape.dim(0).size() == 1 && filter_shape.dim(1).size() == 1 &&
+          IsStrideOne()) {
+        return true;
+      }
+    }
+    if (input_shape.dim_size() == 4 && filter_shape.dim_size() == 4) {
+      if (input_shape.dim(1).size() == filter_shape.dim(0).size() &&
+          input_shape.dim(2).size() == filter_shape.dim(1).size() &&
+          IsValidPadding()) {
+        return true;
+      }
+    }
+    return false;
+  }
+
+  virtual bool IsGemmUsed() const {
+    auto filter_shape = GetShape(node_->input(1));
+    auto input_shape = GetShape(node_->input(0));
+    return IsGemmUsed(filter_shape, input_shape);
+  }
+
+  bool no_gemm_;
+};
+
+class Conv2DBackpropFilterProcessor : public Conv2DProcessor {
  public:
   Conv2DBackpropFilterProcessor(GraphDef* graph, NodeDef* node,
-                                NodeMap* node_map)
-      : NodeProcessor(graph, node, node_map) {}
+                                NodeMap* node_map, bool no_gemm)
+      : Conv2DProcessor(graph, node, node_map, no_gemm) {}
 
  protected:
+  bool IsGemmUsed() const override {
+    auto filter_shape = GetShape(node_->name());
+    auto input_shape = GetShape(node_->input(0));
+    return Conv2DProcessor::IsGemmUsed(filter_shape, input_shape);
+  }
+
   std::vector<int> GetInputPos() const override {
     std::vector<int> input_pos = {0, 2};
     return input_pos;
@@ -370,17 +448,24 @@ class Conv2DBackpropFilterProcessor : public NodeProcessor {
   void UpdateAttrShape() override {}
 };
 
-class Conv2DBackpropInputProcessor : public NodeProcessor {
+class Conv2DBackpropInputProcessor : public Conv2DProcessor {
  public:
   Conv2DBackpropInputProcessor(GraphDef* graph, NodeDef* node,
-                               NodeMap* node_map)
-      : NodeProcessor(graph, node, node_map) {}
+                               NodeMap* node_map, bool no_gemm)
+      : Conv2DProcessor(graph, node, node_map, no_gemm) {}
 
  protected:
+  bool IsGemmUsed() const override {
+    auto filter_shape = GetShape(node_->input(1));
+    auto input_shape = GetShape(node_->name());
+    return Conv2DProcessor::IsGemmUsed(filter_shape, input_shape);
+  }
+
   std::vector<int> GetInputPos() const override {
     std::vector<int> input_pos = {2};
     return input_pos;
   }
+
   Status CustomizedProcessing() override {
     NodeDef* node = node_map_->GetNode(node_->input(0));
     return UpdateAttrValue(node);
@@ -418,7 +503,7 @@ class AgnosticNodeProcessor : public NodeProcessor {
 
  protected:
   bool ShouldProcess() const override {
-    return IsDimsFour(node_) && HasOutputs() && IsNodeAfterNCHWToNHWC();
+    return IsDimsFour(*node_) && HasOutputs() && IsNodeAfterNCHWToNHWC();
   }
 
   bool IsNodeAfterNCHWToNHWC() const {
@@ -467,7 +552,7 @@ class BinaryOpProcessor : public AgnosticNodeProcessor {
 
  protected:
   bool ShouldProcess() const override {
-    return IsDimsFour(node_) && HasOutputs() && IsNodeAfterNCHWToNHWC() &&
+    return IsDimsFour(*node_) && HasOutputs() && IsNodeAfterNCHWToNHWC() &&
            (Is4DOperateWithND(4) || Is4DOperateWithScalar() ||
             Is4DOperateWithVector());
   }
@@ -484,10 +569,10 @@ class BinaryOpProcessor : public AgnosticNodeProcessor {
     auto input0 = node_map_->GetNode(node_->input(0));
     auto input1 = node_map_->GetNode(node_->input(1));
     if (input0 && input1) {
-      return (IsDimsFour(input0) || IsNodeNCHWToNHWC(input0->name())) &&
+      return (IsDimsFour(*input0) || IsNodeNCHWToNHWC(input0->name())) &&
              ((n == 4)
-                  ? (IsDimsFour(input1) || IsNodeNCHWToNHWC(input1->name()))
-                  : IsDimsN(input1, n));
+                  ? (IsDimsFour(*input1) || IsNodeNCHWToNHWC(input1->name()))
+                  : IsDimsN(*input1, n));
     }
     return false;
   }
@@ -571,7 +656,7 @@ class ConcatProcessor : public AgnosticNodeProcessor {
 
  protected:
   bool ShouldProcess() const override {
-    return IsDimsFour(node_) && HasOutputs() && IsNodeAfterNCHWToNHWC() &&
+    return IsDimsFour(*node_) && HasOutputs() && IsNodeAfterNCHWToNHWC() &&
            IsAlongDimC();
   }
 
@@ -739,7 +824,7 @@ class SqueezeProcessor : public AgnosticNodeProcessor {
 
  protected:
   bool ShouldProcess() const override {
-    return IsDimsN(node_, 2) && HasOutputs() && IsNodeAfterNCHWToNHWC() &&
+    return IsDimsN(*node_, 2) && HasOutputs() && IsNodeAfterNCHWToNHWC() &&
            IsInputConvertible() && IsAlongDimHW();
   }
 
@@ -790,7 +875,7 @@ class SumProcessor : public AgnosticNodeProcessor {
   bool ShouldProcess() const override {
     auto input0 = node_map_->GetNode(node_->input(0));
     return HasOutputs() && IsNodeAfterNCHWToNHWC() &&
-           (IsDimsFour(input0) || IsNodeNCHWToNHWC(input0->name())) &&
+           (IsDimsFour(*input0) || IsNodeNCHWToNHWC(input0->name())) &&
            IsAlongDimNHW();
   }
 
@@ -825,10 +910,21 @@ class SumProcessor : public AgnosticNodeProcessor {
   }
 };
 
+struct TuningConfig {
+  // If true, do not use the NHWC GEMM implementation. When filter size is
+  // one or filter size is equal to input image size,
+  // the NHWC implementation of Conv2D, Conv2DBackpropInput, and
+  // Conv2DBackpropFilter will use a specialized GEMM implementation, which is
+  // usually faster than the NCHW implementation. The downside is that this
+  // might result in more non-cancellable layout conversion nodes (implemented
+  // by the Tranpose op).
+  bool no_gemm;
+};
+
 class DataLayoutOptimizer {
  public:
-  explicit DataLayoutOptimizer(GraphDef* graph)
-      : graph_(graph), node_map_(graph_) {}
+  explicit DataLayoutOptimizer(GraphDef* graph, TuningConfig config)
+      : graph_(graph), node_map_(graph_), config_(config) {}
 
   Status Optimize() {
     LOG(INFO) << "Number of nodes for original graph: " << graph_->node_size();
@@ -908,12 +1004,15 @@ class DataLayoutOptimizer {
         } else if (node->op().compare("BiasAddGrad") == 0) {
           node_processor.reset(
               new BiasAddGradProcessor(graph_, node, &node_map_));
-        } else if (node->op().compare("Conv2DBackpropFilter") == 0) {
+        } else if (node->op().compare("Conv2D") == 0) {
           node_processor.reset(
-              new Conv2DBackpropFilterProcessor(graph_, node, &node_map_));
+              new Conv2DProcessor(graph_, node, &node_map_, config_.no_gemm));
+        } else if (node->op().compare("Conv2DBackpropFilter") == 0) {
+          node_processor.reset(new Conv2DBackpropFilterProcessor(
+              graph_, node, &node_map_, config_.no_gemm));
         } else if (node->op().compare("Conv2DBackpropInput") == 0) {
-          node_processor.reset(
-              new Conv2DBackpropInputProcessor(graph_, node, &node_map_));
+          node_processor.reset(new Conv2DBackpropInputProcessor(
+              graph_, node, &node_map_, config_.no_gemm));
         } else if (node->op().compare("FusedBatchNormGrad") == 0) {
           node_processor.reset(
               new FusedBatchNormGradProcessor(graph_, node, &node_map_));
@@ -1025,17 +1124,46 @@ class DataLayoutOptimizer {
 
   GraphDef* graph_;
   NodeMap node_map_;
+  TuningConfig config_;
 };
 
+int GetNumTranspose(const GraphDef& graph) {
+  int number = 0;
+  for (const auto& node : graph.node()) {
+    if (IsTranspose(node)) {
+      number++;
+    }
+  }
+  LOG(INFO) << "Number of Transpose nodes: " << number;
+  return number;
+}
+
 Status LayoutOptimizer::Optimize(Cluster* cluster, const GrapplerItem& item,
                                  GraphDef* output) {
-  if (GetNumAvailableGPUs() < 1) {
+  if (num_gpus_ == 0) {
+    num_gpus_ = GetNumAvailableGPUs();
+  }
+  if (num_gpus_ < 1) {
     // LayoutOptimizer is currently only tuned for GPU.
     return Status::OK();
   }
+
   *output = item.graph;
-  DataLayoutOptimizer layout_optimizer(output);
+  TuningConfig config;
+  config.no_gemm = false;
+  DataLayoutOptimizer layout_optimizer(output, config);
   auto status = layout_optimizer.Optimize();
+
+  // This is based on an empirical observation that if the introduced Transpose
+  // nodes is more than 30, not using GEMM implementation would result in better
+  // performance.
+  if (status.ok() && GetNumTranspose(*output) > 30) {
+    *output = item.graph;
+    config.no_gemm = true;
+    DataLayoutOptimizer layout_optimizer(output, config);
+    status = layout_optimizer.Optimize();
+  }
+
   if (!status.ok()) {
     *output = item.graph;
   }
diff --git a/tensorflow/core/grappler/optimizers/layout_optimizer.h b/tensorflow/core/grappler/optimizers/layout_optimizer.h
index 66dec17a35c125dca9dfe3a2c7f483e4fcd650ad..1bd6f9544b1da87fc86201aef67f151cd06c7124 100644
--- a/tensorflow/core/grappler/optimizers/layout_optimizer.h
+++ b/tensorflow/core/grappler/optimizers/layout_optimizer.h
@@ -29,11 +29,17 @@ class LayoutOptimizer : public GraphOptimizer {
 
   string name() const override { return "layout"; };
 
+  // This is for testing only.
+  void set_num_gpus(int num_gpus) { num_gpus_ = num_gpus; };
+
   Status Optimize(Cluster* cluster, const GrapplerItem& item,
                   GraphDef* output) override;
 
   void Feedback(Cluster* cluster, const GrapplerItem& item,
                 const GraphDef& optimize_output, double result) override;
+
+ private:
+  int num_gpus_ = 0;
 };
 
 }  // end namespace grappler
diff --git a/tensorflow/core/grappler/optimizers/layout_optimizer_test.cc b/tensorflow/core/grappler/optimizers/layout_optimizer_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..be38ca1a69e7360d5d9fa582b0492f9ea48eae14
--- /dev/null
+++ b/tensorflow/core/grappler/optimizers/layout_optimizer_test.cc
@@ -0,0 +1,147 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/grappler/optimizers/layout_optimizer.h"
+#include "tensorflow/cc/ops/standard_ops.h"
+#include "tensorflow/core/framework/node_def.pb.h"
+#include "tensorflow/core/framework/tensor_testutil.h"
+#include "tensorflow/core/grappler/grappler_item.h"
+#include "tensorflow/core/grappler/utils.h"
+#include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/platform/test.h"
+
+namespace tensorflow {
+namespace grappler {
+namespace {
+
+void AddOutputShape(Node* node, const TensorShape& shape) {
+  std::vector<TensorShapeProto> output_shapes;
+  TensorShapeProto shape_proto;
+  shape.AsProto(&shape_proto);
+  output_shapes.push_back(shape_proto);
+  node->AddAttr("_output_shapes", output_shapes);
+}
+
+class LayoutOptimizerTest : public ::testing::Test {
+ protected:
+  Output SimpleConv(tensorflow::Scope* s, int input_size, int filter_size,
+                    const string& padding) {
+    int batch_size = 128;
+    int input_height = input_size;
+    int input_width = input_size;
+    int input_depth = 3;
+    int filter_count = 2;
+    int stride = 1;
+    TensorShape input_shape(
+        {batch_size, input_height, input_width, input_depth});
+    Tensor input_data(DT_FLOAT, input_shape);
+    test::FillIota<float>(&input_data, 1.0f);
+    Output input =
+        ops::Const(s->WithOpName("Input"), Input::Initializer(input_data));
+    AddOutputShape(input.node(), input_shape);
+
+    TensorShape filter_shape(
+        {filter_size, filter_size, input_depth, filter_count});
+    Tensor filter_data(DT_FLOAT, filter_shape);
+    test::FillIota<float>(&filter_data, 1.0f);
+    Output filter =
+        ops::Const(s->WithOpName("Filter"), Input::Initializer(filter_data));
+    AddOutputShape(filter.node(), filter_shape);
+
+    Output conv = ops::Conv2D(s->WithOpName("Conv2D"), input, filter,
+                              {1, stride, stride, 1}, padding);
+    AddOutputShape(conv.node(), input_shape);
+    return conv;
+  }
+};
+
+TEST_F(LayoutOptimizerTest, FilterSizeIsOne) {
+  tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+  auto conv = SimpleConv(&s, 2, 1, "SAME");
+  Output fetch = ops::Identity(s.WithOpName("Fetch"), {conv});
+  GrapplerItem item;
+  TF_CHECK_OK(s.ToGraphDef(&item.graph));
+  LayoutOptimizer optimizer;
+  optimizer.set_num_gpus(1);
+  GraphDef output;
+  Status status = optimizer.Optimize(nullptr, item, &output);
+  NodeMap node_map(&output);
+  EXPECT_FALSE(
+      node_map.GetNode("LayoutOptimizerTransposeNHWCToNCHW-Conv2D-Input"));
+}
+
+TEST_F(LayoutOptimizerTest, FilterSizeNotOne) {
+  tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+  auto conv = SimpleConv(&s, 2, 1, "SAME");
+  Output fetch = ops::Identity(s.WithOpName("Fetch"), {conv});
+  GrapplerItem item;
+  TF_CHECK_OK(s.ToGraphDef(&item.graph));
+  LayoutOptimizer optimizer;
+  optimizer.set_num_gpus(1);
+  GraphDef output;
+  Status status = optimizer.Optimize(nullptr, item, &output);
+  NodeMap node_map(&output);
+  EXPECT_FALSE(
+      node_map.GetNode("LayoutOptimizerTransposeNHWCToNCHW-Conv2D-Input"));
+}
+
+TEST_F(LayoutOptimizerTest, EqualSizeWithValidPadding) {
+  tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+  auto conv = SimpleConv(&s, 2, 2, "VALID");
+  Output fetch = ops::Identity(s.WithOpName("Fetch"), {conv});
+  GrapplerItem item;
+  TF_CHECK_OK(s.ToGraphDef(&item.graph));
+  LayoutOptimizer optimizer;
+  optimizer.set_num_gpus(1);
+  GraphDef output;
+  Status status = optimizer.Optimize(nullptr, item, &output);
+  NodeMap node_map(&output);
+  EXPECT_FALSE(
+      node_map.GetNode("LayoutOptimizerTransposeNHWCToNCHW-Conv2D-Input"));
+}
+
+TEST_F(LayoutOptimizerTest, EqualSizeWithSamePadding) {
+  tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+  auto conv = SimpleConv(&s, 2, 2, "SAME");
+  Output fetch = ops::Identity(s.WithOpName("Fetch"), {conv});
+  GrapplerItem item;
+  TF_CHECK_OK(s.ToGraphDef(&item.graph));
+  LayoutOptimizer optimizer;
+  optimizer.set_num_gpus(1);
+  GraphDef output;
+  Status status = optimizer.Optimize(nullptr, item, &output);
+  NodeMap node_map(&output);
+  EXPECT_TRUE(
+      node_map.GetNode("LayoutOptimizerTransposeNHWCToNCHW-Conv2D-Input"));
+}
+
+TEST_F(LayoutOptimizerTest, NotEqualSizeWithValidPadding) {
+  tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+  auto conv = SimpleConv(&s, 2, 3, "VALID");
+  Output fetch = ops::Identity(s.WithOpName("Fetch"), {conv});
+  GrapplerItem item;
+  TF_CHECK_OK(s.ToGraphDef(&item.graph));
+  LayoutOptimizer optimizer;
+  optimizer.set_num_gpus(1);
+  GraphDef output;
+  Status status = optimizer.Optimize(nullptr, item, &output);
+  NodeMap node_map(&output);
+  EXPECT_TRUE(
+      node_map.GetNode("LayoutOptimizerTransposeNHWCToNCHW-Conv2D-Input"));
+}
+
+}  // namespace
+}  // namespace grappler
+}  // namespace tensorflow
diff --git a/tensorflow/core/grappler/optimizers/memory_optimizer.cc b/tensorflow/core/grappler/optimizers/memory_optimizer.cc
index faf2ce153a9aac747a38c474d07a7274b770d761..1ed7cab4abfdc5281f3906780527eb06e6f93f03 100644
--- a/tensorflow/core/grappler/optimizers/memory_optimizer.cc
+++ b/tensorflow/core/grappler/optimizers/memory_optimizer.cc
@@ -20,8 +20,10 @@ limitations under the License.
 
 #include "tensorflow/core/framework/attr_value.pb.h"
 #include "tensorflow/core/framework/node_def.pb.h"
+#include "tensorflow/core/grappler/costs/graph_properties.h"
 #include "tensorflow/core/grappler/grappler_item.h"
 #include "tensorflow/core/grappler/optimizers/graph_rewriter.h"
+#include "tensorflow/core/grappler/optimizers/static_schedule.h"
 #include "tensorflow/core/grappler/utils.h"
 
 namespace tensorflow {
@@ -101,26 +103,178 @@ std::pair<NodeDef*, NodeDef*> BuildSwapPair(NodeDef* node, int input_to_swap,
   return std::make_pair(swap_out_node, swap_in_node);
 }
 
+static int64 EstimateSize(const OpInfo::TensorProperties& t) {
+  DataType dtype = t.dtype();
+  int64 size = DataTypeSize(dtype);
+  TensorShapeProto shape = t.shape();
+  if (shape.unknown_rank()) {
+    // Can't infer the size if the rank is unknown. It has to be at least a
+    // scalar though.
+    return size;
+  }
+  // If one of the dimensions is unknown statically, assume it's at least one.
+  for (int i = 0; i < shape.dim_size(); ++i) {
+    if (shape.dim(i).size() < 0) {
+      shape.mutable_dim(i)->set_size(1);
+    }
+  }
+  int64 num_elems = TensorShape(shape).num_elements();
+  return num_elems * size;
+}
+
+struct SwapInfo {
+  std::vector<int> inputs_to_swap;
+  Costs::NanoSeconds time_to_swap = 0;
+};
+
+static const NodeDef* FindSwapTrigger(
+    const NodeDef* node, const SwapInfo& swap_info,
+    const std::unordered_map<string, const NodeDef*>& name_map,
+    const std::unordered_map<const NodeDef*, Costs::NanoSeconds>&
+        execution_times) {
+  // max_trigger_time stores the time before which the swap operation needs to
+  // be started in order to load the data back onto the accelerator without
+  // delaying the downstream computation.
+  Costs::NanoSeconds max_trigger_time(0);
+  std::set<string> possible_inputs;
+  for (int i = 0; i < node->input_size(); ++i) {
+    const string input_node_name = NodeName(node->input(i));
+    auto it1 = name_map.find(input_node_name);
+    if (it1 == name_map.end()) {
+      return nullptr;
+    }
+    const NodeDef* input_node = it1->second;
+
+    auto it2 = execution_times.find(input_node);
+    if (it2 == execution_times.end()) {
+      return nullptr;
+    }
+    max_trigger_time = std::max(max_trigger_time, it2->second);
+    possible_inputs.insert(input_node_name);
+  }
+
+  for (const int i : swap_info.inputs_to_swap) {
+    const string input_node_name = NodeName(node->input(i));
+    possible_inputs.erase(input_node_name);
+  }
+  if (possible_inputs.empty()) {
+    return nullptr;
+  }
+
+  max_trigger_time -= swap_info.time_to_swap;
+
+  std::map<Costs::NanoSeconds, const NodeDef*> candidates;
+  while (!possible_inputs.empty()) {
+    const string input_node_name = *possible_inputs.begin();
+    possible_inputs.erase(possible_inputs.begin());
+    auto it1 = name_map.find(input_node_name);
+    if (it1 == name_map.end()) {
+      return nullptr;
+    }
+    const NodeDef* input_node = it1->second;
+    // Don't jump over frames, since adding a control dependency from one frame
+    // to the next isn't supported. Don't go through branches, since we don't
+    // know whether they'll be executed or not.
+    if (input_node->op() == "NextIteration" || input_node->op() == "Switch" ||
+        input_node->op() == "Merge") {
+      continue;
+    }
+    auto it2 = execution_times.find(input_node);
+    if (it2 == execution_times.end()) {
+      return nullptr;
+    }
+    if (it2->second < max_trigger_time) {
+      candidates[it2->second] = input_node;
+    } else {
+      for (const string& fanin : input_node->input()) {
+        possible_inputs.insert(NodeName(fanin));
+      }
+    }
+  }
+
+  // Select the candidate that will execute last, since we want to swap the data
+  // back at the last minute while still allowing enough time for data to be
+  // swapped back timely to feed the downstream nodes.
+  if (!candidates.empty()) {
+    return candidates.rbegin()->second;
+  }
+  return nullptr;
+}
+
 Status MemoryOptimizer::Optimize(Cluster* cluster, const GrapplerItem& item,
                                  GraphDef* optimized_graph) {
   *optimized_graph = item.graph;
 
+  // Figure out what needs to be swapped;
+  std::unordered_map<NodeDef*, SwapInfo> nodes_to_swap;
   for (auto& node : *optimized_graph->mutable_node()) {
-    if (node.attr().count("swap_to_host") == 0) {
-      continue;
+    if (node.attr().count("_swap_to_host") != 0) {
+      SwapInfo& swap_info = nodes_to_swap[&node];
+      const AttrValue& val = node.attr().at("_swap_to_host");
+      if (val.has_list()) {
+        for (int64 input_id : val.list().i()) {
+          swap_info.inputs_to_swap.push_back(input_id);
+        }
+      } else {
+        int64 input_id = val.i();
+        swap_info.inputs_to_swap.push_back(input_id);
+      }
     }
+  }
+  if (nodes_to_swap.empty()) {
+    // Nothing to do.
+    return Status::OK();
+  }
 
+  {
+    // Estimate the size of the data to swap for each node.
+    GraphProperties properties(item);
+    TF_RETURN_IF_ERROR(properties.InferStatically());
+    for (auto& swap : nodes_to_swap) {
+      const NodeDef* node = swap.first;
+      std::vector<OpInfo::TensorProperties> props =
+          properties.GetInputProperties(node->name());
+      SwapInfo& swap_info = swap.second;
+      int64 bytes_to_swap = 0;
+      for (int64 input_id : swap_info.inputs_to_swap) {
+        const OpInfo::TensorProperties& t = props[input_id];
+        bytes_to_swap += EstimateSize(t);
+      }
+      // Let's assume we're going to swap over PCIe running at 16 GBps.
+      swap_info.time_to_swap = bytes_to_swap / 16;
+    }
+  }
+
+  std::unordered_map<const NodeDef*, Costs::NanoSeconds> execution_times;
+  TF_RETURN_IF_ERROR(
+      EstimateEarliestExecutionTimes(item, cluster, &execution_times));
+
+  std::unordered_map<string, const NodeDef*> name_map;
+  for (const auto& node : item.graph.node()) {
+    name_map[node.name()] = &node;
+  }
+
+  for (auto& swap : nodes_to_swap) {
+    NodeDef* node = swap.first;
+    SwapInfo& swap_info = swap.second;
+
+    // Make sure the tensor isn't swapped back in right away: look for node that
+    // will execute just before we need to swap the data back, and add a control
+    // dependency from that node to the swap node.
+    const NodeDef* trigger =
+        FindSwapTrigger(node, swap_info, name_map, execution_times);
+    if (!trigger) {
+      continue;
+    }
     // Swap all the tensors that are marked with the 'swap_to_host' attribute.
-    for (int input_id : node.attr().at("swap_to_host").list().i()) {
+    for (int input_id : swap_info.inputs_to_swap) {
       std::pair<NodeDef*, NodeDef*> swap_nodes =
-          BuildSwapPair(&node, input_id, optimized_graph);
-      *swap_nodes.first->add_input() = node.input(input_id);
-      *node.mutable_input(input_id) = swap_nodes.second->name();
-
-      // TODO(bsteiner): Make sure the tensor isn't swapped back in right away
-      // by adding a control dependency to delay the execution of the swap.
-      // string trigger;
-      //*swap_nodes.second->add_input() = strings::StrCat("^", trigger);
+          BuildSwapPair(node, input_id, optimized_graph);
+      *swap_nodes.first->add_input() = node->input(input_id);
+      *node->mutable_input(input_id) = swap_nodes.second->name();
+
+      // Add the control dependency needed to delay the execution of the swap.
+      *swap_nodes.second->add_input() = strings::StrCat("^", trigger->name());
     }
   }
 
diff --git a/tensorflow/core/grappler/optimizers/memory_optimizer_test.cc b/tensorflow/core/grappler/optimizers/memory_optimizer_test.cc
index c230a40fe5404b9f3aeeb953ae0aa7c33d977f9c..a4f8e22e1d8306ac2f1499cf8031e8fc669d8855 100644
--- a/tensorflow/core/grappler/optimizers/memory_optimizer_test.cc
+++ b/tensorflow/core/grappler/optimizers/memory_optimizer_test.cc
@@ -19,6 +19,7 @@ limitations under the License.
 
 #include "tensorflow/cc/ops/standard_ops.h"
 #include "tensorflow/core/framework/node_def.pb.h"
+#include "tensorflow/core/grappler/clusters/virtual_cluster.h"
 #include "tensorflow/core/grappler/grappler_item.h"
 #include "tensorflow/core/grappler/utils.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
@@ -114,7 +115,19 @@ TEST_F(RecomputeSubgraphTest, MultiNode) {
   EXPECT_EQ("^BN1Grad", recomputed_c->input(1).substr(0, 8));
 }
 
-class MemoryOptimizerTest : public ::testing::Test {};
+class MemoryOptimizerTest : public ::testing::Test {
+ public:
+  static VirtualCluster CreateVirtualCluster() {
+    DeviceProperties cpu_device;
+    cpu_device.set_type("CPU");
+    cpu_device.set_frequency(1000);
+    cpu_device.set_num_cores(4);
+    cpu_device.set_bandwidth(32);
+    std::unordered_map<string, DeviceProperties> devices;
+    devices["/job:localhost/replica:0/task:0/cpu:0"] = cpu_device;
+    return VirtualCluster(devices);
+  }
+};
 
 TEST_F(MemoryOptimizerTest, SimpleSwapping) {
   // Build a simple graph with an op that's marked for swapping.
@@ -132,12 +145,14 @@ TEST_F(MemoryOptimizerTest, SimpleSwapping) {
   EXPECT_EQ(5, item.graph.node_size());
   EXPECT_EQ(NodeName(e.name()), item.graph.node(4).name());
   AttrValue& val =
-      (*item.graph.mutable_node(4)->mutable_attr())["swap_to_host"];
+      (*item.graph.mutable_node(4)->mutable_attr())["_swap_to_host"];
   val.mutable_list()->add_i(0);
 
+  VirtualCluster cluster(CreateVirtualCluster());
+
   MemoryOptimizer optimizer;
   GraphDef output;
-  Status status = optimizer.Optimize(nullptr, item, &output);
+  Status status = optimizer.Optimize(&cluster, item, &output);
   TF_EXPECT_OK(status);
 
   EXPECT_EQ(7, output.node_size());
@@ -156,6 +171,7 @@ TEST_F(MemoryOptimizerTest, SimpleSwapping) {
 
   EXPECT_EQ(NodeName(b.name()), swap_out.input(0));
   EXPECT_EQ(NodeName(swap_out.name()), swap_in.input(0));
+  EXPECT_EQ("^c", swap_in.input(1));
 }
 
 }  // namespace
diff --git a/tensorflow/core/grappler/optimizers/meta_optimizer.cc b/tensorflow/core/grappler/optimizers/meta_optimizer.cc
index 2ea5adffebcbd4ef083b86e29c99c93f5d34b05d..8bb7800df4e204c420e15898bc04ac941b8fbdeb 100644
--- a/tensorflow/core/grappler/optimizers/meta_optimizer.cc
+++ b/tensorflow/core/grappler/optimizers/meta_optimizer.cc
@@ -21,6 +21,7 @@ limitations under the License.
 #include "tensorflow/core/grappler/optimizers/layout_optimizer.h"
 #include "tensorflow/core/grappler/optimizers/memory_optimizer.h"
 #include "tensorflow/core/grappler/optimizers/model_pruner.h"
+#include "tensorflow/core/grappler/utils/topological_sort.h"
 #include "tensorflow/core/lib/core/status.h"
 
 namespace tensorflow {
@@ -90,15 +91,16 @@ Status MetaOptimizer::Optimize(Cluster* cluster, const GrapplerItem& item,
   bool already_optimized = false;
   for (const auto& optimizer : optimizers) {
     if (!already_optimized) {
-      TF_RETURN_IF_ERROR(optimizer->Optimize(nullptr, item, optimized_graph));
+      TF_RETURN_IF_ERROR(optimizer->Optimize(cluster, item, optimized_graph));
       already_optimized = true;
     } else {
       GrapplerItem optimized_item = item;
       optimized_item.graph = *optimized_graph;
       TF_RETURN_IF_ERROR(
-          optimizer->Optimize(nullptr, optimized_item, optimized_graph));
+          optimizer->Optimize(cluster, optimized_item, optimized_graph));
     }
   }
+  TopologicalSort(optimized_graph);
   // Copy the graph version.
   *optimized_graph->mutable_versions() = item.graph.versions();
 
@@ -116,9 +118,9 @@ bool MetaOptimizerEnabled(const RewriterConfig& cfg) {
 }
 
 Status RunMetaOptimizer(const GrapplerItem& item, const RewriterConfig& cfg,
-                        GraphDef* optimized_graph) {
+                        Cluster* cluster, GraphDef* optimized_graph) {
   MetaOptimizer optimizer(cfg);
-  return optimizer.Optimize(nullptr, item, optimized_graph);
+  return optimizer.Optimize(cluster, item, optimized_graph);
 }
 
 }  // namespace grappler
diff --git a/tensorflow/core/grappler/optimizers/meta_optimizer.h b/tensorflow/core/grappler/optimizers/meta_optimizer.h
index 9def2cd711f96cb2e04c36f749a724eff863bb3c..6b950c973d9a2db04675aeee26e5f70e0371f400 100644
--- a/tensorflow/core/grappler/optimizers/meta_optimizer.h
+++ b/tensorflow/core/grappler/optimizers/meta_optimizer.h
@@ -46,7 +46,7 @@ class MetaOptimizer : public GraphOptimizer {
 bool MetaOptimizerEnabled(const RewriterConfig& cfg);
 
 Status RunMetaOptimizer(const GrapplerItem& item, const RewriterConfig& cfg,
-                        GraphDef* optimized_graph);
+                        Cluster* cluster, GraphDef* optimized_graph);
 
 }  // namespace grappler
 }  // namespace tensorflow
diff --git a/tensorflow/core/grappler/optimizers/static_schedule.cc b/tensorflow/core/grappler/optimizers/static_schedule.cc
new file mode 100644
index 0000000000000000000000000000000000000000..e31499eac66a9ecf350a2de6fc15b68662499854
--- /dev/null
+++ b/tensorflow/core/grappler/optimizers/static_schedule.cc
@@ -0,0 +1,123 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/grappler/optimizers/static_schedule.h"
+#include <deque>
+#include "tensorflow/core/framework/attr_value.pb.h"
+#include "tensorflow/core/grappler/costs/graph_properties.h"
+#include "tensorflow/core/grappler/costs/op_level_cost_estimator.h"
+#include "tensorflow/core/grappler/costs/virtual_placer.h"
+#include "tensorflow/core/grappler/op_types.h"
+#include "tensorflow/core/grappler/utils.h"
+#include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/lib/strings/strcat.h"
+
+namespace tensorflow {
+namespace grappler {
+
+static Costs::NanoSeconds PredictExecutionTime(
+    const GraphProperties& properties, const OpLevelCostEstimator& estimator,
+    const VirtualPlacer& placer, const NodeDef& node) {
+  OpInfo op_features;
+  op_features.set_op(node.op());
+  *op_features.mutable_attr() = node.attr();
+
+  std::vector<OpInfo::TensorProperties> inputs =
+      properties.GetInputProperties(node.name());
+  for (auto& input : inputs) {
+    op_features.add_inputs()->Swap(&input);
+  }
+
+  DeviceProperties device = placer.get_device(node);
+  op_features.mutable_device()->Swap(&device);
+
+  Costs::NanoSeconds estimate =
+      estimator.PredictCosts(op_features).execution_time;
+
+  // Make sure our estimates are at least one nanosecond per node.
+  return std::max(estimate, Costs::NanoSeconds(1));
+}
+
+Status EstimateEarliestExecutionTimes(
+    const GrapplerItem& item, const Cluster* cluster,
+    std::unordered_map<const NodeDef*, Costs::NanoSeconds>* completion_times) {
+  std::unordered_map<string, const NodeDef*> name_map;
+  std::unordered_map<const NodeDef*, int> pending_inputs;
+  std::deque<const NodeDef*> ready_nodes;
+  for (const NodeDef& node : item.graph.node()) {
+    name_map[node.name()] = &node;
+    if (node.input_size() == 0) {
+      ready_nodes.push_back(&node);
+      (*completion_times)[&node] = 0;
+    } else if (IsMerge(node)) {
+      // Merge nodes are processed as soon as one of the input becomes
+      // available.
+      pending_inputs[&node] = 1;
+    } else {
+      pending_inputs[&node] = node.input_size();
+    }
+  }
+
+  std::unordered_map<const NodeDef*, std::vector<const NodeDef*>> fanouts;
+  for (const NodeDef& node : item.graph.node()) {
+    for (const string& input : node.input()) {
+      string node_name = NodeName(input);
+      auto it = name_map.find(node_name);
+      if (it == name_map.end()) {
+        return errors::InvalidArgument(
+            strings::StrCat("Unknown input node ", input));
+      }
+      const NodeDef* fanin = it->second;
+      fanouts[fanin].push_back(&node);
+    }
+  }
+  name_map.clear();
+
+  GraphProperties properties(item);
+  TF_RETURN_IF_ERROR(properties.InferStatically());
+  OpLevelCostEstimator estimator;
+  VirtualPlacer placer(cluster);
+
+  while (!ready_nodes.empty()) {
+    const NodeDef* node = ready_nodes.front();
+    ready_nodes.pop_front();
+
+    Costs::NanoSeconds execution_time =
+        PredictExecutionTime(properties, estimator, placer, *node);
+    Costs::NanoSeconds completion_time =
+        execution_time + (*completion_times)[node];
+    (*completion_times)[node] = completion_time;
+
+    for (const NodeDef* fanout : fanouts[node]) {
+      int pending = pending_inputs[fanout];
+      if (pending == 0) {
+        // Already processed. Avoid going through loops more than once.
+        continue;
+      } else if (pending == 1) {
+        ready_nodes.push_back(fanout);
+      }
+      pending_inputs[fanout]--;
+
+      Costs::NanoSeconds ready_time =
+          std::max(completion_time, (*completion_times)[fanout]);
+      (*completion_times)[fanout] = ready_time;
+    }
+  }
+
+  return Status::OK();
+}
+
+}  // end namespace grappler
+}  // end namespace tensorflow
diff --git a/tensorflow/core/grappler/optimizers/static_schedule.h b/tensorflow/core/grappler/optimizers/static_schedule.h
new file mode 100644
index 0000000000000000000000000000000000000000..0dd82b0dab1248a1b99e952d2825acb90a13b0bb
--- /dev/null
+++ b/tensorflow/core/grappler/optimizers/static_schedule.h
@@ -0,0 +1,41 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef THIRD_PARTY_TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_STATIC_SCHEDULE_H_
+#define THIRD_PARTY_TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_STATIC_SCHEDULE_H_
+
+#include <unordered_map>
+
+#include "tensorflow/core/framework/node_def.pb.h"
+#include "tensorflow/core/grappler/clusters/cluster.h"
+#include "tensorflow/core/grappler/costs/cost_estimator.h"
+#include "tensorflow/core/grappler/grappler_item.h"
+
+namespace tensorflow {
+namespace grappler {
+
+// Compute the earliest time as which the execution of each node in the graph
+// can complete.
+// In our estimation, we ensure that each node takes at least one nanosecond to
+// execute: therefore the execution times can be used to derive a topological
+// ordering of the graph (at least as long as there is no loop in the graph).
+Status EstimateEarliestExecutionTimes(
+    const GrapplerItem& item, const Cluster* cluster,
+    std::unordered_map<const NodeDef*, Costs::NanoSeconds>* execution_times);
+
+}  // namespace grappler
+}  // end namespace tensorflow
+
+#endif  // THIRD_PARTY_TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_STATIC_SCHEDULE_H_
diff --git a/tensorflow/core/grappler/optimizers/static_schedule_test.cc b/tensorflow/core/grappler/optimizers/static_schedule_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..f53feaca4c396f3927689c135cfa6fcb4d578154
--- /dev/null
+++ b/tensorflow/core/grappler/optimizers/static_schedule_test.cc
@@ -0,0 +1,126 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/grappler/optimizers/static_schedule.h"
+#include "tensorflow/cc/ops/standard_ops.h"
+#include "tensorflow/core/framework/node_def.pb.h"
+#include "tensorflow/core/grappler/clusters/virtual_cluster.h"
+#include "tensorflow/core/grappler/grappler_item.h"
+#include "tensorflow/core/grappler/inputs/trivial_test_graph_input_yielder.h"
+#include "tensorflow/core/grappler/utils.h"
+#include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/platform/test.h"
+
+namespace tensorflow {
+namespace grappler {
+namespace {
+
+class StaticScheduleTest : public ::testing::Test {
+ public:
+  VirtualCluster CreateVirtualCluster() const {
+    // Invent a CPU so that predictions remain the same from machine to machine.
+    DeviceProperties cpu_device;
+    cpu_device.set_type("CPU");
+    cpu_device.set_frequency(1000);
+    cpu_device.set_num_cores(4);
+    cpu_device.set_bandwidth(32);
+    cpu_device.set_l1_cache_size(32 * 1024);
+    cpu_device.set_l2_cache_size(256 * 1024);
+    cpu_device.set_l3_cache_size(4 * 1024 * 1024);
+    std::unordered_map<string, DeviceProperties> devices;
+    devices["/job:localhost/replica:0/task:0/cpu:0"] = cpu_device;
+    return VirtualCluster(devices);
+  }
+};
+
+TEST_F(StaticScheduleTest, BasicGraph) {
+  // This trivial graph is so basic there's nothing to prune.
+  TrivialTestGraphInputYielder fake_input(4, 1, 10, false, {"CPU:0"});
+  GrapplerItem item;
+  CHECK(fake_input.NextItem(&item));
+
+  VirtualCluster cluster(CreateVirtualCluster());
+
+  std::unordered_map<const NodeDef*, Costs::NanoSeconds> completion_times;
+  Status status =
+      EstimateEarliestExecutionTimes(item, &cluster, &completion_times);
+  TF_EXPECT_OK(status);
+
+  EXPECT_EQ(item.graph.node_size(), completion_times.size());
+
+  for (auto time : completion_times) {
+    if (time.first->name() == "Const/Const") {
+      EXPECT_EQ(Costs::NanoSeconds(1), time.second);
+    } else if (time.first->name() == "x") {
+      EXPECT_EQ(Costs::NanoSeconds(250001), time.second);
+    } else if (time.first->name() == "AddN") {
+      EXPECT_EQ(Costs::NanoSeconds(1500001), time.second);
+    } else if (time.first->name() == "AddN_1") {
+      EXPECT_EQ(Costs::NanoSeconds(2750001), time.second);
+    } else if (time.first->name() == "AddN_2") {
+      EXPECT_EQ(Costs::NanoSeconds(4000001), time.second);
+    } else if (time.first->name() == "AddN_3") {
+      EXPECT_EQ(Costs::NanoSeconds(5250001), time.second);
+    } else if (time.first->name() == "y") {
+      EXPECT_EQ(Costs::NanoSeconds(6500001), time.second);
+    }
+  }
+}
+
+TEST_F(StaticScheduleTest, BasicGraphWithCtrlDependencies) {
+  // Build a simple graph with a control dependency.
+  tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+
+  Output a = ops::Const(s.WithOpName("a"), 0.0f, {10, 10});
+  Output b = ops::AddN(s.WithOpName("b"), {a});
+  Output c = ops::Identity(s.WithOpName("c"), b);
+  Output d = ops::Identity(s.WithOpName("d"), c);
+  Output e = ops::AddN(s.WithOpName("e"), {d});
+
+  GrapplerItem item;
+  TF_CHECK_OK(s.ToGraphDef(&item.graph));
+
+  // Add a control dependency between c and e.
+  EXPECT_EQ("c", item.graph.node(2).name());
+  EXPECT_EQ("e", item.graph.node(4).name());
+  *item.graph.mutable_node(4)->add_input() = "^c";
+
+  VirtualCluster cluster(CreateVirtualCluster());
+
+  std::unordered_map<const NodeDef*, Costs::NanoSeconds> completion_times;
+  Status status =
+      EstimateEarliestExecutionTimes(item, &cluster, &completion_times);
+  TF_EXPECT_OK(status);
+
+  EXPECT_EQ(item.graph.node_size(), completion_times.size());
+
+  for (auto time : completion_times) {
+    if (time.first->name() == "a") {
+      EXPECT_EQ(Costs::NanoSeconds(1), time.second);
+    } else if (time.first->name() == "b") {
+      EXPECT_EQ(Costs::NanoSeconds(12500001), time.second);
+    } else if (time.first->name() == "c") {
+      EXPECT_EQ(Costs::NanoSeconds(12500002), time.second);
+    } else if (time.first->name() == "d") {
+      EXPECT_EQ(Costs::NanoSeconds(12500003), time.second);
+    } else if (time.first->name() == "e") {
+      EXPECT_EQ(Costs::NanoSeconds(25000003), time.second);
+    }
+  }
+}
+
+}  // namespace
+}  // namespace grappler
+}  // namespace tensorflow
diff --git a/tensorflow/core/grappler/utils/BUILD b/tensorflow/core/grappler/utils/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..e1db1a8cd29633c4f8054a159e955606e58e2a10
--- /dev/null
+++ b/tensorflow/core/grappler/utils/BUILD
@@ -0,0 +1,64 @@
+licenses(["notice"])  # Apache 2.0
+
+filegroup(
+    name = "all_files",
+    srcs = glob(
+        ["**/*"],
+        exclude = [
+            "**/METADATA",
+            "**/OWNERS",
+        ],
+    ),
+    visibility = ["//tensorflow:__subpackages__"],
+)
+
+cc_library(
+    name = "scc",
+    srcs = ["scc.cc"],
+    hdrs = ["scc.h"],
+    visibility = ["//visibility:public"],
+    deps = [
+        "//tensorflow/core:lib_internal",
+        "//tensorflow/core:protos_all_cc",
+        "//tensorflow/core/grappler:utils",
+    ],
+)
+
+cc_test(
+    name = "scc_test",
+    srcs = ["scc_test.cc"],
+    deps = [
+        ":scc",
+        "//tensorflow/core:lib_proto_parsing",
+        "//tensorflow/core:protos_all_cc",
+        "//tensorflow/core:test",
+        "//tensorflow/core:test_main",
+        "//tensorflow/core/grappler:grappler_item",
+        "//tensorflow/core/grappler/clusters:virtual_cluster",
+        "//tensorflow/core/grappler/inputs:trivial_test_graph_input_yielder",
+    ],
+)
+
+cc_library(
+    name = "topological_sort",
+    srcs = ["topological_sort.cc"],
+    hdrs = ["topological_sort.h"],
+    visibility = ["//visibility:public"],
+    deps = [
+        "//tensorflow/core:lib_internal",
+        "//tensorflow/core:protos_all_cc",
+        "//tensorflow/core/grappler:utils",
+    ],
+)
+
+cc_test(
+    name = "topological_sort_test",
+    srcs = ["topological_sort_test.cc"],
+    deps = [
+        ":topological_sort",
+        "//tensorflow/core:lib_proto_parsing",
+        "//tensorflow/core:protos_all_cc",
+        "//tensorflow/core:test",
+        "//tensorflow/core:test_main",
+    ],
+)
diff --git a/tensorflow/core/grappler/utils/scc.cc b/tensorflow/core/grappler/utils/scc.cc
new file mode 100644
index 0000000000000000000000000000000000000000..6568e99aa3f6c1c690d689653f0cd9fb16f82673
--- /dev/null
+++ b/tensorflow/core/grappler/utils/scc.cc
@@ -0,0 +1,176 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/grappler/utils/scc.h"
+#include <stack>
+#include <unordered_map>
+#include <unordered_set>
+#include <vector>
+#include "tensorflow/core/framework/node_def.pb.h"
+#include "tensorflow/core/grappler/utils.h"
+
+namespace tensorflow {
+namespace grappler {
+
+// Data structure used to store data for Tarjan's Strongly Connected
+// Components algorithm.
+struct SCCNodeData {
+  SCCNodeData()
+      : node(nullptr),
+        index(-1),
+        lowlink(-1),
+        onstack(false),
+        caller(nullptr),
+        caller_loop_location(-1) {}
+  void ResetStack(int new_index, SCCNodeData* new_caller) {
+    index = new_index;
+    lowlink = new_index;
+    onstack = true;
+    caller = new_caller;
+    caller_loop_location = 0;
+  }
+  const NodeDef* node;
+  int index;
+  int lowlink;
+  bool onstack;
+  std::vector<SCCNodeData*> children;
+  // StrongConnect "call stack" storage.
+  SCCNodeData* caller;       // Node calling StrongConnect
+  int caller_loop_location;  // Index in parent StrongConnect for loop
+};
+
+// Core DFS step of Tarjan's Strongly Connected Component algorithm
+// (implemented using iteration instead of recursion).
+void StrongConnect(SCCNodeData* v, std::stack<SCCNodeData*>* stack, int* index,
+                   std::unordered_map<const NodeDef*, int>* components,
+                   int* scc_index) {
+  // Iterative version of Tarjan's StrongConnect function.
+  // The "call stack" state is composed of a SCCNodeData's caller and
+  // caller_loop_location properties.
+  v->ResetStack(*index /* index */, nullptr /* caller */);
+  ++*index;
+  stack->push(v);
+
+  // No one put v on a StrongConnect call stack, reset caller values.
+  v->caller = nullptr;
+  v->caller_loop_location = 0;
+
+  SCCNodeData* last = v;
+  while (true) {
+    if (last->caller_loop_location < last->children.size()) {
+      // Recursive equivalent: Looping over the children of v (possibly
+      // continuing at v->caller_loop_location after having finished a
+      // recursive call.
+      SCCNodeData* w = last->children[last->caller_loop_location];
+      ++(last->caller_loop_location);  // For loop iterator increment
+      if (w->index == -1) {
+        w->ResetStack(*index /* index */, last /* caller */);
+        ++*index;
+        stack->push(w);
+        last = w;
+      } else if (w->onstack == true) {
+        last->lowlink = std::min(last->lowlink, w->index);
+      }
+    } else {
+      // At the end of v's children
+      if (last->lowlink == last->index) {
+        // v is the root of a strongly connected component
+        SCCNodeData* top;
+        while (true) {
+          top = stack->top();
+          stack->pop();
+          top->onstack = false;
+          (*components)[top->node] = *scc_index;
+          if (top == last) {
+            break;
+          }
+        }
+        ++*scc_index;
+      }
+
+      // Go up the recursive call stack
+      SCCNodeData* next_last = last->caller;
+      if (next_last == nullptr) {
+        // All nodes have been seen; finished.
+        break;
+      } else {
+        next_last->lowlink = std::min(next_last->lowlink, last->lowlink);
+        last = next_last;
+      }
+    }
+  }
+}
+
+// This is an implementation of Tarjan's Strongly Connected Components
+// DFS algorithm.  Most of the hard work is done in the function
+// StrongConnect, which is an iterative reimplementation of the
+// recursive version described here:
+//   https://en.wikipedia.org/wiki/Tarjan%27s_strongly_connected_components_algorithm
+//
+// The edges for the purpose of this algorithm are directed from input
+// to op (the reverse of the declarations of the NodeDef, which
+// contain in-edges)
+void StronglyConnectedComponents(
+    const GraphDef& graph, std::unordered_map<const NodeDef*, int>* components,
+    int* num_components) {
+  std::stack<SCCNodeData*> stack;
+  std::unordered_map<string, SCCNodeData*> name_to_data;
+  std::vector<SCCNodeData> node_data_container;
+  node_data_container.reserve(graph.node_size());
+  std::unordered_map<const NodeDef*, SCCNodeData*> node_to_data;
+
+  for (const NodeDef& node : graph.node()) {
+    SCCNodeData node_data;
+    node_data.node = &node;
+    node_data_container.push_back(node_data);
+    name_to_data[node.name()] = &(*node_data_container.rbegin());
+    node_to_data[&node] = &(*node_data_container.rbegin());
+  }
+
+  // Create a list of top-level parents (add them to object queue)
+  // Also create a mapping from nodes to their children.
+  for (const NodeDef& node : graph.node()) {
+    for (const string& input : node.input()) {
+      name_to_data[NodeName(input)]->children.push_back(node_to_data[&node]);
+    }
+  }
+
+  components->clear();
+  *num_components = 0;
+  int index = 0;
+  for (auto& v : node_data_container) {
+    if (v.index == -1) {
+      // Node has not yet been visited.  Start a DFS at v.
+      StrongConnect(&v, &stack, &index, components, num_components);
+    }
+  }
+
+  std::vector<int> counts_per_component(*num_components, 0);
+  for (auto& component : *components) {
+    DCHECK(component.second >= 0);
+    DCHECK(component.second < *num_components);
+    counts_per_component[component.second]++;
+  }
+  for (auto& component : *components) {
+    if (counts_per_component[component.second] == 1) {
+      component.second = -1;
+      (*num_components)--;
+    }
+  }
+  (*num_components) += 1;
+}
+
+}  // namespace grappler
+}  // namespace tensorflow
diff --git a/tensorflow/core/grappler/utils/scc.h b/tensorflow/core/grappler/utils/scc.h
new file mode 100644
index 0000000000000000000000000000000000000000..8b0577763d66817e99eb62e9f517b12bd07aea79
--- /dev/null
+++ b/tensorflow/core/grappler/utils/scc.h
@@ -0,0 +1,38 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef THIRD_PARTY_TENSORFLOW_CORE_GRAPPLER_UTILS_SCC_H_
+#define THIRD_PARTY_TENSORFLOW_CORE_GRAPPLER_UTILS_SCC_H_
+
+#include <unordered_map>
+#include "tensorflow/core/framework/graph.pb.h"
+
+namespace tensorflow {
+namespace grappler {
+
+// Compute modified strongly connected components:
+// All nodes that are not part of a loop are assigned the special -1 id
+// All nodes that are part of at least one loop are assigned a positive
+// component id: if 2 nodes v and w are reachable from one another (i.e. if they
+// belong to the same scc), they'll be assigned the same id, otherwise they'll
+// be assigned distinct ids. Returns the number of distinct ids.
+void StronglyConnectedComponents(
+    const GraphDef& graph, std::unordered_map<const NodeDef*, int>* components,
+    int* num_ids);
+
+}  // namespace grappler
+}  // namespace tensorflow
+
+#endif  // THIRD_PARTY_TENSORFLOW_CORE_GRAPPLER_UTILS_SCC_H_
diff --git a/tensorflow/core/grappler/utils/scc_test.cc b/tensorflow/core/grappler/utils/scc_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..3185cbe232631d70e8a79b31168ca39b53e62272
--- /dev/null
+++ b/tensorflow/core/grappler/utils/scc_test.cc
@@ -0,0 +1,410 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/grappler/utils/scc.h"
+#include "tensorflow/core/framework/node_def.pb.h"
+#include "tensorflow/core/grappler/clusters/virtual_cluster.h"
+#include "tensorflow/core/grappler/grappler_item.h"
+#include "tensorflow/core/grappler/inputs/trivial_test_graph_input_yielder.h"
+#include "tensorflow/core/platform/protobuf.h"
+#include "tensorflow/core/platform/test.h"
+
+namespace tensorflow {
+namespace grappler {
+namespace {
+
+class SCCTest : public ::testing::Test {
+ public:
+  void SetUp() override {
+    std::unordered_map<string, DeviceProperties> devices;
+    DeviceProperties unknown_device;
+    devices["MY_DEVICE"] = unknown_device;
+    cluster_.reset(new VirtualCluster(devices));
+    TF_CHECK_OK(cluster_->Provision());
+  }
+
+  void TearDown() override { cluster_.reset(); }
+
+ protected:
+  static NodeDef CreateNode(const string& name,
+                            gtl::ArraySlice<string> inputs) {
+    NodeDef node;
+    node.set_name(name);
+    for (const string& input : inputs) {
+      node.add_input(input);
+    }
+    return node;
+  }
+
+  std::unique_ptr<VirtualCluster> cluster_;
+};
+
+TEST_F(SCCTest, NoLoops) {
+  // Create a simple graph without any loop.
+  TrivialTestGraphInputYielder fake_input(4, 1, 10, false,
+                                          cluster_->GetDeviceNames());
+  GrapplerItem item;
+  CHECK(fake_input.NextItem(&item));
+
+  std::unordered_map<const NodeDef*, int> components;
+  int num_components;
+  StronglyConnectedComponents(item.graph, &components, &num_components);
+
+  EXPECT_EQ(num_components, 1);
+  for (const auto& node : item.graph.node()) {
+    EXPECT_EQ(-1, components[&node]);
+  }
+}
+
+TEST_F(SCCTest, DisjointCycleAndPath) {
+  GraphDef graph;
+  // Create a cycle
+  *graph.add_node() = CreateNode("a", {"d"});
+  *graph.add_node() = CreateNode("b", {"a"});
+  *graph.add_node() = CreateNode("c", {"b"});
+  *graph.add_node() = CreateNode("d", {"c"});
+
+  // Add a path disjoint from cycle
+  *graph.add_node() = CreateNode("e", {});
+  *graph.add_node() = CreateNode("f", {"e"});
+  *graph.add_node() = CreateNode("g", {"f"});
+  *graph.add_node() = CreateNode("h", {"g"});
+
+  std::vector<const NodeDef*> nodes;
+  std::unordered_map<string, const NodeDef*> name_to_node;
+  for (const auto& n : graph.node()) {
+    nodes.push_back(&n);
+    name_to_node[n.name()] = &n;
+  }
+
+  int num_components;
+  std::unordered_map<const NodeDef*, int> components;
+  StronglyConnectedComponents(graph, &components, &num_components);
+
+  EXPECT_EQ(num_components, 2);
+
+  for (const auto& pair : {std::make_pair("a", "b"), std::make_pair("a", "c"),
+                           std::make_pair("a", "d")}) {
+    EXPECT_EQ(components[name_to_node[pair.first]],
+              components[name_to_node[pair.second]]);
+  }
+
+  for (const auto& node : {"e", "f", "g", "h"})
+    EXPECT_EQ(-1, components[name_to_node[node]]);
+}
+}  // namespace
+
+TEST_F(SCCTest, WikipediaExample) {
+  // Graph with 4 SCCs:
+
+  // SCC1:
+  // a -> b
+  // b -> c
+  // c -> a
+
+  // d -> b
+  // d -> c
+
+  // SCC2:
+  // d -> e
+  // e -> d
+
+  // e -> f
+  // f -> c
+
+  // SCC3:
+  // f -> g
+  // g -> f
+
+  // h -> g
+  // h -> d
+
+  // SCC4:
+  // h -> h
+
+  // NodeDefs define inbound connections (inputs)
+  GraphDef graph;
+  *graph.add_node() = CreateNode("a", {"c"});
+  *graph.add_node() = CreateNode("b", {"a", "d"});
+  *graph.add_node() = CreateNode("c", {"b", "d", "f"});
+  *graph.add_node() = CreateNode("d", {"e"});
+  *graph.add_node() = CreateNode("e", {"d"});
+  *graph.add_node() = CreateNode("f", {"e", "g"});
+  *graph.add_node() = CreateNode("g", {"f", "h"});
+  *graph.add_node() = CreateNode("h", {"h"});
+
+  std::vector<const NodeDef*> nodes;
+  std::unordered_map<string, const NodeDef*> name_to_node;
+  for (const auto& n : graph.node()) {
+    nodes.push_back(&n);
+    name_to_node[n.name()] = &n;
+  }
+
+  int num_components;
+  std::unordered_map<const NodeDef*, int> components;
+  StronglyConnectedComponents(graph, &components, &num_components);
+
+  EXPECT_EQ(num_components, 4);
+  for (const auto& pair :
+       {std::make_pair("a", "b"), std::make_pair("a", "c"),
+        std::make_pair("d", "e"), std::make_pair("f", "g")}) {
+    EXPECT_EQ(components[name_to_node[pair.first]],
+              components[name_to_node[pair.second]]);
+  }
+
+  for (const auto& pair :
+       {std::make_pair("a", "d"), std::make_pair("a", "f"),
+        std::make_pair("a", "h"), std::make_pair("d", "f"),
+        std::make_pair("d", "h"), std::make_pair("f", "h")}) {
+    EXPECT_NE(components[name_to_node[pair.first]],
+              components[name_to_node[pair.second]]);
+  }
+}
+
+TEST_F(SCCTest, TensorFlowLoop) {
+  // Test graph produced in python using:
+  /*
+     with tf.Graph().as_default():
+       i = tf.constant(0)
+       c = lambda i: tf.less(i, 10)
+       b = lambda i: tf.add(i, 1)
+       r = tf.while_loop(c, b, [i])
+       with open('/tmp/graph.txt', 'w') as f:
+         f.write(str(tf.get_default_graph().as_graph_def()))
+  */
+  const string gdef_ascii = R"EOF(
+node {
+  name: "Const"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "while/Enter"
+  op: "Enter"
+  input: "Const"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "frame_name"
+    value {
+      s: "while/while/"
+    }
+  }
+  attr {
+    key: "is_constant"
+    value {
+      b: false
+    }
+  }
+  attr {
+    key: "parallel_iterations"
+    value {
+      i: 10
+    }
+  }
+}
+node {
+  name: "while/Merge"
+  op: "Merge"
+  input: "while/Enter"
+  input: "while/NextIteration"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "while/Less/y"
+  op: "Const"
+  input: "^while/Merge"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 10
+      }
+    }
+  }
+}
+node {
+  name: "while/Less"
+  op: "Less"
+  input: "while/Merge"
+  input: "while/Less/y"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "while/LoopCond"
+  op: "LoopCond"
+  input: "while/Less"
+}
+node {
+  name: "while/Switch"
+  op: "Switch"
+  input: "while/Merge"
+  input: "while/LoopCond"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@while/Merge"
+      }
+    }
+  }
+}
+node {
+  name: "while/Identity"
+  op: "Identity"
+  input: "while/Switch:1"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "while/Add/y"
+  op: "Const"
+  input: "^while/Identity"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "while/Add"
+  op: "Add"
+  input: "while/Identity"
+  input: "while/Add/y"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "while/NextIteration"
+  op: "NextIteration"
+  input: "while/Add"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "while/Exit"
+  op: "Exit"
+  input: "while/Switch"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+versions {
+  producer: 11
+}
+  )EOF";
+
+  GrapplerItem item;
+  CHECK(protobuf::TextFormat::ParseFromString(gdef_ascii, &item.graph));
+
+  std::unordered_map<const NodeDef*, int> components;
+  int num_components;
+  StronglyConnectedComponents(item.graph, &components, &num_components);
+
+  EXPECT_EQ(num_components, 2);
+  for (const auto& node : item.graph.node()) {
+    if (node.name() == "Const" || node.name() == "while/Enter" ||
+        node.name() == "while/Exit") {
+      // These nodes are not part of the loop, they should be assigned the id
+      // -1.
+      EXPECT_EQ(-1, components[&node]);
+    } else {
+      // These nodes are part of the loop, they should be assigned a positive
+      // id.
+      EXPECT_LE(0, components[&node]);
+    }
+  }
+}
+
+}  // namespace grappler
+}  // namespace tensorflow
diff --git a/tensorflow/core/grappler/utils/topological_sort.cc b/tensorflow/core/grappler/utils/topological_sort.cc
new file mode 100644
index 0000000000000000000000000000000000000000..131756fc5c2b2f7090934e791d6dfa7acf7ccfa7
--- /dev/null
+++ b/tensorflow/core/grappler/utils/topological_sort.cc
@@ -0,0 +1,64 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/grappler/utils/topological_sort.h"
+#include <deque>
+#include <unordered_map>
+#include "tensorflow/core/framework/node_def.pb.h"
+#include "tensorflow/core/grappler/utils.h"
+
+namespace tensorflow {
+namespace grappler {
+
+// Kahn's algorithm is implemented.
+// For details, see https://en.wikipedia.org/wiki/Topological_sorting
+void TopologicalSort(GraphDef* graph) {
+  NodeMap node_map(graph);
+  std::deque<const NodeDef*> ready_nodes;
+  std::unordered_map<const NodeDef*, int> ready_inputs;
+  for (const NodeDef& node : graph->node()) {
+    if (node.input_size() == 0) {
+      ready_nodes.push_back(&node);
+    }
+    if (node.op() == "Merge") {
+      ready_inputs[&node] = 0;
+      for (const auto& input : node.input()) {
+        if (node_map.GetNode(input)->op() == "NextIteration") {
+          ready_inputs[&node]++;
+        }
+      }
+    } else {
+      ready_inputs[&node] = 0;
+    }
+  }
+  GraphDef sorted_graph;
+  while (!ready_nodes.empty()) {
+    auto ready_node = ready_nodes.front();
+    *sorted_graph.add_node() = *ready_node;
+    for (const auto& fanout : node_map.GetOutputs(ready_node->name())) {
+      ready_inputs[fanout]++;
+      if (ready_inputs[fanout] == fanout->input_size()) {
+        ready_nodes.push_back(fanout);
+      }
+    }
+    ready_nodes.pop_front();
+  }
+  if (sorted_graph.node_size() == graph->node_size()) {
+    *graph = sorted_graph;
+  }
+}
+
+}  // namespace grappler
+}  // namespace tensorflow
diff --git a/tensorflow/core/grappler/utils/topological_sort.h b/tensorflow/core/grappler/utils/topological_sort.h
new file mode 100644
index 0000000000000000000000000000000000000000..d4d8034ef577a0282dbce161aed8ba440bf248ab
--- /dev/null
+++ b/tensorflow/core/grappler/utils/topological_sort.h
@@ -0,0 +1,30 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef THIRD_PARTY_TENSORFLOW_CORE_GRAPPLER_UTILS_TOPOLOGICAL_SORT_H_
+#define THIRD_PARTY_TENSORFLOW_CORE_GRAPPLER_UTILS_TOPOLOGICAL_SORT_H_
+
+#include "tensorflow/core/framework/graph.pb.h"
+
+namespace tensorflow {
+namespace grappler {
+
+// Sort a graph in topological order.
+void TopologicalSort(GraphDef* graph);
+
+}  // namespace grappler
+}  // namespace tensorflow
+
+#endif  // THIRD_PARTY_TENSORFLOW_CORE_GRAPPLER_UTILS_TOPOLOGICAL_SORT_H_
diff --git a/tensorflow/core/grappler/utils/topological_sort_test.cc b/tensorflow/core/grappler/utils/topological_sort_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..55f66b273496c53d9450626ee0c896e725415a48
--- /dev/null
+++ b/tensorflow/core/grappler/utils/topological_sort_test.cc
@@ -0,0 +1,94 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/grappler/utils/topological_sort.h"
+#include "tensorflow/core/framework/node_def.pb.h"
+#include "tensorflow/core/platform/protobuf.h"
+#include "tensorflow/core/platform/test.h"
+
+namespace tensorflow {
+namespace grappler {
+namespace {
+
+class TopologicalSortTest : public ::testing::Test {
+ protected:
+  static NodeDef CreateNode(const string& name,
+                            const std::vector<string>& inputs) {
+    return CreateNode(name, "", inputs);
+  }
+  static NodeDef CreateNode(const string& name, const string& op,
+                            const std::vector<string>& inputs) {
+    NodeDef node;
+    node.set_name(name);
+    if (!op.empty()) {
+      node.set_op(op);
+    }
+    for (const string& input : inputs) {
+      node.add_input(input);
+    }
+    return node;
+  }
+};
+
+TEST_F(TopologicalSortTest, NoLoop) {
+  GraphDef graph;
+  *graph.add_node() = CreateNode("2", {"5"});
+  *graph.add_node() = CreateNode("0", {"5", "4"});
+  *graph.add_node() = CreateNode("1", {"4", "3"});
+  *graph.add_node() = CreateNode("3", {"2"});
+  *graph.add_node() = CreateNode("5", {});
+  *graph.add_node() = CreateNode("4", {});
+
+  TopologicalSort(&graph);
+  std::vector<string> order = {"5", "4", "2", "0", "3", "1"};
+  for (int i = 0; i < order.size(); i++) {
+    EXPECT_EQ(graph.node(i).name(), order[i]);
+  }
+}
+
+TEST_F(TopologicalSortTest, WithLoop) {
+  GraphDef graph;
+  // Create a loop
+  *graph.add_node() = CreateNode("2", "Merge", {"1", "5"});
+  *graph.add_node() = CreateNode("3", "Switch", {"2"});
+  *graph.add_node() = CreateNode("4", "Identity", {"3"});
+  *graph.add_node() = CreateNode("5", "NextIteration", {"4"});
+  *graph.add_node() = CreateNode("1", {});
+
+  TopologicalSort(&graph);
+  std::vector<string> order = {"1", "2", "3", "4", "5"};
+  for (int i = 0; i < order.size(); i++) {
+    EXPECT_EQ(graph.node(i).name(), order[i]);
+  }
+}
+
+TEST_F(TopologicalSortTest, WithIllegalLoop) {
+  GraphDef graph;
+  // A loop without Merge and NextIteration is illegal and the original node
+  // order and graph will be preserved.
+  *graph.add_node() = CreateNode("2", {"1", "3"});
+  *graph.add_node() = CreateNode("3", {"2"});
+  *graph.add_node() = CreateNode("1", {});
+
+  TopologicalSort(&graph);
+  std::vector<string> order = {"2", "3", "1"};
+  for (int i = 0; i < order.size(); i++) {
+    EXPECT_EQ(graph.node(i).name(), order[i]);
+  }
+}
+
+}  // namespace
+}  // namespace grappler
+}  // namespace tensorflow
diff --git a/tensorflow/core/kernels/BUILD b/tensorflow/core/kernels/BUILD
index de947e5877ea9f4f2fe91bdab6790cd91ae7071f..7a980f8f68918e63941223d61645b88445297dc7 100644
--- a/tensorflow/core/kernels/BUILD
+++ b/tensorflow/core/kernels/BUILD
@@ -17,7 +17,10 @@ licenses(["notice"])  # Apache 2.0
 
 package_group(
     name = "friends",
-    packages = ["//tensorflow/..."],
+    packages = [
+        "//learning/brain/contrib/...",
+        "//tensorflow/...",
+    ],
 )
 
 load(
@@ -105,6 +108,7 @@ tf_kernel_library(
     deps = [
         ":bounds_check",
         ":ops_util",
+        ":variable_ops",
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
         "//third_party/eigen3",
@@ -359,6 +363,18 @@ tf_kernel_library(
     alwayslink = 0,
 )
 
+cc_library(
+    name = "split_lib_hdrs",
+    hdrs = [
+        "split_lib.h",
+    ],
+    deps = [
+        ":eigen_helpers",
+        ":ops_util_hdrs",
+        "//third_party/eigen3",
+    ],
+)
+
 cc_library(
     name = "typed_queue",
     hdrs = ["typed_queue.h"],
@@ -367,6 +383,19 @@ cc_library(
     ],
 )
 
+cc_library(
+    name = "training_op_helpers",
+    srcs = ["training_op_helpers.cc"],
+    hdrs = ["training_op_helpers.h"],
+    visibility = [":friends"],
+    deps = [
+        ":variable_ops",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//third_party/eigen3",
+    ],
+)
+
 cc_library(
     name = "bounds_check",
     hdrs = ["bounds_check.h"],
@@ -377,6 +406,15 @@ cc_library(
     ],
 )
 
+cc_library(
+    name = "warn_about_ints",
+    srcs = ["warn_about_ints.cc"],
+    hdrs = ["warn_about_ints.h"],
+    deps = [
+        "//tensorflow/core:framework",
+    ],
+)
+
 # Private support libraries ---------------------------------------------------
 
 cc_header_only_library(
@@ -1290,6 +1328,14 @@ cc_library(
     ],
 )
 
+cc_library(
+    name = "lookup",
+    deps = [
+        ":lookup_table_init_op",
+        ":lookup_table_op",
+    ],
+)
+
 DATA_FLOW_DEPS = [
     ":bounds_check",
     ":concat_lib",
@@ -1413,10 +1459,10 @@ LOOKUP_DEPS = [
     ":initializable_lookup_table",
     ":lookup_util",
     "//tensorflow/core:core_cpu",
-    "//tensorflow/core:data_flow_ops_op_lib",
     "//tensorflow/core:framework",
     "//tensorflow/core:lib",
     "//tensorflow/core:lib_internal",
+    "//tensorflow/core:lookup_ops_op_lib",
 ]
 
 tf_kernel_library(
@@ -1579,9 +1625,7 @@ cc_library(
         ":attention_ops",
         ":colorspace_op",
         ":crop_and_resize_op",
-        ":decode_gif_op",
-        ":decode_jpeg_op",
-        ":decode_png_op",
+        ":decode_image_op",
         ":draw_bounding_box_op",
         ":encode_jpeg_op",
         ":encode_png_op",
@@ -1646,20 +1690,8 @@ tf_kernel_library(
 )
 
 tf_kernel_library(
-    name = "decode_jpeg_op",
-    prefix = "decode_jpeg_op",
-    deps = IMAGE_DEPS,
-)
-
-tf_kernel_library(
-    name = "decode_png_op",
-    prefix = "decode_png_op",
-    deps = IMAGE_DEPS,
-)
-
-tf_kernel_library(
-    name = "decode_gif_op",
-    prefix = "decode_gif_op",
+    name = "decode_image_op",
+    prefix = "decode_image_op",
     deps = IMAGE_DEPS,
 )
 
@@ -1996,15 +2028,24 @@ tf_kernel_library(
     name = "cuda_solvers",
     srcs = ["cuda_solvers.cc"],
     hdrs = ["cuda_solvers.h"],
+    gpu_srcs = [
+        "cuda_solvers.h",
+        "cuda_solvers_gpu.cu.cc",
+    ],
     # @local_config_cuda//cuda:cusolver, //third_party/eigen3:blas,
     # and //third_party/libf2c all contain various parts of BLAS, LAPACK,
     # and f2c helper functions in global namespace. Tell the compiler to
     # allow multiple definitions when linking this.
-    linkopts = ["-Wl,-zmuldefs"],
+    linkopts = select({
+        "//tensorflow:darwin": [],
+        "//conditions:default": ["-Wl,-z,muldefs"],
+    }),
     visibility = ["//visibility:private"],
     deps = [
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
+        "//tensorflow/core/platform/default/build_config:cublas_plugin",
+        "@local_config_cuda//cuda:cublas",
         "@local_config_cuda//cuda:cusolver",
     ],
 )
@@ -2053,7 +2094,9 @@ tf_kernel_library(
 tf_kernel_library(
     name = "matrix_inverse_op",
     prefix = "matrix_inverse_op",
-    deps = LINALG_DEPS,
+    deps = if_cuda([
+        ":cuda_solvers",
+    ]) + LINALG_DEPS,
 )
 
 tf_kernel_library(
@@ -2071,7 +2114,9 @@ tf_kernel_library(
 tf_kernel_library(
     name = "matrix_triangular_solve_op",
     prefix = "matrix_triangular_solve_op",
-    deps = LINALG_DEPS,
+    deps = LINALG_DEPS + if_cuda([
+        "//tensorflow/core/platform/default/build_config:cublas_plugin",
+    ]),
 )
 
 tf_kernel_library(
@@ -2212,6 +2257,7 @@ cc_library(
         ":batch_matmul_op",
         ":betainc_op",
         ":bincount_op",
+        ":bucketize_op",
         ":cast_op",
         ":check_numerics_op",
         ":cross_op",
@@ -2249,6 +2295,12 @@ tf_kernel_library(
     deps = MATH_DEPS,
 )
 
+tf_kernel_library(
+    name = "bucketize_op",
+    prefix = "bucketize_op",
+    deps = MATH_DEPS,
+)
+
 tf_kernel_library(
     name = "cast_op",
     prefix = "cast_op",
@@ -2278,7 +2330,9 @@ tf_kernel_library(
     prefix = "fft_ops",
     deps = MATH_DEPS + [
         "//tensorflow/core:spectral_ops_op_lib",
-    ],
+    ] + if_cuda([
+        "//tensorflow/core/platform/default/build_config:cufft_plugin",
+    ]),
 )
 
 tf_kernel_library(
@@ -2303,6 +2357,8 @@ tf_kernel_library(
         "//conditions:default": [],
     }) + if_mkl([
         "//third_party/mkl:intel_binary_blob",
+    ]) + if_cuda([
+        "//tensorflow/core/platform/default/build_config:cublas_plugin",
     ]),
 )
 
@@ -2582,7 +2638,10 @@ tf_kernel_library(
             "@libxsmm_archive//:xsmm_avx",
         ],
         "//conditions:default": [],
-    }),
+    }) + if_cuda([
+        "//tensorflow/core/platform/default/build_config:cublas_plugin",
+        "//tensorflow/core/platform/default/build_config:cudnn_plugin",
+    ]),
 )
 
 tf_kernel_library(
@@ -2696,13 +2755,13 @@ tf_kernel_library(
 tf_kernel_library(
     name = "softplus_op",
     prefix = "softplus_op",
-    deps = NN_DEPS,
+    deps = NN_DEPS + [":warn_about_ints"],
 )
 
 tf_kernel_library(
     name = "softsign_op",
     prefix = "softsign_op",
-    deps = NN_DEPS,
+    deps = NN_DEPS + [":warn_about_ints"],
 )
 
 tf_kernel_library(
@@ -3101,6 +3160,7 @@ cc_library(
         ":sparse_add_grad_op",
         ":sparse_add_op",
         ":sparse_concat_op",
+        ":sparse_cross_op",
         ":sparse_dense_binary_op_shared",
         ":sparse_reduce_sum_op",
         ":sparse_reorder_op",
@@ -3145,6 +3205,12 @@ tf_kernel_library(
     deps = SPARSE_DEPS,
 )
 
+tf_kernel_library(
+    name = "sparse_cross_op",
+    prefix = "sparse_cross_op",
+    deps = SPARSE_DEPS,
+)
+
 tf_kernel_library(
     name = "sparse_reduce_sum_op",
     prefix = "sparse_reduce_sum_op",
@@ -3474,6 +3540,7 @@ tf_kernel_library(
     prefix = "training_ops",
     deps = [
         ":bounds_check",
+        ":training_op_helpers",
         ":variable_ops",
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
@@ -4005,6 +4072,7 @@ filegroup(
         "relu_op.h",
         "relu_op_functor.h",
         "resize_bilinear_op.h",
+        "resize_nearest_neighbor_op.h",
         "reverse_op.h",
         "save_restore_tensor.h",
         "softplus_op.h",
@@ -4014,9 +4082,11 @@ filegroup(
         "tensor_array.h",
         "tile_ops_cpu_impl.h",
         "tile_ops_impl.h",
+        "training_op_helpers.h",
         "training_ops.h",
         "transpose_functor.h",
         "transpose_op.h",
+        "warn_about_ints.h",
         "where_op.h",
         "xent_op.h",
     ],
@@ -4049,6 +4119,7 @@ filegroup(
         "cwise_op_equal_to_2.cc",
         "cwise_op_exp.cc",
         "cwise_op_floor.cc",
+        "cwise_op_floor_div.cc",
         "cwise_op_greater.cc",
         "cwise_op_greater_equal.cc",
         "cwise_op_isfinite.cc",
@@ -4113,6 +4184,7 @@ filegroup(
         "queue_base.cc",
         "queue_ops.cc",
         "random_op.cc",
+        "reduction_ops_all.cc",
         "reduction_ops_any.cc",
         "reduction_ops_common.cc",
         "reduction_ops_max.cc",
@@ -4149,9 +4221,11 @@ filegroup(
         "tile_ops_cpu_impl_6.cc",
         "tile_ops_cpu_impl_7.cc",
         "topk_op.cc",
+        "training_op_helpers.cc",
         "training_ops.cc",
         "transpose_functor_cpu.cc",
         "transpose_op.cc",
+        "warn_about_ints.cc",
         "where_op.cc",
         "xent_op.cc",
         ":android_extended_ops_headers",
@@ -4215,13 +4289,12 @@ filegroup(
             "string_to_hash_bucket_op.*",
             "sdca_ops.*",
             "sdca_internal.*",
+            "sparse_cross_op.*",
             "text_line_reader_op.*",
             "summary_image_op.*",
+            "decode_image_op.*",
             "encode_png_op.*",
-            "decode_png_op.*",
             "encode_jpeg_op.*",
-            "decode_jpeg_op.*",
-            "decode_gif_op.*",
             "identity_reader_op.*",
             "remote_fused_graph_execute_op.*",
             "fixed_length_record_reader_op.*",
@@ -4734,6 +4807,7 @@ tf_cc_test(
         "//tensorflow/core:lib_internal",
         "//tensorflow/core:ops",
         "//tensorflow/core:protos_all_cc",
+        "//tensorflow/core:remote_fused_graph_ops_op_lib",
         "//tensorflow/core:test",
         "//tensorflow/core:test_main",
         "//tensorflow/core:testlib",
@@ -4857,6 +4931,14 @@ tf_mkl_kernel_library(
     ],
 )
 
+tf_mkl_kernel_library(
+    name = "mkl_identity_op",
+    prefix = "mkl_identity_op",
+    deps = ARRAY_DEPS + [
+        "//third_party/mkl:intel_binary_blob",
+    ],
+)
+
 tf_mkl_kernel_library(
     name = "mkl_lrn_op",
     prefix = "mkl_lrn_op",
diff --git a/tensorflow/core/kernels/barrier_ops.cc b/tensorflow/core/kernels/barrier_ops.cc
index 03880b98273af7aa7eb174f128c2ee869107ed32..83633a1dd98f172aab66088826282b28a8fb217b 100644
--- a/tensorflow/core/kernels/barrier_ops.cc
+++ b/tensorflow/core/kernels/barrier_ops.cc
@@ -88,7 +88,7 @@ class Barrier : public ResourceBase {
   template <typename T>
   void TryInsertMany(const Tensor& keys, int component_index,
                      const Tensor& values, OpKernelContext* ctx,
-                     DoneCallback callback) {
+                     const DoneCallback& callback) {
     TensorShape element_shape = values.shape();
     OP_REQUIRES_ASYNC(
         ctx, keys.NumElements() == 0 || element_shape.num_elements() > 0,
@@ -195,7 +195,8 @@ class Barrier : public ResourceBase {
   }
 
   void TryTakeMany(int num_elements, bool allow_small_batch, int64 timeout,
-                   OpKernelContext* ctx, IndicesKeysValuesCallback callback) {
+                   OpKernelContext* ctx,
+                   const IndicesKeysValuesCallback& callback) {
     int num_elements_to_deliver = num_elements;
     {
       mutex_lock lock(mu_);
@@ -247,7 +248,7 @@ class Barrier : public ResourceBase {
   }
 
   void Close(OpKernelContext* ctx, bool cancel_pending_enqueues,
-             DoneCallback callback) {
+             const DoneCallback& callback) {
     mutex_lock lock(mu_);
     // We're allowed to close twice if the first close wasn't a
     // cancel but the second one is.
@@ -399,7 +400,8 @@ class Barrier : public ResourceBase {
   }
 
   void CloseQueueLocked(OpKernelContext* ctx, bool cancel_pending_enqueues,
-                        DoneCallback callback) EXCLUSIVE_LOCKS_REQUIRED(mu_) {
+                        const DoneCallback& callback)
+      EXCLUSIVE_LOCKS_REQUIRED(mu_) {
     // CloseQueueLocked may only be called with mu_ held.
     if (!cancel_pending_enqueues && queue_closed_) {
       callback();
diff --git a/tensorflow/core/kernels/basic_ops_benchmark_test.cc b/tensorflow/core/kernels/basic_ops_benchmark_test.cc
index 54532318cec8e40f13c1e97232c12e7406834de1..5726062938bc230911d74c26865b765533d127fa 100644
--- a/tensorflow/core/kernels/basic_ops_benchmark_test.cc
+++ b/tensorflow/core/kernels/basic_ops_benchmark_test.cc
@@ -22,7 +22,7 @@ limitations under the License.
 
 namespace tensorflow {
 
-// We focus on the single thread performance of runing ops.
+// We focus on the single thread performance of running ops.
 static SessionOptions InitOptions() {
   SessionOptions opts;
   opts.config.set_intra_op_parallelism_threads(1);
diff --git a/tensorflow/contrib/layers/kernels/bucketization_kernel.cc b/tensorflow/core/kernels/bucketize_op.cc
similarity index 98%
rename from tensorflow/contrib/layers/kernels/bucketization_kernel.cc
rename to tensorflow/core/kernels/bucketize_op.cc
index 5cfa39de7645c982d094e012a55e5265adb26bbb..93c2d01221f3b1d36fefa7742762025b96cc5387 100644
--- a/tensorflow/contrib/layers/kernels/bucketization_kernel.cc
+++ b/tensorflow/core/kernels/bucketize_op.cc
@@ -13,6 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
+// See docs in ../ops/math_ops.cc.
+
 #include <algorithm>
 #include <vector>
 
diff --git a/tensorflow/core/kernels/cholesky_op.cc b/tensorflow/core/kernels/cholesky_op.cc
index 2e33170d27bfc04f7c936a120b3fd4e5d59c412c..9a0ea2e303f34abfc0dfc9d2d631d542c4430514 100644
--- a/tensorflow/core/kernels/cholesky_op.cc
+++ b/tensorflow/core/kernels/cholesky_op.cc
@@ -90,45 +90,82 @@ TF_CALL_double(DECLARE_GPU_SPEC);
 }  // namespace functor
 
 template <class Scalar>
-class CholeskyOpGpu : public LinearAlgebraOp<Scalar> {
+class CholeskyOpGpu : public AsyncOpKernel {
  public:
-  INHERIT_LINALG_TYPEDEFS(Scalar);
-
-  explicit CholeskyOpGpu(OpKernelConstruction* context) : Base(context) {}
-
-  // Copy the lower triangular part of the input matrices to the output and
-  // set the strictly upper triangular part to zero. We use a pre-existing
-  // kernel MatrixBandPart to do this for all matrices in the batch at once,
-  // before we launch each of the Cholesky factorization kernels in parallel.
-  void BatchPreCompute(OpKernelContext* context, const TensorInputs& inputs,
-                       const TensorShapes& input_matrix_shapes,
-                       const TensorOutputs& outputs,
-                       const TensorShapes& output_matrix_shapes) final {
-    const int n = input_matrix_shapes[0].dim_size(0);
-    auto input_reshaped = inputs[0]->template flat_inner_dims<Scalar, 3>();
-    auto output_reshaped = outputs[0]->template flat_inner_dims<Scalar, 3>();
-    functor::MatrixBandPart<GPUDevice, Scalar>::Compute(
-        context->eigen_device<GPUDevice>(), n, 0, input_reshaped,
-        output_reshaped);
-  }
+  explicit CholeskyOpGpu(OpKernelConstruction* context)
+      : AsyncOpKernel(context) {}
+
+  void ComputeAsync(OpKernelContext* context, DoneCallback done) final {
+    const Tensor& input = context->input(0);
+    const int ndims = input.dims();
+    const int64 n = input.dim_size(ndims - 1);
+    // Validate inputs.
+    OP_REQUIRES_ASYNC(
+        context, ndims >= 2,
+        errors::InvalidArgument("Input must have rank >= 2, got ", ndims),
+        done);
+    OP_REQUIRES_ASYNC(
+        context, input.dim_size(ndims - 2) == n,
+        errors::InvalidArgument("Input matrices must be squares, got",
+                                input.dim_size(ndims - 2), " != ", n),
+        done);
+
+    // Allocate output.
+    Tensor* output;
+    OP_REQUIRES_OK_ASYNC(context,
+                         context->forward_input_or_allocate_output(
+                             {0}, 0, input.shape(), &output),
+                         done);
 
-  void ComputeMatrix(OpKernelContext* context, const ConstMatrixMaps& inputs,
-                     MatrixMaps* outputs) final {
-    const ConstMatrixMap& input = inputs[0];
-    const int n = input.rows();
     if (n == 0) {
       // If X is an empty matrix (0 rows, 0 col), X * X' == X.
       // Therefore, we return X.
+      done();
       return;
     }
-    // Launch the Cholesky kernel.
-    CudaSolverDN cusolver(context);
-    const Status status = cusolver.potrf(CUBLAS_FILL_MODE_UPPER, n,
-                                         outputs->at(0).data(), n, nullptr);
-    if (!status.ok()) {
-      LOG(ERROR) << status.ToString();
+
+    // Copy the lower triangular part of the input matrices to the output and
+    // set the strictly upper triangular part to zero. We use a pre-existing
+    // kernel MatrixBandPart to do this for all matrices in the batch at once,
+    // before we launch each of the Cholesky factorization kernels in paralle.
+    auto input_reshaped = input.template flat_inner_dims<Scalar, 3>();
+    auto output_reshaped = output->template flat_inner_dims<Scalar, 3>();
+    functor::MatrixBandPart<GPUDevice, Scalar>::Compute(
+        context->eigen_device<GPUDevice>(), n, 0, input_reshaped,
+        output_reshaped);
+
+    // Launch a Cholesky kernel for each matrix in the batch.
+    const int64 batch_size = input_reshaped.dimension(0);
+    std::vector<DeviceLapackInfo> dev_info;
+    dev_info.emplace_back(context, batch_size, "potrf");
+    // TODO(rmlarsen): Parallelize over batches if it turns out to be
+    // an important use case.
+    CudaSolver solver(context);
+    for (int64 i = 0; i < batch_size; ++i) {
+      Scalar* output_ptr = output_reshaped.data() + i * n * n;
+      int* dev_info_ptr = dev_info.back().mutable_data() + i;
+      OP_REQUIRES_OK_ASYNC(
+          context,
+          solver.Potrf(CUBLAS_FILL_MODE_UPPER, n, output_ptr, n, dev_info_ptr),
+          done);
     }
-    OP_REQUIRES(context, status.ok(), errors::InvalidArgument(kErrMsg));
+
+    // Register callback to check info after kernels finish.
+    auto info_checker = [context, done](
+                            const Status& status,
+                            const std::vector<HostLapackInfo>& /* unused */) {
+      Status full_status = status;
+      if (!full_status.ok()) {
+        full_status.Update(errors::InvalidArgument(kErrMsg));
+      }
+      OP_REQUIRES_OK_ASYNC(context, full_status, done);
+      done();
+    };
+
+    OP_REQUIRES_OK_ASYNC(
+        context,
+        solver.CopyLapackInfoToHostAsync(dev_info, std::move(info_checker)),
+        done);
   }
 };
 
diff --git a/tensorflow/core/kernels/conv_ops_fused.cc b/tensorflow/core/kernels/conv_ops_fused.cc
index 219e6d5e978b407946ea94c971837f04d70e5604..f7348f1077260367a61dea382b6dfb5b5cc2fb79 100644
--- a/tensorflow/core/kernels/conv_ops_fused.cc
+++ b/tensorflow/core/kernels/conv_ops_fused.cc
@@ -74,8 +74,9 @@ enum SamplingMode {
 //       my_vector[current] *= 10.0f;
 //     }
 // });
-void FusedConvParallelFor(OpKernelContext* context, int64 begin, int64 end,
-                          std::function<void(int64, int64)> task_function) {
+void FusedConvParallelFor(
+    OpKernelContext* context, int64 begin, int64 end,
+    const std::function<void(int64, int64)>& task_function) {
 // On iOS, the thread management imposes a very big performance penalty, so
 // just call the function directly with no multithreading.
 #if defined(__APPLE__) && defined(IS_MOBILE_PLATFORM)
diff --git a/tensorflow/core/kernels/conv_ops_test.cc b/tensorflow/core/kernels/conv_ops_test.cc
index b122e7f0e847dc6bdfeb313b8a81bb48655aa059..cd9aa4a53efface3654dc405887d6fd82dfacf04 100644
--- a/tensorflow/core/kernels/conv_ops_test.cc
+++ b/tensorflow/core/kernels/conv_ops_test.cc
@@ -116,8 +116,9 @@ class FusedResizePadConvOpTest : public OpsTestBase {
                                int input_depth, int resize_width,
                                int resize_height, int y_padding, int x_padding,
                                int filter_size, int filter_count,
-                               bool resize_align_corners, string pad_mode,
-                               int stride, string padding) {
+                               bool resize_align_corners,
+                               const string& pad_mode, int stride,
+                               const string& padding) {
     auto root = tensorflow::Scope::NewRootScope();
     using namespace ::tensorflow::ops;  // NOLINT(build/namespaces)
 
@@ -170,8 +171,8 @@ class FusedResizePadConvOpTest : public OpsTestBase {
   void CompareFusedPadOnlyAndSeparate(int input_width, int input_height,
                                       int input_depth, int y_padding,
                                       int x_padding, int filter_size,
-                                      int filter_count, string pad_mode,
-                                      int stride, string padding) {
+                                      int filter_count, const string& pad_mode,
+                                      int stride, const string& padding) {
     auto root = tensorflow::Scope::NewRootScope();
     using namespace ::tensorflow::ops;  // NOLINT(build/namespaces)
 
diff --git a/tensorflow/core/kernels/crop_and_resize_op.cc b/tensorflow/core/kernels/crop_and_resize_op.cc
index 746fe63e2a0f39acc0ccba1bb2f7a192ae06b868..c68a8b0bd27cd0fc1c3a5e93e1ff713ff03741e0 100644
--- a/tensorflow/core/kernels/crop_and_resize_op.cc
+++ b/tensorflow/core/kernels/crop_and_resize_op.cc
@@ -19,6 +19,9 @@ limitations under the License.
 
 #include "tensorflow/core/kernels/crop_and_resize_op.h"
 
+#include <functional>
+#include <string>
+
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/register_types.h"
@@ -26,10 +29,13 @@ limitations under the License.
 #include "tensorflow/core/framework/tensor_shape.h"
 #include "tensorflow/core/framework/types.h"
 #include "tensorflow/core/kernels/bounds_check.h"
+#include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/platform/types.h"
 
 #if GOOGLE_CUDA
+#include "tensorflow/core/common_runtime/gpu/gpu_event_mgr.h"
 #include "tensorflow/core/platform/stream_executor.h"
 #endif  // GOOGLE_CUDA
 
@@ -37,41 +43,67 @@ namespace tensorflow {
 
 typedef Eigen::ThreadPoolDevice CPUDevice;
 typedef Eigen::GpuDevice GPUDevice;
+using Callback = std::function<void()>;
+
+namespace {
 
-static inline void ParseAndCheckBoxSizes(OpKernelContext* context,
-                                         const Tensor& boxes,
-                                         const Tensor& box_ind,
-                                         int* num_boxes) {
-  if (boxes.NumElements() == 0 && box_ind.NumElements() == 0) {
+static inline Status ParseAndCheckBoxSizes(const Tensor& boxes,
+                                           const Tensor& box_index,
+                                           int* num_boxes) {
+  if (boxes.NumElements() == 0 && box_index.NumElements() == 0) {
     *num_boxes = 0;
-    return;
+    return Status::OK();
   }
   // The shape of 'boxes' is [num_boxes, 4].
-  OP_REQUIRES(context, boxes.dims() == 2,
-              errors::InvalidArgument("boxes must be 2-D",
-                                      boxes.shape().DebugString()));
+  if (boxes.dims() != 2) {
+    return errors::InvalidArgument("boxes must be 2-D",
+                                   boxes.shape().DebugString());
+  }
   *num_boxes = boxes.dim_size(0);
-  OP_REQUIRES(context, boxes.dim_size(1) == 4,
-              errors::InvalidArgument("boxes must have 4 columns"));
-
-  // The shape of 'box_ind' is [num_boxes].
-  OP_REQUIRES(context, box_ind.dims() == 1,
-              errors::InvalidArgument("box_ind must be 1-D",
-                                      box_ind.shape().DebugString()));
-  OP_REQUIRES(context, box_ind.dim_size(0) == *num_boxes,
-              errors::InvalidArgument("box_ind has incompatible shape"));
+  if (boxes.dim_size(1) != 4) {
+    return errors::InvalidArgument("boxes must have 4 columns");
+  }
+  // The shape of 'box_index' is [num_boxes].
+  if (box_index.dims() != 1) {
+    return errors::InvalidArgument("box_index must be 1-D",
+                                   box_index.shape().DebugString());
+  }
+  if (box_index.dim_size(0) != *num_boxes) {
+    return errors::InvalidArgument("box_index has incompatible shape");
+  }
+  return Status::OK();
 }
 
-// Verifies that all values in box_ind are in [0, batch).
+// Conditionally calls the compute callback if all values in box_index are in
+// [0, batch_size) then calls done.
 template <typename Device>
-inline void CheckValidBoxInd(
-    OpKernelContext* context,
-    typename TTypes<int32, 1>::ConstTensor box_ind_data, int batch);
+inline void RunIfBoxIndexIsValid(
+    OpKernelContext* context, typename TTypes<int32, 1>::ConstTensor box_index,
+    int batch_size, const Callback& compute, const Callback& done);
+
+// Specialization of CheckValidBoxIndex for a CPUDevice.
+template <>
+inline void RunIfBoxIndexIsValid<CPUDevice>(
+    OpKernelContext* context, typename TTypes<int32, 1>::ConstTensor box_index,
+    int batch_size, const Callback& compute, const Callback& done) {
+  const int num_boxes = box_index.dimension(0);
+  for (int b = 0; b < num_boxes; ++b) {
+    OP_REQUIRES_ASYNC(
+        context, FastBoundsCheck(box_index(b), batch_size),
+        errors::OutOfRange("box_index has values outside [0, batch_size)"),
+        done);
+  }
+  compute();
+  done();
+}
+
+}  // namespace
 
 template <typename Device, typename T>
-class CropAndResizeOp : public OpKernel {
+class CropAndResizeOp : public AsyncOpKernel {
  public:
-  explicit CropAndResizeOp(OpKernelConstruction* context) : OpKernel(context) {
+  explicit CropAndResizeOp(OpKernelConstruction* context)
+      : AsyncOpKernel(context) {
     string method;
     OP_REQUIRES_OK(context, context->GetAttr("method", &method));
     OP_REQUIRES(context, method == "bilinear",
@@ -80,69 +112,77 @@ class CropAndResizeOp : public OpKernel {
                                              &extrapolation_value_));
   }
 
-  void Compute(OpKernelContext* context) override {
-    // The shape of 'image' is [batch, image_height, image_width, channels].
+  void ComputeAsync(OpKernelContext* context, DoneCallback done) override {
+    // The shape of 'image' is [batch_size, image_height, image_width,
+    // channels].
     const Tensor& image = context->input(0);
-    OP_REQUIRES(context, image.dims() == 4,
-                errors::InvalidArgument("input image must be 4-D",
-                                        image.shape().DebugString()));
-
-    const int batch = image.dim_size(0);
-    const int image_height = image.dim_size(1);
-    const int image_width = image.dim_size(2);
-    const int depth = image.dim_size(3);
-    OP_REQUIRES(context, image_height > 0 && image_width > 0,
-                errors::InvalidArgument("image dimensions must be positive"));
-
     // The shape of 'boxes' is [num_boxes, 4].
     const Tensor& boxes = context->input(1);
-
-    // The shape of 'box_ind' is [num_boxes].
-    const Tensor& box_ind = context->input(2);
-
-    int num_boxes = 0;
-    ParseAndCheckBoxSizes(context, boxes, box_ind, &num_boxes);
-
+    // The shape of 'box_index' is [num_boxes].
+    const Tensor& box_index = context->input(2);
     // The shape of 'crop_size' is [2].
     const Tensor& crop_size = context->input(3);
 
-    OP_REQUIRES(context, crop_size.dims() == 1,
-                errors::InvalidArgument("crop_size must be 1-D",
-                                        crop_size.shape().DebugString()));
-    OP_REQUIRES(context, crop_size.dim_size(0) == 2,
-                errors::InvalidArgument("crop_size must have two elements",
-                                        crop_size.shape().DebugString()));
-
+    // Validate inputs dimensions.
+    OP_REQUIRES_ASYNC(context, image.dims() == 4,
+                      errors::InvalidArgument("input image must be 4-D",
+                                              image.shape().DebugString()),
+                      done);
+    const int batch_size = image.dim_size(0);
+    const int image_height = image.dim_size(1);
+    const int image_width = image.dim_size(2);
+    const int depth = image.dim_size(3);
+    OP_REQUIRES_ASYNC(
+        context, image_height > 0 && image_width > 0,
+        errors::InvalidArgument("image dimensions must be positive"), done);
+    int num_boxes = 0;
+    OP_REQUIRES_OK_ASYNC(
+        context, ParseAndCheckBoxSizes(boxes, box_index, &num_boxes), done);
+
+    OP_REQUIRES_ASYNC(context, crop_size.dims() == 1,
+                      errors::InvalidArgument("crop_size must be 1-D",
+                                              crop_size.shape().DebugString()),
+                      done);
+    OP_REQUIRES_ASYNC(
+        context, crop_size.dim_size(0) == 2,
+        errors::InvalidArgument("crop_size must have two elements",
+                                crop_size.shape().DebugString()),
+        done);
+
+    // Copy and validate crop sizes.
     auto crop_size_vec = crop_size.vec<int32>();
     const int crop_height = internal::SubtleMustCopy(crop_size_vec(0));
     const int crop_width = internal::SubtleMustCopy(crop_size_vec(1));
-    OP_REQUIRES(context, crop_height > 0 && crop_width > 0,
-                errors::InvalidArgument("crop dimensions must be positive"));
+    OP_REQUIRES_ASYNC(
+        context, crop_height > 0 && crop_width > 0,
+        errors::InvalidArgument("crop dimensions must be positive"), done);
 
     // Allocate output tensor.
     Tensor* output = nullptr;
-    OP_REQUIRES_OK(
+    OP_REQUIRES_OK_ASYNC(
         context,
         context->allocate_output(
             0, TensorShape({num_boxes, crop_height, crop_width, depth}),
-            &output));
-
-    typename TTypes<T, 4>::ConstTensor image_data = image.tensor<T, 4>();
-    typename TTypes<float, 2>::ConstTensor boxes_data =
-        boxes.tensor<float, 2>();
-    typename TTypes<int32, 1>::ConstTensor box_ind_data =
-        box_ind.tensor<int32, 1>();
-    typename TTypes<float, 4>::Tensor crops_data = output->tensor<float, 4>();
-
-    CheckValidBoxInd<Device>(context, box_ind_data, batch);
-
-    bool status = functor::CropAndResize<Device, T>()(
-        context->eigen_device<Device>(), image_data, boxes_data, box_ind_data,
-        extrapolation_value_, crops_data);
-    if (!status) {
-      context->SetStatus(
-          errors::Internal("Failed launch CropAndResizeKernel."));
-    }
+            &output),
+        done);
+
+    auto compute_callback = [this, context, output]() {
+      const Tensor& image = context->input(0);
+      const Tensor& boxes = context->input(1);
+      const Tensor& box_index = context->input(2);
+      const bool status = functor::CropAndResize<Device, T>()(
+          context->eigen_device<Device>(), image.tensor<T, 4>(),
+          boxes.tensor<float, 2>(), box_index.tensor<int32, 1>(),
+          extrapolation_value_, output->tensor<float, 4>());
+      if (!status) {
+        context->SetStatus(
+            errors::Internal("Failed launch CropAndResizeKernel."));
+      }
+    };
+
+    RunIfBoxIndexIsValid<Device>(context, box_index.tensor<int32, 1>(),
+                                 batch_size, std::move(compute_callback),
+                                 std::move(done));
   }
 
  private:
@@ -155,10 +195,10 @@ template <typename T>
 struct CropAndResize<CPUDevice, T> {
   bool operator()(const CPUDevice& d, typename TTypes<T, 4>::ConstTensor image,
                   typename TTypes<float, 2>::ConstTensor boxes,
-                  typename TTypes<int32, 1>::ConstTensor box_ind,
+                  typename TTypes<int32, 1>::ConstTensor box_index,
                   float extrapolation_value,
                   typename TTypes<float, 4>::Tensor crops) {
-    const int batch = image.dimension(0);
+    const int batch_size = image.dimension(0);
     const int image_height = image.dimension(1);
     const int image_width = image.dimension(2);
 
@@ -173,8 +213,8 @@ struct CropAndResize<CPUDevice, T> {
       const float y2 = boxes(b, 2);
       const float x2 = boxes(b, 3);
 
-      const int32 b_in = box_ind(b);
-      if (b_in < 0 || b_in >= batch) {
+      const int32 b_in = box_index(b);
+      if (!FastBoundsCheck(b_in, batch_size)) {
         continue;
       }
 
@@ -235,89 +275,94 @@ struct CropAndResize<CPUDevice, T> {
     return true;
   }
 };
+
 }  // namespace functor
 
 template <typename Device, typename T>
-class CropAndResizeGradImageOp : public OpKernel {
+class CropAndResizeGradImageOp : public AsyncOpKernel {
  public:
   explicit CropAndResizeGradImageOp(OpKernelConstruction* context)
-      : OpKernel(context) {
+      : AsyncOpKernel(context) {
     string method;
     OP_REQUIRES_OK(context, context->GetAttr("method", &method));
     OP_REQUIRES(context, method == "bilinear",
                 errors::InvalidArgument("method must be 'bilinear'", method));
   }
 
-  void Compute(OpKernelContext* context) override {
+  void ComputeAsync(OpKernelContext* context, DoneCallback done) override {
     // The shape of 'grads' is [num_boxes, crop_height, crop_width, depth].
     const Tensor& grads = context->input(0);
-
-    OP_REQUIRES(context, grads.dims() == 4,
-                errors::InvalidArgument("grads image must be 4-D",
-                                        grads.shape().DebugString()));
-    const int crop_height = grads.dim_size(1);
-    const int crop_width = grads.dim_size(2);
-    OP_REQUIRES(context, crop_height > 0 && crop_width > 0,
-                errors::InvalidArgument("grads dimensions must be positive"));
-
     // The shape of 'boxes' is [num_boxes, 4].
     const Tensor& boxes = context->input(1);
-
-    // The shape of 'box_ind' is [num_boxes].
-    const Tensor& box_ind = context->input(2);
-
-    int num_boxes = 0;
-    ParseAndCheckBoxSizes(context, boxes, box_ind, &num_boxes);
-
-    OP_REQUIRES(
-        context, grads.dim_size(0) == num_boxes,
-        errors::InvalidArgument("boxes and grads have incompatible shape"));
-
+    // The shape of 'box_index' is [num_boxes].
+    const Tensor& box_index = context->input(2);
     // The shape of 'image_size' is [4].
     const Tensor& image_size = context->input(3);
-    OP_REQUIRES(context, image_size.dims() == 1,
-                errors::InvalidArgument("image_size must be 1-D",
-                                        image_size.shape().DebugString()));
-    OP_REQUIRES(context, image_size.dim_size(0) == 4,
-                errors::InvalidArgument("image_size must have 4 elements",
-                                        image_size.shape().DebugString()));
 
+    // Validate input shapes.
+    OP_REQUIRES_ASYNC(context, grads.dims() == 4,
+                      errors::InvalidArgument("grads image must be 4-D",
+                                              grads.shape().DebugString()),
+                      done);
+    const int crop_height = grads.dim_size(1);
+    const int crop_width = grads.dim_size(2);
+    OP_REQUIRES_ASYNC(
+        context, crop_height > 0 && crop_width > 0,
+        errors::InvalidArgument("grads dimensions must be positive"), done);
+    int num_boxes = 0;
+    OP_REQUIRES_OK_ASYNC(
+        context, ParseAndCheckBoxSizes(boxes, box_index, &num_boxes), done);
+    OP_REQUIRES_ASYNC(
+        context, grads.dim_size(0) == num_boxes,
+        errors::InvalidArgument("boxes and grads have incompatible shape"),
+        done);
+
+    OP_REQUIRES_ASYNC(context, image_size.dims() == 1,
+                      errors::InvalidArgument("image_size must be 1-D",
+                                              image_size.shape().DebugString()),
+                      done);
+    OP_REQUIRES_ASYNC(context, image_size.dim_size(0) == 4,
+                      errors::InvalidArgument("image_size must have 4 elements",
+                                              image_size.shape().DebugString()),
+                      done);
     auto image_size_vec = image_size.vec<int32>();
-    const int batch = internal::SubtleMustCopy(image_size_vec(0));
+    const int batch_size = internal::SubtleMustCopy(image_size_vec(0));
     const int image_height = internal::SubtleMustCopy(image_size_vec(1));
     const int image_width = internal::SubtleMustCopy(image_size_vec(2));
     const int depth = internal::SubtleMustCopy(image_size_vec(3));
-
-    OP_REQUIRES(context, image_height > 0 && image_width > 0,
-                errors::InvalidArgument("image dimensions must be positive"));
-    OP_REQUIRES(
+    OP_REQUIRES_ASYNC(
+        context, image_height > 0 && image_width > 0,
+        errors::InvalidArgument("image dimensions must be positive"), done);
+    OP_REQUIRES_ASYNC(
         context, grads.dim_size(3) == depth,
-        errors::InvalidArgument("image_size and grads are incompatible"));
+        errors::InvalidArgument("image_size and grads are incompatible"), done);
 
     // Allocate output tensor.
     Tensor* output = nullptr;
-    OP_REQUIRES_OK(
-        context, context->allocate_output(
-                     0, TensorShape({batch, image_height, image_width, depth}),
-                     &output));
-
-    typename TTypes<float, 4>::ConstTensor grads_data =
-        grads.tensor<float, 4>();
-    typename TTypes<float, 2>::ConstTensor boxes_data =
-        boxes.tensor<float, 2>();
-    typename TTypes<int32, 1>::ConstTensor box_ind_data =
-        box_ind.tensor<int32, 1>();
-    typename TTypes<T, 4>::Tensor output_data = output->tensor<T, 4>();
-
-    CheckValidBoxInd<Device>(context, box_ind_data, batch);
-
-    bool status = functor::CropAndResizeBackpropImage<Device, T>()(
-        context->eigen_device<Device>(), grads_data, boxes_data, box_ind_data,
-        output_data);
-    if (!status) {
-      context->SetStatus(
-          errors::Internal("Failed launch CropAndResizeBackpropImageKernel."));
-    }
+    OP_REQUIRES_OK_ASYNC(
+        context,
+        context->allocate_output(
+            0, TensorShape({batch_size, image_height, image_width, depth}),
+            &output),
+        done);
+
+    auto compute_callback = [context, output]() {
+      const Tensor& grads = context->input(0);
+      const Tensor& boxes = context->input(1);
+      const Tensor& box_index = context->input(2);
+      const bool status = functor::CropAndResizeBackpropImage<Device, T>()(
+          context->eigen_device<Device>(), grads.tensor<float, 4>(),
+          boxes.tensor<float, 2>(), box_index.tensor<int32, 1>(),
+          output->tensor<T, 4>());
+      if (!status) {
+        context->SetStatus(errors::Internal(
+            "Failed launch CropAndResizeBackpropImage kernel."));
+      }
+    };
+
+    RunIfBoxIndexIsValid<Device>(context, box_index.tensor<int32, 1>(),
+                                 batch_size, std::move(compute_callback),
+                                 std::move(done));
   }
 };
 
@@ -328,9 +373,9 @@ struct CropAndResizeBackpropImage<CPUDevice, T> {
   bool operator()(const CPUDevice& d,
                   typename TTypes<float, 4>::ConstTensor grads,
                   typename TTypes<float, 2>::ConstTensor boxes,
-                  typename TTypes<int32, 1>::ConstTensor box_ind,
+                  typename TTypes<int32, 1>::ConstTensor box_index,
                   typename TTypes<T, 4>::Tensor grads_image) {
-    const int batch = grads_image.dimension(0);
+    const int batch_size = grads_image.dimension(0);
     const int image_height = grads_image.dimension(1);
     const int image_width = grads_image.dimension(2);
 
@@ -347,8 +392,8 @@ struct CropAndResizeBackpropImage<CPUDevice, T> {
       const float y2 = boxes(b, 2);
       const float x2 = boxes(b, 3);
 
-      const int32 b_in = box_ind(b);
-      if (b_in < 0 || b_in >= batch) {
+      const int32 b_in = box_index(b);
+      if (!FastBoundsCheck(b_in, batch_size)) {
         continue;
       }
 
@@ -399,83 +444,90 @@ struct CropAndResizeBackpropImage<CPUDevice, T> {
     return true;
   }
 };
+
 }  // namespace functor
 
 template <typename Device, typename T>
-class CropAndResizeGradBoxesOp : public OpKernel {
+class CropAndResizeGradBoxesOp : public AsyncOpKernel {
  public:
   explicit CropAndResizeGradBoxesOp(OpKernelConstruction* context)
-      : OpKernel(context) {
+      : AsyncOpKernel(context) {
     string method;
     OP_REQUIRES_OK(context, context->GetAttr("method", &method));
     OP_REQUIRES(context, method == "bilinear",
                 errors::InvalidArgument("method must be 'bilinear'", method));
   }
 
-  void Compute(OpKernelContext* context) override {
+  void ComputeAsync(OpKernelContext* context, DoneCallback done) override {
     // The shape of 'grads' is [num_boxes, crop_height, crop_width, depth].
     const Tensor& grads = context->input(0);
+    // The shape of 'boxes' is [num_boxes, 4].
+    const Tensor& boxes = context->input(2);
+    // The shape of 'box_index' is [num_boxes].
+    const Tensor& box_index = context->input(3);
+    // The shape of 'image' is [batch_size, image_height, image_width, depth].
+    const Tensor& image = context->input(1);
 
-    OP_REQUIRES(context, grads.dims() == 4,
-                errors::InvalidArgument("grads image must be 4-D",
-                                        grads.shape().DebugString()));
-
+    // Validate input shapes.
+    OP_REQUIRES_ASYNC(context, grads.dims() == 4,
+                      errors::InvalidArgument("grads image must be 4-D",
+                                              grads.shape().DebugString()),
+                      done);
     const int crop_height = grads.dim_size(1);
     const int crop_width = grads.dim_size(2);
     const int depth = grads.dim_size(3);
-    OP_REQUIRES(context, crop_height > 0 && crop_width > 0,
-                errors::InvalidArgument("grads dimensions must be positive"));
-
-    // The shape of 'image' is [batch, image_height, image_width, depth].
-    const Tensor& image = context->input(1);
-    OP_REQUIRES(context, image.dims() == 4,
-                errors::InvalidArgument("input image must be 4-D",
-                                        image.shape().DebugString()));
-
-    const int batch = image.dim_size(0);
+    OP_REQUIRES_ASYNC(
+        context, crop_height > 0 && crop_width > 0,
+        errors::InvalidArgument("grads dimensions must be positive"), done);
+
+    OP_REQUIRES_ASYNC(context, image.dims() == 4,
+                      errors::InvalidArgument("input image must be 4-D",
+                                              image.shape().DebugString()),
+                      done);
+    const int batch_size = image.dim_size(0);
     const int image_height = image.dim_size(1);
     const int image_width = image.dim_size(2);
-    OP_REQUIRES(context, image_height > 0 && image_width > 0,
-                errors::InvalidArgument("image dimensions must be positive"));
-    OP_REQUIRES(context, image.dim_size(3) == depth,
-                errors::InvalidArgument("image, grads depth differ"));
-
-    // The shape of 'boxes' is [num_boxes, 4].
-    const Tensor& boxes = context->input(2);
-
-    // The shape of 'box_ind' is [num_boxes].
-    const Tensor& box_ind = context->input(3);
+    OP_REQUIRES_ASYNC(
+        context, image_height > 0 && image_width > 0,
+        errors::InvalidArgument("image dimensions must be positive"), done);
+    OP_REQUIRES_ASYNC(context, image.dim_size(3) == depth,
+                      errors::InvalidArgument("image, grads depth differ"),
+                      done);
 
     int num_boxes = 0;
-    ParseAndCheckBoxSizes(context, boxes, box_ind, &num_boxes);
+    OP_REQUIRES_OK_ASYNC(
+        context, ParseAndCheckBoxSizes(boxes, box_index, &num_boxes), done);
 
-    OP_REQUIRES(
+    OP_REQUIRES_ASYNC(
         context, grads.dim_size(0) == num_boxes,
-        errors::InvalidArgument("boxes and grads have incompatible shape"));
+        errors::InvalidArgument("boxes and grads have incompatible shape"),
+        done);
 
     // Allocate output tensor.
     Tensor* output = nullptr;
-    OP_REQUIRES_OK(context, context->allocate_output(
-                                0, TensorShape({num_boxes, 4}), &output));
-
-    typename TTypes<float, 4>::ConstTensor grads_data =
-        grads.tensor<float, 4>();
-    typename TTypes<T, 4>::ConstTensor image_data = image.tensor<T, 4>();
-    typename TTypes<float, 2>::ConstTensor boxes_data =
-        boxes.tensor<float, 2>();
-    typename TTypes<int32, 1>::ConstTensor box_ind_data =
-        box_ind.tensor<int32, 1>();
-    typename TTypes<float, 2>::Tensor output_data = output->tensor<float, 2>();
-
-    CheckValidBoxInd<Device>(context, box_ind_data, batch);
-
-    bool status = functor::CropAndResizeBackpropBoxes<Device, T>()(
-        context->eigen_device<Device>(), grads_data, image_data, boxes_data,
-        box_ind_data, output_data);
-    if (!status) {
-      context->SetStatus(
-          errors::Internal("Failed launch CropAndResizeBackpropBoxesKernel."));
-    }
+    OP_REQUIRES_OK_ASYNC(
+        context,
+        context->allocate_output(0, TensorShape({num_boxes, 4}), &output),
+        done);
+
+    auto compute_callback = [context, output]() {
+      const Tensor& grads = context->input(0);
+      const Tensor& image = context->input(1);
+      const Tensor& boxes = context->input(2);
+      const Tensor& box_index = context->input(3);
+      const bool status = functor::CropAndResizeBackpropBoxes<Device, T>()(
+          context->eigen_device<Device>(), grads.tensor<float, 4>(),
+          image.tensor<T, 4>(), boxes.tensor<float, 2>(),
+          box_index.tensor<int32, 1>(), output->tensor<float, 2>());
+      if (!status) {
+        context->SetStatus(errors::Internal(
+            "Failed launch CropAndResizeBackpropBoxes kernel."));
+      }
+    };
+
+    RunIfBoxIndexIsValid<Device>(context, box_index.tensor<int32, 1>(),
+                                 batch_size, std::move(compute_callback),
+                                 std::move(done));
   }
 };
 
@@ -487,9 +539,9 @@ struct CropAndResizeBackpropBoxes<CPUDevice, T> {
                   typename TTypes<float, 4>::ConstTensor grads,
                   typename TTypes<T, 4>::ConstTensor image,
                   typename TTypes<float, 2>::ConstTensor boxes,
-                  typename TTypes<int32, 1>::ConstTensor box_ind,
+                  typename TTypes<int32, 1>::ConstTensor box_index,
                   typename TTypes<float, 2>::Tensor grads_boxes) {
-    const int batch = image.dimension(0);
+    const int batch_size = image.dimension(0);
     const int image_height = image.dimension(1);
     const int image_width = image.dimension(2);
 
@@ -506,8 +558,8 @@ struct CropAndResizeBackpropBoxes<CPUDevice, T> {
       const float y2 = boxes(b, 2);
       const float x2 = boxes(b, 3);
 
-      const int32 b_in = box_ind(b);
-      if (b_in < 0 || b_in >= batch) {
+      const int32 b_in = box_index(b);
+      if (!FastBoundsCheck(b_in, batch_size)) {
         continue;
       }
 
@@ -589,30 +641,19 @@ struct CropAndResizeBackpropBoxes<CPUDevice, T> {
     return true;
   }
 };
-}  // namespace functor
 
-// Specialization of CheckValidBoxInd for a CPUDevice.
-template <>
-inline void CheckValidBoxInd<CPUDevice>(
-    OpKernelContext* context, typename TTypes<int32, 1>::ConstTensor box_ind,
-    int batch) {
-  const int num_boxes = box_ind.dimension(0);
-  for (int b = 0; b < num_boxes; ++b) {
-    OP_REQUIRES(context, box_ind(b) >= 0 && box_ind(b) < batch,
-                errors::OutOfRange("box_ind has values outside [0, batch)"));
-  }
-}
+}  // namespace functor
 
-#define REGISTER_KERNEL(T)                                         \
-  REGISTER_KERNEL_BUILDER(Name("CropAndResize")                    \
-                              .Device(DEVICE_CPU)                  \
-                              .TypeConstraint<T>("T")              \
-                              .HostMemory("crop_size"),            \
-                          CropAndResizeOp<CPUDevice, T>);          \
-                                                                   \
-  REGISTER_KERNEL_BUILDER(Name("CropAndResizeGradBoxes")           \
-                              .Device(DEVICE_CPU)                  \
-                              .TypeConstraint<T>("T"),             \
+#define REGISTER_KERNEL(T)                                \
+  REGISTER_KERNEL_BUILDER(Name("CropAndResize")           \
+                              .Device(DEVICE_CPU)         \
+                              .TypeConstraint<T>("T")     \
+                              .HostMemory("crop_size"),   \
+                          CropAndResizeOp<CPUDevice, T>); \
+                                                          \
+  REGISTER_KERNEL_BUILDER(Name("CropAndResizeGradBoxes")  \
+                              .Device(DEVICE_CPU)         \
+                              .TypeConstraint<T>("T"),    \
                           CropAndResizeGradBoxesOp<CPUDevice, T>);
 
 TF_CALL_REAL_NUMBER_TYPES(REGISTER_KERNEL);
@@ -634,50 +675,89 @@ TF_CALL_double(REGISTER_KERNEL);
 
 #if GOOGLE_CUDA
 
-// Forward declaration of the CheckValidBoxIndHelper specialization for GPU.
+// Forward declaration of the CheckValidBoxIndexHelper specialization for GPU.
 namespace functor {
 template <>
-void CheckValidBoxIndHelper<GPUDevice>::operator()(
-    const GPUDevice& d, typename TTypes<int32, 1>::ConstTensor box_ind,
-    int batch, typename TTypes<bool, 0>::Tensor isvalid);
-extern template struct CheckValidBoxIndHelper<GPUDevice>;
+void CheckValidBoxIndexHelper<GPUDevice>::operator()(
+    const GPUDevice& d, typename TTypes<int32, 1>::ConstTensor box_index,
+    int batch_size, typename TTypes<bool, 0>::Tensor isvalid);
+extern template struct CheckValidBoxIndexHelper<GPUDevice>;
 }  // namespace functor
 
-// Specialization of CheckValidBoxInd for a GPUDevice.
+namespace {
+
+// Specialization of CheckValidBoxIndex for a GPUDevice.
 template <>
-inline void CheckValidBoxInd<GPUDevice>(
-    OpKernelContext* context, typename TTypes<int32, 1>::ConstTensor box_ind,
-    int batch) {
-  const int num_boxes = box_ind.dimension(0);
+inline void RunIfBoxIndexIsValid<GPUDevice>(
+    OpKernelContext* context, typename TTypes<int32, 1>::ConstTensor box_index,
+    int batch_size, const Callback& compute, const Callback& done) {
+  const int num_boxes = box_index.dimension(0);
   if (num_boxes == 0) {
+    compute();
+    done();
     return;
   }
-  Tensor isvalid_tensor;
-  OP_REQUIRES_OK(context,
-                 context->allocate_temp(DataTypeToEnum<bool>::value,
-                                        TensorShape({}), &isvalid_tensor));
 
-  typename TTypes<bool, 0>::Tensor isvalid = isvalid_tensor.tensor<bool, 0>();
+  Tensor isvalid_dev_tensor;
+  OP_REQUIRES_OK_ASYNC(
+      context,
+      context->allocate_temp(DataTypeToEnum<bool>::value, TensorShape({}),
+                             &isvalid_dev_tensor),
+      done);
+  typename TTypes<bool, 0>::Tensor isvalid_dev =
+      isvalid_dev_tensor.tensor<bool, 0>();
 
-  functor::CheckValidBoxIndHelper<GPUDevice>()(
-      context->eigen_device<GPUDevice>(), box_ind, batch, isvalid);
+  // Run the actual box check on the device.
+  functor::CheckValidBoxIndexHelper<GPUDevice>()(
+      context->eigen_device<GPUDevice>(), box_index, batch_size, isvalid_dev);
 
+  // Copy the result back to the host.
   auto* stream = context->op_device_context()->stream();
-  OP_REQUIRES(context, stream, errors::Internal("No GPU stream available."));
-
-  bool isvalid_host = false;
-  perftools::gputools::DeviceMemoryBase isvalid_gpu(isvalid.data(),
-                                                    sizeof(bool));
-  stream->ThenMemcpy(&isvalid_host, isvalid_gpu, sizeof(bool));
-  stream->BlockHostUntilDone();
-
-  OP_REQUIRES(context, stream->ok(),
-              errors::Internal("cudaMemcpy from device to host failed"));
-
-  OP_REQUIRES(context, isvalid_host,
-              errors::OutOfRange("box_ind has values outside [0, batch)"));
+  OP_REQUIRES_ASYNC(context, stream,
+                    errors::Internal("No GPU stream available."), done);
+  Tensor isvalid_host_tensor;
+  // Use pinned host memory on the host to avoid unnecessary
+  // synchronization.
+  AllocatorAttributes alloc_attr;
+  alloc_attr.set_on_host(true);
+  alloc_attr.set_gpu_compatible(true);
+  OP_REQUIRES_OK_ASYNC(
+      context,
+      context->allocate_temp(DataTypeToEnum<bool>::value, TensorShape({}),
+                             &isvalid_host_tensor, alloc_attr),
+      done);
+  perftools::gputools::DeviceMemoryBase wrapped(isvalid_dev.data(),
+                                                sizeof(bool));
+  const bool status =
+      stream
+          ->ThenMemcpy(
+              isvalid_host_tensor.scalar<bool>().data() /* destination */,
+              wrapped /* source */, sizeof(bool))
+          .ok();
+  OP_REQUIRES_ASYNC(
+      context, status,
+      errors::Internal("Failed to launch copy of isvalid from device to host."),
+      done);
+
+  // We capture both temporary tensors to prevent them from being deallocated
+  // when ComputeAsync returns and before the closure runs.
+  auto wrapped_callback = [context, isvalid_host_tensor, isvalid_dev_tensor,
+                           compute, done]() {
+    const bool isvalid = isvalid_host_tensor.scalar<bool>()();
+    OP_REQUIRES_ASYNC(
+        context, isvalid,
+        errors::OutOfRange("box_index has values outside [0, batch_size)"),
+        done);
+    compute();
+    done();
+  };
+
+  context->device()->tensorflow_gpu_device_info()->event_mgr->ThenExecute(
+      stream, wrapped_callback);
 }
 
+}  // namespace
+
 #define REGISTER_KERNEL(T)                                         \
   REGISTER_KERNEL_BUILDER(Name("CropAndResize")                    \
                               .Device(DEVICE_GPU)                  \
diff --git a/tensorflow/core/kernels/crop_and_resize_op.h b/tensorflow/core/kernels/crop_and_resize_op.h
index 22df1bdd56bd0ef1d610fcb684c1987e0e32ed98..460dbad22b484f7df8cd10221f183df75ecffb55 100644
--- a/tensorflow/core/kernels/crop_and_resize_op.h
+++ b/tensorflow/core/kernels/crop_and_resize_op.h
@@ -53,12 +53,12 @@ struct CropAndResizeBackpropBoxes {
 };
 
 template <typename Device>
-struct CheckValidBoxIndHelper {
-  // Checks if all values in box_ind are in [0, batch).
+struct CheckValidBoxIndexHelper {
+  // Checks if all values in box_index are in [0, batch).
   void operator()(const Device& d,
-                  typename TTypes<int32, 1>::ConstTensor box_ind, int batch,
+                  typename TTypes<int32, 1>::ConstTensor box_index, int batch,
                   typename TTypes<bool, 0>::Tensor isvalid) {
-    isvalid.device(d) = ((box_ind >= 0) && (box_ind < batch)).all();
+    isvalid.device(d) = ((box_index >= 0) && (box_index < batch)).all();
   }
 };
 
diff --git a/tensorflow/core/kernels/crop_and_resize_op_gpu.cu.cc b/tensorflow/core/kernels/crop_and_resize_op_gpu.cu.cc
index 254475db4653242264a34d94aec61e3857b1f116..c1235fda89216fb535b51170dd4967fd5eddd7f0 100644
--- a/tensorflow/core/kernels/crop_and_resize_op_gpu.cu.cc
+++ b/tensorflow/core/kernels/crop_and_resize_op_gpu.cu.cc
@@ -440,7 +440,7 @@ TF_CALL_GPU_NUMBER_TYPES(DEFINE_GPU_SPECS);
 
 #undef DEFINE_GPU_SPECS
 
-template struct CheckValidBoxIndHelper<GPUDevice>;
+template struct CheckValidBoxIndexHelper<GPUDevice>;
 
 }  // namespace functor
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/crop_and_resize_op_test.cc b/tensorflow/core/kernels/crop_and_resize_op_test.cc
index 3a7f180598e2936b7dabbbb468533d962ad858ba..d6139dae966812261f6d59158ba807bbdfe40283 100644
--- a/tensorflow/core/kernels/crop_and_resize_op_test.cc
+++ b/tensorflow/core/kernels/crop_and_resize_op_test.cc
@@ -251,7 +251,7 @@ TEST_F(CropAndResizeOpTest, TestInvalidBoxIndexShape) {
   Status s = RunOpKernel();
   ASSERT_FALSE(s.ok());
   EXPECT_TRUE(
-      StringPiece(s.ToString()).contains("box_ind has incompatible shape"))
+      StringPiece(s.ToString()).contains("box_index has incompatible shape"))
       << s;
 }
 
@@ -264,8 +264,10 @@ TEST_F(CropAndResizeOpTest, TestInvalidBoxIndex) {
   Status s = RunOpKernel();
   ASSERT_FALSE(s.ok());
   EXPECT_TRUE(StringPiece(s.ToString())
-                  .contains("box_ind has values outside [0, batch)"))
+                  .contains("box_index has values outside [0, batch_size)"))
       << s;
 }
 
+// TODO(zhengxq, rmlarsen): Add a benchmark.
+
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/ctc_loss_op.cc b/tensorflow/core/kernels/ctc_loss_op.cc
index 05d0169b112d1aa399fff8106d723e61dbfccf30..426382edeca7086676201cc86acd3b718c4bcb13 100644
--- a/tensorflow/core/kernels/ctc_loss_op.cc
+++ b/tensorflow/core/kernels/ctc_loss_op.cc
@@ -42,6 +42,8 @@ class CTCLossOp : public OpKernel {
                                      &preprocess_collapse_repeated_));
     OP_REQUIRES_OK(ctx,
                    ctx->GetAttr("ctc_merge_repeated", &ctc_merge_repeated_));
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("ignore_longer_outputs_than_inputs",
+                                     &ignore_longer_outputs_than_inputs_));
   }
 
   void Compute(OpKernelContext* ctx) override {
@@ -150,12 +152,15 @@ class CTCLossOp : public OpKernel {
     OP_REQUIRES_OK(ctx, ctc_loss_calculator.CalculateLoss(
                             seq_len_t, labels_t, input_list_t,
                             preprocess_collapse_repeated_, ctc_merge_repeated_,
-                            &loss_t, &gradient_list_t, &workers));
+                            ignore_longer_outputs_than_inputs_, &loss_t,
+                            &gradient_list_t, &workers));
   }
 
  private:
   bool preprocess_collapse_repeated_;
   bool ctc_merge_repeated_;
+  bool ignore_longer_outputs_than_inputs_;
+
   TF_DISALLOW_COPY_AND_ASSIGN(CTCLossOp);
 };
 
diff --git a/tensorflow/core/kernels/cuda_solvers.cc b/tensorflow/core/kernels/cuda_solvers.cc
index 3e59fb0a5ab1828665d93deb7731b44042d11969..b2d1a53af4b0cec297f94398d921a3e0c3126ac6 100644
--- a/tensorflow/core/kernels/cuda_solvers.cc
+++ b/tensorflow/core/kernels/cuda_solvers.cc
@@ -18,6 +18,8 @@ limitations under the License.
 
 #include <chrono>
 #include <complex>
+#include <unordered_map>
+#include <vector>
 
 #include "cuda/include/cublas_v2.h"
 #include "cuda/include/cusolverDn.h"
@@ -26,124 +28,173 @@ limitations under the License.
 #include "tensorflow/core/framework/types.h"
 #include "tensorflow/core/lib/core/blocking_counter.h"
 #include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/lib/core/stringpiece.h"
+#include "tensorflow/core/lib/gtl/inlined_vector.h"
+#include "tensorflow/core/platform/mutex.h"
 #include "tensorflow/core/platform/stream_executor.h"
+#include "tensorflow/core/platform/types.h"
 
 namespace tensorflow {
 namespace {
 
-template <typename Scalar>
-class ScratchSpace {
- public:
-  explicit ScratchSpace(OpKernelContext* context, int size) {
-    TF_CHECK_OK(context->allocate_temp(DataTypeToEnum<Scalar>::value,
-                                       TensorShape({size}), &scratch_tensor_));
-  }
-  Scalar* data() { return scratch_tensor_.template flat<Scalar>().data(); }
-
- private:
-  Tensor scratch_tensor_;
-};
+inline bool CopyHostToDevice(OpKernelContext* context, void* dst,
+                             const void* src, uint64 bytes) {
+  auto stream = context->op_device_context()->stream();
+  perftools::gputools::DeviceMemoryBase wrapped_dst(dst);
+  return stream->ThenMemcpy(&wrapped_dst, src, bytes).ok();
+}
 
 // Type traits to get CUDA complex types from std::complex<>.
-
 template <typename T>
 struct CUDAComplexT {
   typedef T type;
 };
-
 template <>
 struct CUDAComplexT<std::complex<float>> {
   typedef cuComplex type;
 };
-
 template <>
 struct CUDAComplexT<std::complex<double>> {
   typedef cuDoubleComplex type;
 };
-
 // Converts pointers of std::complex<> to pointers of
 // cuComplex/cuDoubleComplex. No type conversion for non-complex types.
-
 template <typename T>
 inline const typename CUDAComplexT<T>::type* CUDAComplex(const T* p) {
   return reinterpret_cast<const typename CUDAComplexT<T>::type*>(p);
 }
-
 template <typename T>
 inline typename CUDAComplexT<T>::type* CUDAComplex(T* p) {
   return reinterpret_cast<typename CUDAComplexT<T>::type*>(p);
 }
 
-// Converts values of std::complex<float/double> to values of
-// cuComplex/cuDoubleComplex.
-inline cuComplex CUDAComplexValue(std::complex<float> val) {
-  return {val.real(), val.imag()};
-}
+// A set of initialized handles to the underlying Cuda libraries used by
+// CudaSolver. We maintain one such set of handles per unique stream.
+struct CudaSolverHandles {
+  explicit CudaSolverHandles(cudaStream_t stream) {
+    CHECK(cusolverDnCreate(&cusolver_dn_handle) == CUSOLVER_STATUS_SUCCESS)
+        << "Failed to create cuSolverDN instance.";
+    CHECK(cusolverDnSetStream(cusolver_dn_handle, stream) ==
+          CUSOLVER_STATUS_SUCCESS)
+        << "Failed to set cuSolverDN stream.";
+    CHECK(cublasCreate(&cublas_handle) == CUBLAS_STATUS_SUCCESS)
+        << "Failed to create cuBlas instance.";
+    CHECK(cublasSetStream(cublas_handle, stream) == CUBLAS_STATUS_SUCCESS)
+        << "Failed to set cuBlas stream.";
+  }
 
-inline cuDoubleComplex CUDAComplexValue(std::complex<double> val) {
-  return {val.real(), val.imag()};
+  ~CudaSolverHandles() {
+    CHECK(cublasDestroy(cublas_handle) == CUBLAS_STATUS_SUCCESS)
+        << "Failed to destroy cuBlas instance.";
+    CHECK(cusolverDnDestroy(cusolver_dn_handle) == CUSOLVER_STATUS_SUCCESS)
+        << "Failed to destroy cuSolverDN instance.";
+  }
+  cublasHandle_t cublas_handle;
+  cusolverDnHandle_t cusolver_dn_handle;
+};
+
+static mutex handle_map_mutex(LINKER_INITIALIZED);
+
+using HandleMap =
+    std::unordered_map<cudaStream_t, std::unique_ptr<CudaSolverHandles>>;
+
+// Returns a singleton map used for storing initialized handles for each unique
+// cuda stream.
+HandleMap* GetHandleMapSingleton() {
+  static HandleMap* cm = new HandleMap;
+  return cm;
 }
+
 }  // namespace
 
-#define TF_RETURN_IF_CUSOLVER_ERROR_MSG(expr, msg)             \
-  do {                                                         \
-    auto status = (expr);                                      \
-    if (TF_PREDICT_FALSE(status != CUSOLVER_STATUS_SUCCESS)) { \
-      return errors::Internal(msg);                            \
-    }                                                          \
+#define TF_RETURN_IF_CUSOLVER_ERROR(expr)                                      \
+  do {                                                                         \
+    auto status = (expr);                                                      \
+    if (TF_PREDICT_FALSE(status != CUSOLVER_STATUS_SUCCESS)) {                 \
+      return errors::Internal("cuSolverDN call failed with status =", status); \
+    }                                                                          \
   } while (0)
 
-#define TF_RETURN_IF_CUSOLVER_ERROR(expr) \
-  TF_RETURN_IF_CUSOLVER_ERROR_MSG(expr, "cuSolverDN call failed.")
-
-#define TF_RETURN_STATUS_FROM_INFO(method, device_info_ptr, info_ptr)     \
-  do {                                                                    \
-    int local_info;                                                       \
-    TF_RETURN_IF_ERROR(GetInfo(device_info_ptr, &local_info));            \
-    if (info_ptr != nullptr) *info_ptr = local_info;                      \
-    if (TF_PREDICT_FALSE(local_info != 0)) {                              \
-      return errors::Internal("cuSolverDN::" #method " returned info = ", \
-                              local_info, ", expected info = 0");         \
-    } else {                                                              \
-      return Status::OK();                                                \
-    }                                                                     \
+#define TF_RETURN_IF_CUBLAS_ERROR(expr)                                \
+  do {                                                                 \
+    auto status = (expr);                                              \
+    if (TF_PREDICT_FALSE(status != CUBLAS_STATUS_SUCCESS)) {           \
+      return errors::Internal("cuBlas call failed status = ", status); \
+    }                                                                  \
   } while (0)
 
-CudaSolverDN::CudaSolverDN(OpKernelContext* context) : context_(context) {
+CudaSolver::CudaSolver(OpKernelContext* context) : context_(context) {
   const cudaStream_t* cu_stream_ptr = CHECK_NOTNULL(
       reinterpret_cast<const cudaStream_t*>(context->op_device_context()
                                                 ->stream()
                                                 ->implementation()
                                                 ->CudaStreamMemberHack()));
   cuda_stream_ = *cu_stream_ptr;
-  CHECK(cusolverDnCreate(&handle_) == CUSOLVER_STATUS_SUCCESS)
-      << "Failed to create cuSolverDN instance.";
-  CHECK(cusolverDnSetStream(handle_, cuda_stream_) == CUSOLVER_STATUS_SUCCESS)
-      << "Failed to set cuSolverDN stream.";
+  HandleMap* handle_map = CHECK_NOTNULL(GetHandleMapSingleton());
+  mutex_lock lock(handle_map_mutex);
+  auto it = handle_map->find(cuda_stream_);
+  if (it == handle_map->end()) {
+    LOG(INFO) << "Creating CudaSolver handles for stream " << cuda_stream_;
+    // Previously unseen Cuda stream. Initialize a set of Cuda solver library
+    // handles for it.
+    std::unique_ptr<CudaSolverHandles> new_handles(
+        new CudaSolverHandles(cuda_stream_));
+    it =
+        handle_map->insert(std::make_pair(cuda_stream_, std::move(new_handles)))
+            .first;
+  }
+  cusolver_dn_handle_ = it->second->cusolver_dn_handle;
+  cublas_handle_ = it->second->cublas_handle;
 }
 
-CudaSolverDN::~CudaSolverDN() {
-  CHECK(cusolverDnDestroy(handle_) == CUSOLVER_STATUS_SUCCESS)
-      << "Failed to destroy cuSolverDN instance.";
-}
+Status CudaSolver::CopyLapackInfoToHostAsync(
+    const std::vector<DeviceLapackInfo>& dev_lapack_infos,
+    std::function<void(const Status&, const std::vector<HostLapackInfo>&)>
+        info_checker_callback) const {
+  std::vector<HostLapackInfo> host_lapack_infos;
+  if (dev_lapack_infos.empty()) {
+    info_checker_callback(Status::OK(), std::move(host_lapack_infos));
+    return Status::OK();
+  }
 
-Status CudaSolverDN::GetInfo(const int* dev_info, int* host_info) const {
-  CHECK(dev_info != nullptr);
-  CHECK(host_info != nullptr);
-  auto stream = context_->op_device_context()->stream();
-  perftools::gputools::DeviceMemoryBase wrapped(const_cast<int*>(dev_info));
-  if (!stream
-           ->ThenMemcpy(host_info /* destination */, wrapped /* source */,
-                        sizeof(int))
-           .ok()) {
-    return errors::Internal("Failed to copy dev_info to host.");
+  // Launch memcpys to copy info back from the device to the host.
+  for (const auto& dev_lapack_info : dev_lapack_infos) {
+    bool success = true;
+    auto host_copy = dev_lapack_info.CopyToHost(&success);
+    if (!success) {
+      return errors::Internal(
+          "Failed to launch copy of dev_lapack_info to host, debug_info = ",
+          dev_lapack_info.debug_info());
+    }
+    host_lapack_infos.push_back(std::move(host_copy));
   }
-  BlockingCounter barrier(1);
+
+  // This callback checks that all batch items in all calls were processed
+  // successfully and passes status to the info_checker_callback accordingly.
+  auto wrapped_info_checker_callback =
+      [info_checker_callback](std::vector<HostLapackInfo> host_lapack_infos) {
+        Status status;
+        for (auto host_lapack_info : host_lapack_infos) {
+          for (int i = 0; i < host_lapack_info.size() && status.ok(); ++i) {
+            const int info_value = (host_lapack_info.data())[i];
+            if (info_value != 0) {
+              status = errors::InvalidArgument(
+                  "Got info = ", info_value, " for batch index ", i,
+                  ", expected info = 0. Debug_info =",
+                  host_lapack_info.debug_info());
+            }
+          }
+          if (!status.ok()) {
+            break;
+          }
+        }
+        info_checker_callback(status, host_lapack_infos);
+      };
+  auto cb =
+      std::bind(wrapped_info_checker_callback, std::move(host_lapack_infos));
+  auto stream = context_->op_device_context()->stream();
   context_->device()->tensorflow_gpu_device_info()->event_mgr->ThenExecute(
-      stream, [&barrier]() { barrier.DecrementCount(); });
-  if (!barrier.WaitFor(std::chrono::minutes(1))) {
-    return errors::Internal("Failed to copy dev_info to host within 1 minute.");
-  }
+      stream, std::move(cb));
   return Status::OK();
 }
 
@@ -152,39 +203,134 @@ Status CudaSolverDN::GetInfo(const int* dev_info, int* host_info) const {
 #define TF_CALL_LAPACK_TYPES(m) \
   m(float, S) m(double, D) m(std::complex<float>, C) m(std::complex<double>, Z)
 
-// Macros to construct cusolver method names.
-#define SOLVER_NAME(method, lapack_prefix) cusolverDn##lapack_prefix##method
-#define BUFSIZE_NAME(method, lapack_prefix) \
+// Macros to construct cusolverDn method names.
+#define DN_SOLVER_FN(method, lapack_prefix) cusolverDn##lapack_prefix##method
+#define DN_SOLVER_NAME(method, lapack_prefix) \
+  "cusolverDn" #lapack_prefix #method
+#define DN_BUFSIZE_FN(method, lapack_prefix) \
   cusolverDn##lapack_prefix##method##_bufferSize
 
+// Macros to construct cublas method names.
+#define BLAS_SOLVER_FN(method, lapack_prefix) cublas##lapack_prefix##method
+#define BLAS_SOLVER_NAME(method, lapack_prefix) "cublas" #lapack_prefix #method
+
 //=============================================================================
 // Wrappers of cuSolverDN computational methods begin here.
+//
+// WARNING to implementers: The function signatures listed in the online docs
+// are sometimes inaccurate, e.g., are missing 'const' on pointers
+// to immutable arguments, while the actual headers have them as expected.
+// Check the actual declarations in the cusolver_api.h header file.
 //=============================================================================
-#define POTRF_INSTANCE(Scalar, lapack_prefix)                                 \
-  template <>                                                                 \
-  Status CudaSolverDN::potrf<Scalar>(cublasFillMode_t uplo, int n, Scalar* A, \
-                                     int lda, int* info) const {              \
-    /* Get amount of workspace memory required. */                            \
-    int lwork;                                                                \
-    TF_RETURN_IF_CUSOLVER_ERROR(BUFSIZE_NAME(potrf, lapack_prefix)(           \
-        handle_, uplo, n, CUDAComplex(A), lda, &lwork));                      \
-                                                                              \
-    /* Allocate device memory for workspace and info. */                      \
-    ScratchSpace<Scalar> device_workspace(context_, lwork);                   \
-    ScratchSpace<int> device_info(context_, 1);                               \
-                                                                              \
-    /* Launch the solver kernel. */                                           \
-    TF_RETURN_IF_CUSOLVER_ERROR(SOLVER_NAME(potrf, lapack_prefix)(            \
-        handle_, uplo, n, CUDAComplex(A), lda,                                \
-        CUDAComplex(device_workspace.data()), lwork, device_info.data()));    \
-                                                                              \
-    /* Get info from device and return status. */                             \
-    TF_RETURN_STATUS_FROM_INFO(potrf, device_info.data(), info);              \
-    return Status::OK();                                                      \
+template <typename Scalar, typename BufSizeFnT, typename SolverFnT>
+static inline Status PotrfImpl(BufSizeFnT bufsize, SolverFnT solver,
+                               OpKernelContext* context,
+                               cusolverDnHandle_t cusolver_dn_handle,
+                               cublasFillMode_t uplo, int n, Scalar* A, int lda,
+                               int* dev_lapack_info) {
+  /* Get amount of workspace memory required. */
+  int lwork;
+  TF_RETURN_IF_CUSOLVER_ERROR(
+      bufsize(cusolver_dn_handle, uplo, n, CUDAComplex(A), lda, &lwork));
+  /* Allocate device memory for workspace. */
+  ScratchSpace<Scalar> dev_workspace(context, lwork, /* on_host */ false);
+  /* Launch the solver kernel. */
+  TF_RETURN_IF_CUSOLVER_ERROR(solver(
+      cusolver_dn_handle, uplo, n, CUDAComplex(A), lda,
+      CUDAComplex(dev_workspace.mutable_data()), lwork, dev_lapack_info));
+  return Status::OK();
+}
+
+#define POTRF_INSTANCE(Scalar, lapack_prefix)                                \
+  template <>                                                                \
+  Status CudaSolver::Potrf<Scalar>(cublasFillMode_t uplo, int n, Scalar* A,  \
+                                   int lda, int* dev_lapack_info) const {    \
+    return PotrfImpl(DN_BUFSIZE_FN(potrf, lapack_prefix),                    \
+                     DN_SOLVER_FN(potrf, lapack_prefix), context_,           \
+                     cusolver_dn_handle_, uplo, n, A, lda, dev_lapack_info); \
   }
 
 TF_CALL_LAPACK_TYPES(POTRF_INSTANCE);
 
+//=============================================================================
+// Wrappers of cuBlas computational methods begin here.
+//
+// WARNING to implementers: The function signatures listed in the online docs
+// are sometimes inaccurate, e.g., are missing 'const' on pointers
+// to immutable arguments, while the actual headers have them as expected.
+// Check the actual declarations in the cublas_api.h header file.
+//=============================================================================
+template <typename Scalar, typename SolverFnT>
+static inline Status GetrfBatchedImpl(
+    SolverFnT solver, OpKernelContext* context, cublasHandle_t cublas_handle,
+    int n, const Scalar* host_a_dev_ptrs[], int lda, int* dev_pivots,
+    DeviceLapackInfo* dev_lapack_info, int batch_size) {
+  using CudaScalar = typename CUDAComplexT<Scalar>::type;
+  ScratchSpace<uint8> dev_a_dev_ptrs(context, sizeof(CudaScalar*) * batch_size,
+                                     /* on_host */ false);
+  if (!CopyHostToDevice(
+          context, (void*)dev_a_dev_ptrs.mutable_data() /* dest */,
+          (const void*)host_a_dev_ptrs /* source */, dev_a_dev_ptrs.bytes())) {
+    return errors::Internal("GetrfBatched: failed to copy pointers to device");
+  }
+  TF_RETURN_IF_CUBLAS_ERROR(
+      solver(cublas_handle, n, (CudaScalar**)dev_a_dev_ptrs.mutable_data(), lda,
+             dev_pivots, dev_lapack_info->mutable_data(), batch_size));
+  return Status::OK();
+}
+
+#define GETRF_BATCHED_INSTANCE(Scalar, lapack_prefix)                          \
+  template <>                                                                  \
+  Status CudaSolver::GetrfBatched(                                             \
+      int n, const Scalar* host_a_dev_ptrs[], int lda, int* dev_pivots,        \
+      DeviceLapackInfo* dev_lapack_info, int batch_size) const {               \
+    return GetrfBatchedImpl(BLAS_SOLVER_FN(getrfBatched, lapack_prefix),       \
+                            context_, cublas_handle_, n, host_a_dev_ptrs, lda, \
+                            dev_pivots, dev_lapack_info, batch_size);          \
+  }
+
+TF_CALL_LAPACK_TYPES(GETRF_BATCHED_INSTANCE);
+
+template <typename Scalar, typename SolverFnT>
+static inline Status GetriBatchedImpl(
+    SolverFnT solver, OpKernelContext* context, cublasHandle_t cublas_handle,
+    int n, const Scalar* host_a_dev_ptrs[], int lda, const int* dev_pivots,
+    const Scalar* host_a_inv_dev_ptrs[], int ldainv,
+    DeviceLapackInfo* dev_lapack_info, int batch_size) {
+  using CudaScalar = typename CUDAComplexT<Scalar>::type;
+  ScratchSpace<uint8> dev_a_dev_ptrs(context, sizeof(CudaScalar*) * batch_size,
+                                     /* on_host */ false);
+  ScratchSpace<uint8> dev_a_inv_dev_ptrs(
+      context, sizeof(CudaScalar*) * batch_size, /* on_host */ false);
+  if (!CopyHostToDevice(
+          context, (void*)dev_a_dev_ptrs.mutable_data() /* dest */,
+          (const void*)host_a_dev_ptrs /* source */, dev_a_dev_ptrs.bytes()) ||
+      !CopyHostToDevice(context, (void*)dev_a_inv_dev_ptrs.mutable_data(),
+                        (const void*)host_a_inv_dev_ptrs,
+                        dev_a_inv_dev_ptrs.bytes())) {
+    return errors::Internal("GetriBatched: failed to copy pointers to device");
+  }
+  TF_RETURN_IF_CUBLAS_ERROR(
+      solver(cublas_handle, n, (const CudaScalar**)dev_a_dev_ptrs.data(), lda,
+             dev_pivots, (CudaScalar**)dev_a_inv_dev_ptrs.mutable_data(),
+             ldainv, dev_lapack_info->mutable_data(), batch_size));
+  return Status::OK();
+}
+
+#define GETRI_BATCHED_INSTANCE(Scalar, lapack_prefix)                          \
+  template <>                                                                  \
+  Status CudaSolver::GetriBatched(                                             \
+      int n, const Scalar* host_a_dev_ptrs[], int lda, const int* dev_pivots,  \
+      const Scalar* host_a_inv_dev_ptrs[], int ldainv,                         \
+      DeviceLapackInfo* dev_lapack_info, int batch_size) const {               \
+    return GetriBatchedImpl(BLAS_SOLVER_FN(getriBatched, lapack_prefix),       \
+                            context_, cublas_handle_, n, host_a_dev_ptrs, lda, \
+                            dev_pivots, host_a_inv_dev_ptrs, ldainv,           \
+                            dev_lapack_info, batch_size);                      \
+  }
+
+TF_CALL_LAPACK_TYPES(GETRI_BATCHED_INSTANCE);
+
 }  // namespace tensorflow
 
 #endif  // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/cuda_solvers.h b/tensorflow/core/kernels/cuda_solvers.h
index eeb179cfa664342cd684640526f486f40d82d1b4..bed1962ad00cb0b76339edea369d2897631b5161 100644
--- a/tensorflow/core/kernels/cuda_solvers.h
+++ b/tensorflow/core/kernels/cuda_solvers.h
@@ -14,164 +14,304 @@ limitations under the License.
 ==============================================================================
 */
 
-// This header implements CudaSolverDN and CuBlas, which contain templatized
-// wrappers of linear algebra solvers in the cuBlas and cuSolverDN libraries
-// for use in TensorFlow kernels.
+// This header declares the class CudaSolver, which contains wrappers of linear
+// algebra solvers in the cuBlas and cuSolverDN libraries for use in TensorFlow
+// kernels.
 
 #ifdef GOOGLE_CUDA
 
+#include <functional>
+#include <vector>
+
 #include "cuda/include/cublas_v2.h"
 #include "cuda/include/cusolverDn.h"
 #include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/framework/tensor_types.h"
 #include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/platform/stream_executor.h"
 
 namespace tensorflow {
 
-// A class that provides a simplified templated API for the solver methods
-// in cuSolverDN (http://docs.nvidia.com/cuda/cusolver).
-// An object of this class wraps a cuSolverDN instance, and will launch
-// kernels on the cuda stream wrapped by the GPU device in the OpKernelContext
-// provided to the constructor. The class methods transparently fetch the output
-// status of the solvers (a.k.a. the LAPACK "info" output variable) without
-// having to manually synchronize the underlying Cuda stream.
-class CudaSolverDN {
+// Container of LAPACK info data (an array of int) generated on-device by
+// a CudaSolver call. One or more such objects can be passed to
+// CudaSolver::CopyLapackInfoToHostAsync() along with a callback to
+// check the LAPACK info data after the corresponding kernels
+// finish and LAPACK info has been copied from the device to the host.
+class DeviceLapackInfo;
+
+// Host-side copy of LAPACK info.
+class HostLapackInfo;
+
+// The CudaSolver class provides a simplified templated API for the dense linear
+// solvers implemented in cuSolverDN (http://docs.nvidia.com/cuda/cusolver) and
+// cuBlas (http://docs.nvidia.com/cuda/cublas/#blas-like-extension/).
+// An object of this class wraps static cuSolver and cuBlas instances,
+// and will launch Cuda kernels on the stream wrapped by the GPU device
+// in the OpKernelContext provided to the constructor.
+//
+// Notice: All the computational member functions are asynchronous and simply
+// launch one or more Cuda kernels on the Cuda stream wrapped by the CudaSolver
+// object. To check the final status of the kernels run, call
+// CopyLapackInfoToHostAsync() on the CudaSolver object to set a callback that
+// will be invoked with the status of the kernels launched thus far as
+// arguments.
+//
+// Example of an asynchronous TensorFlow kernel using CudaSolver:
+//
+// template <typename Scalar>
+// class SymmetricPositiveDefiniteSolveOpGpu : public AsyncOpKernel {
+//  public:
+//   explicit SymmetricPositiveDefiniteSolveOpGpu(OpKernelConstruction* context)
+//       : AsyncOpKernel(context) { }
+//   void ComputeAsync(OpKernelContext* context, DoneCallback done) final {
+//     // 1. Set up input and output device ptrs. See, e.g.,
+//     // matrix_inverse_op.cc for a full example.
+//     ...
+//
+//     // 2. Initialize the solver object.
+//     CudaSolver solver(context);
+//
+//     // 3. Launch the two compute kernels back to back on the stream without
+//     // synchronizing.
+//     std::vector<DeviceLapackInfo> dev_info;
+//     const int batch_size = 1;
+//     dev_info.emplace_back(context, batch_size, "potrf");
+//     // Compute the Cholesky decomposition of the input matrix.
+//     OP_REQUIRES_OK_ASYNC(context,
+//                          solver.Potrf(uplo, n, dev_matrix_ptrs, n,
+//                                       dev_info.back().mutable_data()),
+//                          done);
+//     dev_info.emplace_back(context, batch_size, "potrs");
+//     // Use the Cholesky decomposition of the input matrix to solve A X = RHS.
+//     OP_REQUIRES_OK_ASYNC(context,
+//                          solver.Potrs(uplo, n, nrhs, dev_matrix_ptrs, n,
+//                                       dev_output_ptrs, ldrhs,
+//                                       dev_info.back().mutable_data()),
+//                          done);
+//
+//     // 4. Check the status after the computation finishes and call done.
+//     auto check_status = [context, done](const Status& status,
+//       const std::vector<HostLapackInfo>& /* unused */) {
+//           // In this example we don't care about the exact cause of
+//           // death, so just check status.
+//           OP_REQUIRES_OK_ASYNC(context, status, done);
+//           done();
+//     };
+//     OP_REQUIRES_OK_ASYNC(context,
+//                          solver.CopyLapackInfoToHostAsync(
+//                            dev_info, std::move(check_status));
+//                          done);
+//   }
+// };
+
+class CudaSolver {
  public:
-  explicit CudaSolverDN(OpKernelContext* context);
-  virtual ~CudaSolverDN();
+  // This object stores a pointer to context, which must outlive it.
+  explicit CudaSolver(OpKernelContext* context);
+  virtual ~CudaSolver() {}
+
+  // Launches a memcpy of solver status data specified by dev_lapack_info from
+  // device to the host, and asynchronously invokes the given callback when the
+  // copy is complete. The first Status argument to the callback will be
+  // Status::OK if all lapack infos retrived are zero, otherwise an error status
+  // is given. The second argument contains a host-side copy of the entire set
+  // of infos retrieved, and can be used for generating detailed error messages.
+  Status CopyLapackInfoToHostAsync(
+      const std::vector<DeviceLapackInfo>& dev_lapack_info,
+      std::function<void(const Status&, const std::vector<HostLapackInfo>&)>
+          info_checker_callback) const;
 
   // ====================================================================
-  // Templated wrappers for cuSolver functions start here.
-
-  // Cholesky factorization.
-  // Computes Cholesky factorization A = L * L^T.
-  // Returns Status::OK(), if the Cholesky factorization was successful.
-  // If info is not nullptr it is used to return the potrf info code:
-  // Returns zero if success, returns -i if the
-  // i-th parameter is wrong, returns i > 0, if the leading minor of order i is
-  // not positive definite, see:
+  // Wrappers for cuSolverDN and cuBlas solvers start here.
+  //
+  // Apart from capitalization of the first letter, the method names below map
+  // to those in cuSolverDN and cuBlas, which follow the naming convention in
+  // LAPACK see, e.g., http://docs.nvidia.com/cuda/cusolver/#naming-convention
+
+  // Computes the Cholesky factorization A = L * L^T for a single matrix.
+  // Returns Status::OK(), if the kernel was launched successfully. See:
   // http://docs.nvidia.com/cuda/cusolver/#cuds-lt-t-gt-potrf
   template <typename Scalar>
-  Status potrf(cublasFillMode_t uplo, int n, Scalar* A, int lda,
-               int* info) const;
+  Status Potrf(cublasFillMode_t uplo, int n, Scalar* dev_A, int lda,
+               int* dev_lapack_info) const;
+
+  // Computes partially pivoted LU factorizations for a batch of matrices.
+  // Returns Status::OK() if the kernel was launched successfully.See:
+  // http://docs.nvidia.com/cuda/cublas/index.html#cublas-lt-t-gt-getrfbatched
+  template <typename Scalar>
+  Status GetrfBatched(int n, const Scalar* host_a_dev_ptrs[], int lda,
+                      int* dev_pivots, DeviceLapackInfo* dev_lapack_info,
+                      int batch_size) const;
+
+  // Computes matrix inverses for a batch of matrices. Uses the outputs from
+  // GetrfBatched. Returns Status::OK() if the kernel was launched successfully.
+  // See:
+  // http://docs.nvidia.com/cuda/cublas/index.html#cublas-lt-t-gt-getribatched
+  template <typename Scalar>
+  Status GetriBatched(int n, const Scalar* host_a_dev_ptrs[], int lda,
+                      const int* dev_pivots,
+                      const Scalar* host_a_inverse_dev_ptrs[], int ldainv,
+                      DeviceLapackInfo* dev_lapack_info, int batch_size) const;
 
   /*
   TODO(rmlarsen, volunteers): Implement the kernels below.
   // Uses Cholesky factorization to solve A * X = B.
   // See: http://docs.nvidia.com/cuda/cusolver/#cuds-lt-t-gt-potrs
   template <typename Scalar>
-  Status potrs(cublasFillMode_t uplo, int n, int nrhs, const Scalar* A, int lda,
-             Scalar* B, int ldb, int* info) const;
+  Status Potrs(cublasFillMode_t uplo, int n, int nrhs, const Scalar* dev_A, int
+  lda, Scalar* dev_B, int ldb, int* dev_lapack_info) const;
 
   // LU factorization.
   // Computes LU factorization with partial pivoting P * A = L * U.
   // See: http://docs.nvidia.com/cuda/cusolver/#cuds-lt-t-gt-getrf
   template <typename Scalar>
-  Status getrf(int m, int n, Scalar* A, int lda, int* devIpiv,
-             int* devInfo) const;
+  Status Getrf(int m, int n, Scalar* dev_A, int lda, int* dev_pivots,
+             int* dev_lapack_info) const;
 
   // Uses LU factorization to solve A * X = B.
   // See: http://docs.nvidia.com/cuda/cusolver/#cuds-lt-t-gt-getrs
   template <typename Scalar>
-  Status getrs(int n, int nrhs, const Scalar* A, int lda, const int* devIpiv,
-             Scalar* B, int ldb, int* devInfo) const;
+  Status Getrs(int n, int nrhs, const Scalar* dev_A, int lda, const int*
+  dev_pivots, Scalar* dev_B, int ldb, int* dev_lapack_info) const;
 
   // QR factorization.
   // Computes QR factorization A = Q * R.
   // See: http://docs.nvidia.com/cuda/cusolver/#cuds-lt-t-gt-geqrf
   template <typename Scalar>
-  Status geqrf(int m, int n, Scalar* A, int lda, Scalar* TAU, int* devInfo)
-  const;
+  Status Geqrf(int m, int n, Scalar* dev_A, int lda, Scalar* dev_TAU, int*
+  devInfo) const;
 
   // Multiplies by Q.
   // See: http://docs.nvidia.com/cuda/cusolver/#cuds-lt-t-gt-ormqr
   template <typename Scalar>
-  Status mqr(cublasSideMode_t side, cublasOperation_t trans, int m, int n, int
-  k, const Scalar* A, int lda, const Scalar* tau, Scalar* C, int ldc, int*
-  devInfo const);
+  Status Ormqr(cublasSideMode_t side, cublasOperation_t trans, int m, int n, int
+  k, const Scalar* dev_a, int lda, const Scalar* dev_tau, Scalar* dev_c, int
+  ldc, int* dev_lapack_info) const;
 
-  // Materializes Q.
+  // Generate Q.
   // See: http://docs.nvidia.com/cuda/cusolver/#cuds-lt-t-gt-orgqr
   template <typename Scalar>
-  Status gqr(int m, int n, int k, Scalar* A, int lda, const Scalar* tau,
-           int* devInfo) const;
+  Status Orgqr(int m, int n, int k, Scalar* dev_A, int lda, const Scalar*
+  dev_tau, int* dev_lapack_info) const;
 
   // Symmetric/Hermitian Eigen decomposition.
   // See: http://docs.nvidia.com/cuda/cusolver/#cuds-lt-t-gt-syevd
   template <typename Scalar>
-  Status evd(cusolverEigMode_t jobz, cublasFillMode_t uplo, int n, Scalar* A,
-           int lda, Scalar* W, int* devInfo) const;
+  Status Syevd(cusolverEigMode_t jobz, cublasFillMode_t uplo, int n, Scalar*
+  dev_A, int lda, Scalar* dev_W, int* dev_lapack_info) const;
 
   // Singular value decomposition.
   // See: http://docs.nvidia.com/cuda/cusolver/#cuds-lt-t-gt-gesvd
   template <typename Scalar>
-  Status gesvd(signed char jobu, signed char jobvt, int m, int n, Scalar* A,
-             int lda, Scalar* S, Scalar* U, int ldu, Scalar* VT, int ldvt,
-             int* devInfo);
-*/
+  Status Gesvd(signed char jobu, signed char jobvt, int m, int n, Scalar* dev_A,
+             int lda, Scalar* dev_S, Scalar* dev_U, int ldu, Scalar* dev_VT,
+             int ldvt, int* dev_lapack_info);
 
- private:
-  // Copies dev_info status back from the device to host and uses event manager
-  // to wait (with a timeout) until the copy has finished. Returns an error if
-  // the copy fails to complete successfully within the timeout period.
-  Status GetInfo(const int* dev_info, int* host_info) const;
+  // Batched linear solver using LU factorization from getrfBatched.
+  // See:
+  http://docs.nvidia.com/cuda/cublas/index.html#cublas-lt-t-gt-getrsbatched
+  template <typename Scalar>
+  Status GetrsBatched(cublasOperation_t trans, int n, int nrhs,
+                    const Scalar* dev_Aarray[], int lda, const int* devIpiv,
+                    Scalar* dev_Barray[], int ldb, int* info, int batch_size)
+  const;
+  */
 
+ private:
   OpKernelContext* context_;  // not owned.
   cudaStream_t cuda_stream_;
-  cusolverDnHandle_t handle_;
+  cusolverDnHandle_t cusolver_dn_handle_;
+  cublasHandle_t cublas_handle_;
+
+  TF_DISALLOW_COPY_AND_ASSIGN(CudaSolver);
 };
 
-/*
-  TODO(rmlarsen, volunteers): Implement the kernels below. These are utils and
-batched solvers not currently wrapped by stream executor. class CudaBlas {
+// Helper class to allocate scratch memory and keep track of debug info.
+// Mostly a thin wrapper around Tensor.
+template <typename Scalar>
+class ScratchSpace {
  public:
-  // Initializes a cuSolverDN handle that will launch kernels using the
-  // cuda stream wrapped by the GPU device in context.
-  explicit CudaBlas(OpKernelContext* context);
-  virtual ~CudaBlas();
+  ScratchSpace(OpKernelContext* context, int size, bool on_host)
+      : ScratchSpace(context, size, "", on_host) {}
 
-  // Templatized wrappers for cuBlas functions.
+  ScratchSpace(OpKernelContext* context, int size, const string& debug_info,
+               bool on_host)
+      : context_(context), debug_info_(debug_info), on_host_(on_host) {
+    AllocatorAttributes alloc_attr;
+    if (on_host) {
+      // Allocate pinned memory on the host to avoid unnecessary
+      // synchronization.
+      alloc_attr.set_on_host(true);
+      alloc_attr.set_gpu_compatible(true);
+    }
+    TF_CHECK_OK(context->allocate_temp(DataTypeToEnum<Scalar>::value,
+                                       TensorShape({size}), &scratch_tensor_,
+                                       alloc_attr));
+  }
 
-  // Matrix addition, copy and transposition.
-  // See: http://docs.nvidia.com/cuda/cublas/index.html#cublas-lt-t-gt-geam
-  template <typename Scalar>
-  Status geam(cublasOperation_t transa, cublasOperation_t transb, int m, int n,
-            const Scalar* alpha, const Scalar* A, int lda, const Scalar* beta,
-            const Scalar* B, int ldb, Scalar* C, int ldc) const;
+  virtual ~ScratchSpace() {}
 
-  // Batched LU fatorization.
-  // See:
-http://docs.nvidia.com/cuda/cublas/index.html#cublas-lt-t-gt-getrfbatched
-  template <typename Scalar>
-  Status getrfBatched(int n, Scalar* Aarray[], int lda, int* PivotArray,
-                    int* infoArray, int batchSize) const;
+  Scalar* mutable_data() {
+    return scratch_tensor_.template flat<Scalar>().data();
+  }
+  const Scalar* data() const {
+    return scratch_tensor_.template flat<Scalar>().data();
+  }
+  int64 bytes() const { return scratch_tensor_.TotalBytes(); }
+  int64 size() const { return scratch_tensor_.NumElements(); }
+  const string& debug_info() const { return debug_info_; }
 
-  // Batched linear solver using LU factorization from getrfBatched.
-  // See:
-http://docs.nvidia.com/cuda/cublas/index.html#cublas-lt-t-gt-getrsbatched
-  template <typename Scalar>
-  Status getrsBatched(cublasOperation_t trans, int n, int nrhs,
-                    const Scalar* Aarray[], int lda, const int* devIpiv,
-                    Scalar* Barray[], int ldb, int* info, int batchSize) const;
+  // Returns true if this ScratchSpace is in host memory.
+  bool on_host() const { return on_host_; }
 
-  // Batched matrix inverse.
-  // See:
-http://docs.nvidia.com/cuda/cublas/index.html#cublas-lt-t-gt-getribatched
-  template <typename Scalar>
-  Status getriBatched(cublasHandle_t handle, int n, Scalar* Aarray[], int lda,
-                    int* PivotArray, Scalar* Carray[], int ldc, int* infoArray,
-                    int batchSize);
+ protected:
+  OpKernelContext* context() const { return context_; }
 
  private:
-  // Copies dev_info status back from the device to host and uses event manager
-  // to wait (with a timeout) until the copy has finished. Returns an error if
-  // the copy fails to complete successfully within the timeout period.
-  Status GetInfo(const int* dev_info, int* host_info) const;
+  OpKernelContext* context_;  // not owned
+  const string debug_info_;
+  const bool on_host_;
+  Tensor scratch_tensor_;
+};
 
-  OpKernelContext* context_;  // not owned.
-  cudaStream_t cuda_stream_;
-  cublasHandle_t handle_;
+class HostLapackInfo : public ScratchSpace<int> {
+ public:
+  HostLapackInfo(OpKernelContext* context, int size, const string& debug_info)
+      : ScratchSpace<int>(context, size, debug_info, /* on_host */ true){};
 };
-*/
+
+class DeviceLapackInfo : public ScratchSpace<int> {
+ public:
+  DeviceLapackInfo(OpKernelContext* context, int size, const string& debug_info)
+      : ScratchSpace<int>(context, size, debug_info, /* on_host */ false) {}
+
+  // Allocates a new scratch space on the host and launches a copy of the
+  // contents of *this to the new scratch space. Sets success to true if
+  // the copy kernel was launched successfully.
+  HostLapackInfo CopyToHost(bool* success) const {
+    CHECK(success != nullptr);
+    HostLapackInfo copy(context(), size(), debug_info());
+    auto stream = context()->op_device_context()->stream();
+    perftools::gputools::DeviceMemoryBase wrapped_src(
+        static_cast<void*>(const_cast<int*>(this->data())));
+    *success =
+        stream->ThenMemcpy(copy.mutable_data(), wrapped_src, this->bytes())
+            .ok();
+    return copy;
+  }
+};
+
+namespace functor {
+// Helper functor to transpose and conjugate all matrices in a flattened batch.
+template <typename Device, typename Scalar>
+struct AdjointBatchFunctor {
+  // We assume that the tensor sizes are correct.
+  void operator()(const Device& d,
+                  typename TTypes<Scalar, 3>::ConstTensor input,
+                  typename TTypes<Scalar, 3>::Tensor output);
+};
+}  // namespace functor
 
 }  // namespace tensorflow
 
diff --git a/tensorflow/core/kernels/cuda_solvers_gpu.cu.cc b/tensorflow/core/kernels/cuda_solvers_gpu.cu.cc
new file mode 100644
index 0000000000000000000000000000000000000000..d32f506557792926797b18e49db869ff94cdd623
--- /dev/null
+++ b/tensorflow/core/kernels/cuda_solvers_gpu.cu.cc
@@ -0,0 +1,52 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#if GOOGLE_CUDA
+
+#define EIGEN_USE_GPU
+
+#include "tensorflow/core/kernels/cuda_solvers.h"
+
+#include <complex>
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+#include "tensorflow/core/framework/tensor_types.h"
+
+namespace tensorflow {
+namespace functor {
+
+typedef Eigen::GpuDevice GPUDevice;
+
+// TODO(rmlarsen): Add a faster custom kernel similar to
+// SwapDimension1And2InTensor3 in tensorflow/core/kernels/conv_ops_gpu_3.cu.cc
+template <typename Scalar>
+struct AdjointBatchFunctor<GPUDevice, Scalar> {
+  void operator()(const GPUDevice& d,
+                  typename TTypes<Scalar, 3>::ConstTensor input,
+                  typename TTypes<Scalar, 3>::Tensor output) {
+    const Eigen::array<int, 3> perm({0, 2, 1});
+    To32Bit(output).device(d) = To32Bit(input).shuffle(perm).conjugate();
+  }
+};
+
+// Instantiate implementations for the 4 numeric types.
+template struct AdjointBatchFunctor<GPUDevice, float>;
+template struct AdjointBatchFunctor<GPUDevice, double>;
+template struct AdjointBatchFunctor<GPUDevice, std::complex<float>>;
+template struct AdjointBatchFunctor<GPUDevice, std::complex<double>>;
+
+}  // namespace functor
+}  // namespace tensorflow
+
+#endif  // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/cwise_op_atan2.cc b/tensorflow/core/kernels/cwise_op_atan2.cc
index 5232737857a001d45424de6d634969d146a90574..68f67c444ef1b6ed905c8107838b2c50f542256e 100644
--- a/tensorflow/core/kernels/cwise_op_atan2.cc
+++ b/tensorflow/core/kernels/cwise_op_atan2.cc
@@ -20,4 +20,4 @@ REGISTER2(BinaryOp, CPU, "Atan2", functor::atan2, float, double);
 #if GOOGLE_CUDA
 REGISTER2(BinaryOp, GPU, "Atan2", functor::atan2, float, double);
 #endif
-}  // namespace tensorflow
\ No newline at end of file
+}  // namespace tensorflow
diff --git a/tensorflow/core/kernels/cwise_op_gpu_atan2.cu.cc b/tensorflow/core/kernels/cwise_op_gpu_atan2.cu.cc
index 0f327eaf6cf686ab1575a733450753e904609183..137e14ef840e0d3731d69513e87fdb48b13e53fb 100644
--- a/tensorflow/core/kernels/cwise_op_gpu_atan2.cu.cc
+++ b/tensorflow/core/kernels/cwise_op_gpu_atan2.cu.cc
@@ -23,4 +23,4 @@ DEFINE_BINARY2(atan2, float, double);
 }  // namespace functor
 }  // namespace tensorflow
 
-#endif  // GOOGLE_CUDA
\ No newline at end of file
+#endif  // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/debug_ops_test.cc b/tensorflow/core/kernels/debug_ops_test.cc
index 495db92ef625b96d6dbf4b9e2dd6c6169e67a285..487f045cc8d6066107dbc2919cb18db73b27aeab 100644
--- a/tensorflow/core/kernels/debug_ops_test.cc
+++ b/tensorflow/core/kernels/debug_ops_test.cc
@@ -36,7 +36,7 @@ namespace tensorflow {
 
 class DebugIdentityOpTest : public OpsTestBase {
  protected:
-  Status Init(DataType input_type, const std::vector<string> debug_urls) {
+  Status Init(DataType input_type, const std::vector<string>& debug_urls) {
     env_ = Env::Default();
 
     TF_CHECK_OK(NodeDefBuilder("op", "DebugIdentity")
diff --git a/tensorflow/core/kernels/decode_gif_op.cc b/tensorflow/core/kernels/decode_gif_op.cc
deleted file mode 100644
index 2bc17f8a309972f55206aa55fffd215678401fad..0000000000000000000000000000000000000000
--- a/tensorflow/core/kernels/decode_gif_op.cc
+++ /dev/null
@@ -1,66 +0,0 @@
-/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-// See docs in ../ops/image_ops.cc
-
-#include <memory>
-#include "tensorflow/core/framework/op_kernel.h"
-#include "tensorflow/core/framework/register_types.h"
-#include "tensorflow/core/framework/tensor.h"
-#include "tensorflow/core/framework/tensor_shape.h"
-#include "tensorflow/core/framework/types.h"
-#include "tensorflow/core/framework/types.pb.h"
-#include "tensorflow/core/lib/core/status.h"
-#include "tensorflow/core/lib/gif/gif_io.h"
-#include "tensorflow/core/platform/logging.h"
-
-namespace tensorflow {
-
-// Decode the contents of a GIF file
-class DecodeGifOp : public OpKernel {
- public:
-  explicit DecodeGifOp(OpKernelConstruction* context) : OpKernel(context) {}
-  void Compute(OpKernelContext* context) override {
-    const Tensor& contents = context->input(0);
-    OP_REQUIRES(context, TensorShapeUtils::IsScalar(contents.shape()),
-                errors::InvalidArgument("contents must be scalar, got shape ",
-                                        contents.shape().DebugString()));
-
-    // Start decoding image to get shape details
-    const StringPiece input = contents.scalar<string>()();
-
-    // Decode image, allocating tensor once the image size is known
-    Tensor* output = nullptr;
-    OP_REQUIRES(
-        context,
-        gif::Decode(input.data(), input.size(),
-                    [=, &output](int num_frames, int width, int height,
-                                 int channels) -> uint8* {
-                      Status status(context->allocate_output(
-                          0, TensorShape({num_frames, height, width, channels}),
-                          &output));
-                      if (!status.ok()) {
-                        VLOG(1) << status;
-                        context->SetStatus(status);
-                        return nullptr;
-                      }
-                      return output->flat<uint8>().data();
-                    }),
-        errors::InvalidArgument("Invalid GIF data, size ", input.size()));
-  }
-};
-REGISTER_KERNEL_BUILDER(Name("DecodeGif").Device(DEVICE_CPU), DecodeGifOp);
-
-}  // namespace tensorflow
diff --git a/tensorflow/core/kernels/decode_image_op.cc b/tensorflow/core/kernels/decode_image_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..76f8c225432dd7ddb36933722f3cf0c9404c48ad
--- /dev/null
+++ b/tensorflow/core/kernels/decode_image_op.cc
@@ -0,0 +1,315 @@
+/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// See docs in ../ops/image_ops.cc
+
+#include <memory>
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/register_types.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/framework/tensor_shape.h"
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/lib/gif/gif_io.h"
+#include "tensorflow/core/lib/jpeg/jpeg_mem.h"
+#include "tensorflow/core/lib/png/png_io.h"
+#include "tensorflow/core/platform/logging.h"
+
+namespace tensorflow {
+namespace {
+
+enum FileFormat {
+  kUnknownFormat = 0,
+  kPngFormat = 1,
+  kJpgFormat = 2,
+  kGifFormat = 3,
+};
+
+// Classify the contents of a file based on starting bytes (the magic number).
+FileFormat ClassifyFileFormat(StringPiece data) {
+  // The 4th byte of JPEG is '\xe0' or '\xe1', so check just the first three
+  if (data.starts_with("\xff\xd8\xff")) return kJpgFormat;
+  if (data.starts_with("\x89PNG\r\n\x1a\n")) return kPngFormat;
+  if (data.starts_with("\x47\x49\x46\x38")) return kGifFormat;
+  return kUnknownFormat;
+}
+
+string FileFormatString(FileFormat magic, StringPiece data) {
+  switch (magic) {
+    case kPngFormat:
+      return "PNG";
+    case kJpgFormat:
+      return "JPEG";
+    case kGifFormat:
+      return "GIF";
+    default: {
+      if (data.empty()) return "empty file";
+      return strings::StrCat("unknown format starting with '",
+                             str_util::CEscape(data.substr(0, 16)), "'");
+    }
+  }
+}
+
+// Decode an image (either jpeg, png, or gif).  We use a single op so that
+// users don't have to care about which format they have.
+class DecodeImageOp : public OpKernel {
+ public:
+  explicit DecodeImageOp(OpKernelConstruction* context) : OpKernel(context) {
+    // Determine which op we are: jpeg, png, gif, or any
+    if (type_string() == "DecodeJpeg") {
+      format_ = kJpgFormat;
+    } else if (type_string() == "DecodePng") {
+      format_ = kPngFormat;
+    } else if (type_string() == "DecodeGif") {
+      format_ = kGifFormat;
+    } else {
+      OP_REQUIRES_OK(context,
+                     errors::InvalidArgument("Bad op type ", type_string()));
+    }
+
+    if (format_ == kGifFormat) {
+      channels_ = 3;
+    } else {
+      OP_REQUIRES_OK(context, context->GetAttr("channels", &channels_));
+      OP_REQUIRES(
+          context,
+          channels_ == 0 || channels_ == 1 || channels_ == 3 || channels_ == 4,
+          errors::InvalidArgument("channels must be 0, 1, 3, or 4, got ",
+                                  channels_));
+    }
+    flags_.components = channels_;
+
+    // In the case of png, we support uint16 output
+    if (format_ == kPngFormat) {
+      DataType dt;
+      OP_REQUIRES_OK(context, context->GetAttr("dtype", &dt));
+      OP_REQUIRES(
+          context, dt == DataType::DT_UINT8 || dt == DataType::DT_UINT16,
+          errors::InvalidArgument("Type must be uint8 or uint16, got ", dt));
+      if (dt == DataType::DT_UINT8) {
+        channel_bits_ = 8;
+      } else {
+        channel_bits_ = 16;
+      }
+    }
+
+    // The TensorFlow-chosen default for jpeg decoding is IFAST, sacrificing
+    // image quality for speed.
+    flags_.dct_method = JDCT_IFAST;
+
+    if (format_ == kJpgFormat) {
+      OP_REQUIRES_OK(context, context->GetAttr("ratio", &flags_.ratio));
+      OP_REQUIRES(context,
+                  flags_.ratio == 1 || flags_.ratio == 2 || flags_.ratio == 4 ||
+                      flags_.ratio == 8,
+                  errors::InvalidArgument("ratio must be 1, 2, 4, or 8, got ",
+                                          flags_.ratio));
+      OP_REQUIRES_OK(context, context->GetAttr("fancy_upscaling",
+                                               &flags_.fancy_upscaling));
+      OP_REQUIRES_OK(context,
+                     context->GetAttr("try_recover_truncated",
+                                      &flags_.try_recover_truncated_jpeg));
+      OP_REQUIRES_OK(context,
+                     context->GetAttr("acceptable_fraction",
+                                      &flags_.min_acceptable_fraction));
+
+      string dct_method;
+      OP_REQUIRES_OK(context, context->GetAttr("dct_method", &dct_method));
+      OP_REQUIRES(
+          context,
+          (dct_method.empty() || dct_method == "INTEGER_FAST" ||
+           dct_method == "INTEGER_ACCURATE"),
+          errors::InvalidArgument("dct_method must be one of "
+                                  "{'', 'INTEGER_FAST', 'INTEGER_ACCURATE'}"));
+      if (dct_method == "INTEGER_FAST") {
+        flags_.dct_method = JDCT_IFAST;
+      } else if (dct_method == "INTEGER_ACCURATE") {
+        flags_.dct_method = JDCT_ISLOW;
+      }
+    }
+  }
+
+  void Compute(OpKernelContext* context) override {
+    const Tensor& contents = context->input(0);
+    OP_REQUIRES(context, TensorShapeUtils::IsScalar(contents.shape()),
+                errors::InvalidArgument("contents must be scalar, got shape ",
+                                        contents.shape().DebugString()));
+
+    // Determine format
+    const StringPiece input = contents.scalar<string>()();
+    const auto magic = ClassifyFileFormat(input);
+    OP_REQUIRES(
+        context,
+        magic == kJpgFormat || magic == kPngFormat || magic == kGifFormat,
+        errors::InvalidArgument("Expected image (JPEG, PNG, or GIF), got ",
+                                FileFormatString(magic, input)));
+    OP_REQUIRES(context, input.size() <= std::numeric_limits<int>::max(),
+                errors::InvalidArgument(
+                    FileFormatString(magic, input),
+                    " contents are too large for int: ", input.size()));
+    OP_REQUIRES(context, magic == kPngFormat || channel_bits_ == 8,
+                errors::InvalidArgument(FileFormatString(magic, input),
+                                        " does not support uint16 output"));
+
+    switch (magic) {
+      case kJpgFormat:
+        DecodeJpeg(context, input);
+        break;
+      case kPngFormat:
+        DecodePng(context, input);
+        break;
+      case kGifFormat:
+        DecodeGif(context, input);
+        break;
+      default:
+        LOG(FATAL) << "Should never get here after check above";
+        break;
+    }
+  }
+
+  void DecodeJpeg(OpKernelContext* context, StringPiece input) {
+    OP_REQUIRES(context, channels_ == 0 || channels_ == 1 || channels_ == 3,
+                errors::InvalidArgument(
+                    "channels must be 0, 1, or 3 for JPEG, got ", channels_));
+
+    // Decode jpeg, allocating tensor once the size is known
+    Tensor* output = nullptr;
+    OP_REQUIRES(
+        context,
+        jpeg::Uncompress(
+            input.data(), input.size(), flags_, nullptr /* nwarn */,
+            [=, &output](int width, int height, int channels) -> uint8* {
+              Status status(context->allocate_output(
+                  0,
+                  format_ == kGifFormat
+                      ? TensorShape({1, height, width, channels})
+                      : TensorShape({height, width, channels}),
+                  &output));
+              if (!status.ok()) {
+                VLOG(1) << status;
+                context->SetStatus(status);
+                return nullptr;
+              }
+              return output->flat<uint8>().data();
+            }),
+        errors::InvalidArgument("Invalid JPEG data, size ", input.size()));
+  }
+
+  void DecodePng(OpKernelContext* context, StringPiece input) {
+    // Start decoding png to get shape details
+    png::DecodeContext decode;
+    OP_REQUIRES(context,
+                png::CommonInitDecode(input, channels_, channel_bits_, &decode),
+                errors::InvalidArgument("Invalid PNG header, data size ",
+                                        input.size()));
+
+    // Verify that width and height are not too large:
+    // - verify width and height don't overflow int.
+    // - width can later be multiplied by channels_ and sizeof(uint16), so
+    //   verify single dimension is not too large.
+    // - verify when width and height are multiplied together, there are a few
+    //   bits to spare as well.
+    const int width = static_cast<int>(decode.width);
+    const int height = static_cast<int>(decode.height);
+    const int64 total_size =
+        static_cast<int64>(width) * static_cast<int64>(height);
+    if (width != static_cast<int64>(decode.width) || width <= 0 ||
+        width >= (1LL << 27) || height != static_cast<int64>(decode.height) ||
+        height <= 0 || height >= (1LL << 27) || total_size >= (1LL << 29)) {
+      png::CommonFreeDecode(&decode);
+      OP_REQUIRES(context, false,
+                  errors::InvalidArgument("PNG size too large for int: ",
+                                          decode.width, " by ", decode.height));
+    }
+
+    // Allocate tensor
+    Tensor* output = nullptr;
+    const auto status = context->allocate_output(
+        0,
+        format_ == kGifFormat ? TensorShape({1, height, width, decode.channels})
+                              : TensorShape({height, width, decode.channels}),
+        &output);
+    if (!status.ok()) png::CommonFreeDecode(&decode);
+    OP_REQUIRES_OK(context, status);
+
+    if (channel_bits_ == 8) {
+      // Finish decoding png
+      OP_REQUIRES(
+          context,
+          png::CommonFinishDecode(
+              reinterpret_cast<png_bytep>(output->flat<uint8>().data()),
+              decode.channels * width * sizeof(uint8), &decode),
+          errors::InvalidArgument("Invalid PNG data, size ", input.size()));
+    } else {
+      // Finish decoding png
+      OP_REQUIRES(
+          context,
+          png::CommonFinishDecode(
+              reinterpret_cast<png_bytep>(output->flat<uint16>().data()),
+              decode.channels * width * sizeof(uint16), &decode),
+          errors::InvalidArgument("Invalid PNG data, size ", input.size()));
+    }
+  }
+
+  void DecodeGif(OpKernelContext* context, StringPiece input) {
+    OP_REQUIRES(context, channels_ == 0 || channels_ == 3,
+                errors::InvalidArgument("channels must be 0 or 3 for GIF, got ",
+                                        channels_));
+
+    // Decode GIF, allocating tensor once the size is known.
+    Tensor* output = nullptr;
+    OP_REQUIRES(
+        context,
+        gif::Decode(input.data(), input.size(),
+                    [=, &output](int num_frames, int width, int height,
+                                 int channels) -> uint8* {
+                      Status status;
+                      if (format_ == kGifFormat) {
+                        status = context->allocate_output(
+                            0,
+                            TensorShape({num_frames, height, width, channels}),
+                            &output);
+                      } else if (num_frames == 1) {
+                        status = context->allocate_output(
+                            0, TensorShape({height, width, channels}), &output);
+                      } else {
+                        status = errors::InvalidArgument(
+                            "Got ", num_frames, " frames, but animated gifs ",
+                            "can only be decoded by tf.image.decode_gif or ",
+                            "tf.image.decode_image");
+                      }
+                      if (!status.ok()) {
+                        VLOG(1) << status;
+                        context->SetStatus(status);
+                        return nullptr;
+                      }
+                      return output->flat<uint8>().data();
+                    }),
+        errors::InvalidArgument("Invalid GIF data, size ", input.size()));
+  }
+
+ private:
+  FileFormat format_;
+  int channels_;
+  int channel_bits_ = 8;
+  jpeg::UncompressFlags flags_;
+};
+
+REGISTER_KERNEL_BUILDER(Name("DecodeJpeg").Device(DEVICE_CPU), DecodeImageOp);
+REGISTER_KERNEL_BUILDER(Name("DecodePng").Device(DEVICE_CPU), DecodeImageOp);
+REGISTER_KERNEL_BUILDER(Name("DecodeGif").Device(DEVICE_CPU), DecodeImageOp);
+
+}  // namespace
+}  // namespace tensorflow
diff --git a/tensorflow/core/kernels/decode_jpeg_op.cc b/tensorflow/core/kernels/decode_jpeg_op.cc
deleted file mode 100644
index b795f3955037d934ee89e9533b476438b032f2fc..0000000000000000000000000000000000000000
--- a/tensorflow/core/kernels/decode_jpeg_op.cc
+++ /dev/null
@@ -1,104 +0,0 @@
-/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-// See docs in ../ops/image_ops.cc
-
-#include <memory>
-#include "tensorflow/core/framework/op_kernel.h"
-#include "tensorflow/core/framework/register_types.h"
-#include "tensorflow/core/framework/tensor.h"
-#include "tensorflow/core/framework/tensor_shape.h"
-#include "tensorflow/core/framework/types.h"
-#include "tensorflow/core/lib/core/status.h"
-#include "tensorflow/core/lib/jpeg/jpeg_mem.h"
-#include "tensorflow/core/platform/logging.h"
-
-namespace tensorflow {
-
-// Decode the contents of a JPEG file
-class DecodeJpegOp : public OpKernel {
- public:
-  explicit DecodeJpegOp(OpKernelConstruction* context) : OpKernel(context) {
-    OP_REQUIRES_OK(context, context->GetAttr("channels", &flags_.components));
-    OP_REQUIRES(context, flags_.components == 0 || flags_.components == 1 ||
-                             flags_.components == 3,
-                errors::InvalidArgument("channels must be 0, 1, or 3, got ",
-                                        flags_.components));
-    OP_REQUIRES_OK(context, context->GetAttr("ratio", &flags_.ratio));
-    OP_REQUIRES(context, flags_.ratio == 1 || flags_.ratio == 2 ||
-                             flags_.ratio == 4 || flags_.ratio == 8,
-                errors::InvalidArgument("ratio must be 1, 2, 4, or 8, got ",
-                                        flags_.ratio));
-    OP_REQUIRES_OK(
-        context, context->GetAttr("fancy_upscaling", &flags_.fancy_upscaling));
-    OP_REQUIRES_OK(context,
-                   context->GetAttr("try_recover_truncated",
-                                    &flags_.try_recover_truncated_jpeg));
-    OP_REQUIRES_OK(context, context->GetAttr("acceptable_fraction",
-                                             &flags_.min_acceptable_fraction));
-
-    string dct_method;
-    OP_REQUIRES_OK(context, context->GetAttr("dct_method", &dct_method));
-    OP_REQUIRES(
-        context, (dct_method.empty() || dct_method == "INTEGER_FAST" ||
-                  dct_method == "INTEGER_ACCURATE"),
-        errors::InvalidArgument("dct_method must be one of "
-                                "{'', 'INTEGER_FAST', 'INTEGER_ACCURATE'}"));
-    if (dct_method == "INTEGER_FAST") {
-      flags_.dct_method = JDCT_IFAST;
-    } else if (dct_method == "INTEGER_ACCURATE") {
-      flags_.dct_method = JDCT_ISLOW;
-    } else {
-      // The TensorFlow-chosen default is IFAST, sacrificing decoding
-      // image quality for speed.
-      flags_.dct_method = JDCT_IFAST;
-    }
-  }
-
-  void Compute(OpKernelContext* context) override {
-    const Tensor& contents = context->input(0);
-    OP_REQUIRES(context, TensorShapeUtils::IsScalar(contents.shape()),
-                errors::InvalidArgument("contents must be scalar, got shape ",
-                                        contents.shape().DebugString()));
-    const StringPiece input = contents.scalar<string>()();
-    OP_REQUIRES(context, input.size() <= std::numeric_limits<int>::max(),
-                errors::InvalidArgument("JPEG contents are too large for int: ",
-                                        input.size()));
-
-    // Decode image, allocating tensor once the image size is known
-    Tensor* output = NULL;
-    OP_REQUIRES(
-        context,
-        jpeg::Uncompress(
-            input.data(), input.size(), flags_, nullptr /* nwarn */,
-            [=, &output](int width, int height, int channels) -> uint8* {
-              Status status(context->allocate_output(
-                  0, TensorShape({height, width, channels}), &output));
-              if (!status.ok()) {
-                VLOG(1) << status;
-                context->SetStatus(status);
-                return nullptr;
-              }
-              return output->flat<uint8>().data();
-            }),
-        errors::InvalidArgument("Invalid JPEG data, size ", input.size()));
-  }
-
- private:
-  jpeg::UncompressFlags flags_;
-};
-REGISTER_KERNEL_BUILDER(Name("DecodeJpeg").Device(DEVICE_CPU), DecodeJpegOp);
-
-}  // namespace tensorflow
diff --git a/tensorflow/core/kernels/decode_png_op.cc b/tensorflow/core/kernels/decode_png_op.cc
deleted file mode 100644
index 1906ae7746c4f96e5392c77d6551e4d268304e76..0000000000000000000000000000000000000000
--- a/tensorflow/core/kernels/decode_png_op.cc
+++ /dev/null
@@ -1,118 +0,0 @@
-/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-// See docs in ../ops/image_ops.cc
-
-#include <memory>
-#include "tensorflow/core/framework/op_kernel.h"
-#include "tensorflow/core/framework/register_types.h"
-#include "tensorflow/core/framework/tensor.h"
-#include "tensorflow/core/framework/tensor_shape.h"
-#include "tensorflow/core/framework/types.h"
-#include "tensorflow/core/framework/types.pb.h"
-#include "tensorflow/core/lib/core/status.h"
-#include "tensorflow/core/lib/png/png_io.h"
-#include "tensorflow/core/platform/logging.h"
-
-namespace tensorflow {
-
-// Decode the contents of a PNG file
-class DecodePngOp : public OpKernel {
- public:
-  explicit DecodePngOp(OpKernelConstruction* context) : OpKernel(context) {
-    OP_REQUIRES_OK(context, context->GetAttr("channels", &channels_));
-    OP_REQUIRES(context, channels_ == 0 || channels_ == 1 || channels_ == 3 ||
-                             channels_ == 4,
-                errors::InvalidArgument("channels must be 0, 1, 3, or 4, got ",
-                                        channels_));
-
-    DataType dt;
-    OP_REQUIRES_OK(context, context->GetAttr("dtype", &dt));
-    OP_REQUIRES(
-        context, dt == DataType::DT_UINT8 || dt == DataType::DT_UINT16,
-        errors::InvalidArgument("Type must be UINT8 or UINT16, got ", dt));
-    if (dt == DataType::DT_UINT8) {
-      desired_channel_bits_ = 8;
-    } else {
-      desired_channel_bits_ = 16;
-    }
-  }
-
-  void Compute(OpKernelContext* context) override {
-    const Tensor& contents = context->input(0);
-    OP_REQUIRES(context, TensorShapeUtils::IsScalar(contents.shape()),
-                errors::InvalidArgument("contents must be scalar, got shape ",
-                                        contents.shape().DebugString()));
-
-    // Start decoding image to get shape details
-    const StringPiece data = contents.scalar<string>()();
-    png::DecodeContext decode;
-    OP_REQUIRES(
-        context,
-        png::CommonInitDecode(data, channels_, desired_channel_bits_, &decode),
-        errors::InvalidArgument("Invalid PNG header, data size ", data.size()));
-
-    // Verify that width and height are not too large:
-    // - verify width and height don't overflow int.
-    // - width can later be multiplied by channels_ and sizeof(uint16), so
-    //   verify single dimension is not too large.
-    // - verify when width and height are multiplied together, there are a few
-    //   bits to spare as well.
-    const int width = static_cast<int>(decode.width);
-    const int height = static_cast<int>(decode.height);
-    const int64 total_size =
-        static_cast<int64>(width) * static_cast<int64>(height);
-    if (width != static_cast<int64>(decode.width) || width <= 0 ||
-        width >= (1LL << 27) || height != static_cast<int64>(decode.height) ||
-        height <= 0 || height >= (1LL << 27) || total_size >= (1LL << 29)) {
-      png::CommonFreeDecode(&decode);
-      OP_REQUIRES(context, false,
-                  errors::InvalidArgument("PNG size too large for int: ",
-                                          decode.width, " by ", decode.height));
-    }
-
-    // Allocate tensor
-    Tensor* output = nullptr;
-    const auto status = context->allocate_output(
-        0, TensorShape({height, width, decode.channels}), &output);
-    if (!status.ok()) png::CommonFreeDecode(&decode);
-    OP_REQUIRES_OK(context, status);
-
-    if (desired_channel_bits_ == 8) {
-      // Finish decoding image
-      OP_REQUIRES(
-          context,
-          png::CommonFinishDecode(
-              reinterpret_cast<png_bytep>(output->flat<uint8>().data()),
-              decode.channels * width * sizeof(uint8), &decode),
-          errors::InvalidArgument("Invalid PNG data, size ", data.size()));
-    } else {
-      // Finish decoding image
-      OP_REQUIRES(
-          context,
-          png::CommonFinishDecode(
-              reinterpret_cast<png_bytep>(output->flat<uint16>().data()),
-              decode.channels * width * sizeof(uint16), &decode),
-          errors::InvalidArgument("Invalid PNG data, size ", data.size()));
-    }
-  }
-
- private:
-  int channels_;
-  int desired_channel_bits_;
-};
-REGISTER_KERNEL_BUILDER(Name("DecodePng").Device(DEVICE_CPU), DecodePngOp);
-
-}  // namespace tensorflow
diff --git a/tensorflow/core/kernels/decode_raw_op.cc b/tensorflow/core/kernels/decode_raw_op.cc
index 4247abcd710c316942c40e648ebe315e018b0511..da247161f9aea3f150aa0f4d2d2c2b2543a3cce4 100644
--- a/tensorflow/core/kernels/decode_raw_op.cc
+++ b/tensorflow/core/kernels/decode_raw_op.cc
@@ -50,7 +50,7 @@ class DecodeRawOp : public OpKernel {
       }
     }
     TensorShape out_shape = input.shape();
-    if (str_size == -1) {  // Empty input
+    if (str_size == -1 || str_size == 0) {  // Empty input
       out_shape.AddDim(1);
       Tensor* output_tensor = nullptr;
       OP_REQUIRES_OK(context, context->allocate_output("output", out_shape,
diff --git a/tensorflow/core/kernels/deep_conv2d.cc b/tensorflow/core/kernels/deep_conv2d.cc
index 9e6d8e42a47b64c53cb014be7c0eeece45a7cf1e..a48140147985126bbf8e852d8f9c06bbab638c2d 100644
--- a/tensorflow/core/kernels/deep_conv2d.cc
+++ b/tensorflow/core/kernels/deep_conv2d.cc
@@ -1069,7 +1069,7 @@ struct DeepConv2D<CPUDevice, T> {
       // Allocate temporary buffer 'buffer2', which is first used for
       // transformed input tiles, then re-used for transformed output tiles.
       // Calculate required buffer size for 'buffer2' as max required buffer
-      // between input and output tranform buffer sizes.
+      // between input and output transform buffer sizes.
       const int64 buffer2_tile_transform_size =
           tile_spatial_size * num_tiles * in_depth;
       const int64 buffer2_out_transform_size =
diff --git a/tensorflow/core/kernels/depthwise_conv_op.cc b/tensorflow/core/kernels/depthwise_conv_op.cc
index 26d45f79d82213c602fd3be15cf66e115f922876..2e7213f95686d45a68bb6ca0c25392e4dd672c13 100644
--- a/tensorflow/core/kernels/depthwise_conv_op.cc
+++ b/tensorflow/core/kernels/depthwise_conv_op.cc
@@ -441,7 +441,9 @@ class DepthwiseConv2dNativeOp : public BinaryOp<T> {
       DepthwiseConv2dNativeOp<CPUDevice, T>);
 
 TF_CALL_float(REGISTER_CPU_KERNEL);
+#if defined(PLATFORM_WINDOWS) && !defined(_DEBUG)
 TF_CALL_double(REGISTER_CPU_KERNEL);
+#endif
 
 #if GOOGLE_CUDA
 REGISTER_KERNEL_BUILDER(
diff --git a/tensorflow/core/kernels/depthwise_conv_op_gpu.cu.cc b/tensorflow/core/kernels/depthwise_conv_op_gpu.cu.cc
index 5377d09ec69db701a337352a63d2d00a08e36dce..051d4772449cf9da429a5a901631b992337ab68d 100644
--- a/tensorflow/core/kernels/depthwise_conv_op_gpu.cu.cc
+++ b/tensorflow/core/kernels/depthwise_conv_op_gpu.cu.cc
@@ -24,28 +24,32 @@ limitations under the License.
 
 #if !defined(_MSC_VER)
 #define UNROLL _Pragma("unroll")
+#define NOUNROLL _Pragma("nounroll")
 #else
 #define UNROLL
+#define NOUNROLL
 #endif
 
 namespace tensorflow {
 
-namespace {
-
-typedef Eigen::GpuDevice GPUDevice;
+using Eigen::GpuDevice;
 
 // A Cuda kernel to compute the depthwise convolution forward pass
 // in NHWC format.
-template <typename T>
-__global__ void DepthwiseConv2dGPUKernelNHWC(const DepthwiseArgs args,
-                                             const T* input, const T* filter,
-                                             T* output, int num_outputs) {
+template <typename T, int kKnownFilterWidth, int kKnownFilterHeight,
+          int kKnownDepthMultiplier>
+__global__ void __launch_bounds__(1024, 2)
+    DepthwiseConv2dGPUKernelNHWC(const DepthwiseArgs args, const T* input,
+                                 const T* filter, T* output, int num_outputs) {
   const int in_rows = args.in_rows;
   const int in_cols = args.in_cols;
   const int in_depth = args.in_depth;
-  const int filter_rows = args.filter_rows;
-  const int filter_cols = args.filter_cols;
-  const int depth_multiplier = args.depth_multiplier;
+  const int filter_rows =
+      kKnownFilterHeight < 0 ? args.filter_rows : kKnownFilterHeight;
+  const int filter_cols =
+      kKnownFilterWidth < 0 ? args.filter_cols : kKnownFilterWidth;
+  const int depth_multiplier =
+      kKnownDepthMultiplier < 0 ? args.depth_multiplier : kKnownDepthMultiplier;
   const int stride = args.stride;
   const int pad_rows = args.pad_rows;
   const int pad_cols = args.pad_cols;
@@ -114,16 +118,20 @@ __global__ void DepthwiseConv2dGPUKernelNHWC(const DepthwiseArgs args,
 
 // A Cuda kernel to compute the depthwise convolution forward pass
 // in NCHW format.
-template <typename T>
-__global__ void DepthwiseConv2dGPUKernelNCHW(const DepthwiseArgs args,
-                                             const T* input, const T* filter,
-                                             T* output, int num_outputs) {
+template <typename T, int kKnownFilterWidth, int kKnownFilterHeight,
+          int kKnownDepthMultiplier>
+__global__ void __launch_bounds__(1024, 2)
+    DepthwiseConv2dGPUKernelNCHW(const DepthwiseArgs args, const T* input,
+                                 const T* filter, T* output, int num_outputs) {
   const int in_rows = args.in_rows;
   const int in_cols = args.in_cols;
   const int in_depth = args.in_depth;
-  const int filter_rows = args.filter_rows;
-  const int filter_cols = args.filter_cols;
-  const int depth_multiplier = args.depth_multiplier;
+  const int filter_rows =
+      kKnownFilterHeight < 0 ? args.filter_rows : kKnownFilterHeight;
+  const int filter_cols =
+      kKnownFilterWidth < 0 ? args.filter_cols : kKnownFilterWidth;
+  const int depth_multiplier =
+      kKnownDepthMultiplier < 0 ? args.depth_multiplier : kKnownDepthMultiplier;
   const int stride = args.stride;
   const int pad_rows = args.pad_rows;
   const int pad_cols = args.pad_cols;
@@ -235,29 +243,58 @@ __global__ void DepthwiseConv2dGPUKernelNCHW(const DepthwiseArgs args,
   }
 }
 
-}  // namespace
+template <typename T, int kKnownFilterWidth, int kKnownFilterHeight,
+          int kKnownDepthMultiplier>
+void LaunchDepthwiseConv2dGPU(const GpuDevice& d, const DepthwiseArgs args,
+                              const T* input, const T* filter, T* output,
+                              TensorFormat data_format) {
+  const int num_outputs =
+      args.batch * args.out_rows * args.out_cols * args.out_depth;
+  // The compile-time constant version runs faster with a single block.
+  const int max_block_count = kKnownFilterWidth < 0 || kKnownFilterHeight < 0 ||
+                                      kKnownDepthMultiplier < 0 ||
+                                      args.out_rows * args.out_cols <= 256
+                                  ? std::numeric_limits<int>::max()
+                                  : d.getNumCudaMultiProcessors();
+  if (data_format == FORMAT_NHWC) {
+    CudaLaunchConfig config = GetCudaLaunchConfig(
+        num_outputs, d,
+        DepthwiseConv2dGPUKernelNHWC<T, kKnownFilterWidth, kKnownFilterHeight,
+                                     kKnownDepthMultiplier>,
+        0);
+    DepthwiseConv2dGPUKernelNHWC<T, kKnownFilterWidth, kKnownFilterHeight,
+                                 kKnownDepthMultiplier>
+        <<<std::min(max_block_count, config.block_count),
+           config.thread_per_block, 0, d.stream()>>>(args, input, filter,
+                                                     output, num_outputs);
+  } else if (data_format == FORMAT_NCHW) {
+    CudaLaunchConfig config = GetCudaLaunchConfig(
+        num_outputs, d,
+        DepthwiseConv2dGPUKernelNCHW<T, kKnownFilterWidth, kKnownFilterHeight,
+                                     kKnownDepthMultiplier>,
+        0);
+    DepthwiseConv2dGPUKernelNCHW<T, kKnownFilterWidth, kKnownFilterHeight,
+                                 kKnownDepthMultiplier>
+        <<<std::min(max_block_count, config.block_count),
+           config.thread_per_block, 0, d.stream()>>>(args, input, filter,
+                                                     output, num_outputs);
+  } else {
+    assert(false);
+  }
+}
 
 // A simple launch pad to launch the Cuda kernel for depthwise convolution.
 template <typename T>
 struct DepthwiseConv2dGPULaunch {
-  static void Run(const GPUDevice& d, const DepthwiseArgs args, const T* input,
+  static void Run(const GpuDevice& d, const DepthwiseArgs args, const T* input,
                   const T* filter, T* output, TensorFormat data_format) {
-    // In this kernel, each thread is computing the gradients from one element
-    // in the out_backprop. Note that one element in the out_backprop can map
-    // to multiple filter elements.
-    const int num_outputs =
-        args.batch * args.out_rows * args.out_cols * args.out_depth;
-    CudaLaunchConfig config = GetCudaLaunchConfig(num_outputs, d);
-    if (data_format == FORMAT_NHWC) {
-      DepthwiseConv2dGPUKernelNHWC<T>
-          <<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
-              args, input, filter, output, num_outputs);
-    } else if (data_format == FORMAT_NCHW) {
-      DepthwiseConv2dGPUKernelNCHW<T>
-          <<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
-              args, input, filter, output, num_outputs);
+    if (args.filter_rows == 3 && args.filter_cols == 3 &&
+        args.depth_multiplier == 1) {
+      LaunchDepthwiseConv2dGPU<T, 3, 3, 1>(d, args, input, filter, output,
+                                           data_format);
     } else {
-      assert(false);
+      LaunchDepthwiseConv2dGPU<T, -1, -1, -1>(d, args, input, filter, output,
+                                              data_format);
     }
   }
 };
@@ -266,18 +303,22 @@ template struct DepthwiseConv2dGPULaunch<float>;
 template struct DepthwiseConv2dGPULaunch<double>;
 
 // A Cuda kernel to compute the depthwise convolution backprop w.r.t. input.
-template <typename T, int KNOWN_DEPTH_MULTIPLIER>
-__global__ void DepthwiseConv2dBackpropInputGPUKernelNHWC(
-    const DepthwiseArgs args, const T* out_backprop, const T* filter,
-    T* in_backprop, int num_in_backprop) {
+template <typename T, int kKnownFilterWidth, int kKnownFilterHeight,
+          int kKnownDepthMultiplier>
+__global__ void __launch_bounds__(640, 2)
+    DepthwiseConv2dBackpropInputGPUKernelNHWC(const DepthwiseArgs args,
+                                              const T* out_backprop,
+                                              const T* filter, T* in_backprop,
+                                              int num_in_backprop) {
   const int in_rows = args.in_rows;
   const int in_cols = args.in_cols;
   const int in_depth = args.in_depth;
-  const int filter_rows = args.filter_rows;
-  const int filter_cols = args.filter_cols;
-  const int depth_multiplier = KNOWN_DEPTH_MULTIPLIER == -1
-                                   ? args.depth_multiplier
-                                   : KNOWN_DEPTH_MULTIPLIER;
+  const int filter_rows =
+      kKnownFilterHeight < 0 ? args.filter_rows : kKnownFilterHeight;
+  const int filter_cols =
+      kKnownFilterWidth < 0 ? args.filter_cols : kKnownFilterWidth;
+  const int depth_multiplier =
+      kKnownDepthMultiplier < 0 ? args.depth_multiplier : kKnownDepthMultiplier;
   const int stride = args.stride;
   const int pad_rows = args.pad_rows;
   const int pad_cols = args.pad_cols;
@@ -301,14 +342,12 @@ __global__ void DepthwiseConv2dBackpropInputGPUKernelNHWC(
         tf_max(0, (in_c - filter_cols + pad_cols + stride) / stride);
     const int out_c_end = tf_min(out_cols - 1, (in_c + pad_cols) / stride);
 
-#pragma nounroll
-    for (int out_r = out_r_start; out_r <= out_r_end; ++out_r) {
+    NOUNROLL for (int out_r = out_r_start; out_r <= out_r_end; ++out_r) {
       const int f_r = in_r + pad_rows - out_r * stride;
       const int temp_out_backprop_offset =
           out_depth * out_cols * (out_r + out_rows * b);
       const int temp_filter_offset = filter_cols * f_r;
-#pragma nounroll
-      for (int out_c = out_c_start; out_c <= out_c_end; ++out_c) {
+      NOUNROLL for (int out_c = out_c_start; out_c <= out_c_end; ++out_c) {
         const int f_c = in_c + pad_cols - out_c * stride;
         int filter_offset =
             depth_multiplier * (in_d + in_depth * (f_c + temp_filter_offset));
@@ -328,8 +367,9 @@ __global__ void DepthwiseConv2dBackpropInputGPUKernelNHWC(
   }
 }
 
-template <typename T>
-__global__ void __launch_bounds__(1024)
+template <typename T, int kKnownFilterWidth, int kKnownFilterHeight,
+          int kKnownDepthMultiplier>
+__global__ void __launch_bounds__(640, 2)
     DepthwiseConv2dBackpropInputGPUKernelNCHW(const DepthwiseArgs args,
                                               const T* out_backprop,
                                               const T* filter, T* in_backprop,
@@ -337,9 +377,12 @@ __global__ void __launch_bounds__(1024)
   const int in_rows = args.in_rows;
   const int in_cols = args.in_cols;
   const int in_depth = args.in_depth;
-  const int filter_rows = args.filter_rows;
-  const int filter_cols = args.filter_cols;
-  const int depth_multiplier = args.depth_multiplier;
+  const int filter_rows =
+      kKnownFilterHeight < 0 ? args.filter_rows : kKnownFilterHeight;
+  const int filter_cols =
+      kKnownFilterWidth < 0 ? args.filter_cols : kKnownFilterWidth;
+  const int depth_multiplier =
+      kKnownDepthMultiplier < 0 ? args.depth_multiplier : kKnownDepthMultiplier;
   const int stride = args.stride;
   const int pad_rows = args.pad_rows;
   const int pad_cols = args.pad_cols;
@@ -395,34 +438,57 @@ __global__ void __launch_bounds__(1024)
   }
 }
 
+template <typename T, int kKnownFilterWidth, int kKnownFilterHeight,
+          int kKnownDepthMultiplier>
+void LaunchDepthwiseConv2dBackpropInputGPU(const GpuDevice& d,
+                                           const DepthwiseArgs args,
+                                           const T* out_backprop,
+                                           const T* filter, T* in_backprop,
+                                           TensorFormat data_format) {
+  const int num_in_backprop =
+      args.batch * args.in_rows * args.in_cols * args.in_depth;
+  if (data_format == FORMAT_NHWC) {
+    CudaLaunchConfig config = GetCudaLaunchConfig(
+        num_in_backprop, d,
+        DepthwiseConv2dBackpropInputGPUKernelNHWC<
+            T, kKnownFilterWidth, kKnownFilterHeight, kKnownDepthMultiplier>,
+        0);
+    DepthwiseConv2dBackpropInputGPUKernelNHWC<
+        T, kKnownFilterWidth, kKnownFilterHeight, kKnownDepthMultiplier>
+        <<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
+            args, out_backprop, filter, in_backprop, num_in_backprop);
+  } else if (data_format == FORMAT_NCHW) {
+    CudaLaunchConfig config = GetCudaLaunchConfig(
+        num_in_backprop, d,
+        DepthwiseConv2dBackpropInputGPUKernelNCHW<
+            T, kKnownFilterWidth, kKnownFilterHeight, kKnownDepthMultiplier>,
+        0);
+    DepthwiseConv2dBackpropInputGPUKernelNCHW<
+        T, kKnownFilterWidth, kKnownFilterHeight, kKnownDepthMultiplier>
+        <<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
+            args, out_backprop, filter, in_backprop, num_in_backprop);
+  } else {
+    assert(false);
+  }
+}
+
 // A simple launch pad to launch the Cuda kernel for depthwise convolution.
 template <typename T>
 struct DepthwiseConv2dBackpropInputGPULaunch {
-  static void Run(const GPUDevice& d, const DepthwiseArgs args,
+  static void Run(const GpuDevice& d, const DepthwiseArgs args,
                   const T* out_backprop, const T* filter, T* in_backprop,
                   TensorFormat data_format) {
-    const int num_in_backprop =
-        args.batch * args.in_rows * args.in_cols * args.in_depth;
-
-    CudaLaunchConfig config = GetCudaLaunchConfig(num_in_backprop, d);
-    // Increase block count for when there are more warps/SM than threads/SM.
-    config.block_count *= 4;
-    if (data_format == FORMAT_NHWC) {
-      if (args.depth_multiplier == 1) {
-        DepthwiseConv2dBackpropInputGPUKernelNHWC<T, 1>
-            <<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
-                args, out_backprop, filter, in_backprop, num_in_backprop);
+    if (args.depth_multiplier == 1) {
+      if (args.filter_rows == 3 && args.filter_cols == 3) {
+        LaunchDepthwiseConv2dBackpropInputGPU<T, 3, 3, 1>(
+            d, args, out_backprop, filter, in_backprop, data_format);
       } else {
-        DepthwiseConv2dBackpropInputGPUKernelNHWC<T, -1>
-            <<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
-                args, out_backprop, filter, in_backprop, num_in_backprop);
+        LaunchDepthwiseConv2dBackpropInputGPU<T, -1, -1, 1>(
+            d, args, out_backprop, filter, in_backprop, data_format);
       }
-    } else if (data_format == FORMAT_NCHW) {
-      DepthwiseConv2dBackpropInputGPUKernelNCHW<T>
-          <<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
-              args, out_backprop, filter, in_backprop, num_in_backprop);
     } else {
-      assert(false);
+      LaunchDepthwiseConv2dBackpropInputGPU<T, -1, -1, -1>(
+          d, args, out_backprop, filter, in_backprop, data_format);
     }
   }
 };
@@ -431,16 +497,23 @@ template struct DepthwiseConv2dBackpropInputGPULaunch<float>;
 template struct DepthwiseConv2dBackpropInputGPULaunch<double>;
 
 // A Cuda kernel to compute the depthwise convolution backprop w.r.t. filter.
-template <typename T>
-__global__ void DepthwiseConv2dBackpropFilterGPUKernelNHWC(
-    const DepthwiseArgs args, const T* out_backprop, const T* input,
-    T* filter_backprop, int num_out_backprop) {
+template <typename T, int kKnownFilterWidth, int kKnownFilterHeight,
+          int kKnownDepthMultiplier>
+__global__ void __launch_bounds__(640, 2)
+    DepthwiseConv2dBackpropFilterGPUKernelNHWC(const DepthwiseArgs args,
+                                               const T* out_backprop,
+                                               const T* input,
+                                               T* filter_backprop,
+                                               int num_out_backprop) {
   const int in_rows = args.in_rows;
   const int in_cols = args.in_cols;
   const int in_depth = args.in_depth;
-  const int filter_rows = args.filter_rows;
-  const int filter_cols = args.filter_cols;
-  const int depth_multiplier = args.depth_multiplier;
+  const int filter_rows =
+      kKnownFilterHeight < 0 ? args.filter_rows : kKnownFilterHeight;
+  const int filter_cols =
+      kKnownFilterWidth < 0 ? args.filter_cols : kKnownFilterWidth;
+  const int depth_multiplier =
+      kKnownDepthMultiplier < 0 ? args.depth_multiplier : kKnownDepthMultiplier;
   const int stride = args.stride;
   const int pad_rows = args.pad_rows;
   const int pad_cols = args.pad_cols;
@@ -518,16 +591,23 @@ __global__ void DepthwiseConv2dBackpropFilterGPUKernelNHWC(
 }
 
 // A Cuda kernel to compute the depthwise convolution backprop w.r.t. filter.
-template <typename T>
-__global__ void DepthwiseConv2dBackpropFilterGPUKernelNCHW(
-    const DepthwiseArgs args, const T* out_backprop, const T* input,
-    T* filter_backprop, int num_out_backprop) {
+template <typename T, int kKnownFilterWidth, int kKnownFilterHeight,
+          int kKnownDepthMultiplier>
+__global__ void __launch_bounds__(640, 2)
+    DepthwiseConv2dBackpropFilterGPUKernelNCHW(const DepthwiseArgs args,
+                                               const T* out_backprop,
+                                               const T* input,
+                                               T* filter_backprop,
+                                               int num_out_backprop) {
   const int in_rows = args.in_rows;
   const int in_cols = args.in_cols;
   const int in_depth = args.in_depth;
-  const int filter_rows = args.filter_rows;
-  const int filter_cols = args.filter_cols;
-  const int depth_multiplier = args.depth_multiplier;
+  const int filter_rows =
+      kKnownFilterHeight < 0 ? args.filter_rows : kKnownFilterHeight;
+  const int filter_cols =
+      kKnownFilterWidth < 0 ? args.filter_cols : kKnownFilterWidth;
+  const int depth_multiplier =
+      kKnownDepthMultiplier < 0 ? args.depth_multiplier : kKnownDepthMultiplier;
   const int stride = args.stride;
   const int pad_rows = args.pad_rows;
   const int pad_cols = args.pad_cols;
@@ -610,28 +690,53 @@ __global__ void DepthwiseConv2dBackpropFilterGPUKernelNCHW(
   }
 }
 
+template <typename T, int kKnownFilterWidth, int kKnownFilterHeight,
+          int kKnownDepthMultiplier>
+void LaunchDepthwiseConv2dBackpropFilterGPU(const GpuDevice& d,
+                                            const DepthwiseArgs args,
+                                            const T* out_backprop,
+                                            const T* input, T* filter_backprop,
+                                            TensorFormat data_format) {
+  const int num_out_backprop =
+      args.batch * args.out_rows * args.out_cols * args.out_depth;
+  if (data_format == FORMAT_NHWC) {
+    CudaLaunchConfig config = GetCudaLaunchConfig(
+        num_out_backprop, d,
+        DepthwiseConv2dBackpropFilterGPUKernelNHWC<
+            T, kKnownFilterWidth, kKnownFilterHeight, kKnownDepthMultiplier>,
+        0);
+    DepthwiseConv2dBackpropFilterGPUKernelNHWC<
+        T, kKnownFilterWidth, kKnownFilterHeight, kKnownDepthMultiplier>
+        <<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
+            args, out_backprop, input, filter_backprop, num_out_backprop);
+  } else if (data_format == FORMAT_NCHW) {
+    CudaLaunchConfig config = GetCudaLaunchConfig(
+        num_out_backprop, d,
+        DepthwiseConv2dBackpropFilterGPUKernelNCHW<
+            T, kKnownFilterWidth, kKnownFilterHeight, kKnownDepthMultiplier>,
+        0);
+    DepthwiseConv2dBackpropFilterGPUKernelNCHW<
+        T, kKnownFilterWidth, kKnownFilterHeight, kKnownDepthMultiplier>
+        <<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
+            args, out_backprop, input, filter_backprop, num_out_backprop);
+  } else {
+    assert(false);
+  }
+}
+
 // A simple launch pad to launch the Cuda kernel for depthwise convolution.
 template <typename T>
 struct DepthwiseConv2dBackpropFilterGPULaunch {
-  static void Run(const GPUDevice& d, const DepthwiseArgs args,
+  static void Run(const GpuDevice& d, const DepthwiseArgs args,
                   const T* out_backprop, const T* input, T* filter_backprop,
                   TensorFormat data_format) {
-    // In this kernel, each thread is computing the gradients for one element in
-    // the out_backprop.
-    const int num_out_backprop =
-        args.batch * args.out_rows * args.out_cols * args.out_depth;
-    CudaLaunchConfig config = GetCudaLaunchConfig(num_out_backprop, d);
-
-    if (data_format == FORMAT_NHWC) {
-      DepthwiseConv2dBackpropFilterGPUKernelNHWC<T>
-          <<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
-              args, out_backprop, input, filter_backprop, num_out_backprop);
-    } else if (data_format == FORMAT_NCHW) {
-      DepthwiseConv2dBackpropFilterGPUKernelNCHW<T>
-          <<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
-              args, out_backprop, input, filter_backprop, num_out_backprop);
+    if (args.filter_rows == 3 && args.filter_cols == 3 &&
+        args.depth_multiplier == 1) {
+      LaunchDepthwiseConv2dBackpropFilterGPU<T, 3, 3, 1>(
+          d, args, out_backprop, input, filter_backprop, data_format);
     } else {
-      assert(false);
+      LaunchDepthwiseConv2dBackpropFilterGPU<T, -1, -1, -1>(
+          d, args, out_backprop, input, filter_backprop, data_format);
     }
   }
 };
diff --git a/tensorflow/core/kernels/fake_quant_ops.cc b/tensorflow/core/kernels/fake_quant_ops.cc
index 41f9c218437f3a911843673affd7c4ae8813f4b9..c198f67bbb69b7fa38f5c5260e8210d21a289453 100644
--- a/tensorflow/core/kernels/fake_quant_ops.cc
+++ b/tensorflow/core/kernels/fake_quant_ops.cc
@@ -48,6 +48,10 @@ namespace tensorflow {
 
 typedef Eigen::ThreadPoolDevice CPUDevice;
 
+namespace {
+bool IsNumBitsValid(int num_bits) { return num_bits >= 2 && num_bits <= 8; }
+}  // namespace
+
 // -----------------------------------------------------------------------------
 // Implementation of FakeQuantWithMinMaxArgsOp, see its documentation in
 // core/ops/array_ops.cc.
@@ -60,19 +64,25 @@ class FakeQuantWithMinMaxArgsOp
       : Base::UnaryElementWiseOp(context) {
     OP_REQUIRES_OK(context, context->GetAttr("min", &min_));
     OP_REQUIRES_OK(context, context->GetAttr("max", &max_));
+    int num_bits;
+    OP_REQUIRES_OK(context, context->GetAttr("num_bits", &num_bits));
     OP_REQUIRES(context, min_ < max_,
                 InvalidArgument("min has to be smaller than max, was: ", min_,
                                 " >= ", max_));
+    OP_REQUIRES(context, IsNumBitsValid(num_bits),
+                InvalidArgument("num_bits must be between 2 and 8, inclusive"));
+    steps_ = (1 << num_bits) - 1;
   }
 
   void Operate(OpKernelContext* context, const Tensor& input, Tensor* output) {
     FakeQuantWithMinMaxArgsFunctor<Device> functor;
     functor(context->eigen_device<Device>(), input.flat<float>(), min_, max_,
-            output->flat<float>());
+            steps_, output->flat<float>());
   }
  private:
   float min_;
   float max_;
+  int steps_;
 };
 
 // Implementation of FakeQuantWithMinMaxArgsGradientOp, see its documentation in
@@ -88,9 +98,14 @@ class FakeQuantWithMinMaxArgsGradientOp
       : Base::BinaryElementWiseOp(context) {
     OP_REQUIRES_OK(context, context->GetAttr("min", &min_));
     OP_REQUIRES_OK(context, context->GetAttr("max", &max_));
+    int num_bits;
+    OP_REQUIRES_OK(context, context->GetAttr("num_bits", &num_bits));
     OP_REQUIRES(context, min_ < max_,
                 InvalidArgument("min has to be smaller than max, was: ", min_,
                                 " >= ", max_));
+    OP_REQUIRES(context, IsNumBitsValid(num_bits),
+                InvalidArgument("num_bits must be between 2 and 8, inclusive"));
+    steps_ = (1 << num_bits) - 1;
   }
 
   template <int NDIMS>
@@ -105,11 +120,12 @@ class FakeQuantWithMinMaxArgsGradientOp
                 InvalidArgument("gradient and input must be the same size"));
     FakeQuantWithMinMaxArgsGradientFunctor<Device> functor;
     functor(context->eigen_device<Device>(), gradient.flat<float>(),
-            input.flat<float>(), min_, max_, output->flat<float>());
+            input.flat<float>(), min_, max_, steps_, output->flat<float>());
   }
  private:
   float min_;
   float max_;
+  int steps_;
 };
 
 REGISTER_KERNEL_BUILDER(Name("FakeQuantWithMinMaxArgs").Device(DEVICE_CPU),
@@ -124,20 +140,16 @@ typedef Eigen::GpuDevice GPUDevice;
 // Forward declarations for functor specializations for GPU.
 template <>
 void FakeQuantWithMinMaxArgsFunctor<GPUDevice>::operator()(
-    const GPUDevice& d,
-    typename TTypes<float>::ConstFlat inputs,
-    const float min, const float max,
-    typename TTypes<float>::Flat outputs);
+    const GPUDevice& d, typename TTypes<float>::ConstFlat inputs, float min,
+    float max, int steps, typename TTypes<float>::Flat outputs);
 extern template struct FakeQuantWithMinMaxArgsFunctor<GPUDevice>;
 REGISTER_KERNEL_BUILDER(Name("FakeQuantWithMinMaxArgs").Device(DEVICE_GPU),
                         FakeQuantWithMinMaxArgsOp<GPUDevice>);
 
 template <>
 void FakeQuantWithMinMaxArgsGradientFunctor<GPUDevice>::operator()(
-    const GPUDevice& d,
-    typename TTypes<float>::ConstFlat gradients,
-    typename TTypes<float>::ConstFlat inputs,
-    const float min, const float max,
+    const GPUDevice& d, typename TTypes<float>::ConstFlat gradients,
+    typename TTypes<float>::ConstFlat inputs, float min, float max, int steps,
     typename TTypes<float>::Flat backprops);
 REGISTER_KERNEL_BUILDER(
     Name("FakeQuantWithMinMaxArgsGradient").Device(DEVICE_GPU),
@@ -152,6 +164,11 @@ class FakeQuantWithMinMaxVarsOp : public OpKernel {
  public:
   explicit FakeQuantWithMinMaxVarsOp(OpKernelConstruction* context)
       : OpKernel::OpKernel(context) {
+    int num_bits;
+    OP_REQUIRES_OK(context, context->GetAttr("num_bits", &num_bits));
+    OP_REQUIRES(context, IsNumBitsValid(num_bits),
+                InvalidArgument("num_bits must be between 2 and 8, inclusive"));
+    steps_ = (1 << num_bits) - 1;
 #ifndef FAKE_QUANT_NO_DEBUG
     OP_REQUIRES_OK(context,
                    context->allocate_persistent(DT_BOOL, {},
@@ -175,7 +192,7 @@ class FakeQuantWithMinMaxVarsOp : public OpKernel {
 
     FakeQuantWithMinMaxVarsFunctor<Device> functor;
     functor(context->eigen_device<Device>(), input.flat<float>(),
-            min.scalar<float>(), max.scalar<float>(),
+            min.scalar<float>(), max.scalar<float>(), steps_,
 #ifndef FAKE_QUANT_NO_DEBUG
             check_min_max->scalar<bool>(),
 #endif
@@ -183,6 +200,7 @@ class FakeQuantWithMinMaxVarsOp : public OpKernel {
   }
 
  private:
+  int steps_;
 #ifndef FAKE_QUANT_NO_DEBUG
   PersistentTensor check_min_max_handle_;
 #endif
@@ -195,6 +213,11 @@ class FakeQuantWithMinMaxVarsGradientOp : public OpKernel {
  public:
   explicit FakeQuantWithMinMaxVarsGradientOp(OpKernelConstruction* context)
       : OpKernel::OpKernel(context) {
+    int num_bits;
+    OP_REQUIRES_OK(context, context->GetAttr("num_bits", &num_bits));
+    OP_REQUIRES(context, IsNumBitsValid(num_bits),
+                InvalidArgument("num_bits must be between 2 and 8, inclusive"));
+    steps_ = (1 << num_bits) - 1;
 #ifndef FAKE_QUANT_NO_DEBUG
     OP_REQUIRES_OK(context,
                    context->allocate_persistent(DT_BOOL, {},
@@ -231,6 +254,7 @@ class FakeQuantWithMinMaxVarsGradientOp : public OpKernel {
     FakeQuantWithMinMaxVarsGradientFunctor<Device> functor;
     functor(context->eigen_device<Device>(), gradient.flat<float>(),
             input.flat<float>(), min.scalar<float>(), max.scalar<float>(),
+            steps_,
 #ifndef FAKE_QUANT_NO_DEBUG
             check_min_max->scalar<bool>(),
 #endif
@@ -239,6 +263,7 @@ class FakeQuantWithMinMaxVarsGradientOp : public OpKernel {
   }
 
  private:
+  int steps_;
 #ifndef FAKE_QUANT_NO_DEBUG
   PersistentTensor check_min_max_handle_;
 #endif
@@ -253,10 +278,9 @@ REGISTER_KERNEL_BUILDER(
 #if GOOGLE_CUDA
 template <>
 void FakeQuantWithMinMaxVarsFunctor<GPUDevice>::operator()(
-    const GPUDevice& d,
-    typename TTypes<float>::ConstFlat inputs,
+    const GPUDevice& d, typename TTypes<float>::ConstFlat inputs,
     typename TTypes<float>::ConstScalar min,
-    typename TTypes<float>::ConstScalar max,
+    typename TTypes<float>::ConstScalar max, int steps,
 #ifndef FAKE_QUANT_NO_DEBUG
     typename TTypes<bool>::Scalar check_min_max,
 #endif
@@ -270,11 +294,10 @@ REGISTER_KERNEL_BUILDER(Name("FakeQuantWithMinMaxVars")
 
 template <>
 void FakeQuantWithMinMaxVarsGradientFunctor<GPUDevice>::operator()(
-    const GPUDevice& d,
-    typename TTypes<float>::ConstFlat gradients,
+    const GPUDevice& d, typename TTypes<float>::ConstFlat gradients,
     typename TTypes<float>::ConstFlat inputs,
     typename TTypes<float>::ConstScalar min,
-    typename TTypes<float>::ConstScalar max,
+    typename TTypes<float>::ConstScalar max, int steps,
 #ifndef FAKE_QUANT_NO_DEBUG
     typename TTypes<bool>::Scalar check_min_max,
 #endif
@@ -297,6 +320,11 @@ class FakeQuantWithMinMaxVarsPerChannelOp : public OpKernel {
  public:
   explicit FakeQuantWithMinMaxVarsPerChannelOp(OpKernelConstruction* context)
       : OpKernel::OpKernel(context) {
+    int num_bits;
+    OP_REQUIRES_OK(context, context->GetAttr("num_bits", &num_bits));
+    OP_REQUIRES(context, IsNumBitsValid(num_bits),
+                InvalidArgument("num_bits must be between 2 and 8, inclusive"));
+    steps_ = (1 << num_bits) - 1;
 #ifndef FAKE_QUANT_NO_DEBUG
     OP_REQUIRES_OK(context,
                    context->allocate_persistent(DT_BOOL, {},
@@ -330,7 +358,7 @@ class FakeQuantWithMinMaxVarsPerChannelOp : public OpKernel {
         FakeQuant4WithMinMaxVarsPerChannelFunctor<Device> functor;
         functor(context->eigen_device<Device>(), input.dim_size(0),
                 input.dim_size(1), input.dim_size(2), input.dim_size(3),
-                input.flat<float>(), min.vec<float>(), max.vec<float>(),
+                input.flat<float>(), min.vec<float>(), max.vec<float>(), steps_,
 #ifndef FAKE_QUANT_NO_DEBUG
                 check_min_max->scalar<bool>(),
 #endif
@@ -339,9 +367,9 @@ class FakeQuantWithMinMaxVarsPerChannelOp : public OpKernel {
       }
       case 2: {
         FakeQuant2WithMinMaxVarsPerChannelFunctor<Device> functor;
-        functor(context->eigen_device<Device>(),
-                input.dim_size(0), input.dim_size(1),
-                input.flat<float>(), min.vec<float>(), max.vec<float>(),
+        functor(context->eigen_device<Device>(), input.dim_size(0),
+                input.dim_size(1), input.flat<float>(), min.vec<float>(),
+                max.vec<float>(), steps_,
 #ifndef FAKE_QUANT_NO_DEBUG
                 check_min_max->scalar<bool>(),
 #endif
@@ -350,8 +378,8 @@ class FakeQuantWithMinMaxVarsPerChannelOp : public OpKernel {
       }
       case 1: {
         FakeQuant1WithMinMaxVarsPerChannelFunctor<Device> functor;
-        functor(context->eigen_device<Device>(),
-                input.vec<float>(), min.vec<float>(), max.vec<float>(),
+        functor(context->eigen_device<Device>(), input.vec<float>(),
+                min.vec<float>(), max.vec<float>(), steps_,
 #ifndef FAKE_QUANT_NO_DEBUG
                 check_min_max->scalar<bool>(),
 #endif
@@ -366,6 +394,7 @@ class FakeQuantWithMinMaxVarsPerChannelOp : public OpKernel {
   }
 
  private:
+  int steps_;
 #ifndef FAKE_QUANT_NO_DEBUG
   PersistentTensor check_min_max_handle_;
 #endif
@@ -378,6 +407,11 @@ class FakeQuantWithMinMaxVarsPerChannelGradientOp : public OpKernel {
  public:
   explicit FakeQuantWithMinMaxVarsPerChannelGradientOp(
       OpKernelConstruction* context) : OpKernel::OpKernel(context) {
+    int num_bits;
+    OP_REQUIRES_OK(context, context->GetAttr("num_bits", &num_bits));
+    OP_REQUIRES(context, IsNumBitsValid(num_bits),
+                InvalidArgument("num_bits must be between 2 and 8, inclusive"));
+    steps_ = (1 << num_bits) - 1;
 #ifndef FAKE_QUANT_NO_DEBUG
     OP_REQUIRES_OK(context,
                    context->allocate_persistent(DT_BOOL, {},
@@ -423,38 +457,36 @@ class FakeQuantWithMinMaxVarsPerChannelGradientOp : public OpKernel {
         FakeQuant4WithMinMaxVarsPerChannelGradientFunctor<Device> functor;
         functor(context->eigen_device<Device>(), input.dim_size(0),
                 input.dim_size(1), input.dim_size(2), input.dim_size(3),
-                gradient.flat<float>(), input.flat<float>(),
-                min.vec<float>(), max.vec<float>(),
+                gradient.flat<float>(), input.flat<float>(), min.vec<float>(),
+                max.vec<float>(), steps_,
 #ifndef FAKE_QUANT_NO_DEBUG
                 check_min_max->scalar<bool>(),
 #endif
-                grad_wrt_input->flat<float>(),
-                grad_wrt_min->vec<float>(), grad_wrt_max->vec<float>());
+                grad_wrt_input->flat<float>(), grad_wrt_min->vec<float>(),
+                grad_wrt_max->vec<float>());
         break;
       }
       case 2: {
         FakeQuant2WithMinMaxVarsPerChannelGradientFunctor<Device> functor;
-        functor(context->eigen_device<Device>(),
-                input.dim_size(0), input.dim_size(1),
-                gradient.flat<float>(), input.flat<float>(),
-                min.vec<float>(), max.vec<float>(),
+        functor(context->eigen_device<Device>(), input.dim_size(0),
+                input.dim_size(1), gradient.flat<float>(), input.flat<float>(),
+                min.vec<float>(), max.vec<float>(), steps_,
 #ifndef FAKE_QUANT_NO_DEBUG
                 check_min_max->scalar<bool>(),
 #endif
-                grad_wrt_input->flat<float>(),
-                grad_wrt_min->vec<float>(), grad_wrt_max->vec<float>());
+                grad_wrt_input->flat<float>(), grad_wrt_min->vec<float>(),
+                grad_wrt_max->vec<float>());
         break;
       }
       case 1: {
         FakeQuant1WithMinMaxVarsPerChannelGradientFunctor<Device> functor;
-        functor(context->eigen_device<Device>(),
-                gradient.vec<float>(), input.vec<float>(),
-                min.vec<float>(), max.vec<float>(),
+        functor(context->eigen_device<Device>(), gradient.vec<float>(),
+                input.vec<float>(), min.vec<float>(), max.vec<float>(), steps_,
 #ifndef FAKE_QUANT_NO_DEBUG
                 check_min_max->scalar<bool>(),
 #endif
-                grad_wrt_input->vec<float>(),
-                grad_wrt_min->vec<float>(), grad_wrt_max->vec<float>());
+                grad_wrt_input->vec<float>(), grad_wrt_min->vec<float>(),
+                grad_wrt_max->vec<float>());
         break;
       }
       default:
@@ -465,6 +497,7 @@ class FakeQuantWithMinMaxVarsPerChannelGradientOp : public OpKernel {
   }
 
  private:
+  int steps_;
 #ifndef FAKE_QUANT_NO_DEBUG
   PersistentTensor check_min_max_handle_;
 #endif
@@ -480,10 +513,9 @@ REGISTER_KERNEL_BUILDER(Name("FakeQuantWithMinMaxVarsPerChannelGradient")
 #if GOOGLE_CUDA
 template <>
 void FakeQuant1WithMinMaxVarsPerChannelFunctor<GPUDevice>::operator()(
-    const GPUDevice& d,
-    typename TTypes<float>::ConstVec inputs,
-    typename TTypes<float>::ConstVec min,
-    typename TTypes<float>::ConstVec max,
+    const GPUDevice& d, typename TTypes<float>::ConstVec inputs,
+    typename TTypes<float>::ConstVec min, typename TTypes<float>::ConstVec max,
+    int steps,
 #ifndef FAKE_QUANT_NO_DEBUG
     typename TTypes<bool>::Scalar check_min_max,
 #endif
@@ -495,7 +527,7 @@ void FakeQuant2WithMinMaxVarsPerChannelFunctor<GPUDevice>::operator()(
     const GPUDevice& d, const Index batch_size, const Index depth,
     typename TTypes<float>::ConstFlat inputs,
     typename TTypes<float>::ConstFlat min,
-    typename TTypes<float>::ConstFlat max,
+    typename TTypes<float>::ConstFlat max, int steps,
 #ifndef FAKE_QUANT_NO_DEBUG
     typename TTypes<bool>::Scalar check_min_max,
 #endif
@@ -508,7 +540,7 @@ void FakeQuant4WithMinMaxVarsPerChannelFunctor<GPUDevice>::operator()(
     const Index width, const Index depth,
     typename TTypes<float>::ConstFlat inputs,
     typename TTypes<float>::ConstFlat min,
-    typename TTypes<float>::ConstFlat max,
+    typename TTypes<float>::ConstFlat max, int steps,
 #ifndef FAKE_QUANT_NO_DEBUG
     typename TTypes<bool>::Scalar check_min_max,
 #endif
@@ -523,11 +555,10 @@ REGISTER_KERNEL_BUILDER(Name("FakeQuantWithMinMaxVarsPerChannel")
 
 template <>
 void FakeQuant1WithMinMaxVarsPerChannelGradientFunctor<GPUDevice>::operator()(
-    const GPUDevice& d,
-    typename TTypes<float>::ConstVec gradients,
+    const GPUDevice& d, typename TTypes<float>::ConstVec gradients,
     typename TTypes<float>::ConstVec inputs,
-    typename TTypes<float>::ConstVec min,
-    typename TTypes<float>::ConstVec max,
+    typename TTypes<float>::ConstVec min, typename TTypes<float>::ConstVec max,
+    int steps,
 #ifndef FAKE_QUANT_NO_DEBUG
     typename TTypes<bool>::Scalar check_min_max,
 #endif
@@ -542,8 +573,8 @@ void FakeQuant2WithMinMaxVarsPerChannelGradientFunctor<GPUDevice>::operator()(
     const GPUDevice& d, const Index batch_size, const Index depth,
     typename TTypes<float>::ConstFlat gradients,
     typename TTypes<float>::ConstFlat inputs,
-    typename TTypes<float>::ConstVec min,
-    typename TTypes<float>::ConstVec max,
+    typename TTypes<float>::ConstVec min, typename TTypes<float>::ConstVec max,
+    int steps,
 #ifndef FAKE_QUANT_NO_DEBUG
     typename TTypes<bool>::Scalar check_min_max,
 #endif
@@ -559,8 +590,8 @@ void FakeQuant4WithMinMaxVarsPerChannelGradientFunctor<GPUDevice>::operator()(
     const Index width, const Index depth,
     typename TTypes<float>::ConstFlat gradients,
     typename TTypes<float>::ConstFlat inputs,
-    typename TTypes<float>::ConstVec min,
-    typename TTypes<float>::ConstVec max,
+    typename TTypes<float>::ConstVec min, typename TTypes<float>::ConstVec max,
+    int steps,
 #ifndef FAKE_QUANT_NO_DEBUG
     typename TTypes<bool>::Scalar check_min_max,
 #endif
diff --git a/tensorflow/core/kernels/fake_quant_ops_functor.h b/tensorflow/core/kernels/fake_quant_ops_functor.h
index 242eddfb799e14e98fa4034f396f4b903cae9f7c..1aefaec691d1e42e1cd6561a5be0bb682590b5e0 100644
--- a/tensorflow/core/kernels/fake_quant_ops_functor.h
+++ b/tensorflow/core/kernels/fake_quant_ops_functor.h
@@ -35,31 +35,27 @@ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float StdRound(float input) {
 
 namespace tensorflow {
 
-static constexpr int kSteps = 255;
-static constexpr float kStepsFloat = static_cast<float>(kSteps);
-
 // Gymnastics with nudged zero point is to ensure that real zero maps to
 // an integer, which is required for e.g. zero-padding in convolutional layers.
 // Returns (nudged_min, nudged_max, nudged_scale).
-EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void Nudge(const float min,
-                                                 const float max,
-                                                 float* nudged_min,
-                                                 float* nudged_max,
-                                                 float* scale) {
-  *scale = (max - min) / (kStepsFloat - 0.0f);
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void Nudge(
+    const float min, const float max, const int steps, float* nudged_min,
+    float* nudged_max, float* scale) {
+  const float steps_float = static_cast<float>(steps);
+  *scale = (max - min) / (steps_float - 0.0f);
   const float zero_point_from_min = 0.0f - min / *scale;
-  const uint8 nudged_zero_point = [zero_point_from_min] {
+  const uint8 nudged_zero_point = [zero_point_from_min, steps, steps_float] {
     if (zero_point_from_min < 0.0f) {
       return static_cast<uint8>(0);
-    } else if (zero_point_from_min > kStepsFloat) {
-      return static_cast<uint8>(kSteps);
-    } else {
-      return static_cast<uint8>(StdRound(zero_point_from_min));
     }
+    if (zero_point_from_min > steps_float) {
+      return static_cast<uint8>(steps);
+    }
+    return static_cast<uint8>(StdRound(zero_point_from_min));
   }();
 
   *nudged_min = (0.0f - nudged_zero_point) * (*scale);
-  *nudged_max = (kStepsFloat - nudged_zero_point) * (*scale);
+  *nudged_max = (steps_float - nudged_zero_point) * (*scale);
 }
 
 template <typename T>
@@ -80,13 +76,13 @@ using Flat = typename tensorflow::TTypes<T>::Flat;
 template <typename Device>
 struct FakeQuantWithMinMaxArgsFunctor {
   void operator()(const Device& d, ConstFlat<float> inputs, const float min,
-                  const float max, Flat<float> outputs) {
+                  const float max, const int steps, Flat<float> outputs) {
     eigen_assert(min <= 0.0f && "min should be <= 0.0");
     eigen_assert(max >= 0.0f && "max should be >= 0.0");
     eigen_assert(min < max && "min should be < max");
 
     float nudged_min, nudged_max, nudged_scale;
-    Nudge(min, max, &nudged_min, &nudged_max, &nudged_scale);
+    Nudge(min, max, steps, &nudged_min, &nudged_max, &nudged_scale);
     const float inv_nudged_scale = 1.0f / nudged_scale;
 
     auto clamped = inputs.cwiseMin(nudged_max).cwiseMax(nudged_min);
@@ -103,13 +99,13 @@ template <typename Device>
 struct FakeQuantWithMinMaxArgsGradientFunctor {
   void operator()(const Device& d, ConstFlat<float> gradients,
                   ConstFlat<float> inputs, const float min, const float max,
-                  Flat<float> backprops) {
+                  const int steps, Flat<float> backprops) {
     eigen_assert(min <= 0.0f && "min should be <= 0.0");
     eigen_assert(max >= 0.0f && "max should be >= 0.0");
     eigen_assert(min < max && "min should be < max");
 
     float nudged_min, nudged_max, nudged_scale;
-    Nudge(min, max, &nudged_min, &nudged_max, &nudged_scale);
+    Nudge(min, max, steps, &nudged_min, &nudged_max, &nudged_scale);
 
     auto between_nudged_min_max =
         (inputs >= nudged_min && inputs <= nudged_max)
@@ -124,6 +120,7 @@ template <typename Device>
 struct FakeQuantWithMinMaxVarsFunctor {
   void operator()(const Device& d, ConstFlat<float> inputs,
                   ConstScalar<float> min, ConstScalar<float> max,
+                  const int steps,
 #ifndef FAKE_QUANT_NO_DEBUG
                   Scalar<bool> check_min_max,
 #endif
@@ -138,7 +135,7 @@ struct FakeQuantWithMinMaxVarsFunctor {
 #endif
 
     float nudged_min, nudged_max, nudged_scale;
-    Nudge(min(), max(), &nudged_min, &nudged_max, &nudged_scale);
+    Nudge(min(), max(), steps, &nudged_min, &nudged_max, &nudged_scale);
     const auto nudged_scale_repl = inputs.constant(nudged_scale);
 
     const auto clamped = inputs.cwiseMin(nudged_max).cwiseMax(nudged_min);
@@ -155,7 +152,7 @@ template <typename Device>
 struct FakeQuantWithMinMaxVarsGradientFunctor {
   void operator()(const Device& d, ConstFlat<float> gradients,
                   ConstFlat<float> inputs, ConstScalar<float> min,
-                  ConstScalar<float> max,
+                  ConstScalar<float> max, const int steps,
 #ifndef FAKE_QUANT_NO_DEBUG
                   Scalar<bool> check_min_max,
 #endif
@@ -172,7 +169,7 @@ struct FakeQuantWithMinMaxVarsGradientFunctor {
 #endif
 
     float nudged_min, nudged_max, nudged_scale;
-    Nudge(min(), max(), &nudged_min, &nudged_max, &nudged_scale);
+    Nudge(min(), max(), steps, &nudged_min, &nudged_max, &nudged_scale);
 
     const auto between_min_max =
         (inputs >= nudged_min && inputs <= nudged_max)
@@ -200,7 +197,7 @@ using Index = typename tensorflow::TTypes<float>::ConstTensor::Index;
 template <typename Device>
 struct FakeQuant1WithMinMaxVarsPerChannelFunctor {
   void operator()(const Device& d, ConstVec<float> inputs, ConstVec<float> min,
-                  ConstVec<float> max,
+                  ConstVec<float> max, const int steps,
 #ifndef FAKE_QUANT_NO_DEBUG
                   Scalar<bool> check_min_max,
 #endif
@@ -216,7 +213,7 @@ struct FakeQuant1WithMinMaxVarsPerChannelFunctor {
 
     for (Index i = 0; i < min.size(); ++i) {
       float nudged_min, nudged_max, nudged_scale;
-      Nudge(min(i), max(i), &nudged_min, &nudged_max, &nudged_scale);
+      Nudge(min(i), max(i), steps, &nudged_min, &nudged_max, &nudged_scale);
       const float clamped =
           std::max(std::min(inputs(i), nudged_max), nudged_min);
       const float clamped_shifted = clamped - nudged_min;
@@ -233,7 +230,7 @@ template <typename Device>
 struct FakeQuant2WithMinMaxVarsPerChannelFunctor {
   void operator()(const Device& d, const Index batch_size, const Index depth,
                   ConstFlat<float> inputs, ConstVec<float> min,
-                  ConstVec<float> max,
+                  ConstVec<float> max, const int steps,
 #ifndef FAKE_QUANT_NO_DEBUG
                   Scalar<bool> check_min_max,
 #endif
@@ -251,7 +248,7 @@ struct FakeQuant2WithMinMaxVarsPerChannelFunctor {
     const auto inputs_restored = inputs.reshape(restored);
     for (Index i = 0; i < min.size(); ++i) {
       float nudged_min, nudged_max, nudged_scale;
-      Nudge(min(i), max(i), &nudged_min, &nudged_max, &nudged_scale);
+      Nudge(min(i), max(i), steps, &nudged_min, &nudged_max, &nudged_scale);
       const auto clamped =
           inputs_restored.chip<1>(i).cwiseMin(nudged_max).cwiseMax(nudged_min);
       const auto clamped_shifted = clamped - nudged_min;
@@ -269,7 +266,7 @@ template <typename Device>
 struct FakeQuant4WithMinMaxVarsPerChannelFunctor {
   void operator()(const Device& d, const Index batch_size, const Index height,
                   const Index width, const Index depth, ConstFlat<float> inputs,
-                  ConstVec<float> min, ConstVec<float> max,
+                  ConstVec<float> min, ConstVec<float> max, const int steps,
 #ifndef FAKE_QUANT_NO_DEBUG
                   Scalar<bool> check_min_max,
 #endif
@@ -287,7 +284,7 @@ struct FakeQuant4WithMinMaxVarsPerChannelFunctor {
     const auto inputs_restored = inputs.reshape(restored);
     for (Index i = 0; i < min.size(); ++i) {
       float nudged_min, nudged_max, nudged_scale;
-      Nudge(min(i), max(i), &nudged_min, &nudged_max, &nudged_scale);
+      Nudge(min(i), max(i), steps, &nudged_min, &nudged_max, &nudged_scale);
       const auto clamped =
           inputs_restored.chip<3>(i).cwiseMin(nudged_max).cwiseMax(nudged_min);
       const auto clamped_shifted = clamped - nudged_min;
@@ -308,7 +305,7 @@ template <typename Device>
 struct FakeQuant1WithMinMaxVarsPerChannelGradientFunctor {
   void operator()(const Device& d, ConstVec<float> gradients,
                   ConstVec<float> inputs, ConstVec<float> min,
-                  ConstVec<float> max,
+                  ConstVec<float> max, const int steps,
 #ifndef FAKE_QUANT_NO_DEBUG
                   Scalar<bool> check_min_max,
 #endif
@@ -325,7 +322,7 @@ struct FakeQuant1WithMinMaxVarsPerChannelGradientFunctor {
 
     for (Index i = 0; i < min.size(); ++i) {
       float nudged_min, nudged_max, nudged_scale;
-      Nudge(min(i), max(i), &nudged_min, &nudged_max, &nudged_scale);
+      Nudge(min(i), max(i), steps, &nudged_min, &nudged_max, &nudged_scale);
 
       const bool between_min_max =
           inputs(i) >= nudged_min && inputs(i) <= nudged_max;
@@ -346,7 +343,7 @@ template <typename Device>
 struct FakeQuant2WithMinMaxVarsPerChannelGradientFunctor {
   void operator()(const Device& d, const Index batch_size, const Index depth,
                   ConstFlat<float> gradients, ConstFlat<float> inputs,
-                  ConstVec<float> min, ConstVec<float> max,
+                  ConstVec<float> min, ConstVec<float> max, const int steps,
 #ifndef FAKE_QUANT_NO_DEBUG
                   Scalar<bool> check_min_max,
 #endif
@@ -366,7 +363,7 @@ struct FakeQuant2WithMinMaxVarsPerChannelGradientFunctor {
     const auto inputs_restored = inputs.reshape(restored);
     for (Index i = 0; i < min.size(); ++i) {
       float nudged_min, nudged_max, nudged_scale;
-      Nudge(min(i), max(i), &nudged_min, &nudged_max, &nudged_scale);
+      Nudge(min(i), max(i), steps, &nudged_min, &nudged_max, &nudged_scale);
       const auto gradients_chip = gradients_restored.chip<1>(i);
       const auto inputs_chip = inputs_restored.chip<1>(i);
 
@@ -399,7 +396,7 @@ struct FakeQuant4WithMinMaxVarsPerChannelGradientFunctor {
   void operator()(const Device& d, const Index batch_size, const Index height,
                   const Index width, const Index depth,
                   ConstFlat<float> gradients, ConstFlat<float> inputs,
-                  ConstVec<float> min, ConstVec<float> max,
+                  ConstVec<float> min, ConstVec<float> max, const int steps,
 #ifndef FAKE_QUANT_NO_DEBUG
                   Scalar<bool> check_min_max,
 #endif
@@ -419,7 +416,7 @@ struct FakeQuant4WithMinMaxVarsPerChannelGradientFunctor {
     const auto inputs_restored = inputs.reshape(restored);
     for (Index i = 0; i < min.size(); ++i) {
       float nudged_min, nudged_max, nudged_scale;
-      Nudge(min(i), max(i), &nudged_min, &nudged_max, &nudged_scale);
+      Nudge(min(i), max(i), steps, &nudged_min, &nudged_max, &nudged_scale);
       const auto gradients_chip = gradients_restored.chip<3>(i);
       const auto inputs_chip = inputs_restored.chip<3>(i);
 
diff --git a/tensorflow/core/kernels/fake_quant_ops_test.cc b/tensorflow/core/kernels/fake_quant_ops_test.cc
index 38ad345f0d3b346bf5c33d52d999051738df9401..2be92269655dd149e485ddd14fe7b092a94d776d 100644
--- a/tensorflow/core/kernels/fake_quant_ops_test.cc
+++ b/tensorflow/core/kernels/fake_quant_ops_test.cc
@@ -48,30 +48,94 @@ class QuantOpsTest : public OpsTestBase {
       inputs_.push_back({nullptr, input});
     }
   }
+
+  void RunTestFakeQuantWithMinMaxArgs(const int num_bits, const float min,
+                                      const float max, const TensorShape& shape,
+                                      const gtl::ArraySlice<float>& data,
+                                      gtl::ArraySlice<float> expected_data) {
+    TF_EXPECT_OK(NodeDefBuilder("op", "FakeQuantWithMinMaxArgs")
+                     .Input(FakeInput(DT_FLOAT))  // inputs
+                     .Attr("min", min)
+                     .Attr("max", max)
+                     .Attr("num_bits", num_bits)
+                     .Finalize(node_def()));
+    TF_EXPECT_OK(InitOp());
+    // Downstream inputs.
+    AddInputFromArray<float>(shape, data);
+
+    // Tested code.
+    TF_ASSERT_OK(RunOpKernel());
+
+    Tensor* output = GetOutput(0);
+    Tensor expected(allocator(), DT_FLOAT, shape);
+    FillValues<float>(&expected, expected_data);
+    ExpectClose(expected, *output);
+  }
+
+  void RunTestFakeQuantWithMinMaxVars(const int num_bits, const float min,
+                                      const float max, const TensorShape& shape,
+                                      const gtl::ArraySlice<float>& data,
+                                      gtl::ArraySlice<float> expected_data) {
+    TF_EXPECT_OK(NodeDefBuilder("op", "FakeQuantWithMinMaxVars")
+                     .Input(FakeInput(DT_FLOAT))  // inputs
+                     .Input(FakeInput(DT_FLOAT))  // min
+                     .Input(FakeInput(DT_FLOAT))  // max
+                     .Attr("num_bits", num_bits)
+                     .Finalize(node_def()));
+    TF_EXPECT_OK(InitOp());
+    // Downstream inputs.
+    AddInputFromArray<float>(shape, data);
+    // Min.
+    AddInputFromArray<float>(TensorShape({}), {min});
+    // Max.
+    AddInputFromArray<float>(TensorShape({}), {max});
+
+    // Tested code.
+    TF_ASSERT_OK(RunOpKernel());
+
+    Tensor* output = GetOutput(0);
+    Tensor expected(allocator(), DT_FLOAT, TensorShape({2, 3}));
+    FillValues<float>(&expected, expected_data);
+    ExpectClose(expected, *output);
+  }
+
+  void RunTestFakeQuantWithMinMaxVarsPerChannel(
+      const int num_bits, const TensorShape& minmax_shape,
+      const gtl::ArraySlice<float>& min, const gtl::ArraySlice<float>& max,
+      const TensorShape& shape, const gtl::ArraySlice<float>& data,
+      gtl::ArraySlice<float> expected_data) {
+    TF_EXPECT_OK(NodeDefBuilder("op", "FakeQuantWithMinMaxVarsPerChannel")
+                     .Input(FakeInput(DT_FLOAT))  // inputs
+                     .Input(FakeInput(DT_FLOAT))  // min
+                     .Input(FakeInput(DT_FLOAT))  // max
+                     .Attr("num_bits", num_bits)
+                     .Finalize(node_def()));
+    TF_EXPECT_OK(InitOp());
+    // Downstream inputs.
+    AddInputFromArray<float>(shape, data);
+    // Min.
+    AddInputFromArray<float>(minmax_shape, min);
+    // Max.
+    AddInputFromArray<float>(minmax_shape, max);
+
+    // Tested code.
+    TF_ASSERT_OK(RunOpKernel());
+
+    Tensor* output = GetOutput(0);
+    Tensor expected(allocator(), DT_FLOAT, shape);
+    FillValues<float>(&expected, expected_data);
+    ExpectClose(expected, *output);
+  }
 };
 
 TEST_F(QuantOpsTest, WithArgsNoNudging) {
   // Original quantization range: [-10 + 0 / 4, -10 + 255 / 4], scale: 1/4.
   // Original zero point: 40, no nudging necessary.
-  // Expected quantized values: -10.0, -10.25, ..., 53.75.
-  TF_EXPECT_OK(NodeDefBuilder("op", "FakeQuantWithMinMaxArgs")
-                   .Input(FakeInput(DT_FLOAT))  // inputs
-                   .Attr("min", -10.0f)
-                   .Attr("max", 53.75f)
-                   .Finalize(node_def()));
-  TF_EXPECT_OK(InitOp());
-  // Downstream inputs.
-  AddInputFromArray<float>(TensorShape({2, 3}),
-                           {-10.1f, -10.0f, -9.9f, -9.75f, 53.75f, 53.8f});
-
-  // Tested code.
-  TF_ASSERT_OK(RunOpKernel());
-
-  Tensor* output = GetOutput(0);
-  Tensor expected(allocator(), DT_FLOAT, TensorShape({2, 3}));
-  FillValues<float>(&expected,
-                    {-10.0f, -10.0f, -10.0f, -9.75f, 53.75f, 53.75f});
-  ExpectClose(expected, *output);
+  // Expected quantized values: -10.0, -9.75, ..., 53.75.
+  RunTestFakeQuantWithMinMaxArgs(
+      8, -10.0f, 53.75f, TensorShape({2, 3}),
+      {-10.1f, -10.0f, -9.9f, -9.75f, 53.75f, 53.8f},
+      {-10.0f, -10.0f, -10.0f, -9.75f, 53.75f, 53.75f});
 }
 
 TEST_F(QuantOpsTest, WithArgsNudgedZeroIs0) {
@@ -79,23 +143,9 @@ TEST_F(QuantOpsTest, WithArgsNudgedZeroIs0) {
   // Scale: 1/4,  original zero point: 0.4, nudged to 0.
   // Nudged range: [0.0; 63.75].
   // Expected quantized values: 0.0, 0.25, 0.5, ..., 63.75.
-  TF_EXPECT_OK(NodeDefBuilder("op", "FakeQuantWithMinMaxArgs")
-                   .Input(FakeInput(DT_FLOAT))  // inputs
-                   .Attr("min", -0.1f)
-                   .Attr("max", 63.65f)
-                   .Finalize(node_def()));
-  TF_EXPECT_OK(InitOp());
-  // Downstream inputs.
-  AddInputFromArray<float>(TensorShape({2, 3}),
-                           {-0.1f, 0.0f, 0.1f, 0.25f, 63.75f, 63.8f});
-
-  // Tested code.
-  TF_ASSERT_OK(RunOpKernel());
-
-  Tensor* output = GetOutput(0);
-  Tensor expected(allocator(), DT_FLOAT, TensorShape({2, 3}));
-  FillValues<float>(&expected, {0.0f, 0.0f, 0.0f, 0.25f, 63.75f, 63.75f});
-  ExpectClose(expected, *output);
+  RunTestFakeQuantWithMinMaxArgs(8, -0.1f, 63.65f, TensorShape({2, 3}),
+                                 {-0.1f, 0.0f, 0.1f, 0.25f, 63.75f, 63.8f},
+                                 {0.0f, 0.0f, 0.0f, 0.25f, 63.75f, 63.75f});
 }
 
 TEST_F(QuantOpsTest, WithArgsNudgedZeroIs1) {
@@ -103,23 +153,9 @@ TEST_F(QuantOpsTest, WithArgsNudgedZeroIs1) {
   // Scale: 1/4,  original zero point: 0.5, nudged to 1.
   // Nudged range: [-0.25; 63.5].
   // Expected quantized values: -0.25, 0.0, 0.25, ..., 63.5.
-  TF_EXPECT_OK(NodeDefBuilder("op", "FakeQuantWithMinMaxArgs")
-                   .Input(FakeInput(DT_FLOAT))  // inputs
-                   .Attr("min", -0.125f)
-                   .Attr("max", 63.625f)
-                   .Finalize(node_def()));
-  TF_EXPECT_OK(InitOp());
-  // Downstream inputs.
-  AddInputFromArray<float>(TensorShape({2, 3}),
-                           {-0.26f, -0.25f, -0.24f, 0.0f, 63.5f, 63.6f});
-
-  // Tested code.
-  TF_ASSERT_OK(RunOpKernel());
-
-  Tensor* output = GetOutput(0);
-  Tensor expected(allocator(), DT_FLOAT, TensorShape({2, 3}));
-  FillValues<float>(&expected, {-0.25f, -0.25f, -0.25f, 0.0f, 63.5f, 63.5f});
-  ExpectClose(expected, *output);
+  RunTestFakeQuantWithMinMaxArgs(8, -0.125f, 63.625f, TensorShape({2, 3}),
+                                 {-0.26f, -0.25f, -0.24f, 0.0f, 63.5f, 63.6f},
+                                 {-0.25f, -0.25f, -0.25f, 0.0f, 63.5f, 63.5f});
 }
 
 TEST_F(QuantOpsTest, WithArgsNudgedZeroIs255) {
@@ -127,23 +163,78 @@ TEST_F(QuantOpsTest, WithArgsNudgedZeroIs255) {
   // Scale: 1/4,  original zero point: 254.6, nudged to 255.
   // Nudged range: [-63.75; 0.0].
   // Expected quantized values: -63.75, -63.5, -63.25, ..., 0.0.
-  TF_EXPECT_OK(NodeDefBuilder("op", "FakeQuantWithMinMaxArgs")
-                   .Input(FakeInput(DT_FLOAT))  // inputs
-                   .Attr("min", -63.65f)
-                   .Attr("max", 0.1f)
-                   .Finalize(node_def()));
-  TF_EXPECT_OK(InitOp());
-  // Downstream inputs.
-  AddInputFromArray<float>(TensorShape({2, 3}),
-                           {-63.8f, -63.75f, -63.7f, -63.5f, 0.0f, 0.1f});
+  RunTestFakeQuantWithMinMaxArgs(
+      8, -63.65f, 0.1f, TensorShape({2, 3}),
+      {-63.8f, -63.75f, -63.7f, -63.5f, 0.0f, 0.1f},
+      {-63.75f, -63.75f, -63.75f, -63.5f, 0.0f, 0.0f});
+}
 
-  // Tested code.
-  TF_ASSERT_OK(RunOpKernel());
+TEST_F(QuantOpsTest, WithArgsNoNudging_4Bits) {
+  // Original quantization range: [-6 + 0 / 2, -6 + 15 / 2], scale: 1/2.
+  // Original zero point: 12, no nudging necessary.
+  // Expected quantized values: -6, -5.5, ..., 1.5.
+  RunTestFakeQuantWithMinMaxArgs(4, -6.0f, 1.5f, TensorShape({2, 3}),
+                                 {-6.1f, -6.0f, -5.9f, -5.5f, 1.5f, 1.6f},
+                                 {-6.0f, -6.0f, -6.0f, -5.5f, 1.5f, 1.5f});
+}
 
-  Tensor* output = GetOutput(0);
-  Tensor expected(allocator(), DT_FLOAT, TensorShape({2, 3}));
-  FillValues<float>(&expected, {-63.75f, -63.75f, -63.75f, -63.5f, 0.0f, 0.0f});
-  ExpectClose(expected, *output);
+TEST_F(QuantOpsTest, WithArgsNudgedZeroIs0_4Bits) {
+  // Original quantization range: [-0.2 / 2 + 0 / 2, -0.2 / 2 + 15 / 2].
+  // Scale: 1/2,  original zero point: 0.2, nudged to 0.
+  // Nudged range: [0.0; 7.5].
+  // Expected quantized values: 0.0, 0.5, ..., 7.5.
+  RunTestFakeQuantWithMinMaxArgs(4, -0.1f, 7.4f, TensorShape({2, 3}),
+                                 {-0.1f, 0.0f, 0.1f, 0.5f, 7.5f, 7.7f},
+                                 {0.0f, 0.0f, 0.0f, 0.5f, 7.5f, 7.5f});
+}
+
+TEST_F(QuantOpsTest, WithArgsNudgedZeroIs1_4Bits) {
+  // Original quantization range: [-0.8 / 2 + 0 / 2, -0.8 / 2 + 15 / 2].
+  // Scale: 1/2,  original zero point: 0.8, nudged to 1.
+  // Nudged range: [-0.5; 7.0].
+  // Expected quantized values: -0.5, 0.0, 0.5, ..., 7.0.
+  RunTestFakeQuantWithMinMaxArgs(4, -0.4f, 7.1f, TensorShape({2, 3}),
+                                 {-0.51f, -0.5f, -0.24f, 0.0f, 7.0f, 7.1f},
+                                 {-0.5f, -0.5f, 0.0f, 0.0f, 7.0f, 7.0f});
+}
+
+TEST_F(QuantOpsTest, WithArgsNudgedZeroIs15_4Bits) {
+  // Original quantization range: [0.4 / 2 - 15 / 2, 0.4 / 2 + 0 / 2].
+  // Scale: 1/2,  original zero point: 14.6, nudged to 15.
+  // Nudged range: [-7.5; 0.0].
+  // Expected quantized values: -7.5, -7.0, ..., 0.0.
+  RunTestFakeQuantWithMinMaxArgs(4, -7.3f, 0.2f, TensorShape({2, 3}),
+                                 {-7.6f, -7.5f, -7.4f, -7.2f, 0.0f, 0.1f},
+                                 {-7.5f, -7.5f, -7.5f, -7.0f, 0.0f, 0.0f});
+}
+
+TEST_F(QuantOpsTest, WithArgsNoNudging_2Bits) {
+  // Original quantization range: [-1 + 0 / 2, -1 + 3 / 2], scale: 1/2.
+  // Original zero point: 2, no nudging necessary.
+  // Expected quantized values: -1.0, -0.5, 0.0, 0.5.
+  RunTestFakeQuantWithMinMaxArgs(2, -1.0f, 0.5f, TensorShape({2, 3}),
+                                 {-1.1f, -1.0f, -0.9f, -0.3f, 0.1f, 1.0f},
+                                 {-1.0f, -1.0f, -1.0f, -0.5f, 0.0f, 0.5f});
+}
+
+TEST_F(QuantOpsTest, WithArgsNudgedZeroIs0_2Bits) {
+  // Original quantization range: [-0.2 / 2 + 0 / 2, -0.2 / 2 + 3 / 2].
+  // Scale: 1/2,  original zero point: 0.2, nudged to 0.
+  // Nudged range: [0.0; 1.5].
+  // Expected quantized values: 0.0, 0.5, 1.0, 1.5.
+  RunTestFakeQuantWithMinMaxArgs(2, -0.1f, 1.4f, TensorShape({2, 3}),
+                                 {-0.2f, 0.1f, 0.7f, 1.0f, 1.3f, 1.6f},
+                                 {0.0f, 0.0f, 0.5f, 1.0f, 1.5f, 1.5f});
+}
+
+TEST_F(QuantOpsTest, WithArgsNudgedZeroIs1_2Bits) {
+  // Original quantization range: [-0.8 / 2 + 0 / 2, -0.8 / 2 + 3 / 2].
+  // Scale: 1/2,  original zero point: 0.8, nudged to 1.
+  // Nudged range: [-0.5; 1.0].
+  // Expected quantized values: -0.5, 0.0, 0.5, 1.0.
+  RunTestFakeQuantWithMinMaxArgs(2, -0.4f, 1.1f, TensorShape({2, 3}),
+                                 {-0.51f, -0.5f, -0.24f, 0.0f, 1.0f, 1.1f},
+                                 {-0.5f, -0.5f, 0.0f, 0.0f, 1.0f, 1.0f});
 }
 
 TEST_F(QuantOpsTest, WithArgsGradient) {
@@ -176,74 +267,130 @@ TEST_F(QuantOpsTest, WithArgsGradient) {
   ExpectClose(expected, *output);
 }
 
-TEST_F(QuantOpsTest, WithVarsNoNudging) {
-  // Original quantization range: [-10 + 0 / 4, -10 + 255 / 4], scale: 1/4.
-  // Original zero point: 40, no nudging necessary.
-  // Expected quantized values: -10.0, -10.25, ..., 53.75.
-  TF_EXPECT_OK(NodeDefBuilder("op", "FakeQuantWithMinMaxVars")
+TEST_F(QuantOpsTest, WithArgsGradient_4Bits) {
+  // Original quantization range: [-0.8 / 2 + 0 / 2, -0.8 / 2 + 15 / 2].
+  // Scale: 1/2,  original zero point: 0.8, nudged to 1.
+  // Nudged range: [-0.5; 7.0].
+  // Expected quantized values: -0.5, 0.0, 0.5, ..., 7.0.
+  TF_EXPECT_OK(NodeDefBuilder("op", "FakeQuantWithMinMaxArgsGradient")
+                   .Input(FakeInput(DT_FLOAT))  // gradient
                    .Input(FakeInput(DT_FLOAT))  // inputs
-                   .Input(FakeInput(DT_FLOAT))  // min
-                   .Input(FakeInput(DT_FLOAT))  // max
+                   .Attr("min", -0.4f)
+                   .Attr("max", 7.1f)
+                   .Attr("num_bits", 4)
                    .Finalize(node_def()));
   TF_EXPECT_OK(InitOp());
+  // Upstream gradients.
+  AddRandomInput(TensorShape({2, 3}));
   // Downstream inputs.
   AddInputFromArray<float>(TensorShape({2, 3}),
-                           {-10.1f, -10.0f, -9.9f, -9.75f, 53.75f, 53.8f});
-  // Min.
-  AddInputFromArray<float>(TensorShape({}), {-10.0f});
-  // Max.
-  AddInputFromArray<float>(TensorShape({}), {53.75f});
+                           {-0.6f, -0.5f, -0.4f, 0.0f, 7.0f, 7.1f});
 
   // Tested code.
   TF_ASSERT_OK(RunOpKernel());
 
   Tensor* output = GetOutput(0);
+  auto input_flat = GetInput(0).flat<float>();
   Tensor expected(allocator(), DT_FLOAT, TensorShape({2, 3}));
-  FillValues<float>(&expected,
-                    {-10.0f, -10.0f, -10.0f, -9.75f, 53.75f, 53.75f});
+  FillValues<float>(&expected, {0.0f, input_flat(1), input_flat(2),
+                                input_flat(3), input_flat(4), 0.0f});
   ExpectClose(expected, *output);
 }
 
+TEST_F(QuantOpsTest, WithVarsNoNudging) {
+  // Original quantization range: [-10 + 0 / 4, -10 + 255 / 4], scale: 1/4.
+  // Original zero point: 40, no nudging necessary.
+  // Expected quantized values: -10.0, -10.25, ..., 53.75.
+  RunTestFakeQuantWithMinMaxVars(
+      8, -10.0f, 53.75f, TensorShape({2, 3}),
+      {-10.1f, -10.0f, -9.9f, -9.75f, 53.75f, 53.8f},
+      {-10.0f, -10.0f, -10.0f, -9.75f, 53.75f, 53.75f});
+}
+
 TEST_F(QuantOpsTest, WithVarsNudgedZeroIs0) {
   // Original quantization range: [-0.4 / 4 + 0 / 4, -0.4 / 4 + 255 / 4].
   // Scale: 1/4,  original zero point: 0.4, nudged to 0.
   // Nudged range: [0.0; 63.75].
   // Expected quantized values: 0.0, 0.25, 0.5, ..., 63.75.
-  TF_EXPECT_OK(NodeDefBuilder("op", "FakeQuantWithMinMaxVars")
-                   .Input(FakeInput(DT_FLOAT))  // inputs
-                   .Input(FakeInput(DT_FLOAT))  // min
-                   .Input(FakeInput(DT_FLOAT))  // max
-                   .Finalize(node_def()));
-  TF_EXPECT_OK(InitOp());
-  // Downstream inputs.
-  AddInputFromArray<float>(TensorShape({2, 3}),
-                           {-0.1f, 0.0f, 0.1f, 0.25f, 63.75f, 63.8f});
-  // Min.
-  AddInputFromArray<float>(TensorShape({}), {-0.1f});
-  // Max.
-  AddInputFromArray<float>(TensorShape({}), {63.65f});
+  RunTestFakeQuantWithMinMaxVars(8, -0.1f, 63.65f, TensorShape({2, 3}),
+                                 {-0.1f, 0.0f, 0.1f, 0.25f, 63.75f, 63.8f},
+                                 {0.0f, 0.0f, 0.0f, 0.25f, 63.75f, 63.75f});
+}
 
-  // Tested code.
-  TF_ASSERT_OK(RunOpKernel());
+TEST_F(QuantOpsTest, WithVarsNudgedZeroIs1) {
+  // Original quantization range: [-0.5 / 4 + 0 / 4, -0.5 / 4 + 255 / 4].
+  // Scale: 1/4,  original zero point: 0.5, nudged to 1.
+  // Nudged range: [-0.25; 63.5].
+  // Expected quantized values: -0.25, 0.0, 0.25, ..., 63.5.
+  RunTestFakeQuantWithMinMaxVars(8, -0.125f, 63.625f, TensorShape({2, 3}),
+                                 {-0.26f, -0.25f, -0.24f, 0.0f, 63.5f, 63.6f},
+                                 {-0.25f, -0.25f, -0.25f, 0.0f, 63.5f, 63.5f});
+}
 
-  Tensor* output = GetOutput(0);
-  Tensor expected(allocator(), DT_FLOAT, TensorShape({2, 3}));
-  FillValues<float>(&expected,
-                    {0.0f, 0.0f, 0.0f, 0.25f, 63.75f, 63.75f});
-  ExpectClose(expected, *output);
+TEST_F(QuantOpsTest, WithVarsNudgedZeroIs255) {
+  // Original quantization range: [0.4 / 4 - 255 / 4, 0.4 / 4 + 0 / 4].
+  // Scale: 1/4,  original zero point: 254.6, nudged to 255.
+  // Nudged range: [-63.75; 0.0].
+  // Expected quantized values: -63.75, -63.5, -63.25, ..., 0.0.
+  RunTestFakeQuantWithMinMaxVars(
+      8, -63.65f, 0.1f, TensorShape({2, 3}),
+      {-63.8f, -63.75f, -63.7f, -63.5f, 0.0f, 0.1f},
+      {-63.75f, -63.75f, -63.75f, -63.5f, 0.0f, 0.0f});
 }
 
-TEST_F(QuantOpsTest, WithVarsNudgedZeroIs1) {
+TEST_F(QuantOpsTest, WithVarsNoNudging_4Bits) {
+  // Original quantization range: [-6 + 0 / 2, -6 + 15 / 2], scale: 1/2.
+  // Original zero point: 12, no nudging necessary.
+  // Expected quantized values: -6, -5.5, ..., 1.5.
+  RunTestFakeQuantWithMinMaxVars(4, -6.0f, 1.5f, TensorShape({2, 3}),
+                                 {-6.1f, -6.0f, -5.9f, -5.5f, 1.5f, 1.6f},
+                                 {-6.0f, -6.0f, -6.0f, -5.5f, 1.5f, 1.5f});
+}
+
+TEST_F(QuantOpsTest, WithVarsNudgedZeroIs0_4Bits) {
+  // Original quantization range: [-0.2 / 2 + 0 / 2, -0.2 / 2 + 15 / 2].
+  // Scale: 1/2,  original zero point: 0.2, nudged to 0.
+  // Nudged range: [0.0; 7.5].
+  // Expected quantized values: 0.0, 0.5, ..., 7.5.
+  RunTestFakeQuantWithMinMaxVars(4, -0.1f, 7.4f, TensorShape({2, 3}),
+                                 {-0.1f, 0.0f, 0.1f, 0.5f, 7.5f, 7.7f},
+                                 {0.0f, 0.0f, 0.0f, 0.5f, 7.5f, 7.5f});
+}
+
+TEST_F(QuantOpsTest, WithVarsNudgedZeroIs1_4Bits) {
+  // Original quantization range: [-0.8 / 2 + 0 / 2, -0.8 / 2 + 15 / 2].
+  // Scale: 1/2,  original zero point: 0.8, nudged to 1.
+  // Nudged range: [-0.5; 7.0].
+  // Expected quantized values: -0.5, 0.0, 0.5, ..., 7.0.
+  RunTestFakeQuantWithMinMaxVars(4, -0.4f, 7.1f, TensorShape({2, 3}),
+                                 {-0.51f, -0.5f, -0.24f, 0.0f, 7.0f, 7.1f},
+                                 {-0.5f, -0.5f, 0.0f, 0.0f, 7.0f, 7.0f});
+}
+
+TEST_F(QuantOpsTest, WithVarsNudgedZeroIs15_4Bits) {
+  // Original quantization range: [0.4 / 2 - 15 / 2, 0.4 / 2 + 0 / 2].
+  // Scale: 1/2,  original zero point: 14.6, nudged to 15.
+  // Nudged range: [-7.5; 0.0].
+  // Expected quantized values: -7.5, -7.0, ..., 0.0.
+  RunTestFakeQuantWithMinMaxVars(4, -7.3f, 0.2f, TensorShape({2, 3}),
+                                 {-7.6f, -7.5f, -7.4f, -7.2f, 0.0f, 0.1f},
+                                 {-7.5f, -7.5f, -7.5f, -7.0f, 0.0f, 0.0f});
+}
+
+TEST_F(QuantOpsTest, WithVarsGradient) {
   // Original quantization range: [-0.5 / 4 + 0 / 4, -0.5 / 4 + 255 / 4].
   // Scale: 1/4,  original zero point: 0.5, nudged to 1.
   // Nudged range: [-0.25; 63.5].
   // Expected quantized values: -0.25, 0.0, 0.25, ..., 63.5.
-  TF_EXPECT_OK(NodeDefBuilder("op", "FakeQuantWithMinMaxVars")
+  TF_EXPECT_OK(NodeDefBuilder("op", "FakeQuantWithMinMaxVarsGradient")
+                   .Input(FakeInput(DT_FLOAT))  // gradients
                    .Input(FakeInput(DT_FLOAT))  // inputs
                    .Input(FakeInput(DT_FLOAT))  // min
                    .Input(FakeInput(DT_FLOAT))  // max
                    .Finalize(node_def()));
   TF_EXPECT_OK(InitOp());
+  // Upstream gradients.
+  AddRandomInput(TensorShape({2, 3}));
   // Downstream inputs.
   AddInputFromArray<float>(TensorShape({2, 3}),
                            {-0.26f, -0.25f, -0.24f, 0.0f, 63.5f, 63.6f});
@@ -255,34 +402,46 @@ TEST_F(QuantOpsTest, WithVarsNudgedZeroIs1) {
   // Tested code.
   TF_ASSERT_OK(RunOpKernel());
 
-  Tensor* output = GetOutput(0);
-  Tensor expected(allocator(), DT_FLOAT, TensorShape({2, 3}));
-  FillValues<float>(&expected,
-                    {-0.25f, -0.25f, -0.25f, 0.0f, 63.5f, 63.5f});
-  ExpectClose(expected, *output);
+  Tensor* output_bprop_wrt_input = GetOutput(0);
+  Tensor expected_bprop_wrt_input(allocator(), DT_FLOAT, TensorShape({2, 3}));
+  auto in_flat = GetInput(0).flat<float>();
+  FillValues<float>(&expected_bprop_wrt_input, {0.0f, in_flat(1), in_flat(2),
+                                                in_flat(3), in_flat(4), 0.0f});
+  ExpectClose(expected_bprop_wrt_input, *output_bprop_wrt_input);
+
+  Tensor* output_bprop_wrt_min = GetOutput(1);
+  Tensor expected_bprop_wrt_min(allocator(), DT_FLOAT, TensorShape({}));
+  expected_bprop_wrt_min.flat<float>()(0) = in_flat(0);
+  ExpectClose(expected_bprop_wrt_min, *output_bprop_wrt_min);
+
+  Tensor* output_bprop_wrt_max = GetOutput(2);
+  Tensor expected_bprop_wrt_max(allocator(), DT_FLOAT, TensorShape({}));
+  expected_bprop_wrt_max.flat<float>()(0) = in_flat(5);
+  ExpectClose(expected_bprop_wrt_max, *output_bprop_wrt_max);
 }
 
-TEST_F(QuantOpsTest, WithVarsGradient) {
-  // Original quantization range: [-0.5 / 4 + 0 / 4, -0.5 / 4 + 255 / 4].
-  // Scale: 1/4,  original zero point: 0.5, nudged to 1.
-  // Nudged range: [-0.25; 63.5].
-  // Expected quantized values: -0.25, 0.0, 0.25, ..., 63.5.
+TEST_F(QuantOpsTest, WithVarsGradient_4Bits) {
+  // Original quantization range: [-0.8 / 2 + 0 / 2, -0.8 / 2 + 15 / 2].
+  // Scale: 1/2,  original zero point: 0.8, nudged to 1.
+  // Nudged range: [-0.5; 7.0].
+  // Expected quantized values: -0.5, 0.0, 0.5, ..., 7.0.
   TF_EXPECT_OK(NodeDefBuilder("op", "FakeQuantWithMinMaxVarsGradient")
                    .Input(FakeInput(DT_FLOAT))  // gradients
                    .Input(FakeInput(DT_FLOAT))  // inputs
                    .Input(FakeInput(DT_FLOAT))  // min
                    .Input(FakeInput(DT_FLOAT))  // max
+                   .Attr("num_bits", 4)
                    .Finalize(node_def()));
   TF_EXPECT_OK(InitOp());
   // Upstream gradients.
   AddRandomInput(TensorShape({2, 3}));
   // Downstream inputs.
   AddInputFromArray<float>(TensorShape({2, 3}),
-                           {-0.26f, -0.25f, -0.24f, 0.0f, 63.5f, 63.6f});
+                           {-0.6f, -0.5f, -0.4f, 0.0f, 7.0f, 7.1f});
   // Min.
-  AddInputFromArray<float>(TensorShape({}), {-0.125f});
+  AddInputFromArray<float>(TensorShape({}), {-0.4f});
   // Max.
-  AddInputFromArray<float>(TensorShape({}), {63.625f});
+  AddInputFromArray<float>(TensorShape({}), {7.1f});
 
   // Tested code.
   TF_ASSERT_OK(RunOpKernel());
@@ -290,10 +449,8 @@ TEST_F(QuantOpsTest, WithVarsGradient) {
   Tensor* output_bprop_wrt_input = GetOutput(0);
   Tensor expected_bprop_wrt_input(allocator(), DT_FLOAT, TensorShape({2, 3}));
   auto in_flat = GetInput(0).flat<float>();
-  FillValues<float>(&expected_bprop_wrt_input,
-                    {0.0f, in_flat(1),
-                     in_flat(2), in_flat(3),
-                     in_flat(4), 0.0f});
+  FillValues<float>(&expected_bprop_wrt_input, {0.0f, in_flat(1), in_flat(2),
+                                                in_flat(3), in_flat(4), 0.0f});
   ExpectClose(expected_bprop_wrt_input, *output_bprop_wrt_input);
 
   Tensor* output_bprop_wrt_min = GetOutput(1);
@@ -312,12 +469,182 @@ TEST_F(QuantOpsTest, WithVarsPerChannelDim1NudgedZeroIs0) {
   // Scale: 1/4,  original zero point: 0.4, nudged to 0.
   // Nudged ranges: [0.0; 63.75].
   // Expected quantized values: 0.0, 0.25, 0.5, ..., 63.75.
-  TF_EXPECT_OK(NodeDefBuilder("op", "FakeQuantWithMinMaxVarsPerChannel")
+  RunTestFakeQuantWithMinMaxVarsPerChannel(
+      8, TensorShape({4}), {-0.1f, -0.1f, -0.1f, -0.1f},
+      {63.65f, 63.65f, 63.65f, 63.65f}, TensorShape({4}),
+      {-0.1f, 0.0f, 63.75f, 63.8f}, {0.0f, 0.0f, 63.75f, 63.75f});
+}
+
+TEST_F(QuantOpsTest, WithVarsPerChannelDim1NudgedZeroIs1) {
+  // Original quantization ranges: [-0.5 / 4 + 0 / 4, -0.5 / 4 + 255 / 4].
+  // Scale: 1/4,  original zero point: 0.5, nudged to 1.
+  // Nudged ranges: [-0.25; 63.5].
+  // Expected quantized values: -0.25, 0.0, 0.25, ..., 63.5.
+  RunTestFakeQuantWithMinMaxVarsPerChannel(
+      8, TensorShape({4}), {-0.125f, -0.125f, -0.125f, -0.125f},
+      {63.625f, 63.625f, 63.625f, 63.625f}, TensorShape({4}),
+      {-0.26f, -0.25f, -0.24f, 63.6f}, {-0.25f, -0.25f, -0.25f, 63.5f});
+}
+
+TEST_F(QuantOpsTest, WithVarsPerChannelDim2NudgedZeroIs0) {
+  // Original quantization ranges: [-0.4 / 4 + 0 / 4, -0.4 / 4 + 255 / 4].
+  // Scale: 1/4,  original zero point: 0.4, nudged to 0.
+  // Nudged ranges: [0.0; 63.75].
+  // Expected quantized values: 0.0, 0.25, 0.5, ..., 63.75.
+  RunTestFakeQuantWithMinMaxVarsPerChannel(
+      8, TensorShape({3}), {-0.1f, -0.1f, -0.1f}, {63.65f, 63.65f, 63.65f},
+      TensorShape({2, 3}), {-0.1f, 0.0f, 0.1f, 0.25f, 63.75f, 63.8f},
+      {0.0f, 0.0f, 0.0f, 0.25f, 63.75f, 63.75f});
+}
+
+TEST_F(QuantOpsTest, WithVarsPerChannelDim2NudgedZeroIs1) {
+  // Original quantization ranges: [-0.5 / 4 + 0 / 4, -0.5 / 4 + 255 / 4].
+  // Scale: 1/4,  original zero point: 0.5, nudged to 1.
+  // Nudged ranges: [-0.25; 63.5].
+  // Expected quantized values: -0.25, 0.0, 0.25, ..., 63.5.
+  RunTestFakeQuantWithMinMaxVarsPerChannel(
+      8, TensorShape({3}), {-0.125f, -0.125f, -0.125f},
+      {63.625f, 63.625f, 63.625f}, TensorShape({2, 3}),
+      {-0.26f, -0.25f, -0.24f, 0.0f, 63.5f, 63.6f},
+      {-0.25f, -0.25f, -0.25f, 0.0f, 63.5f, 63.5f});
+}
+
+TEST_F(QuantOpsTest, WithVarsPerChannelDim4NudgedZeroIs0) {
+  // Original quantization ranges: [-0.4 / 4 + 0 / 4, -0.4 / 4 + 255 / 4].
+  // Scale: 1/4,  original zero point: 0.4, nudged to 0.
+  // Nudged ranges: [0.0; 63.75].
+  // Expected quantized values: 0.0, 0.25, 0.5, ..., 63.75.
+  RunTestFakeQuantWithMinMaxVarsPerChannel(
+      8, TensorShape({4}), {-0.1f, -0.1f, -0.1f, -0.1f},
+      {63.65f, 63.65f, 63.65f, 63.65f}, TensorShape({1, 2, 3, 4}),
+      {-0.1f, 0.0f,   0.1f,   0.25f,  0.5f,   0.75f,
+       1.0f,  1.25f,  1.5f,   1.75f,  2.0f,   2.25f,
+
+       63.0f, 63.25f, 63.5f,  63.7f,  63.75f, 63.8f,
+       63.9f, 100.0f, 100.0f, 100.0f, 100.0f, 1000.0f},
+      {0.0f,   0.0f,   0.0f,   0.25f,  0.5f,   0.75f,
+       1.0f,   1.25f,  1.5f,   1.75f,  2.0f,   2.25f,
+
+       63.0f,  63.25f, 63.5f,  63.75f, 63.75f, 63.75f,
+       63.75f, 63.75f, 63.75f, 63.75f, 63.75f, 63.75f});
+}
+
+TEST_F(QuantOpsTest, WithVarsPerChannelDim4NudgedZeroIs1) {
+  // Original quantization ranges: [-0.5 / 4 + 0 / 4, -0.5 / 4 + 255 / 4].
+  // Scale: 1/4,  original zero point: 0.5, nudged to 1.
+  // Nudged ranges: [-0.25; 63.5].
+  // Expected quantized values: -0.25, 0.0, 0.25, ..., 63.5.
+  RunTestFakeQuantWithMinMaxVarsPerChannel(
+      8, TensorShape({4}), {-0.125f, -0.125f, -0.125f, -0.125f},
+      {63.625f, 63.625f, 63.625f, 63.625f}, TensorShape({1, 2, 3, 4}),
+      {-0.3f,  -0.25f, -0.2f,  0.0f,   0.25f,  0.5f,
+       0.75f,  1.0f,   1.25f,  1.5f,   1.75f,  2.0f,
+
+       63.0f,  63.25f, 63.4f,  63.5f,  63.6f,  63.7f,
+       100.0f, 100.0f, 100.0f, 100.0f, 100.0f, 1000.0f},
+      {-0.25f, -0.25f, -0.25f, 0.0f,  0.25f, 0.5f,
+       0.75f,  1.0f,   1.25f,  1.5f,  1.75f, 2.0f,
+
+       63.0f,  63.25f, 63.5f,  63.5f, 63.5f, 63.5f,
+       63.5f,  63.5f,  63.5f,  63.5f, 63.5f, 63.5f});
+}
+
+TEST_F(QuantOpsTest, WithVarsPerChannelDim1NudgedZeroIs0_4Bits) {
+  // Original quantization range: [-0.2 / 2 + 0 / 2, -0.2 / 2 + 15 / 2].
+  // Scale: 1/2,  original zero point: 0.2, nudged to 0.
+  // Nudged range: [0.0; 7.5].
+  // Expected quantized values: 0.0, 0.5, ..., 7.5.
+  RunTestFakeQuantWithMinMaxVarsPerChannel(
+      4, TensorShape({4}), {-0.1f, -0.1f, -0.1f, -0.1f},
+      {7.4f, 7.4f, 7.4f, 7.4f}, TensorShape({4}), {-0.1f, 0.0f, 7.5f, 7.6f},
+      {0.0f, 0.0f, 7.5f, 7.5f});
+}
+
+TEST_F(QuantOpsTest, WithVarsPerChannelDim1NudgedZeroIs1_4Bits) {
+  // Original quantization range: [-0.8 / 2 + 0 / 2, -0.8 / 2 + 15 / 2].
+  // Scale: 1/2,  original zero point: 0.8, nudged to 1.
+  // Nudged range: [-0.5; 7.0].
+  // Expected quantized values: -0.5, 0.0, 0.5, ..., 7.0.
+  RunTestFakeQuantWithMinMaxVarsPerChannel(
+      4, TensorShape({4}), {-0.4f, -0.4f, -0.4f, -0.4f},
+      {7.1f, 7.1f, 7.1f, 7.1f}, TensorShape({4}), {-0.51f, -0.5f, -0.24f, 7.1f},
+      {-0.5f, -0.5f, -0.0f, 7.0f});
+}
+
+TEST_F(QuantOpsTest, WithVarsPerChannelDim2NudgedZeroIs0_4Bits) {
+  // Original quantization range: [-0.2 / 2 + 0 / 2, -0.2 / 2 + 15 / 2].
+  // Scale: 1/2,  original zero point: 0.2, nudged to 0.
+  // Nudged range: [0.0; 7.5].
+  // Expected quantized values: 0.0, 0.5, ..., 7.5.
+  RunTestFakeQuantWithMinMaxVarsPerChannel(
+      4, TensorShape({3}), {-0.1f, -0.1f, -0.1f}, {7.4f, 7.4f, 7.4f},
+      TensorShape({2, 3}), {-0.1f, 0.0f, 0.1f, 0.5f, 7.5f, 7.6f},
+      {0.0f, 0.0f, 0.0f, 0.5f, 7.5f, 7.5f});
+}
+
+TEST_F(QuantOpsTest, WithVarsPerChannelDim2NudgedZeroIs1_4Bits) {
+  // Original quantization range: [-0.8 / 2 + 0 / 2, -0.8 / 2 + 15 / 2].
+  // Scale: 1/2,  original zero point: 0.8, nudged to 1.
+  // Nudged range: [-0.5; 7.0].
+  // Expected quantized values: -0.5, 0.0, 0.5, ..., 7.0.
+  RunTestFakeQuantWithMinMaxVarsPerChannel(
+      4, TensorShape({3}), {-0.4f, -0.4f, -0.4f}, {7.1f, 7.1f, 7.1f},
+      TensorShape({2, 3}), {-0.51f, -0.5f, -0.24f, 0.0f, 7.0f, 7.1f},
+      {-0.5f, -0.5f, -0.0f, 0.0f, 7.0f, 7.0f});
+}
+
+TEST_F(QuantOpsTest, WithVarsPerChannelDim4NudgedZeroIs0_4Bits) {
+  // Original quantization range: [-0.2 / 2 + 0 / 2, -0.2 / 2 + 15 / 2].
+  // Scale: 1/2,  original zero point: 0.2, nudged to 0.
+  // Nudged range: [0.0; 7.5].
+  // Expected quantized values: 0.0, 0.5, ..., 7.5.
+  RunTestFakeQuantWithMinMaxVarsPerChannel(
+      4, TensorShape({4}), {-0.1f, -0.1f, -0.1f, -0.1f},
+      {7.4f, 7.4f, 7.4f, 7.4f}, TensorShape({1, 2, 3, 4}),
+      {-0.1f, 0.0f,   0.1f,   0.5f,   1.0f,   1.5f,
+       1.5f,  2.0f,   2.5f,   3.0f,   3.5f,   4.0f,
+
+       6.0f,  6.5f,   7.0f,   7.4f,   7.5f,   7.7f,
+       7.8f,  100.0f, 100.0f, 100.0f, 100.0f, 1000.0f},
+      {0.0f, 0.0f, 0.0f, 0.5f, 1.0f, 1.5f, 1.5f, 2.0f, 2.5f, 3.0f, 3.5f, 4.0f,
+
+       6.0f, 6.5f, 7.0f, 7.5f, 7.5f, 7.5f, 7.5f, 7.5f, 7.5f, 7.5f, 7.5f, 7.5f});
+}
+
+TEST_F(QuantOpsTest, WithVarsPerChannelDim4NudgedZeroIs1_4Bits) {
+  // Original quantization range: [-0.8 / 2 + 0 / 2, -0.8 / 2 + 15 / 2].
+  // Scale: 1/2,  original zero point: 0.8, nudged to 1.
+  // Nudged range: [-0.5; 7.0].
+  // Expected quantized values: -0.5, 0.0, 0.5, ..., 7.0.
+  RunTestFakeQuantWithMinMaxVarsPerChannel(
+      4, TensorShape({4}), {-0.4f, -0.4f, -0.4f, -0.4f},
+      {7.1f, 7.1f, 7.1f, 7.1f}, TensorShape({1, 2, 3, 4}),
+      {-0.6f,  -0.5f,  -0.4f,  0.0f,   0.5f,   1.0f,
+       1.5f,   2.0f,   2.5f,   3.0f,   3.5f,   4.0f,
+
+       6.0f,   6.5f,   6.9f,   7.0f,   7.1f,   7.7f,
+       100.0f, 100.0f, 100.0f, 100.0f, 100.0f, 1000.0f},
+      {-0.5f, -0.5f, -0.5f, 0.0f, 0.5f, 1.0f,
+       1.5f,  2.0f,  2.5f,  3.0f, 3.5f, 4.0f,
+
+       6.0f,  6.5f,  7.0f,  7.0f, 7.0f, 7.0f,
+       7.0f,  7.0f,  7.0f,  7.0f, 7.0f, 7.0f});
+}
+
+TEST_F(QuantOpsTest, WithVarsPerChannelDim1GradientNudgedZeroIs0) {
+  // Original quantization ranges: [-0.4 / 4 + 0 / 4, -0.4 / 4 + 255 / 4].
+  // Scale: 1/4,  original zero point: 0.4, nudged to 0.
+  // Nudged ranges: [0.0; 63.75].
+  // Expected quantized values: 0.0, 0.25, 0.5, ..., 63.75.
+  TF_EXPECT_OK(NodeDefBuilder("op", "FakeQuantWithMinMaxVarsPerChannelGradient")
+                   .Input(FakeInput(DT_FLOAT))  // gradients
                    .Input(FakeInput(DT_FLOAT))  // inputs
                    .Input(FakeInput(DT_FLOAT))  // min
                    .Input(FakeInput(DT_FLOAT))  // max
                    .Finalize(node_def()));
   TF_EXPECT_OK(InitOp());
+  // Upstream gradients.
+  AddRandomInput(TensorShape({4}));
   // Downstream inputs.
   AddInputFromArray<float>(TensorShape({4}), {-0.1f, 0.0f, 63.75f, 63.8f});
   // Min.
@@ -328,25 +655,40 @@ TEST_F(QuantOpsTest, WithVarsPerChannelDim1NudgedZeroIs0) {
   // Tested code.
   TF_ASSERT_OK(RunOpKernel());
 
-  Tensor* output = GetOutput(0);
-  Tensor expected(allocator(), DT_FLOAT, TensorShape({4}));
-  FillValues<float>(&expected, {0.0f, 0.0f, 63.75f, 63.75f});
-  ExpectClose(expected, *output);
+  Tensor* output_bprop_wrt_input = GetOutput(0);
+  Tensor expected_bprop_wrt_input(allocator(), DT_FLOAT, TensorShape({4}));
+  auto grad_flat = GetInput(0).flat<float>();
+  FillValues<float>(&expected_bprop_wrt_input,
+                    {0.0f, grad_flat(1), grad_flat(2), 0.0f});
+  ExpectClose(expected_bprop_wrt_input, *output_bprop_wrt_input);
+
+  Tensor* output_bprop_wrt_min = GetOutput(1);
+  Tensor expected_bprop_wrt_min(allocator(), DT_FLOAT, TensorShape({4}));
+  FillValues<float>(&expected_bprop_wrt_min, {grad_flat(0), 0.0f, 0.0f, 0.0f});
+  ExpectClose(expected_bprop_wrt_min, *output_bprop_wrt_min);
+
+  Tensor* output_bprop_wrt_max = GetOutput(2);
+  Tensor expected_bprop_wrt_max(allocator(), DT_FLOAT, TensorShape({4}));
+  FillValues<float>(&expected_bprop_wrt_max, {0.0f, 0.0f, 0.0f, grad_flat(3)});
+  ExpectClose(expected_bprop_wrt_max, *output_bprop_wrt_max);
 }
 
-TEST_F(QuantOpsTest, WithVarsPerChannelDim1NudgedZeroIs1) {
+TEST_F(QuantOpsTest, WithVarsPerChannelDim1GradientNudgedZeroIs1) {
   // Original quantization ranges: [-0.5 / 4 + 0 / 4, -0.5 / 4 + 255 / 4].
   // Scale: 1/4,  original zero point: 0.5, nudged to 1.
   // Nudged ranges: [-0.25; 63.5].
   // Expected quantized values: -0.25, 0.0, 0.25, ..., 63.5.
-  TF_EXPECT_OK(NodeDefBuilder("op", "FakeQuantWithMinMaxVarsPerChannel")
+  TF_EXPECT_OK(NodeDefBuilder("op", "FakeQuantWithMinMaxVarsPerChannelGradient")
+                   .Input(FakeInput(DT_FLOAT))  // gradients
                    .Input(FakeInput(DT_FLOAT))  // inputs
                    .Input(FakeInput(DT_FLOAT))  // min
                    .Input(FakeInput(DT_FLOAT))  // max
                    .Finalize(node_def()));
   TF_EXPECT_OK(InitOp());
+  // Upstream gradients.
+  AddRandomInput(TensorShape({4}));
   // Downstream inputs.
-  AddInputFromArray<float>(TensorShape({4}), {-0.26f, -0.25f, -0.24f, 63.6f});
+  AddInputFromArray<float>(TensorShape({4}), {-0.3f, -0.25f, 63.5f, 63.6f});
   // Min.
   AddInputFromArray<float>(TensorShape({4}),
                            {-0.125f, -0.125f, -0.125f, -0.125f});
@@ -357,27 +699,41 @@ TEST_F(QuantOpsTest, WithVarsPerChannelDim1NudgedZeroIs1) {
   // Tested code.
   TF_ASSERT_OK(RunOpKernel());
 
-  Tensor* output = GetOutput(0);
-  Tensor expected(allocator(), DT_FLOAT, TensorShape({4}));
-  FillValues<float>(&expected, {-0.25f, -0.25f, -0.25f, 63.5f});
-  ExpectClose(expected, *output);
+  Tensor* output_bprop_wrt_input = GetOutput(0);
+  Tensor expected_bprop_wrt_input(allocator(), DT_FLOAT, TensorShape({4}));
+  auto grad_flat = GetInput(0).flat<float>();
+  FillValues<float>(&expected_bprop_wrt_input,
+                    {0.0f, grad_flat(1), grad_flat(2), 0.0f});
+  ExpectClose(expected_bprop_wrt_input, *output_bprop_wrt_input);
+
+  Tensor* output_bprop_wrt_min = GetOutput(1);
+  Tensor expected_bprop_wrt_min(allocator(), DT_FLOAT, TensorShape({4}));
+  FillValues<float>(&expected_bprop_wrt_min, {grad_flat(0), 0.0f, 0.0f, 0.0f});
+  ExpectClose(expected_bprop_wrt_min, *output_bprop_wrt_min);
+
+  Tensor* output_bprop_wrt_max = GetOutput(2);
+  Tensor expected_bprop_wrt_max(allocator(), DT_FLOAT, TensorShape({4}));
+  FillValues<float>(&expected_bprop_wrt_max, {0.0f, 0.0f, 0.0f, grad_flat(3)});
+  ExpectClose(expected_bprop_wrt_max, *output_bprop_wrt_max);
 }
 
-TEST_F(QuantOpsTest, WithVarsPerChannelDim2NudgedZeroIs0) {
+TEST_F(QuantOpsTest, WithVarsPerChannelDim2GradientNudgedZeroIs0) {
   // Original quantization ranges: [-0.4 / 4 + 0 / 4, -0.4 / 4 + 255 / 4].
   // Scale: 1/4,  original zero point: 0.4, nudged to 0.
   // Nudged ranges: [0.0; 63.75].
   // Expected quantized values: 0.0, 0.25, 0.5, ..., 63.75.
-  TF_EXPECT_OK(NodeDefBuilder("op", "FakeQuantWithMinMaxVarsPerChannel")
+  TF_EXPECT_OK(NodeDefBuilder("op", "FakeQuantWithMinMaxVarsPerChannelGradient")
+                   .Input(FakeInput(DT_FLOAT))  // gradients
                    .Input(FakeInput(DT_FLOAT))  // inputs
                    .Input(FakeInput(DT_FLOAT))  // min
                    .Input(FakeInput(DT_FLOAT))  // max
                    .Finalize(node_def()));
   TF_EXPECT_OK(InitOp());
+  // Upstream gradients.
+  AddRandomInput(TensorShape({2, 3}));
   // Downstream inputs.
   AddInputFromArray<float>(TensorShape({2, 3}),
-                           {-0.1f, 0.0f, 0.1f,
-                           0.25f, 63.75f, 63.8f});
+                           {-0.1f, 0.0f, 0.1f, 0.25f, 63.75f, 63.8f});
   // Min.
   AddInputFromArray<float>(TensorShape({3}), {-0.1f, -0.1f, -0.1f});
   // Max.
@@ -386,28 +742,42 @@ TEST_F(QuantOpsTest, WithVarsPerChannelDim2NudgedZeroIs0) {
   // Tested code.
   TF_ASSERT_OK(RunOpKernel());
 
-  Tensor* output = GetOutput(0);
-  Tensor expected(allocator(), DT_FLOAT, TensorShape({2, 3}));
-  FillValues<float>(&expected, {0.0f, 0.0f, 0.0f,
-                                0.25f, 63.75f, 63.75f});
-  ExpectClose(expected, *output);
+  Tensor* output_bprop_wrt_input = GetOutput(0);
+  Tensor expected_bprop_wrt_input(allocator(), DT_FLOAT, TensorShape({2, 3}));
+  auto grad_flat = GetInput(0).flat<float>();
+  FillValues<float>(
+      &expected_bprop_wrt_input,
+      {0.0f, grad_flat(1), grad_flat(2), grad_flat(3), grad_flat(4), 0.0f});
+  ExpectClose(expected_bprop_wrt_input, *output_bprop_wrt_input);
+
+  Tensor* output_bprop_wrt_min = GetOutput(1);
+  Tensor expected_bprop_wrt_min(allocator(), DT_FLOAT, TensorShape({3}));
+  FillValues<float>(&expected_bprop_wrt_min, {grad_flat(0), 0.0f, 0.0f});
+  ExpectClose(expected_bprop_wrt_min, *output_bprop_wrt_min);
+
+  Tensor* output_bprop_wrt_max = GetOutput(2);
+  Tensor expected_bprop_wrt_max(allocator(), DT_FLOAT, TensorShape({3}));
+  FillValues<float>(&expected_bprop_wrt_max, {0.0f, 0.0f, grad_flat(5)});
+  ExpectClose(expected_bprop_wrt_max, *output_bprop_wrt_max);
 }
 
-TEST_F(QuantOpsTest, WithVarsPerChannelDim2NudgedZeroIs1) {
+TEST_F(QuantOpsTest, WithVarsPerChannelDim2GradientNudgedZeroIs1) {
   // Original quantization ranges: [-0.5 / 4 + 0 / 4, -0.5 / 4 + 255 / 4].
   // Scale: 1/4,  original zero point: 0.5, nudged to 1.
   // Nudged ranges: [-0.25; 63.5].
   // Expected quantized values: -0.25, 0.0, 0.25, ..., 63.5.
-  TF_EXPECT_OK(NodeDefBuilder("op", "FakeQuantWithMinMaxVarsPerChannel")
+  TF_EXPECT_OK(NodeDefBuilder("op", "FakeQuantWithMinMaxVarsPerChannelGradient")
+                   .Input(FakeInput(DT_FLOAT))  // gradients
                    .Input(FakeInput(DT_FLOAT))  // inputs
                    .Input(FakeInput(DT_FLOAT))  // min
                    .Input(FakeInput(DT_FLOAT))  // max
                    .Finalize(node_def()));
   TF_EXPECT_OK(InitOp());
+  // Upstream gradients.
+  AddRandomInput(TensorShape({2, 3}));
   // Downstream inputs.
   AddInputFromArray<float>(TensorShape({2, 3}),
-                           {-0.26f, -0.25f, -0.24f,
-                            0.0f, 63.5f, 63.6f});
+                           {-0.3f, -0.25f, -0.2f, 0.0f, 63.5f, 63.6f});
   // Min.
   AddInputFromArray<float>(TensorShape({3}), {-0.125f, -0.125f, -0.125f});
   // Max.
@@ -416,33 +786,46 @@ TEST_F(QuantOpsTest, WithVarsPerChannelDim2NudgedZeroIs1) {
   // Tested code.
   TF_ASSERT_OK(RunOpKernel());
 
-  Tensor* output = GetOutput(0);
-  Tensor expected(allocator(), DT_FLOAT, TensorShape({2, 3}));
-  FillValues<float>(&expected, {-0.25f, -0.25f, -0.25f,
-                                0.0f, 63.5f, 63.5f});
-  ExpectClose(expected, *output);
+  Tensor* output_bprop_wrt_input = GetOutput(0);
+  Tensor expected_bprop_wrt_input(allocator(), DT_FLOAT, TensorShape({2, 3}));
+  auto grad_flat = GetInput(0).flat<float>();
+  FillValues<float>(
+      &expected_bprop_wrt_input,
+      {0.0f, grad_flat(1), grad_flat(2), grad_flat(3), grad_flat(4), 0.0f});
+  ExpectClose(expected_bprop_wrt_input, *output_bprop_wrt_input);
+
+  Tensor* output_bprop_wrt_min = GetOutput(1);
+  Tensor expected_bprop_wrt_min(allocator(), DT_FLOAT, TensorShape({3}));
+  FillValues<float>(&expected_bprop_wrt_min, {grad_flat(0), 0.0f, 0.0f});
+  ExpectClose(expected_bprop_wrt_min, *output_bprop_wrt_min);
+
+  Tensor* output_bprop_wrt_max = GetOutput(2);
+  Tensor expected_bprop_wrt_max(allocator(), DT_FLOAT, TensorShape({3}));
+  FillValues<float>(&expected_bprop_wrt_max, {0.0f, 0.0f, grad_flat(5)});
+  ExpectClose(expected_bprop_wrt_max, *output_bprop_wrt_max);
 }
 
-TEST_F(QuantOpsTest, WithVarsPerChannelDim4NudgedZeroIs0) {
+TEST_F(QuantOpsTest, WithVarsPerChannelDim4GradientNudgedZeroIs0) {
   // Original quantization ranges: [-0.4 / 4 + 0 / 4, -0.4 / 4 + 255 / 4].
   // Scale: 1/4,  original zero point: 0.4, nudged to 0.
   // Nudged ranges: [0.0; 63.75].
   // Expected quantized values: 0.0, 0.25, 0.5, ..., 63.75.
-  TF_EXPECT_OK(NodeDefBuilder("op", "FakeQuantWithMinMaxVarsPerChannel")
+  TF_EXPECT_OK(NodeDefBuilder("op", "FakeQuantWithMinMaxVarsPerChannelGradient")
+                   .Input(FakeInput(DT_FLOAT))  // gradients
                    .Input(FakeInput(DT_FLOAT))  // inputs
                    .Input(FakeInput(DT_FLOAT))  // min
                    .Input(FakeInput(DT_FLOAT))  // max
                    .Finalize(node_def()));
   TF_EXPECT_OK(InitOp());
+  // Upstream gradients.
+  AddRandomInput(TensorShape({1, 2, 3, 4}));
   // Downstream inputs.
   AddInputFromArray<float>(TensorShape({1, 2, 3, 4}),
-                           {-0.1f, 0.0f, 0.1f, 0.25f,
-                             0.5f, 0.75f, 1.0f, 1.25f,
-                             1.5f, 1.75f, 2.0f, 2.25f,
+                           {-0.1f,  0.0f,  63.75f, 63.8f, -0.1f,  0.0f,
+                            63.75f, 63.8f, -0.1f,  0.0f,  63.75f, 63.8f,
 
-                             63.0f,  63.25f, 63.5f,   63.7f,
-                             63.75f, 63.8f,  63.9f,  100.0f,
-                            100.0f, 100.0f, 100.0f, 1000.0f});
+                            -0.1f,  0.0f,  63.75f, 63.8f, -0.1f,  0.0f,
+                            63.75f, 63.8f, -0.1f,  0.0f,  63.75f, 63.8f});
   // Min.
   AddInputFromArray<float>(TensorShape({4}), {-0.1f, -0.1f, -0.1f, -0.1f});
   // Max.
@@ -451,39 +834,58 @@ TEST_F(QuantOpsTest, WithVarsPerChannelDim4NudgedZeroIs0) {
   // Tested code.
   TF_ASSERT_OK(RunOpKernel());
 
-  Tensor* output = GetOutput(0);
-  Tensor expected(allocator(), DT_FLOAT, TensorShape({1, 2, 3, 4}));
-  FillValues<float>(&expected,
-                    {0.0f, 0.0f,  0.0f, 0.25f,
-                     0.5f, 0.75f, 1.0f, 1.25f,
-                     1.5f, 1.75f, 2.0f, 2.25f,
+  Tensor* output_bprop_wrt_input = GetOutput(0);
+  Tensor expected_bprop_wrt_input(allocator(), DT_FLOAT,
+                                  TensorShape({1, 2, 3, 4}));
+  auto grad_flat = GetInput(0).flat<float>();
+  FillValues<float>(&expected_bprop_wrt_input,
+                    {0.0f, grad_flat(1),  grad_flat(2),  0.0f,
+                     0.0f, grad_flat(5),  grad_flat(6),  0.0f,
+                     0.0f, grad_flat(9),  grad_flat(10), 0.0f,
 
-                     63.0f,  63.25f, 63.5f,  63.75f,
-                     63.75f, 63.75f, 63.75f, 63.75f,
-                     63.75f, 63.75f, 63.75f, 63.75f});
-  ExpectClose(expected, *output);
+                     0.0f, grad_flat(13), grad_flat(14), 0.0f,
+                     0.0f, grad_flat(17), grad_flat(18), 0.0f,
+                     0.0f, grad_flat(21), grad_flat(22), 0.0f});
+  ExpectClose(expected_bprop_wrt_input, *output_bprop_wrt_input);
+
+  Tensor* output_bprop_wrt_min = GetOutput(1);
+  Tensor expected_bprop_wrt_min(allocator(), DT_FLOAT, TensorShape({4}));
+  FillValues<float>(&expected_bprop_wrt_min,
+                    {grad_flat(0) + grad_flat(4) + grad_flat(8) +
+                         grad_flat(12) + grad_flat(16) + grad_flat(20),
+                     0.0f, 0.0f, 0.0f});
+  ExpectClose(expected_bprop_wrt_min, *output_bprop_wrt_min);
+
+  Tensor* output_bprop_wrt_max = GetOutput(2);
+  Tensor expected_bprop_wrt_max(allocator(), DT_FLOAT, TensorShape({4}));
+  FillValues<float>(&expected_bprop_wrt_max,
+                    {0.0f, 0.0f, 0.0f,
+                     grad_flat(3) + grad_flat(7) + grad_flat(11) +
+                         grad_flat(15) + grad_flat(19) + grad_flat(23)});
+  ExpectClose(expected_bprop_wrt_max, *output_bprop_wrt_max);
 }
 
-TEST_F(QuantOpsTest, WithVarsPerChannelDim4NudgedZeroIs1) {
+TEST_F(QuantOpsTest, WithVarsPerChannelDim4GradientNudgedZeroIs1) {
   // Original quantization ranges: [-0.5 / 4 + 0 / 4, -0.5 / 4 + 255 / 4].
   // Scale: 1/4,  original zero point: 0.5, nudged to 1.
   // Nudged ranges: [-0.25; 63.5].
   // Expected quantized values: -0.25, 0.0, 0.25, ..., 63.5.
-  TF_EXPECT_OK(NodeDefBuilder("op", "FakeQuantWithMinMaxVarsPerChannel")
+  TF_EXPECT_OK(NodeDefBuilder("op", "FakeQuantWithMinMaxVarsPerChannelGradient")
+                   .Input(FakeInput(DT_FLOAT))  // gradients
                    .Input(FakeInput(DT_FLOAT))  // inputs
                    .Input(FakeInput(DT_FLOAT))  // min
                    .Input(FakeInput(DT_FLOAT))  // max
                    .Finalize(node_def()));
   TF_EXPECT_OK(InitOp());
+  // Upstream gradients.
+  AddRandomInput(TensorShape({1, 2, 3, 4}));
   // Downstream inputs.
   AddInputFromArray<float>(TensorShape({1, 2, 3, 4}),
-                           {-0.3f, -0.25f, -0.2f,  0.0f,
-                             0.25f, 0.5f,   0.75f, 1.0f,
-                             1.25f, 1.5f,   1.75f, 2.0f,
+                           {-0.3f, -0.25f, 63.5f, 63.6f,  -0.3f, -0.25f,
+                            63.5f, 63.6f,  -0.3f, -0.25f, 63.5f, 63.6f,
 
-                             63.0f,  63.25f, 63.4f,   63.5f,
-                             63.6f,  63.7f, 100.0f,  100.0f,
-                            100.0f, 100.0f, 100.0f, 1000.0f});
+                            -0.3f, -0.25f, 63.5f, 63.6f,  -0.3f, -0.25f,
+                            63.5f, 63.6f,  -0.3f, -0.25f, 63.5f, 63.6f});
   // Min.
   AddInputFromArray<float>(TensorShape({4}),
                            {-0.125f, -0.125f, -0.125f, -0.125f});
@@ -494,39 +896,58 @@ TEST_F(QuantOpsTest, WithVarsPerChannelDim4NudgedZeroIs1) {
   // Tested code.
   TF_ASSERT_OK(RunOpKernel());
 
-  Tensor* output = GetOutput(0);
-  Tensor expected(allocator(), DT_FLOAT, TensorShape({1, 2, 3, 4}));
-  FillValues<float>(&expected,
-                    {-0.25f, -0.25f, -0.25f, 0.0f,
-                      0.25f,  0.5f,   0.75f, 1.0f,
-                      1.25f,  1.5f,   1.75f, 2.0f,
+  Tensor* output_bprop_wrt_input = GetOutput(0);
+  Tensor expected_bprop_wrt_input(allocator(), DT_FLOAT,
+                                  TensorShape({1, 2, 3, 4}));
+  auto grad_flat = GetInput(0).flat<float>();
+  FillValues<float>(&expected_bprop_wrt_input,
+                    {0.0f, grad_flat(1),  grad_flat(2),  0.0f,
+                     0.0f, grad_flat(5),  grad_flat(6),  0.0f,
+                     0.0f, grad_flat(9),  grad_flat(10), 0.0f,
 
-                      63.0f, 63.25f, 63.5f, 63.5f,
-                      63.5f, 63.5f,  63.5f, 63.5f,
-                      63.5f, 63.5f,  63.5f, 63.5f});
-  ExpectClose(expected, *output);
+                     0.0f, grad_flat(13), grad_flat(14), 0.0f,
+                     0.0f, grad_flat(17), grad_flat(18), 0.0f,
+                     0.0f, grad_flat(21), grad_flat(22), 0.0f});
+  ExpectClose(expected_bprop_wrt_input, *output_bprop_wrt_input);
+
+  Tensor* output_bprop_wrt_min = GetOutput(1);
+  Tensor expected_bprop_wrt_min(allocator(), DT_FLOAT, TensorShape({4}));
+  FillValues<float>(&expected_bprop_wrt_min,
+                    {grad_flat(0) + grad_flat(4) + grad_flat(8) +
+                         grad_flat(12) + grad_flat(16) + grad_flat(20),
+                     0.0f, 0.0f, 0.0f});
+  ExpectClose(expected_bprop_wrt_min, *output_bprop_wrt_min);
+
+  Tensor* output_bprop_wrt_max = GetOutput(2);
+  Tensor expected_bprop_wrt_max(allocator(), DT_FLOAT, TensorShape({4}));
+  FillValues<float>(&expected_bprop_wrt_max,
+                    {0.0f, 0.0f, 0.0f,
+                     grad_flat(3) + grad_flat(7) + grad_flat(11) +
+                         grad_flat(15) + grad_flat(19) + grad_flat(23)});
+  ExpectClose(expected_bprop_wrt_max, *output_bprop_wrt_max);
 }
 
-TEST_F(QuantOpsTest, WithVarsPerChannelDim1GradientNudgedZeroIs0) {
-  // Original quantization ranges: [-0.4 / 4 + 0 / 4, -0.4 / 4 + 255 / 4].
-  // Scale: 1/4,  original zero point: 0.4, nudged to 0.
-  // Nudged ranges: [0.0; 63.75].
-  // Expected quantized values: 0.0, 0.25, 0.5, ..., 63.75.
+TEST_F(QuantOpsTest, WithVarsPerChannelDim1GradientNudgedZeroIs0_4Bits) {
+  // Original quantization range: [-0.2 / 2 + 0 / 2, -0.2 / 2 + 15 / 2].
+  // Scale: 1/2,  original zero point: 0.2, nudged to 0.
+  // Nudged range: [0.0; 7.5].
+  // Expected quantized values: 0.0, 0.5, ..., 7.5.
   TF_EXPECT_OK(NodeDefBuilder("op", "FakeQuantWithMinMaxVarsPerChannelGradient")
                    .Input(FakeInput(DT_FLOAT))  // gradients
                    .Input(FakeInput(DT_FLOAT))  // inputs
                    .Input(FakeInput(DT_FLOAT))  // min
                    .Input(FakeInput(DT_FLOAT))  // max
+                   .Attr("num_bits", 4)
                    .Finalize(node_def()));
   TF_EXPECT_OK(InitOp());
   // Upstream gradients.
   AddRandomInput(TensorShape({4}));
   // Downstream inputs.
-  AddInputFromArray<float>(TensorShape({4}), {-0.1f, 0.0f, 63.75f, 63.8f});
+  AddInputFromArray<float>(TensorShape({4}), {-0.1f, 0.0f, 7.5f, 7.6f});
   // Min.
   AddInputFromArray<float>(TensorShape({4}), {-0.1f, -0.1f, -0.1f, -0.1f});
   // Max.
-  AddInputFromArray<float>(TensorShape({4}), {63.65f, 63.65f, 63.65f, 63.65f});
+  AddInputFromArray<float>(TensorShape({4}), {7.4f, 7.4f, 7.4f, 7.4f});
 
   // Tested code.
   TF_ASSERT_OK(RunOpKernel());
@@ -551,28 +972,27 @@ TEST_F(QuantOpsTest, WithVarsPerChannelDim1GradientNudgedZeroIs0) {
   ExpectClose(expected_bprop_wrt_max, *output_bprop_wrt_max);
 }
 
-TEST_F(QuantOpsTest, WithVarsPerChannelDim1GradientNudgedZeroIs1) {
-  // Original quantization ranges: [-0.5 / 4 + 0 / 4, -0.5 / 4 + 255 / 4].
-  // Scale: 1/4,  original zero point: 0.5, nudged to 1.
-  // Nudged ranges: [-0.25; 63.5].
-  // Expected quantized values: -0.25, 0.0, 0.25, ..., 63.5.
+TEST_F(QuantOpsTest, WithVarsPerChannelDim1GradientNudgedZeroIs1_4Bits) {
+  // Original quantization range: [-0.8 / 2 + 0 / 2, -0.8 / 2 + 15 / 2].
+  // Scale: 1/2,  original zero point: 0.8, nudged to 1.
+  // Nudged range: [-0.5; 7.0].
+  // Expected quantized values: -0.5, 0.0, 0.5, ..., 7.0.
   TF_EXPECT_OK(NodeDefBuilder("op", "FakeQuantWithMinMaxVarsPerChannelGradient")
                    .Input(FakeInput(DT_FLOAT))  // gradients
                    .Input(FakeInput(DT_FLOAT))  // inputs
                    .Input(FakeInput(DT_FLOAT))  // min
                    .Input(FakeInput(DT_FLOAT))  // max
+                   .Attr("num_bits", 4)
                    .Finalize(node_def()));
   TF_EXPECT_OK(InitOp());
   // Upstream gradients.
   AddRandomInput(TensorShape({4}));
   // Downstream inputs.
-  AddInputFromArray<float>(TensorShape({4}), {-0.3f, -0.25f, 63.5f, 63.6f});
+  AddInputFromArray<float>(TensorShape({4}), {-0.6f, -0.5f, 7.0f, 7.1f});
   // Min.
-  AddInputFromArray<float>(TensorShape({4}),
-                           {-0.125f, -0.125f, -0.125f, -0.125f});
+  AddInputFromArray<float>(TensorShape({4}), {-0.4f, -0.4f, -0.4f, -0.4f});
   // Max.
-  AddInputFromArray<float>(TensorShape({4}),
-                           {63.625f, 63.625f, 63.625f, 63.625f});
+  AddInputFromArray<float>(TensorShape({4}), {7.1f, 7.1f, 7.1f, 7.1f});
 
   // Tested code.
   TF_ASSERT_OK(RunOpKernel());
@@ -597,28 +1017,28 @@ TEST_F(QuantOpsTest, WithVarsPerChannelDim1GradientNudgedZeroIs1) {
   ExpectClose(expected_bprop_wrt_max, *output_bprop_wrt_max);
 }
 
-TEST_F(QuantOpsTest, WithVarsPerChannelDim2GradientNudgedZeroIs0) {
-  // Original quantization ranges: [-0.4 / 4 + 0 / 4, -0.4 / 4 + 255 / 4].
-  // Scale: 1/4,  original zero point: 0.4, nudged to 0.
-  // Nudged ranges: [0.0; 63.75].
-  // Expected quantized values: 0.0, 0.25, 0.5, ..., 63.75.
+TEST_F(QuantOpsTest, WithVarsPerChannelDim2GradientNudgedZeroIs0_4Bits) {
+  // Original quantization range: [-0.2 / 2 + 0 / 2, -0.2 / 2 + 15 / 2].
+  // Scale: 1/2,  original zero point: 0.2, nudged to 0.
+  // Nudged range: [0.0; 7.5].
+  // Expected quantized values: 0.0, 0.5, ..., 7.5.
   TF_EXPECT_OK(NodeDefBuilder("op", "FakeQuantWithMinMaxVarsPerChannelGradient")
                    .Input(FakeInput(DT_FLOAT))  // gradients
                    .Input(FakeInput(DT_FLOAT))  // inputs
                    .Input(FakeInput(DT_FLOAT))  // min
                    .Input(FakeInput(DT_FLOAT))  // max
+                   .Attr("num_bits", 4)
                    .Finalize(node_def()));
   TF_EXPECT_OK(InitOp());
   // Upstream gradients.
   AddRandomInput(TensorShape({2, 3}));
   // Downstream inputs.
   AddInputFromArray<float>(TensorShape({2, 3}),
-                           {-0.1f, 0.0f, 0.1f,
-                            0.25f, 63.75f, 63.8f});
+                           {-0.1f, 0.0f, 0.1f, 0.5f, 7.5f, 7.6f});
   // Min.
   AddInputFromArray<float>(TensorShape({3}), {-0.1f, -0.1f, -0.1f});
   // Max.
-  AddInputFromArray<float>(TensorShape({3}), {63.65f, 63.65f, 63.65f});
+  AddInputFromArray<float>(TensorShape({3}), {7.4f, 7.4f, 7.4f});
 
   // Tested code.
   TF_ASSERT_OK(RunOpKernel());
@@ -644,28 +1064,28 @@ TEST_F(QuantOpsTest, WithVarsPerChannelDim2GradientNudgedZeroIs0) {
   ExpectClose(expected_bprop_wrt_max, *output_bprop_wrt_max);
 }
 
-TEST_F(QuantOpsTest, WithVarsPerChannelDim2GradientNudgedZeroIs1) {
-  // Original quantization ranges: [-0.5 / 4 + 0 / 4, -0.5 / 4 + 255 / 4].
-  // Scale: 1/4,  original zero point: 0.5, nudged to 1.
-  // Nudged ranges: [-0.25; 63.5].
-  // Expected quantized values: -0.25, 0.0, 0.25, ..., 63.5.
+TEST_F(QuantOpsTest, WithVarsPerChannelDim2GradientNudgedZeroIs1_4Bits) {
+  // Original quantization range: [-0.8 / 2 + 0 / 2, -0.8 / 2 + 15 / 2].
+  // Scale: 1/2,  original zero point: 0.8, nudged to 1.
+  // Nudged range: [-0.5; 7.0].
+  // Expected quantized values: -0.5, 0.0, 0.5, ..., 7.0.
   TF_EXPECT_OK(NodeDefBuilder("op", "FakeQuantWithMinMaxVarsPerChannelGradient")
                    .Input(FakeInput(DT_FLOAT))  // gradients
                    .Input(FakeInput(DT_FLOAT))  // inputs
                    .Input(FakeInput(DT_FLOAT))  // min
                    .Input(FakeInput(DT_FLOAT))  // max
+                   .Attr("num_bits", 4)
                    .Finalize(node_def()));
   TF_EXPECT_OK(InitOp());
   // Upstream gradients.
   AddRandomInput(TensorShape({2, 3}));
   // Downstream inputs.
   AddInputFromArray<float>(TensorShape({2, 3}),
-                           {-0.3f, -0.25f, -0.2f,
-                            0.0f, 63.5f, 63.6f});
+                           {-0.6f, -0.5f, -0.4f, 0.0f, 7.0f, 7.1f});
   // Min.
-  AddInputFromArray<float>(TensorShape({3}), {-0.125f, -0.125f, -0.125f});
+  AddInputFromArray<float>(TensorShape({3}), {-0.4f, -0.4f, -0.4f});
   // Max.
-  AddInputFromArray<float>(TensorShape({3}), {63.625f, 63.625f, 63.625f});
+  AddInputFromArray<float>(TensorShape({3}), {7.1f, 7.1f, 7.1f});
 
   // Tested code.
   TF_ASSERT_OK(RunOpKernel());
@@ -691,33 +1111,32 @@ TEST_F(QuantOpsTest, WithVarsPerChannelDim2GradientNudgedZeroIs1) {
   ExpectClose(expected_bprop_wrt_max, *output_bprop_wrt_max);
 }
 
-TEST_F(QuantOpsTest, WithVarsPerChannelDim4GradientNudgedZeroIs0) {
-  // Original quantization ranges: [-0.4 / 4 + 0 / 4, -0.4 / 4 + 255 / 4].
-  // Scale: 1/4,  original zero point: 0.4, nudged to 0.
-  // Nudged ranges: [0.0; 63.75].
-  // Expected quantized values: 0.0, 0.25, 0.5, ..., 63.75.
+TEST_F(QuantOpsTest, WithVarsPerChannelDim4GradientNudgedZeroIs0_4Bits) {
+  // Original quantization range: [-0.2 / 2 + 0 / 2, -0.2 / 2 + 15 / 2].
+  // Scale: 1/2,  original zero point: 0.2, nudged to 0.
+  // Nudged range: [0.0; 7.5].
+  // Expected quantized values: 0.0, 0.5, ..., 7.5.
   TF_EXPECT_OK(NodeDefBuilder("op", "FakeQuantWithMinMaxVarsPerChannelGradient")
                    .Input(FakeInput(DT_FLOAT))  // gradients
                    .Input(FakeInput(DT_FLOAT))  // inputs
                    .Input(FakeInput(DT_FLOAT))  // min
                    .Input(FakeInput(DT_FLOAT))  // max
+                   .Attr("num_bits", 4)
                    .Finalize(node_def()));
   TF_EXPECT_OK(InitOp());
   // Upstream gradients.
   AddRandomInput(TensorShape({1, 2, 3, 4}));
   // Downstream inputs.
   AddInputFromArray<float>(TensorShape({1, 2, 3, 4}),
-                           {-0.1f, 0.0f, 63.75f, 63.8f,
-                            -0.1f, 0.0f, 63.75f, 63.8f,
-                            -0.1f, 0.0f, 63.75f, 63.8f,
+                           {-0.1f, 0.0f, 7.5f,  7.6f, -0.1f, 0.0f,
+                            7.5f,  7.6f, -0.1f, 0.0f, 7.5f,  7.6f,
 
-                            -0.1f, 0.0f, 63.75f, 63.8f,
-                            -0.1f, 0.0f, 63.75f, 63.8f,
-                            -0.1f, 0.0f, 63.75f, 63.8f});
+                            -0.1f, 0.0f, 7.5f,  7.6f, -0.1f, 0.0f,
+                            7.5f,  7.6f, -0.1f, 0.0f, 7.5f,  7.6f});
   // Min.
   AddInputFromArray<float>(TensorShape({4}), {-0.1f, -0.1f, -0.1f, -0.1f});
   // Max.
-  AddInputFromArray<float>(TensorShape({4}), {63.65f, 63.65f, 63.65f, 63.65f});
+  AddInputFromArray<float>(TensorShape({4}), {7.4f, 7.4f, 7.4f, 7.4f});
 
   // Tested code.
   TF_ASSERT_OK(RunOpKernel());
@@ -754,35 +1173,32 @@ TEST_F(QuantOpsTest, WithVarsPerChannelDim4GradientNudgedZeroIs0) {
   ExpectClose(expected_bprop_wrt_max, *output_bprop_wrt_max);
 }
 
-TEST_F(QuantOpsTest, WithVarsPerChannelDim4GradientNudgedZeroIs1) {
-  // Original quantization ranges: [-0.5 / 4 + 0 / 4, -0.5 / 4 + 255 / 4].
-  // Scale: 1/4,  original zero point: 0.5, nudged to 1.
-  // Nudged ranges: [-0.25; 63.5].
-  // Expected quantized values: -0.25, 0.0, 0.25, ..., 63.5.
+TEST_F(QuantOpsTest, WithVarsPerChannelDim4GradientNudgedZeroIs1_4Bits) {
+  // Original quantization range: [-0.8 / 2 + 0 / 2, -0.8 / 2 + 15 / 2].
+  // Scale: 1/2,  original zero point: 0.8, nudged to 1.
+  // Nudged range: [-0.5; 7.0].
+  // Expected quantized values: -0.5, 0.0, 0.5, ..., 7.0.
   TF_EXPECT_OK(NodeDefBuilder("op", "FakeQuantWithMinMaxVarsPerChannelGradient")
                    .Input(FakeInput(DT_FLOAT))  // gradients
                    .Input(FakeInput(DT_FLOAT))  // inputs
                    .Input(FakeInput(DT_FLOAT))  // min
                    .Input(FakeInput(DT_FLOAT))  // max
+                   .Attr("num_bits", 4)
                    .Finalize(node_def()));
   TF_EXPECT_OK(InitOp());
   // Upstream gradients.
   AddRandomInput(TensorShape({1, 2, 3, 4}));
   // Downstream inputs.
   AddInputFromArray<float>(TensorShape({1, 2, 3, 4}),
-                           {-0.3f, -0.25f, 63.5f, 63.6f,
-                            -0.3f, -0.25f, 63.5f, 63.6f,
-                            -0.3f, -0.25f, 63.5f, 63.6f,
+                           {-0.6f, -0.5f, 7.0f,  7.1f,  -0.6f, -0.5f,
+                            7.0f,  7.1f,  -0.6f, -0.5f, 7.0f,  7.1f,
 
-                            -0.3f, -0.25f, 63.5f, 63.6f,
-                            -0.3f, -0.25f, 63.5f, 63.6f,
-                            -0.3f, -0.25f, 63.5f, 63.6f});
+                            -0.6f, -0.5f, 7.0f,  7.1f,  -0.6f, -0.5f,
+                            7.0f,  7.1f,  -0.6f, -0.5f, 7.0f,  7.1f});
   // Min.
-  AddInputFromArray<float>(TensorShape({4}),
-                           {-0.125f, -0.125f, -0.125f, -0.125f});
+  AddInputFromArray<float>(TensorShape({4}), {-0.4f, -0.4f, -0.4f, -0.4f});
   // Max.
-  AddInputFromArray<float>(TensorShape({4}),
-                           {63.625f, 63.625f, 63.625f, 63.625f});
+  AddInputFromArray<float>(TensorShape({4}), {7.1f, 7.1f, 7.1f, 7.1f});
 
   // Tested code.
   TF_ASSERT_OK(RunOpKernel());
diff --git a/tensorflow/core/kernels/function_ops.cc b/tensorflow/core/kernels/function_ops.cc
index ba408f365710e518b966f976613f7b78f1d542a4..8c3137ece9fa902c12452f262c9d647afce9d231 100644
--- a/tensorflow/core/kernels/function_ops.cc
+++ b/tensorflow/core/kernels/function_ops.cc
@@ -186,12 +186,12 @@ REGISTER_KERNEL_BUILDER(Name("_ArrayToList")
                         PassOn);
 
 #ifdef TENSORFLOW_USE_SYCL
-#define REGISTER_SYCL_KERNELS(type)                                      \
-  REGISTER_KERNEL_BUILDER(                                               \
-      Name("_ListToArray").Device(DEVICE_SYCL).TypeConstraint<type>("T"),\
-      PassOn);                                                           \
-  REGISTER_KERNEL_BUILDER(                                               \
-      Name("_ArrayToList").Device(DEVICE_SYCL).TypeConstraint<type>("T"),\
+#define REGISTER_SYCL_KERNELS(type)                                       \
+  REGISTER_KERNEL_BUILDER(                                                \
+      Name("_ListToArray").Device(DEVICE_SYCL).TypeConstraint<type>("T"), \
+      PassOn);                                                            \
+  REGISTER_KERNEL_BUILDER(                                                \
+      Name("_ArrayToList").Device(DEVICE_SYCL).TypeConstraint<type>("T"), \
       PassOn);
 
 REGISTER_SYCL_KERNELS(float);
@@ -211,7 +211,7 @@ REGISTER_KERNEL_BUILDER(Name("_ArrayToList")
                             .HostMemory("output")
                             .TypeConstraint<int32>("T"),
                         PassOn);
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 
 class SymbolicGradientOp : public AsyncOpKernel {
  public:
@@ -227,7 +227,7 @@ class SymbolicGradientOp : public AsyncOpKernel {
 
     FunctionLibraryRuntime::Handle handle;
     OP_REQUIRES_OK_ASYNC(
-        ctx, lib->Instantiate(kGradientOp, def().attr(), &handle), done);
+        ctx, lib->Instantiate(kGradientOp, AttrSlice(def()), &handle), done);
 
     FunctionLibraryRuntime::Options opts;
     opts.step_id = ctx->step_id();
diff --git a/tensorflow/core/kernels/hexagon/graph_transferer.cc b/tensorflow/core/kernels/hexagon/graph_transferer.cc
index e3fc228cc708221f782a828c3ba16b62d7bef2c6..d927ef3efa08bf7f0fdb255e21b59b0620475a83 100644
--- a/tensorflow/core/kernels/hexagon/graph_transferer.cc
+++ b/tensorflow/core/kernels/hexagon/graph_transferer.cc
@@ -140,7 +140,7 @@ Status GraphTransferer::LoadGraphFromProto(
     std::vector<DataType> data_types;
     std::vector<TensorShape> shapes;
     status = RemoteFusedGraphExecuteUtils::GetOutputTensorShapeType(
-        node->def(), &data_types, &shapes);
+        node->attrs(), &data_types, &shapes);
     if (status.ok()) {
       CHECK(data_types.size() > port);
       graph_output_node_info.set_dtype(data_types.at(port));
@@ -309,8 +309,9 @@ Status GraphTransferer::RegisterNode(
     RegisterNodeWithPaddingAndStrides(ops_definitions, shape_refiner, node);
   } else if (IsNodeFlattenReshape(node, shape_refiner)) {
     RegisterFlattenNode(ops_definitions, shape_refiner, node);
-  } else if (ops_definitions.GetOpIdFor(node.type_string()) !=
+  } else if (ops_definitions.GetOpIdFor(node.type_string(), {}) !=
              IGraphTransferOpsDefinitions::INVALID_OP_ID) {
+    // TODO(satok): Set correct data type if it's given.
     RegisterGenericNode(ops_definitions, shape_refiner, node);
   } else {
     return errors::InvalidArgument(node.type_string() +
@@ -358,7 +359,7 @@ void GraphTransferer::RegisterConstantNode(const ShapeRefiner& shape_refiner,
   const_node_info.add_shape(shape_array[2]);
   const_node_info.add_shape(shape_array[3]);
   const TensorProto* proto = nullptr;
-  TF_CHECK_OK(GetNodeAttr(node.def(), "value", &proto));
+  TF_CHECK_OK(GetNodeAttr(node.attrs(), "value", &proto));
   Tensor const_tensor;
   // TODO(b/32704451): Don't just ignore this status!
   MakeTensorFromProto(*proto, &const_tensor).IgnoreError();
@@ -394,8 +395,9 @@ int GraphTransferer::RegisterConstantShape(const std::vector<int>& shape) {
 }
 
 bool GraphTransferer::HasPaddingAndStrides(const Node& node) {
-  return node.def().attr().count(PADDING_ATTR_NAME) > 0 &&
-         node.def().attr().count(STRIDES_ATTR_NAME) > 0;
+  auto attrs = node.attrs();
+  return attrs.Find(PADDING_ATTR_NAME) != nullptr &&
+         attrs.Find(STRIDES_ATTR_NAME) != nullptr;
 }
 
 bool GraphTransferer::IsNodeFlattenReshape(const Node& node,
@@ -422,7 +424,7 @@ bool GraphTransferer::IsNodeFlattenReshape(const Node& node,
   } else {
     std::vector<TensorShape> shapes;
     TF_CHECK_OK(RemoteFusedGraphExecuteUtils::GetOutputTensorShapeType(
-        node.def(), nullptr, &shapes));
+        node.attrs(), nullptr, &shapes));
 
     // Number of outputs should be 1 for reshape node.
     CHECK_EQ(1, shapes.size());
@@ -443,22 +445,23 @@ void GraphTransferer::RegisterNodeWithPaddingAndStrides(
   CHECK_EQ(node_name_to_id_cache_map_.count(node.name()), 1);
   const int id = node_name_to_id_cache_map_[node.name()];
   shape_inference::InferenceContext* context = shape_refiner.GetContext(&node);
-  CHECK_GT(node.def().attr().count(PADDING_ATTR_NAME), 0);
+  CHECK(node.attrs().Find(PADDING_ATTR_NAME));
   // TODO(satok): Use context->GetAttr(...) instead?
   Padding padding;
   TF_CHECK_OK(context->GetAttr(PADDING_ATTR_NAME, &padding));
-  CHECK_GT(node.def().attr().count(STRIDES_ATTR_NAME), 0);
+  CHECK(node.attrs().Find(STRIDES_ATTR_NAME));
   std::vector<int32> strides;
   TF_CHECK_OK(context->GetAttr(STRIDES_ATTR_NAME, &strides));
   const int stride_id = RegisterConstantShape(strides);
   std::vector<int> extra_inputs{stride_id};
-  if (node.def().attr().count(KSIZE_ATTR_NAME) > 0) {
+  if (node.attrs().Find(KSIZE_ATTR_NAME)) {
     std::vector<int32> kernel_sizes;
     TF_CHECK_OK(context->GetAttr(KSIZE_ATTR_NAME, &kernel_sizes));
     const int ksize_id = RegisterConstantShape(kernel_sizes);
     extra_inputs.insert(extra_inputs.begin(), ksize_id);
   }
-  const int op_type_id = ops_definitions.GetOpIdFor(node.type_string());
+  // TODO(satok): Set correct data type if it's given.
+  const int op_type_id = ops_definitions.GetOpIdFor(node.type_string(), {});
   CHECK(op_type_id >= 0 && op_type_id < ops_definitions.GetTotalOpsCount())
       << "Op " << node.type_string() << " not found in map(id = " << op_type_id
       << ")";
@@ -477,7 +480,8 @@ void GraphTransferer::RegisterInputNode(
   CHECK_EQ(node_name_to_id_cache_map_.count(node.name()), 1);
   const int id = node_name_to_id_cache_map_[node.name()];
   const string op_type = node.type_string();
-  const int op_type_id = ops_definitions.GetOpIdFor(op_type);
+  // TODO(satok): Set correct data type if it's given.
+  const int op_type_id = ops_definitions.GetOpIdFor(op_type, {});
   CHECK(op_type_id >= 0 && op_type_id < ops_definitions.GetTotalOpsCount())
       << "Op" << node.name() << ", " << op_type << " is not supported,"
       << op_type_id;
@@ -494,7 +498,8 @@ void GraphTransferer::RegisterFlattenNode(
   CHECK_EQ(node_name_to_id_cache_map_.count(node.name()), 1);
   const int id = node_name_to_id_cache_map_[node.name()];
   const string op_type = IGraphTransferOpsDefinitions::FLATTEN_OP_NAME;
-  const int op_type_id = ops_definitions.GetOpIdFor(op_type);
+  // TODO(satok): Set correct data type if it's given.
+  const int op_type_id = ops_definitions.GetOpIdFor(op_type, {});
   CHECK(op_type_id >= 0 && op_type_id < ops_definitions.GetTotalOpsCount());
 
   AppendNodeParamsWithIoParams(
@@ -509,7 +514,8 @@ void GraphTransferer::RegisterGenericNode(
   VLOG(1) << "Register generic node: " << node.name();
   CHECK_EQ(node_name_to_id_cache_map_.count(node.name()), 1);
   const int id = node_name_to_id_cache_map_[node.name()];
-  const int op_type_id = ops_definitions.GetOpIdFor(node.type_string());
+  // TODO(satok): Set correct data type if it's given.
+  const int op_type_id = ops_definitions.GetOpIdFor(node.type_string(), {});
   CHECK(op_type_id >= 0 && op_type_id < ops_definitions.GetTotalOpsCount());
 
   AppendNodeParamsWithIoParams(
@@ -592,7 +598,7 @@ void GraphTransferer::AppendNodeOutputParams(const ShapeRefiner& shape_refiner,
 
   std::vector<TensorShape> shapes;
   Status status = RemoteFusedGraphExecuteUtils::GetOutputTensorShapeType(
-      node.def(), nullptr, &shapes);
+      node.attrs(), nullptr, &shapes);
 
   for (int i = 0; i < node.num_outputs(); ++i) {
     int data_size = -1;
diff --git a/tensorflow/core/kernels/hexagon/graph_transferer_test.cc b/tensorflow/core/kernels/hexagon/graph_transferer_test.cc
index 142c9e128325923a9b3f57a729c856bfe428c1b6..ebd4a90330155958da4c1324f368116a2e8f48e8 100644
--- a/tensorflow/core/kernels/hexagon/graph_transferer_test.cc
+++ b/tensorflow/core/kernels/hexagon/graph_transferer_test.cc
@@ -54,16 +54,18 @@ const RemoteFusedGraphExecuteUtils::TensorShapeMap EMPTY_OUTPUT_TENSOR_MAP;
 class TestGraphTransferOpsDefinitions : public IGraphTransferOpsDefinitions {
  public:
   int GetTotalOpsCount() const final { return OP_TYPES.size(); }
-  int GetOpIdFor(const string& op_type) const final {
-    for (int i = 0; i < OP_TYPES.size(); ++i) {
-      if (OP_TYPES[i] == op_type) {
-        return i;
-      }
+
+int GetOpIdFor(const string& op_type, const DataTypeVector&) const final {
+  for (int i = 0; i < OP_TYPES.size(); ++i) {
+    if (OP_TYPES[i] == op_type) {
+      return i;
     }
-    return -1;
   }
-  GraphTransferInfo::Destination GetTransferDestination() const final {
-    return GraphTransferInfo::NOP;
+  return -1;
+}
+
+GraphTransferInfo::Destination GetTransferDestination() const final {
+  return GraphTransferInfo::NOP;
   }
 
  private:
diff --git a/tensorflow/core/kernels/hexagon/hexagon_control_wrapper.cc b/tensorflow/core/kernels/hexagon/hexagon_control_wrapper.cc
index 4e6944e63e07f0fb392f57b0f3661f3a5a322868..518b399c37482dd7b5ad1ef333f86c6e97f75631 100644
--- a/tensorflow/core/kernels/hexagon/hexagon_control_wrapper.cc
+++ b/tensorflow/core/kernels/hexagon/hexagon_control_wrapper.cc
@@ -88,7 +88,7 @@ bool HexagonControlWrapper::SetupGraph() {
     CHECK_NE(node_info, nullptr);
     node_info->set_type_name(INPUT_OP_NAME);
     node_info->set_soc_op_id(
-        HexagonOpsDefinitions::getInstance().GetOpIdFor(INPUT_OP_NAME));
+        HexagonOpsDefinitions::getInstance().GetOpIdFor(INPUT_OP_NAME, {}));
   }
 
   // Generate a new output node which is connected to graph output node
@@ -106,7 +106,7 @@ bool HexagonControlWrapper::SetupGraph() {
     new_output_node_info.set_node_id(new_output_node_id);
     new_output_node_info.set_type_name(OUTPUT_OP_NAME);
     new_output_node_info.set_soc_op_id(
-        HexagonOpsDefinitions::getInstance().GetOpIdFor(OUTPUT_OP_NAME));
+        HexagonOpsDefinitions::getInstance().GetOpIdFor(OUTPUT_OP_NAME, {}));
     new_output_node_info.set_padding_id(0 /* PADDING_NA_ID */);
     new_output_node_info.set_input_count(1);
     new_output_node_info.set_output_count(0);
diff --git a/tensorflow/core/kernels/hexagon/hexagon_ops_definitions.cc b/tensorflow/core/kernels/hexagon/hexagon_ops_definitions.cc
index 099ebe8fb6480f64f98b8d42293747989347c858..67f26b6db97b376351ae475b47059304ff602c5d 100644
--- a/tensorflow/core/kernels/hexagon/hexagon_ops_definitions.cc
+++ b/tensorflow/core/kernels/hexagon/hexagon_ops_definitions.cc
@@ -15,14 +15,18 @@ limitations under the License.
 
 #include "tensorflow/core/kernels/hexagon/hexagon_ops_definitions.h"
 
-#include <unordered_map>
-
 #include "tensorflow/core/framework/types.h"
 
+// CAVEAT: Comment-out the following macro if you want to use experimental
+// hexagon ops.
+//#define ENABLE_EXPERIMENTAL_HEXNN_OPS
+
 namespace tensorflow {
 
 // HVX internal supported ops names
-enum class SupportedOpType {
+// TODO(satok): Remove this map once hexnn lib supports an API to retrieve op id
+// from op name and data type
+enum class HexagonOpsDefinitions::SupportedOpType {
   INPUT,
   OUTPUT,
   NOP,
@@ -38,6 +42,136 @@ enum class SupportedOpType {
   PPRINT_FLOAT,
   PREFREE,
   FLATTEN,
+
+#ifdef ENABLE_EXPERIMENTAL_HEXNN_OPS
+  // With Reference
+  QUANTIZEDCONV2D_8X8TO32,
+  QUANTIZEDCONV2D_8X8TO32_REF,
+  QUANTIZEDMATMUL_8X8TO32,
+  QUANTIZEDMATMUL_8X8TO32_REF,
+  QUANTIZEDOWNANDSHRINKRANGE_32TO8,
+  QUANTIZEDOWNANDSHRINKRANGE_32TO8_REF,
+  QUANTIZEDRELU_8,
+  QUANTIZEDRELU_8_REF,
+  QUANTIZEDRELUX_8,
+  QUANTIZEDRELUX_8_REF,
+  QUANTIZEDMAXPOOL_8,
+  QUANTIZEDMAXPOOL_8_REF,
+  QUANTIZEDAVGPOOL_8,
+  QUANTIZEDAVGPOOL_8_REF,
+  QUANTIZEDCONCAT_8,
+  QUANTIZEDCONCAT_8_REF,
+  QUANTIZEDBIASADD_8P8TO32,
+  QUANTIZEDBIASADD_8P8TO32_REF,
+  MIN_F,
+  MIN_F_REF,
+  MAX_F,
+  MAX_F_REF,
+  QUANTIZE,
+  QUANTIZE_REF,
+  DEQUANTIZE,
+  DEQUANTIZE_REF,
+  SUPERNODE_8X8P8TO8,
+  SUPERNODE_8X8P8TO8_REF,
+
+  QUANTIZEDFLATTEN,
+  SOFTMAX_F,
+  CONV2D_F,
+  MATMUL_F,
+  RELU_F,
+  RELUX_F,
+  AVGPOOL_F,
+  MAXPOOL_F,
+  CONCAT_F,
+  BIASADD_F,
+  LRN_F,
+
+  VARIABLE,
+  ASSIGN,
+  RESHAPE,
+  QUANTIZED_RESHAPE,
+  TANH_F,
+  SIGMOID_F,
+  SLICE_8,
+  SLICE_F,
+  QUANTIZED_SLICE_8,
+  ADD_F,
+  MUL_F,
+  MINIMUM_F,
+  MAXIMAM_F,
+
+  REQUANTIZE_32_TO_8,
+  REQUANTIZE_32_TO_8_REF,
+  REQUANTIZATION_RANGE_32,
+  REQUANTIZATION_RANGE_32_REF,
+
+  NEG_F,
+  SUB_F,
+  ADD_N_F,
+  RANGE_INT32,
+  RANK_INT32,
+  TRANSPOSE_INT32,
+  TRANSPOSE_F,
+  INSTANCE_NORM_F,
+  QUANTIZED_INSTANCENORM_8,
+  QUANTIZED_INSTANCENORM_8_REF,
+  SUB_INT32,
+  ADD_INT32,
+  SPLIT_F,
+  DEQUANTIZE_QINT32_F,
+  PRELU_F,
+  QUANTIZED_PRELU_8,
+  SUM_F,
+  PROD_F,
+  MUL_INT32,
+  LOGICAL_AND_INT32,
+  LOGICALOR_INT32,
+  LOGICAL_XOR_INT32,
+  SPAPE_INT32,
+  PACK_INT32,
+  MIRROR_PAD_F,
+  RESIZE_NEAREST_NEIGHBOR_F,
+  STRIDED_SLICE_INT32,
+  STRIDED_SLICE_F,
+  EXPAND_DIMS_INT32,
+  EXPAND_DIMS_F,
+
+  LOG_SOFTMAX_F,
+  SPLIT_INT32,
+  QUANTIZED_SPLIT_8,
+
+  DECONV_F,
+  QUANTIZED_DECONV_8X8TO32,
+  QUANTIZED_DECONV_8X8TO32_REF,
+
+  QUANTIZED_MUL_8x8to32,
+  QUANTIZED_MUL_8x8to32_REF,
+  QUANTIZED_ADD_8p8to32,
+  QUANTIZED_ADD_8p8to32_REF,
+  QUANTIZED_SIGMOID_8,
+  QUANTIZED_SIGMOID_8_REF,
+  QUANTIZED_TANH_8,
+  QUANTIZED_TANH_8_REF,
+  QUANTIZED_SOFTMAX_8,
+  QUANTIZED_SOFTMAX_8_REF,
+  QUANTIZED_LRN_8,
+  QUANTIZED_LRN_8_REF,
+  QUANTIZED_PAD2D_FRAME_8P,
+  QUANTIZED_PAD2D_FRAME_8P_REF,
+  QUANTIZED_SUB_8P8TO32,
+  QUANTIZED_SUB_8P8TO32_REF,
+  QUANTIZED_MAXIMUM_8,
+  QUANTIZED_MAXIMUM_8_REF,
+  QUANTIZED_MINIMUM_8,
+  QUANTIZED_MINIMUM_8_REF,
+
+  PAD_F,
+  SPACE_TO_BATCH_ND_F,
+  BATCH_TO_SPACE_ND_F,
+  RESIZE_BILINEAR_F,
+  CONCAT_V2_F,
+
+#else
   // With Reference
   QUANTIZEDCONV2D_8X8TO32,
   QUANTIZEDCONV2D_8X8TO32_REF,
@@ -145,43 +279,92 @@ enum class SupportedOpType {
   DECONV_F,
   QUANTIZED_DECONV_8X8TO32,
   QUANTIZED_DECONV_8X8TO32_REF,
+#endif
 
   SUPPORTED_OP_TYPE_COUNT  // TERMINATOR. DO NOT REMOVE
 };
 
-const std::unordered_map<string, SupportedOpType> OP_NAME_TO_SOC_OP_TYPE_MAP{
-    // Custom Op name
-    {"INPUT", SupportedOpType::INPUT},
-    {"OUTPUT", SupportedOpType::OUTPUT},
-    {"NoOp", SupportedOpType::NOP},
-    {IGraphTransferOpsDefinitions::FLATTEN_OP_NAME, SupportedOpType::FLATTEN},
-    // Tensorflow op name
-    {"Const", SupportedOpType::OP_CONST},
-    {"QuantizedConv2D", SupportedOpType::QUANTIZEDCONV2D_8X8TO32},
-    {"QuantizedMatMul", SupportedOpType::QUANTIZEDMATMUL_8X8TO32},
-    {"QuantizeDownAndShrinkRange",
-     SupportedOpType::QUANTIZEDOWNANDSHRINKRANGE_32TO8},
-    {"QuantizedRelu", SupportedOpType::QUANTIZEDRELU_8},
-    {"QuantizedReluX", SupportedOpType::QUANTIZEDRELUX_8},
-    {"QuantizedMaxPool", SupportedOpType::QUANTIZEDMAXPOOL_8},
-    {"QuantizedAvgPool", SupportedOpType::QUANTIZEDAVGPOOL_8},
-    {"QuantizedConcat", SupportedOpType::QUANTIZEDCONCAT_8},
-    {"QuantizedBiasAdd", SupportedOpType::QUANTIZEDBIASADD_8P8TO32},
-    {"Min", SupportedOpType::MIN_F},
-    {"Max", SupportedOpType::MAX_F},
-    {"QuantizeV2", SupportedOpType::QUANTIZE},
-    {"Dequantize", SupportedOpType::DEQUANTIZE},
-    {"Softmax", SupportedOpType::SOFTMAX_F},
-    {"Placeholder", SupportedOpType::NOP},
-    {"RequantizationRange", SupportedOpType::REQUANTIZATION_RANGE_32},
-    {"Requantize", SupportedOpType::REQUANTIZE_32_TO_8},
-    {"QuantizedReshape", SupportedOpType::QUANTIZED_RESHAPE},
-    {"Add", SupportedOpType::ADD_F},
-    {"Sub", SupportedOpType::SUB_F},
-    {"Reshape", SupportedOpType::RESHAPE},
-    {"Identity", SupportedOpType::NOP},
+/* static */ void HexagonOpsDefinitions::EmplaceOpType(
+    const string& op_type, const DataTypeVector& dt_vec,
+    const SupportedOpType supported_op_type,
+    std::unordered_map<string, std::vector<DataTypeToOp>>* map) {
+  if (map->count(op_type) <= 0) {
+    map->emplace(op_type, std::vector<DataTypeToOp>());
+  }
+  map->at(op_type).emplace_back(
+      std::forward_as_tuple(dt_vec, supported_op_type));
+}
+
+/* static */ std::unordered_map<
+    string, std::vector<HexagonOpsDefinitions::DataTypeToOp>>
+HexagonOpsDefinitions::BuildOpNameToSocOpTypeMap() {
+  std::unordered_map<string, std::vector<DataTypeToOp>> op_map;
+  // Custom Op name
+  EmplaceOpType("INPUT", {}, SupportedOpType::INPUT, &op_map);
+  EmplaceOpType("OUTPUT", {}, SupportedOpType::OUTPUT, &op_map);
+  EmplaceOpType("NoOp", {}, SupportedOpType::NOP, &op_map);
+  EmplaceOpType(IGraphTransferOpsDefinitions::FLATTEN_OP_NAME, {},
+                SupportedOpType::FLATTEN, &op_map);
+  // Tensorflow op name
+  // CAVEAT: Keep order of SupportedOpType
+  EmplaceOpType("Identity", {}, SupportedOpType::NOP, &op_map);
+  EmplaceOpType("Placeholder", {}, SupportedOpType::NOP, &op_map);
+  EmplaceOpType("Const", {}, SupportedOpType::OP_CONST, &op_map);
+  EmplaceOpType("QuantizedConv2D", {}, SupportedOpType::QUANTIZEDCONV2D_8X8TO32,
+                &op_map);
+  EmplaceOpType("QuantizedMatMul", {}, SupportedOpType::QUANTIZEDMATMUL_8X8TO32,
+                &op_map);
+  EmplaceOpType("QuantizeDownAndShrinkRange", {},
+                SupportedOpType::QUANTIZEDOWNANDSHRINKRANGE_32TO8, &op_map);
+  EmplaceOpType("QuantizedRelu", {}, SupportedOpType::QUANTIZEDRELU_8, &op_map);
+  EmplaceOpType("QuantizedReluX", {}, SupportedOpType::QUANTIZEDRELUX_8,
+                &op_map);
+  EmplaceOpType("QuantizedMaxPool", {}, SupportedOpType::QUANTIZEDMAXPOOL_8,
+                &op_map);
+  EmplaceOpType("QuantizedAvgPool", {}, SupportedOpType::QUANTIZEDAVGPOOL_8,
+                &op_map);
+  EmplaceOpType("QuantizedConcat", {}, SupportedOpType::QUANTIZEDCONCAT_8,
+                &op_map);
+  EmplaceOpType("QuantizedBiasAdd", {},
+                SupportedOpType::QUANTIZEDBIASADD_8P8TO32, &op_map);
+  EmplaceOpType("Min", {}, SupportedOpType::MIN_F, &op_map);
+  EmplaceOpType("Max", {}, SupportedOpType::MAX_F, &op_map);
+  EmplaceOpType("QuantizeV2", {}, SupportedOpType::QUANTIZE, &op_map);
+  EmplaceOpType("Dequantize", {}, SupportedOpType::DEQUANTIZE, &op_map);
+  EmplaceOpType("Softmax", {}, SupportedOpType::SOFTMAX_F, &op_map);
+  EmplaceOpType("Reshape", {}, SupportedOpType::RESHAPE, &op_map);
+  EmplaceOpType("QuantizedReshape", {}, SupportedOpType::QUANTIZED_RESHAPE,
+                &op_map);
+  EmplaceOpType("Sigmoid", {}, SupportedOpType::SIGMOID_F, &op_map);
+  EmplaceOpType("Slice", {}, SupportedOpType::SLICE_F, &op_map);
+  EmplaceOpType("Add", {}, SupportedOpType::ADD_F, &op_map);
+  EmplaceOpType("Mul", {}, SupportedOpType::MUL_F, &op_map);
+  EmplaceOpType("Requantize", {}, SupportedOpType::REQUANTIZE_32_TO_8, &op_map);
+  EmplaceOpType("RequantizationRange", {},
+                SupportedOpType::REQUANTIZATION_RANGE_32, &op_map);
+  EmplaceOpType("Sub", {}, SupportedOpType::SUB_F, &op_map);
+  EmplaceOpType("Pack", {}, SupportedOpType::PACK_INT32, &op_map);
+  EmplaceOpType("StridedSlice", {}, SupportedOpType::STRIDED_SLICE_F, &op_map);
+  EmplaceOpType("ExpandDims", {}, SupportedOpType::EXPAND_DIMS_F, &op_map);
+#ifdef ENABLE_EXPERIMENTAL_HEXNN_OPS
+  EmplaceOpType("QuantizedMul", {}, SupportedOpType::QUANTIZED_MUL_8x8to32,
+                &op_map);
+  EmplaceOpType("Pad", {}, SupportedOpType::PAD_F, &op_map);
+  EmplaceOpType("SpaceToBatchND", {}, SupportedOpType::SPACE_TO_BATCH_ND_F,
+                &op_map),
+      EmplaceOpType("BatchToSpaceND", {}, SupportedOpType::BATCH_TO_SPACE_ND_F,
+                    &op_map);
+  EmplaceOpType("ResizeBilinear", {}, SupportedOpType::RESIZE_BILINEAR_F,
+                &op_map);
+  EmplaceOpType("ConcatV2", {}, SupportedOpType::CONCAT_V2_F, &op_map);
+  EmplaceOpType("Conv2DBackpropInput", {}, SupportedOpType::DECONV_F, &op_map);
+#endif
+  return op_map;
 };
 
+HexagonOpsDefinitions::HexagonOpsDefinitions()
+    : op_name_to_soc_op_type_map_(BuildOpNameToSocOpTypeMap()) {}
+
 /* static */ const IGraphTransferOpsDefinitions&
 HexagonOpsDefinitions::getInstance() {
   const static HexagonOpsDefinitions instance{};
@@ -192,9 +375,21 @@ int HexagonOpsDefinitions::GetTotalOpsCount() const {
   return static_cast<int>(SupportedOpType::SUPPORTED_OP_TYPE_COUNT);
 }
 
-int HexagonOpsDefinitions::GetOpIdFor(const string& op_type) const {
-  if (OP_NAME_TO_SOC_OP_TYPE_MAP.count(op_type) > 0) {
-    return static_cast<int>(OP_NAME_TO_SOC_OP_TYPE_MAP.at(op_type));
+int HexagonOpsDefinitions::GetOpIdFor(const string& op_type,
+                                      const DataTypeVector& dt_vec) const {
+  if (op_name_to_soc_op_type_map_.count(op_type) > 0) {
+    const std::vector<DataTypeToOp>& dt_to_op_vec =
+        op_name_to_soc_op_type_map_.at(op_type);
+    CHECK(!dt_to_op_vec.empty());
+    // If argument DataType is empty, return the first entry.
+    if (dt_vec.empty()) {
+      return static_cast<int>(std::get<1>(dt_to_op_vec.front()));
+    }
+    for (const DataTypeToOp& data_type_to_op : dt_to_op_vec) {
+      if (std::get<0>(data_type_to_op) == dt_vec) {
+        return static_cast<int>(std::get<1>(data_type_to_op));
+      }
+    }
   }
   return IGraphTransferOpsDefinitions::INVALID_OP_ID;
 }
diff --git a/tensorflow/core/kernels/hexagon/hexagon_ops_definitions.h b/tensorflow/core/kernels/hexagon/hexagon_ops_definitions.h
index eca4e16f68d4c2eef216e540e349edf88105ed2e..bd1120e1df64ca72b2a3a95d7af91fabf693af98 100644
--- a/tensorflow/core/kernels/hexagon/hexagon_ops_definitions.h
+++ b/tensorflow/core/kernels/hexagon/hexagon_ops_definitions.h
@@ -16,6 +16,8 @@ limitations under the License.
 #ifndef THIRD_PARTY_TENSORFLOW_CORE_KERNELS_HEXAGON_HEXAGON_OPS_DEFINITIONS_H_
 #define THIRD_PARTY_TENSORFLOW_CORE_KERNELS_HEXAGON_HEXAGON_OPS_DEFINITIONS_H_
 
+#include <unordered_map>
+
 #include "i_graph_transfer_ops_definitions.h"
 #include "tensorflow/core/framework/types.h"
 #include "tensorflow/core/platform/macros.h"
@@ -29,11 +31,25 @@ class HexagonOpsDefinitions final : public IGraphTransferOpsDefinitions {
   static const IGraphTransferOpsDefinitions& getInstance();
 
   int GetTotalOpsCount() const final;
-  int GetOpIdFor(const string& op_type) const final;
+  int GetOpIdFor(const string& op_type, const DataTypeVector& dt) const final;
   GraphTransferInfo::Destination GetTransferDestination() const final;
 
  private:
-  HexagonOpsDefinitions() = default;
+  enum class SupportedOpType;
+  using DataTypeToOp = std::tuple<DataTypeVector, SupportedOpType>;
+
+  HexagonOpsDefinitions();
+
+  static void EmplaceOpType(
+      const string& op_type, const DataTypeVector& dt_vec,
+      const SupportedOpType supported_op_type,
+      std::unordered_map<string, std::vector<DataTypeToOp>>* map);
+
+  static std::unordered_map<string, std::vector<DataTypeToOp>>
+  BuildOpNameToSocOpTypeMap();
+
+  const std::unordered_map<string, std::vector<DataTypeToOp>>
+      op_name_to_soc_op_type_map_;
 
   TF_DISALLOW_COPY_AND_ASSIGN(HexagonOpsDefinitions);
 };
diff --git a/tensorflow/core/kernels/hexagon/i_graph_transfer_ops_definitions.h b/tensorflow/core/kernels/hexagon/i_graph_transfer_ops_definitions.h
index 031b2e31cc8d24c717c794bd92fd5d3ebbd5f194..3d6f493a9c163f326d094bd05071a94965990cbe 100644
--- a/tensorflow/core/kernels/hexagon/i_graph_transfer_ops_definitions.h
+++ b/tensorflow/core/kernels/hexagon/i_graph_transfer_ops_definitions.h
@@ -37,7 +37,8 @@ class IGraphTransferOpsDefinitions {
   // Return total ops count supported by SOC
   virtual int GetTotalOpsCount() const = 0;
   // Return op id for given string op name
-  virtual int GetOpIdFor(const string& op_name) const = 0;
+  virtual int GetOpIdFor(const string& op_name,
+                         const DataTypeVector& dt) const = 0;
   // Return destination of transfer
   virtual GraphTransferInfo::Destination GetTransferDestination() const = 0;
 
diff --git a/tensorflow/core/kernels/linalg_ops_common.cc b/tensorflow/core/kernels/linalg_ops_common.cc
index 2a27476b09ed762df6333c392c502fe2940044c6..36907fb5716fcde3b0efc28cc4edca543432c8f4 100644
--- a/tensorflow/core/kernels/linalg_ops_common.cc
+++ b/tensorflow/core/kernels/linalg_ops_common.cc
@@ -97,10 +97,6 @@ void LinearAlgebraOp<Scalar>::Compute(OpKernelContext* context) {
   PrepareOutputs(context, input_matrix_shapes, batch_shape, &outputs,
                  &output_matrix_shapes);
 
-  // Perform batch-wide pre-computions, if any.
-  BatchPreCompute(context, inputs, input_matrix_shapes, outputs,
-                  output_matrix_shapes);
-
   // Process the individual matrix problems in parallel using a threadpool.
   auto shard = [this, &inputs, &input_matrix_shapes, &outputs,
                 &output_matrix_shapes, context](int64 begin, int64 end) {
@@ -113,9 +109,6 @@ void LinearAlgebraOp<Scalar>::Compute(OpKernelContext* context) {
   Shard(worker_threads.num_threads, worker_threads.workers,
         batch_shape.num_elements(), GetCostPerUnit(input_matrix_shapes), shard);
 
-  // Perform batch-wide post-computions, if any.
-  BatchPostCompute(context, inputs, input_matrix_shapes, outputs,
-                   output_matrix_shapes);
 }
 
 template <typename Scalar>
@@ -155,7 +148,8 @@ void LinearAlgebraOp<Scalar>::AnalyzeInputs(OpKernelContext* context,
     const int col_dimension = input_rank - 1;
     const int64 num_rows = in.dim_size(row_dimension);
     const int64 num_cols = in.dim_size(col_dimension);
-    input_matrix_shapes->emplace_back(std::initializer_list<int64>({num_rows, num_cols}));
+    input_matrix_shapes->emplace_back(
+        std::initializer_list<int64>({num_rows, num_cols}));
     inputs->emplace_back(&in);
   }
   // Have the derived class validate that the inputs are as expected.
@@ -233,8 +227,7 @@ void LinearAlgebraOp<Scalar>::ComputeTensorSlice(
     matrix_inputs.emplace_back(
         inputs[i]->flat<Scalar>().data() +
             matrix_index * input_matrix_shapes[i].num_elements(),
-        input_matrix_shapes[i].dim_size(0),
-        input_matrix_shapes[i].dim_size(1));
+        input_matrix_shapes[i].dim_size(0), input_matrix_shapes[i].dim_size(1));
   }
 
   MatrixMaps matrix_outputs;
diff --git a/tensorflow/core/kernels/linalg_ops_common.h b/tensorflow/core/kernels/linalg_ops_common.h
index 75d1d2979347aabe2ae035a249097c77c0a10ae4..1d31786728f5c4aac023d7c4ef1e347577267110 100644
--- a/tensorflow/core/kernels/linalg_ops_common.h
+++ b/tensorflow/core/kernels/linalg_ops_common.h
@@ -127,26 +127,9 @@ class LinearAlgebraOp : public OpKernel {
                              const ConstMatrixMaps& inputs,
                              MatrixMaps* outputs) = 0;
 
+ private:
   using TensorInputs = gtl::InlinedVector<const Tensor*, 4>;
   using TensorOutputs = gtl::InlinedVector<Tensor*, 4>;
-
-  // Hook for doing batch-wide processing before ComputeMatrix is called
-  // on each individual slice.
-  virtual void BatchPreCompute(OpKernelContext* context,
-                               const TensorInputs& inputs,
-                               const TensorShapes& input_matrix_shapes,
-                               const TensorOutputs& outputs,
-                               const TensorShapes& output_matrix_shapes) {}
-
-  // Hook for doing batch-wide processing after ComputeMatrix is called
-  // on each individual slice.
-  virtual void BatchPostCompute(OpKernelContext* context,
-                                const TensorInputs& inputs,
-                                const TensorShapes& input_matrix_shapes,
-                                const TensorOutputs& outputs,
-                                const TensorShapes& output_matrix_shapes) {}
-
- private:
   // This function maps 2-d slices (matrices) of the input and output tensors
   // using Eigen::Map and calls ComputeMatrix implemented in terms of the
   // Eigen::MatrixBase API by the derived class.
@@ -180,8 +163,8 @@ class LinearAlgebraOp : public OpKernel {
                       TensorShapes* output_matrix_shapes);
 };
 
-// Declare that LinearAlgebraOp is explicitly instantiated in
-// linalg_ops_common.cc for float and double.
+// Declare LinearAlgebraOp, which is explicitly instantiated in
+// linalg_ops_common.cc for float, double, complex64, and complex128.
 extern template class LinearAlgebraOp<float>;
 extern template class LinearAlgebraOp<double>;
 extern template class LinearAlgebraOp<complex64>;
@@ -196,9 +179,7 @@ extern template class LinearAlgebraOp<complex128>;
   using MatrixMaps = typename Base::MatrixMaps;           \
   using ConstMatrixMap = typename Base::ConstMatrixMap;   \
   using ConstMatrixMaps = typename Base::ConstMatrixMaps; \
-  using TensorShapes = typename Base::TensorShapes;       \
-  using TensorInputs = typename Base::TensorInputs;       \
-  using TensorOutputs = typename Base::TensorOutputs
+  using TensorShapes = typename Base::TensorShapes;
 
 #define REGISTER_LINALG_OP_CPU(OpName, OpClass, Scalar) \
   REGISTER_KERNEL_BUILDER(                              \
diff --git a/tensorflow/core/kernels/lookup_table_init_op.cc b/tensorflow/core/kernels/lookup_table_init_op.cc
index bde1d0360a92859d1d7d9843d60e81f4ee4e7368..ada6fe8d95045040191d48808f6ff11ad2435322 100644
--- a/tensorflow/core/kernels/lookup_table_init_op.cc
+++ b/tensorflow/core/kernels/lookup_table_init_op.cc
@@ -367,7 +367,9 @@ class InitializeTableOp : public OpKernel {
                    GetInitializableLookupTable("table_handle", ctx, &table));
     core::ScopedUnref unref_me(table);
 
-    DataTypeVector expected_inputs = {DT_STRING_REF, table->key_dtype(),
+    DataType expected_input_0 =
+        (ctx->input_dtype(0) == DT_RESOURCE) ? DT_RESOURCE : DT_STRING_REF;
+    DataTypeVector expected_inputs = {expected_input_0, table->key_dtype(),
                                       table->value_dtype()};
     DataTypeVector expected_outputs = {};
     OP_REQUIRES_OK(ctx, ctx->MatchSignature(expected_inputs, expected_outputs));
@@ -408,6 +410,8 @@ class InitializeTableOp : public OpKernel {
 
 REGISTER_KERNEL_BUILDER(Name("InitializeTable").Device(DEVICE_CPU),
                         InitializeTableOp);
+REGISTER_KERNEL_BUILDER(Name("InitializeTableV2").Device(DEVICE_CPU),
+                        InitializeTableOp);
 
 // Kernel to initialize a lookup table from a text file.
 //
@@ -433,7 +437,9 @@ class InitializeTableFromTextFileOp : public OpKernel {
                    GetInitializableLookupTable("table_handle", ctx, &table));
     core::ScopedUnref unref_me(table);
 
-    DataTypeVector expected_inputs = {DT_STRING_REF, DT_STRING};
+    DataType expected_input_0 =
+        (ctx->input_dtype(0) == DT_RESOURCE) ? DT_RESOURCE : DT_STRING_REF;
+    DataTypeVector expected_inputs = {expected_input_0, DT_STRING};
     DataTypeVector expected_outputs = {};
     OP_REQUIRES_OK(ctx, ctx->MatchSignature(expected_inputs, expected_outputs));
 
@@ -472,5 +478,8 @@ class InitializeTableFromTextFileOp : public OpKernel {
 
 REGISTER_KERNEL_BUILDER(Name("InitializeTableFromTextFile").Device(DEVICE_CPU),
                         InitializeTableFromTextFileOp);
+REGISTER_KERNEL_BUILDER(
+    Name("InitializeTableFromTextFileV2").Device(DEVICE_CPU),
+    InitializeTableFromTextFileOp);
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/lookup_table_op.cc b/tensorflow/core/kernels/lookup_table_op.cc
index 0a065e37d76b9e1faa0f2e8a1910b2f019226f23..11ce2a71dcb5f60f2c5274120cacb186f2076424 100644
--- a/tensorflow/core/kernels/lookup_table_op.cc
+++ b/tensorflow/core/kernels/lookup_table_op.cc
@@ -624,7 +624,10 @@ class LookupTableFindOp : public OpKernel {
     OP_REQUIRES_OK(ctx, GetLookupTable("table_handle", ctx, &table));
     core::ScopedUnref unref_me(table);
 
-    DataTypeVector expected_inputs = {DT_STRING_REF, table->key_dtype(),
+    // Input 0 could be a STRING_REF or a RESOURCE
+    DataType expected_input_0 =
+        (ctx->input_dtype(0) == DT_RESOURCE) ? DT_RESOURCE : DT_STRING_REF;
+    DataTypeVector expected_inputs = {expected_input_0, table->key_dtype(),
                                       table->value_dtype()};
     DataTypeVector expected_outputs = {table->value_dtype()};
     OP_REQUIRES_OK(ctx, ctx->MatchSignature(expected_inputs, expected_outputs));
@@ -647,6 +650,8 @@ class LookupTableFindOp : public OpKernel {
 
 REGISTER_KERNEL_BUILDER(Name("LookupTableFind").Device(DEVICE_CPU),
                         LookupTableFindOp);
+REGISTER_KERNEL_BUILDER(Name("LookupTableFindV2").Device(DEVICE_CPU),
+                        LookupTableFindOp);
 
 // Table insert op.
 class LookupTableInsertOp : public OpKernel {
@@ -658,7 +663,9 @@ class LookupTableInsertOp : public OpKernel {
     OP_REQUIRES_OK(ctx, GetLookupTable("table_handle", ctx, &table));
     core::ScopedUnref unref_me(table);
 
-    DataTypeVector expected_inputs = {DT_STRING_REF, table->key_dtype(),
+    DataType expected_input_0 =
+        (ctx->input_dtype(0) == DT_RESOURCE) ? DT_RESOURCE : DT_STRING_REF;
+    DataTypeVector expected_inputs = {expected_input_0, table->key_dtype(),
                                       table->value_dtype()};
     OP_REQUIRES_OK(ctx, ctx->MatchSignature(expected_inputs, {}));
 
@@ -680,6 +687,8 @@ class LookupTableInsertOp : public OpKernel {
 
 REGISTER_KERNEL_BUILDER(Name("LookupTableInsert").Device(DEVICE_CPU),
                         LookupTableInsertOp);
+REGISTER_KERNEL_BUILDER(Name("LookupTableInsertV2").Device(DEVICE_CPU),
+                        LookupTableInsertOp);
 
 // Op that returns the size of the given table.
 class LookupTableSizeOp : public OpKernel {
@@ -699,6 +708,8 @@ class LookupTableSizeOp : public OpKernel {
 
 REGISTER_KERNEL_BUILDER(Name("LookupTableSize").Device(DEVICE_CPU),
                         LookupTableSizeOp);
+REGISTER_KERNEL_BUILDER(Name("LookupTableSizeV2").Device(DEVICE_CPU),
+                        LookupTableSizeOp);
 
 // Op that outputs tensors of all keys and all values.
 class LookupTableExportOp : public OpKernel {
@@ -716,6 +727,8 @@ class LookupTableExportOp : public OpKernel {
 
 REGISTER_KERNEL_BUILDER(Name("LookupTableExport").Device(DEVICE_CPU),
                         LookupTableExportOp);
+REGISTER_KERNEL_BUILDER(Name("LookupTableExportV2").Device(DEVICE_CPU),
+                        LookupTableExportOp);
 
 // Clear the table and insert data.
 class LookupTableImportOp : public OpKernel {
@@ -727,7 +740,9 @@ class LookupTableImportOp : public OpKernel {
     OP_REQUIRES_OK(ctx, GetLookupTable("table_handle", ctx, &table));
     core::ScopedUnref unref_me(table);
 
-    DataTypeVector expected_inputs = {DT_STRING_REF, table->key_dtype(),
+    DataType expected_input_0 =
+        (ctx->input_dtype(0) == DT_RESOURCE) ? DT_RESOURCE : DT_STRING_REF;
+    DataTypeVector expected_inputs = {expected_input_0, table->key_dtype(),
                                       table->value_dtype()};
     OP_REQUIRES_OK(ctx, ctx->MatchSignature(expected_inputs, {}));
 
@@ -749,6 +764,8 @@ class LookupTableImportOp : public OpKernel {
 
 REGISTER_KERNEL_BUILDER(Name("LookupTableImport").Device(DEVICE_CPU),
                         LookupTableImportOp);
+REGISTER_KERNEL_BUILDER(Name("LookupTableImportV2").Device(DEVICE_CPU),
+                        LookupTableImportOp);
 
 // Register the HashTable op with the currently supported key and value types.
 #define REGISTER_KERNEL(key_dtype, value_dtype)                           \
@@ -757,6 +774,13 @@ REGISTER_KERNEL_BUILDER(Name("LookupTableImport").Device(DEVICE_CPU),
           .Device(DEVICE_CPU)                                             \
           .TypeConstraint<key_dtype>("key_dtype")                         \
           .TypeConstraint<value_dtype>("value_dtype"),                    \
+      LookupTableOp<lookup::HashTable<key_dtype, value_dtype>, key_dtype, \
+                    value_dtype>)                                         \
+  REGISTER_KERNEL_BUILDER(                                                \
+      Name("HashTableV2")                                                 \
+          .Device(DEVICE_CPU)                                             \
+          .TypeConstraint<key_dtype>("key_dtype")                         \
+          .TypeConstraint<value_dtype>("value_dtype"),                    \
       LookupTableOp<lookup::HashTable<key_dtype, value_dtype>, key_dtype, \
                     value_dtype>)
 
@@ -778,6 +802,13 @@ REGISTER_KERNEL(string, bool);
           .Device(DEVICE_CPU)                                                  \
           .TypeConstraint<key_dtype>("key_dtype")                              \
           .TypeConstraint<value_dtype>("value_dtype"),                         \
+      LookupTableOp<lookup::MutableHashTableOfScalars<key_dtype, value_dtype>, \
+                    key_dtype, value_dtype>)                                   \
+  REGISTER_KERNEL_BUILDER(                                                     \
+      Name("MutableHashTableV2")                                               \
+          .Device(DEVICE_CPU)                                                  \
+          .TypeConstraint<key_dtype>("key_dtype")                              \
+          .TypeConstraint<value_dtype>("value_dtype"),                         \
       LookupTableOp<lookup::MutableHashTableOfScalars<key_dtype, value_dtype>, \
                     key_dtype, value_dtype>)
 
@@ -796,6 +827,13 @@ REGISTER_KERNEL(int64, float);
           .Device(DEVICE_CPU)                                                  \
           .TypeConstraint<key_dtype>("key_dtype")                              \
           .TypeConstraint<value_dtype>("value_dtype"),                         \
+      LookupTableOp<lookup::MutableHashTableOfTensors<key_dtype, value_dtype>, \
+                    key_dtype, value_dtype>)                                   \
+  REGISTER_KERNEL_BUILDER(                                                     \
+      Name("MutableHashTableOfTensorsV2")                                      \
+          .Device(DEVICE_CPU)                                                  \
+          .TypeConstraint<key_dtype>("key_dtype")                              \
+          .TypeConstraint<value_dtype>("value_dtype"),                         \
       LookupTableOp<lookup::MutableHashTableOfTensors<key_dtype, value_dtype>, \
                     key_dtype, value_dtype>)
 
@@ -813,6 +851,13 @@ REGISTER_KERNEL(string, bool);
           .Device(DEVICE_CPU)                                              \
           .TypeConstraint<key_dtype>("key_dtype")                          \
           .TypeConstraint<value_dtype>("value_dtype"),                     \
+      LookupTableOp<lookup::MutableDenseHashTable<key_dtype, value_dtype>, \
+                    key_dtype, value_dtype>)                               \
+  REGISTER_KERNEL_BUILDER(                                                 \
+      Name("MutableDenseHashTableV2")                                      \
+          .Device(DEVICE_CPU)                                              \
+          .TypeConstraint<key_dtype>("key_dtype")                          \
+          .TypeConstraint<value_dtype>("value_dtype"),                     \
       LookupTableOp<lookup::MutableDenseHashTable<key_dtype, value_dtype>, \
                     key_dtype, value_dtype>)
 
diff --git a/tensorflow/core/kernels/lookup_table_op.h b/tensorflow/core/kernels/lookup_table_op.h
index ae253b4dc96114f2fb2a8240a0e96551bafda46f..ff23a09a24f3c291aaec546577ead757e3eaa422 100644
--- a/tensorflow/core/kernels/lookup_table_op.h
+++ b/tensorflow/core/kernels/lookup_table_op.h
@@ -51,40 +51,52 @@ class LookupTableOp : public OpKernel {
   // ctx is not owned by this function.
   void Compute(OpKernelContext* ctx) override {
     mutex_lock l(mu_);
+
     if (!table_handle_set_) {
       OP_REQUIRES_OK(ctx, cinfo_.Init(ctx->resource_manager(), def(),
                                       use_node_name_sharing_));
-      auto creator = [ctx, this](lookup::LookupInterface** ret) {
-        lookup::LookupInterface* container = new Container(ctx, this);
-        if (!ctx->status().ok()) {
-          container->Unref();
-          return ctx->status();
-        }
-        if (ctx->track_allocations()) {
-          ctx->record_device_persistent_memory_allocation(
-              container->MemoryUsed());
-        }
-        *ret = container;
-        return Status::OK();
-      };
-
-      lookup::LookupInterface* table = nullptr;
-      OP_REQUIRES_OK(
-          ctx, cinfo_.resource_manager()
-                   ->template LookupOrCreate<lookup::LookupInterface>(
-                       cinfo_.container(), cinfo_.name(), &table, creator));
-      core::ScopedUnref unref_me(table);
-
-      OP_REQUIRES_OK(ctx, lookup::CheckTableDataTypes(
-                              *table, DataTypeToEnum<key_dtype>::v(),
-                              DataTypeToEnum<value_dtype>::v(), cinfo_.name()));
-
-      auto h = table_handle_.AccessTensor(ctx)->template flat<string>();
-      h(0) = cinfo_.container();
-      h(1) = cinfo_.name();
-      table_handle_set_ = true;
     }
-    ctx->set_output_ref(0, &mu_, table_handle_.AccessTensor(ctx));
+
+    auto creator = [ctx, this](lookup::LookupInterface** ret) {
+      lookup::LookupInterface* container = new Container(ctx, this);
+      if (!ctx->status().ok()) {
+        container->Unref();
+        return ctx->status();
+      }
+      if (ctx->track_allocations()) {
+        ctx->record_host_persistent_memory_allocation(
+            container->MemoryUsed() + table_handle_.AllocatedBytes());
+      }
+      *ret = container;
+      return Status::OK();
+    };
+
+    lookup::LookupInterface* table = nullptr;
+    OP_REQUIRES_OK(ctx,
+                   cinfo_.resource_manager()
+                       ->template LookupOrCreate<lookup::LookupInterface>(
+                           cinfo_.container(), cinfo_.name(), &table, creator));
+    core::ScopedUnref unref_me(table);
+
+    OP_REQUIRES_OK(ctx, lookup::CheckTableDataTypes(
+                            *table, DataTypeToEnum<key_dtype>::v(),
+                            DataTypeToEnum<value_dtype>::v(), cinfo_.name()));
+
+    if (ctx->expected_output_dtype(0) == DT_RESOURCE) {
+      Tensor* handle;
+      OP_REQUIRES_OK(ctx, ctx->allocate_output(0, TensorShape({}), &handle));
+      handle->scalar<ResourceHandle>()() =
+          MakeResourceHandle<lookup::LookupInterface>(ctx, cinfo_.container(),
+                                                      cinfo_.name());
+    } else {
+      if (!table_handle_set_) {
+        auto h = table_handle_.AccessTensor(ctx)->template flat<string>();
+        h(0) = cinfo_.container();
+        h(1) = cinfo_.name();
+      }
+      ctx->set_output_ref(0, &mu_, table_handle_.AccessTensor(ctx));
+    }
+    table_handle_set_ = true;
   }
 
   ~LookupTableOp() override {
@@ -213,6 +225,15 @@ class HashTable : public InitializableLookupTable {
     return Status::OK();
   }
 
+  int64 MemoryUsed() const override {
+    if (table_) {
+      const int64 num_elements = table_->size();
+      return num_elements * (sizeof(K) + sizeof(V));
+    } else {
+      return 0;
+    }
+  }
+
  private:
   std::unique_ptr<std::unordered_map<K, V>> table_;
 };
diff --git a/tensorflow/core/kernels/lookup_util.cc b/tensorflow/core/kernels/lookup_util.cc
index f87ce0e6b206cd0066208fefd3cfa9ad18128cb6..d0f269be231500700c6084437fc7783a25d77960 100644
--- a/tensorflow/core/kernels/lookup_util.cc
+++ b/tensorflow/core/kernels/lookup_util.cc
@@ -49,26 +49,48 @@ Status GetLookupTable(const string& input_name, OpKernelContext* ctx,
                       LookupInterface** table) {
   string container;
   string table_handle;
-  TF_RETURN_IF_ERROR(
-      GetTableHandle(input_name, ctx, &container, &table_handle));
-  return ctx->resource_manager()->Lookup(container, table_handle, table);
+  DataType handle_dtype;
+  TF_RETURN_IF_ERROR(ctx->input_dtype(input_name, &handle_dtype));
+  if (handle_dtype == DT_RESOURCE) {
+    ResourceHandle handle;
+    TF_RETURN_IF_ERROR(HandleFromInput(ctx, input_name, &handle));
+    return LookupResource(ctx, handle, table);
+  } else {
+    TF_RETURN_IF_ERROR(
+        GetTableHandle(input_name, ctx, &container, &table_handle));
+    return ctx->resource_manager()->Lookup(container, table_handle, table);
+  }
 }
 
 Status GetInitializableLookupTable(const string& input_name,
                                    OpKernelContext* ctx,
                                    InitializableLookupTable** table) {
-  string container;
-  string table_handle;
-  TF_RETURN_IF_ERROR(
-      GetTableHandle(input_name, ctx, &container, &table_handle));
   LookupInterface* lookup_table;
-  TF_RETURN_IF_ERROR(
-      ctx->resource_manager()->Lookup(container, table_handle, &lookup_table));
-  *table = lookup_table->GetInitializableLookupTable();
-  if (*table == nullptr) {
-    lookup_table->Unref();
-    return errors::InvalidArgument("Table ", container, " ", table_handle,
-                                   " is not initializable");
+  DataType handle_dtype;
+  TF_RETURN_IF_ERROR(ctx->input_dtype(input_name, &handle_dtype));
+  if (handle_dtype == DT_RESOURCE) {
+    ResourceHandle handle;
+    TF_RETURN_IF_ERROR(HandleFromInput(ctx, input_name, &handle));
+    TF_RETURN_IF_ERROR(LookupResource(ctx, handle, &lookup_table));
+    *table = lookup_table->GetInitializableLookupTable();
+    if (*table == nullptr) {
+      lookup_table->Unref();
+      return errors::InvalidArgument("Table ", handle.container(), " ",
+                                     handle.name(), " is not initializable");
+    }
+  } else {
+    string container;
+    string table_handle;
+    TF_RETURN_IF_ERROR(
+        GetTableHandle(input_name, ctx, &container, &table_handle));
+    TF_RETURN_IF_ERROR(ctx->resource_manager()->Lookup(container, table_handle,
+                                                       &lookup_table));
+    *table = lookup_table->GetInitializableLookupTable();
+    if (*table == nullptr) {
+      lookup_table->Unref();
+      return errors::InvalidArgument("Table ", container, " ", table_handle,
+                                     " is not initializable");
+    }
   }
   return Status::OK();
 }
diff --git a/tensorflow/core/kernels/lrn_op.cc b/tensorflow/core/kernels/lrn_op.cc
index 3435486c9539abee6b5403253a1504438d0daff5..c905ebc84a6e9251a5e30be19b086d3fae215cad 100644
--- a/tensorflow/core/kernels/lrn_op.cc
+++ b/tensorflow/core/kernels/lrn_op.cc
@@ -79,11 +79,11 @@ struct LaunchLRN<CPUDevice, T> {
     const int rows = static_cast<int>(in.dim_size(1));
     const int cols = static_cast<int>(in.dim_size(2));
     const int depth = static_cast<int>(in.dim_size(3));
-    const int nodes = cols * rows;
 
 #if defined(IS_MOBILE_PLATFORM)
     SingleThreadedLRN(in, batch, rows, cols, depth, output);
 #else
+    const int nodes = cols * rows;
     if (depth > kSingleThreadedLRNDepthCutoff &&
         (beta_ == T(0.5) || beta_ == T(1))) {
       SingleThreadedLRN(in, batch, rows, cols, depth, output);
diff --git a/tensorflow/core/kernels/matrix_inverse_op.cc b/tensorflow/core/kernels/matrix_inverse_op.cc
index 0572d48b3ecb01360e0ca5ab71d923ad3c7a53ee..0fd3d12a5c9f766831d9b76a1c33bd7d200a8cc1 100644
--- a/tensorflow/core/kernels/matrix_inverse_op.cc
+++ b/tensorflow/core/kernels/matrix_inverse_op.cc
@@ -15,6 +15,10 @@ limitations under the License.
 
 // See docs in ../ops/linalg_ops.cc.
 
+#if GOOGLE_CUDA
+#define EIGEN_USE_GPU
+#endif
+
 #include "third_party/eigen3/Eigen/Core"
 #include "third_party/eigen3/Eigen/LU"
 #include "tensorflow/core/framework/kernel_def_builder.h"
@@ -26,22 +30,22 @@ limitations under the License.
 #include "tensorflow/core/platform/macros.h"
 #include "tensorflow/core/platform/types.h"
 
+#if GOOGLE_CUDA
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+#include "tensorflow/core/kernels/cuda_solvers.h"
+#endif
+
 namespace tensorflow {
 
 template <class Scalar>
 class MatrixInverseOp : public LinearAlgebraOp<Scalar> {
  public:
-  typedef LinearAlgebraOp<Scalar> Base;
+  INHERIT_LINALG_TYPEDEFS(Scalar);
 
   explicit MatrixInverseOp(OpKernelConstruction* context) : Base(context) {
     OP_REQUIRES_OK(context, context->GetAttr("adjoint", &adjoint_));
   }
 
-  using Matrix = typename Base::Matrix;
-  using MatrixMaps = typename Base::MatrixMaps;
-  using ConstMatrixMap = typename Base::ConstMatrixMap;
-  using ConstMatrixMaps = typename Base::ConstMatrixMaps;
-
   void ComputeMatrix(OpKernelContext* context, const ConstMatrixMaps& inputs,
                      MatrixMaps* outputs) final {
     const ConstMatrixMap& input = inputs[0];
@@ -77,6 +81,138 @@ class MatrixInverseOp : public LinearAlgebraOp<Scalar> {
   TF_DISALLOW_COPY_AND_ASSIGN(MatrixInverseOp);
 };
 
+#if GOOGLE_CUDA
+
+typedef Eigen::GpuDevice GPUDevice;
+
+template <class Scalar>
+class MatrixInverseOpGpu : public AsyncOpKernel {
+ public:
+  explicit MatrixInverseOpGpu(OpKernelConstruction* context)
+      : AsyncOpKernel(context) {
+    OP_REQUIRES_OK(context, context->GetAttr("adjoint", &adjoint_));
+  }
+
+  void ComputeAsync(OpKernelContext* context, DoneCallback done) final {
+    const Tensor& input = context->input(0);
+    const int ndims = input.dims();
+    const int64 n = input.dim_size(ndims - 1);
+    // Validate inputs.
+    OP_REQUIRES_ASYNC(
+        context, ndims >= 2,
+        errors::InvalidArgument("Input must have rank >= 2, got ", ndims),
+        done);
+    OP_REQUIRES_ASYNC(
+        context, input.dim_size(ndims - 2) == n,
+        errors::InvalidArgument("Input matrices must be squares, got",
+                                input.dim_size(ndims - 2), " != ", n),
+        done);
+
+    // Allocate output.
+    Tensor* out;
+    OP_REQUIRES_OK_ASYNC(
+        context,
+        context->forward_input_or_allocate_output({0}, 0, input.shape(), &out),
+        done);
+
+    // By definition, an empty matrix's inverse is an empty matrix.
+    if (input.NumElements() == 0) {
+      done();
+      return;
+    }
+
+    // Make a copy of the (possible adjointed) input that we will use for the
+    // factorization step.
+    Tensor input_copy;
+    OP_REQUIRES_OK_ASYNC(context,
+                         context->allocate_temp(DataTypeToEnum<Scalar>::value,
+                                                input.shape(), &input_copy),
+                         done);
+    const GPUDevice& d = context->eigen_device<GPUDevice>();
+    auto input_copy_reshaped = input_copy.template flat_inner_dims<Scalar, 3>();
+    auto input_reshaped = input.template flat_inner_dims<Scalar, 3>();
+    if (!adjoint_) {
+      d.memcpy(input_copy_reshaped.data(), input_reshaped.data(),
+               input.NumElements() * sizeof(Scalar));
+    } else {
+      functor::AdjointBatchFunctor<GPUDevice, Scalar> functor;
+      functor(d, input_reshaped, input_copy_reshaped);
+    }
+    const int64 batch_size = input_copy_reshaped.dimension(0);
+
+    // Allocate pivots on the device.
+    ScratchSpace<int> pivots(context, n * batch_size, /* on_host */ false);
+
+    // Prepare pointer arrays for cuBlas' batch interface.
+    // TODO(rmlarsen): Find a way to encode pointer arrays in pinned host memory
+    // without the ugly casting.
+    ScratchSpace<uint8> input_copy_ptrs(context, sizeof(Scalar*) * batch_size,
+                                        /* on_host */ true);
+    ScratchSpace<uint8> output_ptrs(context, sizeof(Scalar*) * batch_size,
+                                    /* on_host */ true);
+    const Scalar** input_copy_ptrs_base =
+        reinterpret_cast<const Scalar**>(input_copy_ptrs.mutable_data());
+    const Scalar** output_ptrs_base =
+        reinterpret_cast<const Scalar**>(output_ptrs.mutable_data());
+    auto output_reshaped = out->template flat_inner_dims<Scalar, 3>();
+    for (int64 i = 0; i < batch_size; ++i) {
+      input_copy_ptrs_base[i] = input_copy_reshaped.data() + i * n * n;
+      output_ptrs_base[i] = output_reshaped.data() + i * n * n;
+    }
+
+    // Launch the two solver kernels back to back without waiting.
+    // 1. Compute the partially pivoted LU factorization(s) of the
+    // matrix/matrices.
+    CudaSolver solver(context);
+    std::vector<DeviceLapackInfo> dev_info;
+    dev_info.emplace_back(context, batch_size, "getrf");
+    OP_REQUIRES_OK_ASYNC(
+        context,
+        solver.GetrfBatched(n, input_copy_ptrs_base, n, pivots.mutable_data(),
+                            &dev_info.back(), batch_size),
+        done);
+    // 2. Compute the inverse(s).
+    dev_info.emplace_back(context, batch_size, "getri");
+    OP_REQUIRES_OK_ASYNC(
+        context,
+        solver.GetriBatched(n, input_copy_ptrs_base, n, pivots.data(),
+                            output_ptrs_base, n, &dev_info.back(), batch_size),
+        done);
+
+    // Register callback to check info after kernels finish.
+    auto info_checker = [context, done](
+                            const Status& status,
+                            const std::vector<HostLapackInfo>& host_infos) {
+      if (!status.ok() && errors::IsInvalidArgument(status) &&
+          !host_infos.empty()) {
+        for (int i = 0; i < host_infos[0].size(); ++i) {
+          // Match the CPU error message for singular matrices. Otherwise
+          // just print the original error message from the call itself
+          // below.
+          OP_REQUIRES_ASYNC(context, host_infos[0].data()[i] <= 0,
+                            errors::InvalidArgument("Input is not invertible."),
+                            done);
+        }
+      }
+      OP_REQUIRES_OK_ASYNC(context, status, done);
+      done();
+    };
+
+    OP_REQUIRES_OK_ASYNC(
+        context,
+        solver.CopyLapackInfoToHostAsync(dev_info, std::move(info_checker)),
+        done);
+  }
+
+ private:
+  bool adjoint_;
+};
+
+REGISTER_LINALG_OP_GPU("MatrixInverse", (MatrixInverseOpGpu<float>), float);
+REGISTER_LINALG_OP_GPU("MatrixInverse", (MatrixInverseOpGpu<double>), double);
+
+#endif  // GOOGLE_CUDA
+
 REGISTER_LINALG_OP("MatrixInverse", (MatrixInverseOp<float>), float);
 REGISTER_LINALG_OP("MatrixInverse", (MatrixInverseOp<double>), double);
 REGISTER_LINALG_OP("BatchMatrixInverse", (MatrixInverseOp<float>), float);
diff --git a/tensorflow/core/kernels/mkl_concat_op.cc b/tensorflow/core/kernels/mkl_concat_op.cc
index 27930c44a653e735f52f38001e4c43b10b4720b0..d8a8cc74bfae08e2eeeba65e40a8e77e6ac8a1fd 100644
--- a/tensorflow/core/kernels/mkl_concat_op.cc
+++ b/tensorflow/core/kernels/mkl_concat_op.cc
@@ -265,6 +265,7 @@ class MklConcatOp : public OpKernel {
           s.GetDimension() > 0 ? s.GetSizes()[concat_dim] : 1;
     }
     mkl_context.MklCreateInputLayouts(context, input_shapes);
+    OP_REQUIRES_OK(context, context->status());
 
     CHECK_EQ(dnnConcatCreate_F32(&mkl_context.prim_concat, NULL, N,
                                  &mkl_context.lt_inputs[0]),
@@ -316,12 +317,14 @@ class MklConcatOp : public OpKernel {
 
     mkl_context.mkl_tmp_tensors.resize(N);
     mkl_context.MklPrepareConcatInputs(context, input_tensors);
+    OP_REQUIRES_OK(context, context->status());
 
     // Execute primitive.
     CHECK_EQ(dnnExecute_F32(mkl_context.prim_concat, mkl_context.concat_res),
              E_SUCCESS);
 
     mkl_context.MklCleanup();
+    OP_REQUIRES_OK(context, context->status());
   }
 
  private:
@@ -442,7 +445,7 @@ class MklConcatOp : public OpKernel {
                               .HostMemory("concat_dim")                     \
                               .Label(mkl_op_registry::kMklOpLabel),         \
                           MklConcatOp<CPUDevice, type, NAME_IS_CONCAT_DIM>) \
-  REGISTER_KERNEL_BUILDER(Name("_MklConcatV2")                               \
+  REGISTER_KERNEL_BUILDER(Name("_MklConcatV2")                              \
                               .Device(DEVICE_CPU)                           \
                               .TypeConstraint<type>("T")                    \
                               .TypeConstraint<int32>("Tidx")                \
diff --git a/tensorflow/core/kernels/mkl_conv_grad_bias_ops.cc b/tensorflow/core/kernels/mkl_conv_grad_bias_ops.cc
index 8a1006a8e958fffefbf792ad51803599ce36e336..d4364d31e41790241454050750ecb58d31a0e941 100644
--- a/tensorflow/core/kernels/mkl_conv_grad_bias_ops.cc
+++ b/tensorflow/core/kernels/mkl_conv_grad_bias_ops.cc
@@ -38,9 +38,9 @@ limitations under the License.
 #include "tensorflow/core/util/use_cudnn.h"
 #include "tensorflow/core/util/work_sharder.h"
 
+#include "tensorflow/core/util/mkl_util.h"
 #include "third_party/mkl/include/mkl_dnn.h"
 #include "third_party/mkl/include/mkl_dnn_types.h"
-#include "tensorflow/core/util/mkl_util.h"
 
 namespace tensorflow {
 
@@ -252,7 +252,7 @@ class MklConv2DCustomBackpropBiasOp : public OpKernel {
 };
 
 #define REGISTER_CPU_KERNELS(T)                                     \
-  REGISTER_KERNEL_BUILDER(Name("_MklConv2DWithBiasBackpropBias")     \
+  REGISTER_KERNEL_BUILDER(Name("_MklConv2DWithBiasBackpropBias")    \
                               .Device(DEVICE_CPU)                   \
                               .TypeConstraint<T>("T")               \
                               .Label(mkl_op_registry::kMklOpLabel), \
diff --git a/tensorflow/core/kernels/mkl_conv_grad_filter_ops.cc b/tensorflow/core/kernels/mkl_conv_grad_filter_ops.cc
index 6381b527a1bda5ab60fe89a2e7fccd404f008537..dc6b88e953a6eac204f247e8e0aa69c4a1d05314 100644
--- a/tensorflow/core/kernels/mkl_conv_grad_filter_ops.cc
+++ b/tensorflow/core/kernels/mkl_conv_grad_filter_ops.cc
@@ -37,9 +37,9 @@ limitations under the License.
 #include "tensorflow/core/util/use_cudnn.h"
 #include "tensorflow/core/util/work_sharder.h"
 
+#include "tensorflow/core/util/mkl_util.h"
 #include "third_party/mkl/include/mkl_dnn.h"
 #include "third_party/mkl/include/mkl_dnn_types.h"
-#include "tensorflow/core/util/mkl_util.h"
 
 namespace tensorflow {
 
@@ -266,8 +266,11 @@ class MklConv2DCustomBackpropFilterOp : public OpKernel {
     int input_offsets[2];
     size_t conv_strides[2];
     MklShape input_shape, grad_filter_shape, out_backprop_shape;
-    dnnPrimitive_t prim_conv_bwdfilter, convert_bwdfilter;
-    dnnLayout_t lt_input, lt_grad_filter, lt_out_backprop;
+    dnnPrimitive_t prim_conv_bwdfilter = nullptr;
+    dnnPrimitive_t convert_bwdfilter = nullptr;
+    dnnLayout_t lt_input = nullptr;
+    dnnLayout_t lt_grad_filter = nullptr;
+    dnnLayout_t lt_out_backprop = nullptr;
     void* conv_res[dnnResourceNumber];
 
     void MklCleanup() {
@@ -409,7 +412,7 @@ class MklConv2DCustomBackpropFilterOp : public OpKernel {
 };
 
 #define REGISTER_MKL_FILTER_KERNELS(T)                              \
-  REGISTER_KERNEL_BUILDER(Name("_MklConv2DBackpropFilter")           \
+  REGISTER_KERNEL_BUILDER(Name("_MklConv2DBackpropFilter")          \
                               .Device(DEVICE_CPU)                   \
                               .TypeConstraint<T>("T")               \
                               .Label(mkl_op_registry::kMklOpLabel), \
diff --git a/tensorflow/core/kernels/mkl_conv_grad_input_ops.cc b/tensorflow/core/kernels/mkl_conv_grad_input_ops.cc
index 638ce4c0243538da904f05b3d86565560c418e26..c97f1dd7b737aca4d9c5790dc73d9032fef74b78 100644
--- a/tensorflow/core/kernels/mkl_conv_grad_input_ops.cc
+++ b/tensorflow/core/kernels/mkl_conv_grad_input_ops.cc
@@ -23,8 +23,6 @@ limitations under the License.
 #define EIGEN_USE_THREADS
 #include <algorithm>
 #include <vector>
-#include "third_party/mkl/include/mkl_dnn.h"
-#include "third_party/mkl/include/mkl_dnn_types.h"
 #include "tensorflow/core/framework/numeric_op.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/register_types.h"
@@ -42,6 +40,8 @@ limitations under the License.
 #include "tensorflow/core/util/tensor_format.h"
 #include "tensorflow/core/util/use_cudnn.h"
 #include "tensorflow/core/util/work_sharder.h"
+#include "third_party/mkl/include/mkl_dnn.h"
+#include "third_party/mkl/include/mkl_dnn_types.h"
 
 namespace tensorflow {
 
@@ -342,7 +342,7 @@ class MklConv2DCustomBackpropInputOp : public OpKernel {
 };
 
 #define REGISTER_MKL_CPU_KERNELS(T)                                 \
-  REGISTER_KERNEL_BUILDER(Name("_MklConv2DBackpropInput")            \
+  REGISTER_KERNEL_BUILDER(Name("_MklConv2DBackpropInput")           \
                               .Device(DEVICE_CPU)                   \
                               .TypeConstraint<T>("T")               \
                               .Label(mkl_op_registry::kMklOpLabel), \
diff --git a/tensorflow/core/kernels/mkl_conv_ops.cc b/tensorflow/core/kernels/mkl_conv_ops.cc
index b818819b020ebafec08ee3bf595f3768f34b2347..76b9f1798ddafcde4b25d086d1445f282559a2e4 100644
--- a/tensorflow/core/kernels/mkl_conv_ops.cc
+++ b/tensorflow/core/kernels/mkl_conv_ops.cc
@@ -36,9 +36,9 @@ limitations under the License.
 #include "tensorflow/core/util/padding.h"
 #include "tensorflow/core/util/tensor_format.h"
 
+#include "tensorflow/core/util/mkl_util.h"
 #include "third_party/mkl/include/mkl_dnn.h"
 #include "third_party/mkl/include/mkl_dnn_types.h"
-#include "tensorflow/core/util/mkl_util.h"
 
 namespace tensorflow {
 
@@ -98,19 +98,18 @@ class MklConv2DOp : public OpKernel {
                                         filter.shape().DebugString()));
 
     for (int i = 0; i < 3; i++) {
-      OP_REQUIRES(
-          context,
-          FastBoundsCheck(filter.dim_size(i), std::numeric_limits<int>::max()),
-          errors::InvalidArgument("filter too large"));
+      OP_REQUIRES(context, FastBoundsCheck(filter.dim_size(i),
+                                           std::numeric_limits<int>::max()),
+                  errors::InvalidArgument("filter too large"));
     }
 
     const int64 input_depth =
         input_in_mkl_format ? GetMklTensorDim(mkl_context.input_shape, 'C')
                             : GetTensorDim(input, data_format_, 'C');
-    OP_REQUIRES(context, input_depth == filter.dim_size(2),
-                errors::InvalidArgument(
-                    "input and filter must have the same depth: ", input_depth,
-                    " vs ", filter.dim_size(2)));
+    OP_REQUIRES(
+        context, input_depth == filter.dim_size(2),
+        errors::InvalidArgument("input and filter must have the same depth: ",
+                                input_depth, " vs ", filter.dim_size(2)));
     // The last dimension for filter is out_depth.
     const int out_depth = static_cast<int>(filter.dim_size(3));
 
@@ -119,10 +118,9 @@ class MklConv2DOp : public OpKernel {
     const int64 input_rows_raw =
         input_in_mkl_format ? GetMklTensorDim(mkl_context.input_shape, 'H')
                             : GetTensorDim(input, data_format_, 'H');
-    OP_REQUIRES(
-        context,
-        FastBoundsCheck(input_rows_raw, std::numeric_limits<int>::max()),
-        errors::InvalidArgument("Input rows too large"));
+    OP_REQUIRES(context, FastBoundsCheck(input_rows_raw,
+                                         std::numeric_limits<int>::max()),
+                errors::InvalidArgument("Input rows too large"));
     const int input_rows = static_cast<int>(input_rows_raw);
     const int filter_rows = static_cast<int>(filter.dim_size(0));
 
@@ -131,10 +129,9 @@ class MklConv2DOp : public OpKernel {
     const int64 input_cols_raw =
         input_in_mkl_format ? GetMklTensorDim(mkl_context.input_shape, 'W')
                             : GetTensorDim(input, data_format_, 'W');
-    OP_REQUIRES(
-        context,
-        FastBoundsCheck(input_cols_raw, std::numeric_limits<int>::max()),
-        errors::InvalidArgument("Input cols too large"));
+    OP_REQUIRES(context, FastBoundsCheck(input_cols_raw,
+                                         std::numeric_limits<int>::max()),
+                errors::InvalidArgument("Input cols too large"));
     const int input_cols = static_cast<int>(input_cols_raw);
     const int filter_cols = static_cast<int>(filter.dim_size(1));
 
@@ -142,10 +139,9 @@ class MklConv2DOp : public OpKernel {
     const int64 input_batch_raw =
         input_in_mkl_format ? GetMklTensorDim(mkl_context.input_shape, 'N')
                             : GetTensorDim(input, data_format_, 'N');
-    OP_REQUIRES(
-        context,
-        FastBoundsCheck(input_batch_raw, std::numeric_limits<int>::max()),
-        errors::InvalidArgument("batch is too large"));
+    OP_REQUIRES(context, FastBoundsCheck(input_batch_raw,
+                                         std::numeric_limits<int>::max()),
+                errors::InvalidArgument("batch is too large"));
     const int batch = static_cast<int>(input_batch_raw);
 
     // For now we take the stride from the second and third dimensions only (we
@@ -438,12 +434,12 @@ class MklConv2DOp : public OpKernel {
 };
 
 #define REGISTER_MKL_CPU(T)                                         \
-  REGISTER_KERNEL_BUILDER(Name("_MklConv2D")                         \
+  REGISTER_KERNEL_BUILDER(Name("_MklConv2D")                        \
                               .Device(DEVICE_CPU)                   \
                               .TypeConstraint<T>("T")               \
                               .Label(mkl_op_registry::kMklOpLabel), \
                           MklConv2DOp<CPUDevice, T, false>);        \
-  REGISTER_KERNEL_BUILDER(Name("_MklConv2DWithBias")                 \
+  REGISTER_KERNEL_BUILDER(Name("_MklConv2DWithBias")                \
                               .Device(DEVICE_CPU)                   \
                               .TypeConstraint<T>("T")               \
                               .Label(mkl_op_registry::kMklOpLabel), \
diff --git a/tensorflow/core/kernels/mkl_fused_batch_norm_op.cc b/tensorflow/core/kernels/mkl_fused_batch_norm_op.cc
index 512e799d152342c6b806d14f1d1185281df21876..d53353680247bac3228629d2901b2ca8592d96d5 100644
--- a/tensorflow/core/kernels/mkl_fused_batch_norm_op.cc
+++ b/tensorflow/core/kernels/mkl_fused_batch_norm_op.cc
@@ -326,7 +326,7 @@ class MklFusedBatchNormOp : public OpKernel {
 };
 
 #define REGISTER_MKL_CPU(T)                                         \
-  REGISTER_KERNEL_BUILDER(Name("_MklFusedBatchNorm")                 \
+  REGISTER_KERNEL_BUILDER(Name("_MklFusedBatchNorm")                \
                               .Device(DEVICE_CPU)                   \
                               .TypeConstraint<T>("T")               \
                               .Label(mkl_op_registry::kMklOpLabel), \
@@ -677,7 +677,7 @@ class MklFusedBatchNormGradOp : public OpKernel {
 };
 
 #define REGISTER_MKL_CPU(T)                                         \
-  REGISTER_KERNEL_BUILDER(Name("_MklFusedBatchNormGrad")             \
+  REGISTER_KERNEL_BUILDER(Name("_MklFusedBatchNormGrad")            \
                               .Device(DEVICE_CPU)                   \
                               .TypeConstraint<T>("T")               \
                               .Label(mkl_op_registry::kMklOpLabel), \
diff --git a/tensorflow/core/kernels/mkl_identity_op.cc b/tensorflow/core/kernels/mkl_identity_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..cb7ea7e7f90546ceb23564d09c9e064b80347148
--- /dev/null
+++ b/tensorflow/core/kernels/mkl_identity_op.cc
@@ -0,0 +1,63 @@
+/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// See docs in ../ops/array_ops.cc.
+#ifdef INTEL_MKL
+
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/register_types.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/framework/tensor_shape.h"
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/platform/logging.h"
+
+#include "third_party/mkl/include/mkl_dnn.h"
+#include "third_party/mkl/include/mkl_dnn_types.h"
+#include "tensorflow/core/util/mkl_util.h"
+
+namespace tensorflow {
+typedef Eigen::ThreadPoolDevice CPUDevice;
+template <typename Device, typename T>
+class MklIdentityOp : public OpKernel {
+ public:
+  explicit MklIdentityOp(OpKernelConstruction* context) : OpKernel(context) {}
+
+  void Compute(OpKernelContext* context) override {
+    MklShape mkl_shape_input;
+    GetMklShape(context, 0, &mkl_shape_input);
+    bool input_in_mkl_format = mkl_shape_input.IsMklTensor();
+
+    if (input_in_mkl_format) {
+      ForwarMklTensorInToOut(context, 0, 0);
+    } else {
+      FowardTfTensorInToOut(context, 0, 0);
+    }
+  }
+
+  bool IsExpensive() override { return false; }
+};
+
+#define REGISTER_MKL_CPU(T)                                         \
+  REGISTER_KERNEL_BUILDER(Name("_MklIdentity")                      \
+                              .Device(DEVICE_CPU)                   \
+                              .TypeConstraint<T>("T")               \
+                              .Label(mkl_op_registry::kMklOpLabel), \
+                          MklIdentityOp<CPUDevice, T>);
+
+TF_CALL_float(REGISTER_MKL_CPU);
+#undef REGISTER_MKL_CPU
+}  // namespace tensorflow
+#endif  // INTEL_MKL
diff --git a/tensorflow/core/kernels/mkl_lrn_op.cc b/tensorflow/core/kernels/mkl_lrn_op.cc
index edca8e2553d10f2e2560b292a2d30955040f1925..070aeff49fe220881089e8f405fa8f89916c52ac 100644
--- a/tensorflow/core/kernels/mkl_lrn_op.cc
+++ b/tensorflow/core/kernels/mkl_lrn_op.cc
@@ -104,6 +104,15 @@ class MklLRNOp : public OpKernel {
       return;
     }
 
+    // TODO(inteltf) MKL will support depth radius not equal to 2 in the future
+    if (depth_radius_ != 2) {
+      Tensor converted_tensor =
+          ConvertMklToTF<T>(context, input, mkl_context.input_shape);
+      mkl_context.MklDefaultToEigen(context, depth_radius_, bias_, alpha_,
+                                    beta_, converted_tensor);
+      return;
+    }
+
     if (input_in_mkl_format) {
       // MKL supports normalization over channel dimension only
       if (mkl_context.input_shape.tf_dim_idx(mkl_context.in_dims - 1) ==
@@ -112,8 +121,10 @@ class MklLRNOp : public OpKernel {
             static_cast<dnnLayout_t>(mkl_context.input_shape.GetCurLayout());
         workspace_enabled_ = true;
       } else {
+        Tensor converted_tensor =
+            ConvertMklToTF<T>(context, input, mkl_context.input_shape);
         mkl_context.MklDefaultToEigen(context, depth_radius_, bias_, alpha_,
-                                      beta_, input);
+                                      beta_, converted_tensor);
         return;
       }
     }
@@ -160,9 +171,7 @@ class MklLRNOp : public OpKernel {
     MklShape input_shape;
     dnnPrimitive_t lrn_fwd = nullptr;
     dnnPrimitive_t convert_input = nullptr;
-    /* dnnPrimitive_t convert_output; */
     dnnLayout_t lt_input = nullptr;
-    /* dnnLayout_t lt_output; */
     dnnLayout_t lt_internal_input = nullptr;
     dnnLayout_t lt_internal_workspace = nullptr;
     dnnLayout_t lt_internal_output = nullptr;
@@ -267,7 +276,7 @@ class MklLRNOp : public OpKernel {
     }
 
     // Fallback implementation - Taken from lrn_op.cc
-    // TODO(intelft) Check if we can use EigenLRNOp directly instead of making a
+    // TODO(inteltf) Check if we can use EigenLRNOp directly instead of making a
     // copy.
     void MklDefaultToEigen(OpKernelContext* context, int depth_radius_,
                            float bias_, float alpha_, float beta_,
@@ -378,6 +387,7 @@ class MklLRNGradOp : public OpKernel {
       mkl_context.MklDefaultToEigen(context);
       return;
     }
+
     if (ingrad_in_mkl_format || inimage_in_mkl_format) {
       const MklShape* tmp_mkl_shape = (ingrad_in_mkl_format)
                                           ? &mkl_context.ingrad_shape
@@ -459,11 +469,11 @@ class MklLRNGradOp : public OpKernel {
         const_cast<void*>(static_cast<const void*>(output->flat<T>().data()));
 
     Tensor mkl_tmp_input_buf_tensor, mkl_tmp_image_buf_tensor,
-        mkl_tmp_outimage_buf_tensor, mkl_tmp_workspace_buf_tensor;
+        mkl_tmp_outimage_buf_tensor;
     // Convert Inputs if needed
-    mkl_context.MklPrepareLRNGradInput(
-        context, &mkl_tmp_input_buf_tensor, &mkl_tmp_image_buf_tensor,
-        &mkl_tmp_outimage_buf_tensor, &mkl_tmp_workspace_buf_tensor);
+    mkl_context.MklPrepareLRNGradInput(context, &mkl_tmp_input_buf_tensor,
+                                       &mkl_tmp_image_buf_tensor,
+                                       &mkl_tmp_outimage_buf_tensor);
 
     // We do not do any conversion for output. But we simply emit it
     // in MKL format.
@@ -489,14 +499,11 @@ class MklLRNGradOp : public OpKernel {
     MklShape ingrad_shape, inimage_shape, outimage_shape;
     dnnPrimitive_t lrn_bwd = nullptr;
     dnnPrimitive_t convert_input = nullptr;
-    /* dnnPrimitive_t convert_output; */
     dnnLayout_t lt_input = nullptr;
     dnnLayout_t lt_output = nullptr;
     dnnLayout_t lt_bdw_input = nullptr;
     dnnLayout_t lt_workspace = nullptr;
     dnnLayout_t lt_internal_input = nullptr;
-    /* dnnLayout_t lt_internal_workspace;
-    dnnLayout_t lt_internal_output; */
     void* res_lrn_bwd[dnnResourceNumber];
 
     // prepare mkl input
@@ -523,11 +530,13 @@ class MklLRNGradOp : public OpKernel {
     void MklPrepareLRNGradInput(OpKernelContext* context,
                                 Tensor* mkl_tmp_input_buf_tensor,
                                 Tensor* mkl_tmp_image_buf_tensor,
-                                Tensor* mkl_tmp_outimage_buf_tensor,
-                                Tensor* mkl_tmp_workspace_buf_tensor) {
+                                Tensor* mkl_tmp_outimage_buf_tensor) {
       const Tensor& in_grads = MklGetInput(context, 0);
       const Tensor& in_image = MklGetInput(context, 1);
       const Tensor& out_image = MklGetInput(context, 2);
+      const Tensor& workspace = MklGetInput(
+          context,
+          3); /*Worskpsace is enabled, get the buffer to the workspace */
 
       void* user_input = const_cast<void*>(
           static_cast<const void*>(in_grads.flat<T>().data()));
@@ -535,6 +544,9 @@ class MklLRNGradOp : public OpKernel {
           static_cast<const void*>(in_image.flat<T>().data()));
       void* user_fwd_output = const_cast<void*>(
           static_cast<const void*>(out_image.flat<T>().data()));
+      void* workspace_buffer = const_cast<void*>(
+          static_cast<const void*>(workspace.flat<T>().data()));
+
       CHECK_EQ(dnnLayoutCreateFromPrimitive_F32(&lt_workspace, lrn_bwd,
                                                 dnnResourceWorkspace),
                E_SUCCESS);
@@ -609,9 +621,7 @@ class MklLRNGradOp : public OpKernel {
         res_lrn_bwd[dnnResourceDst] = user_fwd_output;
       }
 
-      // Allocate buffer for workspace.
-      AllocTmpBuffer(context, mkl_tmp_workspace_buf_tensor, lt_workspace,
-                     &res_lrn_bwd[dnnResourceWorkspace]);
+      res_lrn_bwd[dnnResourceWorkspace] = workspace_buffer;
     }
 
     // Fallback implementation - Taken from lrn_op.cc
@@ -619,14 +629,36 @@ class MklLRNGradOp : public OpKernel {
     // copy.
     void MklDefaultToEigen(OpKernelContext* context) {
       // CHECK(false);
-      Tensor in_grads = MklGetInput(context, 0);
-      Tensor in_image = MklGetInput(context, 1);
-      Tensor out_image = MklGetInput(context, 2);
+
+      Tensor in_grads;
+      Tensor in_image;
+      Tensor out_image;
 
       GetMklShape(context, 0, &ingrad_shape);
       GetMklShape(context, 1, &inimage_shape);
       GetMklShape(context, 2, &outimage_shape);
 
+      if (ingrad_shape.IsMklTensor()) {
+        in_grads =
+            ConvertMklToTF<T>(context, MklGetInput(context, 0), ingrad_shape);
+      } else {
+        in_grads = MklGetInput(context, 0);
+      }
+
+      if (inimage_shape.IsMklTensor()) {
+        in_image =
+            ConvertMklToTF<T>(context, MklGetInput(context, 1), inimage_shape);
+      } else {
+        in_image = MklGetInput(context, 1);
+      }
+
+      if (outimage_shape.IsMklTensor()) {
+        out_image =
+            ConvertMklToTF<T>(context, MklGetInput(context, 2), outimage_shape);
+      } else {
+        out_image = MklGetInput(context, 2);
+      }
+
       const int64 batch = static_cast<int64>(in_grads.dim_size(0));
       const int64 rows = static_cast<int64>(in_grads.dim_size(1));
       const int64 cols = static_cast<int64>(in_grads.dim_size(2));
@@ -677,7 +709,7 @@ class MklLRNGradOp : public OpKernel {
       Shard(worker_threads.num_threads, worker_threads.workers, nodes * batch,
             depth * depth, shard);
     }
-
+		
     // release mkl resources
     void Mklcleanup() {
       bool ingrad_in_mkl_format = ingrad_shape.IsMklTensor();
@@ -704,12 +736,12 @@ class MklLRNGradOp : public OpKernel {
 };
 
 #define REGISTER_MKL_LRN_CPU(T)                                     \
-  REGISTER_KERNEL_BUILDER(Name("_MklLRN")                            \
+  REGISTER_KERNEL_BUILDER(Name("_MklLRN")                           \
                               .Device(DEVICE_CPU)                   \
                               .TypeConstraint<T>("T")               \
                               .Label(mkl_op_registry::kMklOpLabel), \
                           MklLRNOp<T>);                             \
-  REGISTER_KERNEL_BUILDER(Name("_MklLRNGrad")                        \
+  REGISTER_KERNEL_BUILDER(Name("_MklLRNGrad")                       \
                               .Device(DEVICE_CPU)                   \
                               .TypeConstraint<T>("T")               \
                               .Label(mkl_op_registry::kMklOpLabel), \
diff --git a/tensorflow/core/kernels/mkl_matmul_op.cc b/tensorflow/core/kernels/mkl_matmul_op.cc
index 3ba28c13ed555cd569dea0621aae2a170e194bf3..16143191a34ae62704691f4916ac8f30d897f1d4 100644
--- a/tensorflow/core/kernels/mkl_matmul_op.cc
+++ b/tensorflow/core/kernels/mkl_matmul_op.cc
@@ -25,11 +25,11 @@ limitations under the License.
 
 #if defined(INTEL_MKL)
 
+#include "third_party/mkl/include/mkl_cblas.h"
 #include "tensorflow/core/framework/op.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/kernels/fill_functor.h"
-#include "third_party/mkl/include/mkl_cblas.h"
 
 namespace tensorflow {
 
@@ -56,11 +56,11 @@ class MklMatMulOp : public OpKernel {
     dim_pair[0].first = transpose_a_ ? 0 : 1;
     dim_pair[0].second = transpose_b_ ? 1 : 0;
 
-    OP_REQUIRES(ctx,
-                a.dim_size(dim_pair[0].first) == b.dim_size(dim_pair[0].second),
-                errors::InvalidArgument("Matrix size-incompatible: In[0]: ",
-                                        a.shape().DebugString(), ", In[1]: ",
-                                        b.shape().DebugString()));
+    OP_REQUIRES(
+        ctx, a.dim_size(dim_pair[0].first) == b.dim_size(dim_pair[0].second),
+        errors::InvalidArgument(
+            "Matrix size-incompatible: In[0]: ", a.shape().DebugString(),
+            ", In[1]: ", b.shape().DebugString()));
     int a_dim_remaining = 1 - dim_pair[0].first;
     int b_dim_remaining = 1 - dim_pair[0].second;
     TensorShape out_shape(
@@ -199,15 +199,13 @@ class MklMatMulOp : public OpKernel {
   }
 };
 
-#define REGISTER_CPU(T)                                                      \
-  REGISTER_KERNEL_BUILDER(                                                   \
-      Name("MatMul").Device(DEVICE_CPU).TypeConstraint<T>("T"),              \
-      MklMatMulOp<CPUDevice, T, false /* cublas, ignored for CPU */>);       \
-  REGISTER_KERNEL_BUILDER(                                                   \
-      Name("MatMul").Device(DEVICE_CPU).TypeConstraint<T>("T").Label("MKL"), \
-      MklMatMulOp<CPUDevice, T, false /* cublas, ignored for CPU */>)
+#define REGISTER_CPU(T)                                         \
+  REGISTER_KERNEL_BUILDER(                                      \
+      Name("MatMul").Device(DEVICE_CPU).TypeConstraint<T>("T"), \
+      MklMatMulOp<CPUDevice, T, false /* cublas, ignored for CPU */>);
 
-// TODO:Consider template specialization when adding/removing additional types
+// TODO(inteltf) Consider template specialization when adding/removing
+// additional types
 TF_CALL_float(REGISTER_CPU);
 TF_CALL_double(REGISTER_CPU);
 TF_CALL_complex64(REGISTER_CPU);
diff --git a/tensorflow/core/kernels/mkl_maxpooling_op.cc b/tensorflow/core/kernels/mkl_maxpooling_op.cc
index ba2d347d94162612afa7d955da12692f6380f4bc..1e0ee258b09f40ad6849375e75b1492675a027cb 100644
--- a/tensorflow/core/kernels/mkl_maxpooling_op.cc
+++ b/tensorflow/core/kernels/mkl_maxpooling_op.cc
@@ -276,11 +276,6 @@ class MklMaxPoolingGradOp : public OpKernel {
     mkl_context.pooling_res[dnnResourceDiffSrc] = const_cast<void*>(
         static_cast<const void*>(output_tensor->flat<T>().data()));
 
-    int64 output_size = output_tensor->NumElements();
-    for (int64 i = 0; i < output_size; ++i) {
-      (static_cast<float*>(mkl_context.pooling_res[dnnResourceDiffSrc]))[i] = 0;
-    }
-
     CHECK_EQ(
         dnnExecute_F32(mkl_context.prim_pooling_bwd, mkl_context.pooling_res),
         E_SUCCESS);
diff --git a/tensorflow/core/kernels/mkl_relu_op.cc b/tensorflow/core/kernels/mkl_relu_op.cc
index 25c8359cc533f84dee3d214f0b4c603c407276f4..10d2937584ddcd5178f1be75bab980ab00fb05d1 100644
--- a/tensorflow/core/kernels/mkl_relu_op.cc
+++ b/tensorflow/core/kernels/mkl_relu_op.cc
@@ -16,17 +16,17 @@ limitations under the License.
 // See docs in ../ops/nn_ops.cc.
 #ifdef INTEL_MKL
 
-#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
 #include "tensorflow/core/framework/numeric_op.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/lib/core/errors.h"
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
 
-#include "third_party/mkl/include/mkl_dnn.h"
-#include "third_party/mkl/include/mkl_dnn_types.h"
 #include "tensorflow/core/platform/default/logging.h"
 #include "tensorflow/core/util/mkl_util.h"
+#include "third_party/mkl/include/mkl_dnn.h"
+#include "third_party/mkl/include/mkl_dnn_types.h"
 
 namespace tensorflow {
 
@@ -194,45 +194,28 @@ class MklReluGradOp : public OpKernel {
 
       void* user_i = static_cast<void*>(const_cast<T*>(a.flat<T>().data()));
       void* user_g = static_cast<void*>(const_cast<T*>(g.flat<T>().data()));
-
-      CHECK_EQ(dnnLayoutCreateFromPrimitive_F32(
-                   &mkl_lt_internal_grad, prim_relu_bwd, dnnResourceDiffDst),
-               E_SUCCESS);
-
-      CHECK_EQ(dnnLayoutCreateFromPrimitive_F32(&mkl_lt_internal_input,
-                                                prim_relu_bwd, dnnResourceSrc),
-               E_SUCCESS);
-
-      if (!dnnLayoutCompare_F32(mkl_lt_internal_grad, lt_grad)) {
-        AllocTmpBuffer(context, mkl_tmp_grad_buf_tensor, mkl_lt_internal_grad,
-                       &relu_res[dnnResourceDiffDst]);
-        CHECK_EQ(dnnConversionCreate_F32(&cv_user_to_reluB_grad, lt_grad,
-                                         mkl_lt_internal_grad),
-                 E_SUCCESS);
-        CHECK_EQ(dnnConversionExecute_F32(cv_user_to_reluB_grad, user_g,
-                                          relu_res[dnnResourceDiffDst]),
+      dnnPrimitive_t cv_input_to_grad = NULL;
+      Tensor mkl_tmp_buf_tensor;
+      void* mkl_buffer_convert = nullptr;
+
+      // if input and grad are not in the same layout, do a conversion between
+      // them.
+      if (!dnnLayoutCompare_F32(lt_input, lt_grad)) {
+        AllocTmpBuffer(context, &mkl_tmp_buf_tensor, lt_grad,
+                       &mkl_buffer_convert);
+        CHECK_EQ(dnnConversionCreate_F32(&cv_input_to_grad, lt_input, lt_grad),
                  E_SUCCESS);
-        dnnDelete_F32(cv_user_to_reluB_grad);
-      } else {
-        relu_res[dnnResourceDiffDst] = user_g;
-      }
 
-      if (!dnnLayoutCompare_F32(mkl_lt_internal_input, lt_input)) {
-        AllocTmpBuffer(context, mkl_tmp_input_buf_tensor, mkl_lt_internal_input,
-                       &relu_res[dnnResourceSrc]);
-        CHECK_EQ(dnnConversionCreate_F32(&cv_user_to_reluB_input, lt_input,
-                                         mkl_lt_internal_input),
-                 E_SUCCESS);
-        CHECK_EQ(dnnConversionExecute_F32(cv_user_to_reluB_input, user_i,
-                                          relu_res[dnnResourceSrc]),
+        CHECK_EQ(dnnConversionExecute_F32(cv_input_to_grad, user_i,
+                                          mkl_buffer_convert),
                  E_SUCCESS);
-        dnnDelete_F32(cv_user_to_reluB_input);
+        relu_res[dnnResourceSrc] = mkl_buffer_convert;
+        dnnDelete_F32(cv_input_to_grad);
       } else {
         relu_res[dnnResourceSrc] = user_i;
       }
 
-      dnnLayoutDelete_F32(mkl_lt_internal_input);
-      dnnLayoutDelete_F32(mkl_lt_internal_grad);
+      relu_res[dnnResourceDiffDst] = user_g;
     }
 
     void MklCreateInputLayouts(OpKernelContext* context) {
@@ -331,7 +314,7 @@ void MklReluGradOp<Device, T>::Compute(OpKernelContext* context) {
   mkl_context.MklCreateInputLayouts(context);
   float negative_slope = 0.0;
   CHECK_EQ(dnnReLUCreateBackward_F32(&mkl_context.prim_relu_bwd, NULL,
-                                     mkl_context.lt_grad, mkl_context.lt_input,
+                                     mkl_context.lt_grad, mkl_context.lt_grad,
                                      negative_slope),
            E_SUCCESS);
   Tensor mkl_tmp_grad_buf_tensor, mkl_tmp_input_buf_tensor;
@@ -380,12 +363,12 @@ void MklReluGradOp<Device, T>::Compute(OpKernelContext* context) {
 /* Register DNN kernels for supported operations and supported types - right now
  * it is only Relu and f32*/
 #define REGISTER_RELU_MKL_SUPPORTED_KERNELS_TYPES(type)             \
-  REGISTER_KERNEL_BUILDER(Name("_MklRelu")                           \
+  REGISTER_KERNEL_BUILDER(Name("_MklRelu")                          \
                               .Device(DEVICE_CPU)                   \
                               .TypeConstraint<type>("T")            \
                               .Label(mkl_op_registry::kMklOpLabel), \
                           MklReluOp<CPUDevice, type>);              \
-  REGISTER_KERNEL_BUILDER(Name("_MklReluGrad")                       \
+  REGISTER_KERNEL_BUILDER(Name("_MklReluGrad")                      \
                               .Device(DEVICE_CPU)                   \
                               .TypeConstraint<type>("T")            \
                               .Label(mkl_op_registry::kMklOpLabel), \
diff --git a/tensorflow/core/kernels/mkl_reshape_op.cc b/tensorflow/core/kernels/mkl_reshape_op.cc
index 753a8b52b42b5541013a8d114e20bdf08d613e58..593aa3a2fd6052f275015b1acd2e6f5271a837dd 100644
--- a/tensorflow/core/kernels/mkl_reshape_op.cc
+++ b/tensorflow/core/kernels/mkl_reshape_op.cc
@@ -129,7 +129,7 @@ class MklReshapeOp : public OpKernel {
         return;
       }
     } else {
-      CopyTFTensorInToOut(context, 0, 0, shape);
+      CopyTfTensorInToOutWithShape(context, 0, 0, shape);
     }
   }
 };
diff --git a/tensorflow/core/kernels/mkl_tfconv_op.cc b/tensorflow/core/kernels/mkl_tfconv_op.cc
index c31ef5c2554829da631ca878ab809e27cc0663af..588d6874dd635b89863141a3eccd005bcf6f0317 100644
--- a/tensorflow/core/kernels/mkl_tfconv_op.cc
+++ b/tensorflow/core/kernels/mkl_tfconv_op.cc
@@ -106,7 +106,7 @@ class MklToTfOp : public OpKernel {
 ///////////////////////////////////////////////////////////
 
 #define REGISTER_CPU(T)                                             \
-  REGISTER_KERNEL_BUILDER(Name("MklToTf")                           \
+  REGISTER_KERNEL_BUILDER(Name("_MklToTf")                          \
                               .Device(DEVICE_CPU)                   \
                               .TypeConstraint<T>("T")               \
                               .Label(mkl_op_registry::kMklOpLabel), \
diff --git a/tensorflow/core/kernels/non_max_suppression_op.cc b/tensorflow/core/kernels/non_max_suppression_op.cc
index 4d4851c70cb5543df75e2cf46f8fc53f88f3ae7e..dc95f67ff00df04ddf26655eb01e76e12062dfe0 100644
--- a/tensorflow/core/kernels/non_max_suppression_op.cc
+++ b/tensorflow/core/kernels/non_max_suppression_op.cc
@@ -33,6 +33,7 @@ limitations under the License.
 #include "tensorflow/core/platform/logging.h"
 
 namespace tensorflow {
+namespace {
 
 typedef Eigen::ThreadPoolDevice CPUDevice;
 
@@ -89,6 +90,63 @@ static inline float ComputeIOU(typename TTypes<float, 2>::ConstTensor boxes,
   return intersection_area / (area_i + area_j - intersection_area);
 }
 
+void DoNonMaxSuppressionOp(OpKernelContext* context,
+                           const Tensor& boxes,
+                           const Tensor& scores,
+                           const Tensor& max_output_size,
+                           const float iou_threshold) {
+  OP_REQUIRES(context, iou_threshold >= 0 && iou_threshold <= 1,
+      errors::InvalidArgument("iou_threshold must be in [0, 1]"));
+  
+  int num_boxes = 0;
+  ParseAndCheckBoxSizes(context, boxes, scores, &num_boxes);
+  if (!context->status().ok()) {
+    return;
+  }
+
+  const int output_size =
+      std::min(max_output_size.scalar<int>()(), num_boxes);
+  typename TTypes<float, 2>::ConstTensor boxes_data =
+      boxes.tensor<float, 2>();
+
+  std::vector<float> scores_data(num_boxes);
+  std::copy_n(scores.flat<float>().data(), num_boxes, scores_data.begin());
+  std::vector<int> sorted_indices;
+  DecreasingArgSort(scores_data, &sorted_indices);
+
+  std::vector<bool> active(num_boxes, true);
+  std::vector<int> selected;
+  int num_active = active.size();
+  for (int i = 0; i < num_boxes; ++i) {
+    if (num_active == 0 || selected.size() >= output_size) break;
+    if (active[i]) {
+      selected.push_back(sorted_indices[i]);
+    } else {
+      continue;
+    }
+    for (int j = i + 1; j < num_boxes; ++j) {
+      if (active[j]) {
+        float iou =
+          ComputeIOU(boxes_data, sorted_indices[i], sorted_indices[j]);
+        if (iou > iou_threshold) {
+          active[j] = false;
+          num_active--;
+        }
+      }
+    }
+  }
+
+  // Allocate output tensor
+  Tensor* output = nullptr;
+  TensorShape output_shape({static_cast<int>(selected.size())});
+  OP_REQUIRES_OK(context, context->allocate_output(0, output_shape, &output));
+  typename TTypes<int, 1>::Tensor selected_indices_data =
+      output->tensor<int, 1>();
+  std::copy_n(selected.begin(), selected.size(), selected_indices_data.data());
+}
+
+} // namespace
+
 template <typename Device>
 class NonMaxSuppressionOp : public OpKernel {
  public:
@@ -98,9 +156,6 @@ class NonMaxSuppressionOp : public OpKernel {
   }
 
   void Compute(OpKernelContext* context) override {
-    OP_REQUIRES(context, iou_threshold_ >= 0 && iou_threshold_ <= 1,
-                errors::InvalidArgument("iou_threshold must be in [0, 1]"));
-
     // boxes: [num_boxes, 4]
     const Tensor& boxes = context->input(0);
     // scores: [num_boxes]
@@ -112,59 +167,48 @@ class NonMaxSuppressionOp : public OpKernel {
         errors::InvalidArgument("max_output_size must be 0-D, got shape ",
                                 max_output_size.shape().DebugString()));
 
-    int num_boxes = 0;
-    ParseAndCheckBoxSizes(context, boxes, scores, &num_boxes);
-    if (!context->status().ok()) {
-      return;
-    }
-
-    const int output_size =
-        std::min(max_output_size.scalar<int>()(), num_boxes);
-    typename TTypes<float, 2>::ConstTensor boxes_data =
-        boxes.tensor<float, 2>();
-
-    std::vector<float> scores_data(num_boxes);
-    std::copy_n(scores.flat<float>().data(), num_boxes, scores_data.begin());
-    std::vector<int> sorted_indices;
-    DecreasingArgSort(scores_data, &sorted_indices);
-
-    std::vector<bool> active(num_boxes, true);
-    std::vector<int> selected;
-    int num_active = active.size();
-    for (int i = 0; i < num_boxes; ++i) {
-      if (num_active == 0 || selected.size() >= output_size) break;
-      if (active[i]) {
-        selected.push_back(sorted_indices[i]);
-      } else {
-        continue;
-      }
-      for (int j = i + 1; j < num_boxes; ++j) {
-        if (active[j]) {
-          float iou =
-              ComputeIOU(boxes_data, sorted_indices[i], sorted_indices[j]);
-          if (iou > iou_threshold_) {
-            active[j] = false;
-            num_active--;
-          }
-        }
-      }
-    }
-
-    // Allocate output tensor
-    Tensor* output = nullptr;
-    TensorShape output_shape({static_cast<int>(selected.size())});
-    OP_REQUIRES_OK(context, context->allocate_output(0, output_shape, &output));
-    typename TTypes<int, 1>::Tensor selected_indices_data =
-        output->tensor<int, 1>();
-    std::copy_n(selected.begin(), selected.size(),
-                selected_indices_data.data());
+    DoNonMaxSuppressionOp(context, boxes, scores, max_output_size, iou_threshold_);
   }
 
  private:
   float iou_threshold_;
 };
 
+template <typename Device>
+class NonMaxSuppressionV2Op : public OpKernel {
+ public:
+  explicit NonMaxSuppressionV2Op(OpKernelConstruction* context)
+      : OpKernel(context) {
+  }
+
+  void Compute(OpKernelContext* context) override {
+    // boxes: [num_boxes, 4]
+    const Tensor& boxes = context->input(0);
+    // scores: [num_boxes]
+    const Tensor& scores = context->input(1);
+    // max_output_size: scalar
+    const Tensor& max_output_size = context->input(2);
+    OP_REQUIRES(
+        context, TensorShapeUtils::IsScalar(max_output_size.shape()),
+        errors::InvalidArgument("max_output_size must be 0-D, got shape ",
+                                max_output_size.shape().DebugString()));
+    // iou_threshold: scalar
+    const Tensor& iou_threshold = context->input(3);
+    OP_REQUIRES(
+        context, TensorShapeUtils::IsScalar(iou_threshold.shape()),
+        errors::InvalidArgument("iou_threshold must be 0-D, got shape ",
+                                iou_threshold.shape().DebugString()));
+
+    const float iou_threshold_val = iou_threshold.scalar<float>()();
+
+    DoNonMaxSuppressionOp(context, boxes, scores, max_output_size, iou_threshold_val);
+  }
+};
+
 REGISTER_KERNEL_BUILDER(Name("NonMaxSuppression").Device(DEVICE_CPU),
                         NonMaxSuppressionOp<CPUDevice>);
 
+REGISTER_KERNEL_BUILDER(Name("NonMaxSuppressionV2").Device(DEVICE_CPU),
+                        NonMaxSuppressionV2Op<CPUDevice>);
+
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/non_max_suppression_op_test.cc b/tensorflow/core/kernels/non_max_suppression_op_test.cc
index 72e368db77321e56dad780673de12078013953bb..0a075b48b0aea4a1944c9a402aa13d9afcd40a64 100644
--- a/tensorflow/core/kernels/non_max_suppression_op_test.cc
+++ b/tensorflow/core/kernels/non_max_suppression_op_test.cc
@@ -141,6 +141,161 @@ TEST_F(NonMaxSuppressionOpTest, TestInconsistentBoxAndScoreShapes) {
   AddInputFromArray<float>(TensorShape({5}), {.9f, .75f, .6f, .95f, .5f});
   AddInputFromArray<int>(TensorShape({}), {30});
   Status s = RunOpKernel();
+  
+  ASSERT_FALSE(s.ok());
+  EXPECT_TRUE(
+              StringPiece(s.ToString()).contains("scores has incompatible shape"))
+    << s;
+}
+
+TEST_F(NonMaxSuppressionOpTest, TestInvalidIOUThreshold) {
+  MakeOp(1.2);
+  AddInputFromArray<float>(TensorShape({1, 4}), {0, 0, 1, 1});
+  AddInputFromArray<float>(TensorShape({1}), {.9f});
+  AddInputFromArray<int>(TensorShape({}), {3});
+  Status s = RunOpKernel();
+
+  ASSERT_FALSE(s.ok());
+  EXPECT_TRUE(
+              StringPiece(s.ToString()).contains("iou_threshold must be in [0, 1]"))
+    << s;
+}
+
+TEST_F(NonMaxSuppressionOpTest, TestEmptyInput) {
+  MakeOp(.5);
+  AddInputFromArray<float>(TensorShape({0, 4}), {});
+  AddInputFromArray<float>(TensorShape({0}), {});
+  AddInputFromArray<int>(TensorShape({}), {30});
+  TF_ASSERT_OK(RunOpKernel());
+
+  Tensor expected(allocator(), DT_INT32, TensorShape({0}));
+  test::FillValues<int>(&expected, {});
+  test::ExpectTensorEqual<int>(expected, *GetOutput(0));
+}
+    
+//
+// NonMaxSuppressionV2Op Tests
+//
+
+class NonMaxSuppressionV2OpTest : public OpsTestBase {
+ protected:
+  void MakeOp() {
+    TF_EXPECT_OK(NodeDefBuilder("non_max_suppression_op", "NonMaxSuppressionV2")
+                     .Input(FakeInput(DT_FLOAT))
+                     .Input(FakeInput(DT_FLOAT))
+                     .Input(FakeInput(DT_INT32))
+                     .Input(FakeInput(DT_FLOAT))
+                     .Finalize(node_def()));
+    TF_EXPECT_OK(InitOp());
+  }
+};
+
+TEST_F(NonMaxSuppressionV2OpTest, TestSelectFromThreeClusters) {
+  MakeOp();
+  AddInputFromArray<float>(TensorShape({6, 4}),
+                           {0, 0,  1, 1,  0, 0.1f,  1, 1.1f,  0, -0.1f, 1, 0.9f,
+                            0, 10, 1, 11, 0, 10.1f, 1, 11.1f, 0, 100,  1, 101});
+  AddInputFromArray<float>(TensorShape({6}), {.9f, .75f, .6f, .95f, .5f, .3f});
+  AddInputFromArray<int>(TensorShape({}), {3});
+  AddInputFromArray<float>(TensorShape({}), {.5f});
+  TF_ASSERT_OK(RunOpKernel());
+
+  Tensor expected(allocator(), DT_INT32, TensorShape({3}));
+  test::FillValues<int>(&expected, {3, 0, 5});
+  test::ExpectTensorEqual<int>(expected, *GetOutput(0));
+}
+
+TEST_F(NonMaxSuppressionV2OpTest, TestSelectFromThreeClustersFlippedCoordinates) {
+  MakeOp();
+  AddInputFromArray<float>(TensorShape({6, 4}),
+                           {1, 1,  0, 0,  0, 0.1f,  1, 1.1f,  0, .9f,  1, -0.1f,
+                            0, 10, 1, 11, 1, 10.1f, 0, 11.1f, 1, 101, 0, 100});
+  AddInputFromArray<float>(TensorShape({6}), {.9f, .75f, .6f, .95f, .5f, .3f});
+  AddInputFromArray<int>(TensorShape({}), {3});
+  AddInputFromArray<float>(TensorShape({}), {.5f});
+  TF_ASSERT_OK(RunOpKernel());
+
+  Tensor expected(allocator(), DT_INT32, TensorShape({3}));
+  test::FillValues<int>(&expected, {3, 0, 5});
+  test::ExpectTensorEqual<int>(expected, *GetOutput(0));
+}
+
+TEST_F(NonMaxSuppressionV2OpTest, TestSelectAtMostTwoBoxesFromThreeClusters) {
+  MakeOp();
+  AddInputFromArray<float>(TensorShape({6, 4}),
+                           {0, 0,  1, 1,  0, 0.1f,  1, 1.1f,  0, -0.1f, 1, 0.9f,
+                            0, 10, 1, 11, 0, 10.1f, 1, 11.1f, 0, 100,  1, 101});
+  AddInputFromArray<float>(TensorShape({6}), {.9f, .75f, .6f, .95f, .5f, .3f});
+  AddInputFromArray<int>(TensorShape({}), {2});
+  AddInputFromArray<float>(TensorShape({}), {.5f});
+  TF_ASSERT_OK(RunOpKernel());
+
+  Tensor expected(allocator(), DT_INT32, TensorShape({2}));
+  test::FillValues<int>(&expected, {3, 0});
+  test::ExpectTensorEqual<int>(expected, *GetOutput(0));
+}
+
+TEST_F(NonMaxSuppressionV2OpTest, TestSelectAtMostThirtyBoxesFromThreeClusters) {
+  MakeOp();
+  AddInputFromArray<float>(TensorShape({6, 4}),
+                           {0, 0,  1, 1,  0, 0.1f,  1, 1.1f,  0, -0.1f, 1, 0.9f,
+                            0, 10, 1, 11, 0, 10.1f, 1, 11.1f, 0, 100,  1, 101});
+  AddInputFromArray<float>(TensorShape({6}), {.9f, .75f, .6f, .95f, .5f, .3f});
+  AddInputFromArray<int>(TensorShape({}), {30});
+  AddInputFromArray<float>(TensorShape({}), {.5f});
+  TF_ASSERT_OK(RunOpKernel());
+
+  Tensor expected(allocator(), DT_INT32, TensorShape({3}));
+  test::FillValues<int>(&expected, {3, 0, 5});
+  test::ExpectTensorEqual<int>(expected, *GetOutput(0));
+}
+
+TEST_F(NonMaxSuppressionV2OpTest, TestSelectSingleBox) {
+  MakeOp();
+  AddInputFromArray<float>(TensorShape({1, 4}), {0, 0, 1, 1});
+  AddInputFromArray<float>(TensorShape({1}), {.9f});
+  AddInputFromArray<int>(TensorShape({}), {3});
+  AddInputFromArray<float>(TensorShape({}), {.5f});
+  TF_ASSERT_OK(RunOpKernel());
+
+  Tensor expected(allocator(), DT_INT32, TensorShape({1}));
+  test::FillValues<int>(&expected, {0});
+  test::ExpectTensorEqual<int>(expected, *GetOutput(0));
+}
+
+TEST_F(NonMaxSuppressionV2OpTest, TestSelectFromTenIdenticalBoxes) {
+  MakeOp();
+
+  int num_boxes = 10;
+  std::vector<float> corners(num_boxes * 4);
+  std::vector<float> scores(num_boxes);
+  for (int i = 0; i < num_boxes; ++i) {
+    corners[i * 4 + 0] = 0;
+    corners[i * 4 + 1] = 0;
+    corners[i * 4 + 2] = 1;
+    corners[i * 4 + 3] = 1;
+    scores[i] = .9;
+  }
+  AddInputFromArray<float>(TensorShape({num_boxes, 4}), corners);
+  AddInputFromArray<float>(TensorShape({num_boxes}), scores);
+  AddInputFromArray<int>(TensorShape({}), {3});
+  AddInputFromArray<float>(TensorShape({}), {.5f});
+  TF_ASSERT_OK(RunOpKernel());
+
+  Tensor expected(allocator(), DT_INT32, TensorShape({1}));
+  test::FillValues<int>(&expected, {0});
+  test::ExpectTensorEqual<int>(expected, *GetOutput(0));
+}
+
+TEST_F(NonMaxSuppressionV2OpTest, TestInconsistentBoxAndScoreShapes) {
+  MakeOp();
+  AddInputFromArray<float>(TensorShape({6, 4}),
+                           {0, 0,  1, 1,  0, 0.1f,  1, 1.1f,  0, -0.1f, 1, 0.9f,
+                            0, 10, 1, 11, 0, 10.1f, 1, 11.1f, 0, 100,  1, 101});
+  AddInputFromArray<float>(TensorShape({5}), {.9f, .75f, .6f, .95f, .5f});
+  AddInputFromArray<int>(TensorShape({}), {30});
+  AddInputFromArray<float>(TensorShape({}), {.5f});
+  Status s = RunOpKernel();
 
   ASSERT_FALSE(s.ok());
   EXPECT_TRUE(
@@ -148,11 +303,12 @@ TEST_F(NonMaxSuppressionOpTest, TestInconsistentBoxAndScoreShapes) {
       << s;
 }
 
-TEST_F(NonMaxSuppressionOpTest, TestInvalidIOUThreshold) {
-  MakeOp(1.2);
+TEST_F(NonMaxSuppressionV2OpTest, TestInvalidIOUThreshold) {
+  MakeOp();
   AddInputFromArray<float>(TensorShape({1, 4}), {0, 0, 1, 1});
   AddInputFromArray<float>(TensorShape({1}), {.9f});
   AddInputFromArray<int>(TensorShape({}), {3});
+  AddInputFromArray<float>(TensorShape({}), {1.2f});
   Status s = RunOpKernel();
 
   ASSERT_FALSE(s.ok());
@@ -161,11 +317,12 @@ TEST_F(NonMaxSuppressionOpTest, TestInvalidIOUThreshold) {
       << s;
 }
 
-TEST_F(NonMaxSuppressionOpTest, TestEmptyInput) {
-  MakeOp(.5);
+TEST_F(NonMaxSuppressionV2OpTest, TestEmptyInput) {
+  MakeOp();
   AddInputFromArray<float>(TensorShape({0, 4}), {});
   AddInputFromArray<float>(TensorShape({0}), {});
   AddInputFromArray<int>(TensorShape({}), {30});
+  AddInputFromArray<float>(TensorShape({}), {.5f});
   TF_ASSERT_OK(RunOpKernel());
 
   Tensor expected(allocator(), DT_INT32, TensorShape({0}));
diff --git a/tensorflow/core/kernels/remote_fused_graph_execute_utils.cc b/tensorflow/core/kernels/remote_fused_graph_execute_utils.cc
index b885c5ce095f44a602adc6a6fdd05e32ac7af387..2174098bdefd0bd5e2ccc49dc7dc094845179d41 100644
--- a/tensorflow/core/kernels/remote_fused_graph_execute_utils.cc
+++ b/tensorflow/core/kernels/remote_fused_graph_execute_utils.cc
@@ -22,10 +22,83 @@ limitations under the License.
 #include "tensorflow/core/common_runtime/shape_refiner.h"
 #include "tensorflow/core/framework/node_def_util.h"
 #include "tensorflow/core/graph/algorithm.h"
+#include "tensorflow/core/graph/node_builder.h"
 #include "tensorflow/core/public/session.h"
 #include "tensorflow/core/public/session_options.h"
 
 namespace tensorflow {
+namespace {
+const Node* FindNodeByName(const string& name, const Graph& graph) {
+  for (const Node* node : graph.nodes()) {
+    CHECK_NOTNULL(node);
+    if (node->name() == name) {
+      return node;
+    }
+  }
+  return nullptr;
+}
+
+std::unordered_set<string> BuildNodeSetFromNodeNamesAndPorts(
+    const std::vector<string>& node_names_and_ports) {
+  std::unordered_set<string> retval;
+  for (const string& node_name_and_port : node_names_and_ports) {
+    const TensorId tid = ParseTensorName(node_name_and_port);
+    retval.emplace(tid.first.ToString());
+  }
+  return retval;
+}
+
+Node* FindMutableNodeByName(const string& name, Graph* graph) {
+  for (Node* node : graph->nodes()) {
+    if (node != nullptr && node->name() == name) {
+      return node;
+    }
+  }
+  return nullptr;
+}
+
+const NodeDef* FindNodeDefByName(const string& input,
+                                 const GraphDef& graph_def) {
+  const TensorId tid = ParseTensorName(input);
+  const string name = tid.first.ToString();
+  for (const NodeDef& node_def : graph_def.node()) {
+    if (node_def.name() == name) {
+      return &node_def;
+    }
+  }
+  return nullptr;
+}
+
+string DumpGraphDef(const GraphDef& graph_def) {
+  string out;
+  for (const NodeDef& node : graph_def.node()) {
+    out += strings::StrCat("node: ", node.name(), "\n    input: ");
+    for (const string& input : node.input()) {
+      out += strings::StrCat(input, ", ");
+    }
+    out += "\n";
+  }
+  return out;
+}
+
+string DumpCluster(const RemoteFusedGraphExecuteUtils::ClusterInfo& cluster) {
+  string out;
+  out += "Nodes:\n";
+  for (const string& str : std::get<0>(cluster)) {
+    out += str + ", ";
+  }
+  out += "\nInput border:\n";
+  for (const string& str : std::get<1>(cluster)) {
+    out += str + ", ";
+  }
+  out += "\nOutput border:\n";
+  for (const string& str : std::get<2>(cluster)) {
+    out += str + ", ";
+  }
+  return out;
+}
+
+}  // namespace
 
 /* static */ constexpr const char* const
     RemoteFusedGraphExecuteUtils::ATTR_OUTPUT_DATA_TYPES;
@@ -260,17 +333,17 @@ RemoteFusedGraphExecuteUtils::AddOutputTensorShapeTypeByTensorShapeMap(
 }
 
 /* static */ Status RemoteFusedGraphExecuteUtils::GetOutputTensorShapeType(
-    const NodeDef& node_def, std::vector<DataType>* data_types,
+    AttrSlice attrs, std::vector<DataType>* data_types,
     std::vector<TensorShape>* shapes) {
   Status status;
   if (data_types != nullptr) {
-    status = GetNodeAttr(node_def, ATTR_OUTPUT_DATA_TYPES, data_types);
+    status = GetNodeAttr(attrs, ATTR_OUTPUT_DATA_TYPES, data_types);
   }
   if (!status.ok()) {
     return status;
   }
   if (shapes != nullptr) {
-    status = GetNodeAttr(node_def, ATTR_OUTPUT_SHAPES, shapes);
+    status = GetNodeAttr(attrs, ATTR_OUTPUT_SHAPES, shapes);
     if (status.ok() && data_types != nullptr) {
       CHECK_EQ(data_types->size(), shapes->size());
     }
@@ -279,6 +352,26 @@ RemoteFusedGraphExecuteUtils::AddOutputTensorShapeTypeByTensorShapeMap(
   return status;
 }
 
+/* static */ bool RemoteFusedGraphExecuteUtils::GetOutputTensorShapeType(
+    const GraphDef& graph_def, const string& name_and_port, DataType* data_type,
+    TensorShape* shape) {
+  std::vector<DataType> data_types;
+  std::vector<TensorShape> shapes;
+  const TensorId tid = ParseTensorName(name_and_port);
+  const string node_name = tid.first.ToString();
+  const int port = tid.second;
+  const NodeDef* node_def = FindNodeDefByName(node_name, graph_def);
+  CHECK_NOTNULL(node_def);
+  GetOutputTensorShapeType(*node_def, &data_types, &shapes).IgnoreError();
+  if (data_types.empty()) {
+    return false;
+  }
+  CHECK(data_types.size() > port);
+  *data_type = data_types.at(port);
+  *shape = shapes.at(port);
+  return true;
+}
+
 /* static */ Status RemoteFusedGraphExecuteUtils::PropagateShapeInference(
     const GraphDef& graph_def,
     const std::vector<std::pair<string, Tensor>>& input_node_info_list,
@@ -417,4 +510,513 @@ RemoteFusedGraphExecuteUtils::BuildRemoteGraphInputsAndOutputsFromProto(
                      std::make_pair(tensor.dtype(), tensor.shape())));
 }
 
+/* static */ Status RemoteFusedGraphExecuteUtils::BuildAndAddTensorShapes(
+    const std::vector<std::pair<string, Tensor>>& input_tensors,
+    const bool dry_run_inference, GraphDef* graph_def) {
+  TensorShapeMap tensor_shape_map;
+  if (dry_run_inference) {
+    TF_RETURN_IF_ERROR(DryRunInferenceForAllNode(*graph_def, input_tensors,
+                                                 /*initialize_by_zero=*/true,
+                                                 &tensor_shape_map));
+  } else {
+    ImportGraphDefOptions opts;
+    Graph graph(OpRegistry::Global());
+    ShapeRefiner shape_refiner(graph.versions().producer(),
+                               graph.op_registry());
+    TF_RETURN_IF_ERROR(
+        ImportGraphDef(opts, *graph_def, &graph, &shape_refiner));
+    TF_RETURN_IF_ERROR(PropagateShapeInference(*graph_def, input_tensors,
+                                               &graph, &shape_refiner));
+    TF_RETURN_IF_ERROR(
+        BuildTensorShapeMapFromGraph(graph, shape_refiner, &tensor_shape_map));
+  }
+
+  for (NodeDef& node_def : *graph_def->mutable_node()) {
+    TF_RETURN_IF_ERROR(
+        AddOutputTensorShapeTypeByTensorShapeMap(tensor_shape_map, &node_def));
+  }
+
+  return Status::OK();
+}
+
+/* static */ Status
+RemoteFusedGraphExecuteUtils::BuildRemoteFusedGraphExecuteInfo(
+    const string& executor_name, const GraphDef& subgraph_def,
+    const std::vector<string>& inputs, const std::vector<string>& outputs,
+    const bool require_shape_type, RemoteFusedGraphExecuteInfo* execute_info,
+    DataTypeVector* input_types, DataTypeVector* output_types) {
+  CHECK_NOTNULL(execute_info);
+  CHECK_NOTNULL(input_types);
+  CHECK_NOTNULL(output_types);
+
+  execute_info->Clear();
+  execute_info->set_executor_name(executor_name);
+
+  // copy graph
+  *execute_info->mutable_remote_graph() = subgraph_def;
+
+  for (const string& input : inputs) {
+    DataType dt;
+    TensorShape shape;
+    const bool has_shapetype =
+        GetOutputTensorShapeType(subgraph_def, input, &dt, &shape);
+
+    execute_info->add_graph_input_node_name(input);
+    if (has_shapetype) {
+      RemoteFusedGraphExecuteInfo::TensorShapeTypeProto& tensor_shape_type =
+          *execute_info->add_default_graph_input_tensor_shape();
+      tensor_shape_type.set_dtype(dt);
+      TensorShapeProto& tensor_shape_proto = *tensor_shape_type.mutable_shape();
+      for (const int64 dim : shape.dim_sizes()) {
+        tensor_shape_proto.add_dim()->set_size(dim);
+      }
+      input_types->push_back(dt);
+    } else {
+      CHECK(!require_shape_type)
+          << "No shape type found for " << input << DumpGraphDef(subgraph_def);
+      // Assuming input type is float if no data provided.
+      input_types->push_back(DT_FLOAT);
+    }
+  }
+
+  for (const string& output : outputs) {
+    DataType dt;
+    TensorShape shape;
+    const bool has_shapetype =
+        GetOutputTensorShapeType(subgraph_def, output, &dt, &shape);
+
+    execute_info->add_graph_output_node_name(output);
+    if (has_shapetype) {
+      RemoteFusedGraphExecuteInfo::TensorShapeTypeProto&
+          tensor_shape_type_proto =
+              *execute_info->add_default_graph_output_tensor_shape();
+      tensor_shape_type_proto.set_dtype(dt);
+      TensorShapeProto& tensor_shape_proto =
+          *tensor_shape_type_proto.mutable_shape();
+      for (const int64 dim : shape.dim_sizes()) {
+        tensor_shape_proto.add_dim()->set_size(dim);
+      }
+      output_types->push_back(dt);
+    } else {
+      CHECK(!require_shape_type)
+          << "No shape type found for " << output << DumpGraphDef(subgraph_def);
+      // Assuming output type is float if no data provided.
+      output_types->push_back(DT_FLOAT);
+    }
+  }
+
+  return Status::OK();
+}
+
+/* static */ Status
+RemoteFusedGraphExecuteUtils::BuildRemoteFusedGraphExecuteOpNode(
+    const string& node_name, const string& executor_name,
+    const GraphDef& subgraph_def, const std::vector<string>& inputs,
+    const std::vector<string>& outputs, const bool require_shape_type,
+    Graph* graph, Node** created_node) {
+  CHECK_NOTNULL(graph);
+  CHECK_NOTNULL(created_node);
+
+  RemoteFusedGraphExecuteInfo execute_info;
+  DataTypeVector input_types;
+  DataTypeVector output_types;
+
+  TF_CHECK_OK(RemoteFusedGraphExecuteUtils::BuildRemoteFusedGraphExecuteInfo(
+      executor_name, subgraph_def, inputs, outputs, require_shape_type,
+      &execute_info, &input_types, &output_types));
+
+  std::vector<NodeBuilder::NodeOut> node_out_list;
+  for (const string& input : inputs) {
+    const TensorId tid = ParseTensorName(input);
+    Node* node = FindMutableNodeByName(tid.first.ToString(), graph);
+    CHECK_NOTNULL(node);
+    node_out_list.emplace_back(node, tid.second);
+  }
+
+  const string execute_info_str = execute_info.SerializeAsString();
+
+  auto builder =
+      NodeBuilder(node_name, "RemoteFusedGraphExecute")
+          .Input(node_out_list)
+          .Attr("Tinputs", input_types)
+          .Attr("Toutputs", output_types)
+          .Attr("serialized_remote_fused_graph_execute_info", execute_info_str);
+
+  TF_RETURN_IF_ERROR(builder.Finalize(graph, created_node));
+  return Status::OK();
+}
+
+/* static */ Status RemoteFusedGraphExecuteUtils::BuildIdentityOpNode(
+    const string& node_name, const string& input_node_name,
+    const int input_node_port, const DataType dt, Graph* graph,
+    Node** created_node) {
+  Node* node = FindMutableNodeByName(input_node_name, graph);
+  CHECK_NOTNULL(node);
+  NodeBuilder::NodeOut node_out(node, input_node_port);
+
+  auto builder =
+      NodeBuilder(node_name, "Identity").Input(node_out).Attr("T", dt);
+
+  TF_RETURN_IF_ERROR(builder.Finalize(graph, created_node));
+  return Status::OK();
+}
+
+/* static */ Status RemoteFusedGraphExecuteUtils::ClusterizeNodes(
+    const std::unordered_set<string>& node_names, const GraphDef& graph_def,
+    std::vector<ClusterInfo>* cluster_infos) {
+  Graph graph(OpRegistry::Global());
+  ShapeRefiner shape_refiner(graph.versions().producer(), graph.op_registry());
+  TF_RETURN_IF_ERROR(ImportGraphDef({}, graph_def, &graph, &shape_refiner));
+  std::unordered_set<string> remaining_nodes = node_names;
+
+  while (!remaining_nodes.empty()) {
+    ClusterInfo ci;
+
+    // Determine one cluster nodes
+    std::unordered_set<const Node*> visited;
+    std::deque<const Node*> queue;
+    queue.emplace_back(FindNodeByName(*remaining_nodes.begin(), graph));
+    while (!queue.empty()) {
+      const Node* node = queue.front();
+      CHECK_NOTNULL(node);
+      queue.pop_front();
+      const string& node_name = node->name();
+      if (node_names.count(node_name) > 0) {
+        std::get<0>(ci).emplace(node_name);
+        remaining_nodes.erase(node_name);
+      } else {
+        // Edge of subgraph.  Do nothing.
+        continue;
+      }
+      for (const Node* in : node->in_nodes()) {
+        if (visited.insert(in).second) {
+          queue.push_back(in);
+        }
+      }
+      for (const Node* out : node->out_nodes()) {
+        if (visited.insert(out).second) {
+          queue.push_back(out);
+        }
+      }
+    }
+
+    // Determine one cluster border
+    std::vector<string>& border_inputs = std::get<1>(ci);
+    std::vector<string>& border_outputs = std::get<2>(ci);
+    for (const string& node_name : node_names) {
+      Node* node = FindMutableNodeByName(node_name, &graph);
+      CHECK_NOTNULL(node);
+      int input_count = 0;
+      for (const Edge* in_edge : node->in_edges()) {
+        const Node* src_node = in_edge->src();
+        const bool src_is_outside =
+            node_names.count(src_node->name()) <= 0 && !src_node->IsSource();
+        if (src_is_outside) {
+          const string src_name =
+              strings::StrCat(src_node->name(), ":", in_edge->src_output());
+          CHECK_EQ(1, src_node->num_outputs())
+              << "output count of input border node must be one."
+              << src_node->name();
+          if (std::find(border_inputs.begin(), border_inputs.end(), src_name) ==
+              border_inputs.end()) {
+            border_inputs.emplace_back(src_name);
+          }
+        } else {
+          ++input_count;
+        }
+      }
+      CHECK(input_count == 0 || input_count == node->in_edges().size());
+
+      for (const Edge* out_edge : node->out_edges()) {
+        const Node* dst_node = out_edge->dst();
+        CHECK_NOTNULL(dst_node);
+        const bool dst_is_outside = node_names.count(dst_node->name()) <= 0;
+        const string dst_name =
+            strings::StrCat(node->name(), ":", out_edge->src_output());
+        if (dst_is_outside) {
+          if (dst_node->IsSink()) {
+            CHECK_EQ(1, node->num_outputs())
+                << "If you want to specify output node as subgraph output node "
+                << "the output count of the node must be 1 "
+                << "because that node is replaced by identity node.";
+            const string identity_dst_name =
+                strings::StrCat(node->name(), ":", 0);
+            if (std::find(border_outputs.begin(), border_outputs.end(),
+                          identity_dst_name) == border_outputs.end()) {
+              border_outputs.emplace_back(identity_dst_name);
+            }
+          } else {
+            if (std::find(border_outputs.begin(), border_outputs.end(),
+                          dst_name) == border_outputs.end()) {
+              border_outputs.emplace_back(dst_name);
+            }
+          }
+        }
+      }
+    }
+    cluster_infos->emplace_back(ci);
+    VLOG(1) << DumpCluster(ci);
+  }
+  return Status::OK();
+}
+
+/* static */ Status RemoteFusedGraphExecuteUtils::BuildClusterSubgraphDef(
+    const ClusterInfo& cluster, const GraphDef& graph_def,
+    GraphDef* subgraph_def) {
+  const std::unordered_set<string>& node_names = std::get<0>(cluster);
+  const std::unordered_set<string>& border_input_names =
+      BuildNodeSetFromNodeNamesAndPorts(std::get<1>(cluster));
+
+  Graph graph(OpRegistry::Global());
+  ShapeRefiner shape_refiner(graph.versions().producer(), graph.op_registry());
+  TF_RETURN_IF_ERROR(ImportGraphDef({}, graph_def, &graph, &shape_refiner));
+
+  for (Node* node : graph.nodes()) {
+    if (node != nullptr && node_names.count(node->name()) <= 0 &&
+        border_input_names.count(node->name()) <= 0 && !node->IsSource() &&
+        !node->IsSink()) {
+      graph.RemoveNode(node);
+    }
+  }
+  graph.ToGraphDef(subgraph_def);
+
+  for (const string& subgraph_input : std::get<1>(cluster)) {
+    const TensorId tid = ParseTensorName(subgraph_input);
+    const string subgraph_input_name = tid.first.ToString();
+    const int subgraph_input_port = tid.second;
+    const NodeDef* node_def = FindNodeDefByName(subgraph_input_name, graph_def);
+    CHECK_NOTNULL(node_def);
+    std::vector<DataType> dt_vec;
+    std::vector<TensorShape> shape_vec;
+    GetOutputTensorShapeType(*node_def, &dt_vec, &shape_vec).IgnoreError();
+    const DataType& dt =
+        dt_vec.empty() ? DT_FLOAT : dt_vec.at(subgraph_input_port);
+    const TensorShape& shape =
+        shape_vec.empty() ? TensorShape({}) : shape_vec.at(subgraph_input_port);
+
+    TF_RETURN_IF_ERROR(ReplaceInputNodeByPlaceHolder(subgraph_input_name, dt,
+                                                     shape, subgraph_def));
+  }
+  VLOG(1) << DumpGraphDef(*subgraph_def);
+  return Status::OK();
+}
+
+/* static */ Status RemoteFusedGraphExecuteUtils::BuildClusterByBorder(
+    const std::vector<string>& border_inputs,
+    const std::vector<string>& border_outputs, const GraphDef& graph_def,
+    ClusterInfo* cluster) {
+  Graph graph(OpRegistry::Global());
+  ShapeRefiner shape_refiner(graph.versions().producer(), graph.op_registry());
+  TF_RETURN_IF_ERROR(ImportGraphDef({}, graph_def, &graph, &shape_refiner));
+
+  std::unordered_set<const Node*> visited;
+  std::deque<const Node*> queue;
+  for (const string& output : border_outputs) {
+    const TensorId tid = ParseTensorName(output);
+    const string& output_node_name = tid.first.ToString();
+    for (const Node* node : graph.nodes()) {
+      if (output_node_name == node->name()) {
+        queue.push_back(node);
+        visited.insert(node);
+      }
+    }
+  }
+
+  std::unordered_set<const Node*> border_input_nodes;
+  // propagate visit to parent nodes until input nodes
+  while (!queue.empty()) {
+    const Node* node = queue.front();
+    queue.pop_front();
+    for (const Edge* edge : node->in_edges()) {
+      const Node* src_node = edge->src();
+      CHECK_NOTNULL(src_node);
+      const int src_port = edge->src_output();
+      bool input_found = false;
+      for (const string& input : border_inputs) {
+        const TensorId tid = ParseTensorName(input);
+        if (tid.first.ToString() == src_node->name() &&
+            tid.second == src_port) {
+          input_found = true;
+          border_input_nodes.insert(src_node);
+        }
+      }
+      if (visited.insert(src_node).second) {
+        if (!input_found) {
+          queue.push_back(src_node);
+        }
+      }
+    }
+  }
+
+  for (const Node* node : visited) {
+    if (node != nullptr && !node->IsSource() && !node->IsSink() &&
+        border_input_nodes.count(node) <= 0) {
+      std::get<0>(*cluster).insert(node->name());
+    }
+  }
+  std::get<1>(*cluster) = border_inputs;
+  std::get<2>(*cluster) = border_outputs;
+  return Status::OK();
+}
+
+/* static */ Status RemoteFusedGraphExecuteUtils::FuseCluster(
+    const GraphDef& input_graph_def, const std::vector<string>& inputs,
+    const std::vector<string>& outputs,
+    const string& remote_fused_graph_node_name, const ClusterInfo& cluster,
+    const string& remote_graph_executor_name, const bool require_shape_type,
+    GraphDef* output_graph_def) {
+  LOG(INFO) << "Transforming quantized stripped model to a remote fused "
+               "graph execute op by fusing a specified subgraph...";
+
+  CHECK(!remote_graph_executor_name.empty());
+
+  const std::vector<string>& border_inputs = std::get<1>(cluster);
+  const std::vector<string>& border_outputs = std::get<2>(cluster);
+
+  GraphDef subgraph_def;
+  TF_RETURN_IF_ERROR(
+      BuildClusterSubgraphDef(cluster, input_graph_def, &subgraph_def));
+
+  Graph graph(OpRegistry::Global());
+  ShapeRefiner shape_refiner(graph.versions().producer(), graph.op_registry());
+  TF_RETURN_IF_ERROR(
+      ImportGraphDef({}, input_graph_def, &graph, &shape_refiner));
+
+  Node* fused_node;
+  TF_RETURN_IF_ERROR(BuildRemoteFusedGraphExecuteOpNode(
+      remote_fused_graph_node_name, remote_graph_executor_name, subgraph_def,
+      border_inputs, border_outputs, require_shape_type, &graph, &fused_node));
+
+  for (const Node* node : graph.nodes()) {
+    for (int i = 0; i < node->num_inputs(); ++i) {
+      const Edge* edge = nullptr;
+      TF_RETURN_IF_ERROR(node->input_edge(i, &edge));
+      for (int j = 0; j < border_outputs.size(); ++j) {
+        const string& output = border_outputs.at(j);
+        const TensorId tid = ParseTensorName(output);
+        const string output_name = tid.first.ToString();
+        Node* src_node = edge->src();
+        if (src_node != nullptr && src_node->name() == output_name &&
+            edge->src_output() == tid.second) {
+          // Source node is replaced by new fused node.
+          Node* dst_node = edge->dst();
+          const int dst_input = edge->dst_input();
+          LOG(INFO) << "Removing existing edge to " << edge->dst()->name()
+                    << " from " << edge->src()->name();
+          graph.RemoveEdge(edge);
+          graph.AddEdge(fused_node, j, dst_node, dst_input);
+        }
+      }
+    }
+  }
+
+  // Replace output nodes by identity nodes which forward outputs from
+  // RemoteFusedGraphExecuteOpNode
+  for (const string& output : outputs) {
+    const TensorId output_tid = ParseTensorName(output);
+    const string output_name = output_tid.first.ToString();
+    for (int i = 0; i < border_outputs.size(); ++i) {
+      const TensorId subgraph_output_tid =
+          ParseTensorName(border_outputs.at(i));
+      const string& subgraph_output_name = subgraph_output_tid.first.ToString();
+      if (output_name == subgraph_output_name) {
+        LOG(INFO) << "As graph output and subgraph output are same, "
+                  << "the graph output node is replaced by identity node";
+        Node* original_output_node = FindMutableNodeByName(output, &graph);
+        CHECK_NOTNULL(original_output_node);
+        CHECK_EQ(1, original_output_node->num_outputs())
+            << "Num outputs should be 1 for " << output << ".";
+        graph.RemoveNode(original_output_node);
+        Node* new_node;
+        TF_RETURN_IF_ERROR(BuildIdentityOpNode(output,
+                                               remote_fused_graph_node_name, i,
+                                               DT_FLOAT, &graph, &new_node));
+        CHECK_NOTNULL(new_node);
+      }
+    }
+  }
+
+  GraphDef result_graph_def;
+
+  graph.ToGraphDef(&result_graph_def);
+
+  ClusterInfo graph_cluster;
+  TF_RETURN_IF_ERROR(
+      BuildClusterByBorder(inputs, outputs, result_graph_def, &graph_cluster));
+
+  // Remove unvisited nodes
+  TF_RETURN_IF_ERROR(BuildClusterSubgraphDef(graph_cluster, result_graph_def,
+                                             output_graph_def));
+
+  return Status::OK();
+}
+
+/* static */ Status RemoteFusedGraphExecuteUtils::FuseRemoteGraphByNodeNames(
+    const GraphDef& input_graph_def, const std::vector<string>& inputs,
+    const std::vector<string>& outputs,
+    const string& remote_fused_graph_node_name_prefix,
+    const std::unordered_set<string>& subgraph_nodes,
+    const string& remote_fused_graph_executor_name,
+    const bool require_shape_type, GraphDef* output_graph_def) {
+  std::vector<ClusterInfo> ci_vec;
+  TF_RETURN_IF_ERROR(RemoteFusedGraphExecuteUtils::ClusterizeNodes(
+      subgraph_nodes, input_graph_def, &ci_vec));
+
+  for (int i = 0; i < ci_vec.size(); ++i) {
+    const string remote_fused_graph_node_name =
+        strings::StrCat(remote_fused_graph_node_name_prefix, "/", i);
+    TF_RETURN_IF_ERROR(FuseCluster(input_graph_def, inputs, outputs,
+                                   remote_fused_graph_node_name, ci_vec.at(i),
+                                   remote_fused_graph_executor_name,
+                                   require_shape_type, output_graph_def));
+  }
+  return Status::OK();
+}
+
+/* static */ Status RemoteFusedGraphExecuteUtils::FuseRemoteGraphByBorder(
+    const GraphDef& input_graph_def, const std::vector<string>& inputs,
+    const std::vector<string>& outputs,
+    const string& remote_fused_graph_node_name,
+    const std::vector<string>& border_inputs,
+    const std::vector<string>& border_outputs,
+    const string& remote_graph_executor_name, const bool require_shape_type,
+    GraphDef* output_graph_def) {
+  ClusterInfo cluster;
+  TF_RETURN_IF_ERROR(RemoteFusedGraphExecuteUtils::BuildClusterByBorder(
+      border_inputs, border_outputs, input_graph_def, &cluster));
+
+  return FuseCluster(
+      input_graph_def, inputs, outputs, remote_fused_graph_node_name, cluster,
+      remote_graph_executor_name, require_shape_type, output_graph_def);
+}
+
+/* static */ Status RemoteFusedGraphExecuteUtils::ReplaceInputNodeByPlaceHolder(
+    const string& input, const DataType type, const TensorShape& shape,
+    GraphDef* graph_def) {
+  const TensorId tid = ParseTensorName(input);
+  CHECK_EQ(0, tid.second);
+  const string node_name = tid.first.ToString();
+  for (NodeDef& node : *graph_def->mutable_node()) {
+    if (node.name() != node_name) {
+      continue;
+    }
+    if (node.op() == "Placeholder") {
+      return Status::OK();
+    } else {
+      NodeDef placeholder_node;
+      placeholder_node.set_op("Placeholder");
+      placeholder_node.set_name(node_name);
+      AddNodeAttr("dtype", type, &placeholder_node);
+      AddNodeAttr("shape", shape, &placeholder_node);
+      // TODO(satok): Remove once we merge attributes
+      AddOutputTensorShapeType({type}, {shape}, &placeholder_node);
+      node.Clear();
+      node = placeholder_node;
+      return Status::OK();
+    }
+  }
+  return errors::InvalidArgument(
+      strings::StrCat(node_name, " not found for replacement."));
+}
+
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/remote_fused_graph_execute_utils.h b/tensorflow/core/kernels/remote_fused_graph_execute_utils.h
index a71047d42d9ed27b3d0a67e783477880532ca786..97b0c2008a766dccec392995bd05156688b5b5e7 100644
--- a/tensorflow/core/kernels/remote_fused_graph_execute_utils.h
+++ b/tensorflow/core/kernels/remote_fused_graph_execute_utils.h
@@ -17,6 +17,7 @@ limitations under the License.
 #define THIRD_PARTY_TENSORFLOW_CORE_KERNELS_REMOTE_FUSED_GRAPH_EXECUTE_UTILS_H_
 
 #include <unordered_map>
+#include <unordered_set>
 
 #include "tensorflow/core/framework/graph.pb.h"
 #include "tensorflow/core/framework/remote_fused_graph_execute_info.pb.h"
@@ -52,9 +53,12 @@ class RemoteFusedGraphExecuteUtils {
   using ExecutorBuildRegistry = std::map<string, ExecutorBuildFunc>;
 
   using TensorShapeType = std::pair<DataType, TensorShape>;
-  using TensorShapeMap =
-      std::unordered_multimap<string /* node name */,
-                              std::pair<int /* port */, TensorShapeType>>;
+  using TensorShapeMap = std::unordered_multimap<string,         // node name
+                                                 std::pair<int,  // port
+                                                           TensorShapeType>>;
+  using ClusterInfo = std::tuple<std::unordered_set<string>,  // node names
+                                 std::vector<string>,         // border inputs
+                                 std::vector<string>>;        // border outputs
 
   // Return registered ExecutorBuildFunc for given name.
   static const ExecutorBuildFunc* GetExecutorBuildFunc(const string& name);
@@ -99,10 +103,14 @@ class RemoteFusedGraphExecuteUtils {
   static Status AddOutputTensorShapeTypeByTensorShapeMap(
       const TensorShapeMap& tensor_shape_map, NodeDef* node_def);
 
-  static Status GetOutputTensorShapeType(const NodeDef& node_def,
+  static Status GetOutputTensorShapeType(AttrSlice attrs,
                                          std::vector<DataType>* data_types,
                                          std::vector<TensorShape>* shapes);
 
+  static bool GetOutputTensorShapeType(const GraphDef& graph_def,
+                                       const string& name_and_port,
+                                       DataType* data_type, TensorShape* shape);
+
   static Status PropagateShapeInference(
       const GraphDef& graph_def,
       const std::vector<std::pair<string, Tensor>>& input_node_info_list,
@@ -124,10 +132,92 @@ class RemoteFusedGraphExecuteUtils {
       std::vector<std::pair<string, Tensor>>* inputs,
       std::vector<string>* outputs);
 
+  static Status BuildAndAddTensorShapes(
+      const std::vector<std::pair<string, Tensor>>& input_tensors,
+      const bool dry_run_inference, GraphDef* graph_def);
+
+  // Build remote fused graph execute info
+  static Status BuildRemoteFusedGraphExecuteInfo(
+      const string& executor_name, const GraphDef& subgraph_def,
+      const std::vector<string>& inputs, const std::vector<string>& outputs,
+      const bool require_shape_type, RemoteFusedGraphExecuteInfo* execute_info,
+      DataTypeVector* input_types, DataTypeVector* output_types);
+
+  // Build remote fused graph execute op node by fusing specified subgraph
+  // as remote fused graph execute info
+  static Status BuildRemoteFusedGraphExecuteOpNode(
+      const string& node_name, const string& executor_name,
+      const GraphDef& subgraph_def, const std::vector<string>& inputs,
+      const std::vector<string>& outputs, const bool require_shape_type,
+      Graph* graph, Node** created_node);
+
+  // Build Identity node to forward remote graph node output
+  static Status BuildIdentityOpNode(const string& node_name,
+                                    const string& input_node_name,
+                                    const int input_node_port,
+                                    const DataType dt, Graph* graph,
+                                    Node** created_node);
+
+  // Create clusters of given nodes
+  static Status ClusterizeNodes(const std::unordered_set<string>& node_names,
+                                const GraphDef& graph_def,
+                                std::vector<ClusterInfo>* cluster_infos);
+
+  // Build GraphDef of a given cluster
+  static Status BuildClusterSubgraphDef(const ClusterInfo& cluster,
+                                        const GraphDef& graph_def,
+                                        GraphDef* subgraph_def);
+
+  // Build a cluster by given border
+  // CAVEAT: The border must be consistent for one cluster.
+  static Status BuildClusterByBorder(const std::vector<string>& border_inputs,
+                                     const std::vector<string>& border_outputs,
+                                     const GraphDef& graph_def,
+                                     ClusterInfo* cluster);
+
+  // Fuse one cluster into a newly created RemoteFusedGraphExecuteOp node.
+  // The subgraph is stored as a graph in RemoteFusedGraphExecuteInfo.
+  // CAVEAT1: This transform strips unvisited nodes with given outputs.
+  // CAVEAT2: If you want to use a graph output as a border output,
+  // that graph output node is replaced by an identity node.  Therefore,
+  // the number of output of the node must be 1.
+  static Status FuseCluster(const GraphDef& input_graph_def,
+                            const std::vector<string>& inputs,
+                            const std::vector<string>& outputs,
+                            const string& remote_fused_graph_node_name,
+                            const ClusterInfo& cluster,
+                            const string& remote_graph_executor_name,
+                            const bool require_shape_type,
+                            GraphDef* output_graph_def);
+
+  // Fuse subgraph of specified nodes
+  static Status FuseRemoteGraphByNodeNames(
+      const GraphDef& input_graph_def, const std::vector<string>& inputs,
+      const std::vector<string>& outputs,
+      const string& remote_fused_graph_node_name_prefix,
+      const std::unordered_set<string>& subgraph_nodes,
+      const string& remote_fused_graph_executor_name,
+      const bool require_shape_type, GraphDef* output_graph_def);
+
+  // Fuse subgraph of specified border
+  static Status FuseRemoteGraphByBorder(
+      const GraphDef& input_graph_def, const std::vector<string>& inputs,
+      const std::vector<string>& outputs,
+      const string& remote_fused_graph_node_name,
+      const std::vector<string>& border_inputs,
+      const std::vector<string>& border_outputs,
+      const string& remote_graph_executor_name, const bool require_shape_type,
+      GraphDef* output_graph_def);
+
  private:
   static void EmplaceTensorShapeType(const string& name, const Tensor& tensor,
                                      TensorShapeMap* tensor_shape_map);
 
+  static Status ReplaceInputNodeByPlaceHolder(const string& input,
+                                              const DataType type,
+                                              const TensorShape& shape,
+                                              GraphDef* graph_def);
+
   static ExecutorBuildRegistry* GetExecutorBuildRegistry();
 
   TF_DISALLOW_COPY_AND_ASSIGN(RemoteFusedGraphExecuteUtils);
diff --git a/tensorflow/core/kernels/remote_fused_graph_execute_utils_test.cc b/tensorflow/core/kernels/remote_fused_graph_execute_utils_test.cc
index 52afa5dde11cb9f38c479678ba6c58d216f69022..8bd63d996a7f27d7396266a63ebc5673bc25d55c 100644
--- a/tensorflow/core/kernels/remote_fused_graph_execute_utils_test.cc
+++ b/tensorflow/core/kernels/remote_fused_graph_execute_utils_test.cc
@@ -15,18 +15,24 @@ limitations under the License.
 
 #include "tensorflow/core/kernels/remote_fused_graph_execute_utils.h"
 #include "tensorflow/cc/framework/scope.h"
+#include "tensorflow/cc/ops/array_ops.h"
 #include "tensorflow/cc/ops/const_op.h"
+#include "tensorflow/cc/ops/math_ops.h"
 #include "tensorflow/core/common_runtime/shape_refiner.h"
+#include "tensorflow/core/framework/tensor_testutil.h"
 #include "tensorflow/core/kernels/remote_fused_graph_execute_op_test_utils.h"
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
 #include "tensorflow/core/platform/test.h"
 
 namespace tensorflow {
+namespace {
 
-constexpr const char* const NAME_A = "a";
-constexpr const char* const NAME_B = "b";
-constexpr const char* const NAME_A_PLUS_B = "a_plus_b";
+using ClusterInfo = RemoteFusedGraphExecuteUtils::ClusterInfo;
+
+constexpr const char* const NAME_A = "A";
+constexpr const char* const NAME_B = "B";
+constexpr const char* const NAME_A_PLUS_B = "A_PLUS_B";
 constexpr float NODE_A_VAL = 2.0f;
 constexpr float NODE_B_VAL = 3.0f;
 constexpr float VALUE_TOLERANCE_FLOAT = 1e-8f;
@@ -41,6 +47,154 @@ static NodeDef* GetNodeDef(const string& name, GraphDef* def) {
   return nullptr;
 }
 
+// This function builds the following graph
+//
+//  A         B         C         D         E
+//  |         |         |         |         |
+//  +----+----+         |         +----+----+
+//       |              |              |
+//       F             / \             G
+//       |            |   |           / \
+//       +-----+------+   +-----+----+   +
+//             |                |        |
+//             H                I        |
+//             |                |        |
+//             +-------+--------+        |
+//                     |                 |
+//                     J                 |
+//                     |                 |
+//                     +--------+--------+
+//                              |
+//                              K
+//
+Status BuildMultipleAddGraph(GraphDef* graph_def) {
+  Scope root = tensorflow::Scope::NewRootScope();
+
+  Tensor a_data(DT_FLOAT, TensorShape({1, 1, 1, 1}));
+  test::FillIota<float>(&a_data, 1.0f);
+  Output a_const = ops::Const(root.WithOpName("A"), Input::Initializer(a_data));
+
+  Tensor b_data(DT_FLOAT, TensorShape({1, 1, 1, 1}));
+  test::FillIota<float>(&b_data, 1.0f);
+  Output b_const = ops::Const(root.WithOpName("B"), Input::Initializer(b_data));
+
+  Tensor c_data(DT_FLOAT, TensorShape({1, 1, 1, 1}));
+  test::FillIota<float>(&c_data, 1.0f);
+  Output c_const = ops::Const(root.WithOpName("C"), Input::Initializer(c_data));
+
+  Tensor d_data(DT_FLOAT, TensorShape({1, 1, 1, 1}));
+  test::FillIota<float>(&d_data, 1.0f);
+  Output d_const = ops::Const(root.WithOpName("D"), Input::Initializer(d_data));
+
+  Tensor e_data(DT_FLOAT, TensorShape({1, 1, 1, 1}));
+  test::FillIota<float>(&e_data, 1.0f);
+  Output e_const = ops::Const(root.WithOpName("E"), Input::Initializer(e_data));
+
+  Output f_add = ops::Add(root.WithOpName("F"), a_const, b_const);
+
+  Output g_add = ops::Add(root.WithOpName("G"), d_const, e_const);
+
+  Output h_add = ops::Add(root.WithOpName("H"), f_add, c_const);
+
+  Output i_add = ops::Add(root.WithOpName("I"), c_const, g_add);
+
+  Output j_add = ops::Add(root.WithOpName("J"), h_add, i_add);
+
+  Output k_add = ops::Add(root.WithOpName("K"), j_add, g_add);
+
+  TF_RETURN_IF_ERROR(root.ToGraphDef(graph_def));
+
+  return Status::OK();
+}
+
+class FuseRemoteGraphMultipleAddOpsTest : public ::testing::Test {
+ protected:
+  void SetUp() final { TF_ASSERT_OK(BuildMultipleAddGraph(&graph_def_)); }
+
+  void TearDown() final {}
+
+  Status FuseByInOut() {
+    // Feed output shapes and types
+    RemoteFusedGraphExecuteUtils::TensorShapeMap tensor_shape_map;
+    GraphDef graph_def_with_shapetype = graph_def_;
+    TF_RETURN_IF_ERROR(RemoteFusedGraphExecuteUtils::BuildAndAddTensorShapes(
+        input_tensors_, /*dry_run_inference*/ true, &graph_def_with_shapetype));
+
+    return RemoteFusedGraphExecuteUtils::FuseRemoteGraphByBorder(
+        graph_def_with_shapetype, inputs_, outputs_,
+        "remote_fused_graph_node_names", subgraph_input_names_,
+        subgraph_output_names_, "remote_graph_executor_name",
+        /*require_shape_type=*/true, &result_graph_def_);
+  }
+
+  Status FuseByNodes() {
+    return RemoteFusedGraphExecuteUtils::FuseRemoteGraphByNodeNames(
+        graph_def_, inputs_, outputs_, "remote_fused_graph_node_names",
+        subgraph_node_names_, "remote_graph_executor_name",
+        /*require_shape_type=*/false, &result_graph_def_);
+  }
+
+ public:
+  const std::vector<std::pair<string, Tensor>> input_tensors_{
+      {"A", {DT_FLOAT, {1, 1, 1, 1}}}};
+  const std::vector<string> inputs_{"A"};
+  const std::vector<string> outputs_{"K"};
+  GraphDef graph_def_;
+  GraphDef result_graph_def_;
+  std::vector<string> subgraph_input_names_;
+  std::vector<string> subgraph_output_names_;
+  std::unordered_set<string> subgraph_node_names_;
+};
+
+void SetSubgraphArguments(const std::vector<string>& input_names,
+                          const std::vector<string>& output_names,
+                          FuseRemoteGraphMultipleAddOpsTest* fixture) {
+  for (const string& input_name : input_names) {
+    fixture->subgraph_input_names_.emplace_back(input_name);
+  }
+
+  fixture->subgraph_output_names_ = output_names;
+}
+
+template <typename T>
+static string IterToString(const T& set) {
+  string out;
+  for (const string& val : set) {
+    if (!out.empty()) {
+      out += ", ";
+    }
+    out += val;
+  }
+  return out;
+}
+
+static string SummarizeGraphDef(const GraphDef& graph_def) {
+  string out;
+  for (const NodeDef& node : graph_def.node()) {
+    out += strings::StrCat("node: ", node.name(), "\n    input: ");
+    for (const string& input : node.input()) {
+      out += strings::StrCat(input, ", ");
+    }
+    out += "\n";
+  }
+  return out;
+}
+
+static string DumpInOutNames(const std::vector<ClusterInfo>& ci_vec) {
+  for (int i = 0; i < ci_vec.size(); ++i) {
+    LOG(INFO) << "Cluster(" << i << ")";
+    LOG(INFO) << "input: " << IterToString(std::get<1>(ci_vec.at(i)));
+    LOG(INFO) << "output: " << IterToString(std::get<2>(ci_vec.at(i)));
+  }
+  return "";
+}
+
+static void ClearCluster(ClusterInfo* cluster) {
+  std::get<0>(*cluster).clear();
+  std::get<1>(*cluster).clear();
+  std::get<2>(*cluster).clear();
+}
+
 TEST(RemoteFusedGraphExecuteUtils, DryRunAddGraphA) {
   GraphDef def = RemoteFusedGraphExecuteOpTestUtils::BuildAddGraph(
       NAME_A, NODE_A_VAL, NAME_B, NODE_B_VAL, NAME_A_PLUS_B);
@@ -104,7 +258,7 @@ TEST(RemoteFusedGraphExecuteUtils, DryRunAddGraphAB) {
 }
 
 TEST(RemoteFusedGraphExecuteUtils, DryRunAddGraphForAllNodes) {
-  // Set Node "a" as an input with value (= 1.0f)
+  // Set Node "A" as an input with value (= 1.0f)
   std::pair<string, Tensor> input_node_info_a;
   input_node_info_a.first = NAME_A;
   input_node_info_a.second = Tensor(DT_FLOAT, {});
@@ -226,4 +380,294 @@ TEST(RemoteFusedGraphExecuteUtils, PropagateAndBuildTensorShapeMap) {
   }
 }
 
+TEST(RemoteFusedGraphExecuteUtils,
+     BuildRemoteFusedGraphExecuteInfoWithShapeInference) {
+  // Build inputs
+  std::pair<string, Tensor> input_node_info_a;
+  input_node_info_a.first = NAME_A;
+  input_node_info_a.second = Tensor(DT_FLOAT, {});
+  input_node_info_a.second.scalar<float>()() = NODE_A_VAL;
+  std::pair<string, Tensor> input_node_info_b;
+  input_node_info_b.first = NAME_B;
+  input_node_info_b.second = Tensor(DT_FLOAT, {});
+  input_node_info_b.second.scalar<float>()() = NODE_B_VAL;
+  const std::vector<std::pair<string, Tensor>> input_tensors{input_node_info_a,
+                                                             input_node_info_b};
+  const std::vector<string> inputs{NAME_A, NAME_B};
+
+  // Build outputs
+  const std::vector<string> outputs = {NAME_A_PLUS_B};
+
+  GraphDef def = RemoteFusedGraphExecuteOpTestUtils::BuildAddGraph(
+      NAME_A, NODE_A_VAL, NAME_B, NODE_B_VAL, NAME_A_PLUS_B);
+  TF_ASSERT_OK(RemoteFusedGraphExecuteUtils::BuildAndAddTensorShapes(
+      input_tensors, /*dry_run_inference*/ true, &def));
+
+  RemoteFusedGraphExecuteInfo execute_info0;
+  DataTypeVector input_types0;
+  DataTypeVector output_types0;
+
+  TF_ASSERT_OK(RemoteFusedGraphExecuteUtils::BuildRemoteFusedGraphExecuteInfo(
+      "executor", def, inputs, outputs, /*require_shape_type=*/true,
+      &execute_info0, &input_types0, &output_types0));
+
+  EXPECT_EQ(inputs.size(),
+            execute_info0.default_graph_input_tensor_shape_size());
+  EXPECT_EQ(outputs.size(),
+            execute_info0.default_graph_output_tensor_shape_size());
+  EXPECT_EQ(inputs.size(), input_types0.size());
+  EXPECT_EQ(outputs.size(), output_types0.size());
+
+  EXPECT_EQ(def.node_size(), execute_info0.remote_graph().node_size());
+}
+
+TEST(RemoteFusedGraphExecuteUtils, BuildRemoteFusedGraphExecuteOpNode) {
+  const std::vector<string> inputs{NAME_A, NAME_B};
+
+  // Build outputs
+  const std::vector<string> outputs = {NAME_A_PLUS_B};
+
+  const GraphDef def = RemoteFusedGraphExecuteOpTestUtils::BuildAddGraph(
+      NAME_A, NODE_A_VAL, NAME_B, NODE_B_VAL, NAME_A_PLUS_B);
+
+  Graph graph(OpRegistry::Global());
+  ShapeRefiner shape_refiner(graph.versions().producer(), graph.op_registry());
+  TF_ASSERT_OK(ImportGraphDef({}, def, &graph, &shape_refiner));
+
+  Node* node;
+  TF_ASSERT_OK(RemoteFusedGraphExecuteUtils::BuildRemoteFusedGraphExecuteOpNode(
+      "fused_name", "executor", def, inputs, outputs,
+      /*require_shape_type=*/false, &graph, &node));
+}
+
+TEST(RemoteFusedGraphExecuteUtils, ExtractSubgraphNodes) {
+  GraphDef graph_def;
+  TF_ASSERT_OK(BuildMultipleAddGraph(&graph_def));
+  ClusterInfo cluster;
+  const std::unordered_set<string>& node_names = std::get<0>(cluster);
+  TF_ASSERT_OK(RemoteFusedGraphExecuteUtils::BuildClusterByBorder(
+      {"H", "I"}, {"J"}, graph_def, &cluster));
+  EXPECT_EQ(1, node_names.size()) << IterToString(node_names);
+
+  ClearCluster(&cluster);
+  TF_ASSERT_OK(RemoteFusedGraphExecuteUtils::BuildClusterByBorder(
+      {"F", "C", "G"}, {"J"}, graph_def, &cluster));
+  EXPECT_EQ(3, node_names.size()) << IterToString(node_names);
+
+  ClearCluster(&cluster);
+  TF_ASSERT_OK(RemoteFusedGraphExecuteUtils::BuildClusterByBorder(
+      {"A", "B", "C", "D", "E"}, {"J"}, graph_def, &cluster));
+  EXPECT_EQ(5, node_names.size()) << IterToString(node_names);
+
+  ClearCluster(&cluster);
+  TF_ASSERT_OK(RemoteFusedGraphExecuteUtils::BuildClusterByBorder(
+      {"A", "B", "C", "D", "E"}, {"K"}, graph_def, &cluster));
+  EXPECT_EQ(6, node_names.size()) << IterToString(node_names);
+
+  ClearCluster(&cluster);
+  TF_ASSERT_OK(RemoteFusedGraphExecuteUtils::BuildClusterByBorder(
+      {"F"}, {"H"}, graph_def, &cluster));
+  EXPECT_EQ(2, node_names.size()) << IterToString(node_names);
+}
+
+TEST(RemoteFusedGraphExecuteUtils, ClusterizeNodes) {
+  GraphDef graph_def;
+  TF_ASSERT_OK(BuildMultipleAddGraph(&graph_def));
+
+  std::vector<ClusterInfo> ci_vec;
+  TF_ASSERT_OK(
+      RemoteFusedGraphExecuteUtils::ClusterizeNodes({"J"}, graph_def, &ci_vec));
+  ASSERT_EQ(1, ci_vec.size());
+  EXPECT_EQ(2, std::get<1>(ci_vec.at(0)).size()) << DumpInOutNames(ci_vec);
+  EXPECT_EQ(1, std::get<2>(ci_vec.at(0)).size()) << DumpInOutNames(ci_vec);
+
+  ci_vec.clear();
+  TF_ASSERT_OK(RemoteFusedGraphExecuteUtils::ClusterizeNodes(
+      {"H", "I", "J"}, graph_def, &ci_vec));
+  ASSERT_EQ(1, ci_vec.size());
+  EXPECT_EQ(3, std::get<1>(ci_vec.at(0)).size()) << DumpInOutNames(ci_vec);
+  EXPECT_EQ(1, std::get<2>(ci_vec.at(0)).size()) << DumpInOutNames(ci_vec);
+
+  ci_vec.clear();
+  TF_ASSERT_OK(RemoteFusedGraphExecuteUtils::ClusterizeNodes(
+      {"F", "C", "G", "H", "I", "J"}, graph_def, &ci_vec));
+  ASSERT_EQ(1, ci_vec.size());
+  EXPECT_EQ(4, std::get<1>(ci_vec.at(0)).size()) << DumpInOutNames(ci_vec);
+  EXPECT_EQ(2, std::get<2>(ci_vec.at(0)).size()) << DumpInOutNames(ci_vec);
+
+  ci_vec.clear();
+  TF_ASSERT_OK(RemoteFusedGraphExecuteUtils::ClusterizeNodes(
+      {"A", "B", "C", "D", "E"}, graph_def, &ci_vec));
+  ASSERT_EQ(5, ci_vec.size());
+
+  ci_vec.clear();
+  TF_ASSERT_OK(RemoteFusedGraphExecuteUtils::ClusterizeNodes(
+      {"A", "B", "D", "E", "F", "G"}, graph_def, &ci_vec));
+  ASSERT_EQ(2, ci_vec.size());
+}
+
+TEST(RemoteFusedGraphExecuteUtils, BuildSubgraphDefByInOut) {
+  GraphDef graph_def;
+  TF_ASSERT_OK(BuildMultipleAddGraph(&graph_def));
+
+  ClusterInfo cluster;
+  GraphDef subgraph_def;
+  TF_ASSERT_OK(RemoteFusedGraphExecuteUtils::BuildClusterByBorder(
+      std::vector<string>{"H", "I"}, std::vector<string>{"J"}, graph_def,
+      &cluster));
+  TF_ASSERT_OK(RemoteFusedGraphExecuteUtils::BuildClusterSubgraphDef(
+      cluster, graph_def, &subgraph_def));
+  EXPECT_EQ(3, subgraph_def.node_size());
+
+  ClearCluster(&cluster);
+  subgraph_def.Clear();
+  TF_ASSERT_OK(RemoteFusedGraphExecuteUtils::BuildClusterByBorder(
+      std::vector<string>{"F", "C", "G"}, std::vector<string>{"J"}, graph_def,
+      &cluster));
+  TF_ASSERT_OK(RemoteFusedGraphExecuteUtils::BuildClusterSubgraphDef(
+      cluster, graph_def, &subgraph_def));
+  EXPECT_EQ(6, subgraph_def.node_size());
+
+  ClearCluster(&cluster);
+  subgraph_def.Clear();
+  TF_ASSERT_OK(RemoteFusedGraphExecuteUtils::BuildClusterByBorder(
+      std::vector<string>{"A", "B", "C", "D", "E"}, std::vector<string>{"J"},
+      graph_def, &cluster));
+  TF_ASSERT_OK(RemoteFusedGraphExecuteUtils::BuildClusterSubgraphDef(
+      cluster, graph_def, &subgraph_def));
+  EXPECT_EQ(10, subgraph_def.node_size());
+
+  ClearCluster(&cluster);
+  subgraph_def.Clear();
+
+  TF_ASSERT_OK(RemoteFusedGraphExecuteUtils::BuildClusterByBorder(
+      std::vector<string>{"A", "B", "C", "D", "E"}, std::vector<string>{"K"},
+      graph_def, &cluster));
+  TF_ASSERT_OK(RemoteFusedGraphExecuteUtils::BuildClusterSubgraphDef(
+      cluster, graph_def, &subgraph_def));
+  EXPECT_EQ(11, subgraph_def.node_size());
+
+  ClearCluster(&cluster);
+  subgraph_def.Clear();
+  TF_ASSERT_OK(RemoteFusedGraphExecuteUtils::BuildClusterByBorder(
+      std::vector<string>{"F"}, std::vector<string>{"H"}, graph_def, &cluster));
+  TF_ASSERT_OK(RemoteFusedGraphExecuteUtils::BuildClusterSubgraphDef(
+      cluster, graph_def, &subgraph_def));
+  EXPECT_EQ(3, subgraph_def.node_size());
+}
+
+TEST_F(FuseRemoteGraphMultipleAddOpsTest, FuseSubgraphByInOut_hi_j) {
+  SetSubgraphArguments(std::vector<string>{"H", "I"}, std::vector<string>{"J"},
+                       this);
+
+  TF_ASSERT_OK(FuseByInOut());
+
+  EXPECT_EQ(11, graph_def_.node_size());
+  EXPECT_EQ(11, result_graph_def_.node_size())
+      << "=== Before: \n"
+      << SummarizeGraphDef(graph_def_) << "\n\n\n=== After: \n"
+      << SummarizeGraphDef(result_graph_def_);
+}
+
+TEST_F(FuseRemoteGraphMultipleAddOpsTest, FuseSubgraphByInOut_fcg_j) {
+  SetSubgraphArguments(std::vector<string>{"F", "C", "G"},
+                       std::vector<string>{"J"}, this);
+
+  TF_ASSERT_OK(FuseByInOut());
+
+  EXPECT_EQ(11, graph_def_.node_size());
+  EXPECT_EQ(9, result_graph_def_.node_size())
+      << "=== Before: \n"
+      << SummarizeGraphDef(graph_def_) << "\n\n\n=== After: \n"
+      << SummarizeGraphDef(result_graph_def_);
+}
+
+TEST_F(FuseRemoteGraphMultipleAddOpsTest, FuseSubgraphByInOut_abcde_j) {
+  SetSubgraphArguments(std::vector<string>{"A", "B", "C", "D", "E"},
+                       std::vector<string>{"J"}, this);
+
+  TF_ASSERT_OK(FuseByInOut());
+
+  EXPECT_EQ(11, graph_def_.node_size());
+  EXPECT_EQ(8, result_graph_def_.node_size())
+      << "=== Before: \n"
+      << SummarizeGraphDef(graph_def_) << "\n\n\n=== After: \n"
+      << SummarizeGraphDef(result_graph_def_);
+}
+
+TEST_F(FuseRemoteGraphMultipleAddOpsTest, FuseSubgraphByInOut_abcde_k) {
+  SetSubgraphArguments(std::vector<string>{"A", "B", "C", "D", "E"},
+                       std::vector<string>{"K"}, this);
+
+  TF_ASSERT_OK(FuseByInOut());
+
+  EXPECT_EQ(11, graph_def_.node_size());
+  EXPECT_EQ(7, result_graph_def_.node_size())
+      << "=== Before: \n"
+      << SummarizeGraphDef(graph_def_) << "\n\n\n=== After: \n"
+      << SummarizeGraphDef(result_graph_def_);
+}
+
+TEST_F(FuseRemoteGraphMultipleAddOpsTest, FuseSubgraphByNodes_h) {
+  subgraph_node_names_ = {"H"};
+
+  TF_ASSERT_OK(FuseByNodes());
+
+  EXPECT_EQ(11, graph_def_.node_size());
+  EXPECT_EQ(11, result_graph_def_.node_size())
+      << "=== Before: \n"
+      << SummarizeGraphDef(graph_def_) << "\n\n\n=== After: \n"
+      << SummarizeGraphDef(result_graph_def_);
+}
+
+TEST_F(FuseRemoteGraphMultipleAddOpsTest, FuseSubgraphByNodes_hij) {
+  subgraph_node_names_ = {"H", "I", "J"};
+
+  TF_ASSERT_OK(FuseByNodes());
+
+  EXPECT_EQ(11, graph_def_.node_size());
+  EXPECT_EQ(9, result_graph_def_.node_size())
+      << "=== Before: \n"
+      << SummarizeGraphDef(graph_def_) << "\n\n\n=== After: \n"
+      << SummarizeGraphDef(result_graph_def_);
+}
+
+TEST_F(FuseRemoteGraphMultipleAddOpsTest, FuseSubgraphByNodes_cfghij) {
+  subgraph_node_names_ = {"C", "F", "G", "H", "I", "J"};
+
+  TF_ASSERT_OK(FuseByNodes());
+
+  EXPECT_EQ(11, graph_def_.node_size());
+  EXPECT_EQ(6, result_graph_def_.node_size())
+      << "=== Before: \n"
+      << SummarizeGraphDef(graph_def_) << "\n\n\n=== After: \n"
+      << SummarizeGraphDef(result_graph_def_);
+}
+
+TEST_F(FuseRemoteGraphMultipleAddOpsTest, FuseSubgraphByNodes_abcdefghij) {
+  subgraph_node_names_ = {"A", "B", "C", "D", "E", "F", "G", "H", "I", "J"};
+
+  TF_ASSERT_OK(FuseByNodes());
+
+  EXPECT_EQ(11, graph_def_.node_size());
+  EXPECT_EQ(3, result_graph_def_.node_size())  // "A", "RFG", "K"
+      << "=== Before: \n"
+      << SummarizeGraphDef(graph_def_) << "\n\n\n=== After: \n"
+      << SummarizeGraphDef(result_graph_def_);
+}
+
+TEST_F(FuseRemoteGraphMultipleAddOpsTest, FuseSubgraphByNodes_abcdefghijk) {
+  subgraph_node_names_ = {"A", "B", "C", "D", "E", "F",
+                          "G", "H", "I", "J", "K"};
+
+  TF_ASSERT_OK(FuseByNodes());
+
+  EXPECT_EQ(11, graph_def_.node_size());
+  EXPECT_EQ(3, result_graph_def_.node_size())  // "A", "RFG", "K"
+      << "=== Before: \n"
+      << SummarizeGraphDef(graph_def_) << "\n\n\n=== After: \n"
+      << SummarizeGraphDef(result_graph_def_);
+}
+
+}  // namespace
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/resize_nearest_neighbor_op.cc b/tensorflow/core/kernels/resize_nearest_neighbor_op.cc
index a38fb222237f9f5938f6b68fd909fc9fb768fc66..bfd29b7ec89e6a2d0e2757db31b707be70d12c1d 100644
--- a/tensorflow/core/kernels/resize_nearest_neighbor_op.cc
+++ b/tensorflow/core/kernels/resize_nearest_neighbor_op.cc
@@ -16,6 +16,8 @@ limitations under the License.
 // See docs in ../ops/image_ops.cc
 #define EIGEN_USE_THREADS
 
+#include "tensorflow/core/kernels/resize_nearest_neighbor_op.h"
+
 #include <memory>
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
 #include "tensorflow/core/framework/op_kernel.h"
@@ -27,13 +29,10 @@ limitations under the License.
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/platform/logging.h"
 
-#if GOOGLE_CUDA
-#include "tensorflow/core/kernels/resize_nearest_neighbor_op_gpu.h"
-#endif  // GOOGLE_CUDA
-
 namespace tensorflow {
 
 typedef Eigen::ThreadPoolDevice CPUDevice;
+typedef Eigen::GpuDevice GPUDevice;
 
 template <typename Device, typename T>
 class ResizeNearestNeighborOp : public OpKernel {
@@ -54,22 +53,27 @@ class ResizeNearestNeighborOp : public OpKernel {
                 errors::InvalidArgument("nearest neighbor requires max height "
                                         "& width of 2^24"));
 
+    // Return if the output is empty.
+    if (st.output->NumElements() == 0) return;
+
     typename TTypes<T, 4>::ConstTensor input_data = input.tensor<T, 4>();
     typename TTypes<T, 4>::Tensor output_data = st.output->tensor<T, 4>();
 
-    for (int b = 0; b < st.batch_size; ++b) {
-      for (int y = 0; y < st.out_height; ++y) {
-        const int64 in_y =
-            std::min(static_cast<int64>(floorf(y * st.height_scale)),
-                     (st.in_height - 1));
-        for (int x = 0; x < st.out_width; ++x) {
-          const int64 in_x =
-              std::min(static_cast<int64>(floorf(x * st.width_scale)),
-                       (st.in_width - 1));
-          std::copy_n(&input_data(b, in_y, in_x, 0), st.channels,
-                      &output_data(b, y, x, 0));
-        }
-      }
+    bool status;
+    if (align_corners_) {
+      status =
+          functor::ResizeNearestNeighbor<Device, T, /*align_corners=*/true>()(
+              context->eigen_device<Device>(), input_data, st.height_scale,
+              st.width_scale, output_data);
+    } else {
+      status =
+          functor::ResizeNearestNeighbor<Device, T, /*align_corners=*/false>()(
+              context->eigen_device<Device>(), input_data, st.height_scale,
+              st.width_scale, output_data);
+    }
+    if (!status) {
+      context->SetStatus(
+          errors::Internal("Failed launching ResizeNearestNeighbor"));
     }
   }
 
@@ -77,6 +81,41 @@ class ResizeNearestNeighborOp : public OpKernel {
   bool align_corners_;
 };
 
+// Partial specialization of ResizeNearestNeighbor functor for a CPUDevice.
+namespace functor {
+template <typename T, bool align_corners>
+struct ResizeNearestNeighbor<CPUDevice, T, align_corners> {
+  bool operator()(const CPUDevice& d, typename TTypes<T, 4>::ConstTensor input,
+                  const float height_scale, const float width_scale,
+                  typename TTypes<T, 4>::Tensor output) {
+    const int batch_size = input.dimension(0);
+    const int64 in_height = input.dimension(1);
+    const int64 in_width = input.dimension(2);
+    const int channels = input.dimension(3);
+
+    const int64 out_height = output.dimension(1);
+    const int64 out_width = output.dimension(2);
+
+    for (int b = 0; b < batch_size; ++b) {
+      for (int y = 0; y < out_height; ++y) {
+        const int64 in_y = std::min(
+            (align_corners) ? static_cast<int64>(roundf(y * height_scale))
+                            : static_cast<int64>(floorf(y * height_scale)),
+            in_height - 1);
+        for (int x = 0; x < out_width; ++x) {
+          const int64 in_x = std::min(
+              (align_corners) ? static_cast<int64>(roundf(x * width_scale))
+                              : static_cast<int64>(floorf(x * width_scale)),
+              in_width - 1);
+          std::copy_n(&input(b, in_y, in_x, 0), channels, &output(b, y, x, 0));
+        }
+      }
+    }
+    return true;
+  }
+};
+}  // namespace functor
+
 template <typename Device, typename T>
 class ResizeNearestNeighborOpGrad : public OpKernel {
  public:
@@ -105,22 +144,23 @@ class ResizeNearestNeighborOpGrad : public OpKernel {
     OP_REQUIRES(context, sizes(0) > 0 && sizes(1) > 0,
                 errors::InvalidArgument("shape_t's elements must be positive"));
 
-    // Initialize shape to the batch size of the input, then add
-    // the rest of the dimensions
-    Tensor* output = nullptr;
-    OP_REQUIRES_OK(context, context->allocate_output(
-                                0,
-                                TensorShape({input.dim_size(0), sizes(0),
-                                             sizes(1), input.dim_size(3)}),
-                                &output));
-
     const int64 batch_size = input.dim_size(0);
     const int64 in_height = input.dim_size(1);
     const int64 in_width = input.dim_size(2);
     const int64 channels = input.dim_size(3);
 
-    const int64 out_height = output->dim_size(1);
-    const int64 out_width = output->dim_size(2);
+    const int64 out_height = sizes(0);
+    const int64 out_width = sizes(1);
+
+    Tensor* output = nullptr;
+    OP_REQUIRES_OK(
+        context,
+        context->allocate_output(
+            0, TensorShape({batch_size, out_height, out_width, channels}),
+            &output));
+
+    // Return if the output is empty.
+    if (output->NumElements() == 0) return;
 
     typename TTypes<T, 4>::ConstTensor input_data = input.tensor<T, 4>();
     typename TTypes<T, 4>::Tensor output_data = output->tensor<T, 4>();
@@ -129,28 +169,67 @@ class ResizeNearestNeighborOpGrad : public OpKernel {
         CalculateResizeScale(out_height, in_height, align_corners_);
     const float width_scale =
         CalculateResizeScale(out_width, in_width, align_corners_);
-    output_data.setZero();
 
-    for (int c = 0; c < channels; ++c) {
-      for (int y = 0; y < in_height; ++y) {
-        const int64 out_y = std::min(
-            static_cast<int64>(floorf(y * height_scale)), (out_height - 1));
+    bool status;
+    if (align_corners_) {
+      status = functor::ResizeNearestNeighborGrad<Device, T,
+                                                  /*align_corners=*/true>()(
+          context->eigen_device<Device>(), input_data, height_scale,
+          width_scale, output_data);
+    } else {
+      status = functor::ResizeNearestNeighborGrad<Device, T,
+                                                  /*align_corners=*/false>()(
+          context->eigen_device<Device>(), input_data, height_scale,
+          width_scale, output_data);
+    }
+    if (!status) {
+      context->SetStatus(
+          errors::Internal("Failed launching ResizeNearestNeighborGrad"));
+    }
+  }
 
-        for (int x = 0; x < in_width; ++x) {
-          const int64 out_x = std::min(
-              static_cast<int64>(floorf(x * width_scale)), (out_width - 1));
+ private:
+  bool align_corners_;
+};
 
-          for (int b = 0; b < batch_size; ++b) {
-            output_data(b, out_y, out_x, c) += input_data(b, y, x, c);
+// Partial specialization of ResizeNearestNeighborGrad functor for a CPUDevice.
+namespace functor {
+template <typename T, bool align_corners>
+struct ResizeNearestNeighborGrad<CPUDevice, T, align_corners> {
+  bool operator()(const CPUDevice& d, typename TTypes<T, 4>::ConstTensor input,
+                  const float height_scale, const float width_scale,
+                  typename TTypes<T, 4>::Tensor output) {
+    const int batch_size = input.dimension(0);
+    const int64 in_height = input.dimension(1);
+    const int64 in_width = input.dimension(2);
+    const int channels = input.dimension(3);
+
+    const int64 out_height = output.dimension(1);
+    const int64 out_width = output.dimension(2);
+
+    output.setZero();
+
+    for (int y = 0; y < in_height; ++y) {
+      const int64 out_y = std::min(
+          (align_corners) ? static_cast<int64>(roundf(y * height_scale))
+                          : static_cast<int64>(floorf(y * height_scale)),
+          out_height - 1);
+      for (int x = 0; x < in_width; ++x) {
+        const int64 out_x = std::min(
+            (align_corners) ? static_cast<int64>(roundf(x * width_scale))
+                            : static_cast<int64>(floorf(x * width_scale)),
+            out_width - 1);
+        for (int b = 0; b < batch_size; ++b) {
+          for (int c = 0; c < channels; ++c) {
+            output(b, out_y, out_x, c) += input(b, y, x, c);
           }
         }
       }
     }
+    return true;
   }
-
- private:
-  bool align_corners_;
 };
+}  // namespace functor
 
 #define REGISTER_KERNEL(T)                                        \
   REGISTER_KERNEL_BUILDER(Name("ResizeNearestNeighbor")           \
@@ -170,121 +249,22 @@ TF_CALL_REAL_NUMBER_TYPES(REGISTER_KERNEL);
 
 #if GOOGLE_CUDA
 
-template <typename T>
-class ResizeNearestNeighborGPUOp : public OpKernel {
- public:
-  explicit ResizeNearestNeighborGPUOp(OpKernelConstruction* context)
-      : OpKernel(context) {
-    OP_REQUIRES_OK(context, context->GetAttr("align_corners", &align_corners_));
-  }
-
-  void Compute(OpKernelContext* context) override {
-    const Tensor& input = context->input(0);
-    ImageResizerState st(align_corners_);
-    st.ValidateAndCreateOutput(context, input);
-    if (!context->status().ok()) return;
-
-    bool status = ResizeNearestNeighbor<T>(
-        input.flat<T>().data(), st.batch_size, st.in_height, st.in_width,
-        st.channels, st.out_height, st.out_width, st.height_scale,
-        st.width_scale, st.output->flat<T>().data(),
-        context->eigen_gpu_device());
-
-    if (!status) {
-      context->SetStatus(
-          errors::Internal("Failed launching ResizeNearestNeighbor"));
-    }
-  }
-
- private:
-  bool align_corners_;
-};
-
-#define REGISTER_KERNEL(T)                              \
-  REGISTER_KERNEL_BUILDER(Name("ResizeNearestNeighbor") \
-                              .Device(DEVICE_GPU)       \
-                              .TypeConstraint<T>("T")   \
-                              .HostMemory("size"),      \
-                          ResizeNearestNeighborGPUOp<T>);
+#define REGISTER_KERNEL(T)                                        \
+  REGISTER_KERNEL_BUILDER(Name("ResizeNearestNeighbor")           \
+                              .Device(DEVICE_GPU)                 \
+                              .TypeConstraint<T>("T")             \
+                              .HostMemory("size"),                \
+                          ResizeNearestNeighborOp<GPUDevice, T>); \
+  REGISTER_KERNEL_BUILDER(Name("ResizeNearestNeighborGrad")       \
+                              .Device(DEVICE_GPU)                 \
+                              .TypeConstraint<T>("T")             \
+                              .HostMemory("size"),                \
+                          ResizeNearestNeighborOpGrad<GPUDevice, T>);
 
 TF_CALL_GPU_NUMBER_TYPES(REGISTER_KERNEL);
 
 #undef REGISTER_KERNEL
 
-template <typename T>
-class ResizeNearestNeighborGPUOpGrad : public OpKernel {
- public:
-  explicit ResizeNearestNeighborGPUOpGrad(OpKernelConstruction* context)
-      : OpKernel(context) {
-    OP_REQUIRES_OK(context, context->GetAttr("align_corners", &align_corners_));
-  }
-
-  void Compute(OpKernelContext* context) override {
-    // Grab and validate the input:
-    const Tensor& input = context->input(0);
-    OP_REQUIRES(context, input.dims() == 4,
-                errors::InvalidArgument("input must be 4-dimensional",
-                                        input.shape().DebugString()));
-
-    // Grab and validate the output shape:
-    const Tensor& shape_t = context->input(1);
-    OP_REQUIRES(context, shape_t.dims() == 1,
-                errors::InvalidArgument("shape_t must be 1-dimensional",
-                                        shape_t.shape().DebugString()));
-    OP_REQUIRES(context, shape_t.NumElements() == 2,
-                errors::InvalidArgument("shape_t must have two elements",
-                                        shape_t.shape().DebugString()));
-
-    auto sizes = shape_t.vec<int32>();
-    OP_REQUIRES(context, sizes(0) > 0 && sizes(1) > 0,
-                errors::InvalidArgument("shape_t's elements must be positive"));
-
-    // Initialize shape to the batch size of the input, then add
-    // the rest of the dimensions
-    Tensor* output = nullptr;
-    OP_REQUIRES_OK(context, context->allocate_output(
-                                0,
-                                TensorShape({input.dim_size(0), sizes(0),
-                                             sizes(1), input.dim_size(3)}),
-                                &output));
-
-    const int64 batch_size = input.dim_size(0);
-    const int64 in_height = input.dim_size(1);
-    const int64 in_width = input.dim_size(2);
-    const int64 channels = input.dim_size(3);
-
-    const int64 out_height = output->dim_size(1);
-    const int64 out_width = output->dim_size(2);
-
-    const float height_scale =
-        CalculateResizeScale(out_height, in_height, align_corners_);
-    const float width_scale =
-        CalculateResizeScale(out_width, in_width, align_corners_);
-
-    bool status = ResizeNearestNeighborBackward(
-        input.flat<T>().data(), batch_size, in_height, in_width, channels,
-        out_height, out_width, height_scale, width_scale,
-        output->flat<T>().data(), context->eigen_gpu_device());
-
-    if (!status) {
-      context->SetStatus(
-          errors::Internal("Failed launching ResizeNearestNeighborGrad"));
-    }
-  }
-  bool align_corners_;
-};
-
-#define REGISTER_KERNEL(T)                                  \
-  REGISTER_KERNEL_BUILDER(Name("ResizeNearestNeighborGrad") \
-                              .Device(DEVICE_GPU)           \
-                              .TypeConstraint<T>("T")       \
-                              .HostMemory("size"),          \
-                          ResizeNearestNeighborGPUOpGrad<T>);
-
-TF_CALL_GPU_NUMBER_TYPES_NO_HALF(REGISTER_KERNEL);
-
-#undef REGISTER_KERNEL
-
 #endif  // GOOGLE_CUDA
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/resize_nearest_neighbor_op.h b/tensorflow/core/kernels/resize_nearest_neighbor_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..9db331ffdcd6c1a1b11c3ab6271d0a949dec6630
--- /dev/null
+++ b/tensorflow/core/kernels/resize_nearest_neighbor_op.h
@@ -0,0 +1,43 @@
+/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CORE_KERNELS_RESIZE_NEAREST_NEIGHBOR_OP_H_
+#define TENSORFLOW_CORE_KERNELS_RESIZE_NEAREST_NEIGHBOR_OP_H_
+
+#include "tensorflow/core/framework/tensor_types.h"
+#include "tensorflow/core/platform/types.h"
+
+namespace tensorflow {
+namespace functor {
+
+template <typename Device, typename T, bool align_corners>
+struct ResizeNearestNeighbor {
+  bool operator()(const Device& d, typename TTypes<T, 4>::ConstTensor input,
+                  const float height_scale, const float width_scale,
+                  typename TTypes<T, 4>::Tensor output);
+};
+
+template <typename Device, typename T, bool align_corners>
+struct ResizeNearestNeighborGrad {
+  bool operator()(const Device& d,
+                  typename TTypes<T, 4>::ConstTensor input_grad,
+                  const float height_scale, const float width_scale,
+                  typename TTypes<T, 4>::Tensor output_grad);
+};
+
+}  // namespace functor
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_CORE_KERNELS_RESIZE_NEAREST_NEIGHBOR_OP_H_
diff --git a/tensorflow/core/kernels/resize_nearest_neighbor_op_gpu.cu.cc b/tensorflow/core/kernels/resize_nearest_neighbor_op_gpu.cu.cc
index 1a3a64f482bac4d5ecb0aff3ad8d0b05fb8ab21c..d65c8fb949abe7227cbae9de36baeca4571b4ff4 100644
--- a/tensorflow/core/kernels/resize_nearest_neighbor_op_gpu.cu.cc
+++ b/tensorflow/core/kernels/resize_nearest_neighbor_op_gpu.cu.cc
@@ -19,21 +19,25 @@ limitations under the License.
 
 #include <stdio.h>
 
-#include "tensorflow/core/kernels/resize_nearest_neighbor_op_gpu.h"
+#include "tensorflow/core/kernels/resize_nearest_neighbor_op.h"
 
 #include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/framework/tensor_types.h"
+#include "tensorflow/core/platform/types.h"
 #include "tensorflow/core/util/cuda_kernel_helper.h"
 
 namespace tensorflow {
+
+typedef Eigen::GpuDevice GPUDevice;
+
 namespace {
 
-template <typename T>
-__global__ void ResizeNearestNeighborNHWC(const int nthreads, const T* bottom_data,
-                                          const int in_height, const int in_width,
-                                          const int channels, const int out_height,
-                                          const int out_width, const float height_scale,
-                                          const float width_scale, T* top_data) {
+template <typename T, bool align_corners>
+__global__ void ResizeNearestNeighborNHWC(
+    const int nthreads, const T* bottom_data, const int in_height,
+    const int in_width, const int channels, const int out_height,
+    const int out_width, const float height_scale, const float width_scale,
+    T* top_data) {
   CUDA_1D_KERNEL_LOOP(index, nthreads) {
     int n = index;
     int c = n % channels;
@@ -44,20 +48,25 @@ __global__ void ResizeNearestNeighborNHWC(const int nthreads, const T* bottom_da
     n /= out_height;
 
     const T* bottom_data_n = bottom_data + n * channels * in_height * in_width;
-    const int in_x = min(static_cast<int>(floorf(out_x * width_scale)), in_width - 1);
-    const int in_y = min(static_cast<int>(floorf(out_y * height_scale)), in_height - 1);
+    const int in_y =
+        min((align_corners) ? static_cast<int>(roundf(out_y * height_scale))
+                            : static_cast<int>(floorf(out_y * height_scale)),
+            in_height - 1);
+    const int in_x =
+        min((align_corners) ? static_cast<int>(roundf(out_x * width_scale))
+                            : static_cast<int>(floorf(out_x * width_scale)),
+            in_width - 1);
     const int idx = (in_y * in_width + in_x) * channels + c;
     top_data[index] = ldg(bottom_data_n + idx);
   }
 }
 
-template <typename T>
+template <typename T, bool align_corners>
 __global__ void ResizeNearestNeighborBackwardNHWC(
-                                   const int nthreads, const T* top_diff,
-                                   const int in_height, const int in_width,
-                                   const int channels, const int out_height,
-                                   const int out_width, const float height_scale,
-                                   const float width_scale, T* bottom_diff) {
+    const int nthreads, const T* top_diff, const int in_height,
+    const int in_width, const int channels, const int out_height,
+    const int out_width, const float height_scale, const float width_scale,
+    T* bottom_diff) {
   CUDA_1D_KERNEL_LOOP(index, nthreads) {
     int n = index;
     int c = n % channels;
@@ -68,8 +77,14 @@ __global__ void ResizeNearestNeighborBackwardNHWC(
     n /= in_height;
 
     T* bottom_diff_n = bottom_diff + n * channels * out_height * out_width;
-    const int out_x = min(static_cast<int>(floorf(in_x * width_scale)), out_width - 1);
-    const int out_y = min(static_cast<int>(floorf(in_y * height_scale)), out_height - 1);
+    const int out_y =
+        min((align_corners) ? static_cast<int>(roundf(in_y * height_scale))
+                            : static_cast<int>(floorf(in_y * height_scale)),
+            out_height - 1);
+    const int out_x =
+        min((align_corners) ? static_cast<int>(roundf(in_x * width_scale))
+                            : static_cast<int>(floorf(in_x * width_scale)),
+            out_width - 1);
     const int idx = (out_y * out_width + out_x) * channels + c;
     CudaAtomicAdd(bottom_diff_n + idx, ldg(top_diff + index));
   }
@@ -77,69 +92,86 @@ __global__ void ResizeNearestNeighborBackwardNHWC(
 
 }  // namespace
 
-template <typename T>
-bool ResizeNearestNeighbor(const T* bottom_data, const int batch,
-                           const int in_height, const int in_width,
-                           const int channels, const int out_height,
-                           const int out_width,  const float height_scale,
-                           const float width_scale, T* top_data,
-                           const Eigen::GpuDevice& d) {
-  const int output_size = batch * channels * out_height * out_width;
-  CudaLaunchConfig config = GetCudaLaunchConfig(output_size, d);
-
-  ResizeNearestNeighborNHWC<T>
-      <<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
-      output_size, bottom_data, in_height, in_width, channels, out_height,
-      out_width, height_scale, width_scale, top_data);
-  return d.ok();
-}
+namespace functor {
+
+// Partial specialization of ResizeNearestNeighbor functor for a GPUDevice.
+template <typename T, bool align_corners>
+struct ResizeNearestNeighbor<GPUDevice, T, align_corners> {
+  bool operator()(const GPUDevice& d, typename TTypes<T, 4>::ConstTensor input,
+                  const float height_scale, const float width_scale,
+                  typename TTypes<T, 4>::Tensor output) {
+    const int batch_size = input.dimension(0);
+    const int64 in_height = input.dimension(1);
+    const int64 in_width = input.dimension(2);
+    const int channels = input.dimension(3);
+
+    const int64 out_height = output.dimension(1);
+    const int64 out_width = output.dimension(2);
+
+    const int output_size = batch_size * out_height * out_width * channels;
+    if (output_size == 0) return true;
+
+    CudaLaunchConfig config = GetCudaLaunchConfig(output_size, d);
+    ResizeNearestNeighborNHWC<T, align_corners>
+        <<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
+            output_size, input.data(), in_height, in_width, channels,
+            out_height, out_width, height_scale, width_scale, output.data());
+    return d.ok();
+  }
+};
 
-#define DECLARE_GPU_SPEC(T)                                                        \
-  template bool ResizeNearestNeighbor(const T* bottom_data, const int batch,       \
-                               const int in_height, const int in_width,            \
-                               const int channels, const int out_height,           \
-                               const int out_width,  const float height_scale,     \
-                               const float width_scale, T* top_data,               \
-                               const Eigen::GpuDevice& d);
+#define DECLARE_GPU_SPEC(T)                                   \
+  template struct ResizeNearestNeighbor<GPUDevice, T, false>; \
+  template struct ResizeNearestNeighbor<GPUDevice, T, true>;
 
 TF_CALL_GPU_NUMBER_TYPES(DECLARE_GPU_SPEC);
 
 #undef DECLARE_GPU_SPEC
 
-template <typename T>
-bool ResizeNearestNeighborBackward(const T* top_diff, const int batch,
-                                   const int in_height, const int in_width,
-                                   const int channels, const int out_height,
-                                   const int out_width,
-                                   const float height_scale,
-                                   const float width_scale, T* bottom_diff,
-                                   const Eigen::GpuDevice& d) {
-  const int output_size = batch * channels * out_height * out_width;
-  CudaLaunchConfig output_config = GetCudaLaunchConfig(output_size, d);
-  SetZero<<<output_config.block_count,
-            output_config.thread_per_block, 0, d.stream()>>>(output_size, bottom_diff);
-
-  const int input_size = batch * channels * in_height * in_width;
-  CudaLaunchConfig input_config = GetCudaLaunchConfig(input_size, d);
-  ResizeNearestNeighborBackwardNHWC<T><<<
-      input_config.block_count, input_config.thread_per_block, 0, d.stream()>>>(
-      input_config.virtual_thread_count, top_diff, in_height, in_width,
-      channels, out_height, out_width, height_scale, width_scale, bottom_diff);
-  return d.ok();
-}
+// Partial specialization of ResizeNearestNeighborGrad functor for a GPUDevice.
+template <typename T, bool align_corners>
+struct ResizeNearestNeighborGrad<GPUDevice, T, align_corners> {
+  bool operator()(const GPUDevice& d, typename TTypes<T, 4>::ConstTensor input,
+                  const float height_scale, const float width_scale,
+                  typename TTypes<T, 4>::Tensor output) {
+    const int batch_size = input.dimension(0);
+    const int64 in_height = input.dimension(1);
+    const int64 in_width = input.dimension(2);
+    const int channels = input.dimension(3);
+
+    const int64 out_height = output.dimension(1);
+    const int64 out_width = output.dimension(2);
+
+    const int output_size = batch_size * channels * out_height * out_width;
+
+    CudaLaunchConfig output_config = GetCudaLaunchConfig(output_size, d);
+    SetZero<<<output_config.block_count, output_config.thread_per_block, 0,
+              d.stream()>>>(output_size, output.data());
+    if (!d.ok()) return false;
+
+    const int input_size = batch_size * channels * in_height * in_width;
+    if (input_size == 0) return true;
+
+    CudaLaunchConfig input_config = GetCudaLaunchConfig(input_size, d);
+    ResizeNearestNeighborBackwardNHWC<T, align_corners>
+        <<<input_config.block_count, input_config.thread_per_block, 0,
+           d.stream()>>>(input_config.virtual_thread_count, input.data(),
+                         in_height, in_width, channels, out_height, out_width,
+                         height_scale, width_scale, output.data());
+    return d.ok();
+  }
+};
 
-#define DECLARE_GPU_SPEC(T)                                                           \
-  template bool ResizeNearestNeighborBackward(const T* top_diff, const int batch,     \
-                               const int in_height, const int in_width,               \
-                               const int channels, const int out_height,              \
-                               const int out_width, const float height_scale,         \
-                               const float width_scale, T* bottom_diff,               \
-                               const Eigen::GpuDevice& d);
+#define DECLARE_GPU_SPEC(T)                                       \
+  template struct ResizeNearestNeighborGrad<GPUDevice, T, false>; \
+  template struct ResizeNearestNeighborGrad<GPUDevice, T, true>;
 
-TF_CALL_GPU_NUMBER_TYPES_NO_HALF(DECLARE_GPU_SPEC);
+TF_CALL_GPU_NUMBER_TYPES(DECLARE_GPU_SPEC);
 
 #undef DECLARE_GPU_SPEC
 
-}  // end namespace tensorflow
+}  // namespace functor
+
+}  // namespace tensorflow
 
 #endif  // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/resize_nearest_neighbor_op_gpu.h b/tensorflow/core/kernels/resize_nearest_neighbor_op_gpu.h
deleted file mode 100644
index 0a8fd6e1665833837e1de0ce4245cc767b8c74c6..0000000000000000000000000000000000000000
--- a/tensorflow/core/kernels/resize_nearest_neighbor_op_gpu.h
+++ /dev/null
@@ -1,42 +0,0 @@
-/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#if !GOOGLE_CUDA
-#error This file must only be included when building with Cuda support
-#endif
-
-#ifndef TENSORFLOW_CORE_KERNELS_RESIZE_NEAREST_NEIGHBOR_OP_GPU_H_
-#define TENSORFLOW_CORE_KERNELS_RESIZE_NEAREST_NEIGHBOR_OP_GPU_H_
-
-#include "tensorflow/core/framework/tensor_types.h"
-#include "tensorflow/core/platform/types.h"
-
-namespace tensorflow {
-
-template <typename T>
-bool ResizeNearestNeighbor(const T* bottom_data, const int batch, const int in_height,
-                           const int in_width, const int channels, const int out_height,
-                           const int out_width, const float height_scale, const float width_scale,
-                           T* top_data, const Eigen::GpuDevice& d);
-
-template <typename T>
-bool ResizeNearestNeighborBackward(const T* top_diff, const int batch, const int in_height,
-                                   const int in_width, const int channels, const int out_height,
-                                   const int out_width, const float height_scale, const float width_scale,
-                                   T* bottom_diff, const Eigen::GpuDevice& d);
-
-}  // namespace tensorflow
-
-#endif  // TENSORFLOW_CORE_KERNELS_RESIZE_NEAREST_NEIGHBOR_OP_GPU_H_
diff --git a/tensorflow/core/kernels/resize_nearest_neighbor_op_test.cc b/tensorflow/core/kernels/resize_nearest_neighbor_op_test.cc
index 34ebff6c680af99a4070c6f11a92421f5cd69e6d..ecf54c697735cc4c31da081b5a216dcb3fc8d7bf 100644
--- a/tensorflow/core/kernels/resize_nearest_neighbor_op_test.cc
+++ b/tensorflow/core/kernels/resize_nearest_neighbor_op_test.cc
@@ -124,9 +124,9 @@ TEST_F(ResizeNearestNeighborOpAlignCornersTest,
 
   // clang-format off
   test::FillValues<float>(&expected,
-    {1, 1, 2,
-     1, 1, 2,
-     3, 3, 4});
+    {1, 2, 2,
+     3, 4, 4,
+     3, 4, 4});
 
   // clang-format on
   test::ExpectTensorEqual<float>(expected, *GetOutput(0));
@@ -235,9 +235,9 @@ TEST_F(ResizeNearestNeighborOpAlignCornersTest,
 
   // clang-format off
   test::FillValues<float>(&expected,
-    { 1,  2,  4,
-      5,  6,  8,
-     13, 14, 16});
+    { 1,  3,  4,
+      9, 11, 12,
+     13, 15, 16});
 
   // clang-format on
   test::ExpectTensorEqual<float>(expected, *GetOutput(0));
diff --git a/tensorflow/core/kernels/reverse_op.cc b/tensorflow/core/kernels/reverse_op.cc
index 24b3ba31b8aaa49d93fa7b8782a3bfd6a63331f7..6f7a0a4df511ede609a0291b1284c55c8bdd84f8 100644
--- a/tensorflow/core/kernels/reverse_op.cc
+++ b/tensorflow/core/kernels/reverse_op.cc
@@ -266,6 +266,7 @@ class ReverseV2Op : public OpKernel {
                               .HostMemory("axis"),           \
                           ReverseV2Op<CPUDevice, T>)
 TF_CALL_POD_TYPES(REGISTER_KERNELS);
+TF_CALL_string(REGISTER_KERNELS);
 #undef REGISTER_KERNELS
 
 #if GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/reverse_op_test.cc b/tensorflow/core/kernels/reverse_op_test.cc
index 19e25b887d7f9c63570f0086d328462c24f57480..c6193f378d21c513be94dc16e0e6b53ce3ac8483 100644
--- a/tensorflow/core/kernels/reverse_op_test.cc
+++ b/tensorflow/core/kernels/reverse_op_test.cc
@@ -120,7 +120,7 @@ static SessionOptions GetOptions(int intra_threads) {
 
 // Creates a Graph which "reduce"s a 3D float tensor of "num" elements
 // into a scalar.
-static Graph* Reverse(TensorShape shape, int reverse_axis) {
+static Graph* Reverse(const TensorShape& shape, int reverse_axis) {
   Graph* g = new Graph(OpRegistry::Global());
   Tensor data(DT_FLOAT, shape);
   data.flat<float>().setRandom();
diff --git a/tensorflow/core/kernels/save_restore_tensor.cc b/tensorflow/core/kernels/save_restore_tensor.cc
index 006ef988b567f780a08bc6fbedc9ad1eaba1c441..80d490174064a366212ffe5a48681a2c48f5f42e 100644
--- a/tensorflow/core/kernels/save_restore_tensor.cc
+++ b/tensorflow/core/kernels/save_restore_tensor.cc
@@ -15,6 +15,7 @@ limitations under the License.
 
 #include <unordered_map>
 
+#include <utility>
 #include <vector>
 #include "tensorflow/core/kernels/save_restore_tensor.h"
 
@@ -79,7 +80,7 @@ void SaveTensors(
   VLOG(1) << "About to save tensors to file " << filename_t.flat<string>()(0)
           << "...";
   checkpoint::TensorSliceWriter writer(filename_t.flat<string>()(0),
-                                       builder_func);
+                                       std::move(builder_func));
 
   Status s;
   auto tensor_names_flat = tensor_names_t.flat<string>();
diff --git a/tensorflow/core/kernels/segment_reduction_ops_test.cc b/tensorflow/core/kernels/segment_reduction_ops_test.cc
index 0a281835a4b9b76b91430efbf6303eaaec539897..bdf3c12ff92e4e60ec81fe5e6a2420f88559d952 100644
--- a/tensorflow/core/kernels/segment_reduction_ops_test.cc
+++ b/tensorflow/core/kernels/segment_reduction_ops_test.cc
@@ -40,8 +40,9 @@ limitations under the License.
 namespace tensorflow {
 
 template <typename Index>
-static void BM_SegmentReduction(int iters, string reduction, Index num_rows,
-                                Index num_cols, Index segment_size) {
+static void BM_SegmentReduction(int iters, const string& reduction,
+                                Index num_rows, Index num_cols,
+                                Index segment_size) {
   testing::StopTiming();
   std::unique_ptr<Device> device(
       DeviceFactory::NewDevice("CPU", {}, "/job:a/replica:0/task:0"));
diff --git a/tensorflow/core/kernels/shape_ops.cc b/tensorflow/core/kernels/shape_ops.cc
index 177a32464ba5d55f5a6536f11f8403a9a7c13fea..d78c6d26394bf2c6ac922b4bb58fde8340705333 100644
--- a/tensorflow/core/kernels/shape_ops.cc
+++ b/tensorflow/core/kernels/shape_ops.cc
@@ -82,6 +82,7 @@ REGISTER_KERNEL_BUILDER(Name("Shape")
                           ShapeOp<int64>);
 
 TF_CALL_NUMBER_TYPES_NO_INT32(REGISTER_GPU_KERNEL);
+TF_CALL_bool(REGISTER_GPU_KERNEL);
 #undef REGISTER_GPU_KERNEL
 
 // A special GPU kernel for int32.
@@ -131,6 +132,7 @@ REGISTER_KERNEL_BUILDER(Name("ShapeN")
                           ShapeNOp<int64>)
 
 TF_CALL_NUMBER_TYPES_NO_INT32(REGISTER_GPU_KERNEL);
+TF_CALL_bool(REGISTER_GPU_KERNEL);
 #undef REGISTER_GPU_KERNEL
 
 // A special GPU kernel for int32.
@@ -277,6 +279,7 @@ REGISTER_KERNEL_BUILDER(Name("Size")
                               .HostMemory("output"),             \
                           SizeOp<int64>);
 TF_CALL_NUMBER_TYPES_NO_INT32(REGISTER_GPU_KERNEL);
+TF_CALL_bool(REGISTER_GPU_KERNEL);
 #undef REGISTER_GPU_KERNEL
 
 // A special GPU kernel for int32.
@@ -351,6 +354,7 @@ REGISTER_KERNEL_BUILDER(Name("ExpandDims")
                               .HostMemory("dim"),            \
                           ExpandDimsOp);
 TF_CALL_NUMBER_TYPES_NO_INT32(REGISTER_GPU_KERNEL);
+TF_CALL_bool(REGISTER_GPU_KERNEL);
 #undef REGISTER_GPU_KERNEL
 
 REGISTER_KERNEL_BUILDER(Name("ExpandDims")
@@ -395,6 +399,7 @@ REGISTER_KERNEL_BUILDER(Name("Squeeze").Device(DEVICE_CPU), SqueezeOp);
       Name("Squeeze").Device(DEVICE_GPU).TypeConstraint<type>("T"), \
       SqueezeOp);
 TF_CALL_NUMBER_TYPES_NO_INT32(REGISTER_GPU_KERNEL);
+TF_CALL_bool(REGISTER_GPU_KERNEL);
 #undef REGISTER_GPU_KERNEL
 
 // A special GPU kernel for int32.
diff --git a/tensorflow/core/kernels/softplus_op.cc b/tensorflow/core/kernels/softplus_op.cc
index 5650435781ae109d2ddc463b532f25fb205e53d7..494a83ed14e83f5fb2506774f1cbabfaf226bbed 100644
--- a/tensorflow/core/kernels/softplus_op.cc
+++ b/tensorflow/core/kernels/softplus_op.cc
@@ -23,6 +23,7 @@ limitations under the License.
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/kernels/warn_about_ints.h"
 #include "tensorflow/core/lib/core/errors.h"
 
 namespace tensorflow {
@@ -33,7 +34,10 @@ typedef Eigen::GpuDevice GPUDevice;
 template <typename Device, typename T>
 class SoftplusOp : public UnaryElementWiseOp<T, SoftplusOp<Device, T>> {
  public:
-  using UnaryElementWiseOp<T, SoftplusOp<Device, T>>::UnaryElementWiseOp;
+  explicit SoftplusOp(OpKernelConstruction* context)
+      : UnaryElementWiseOp<T, SoftplusOp<Device, T>>(context) {
+    WarnAboutInts(context);
+  }
 
   void Operate(OpKernelContext* context, const Tensor& input, Tensor* output) {
     functor::Softplus<Device, T> functor;
@@ -46,7 +50,10 @@ template <typename Device, typename T>
 class SoftplusGradOp
     : public BinaryElementWiseOp<T, SoftplusGradOp<Device, T>> {
  public:
-  using BinaryElementWiseOp<T, SoftplusGradOp<Device, T>>::BinaryElementWiseOp;
+  explicit SoftplusGradOp(OpKernelConstruction* context)
+      : BinaryElementWiseOp<T, SoftplusGradOp<Device, T>>(context) {
+    WarnAboutInts(context);
+  }
 
   void OperateNoTemplate(OpKernelContext* context, const Tensor& g,
                          const Tensor& a, Tensor* output);
diff --git a/tensorflow/core/kernels/softsign_op.cc b/tensorflow/core/kernels/softsign_op.cc
index 33b9628b32188fc43b9f38fa50668aa5f57cd7bc..00ee649b17552da97229926392a4ed4223378711 100644
--- a/tensorflow/core/kernels/softsign_op.cc
+++ b/tensorflow/core/kernels/softsign_op.cc
@@ -23,6 +23,7 @@ limitations under the License.
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/kernels/warn_about_ints.h"
 #include "tensorflow/core/lib/core/errors.h"
 
 namespace tensorflow {
@@ -33,7 +34,10 @@ typedef Eigen::GpuDevice GPUDevice;
 template <typename Device, typename T>
 class SoftsignOp : public UnaryElementWiseOp<T, SoftsignOp<Device, T>> {
  public:
-  using UnaryElementWiseOp<T, SoftsignOp<Device, T>>::UnaryElementWiseOp;
+  explicit SoftsignOp(OpKernelConstruction* context)
+      : UnaryElementWiseOp<T, SoftsignOp<Device, T>>(context) {
+    WarnAboutInts(context);
+  }
 
   void Operate(OpKernelContext* context, const Tensor& input, Tensor* output) {
     functor::Softsign<Device, T> functor;
@@ -46,7 +50,10 @@ template <typename Device, typename T>
 class SoftsignGradOp
     : public BinaryElementWiseOp<T, SoftsignGradOp<Device, T>> {
  public:
-  using BinaryElementWiseOp<T, SoftsignGradOp<Device, T>>::BinaryElementWiseOp;
+  explicit SoftsignGradOp(OpKernelConstruction* context)
+      : BinaryElementWiseOp<T, SoftsignGradOp<Device, T>>(context) {
+    WarnAboutInts(context);
+  }
 
   void OperateNoTemplate(OpKernelContext* context, const Tensor& g,
                          const Tensor& a, Tensor* output);
diff --git a/tensorflow/core/kernels/sparse_cross_op.cc b/tensorflow/core/kernels/sparse_cross_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..2b4d5effdad925bf989a51968ee753f4413108cf
--- /dev/null
+++ b/tensorflow/core/kernels/sparse_cross_op.cc
@@ -0,0 +1,572 @@
+/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// Contains OP to generate sparse crosses.
+#include <assert.h>
+#include <limits>
+#include <string>
+#include <vector>
+
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+#include "tensorflow/core/framework/kernel_def_builder.h"
+#include "tensorflow/core/framework/op_def_builder.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/framework/tensor_shape.h"
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/lib/core/stringpiece.h"
+#include "tensorflow/core/platform/fingerprint.h"
+#include "tensorflow/core/util/work_sharder.h"
+
+namespace tensorflow {
+
+namespace {
+// An interface that represents a column with batches.
+template <typename InternalType>
+class ColumnInterface {
+ public:
+  // Returns the number of features in the specified batch.
+  virtual int64 FeatureCount(int64 batch) const = 0;
+
+  // Returns the fingerprint of nth feature from the specified batch.
+  InternalType Feature(int64 batch, int64 n) const {
+    InternalType not_used = InternalType();
+    return DoFeature(batch, n, not_used);
+  }
+
+  virtual InternalType DoFeature(int64 batch, int64 n,
+                                 InternalType not_used) const = 0;
+
+  virtual ~ColumnInterface() {}
+};
+
+// A column that is backed by a sparse tensor.
+template <typename InternalType>
+class SparseTensorColumn : public ColumnInterface<InternalType> {
+ public:
+  SparseTensorColumn(const Tensor& values, std::vector<int64> feature_counts,
+                     std::vector<int64> feature_start_indices)
+      : values_(values),
+        feature_counts_(std::move(feature_counts)),
+        feature_start_indices_(std::move(feature_start_indices)) {
+    CHECK_EQ(feature_counts_.size(), feature_start_indices_.size());
+  }
+
+  int64 FeatureCount(int64 batch) const override {
+    return feature_counts_[batch];
+  }
+
+  // InternalType is int64 only when using HashCrosser.
+  int64 DoFeature(int64 batch, int64 n, int64 not_used) const {
+    const int64 start = feature_start_indices_[batch];
+    if (DT_STRING == values_.dtype())
+      return Fingerprint64(values_.vec<string>().data()[start + n]);
+    return values_.vec<int64>().data()[start + n];
+  }
+
+  // InternalType is string or StringPiece when using StringCrosser.
+  string DoFeature(int64 batch, int64 n, string not_used) const {
+    const int64 start = feature_start_indices_[batch];
+    if (DT_STRING == values_.dtype())
+      return values_.vec<string>().data()[start + n];
+    return std::to_string(values_.vec<int64>().data()[start + n]);
+  }
+
+  StringPiece DoFeature(int64 batch, int64 n, StringPiece not_used) const {
+    const int64 start = feature_start_indices_[batch];
+    return values_.vec<string>().data()[start + n];
+  }
+
+  ~SparseTensorColumn() override {}
+
+ private:
+  const Tensor& values_;
+  std::vector<int64> feature_counts_;
+  std::vector<int64> feature_start_indices_;
+};
+
+// A column that is backed by a dense tensor.
+template <typename InternalType>
+class DenseTensorColumn : public ColumnInterface<InternalType> {
+ public:
+  explicit DenseTensorColumn(const Tensor& tensor) : tensor_(tensor) {}
+
+  int64 FeatureCount(int64 batch) const override { return tensor_.dim_size(1); }
+
+  // InternalType is int64 only when using HashCrosser.
+  int64 DoFeature(int64 batch, int64 n, int64 not_used) const {
+    if (DT_STRING == tensor_.dtype())
+      return Fingerprint64(tensor_.matrix<string>()(batch, n));
+    return tensor_.matrix<int64>()(batch, n);
+  }
+
+  // Internal type is string or StringPiece when using StringCrosser.
+  string DoFeature(int64 batch, int64 n, string not_used) const {
+    if (DT_STRING == tensor_.dtype()) return tensor_.matrix<string>()(batch, n);
+    return std::to_string(tensor_.matrix<int64>()(batch, n));
+  }
+
+  StringPiece DoFeature(int64 batch, int64 n, StringPiece not_used) const {
+    return tensor_.matrix<string>()(batch, n);
+  }
+
+  ~DenseTensorColumn() override {}
+
+ private:
+  const Tensor& tensor_;
+};
+
+// Updates Output tensors with sparse crosses.
+template <typename OutType>
+class OutputUpdater {
+ public:
+  OutputUpdater(const std::vector<int64>& output_start_indices,
+                Tensor* indices_out, Tensor* values_out)
+      : output_start_indices_(output_start_indices),
+        indices_out_(indices_out),
+        values_out_(values_out) {}
+
+  void Update(const int64 batch_index, const int64 cross_count,
+              const OutType& cross) const {
+    const int64 output_index = output_start_indices_[batch_index] + cross_count;
+
+    auto indices_matrix = indices_out_->matrix<int64>();
+    indices_matrix(output_index, 0) = batch_index;
+    indices_matrix(output_index, 1) = cross_count;
+
+    auto value_vec = values_out_->vec<OutType>();
+    value_vec(output_index) = cross;
+  }
+
+ private:
+  const std::vector<int64>& output_start_indices_;
+  Tensor* indices_out_;
+  Tensor* values_out_;
+};
+
+// Generates the sparse crosses as concatenation of strings.
+template <typename InternalType>
+class StringCrosser {
+ public:
+  StringCrosser(const std::vector<
+                    std::unique_ptr<ColumnInterface<InternalType>>>& columns,
+                const int64 num_buckets_unused, const uint64 hash_key_unused)
+      : columns_(columns) {}
+
+  string Generate(const int64 batch_index,
+                  const std::vector<int>& permutation) const {
+    static const auto k_feature_separator = "_X_";
+
+    gtl::InlinedVector<InternalType, 6> cross_vec(columns_.size());
+    for (int i = 0; i < permutation.size(); i++) {
+      cross_vec[i] = columns_[i]->Feature(batch_index, permutation[i]);
+    }
+    // TODO(zakaria): this will copy the string twice, might effect
+    // performance.
+    return str_util::Join(cross_vec, k_feature_separator);
+  }
+
+ private:
+  const std::vector<std::unique_ptr<ColumnInterface<InternalType>>>& columns_;
+};
+
+// Generates the sparse crosses as nested hash to avoid string manipulations.
+class HashCrosser {
+ public:
+  HashCrosser(
+      const std::vector<std::unique_ptr<ColumnInterface<int64>>>& columns,
+      const int64 num_buckets, const uint64 hash_key)
+      : columns_(columns), num_buckets_(num_buckets), hash_key_(hash_key) {}
+
+  int64 Generate(const int64 batch_index,
+                 const std::vector<int>& permutation) const {
+    // Do the fingerprint concatenation on uint64.
+    uint64 hashed_output = hash_key_;
+    for (size_t i = 0; i < permutation.size(); ++i) {
+      uint64 hash_i = columns_[i]->Feature(batch_index, permutation[i]);
+      hashed_output = FingerprintCat64(hashed_output, hash_i);
+    }
+    // The return value is int64 based on the number of buckets.
+    if (num_buckets_ > 0) {
+      return hashed_output % num_buckets_;
+    } else {
+      // To prevent negative output we take modulo to max int64.
+      return hashed_output % std::numeric_limits<int64>::max();
+    }
+  }
+
+ private:
+  const std::vector<std::unique_ptr<ColumnInterface<int64>>>& columns_;
+  const int64 num_buckets_;
+  const uint64 hash_key_;
+};
+
+// ProductIterator generates cartesian products based on indices.
+template <typename InternalType>
+class ProductIterator {
+ public:
+  explicit ProductIterator(
+      const std::vector<std::unique_ptr<ColumnInterface<InternalType>>>&
+          columns,
+      int64 batch_index)
+      : columns_(columns), batch_index_(batch_index) {
+    next_permutation_.resize(columns_.size(), 0);
+    // Sets has_next_ to false if any feature column has 0 features.
+    has_next_ = true;
+    for (int i = 0; i < columns_.size(); i++) {
+      if (columns_[i]->FeatureCount(batch_index_) == 0) {
+        has_next_ = false;
+        break;
+      }
+    }
+  }
+
+  std::vector<int> Next() {
+    std::vector<int> permutation(next_permutation_);
+
+    // Generates next permutation, if available.
+    bool carry = true;
+    for (int i = next_permutation_.size() - 1; i >= 0; i--) {
+      if (carry) {
+        next_permutation_[i] = next_permutation_[i] + 1;
+      }
+      if (next_permutation_[i] == columns_[i]->FeatureCount(batch_index_)) {
+        next_permutation_[i] = 0;
+      } else {
+        carry = false;
+        break;
+      }
+    }
+    has_next_ = !carry;
+    return permutation;
+  }
+
+  bool HasNext() { return has_next_; }
+
+ private:
+  bool has_next_;
+  const std::vector<std::unique_ptr<ColumnInterface<InternalType>>>& columns_;
+  const int64 batch_index_;
+  std::vector<int> next_permutation_;
+};
+
+template <bool HASHED_OUTPUT, typename InternalType>
+struct CrossTraits;
+
+template <typename InternalType>
+struct CrossTraits<false, InternalType> {
+  typedef StringCrosser<InternalType> Crosser;
+  typedef OutputUpdater<string> Updater;
+};
+
+template <>
+struct CrossTraits<true, int64> {
+  typedef HashCrosser Crosser;
+  typedef OutputUpdater<int64> Updater;
+};
+}  // namespace
+
+template <bool HASHED_OUTPUT, typename InternalType>
+class SparseCrossOp : public OpKernel {
+ public:
+  explicit SparseCrossOp(OpKernelConstruction* context)
+      : OpKernel(context) {
+    OP_REQUIRES_OK(context, context->GetAttr("num_buckets", &num_buckets_));
+    // Read signed_hash_key_ as int64 since uint64 attributes are not
+    // supported by REGISTER_OP.
+    int64 signed_hash_key_;
+    OP_REQUIRES_OK(context, context->GetAttr("hash_key", &signed_hash_key_));
+    hash_key_ = static_cast<uint64>(signed_hash_key_);
+  }
+
+  void Compute(OpKernelContext* context) override {
+    OpInputList indices_list_in;
+    OP_REQUIRES_OK(context, context->input_list("indices", &indices_list_in));
+    OpInputList values_list_in;
+    OP_REQUIRES_OK(context, context->input_list("values", &values_list_in));
+    OpInputList shapes_list_in;
+    OP_REQUIRES_OK(context, context->input_list("shapes", &shapes_list_in));
+    OpInputList dense_list_in;
+    OP_REQUIRES_OK(context,
+                   context->input_list("dense_inputs", &dense_list_in));
+
+    ValidateInput(context, indices_list_in, values_list_in, shapes_list_in,
+                  dense_list_in);
+
+    std::vector<std::unique_ptr<ColumnInterface<InternalType>>> columns =
+        GenerateColumnsFromInput(indices_list_in, values_list_in,
+                                 shapes_list_in, dense_list_in);
+
+    typename CrossTraits<HASHED_OUTPUT, InternalType>::Crosser
+        crosser(columns, num_buckets_, hash_key_);
+    Tensor* indices_out;
+    Tensor* values_out;
+    Tensor* shape_out;
+    const int64 batch_size = CalculateBatchSize(shapes_list_in, dense_list_in);
+    std::vector<int64> output_start_indices(batch_size);
+    CreateOutputTensors(columns, batch_size, context, &indices_out, &values_out,
+                        &shape_out, &output_start_indices);
+
+    typename CrossTraits<HASHED_OUTPUT, InternalType>::Updater
+        updater(output_start_indices, indices_out, values_out);
+    auto do_work = [this, &columns, crosser, updater](int64 begin, int64 end) {
+      for (int b = begin; b < end; b++) {
+        ProductIterator<InternalType> product_iterator(columns, b);
+        int64 cross_count = 0;
+        while (product_iterator.HasNext()) {
+          const auto permutation = product_iterator.Next();
+          updater.Update(b, cross_count, crosser.Generate(b, permutation));
+          cross_count++;
+        }
+      }
+    };
+
+    auto* worker_threads = context->device()->tensorflow_cpu_worker_threads();
+    // TODO(zakaria): optimize kCostPerUnit
+    const int kCostPerUnit = 5000 * indices_list_in.size();
+    Shard(worker_threads->num_threads, worker_threads->workers, batch_size,
+          kCostPerUnit, do_work);
+  }
+
+ private:
+  // Validates input tensors.
+  void ValidateInput(OpKernelContext* context,
+                     const OpInputList& indices_list_in,
+                     const OpInputList& values_list_in,
+                     const OpInputList& shapes_list_in,
+                     const OpInputList& dense_list_in) {
+    const auto size = indices_list_in.size();
+    // Validates indices_list_in OpInputList.
+    for (int i = 0; i < size; i++) {
+      OP_REQUIRES(
+          context, TensorShapeUtils::IsMatrix(indices_list_in[i].shape()),
+          errors::InvalidArgument(
+              "Input indices should be a matrix but received shape ",
+              indices_list_in[i].shape().DebugString(), " at position ", i));
+      OP_REQUIRES(
+          context, indices_list_in[i].shape().dim_size(1) == 2,
+          errors::InvalidArgument("Expected D2 of index to be 2 got ",
+                                  indices_list_in[i].shape().dim_size(1),
+                                  " at position ", i));
+    }
+
+    // Validates values_list_in OpInputList.
+    OP_REQUIRES(
+        context, values_list_in.size() == size,
+        errors::InvalidArgument("Expected ", size, " input values, got ",
+                                values_list_in.size()));
+    for (int i = 0; i < size; i++) {
+      OP_REQUIRES(
+          context, TensorShapeUtils::IsVector(values_list_in[i].shape()),
+          errors::InvalidArgument(
+              "Input values should be a std::vector but received shape ",
+              values_list_in[i].shape().DebugString(), " at position ", i));
+      OP_REQUIRES(
+          context, indices_list_in[i].shape().dim_size(0) ==
+                       values_list_in[i].shape().dim_size(0),
+          errors::InvalidArgument(
+              "Expected size of values to be ",
+              indices_list_in[i].shape().dim_size(0), " got ",
+              values_list_in[i].shape().dim_size(0), " at position ", i));
+    }
+
+    // Validates shapes_list_in OpInputList
+    OP_REQUIRES(
+        context, shapes_list_in.size() == size,
+        errors::InvalidArgument("Expected ", size, " input shapes, got ",
+                                shapes_list_in.size()));
+    const auto batch_size = CalculateBatchSize(shapes_list_in, dense_list_in);
+    for (int i = 0; i < size; i++) {
+      OP_REQUIRES(
+          context, TensorShapeUtils::IsVector(shapes_list_in[i].shape()),
+          errors::InvalidArgument(
+              "Input shapes should be a std::vector but received shape ",
+              shapes_list_in[i].shape().DebugString(), " at position ", i));
+
+      OP_REQUIRES(
+          context, shapes_list_in[i].vec<int64>().size() == 2,
+          errors::InvalidArgument("shape should imply a 2D tensor, but got ",
+                                  shapes_list_in[i].shape().DebugString(),
+                                  " at position ", i));
+      OP_REQUIRES(context, shapes_list_in[i].vec<int64>()(0) == batch_size,
+                  errors::InvalidArgument(
+                      "Expected batch size ", batch_size, " got ",
+                      shapes_list_in[i].vec<int64>()(0), " at position ", i));
+    }
+
+    // Validates dense_list_in OpInputList
+    for (int i = 0; i < dense_list_in.size(); ++i) {
+      OP_REQUIRES(
+          context, TensorShapeUtils::IsMatrix(dense_list_in[i].shape()),
+          errors::InvalidArgument(
+              "Dense inputs should be a matrix but received shape ",
+              indices_list_in[i].shape().DebugString(), " at position ", i));
+      OP_REQUIRES(context, dense_list_in[i].dim_size(0) == batch_size,
+                  errors::InvalidArgument("Expected batch size ", batch_size,
+                                          " got ", dense_list_in[i].dim_size(0),
+                                          " at dense tensor ", i));
+    }
+  }
+
+  // Calculate the batch size from either the shapes input or the dense input.
+  int64 CalculateBatchSize(const OpInputList& shapes_list_in,
+                           const OpInputList& dense_list_in) {
+    if (shapes_list_in.size() > 0) {
+      return shapes_list_in[0].vec<int64>()(0);
+    }
+
+    if (dense_list_in.size() > 0) {
+      return dense_list_in[0].dim_size(0);
+    }
+
+    return 0;
+  }
+
+  // Generate the columns given the sparse and dense inputs.
+  std::vector<std::unique_ptr<ColumnInterface<InternalType>>>
+  GenerateColumnsFromInput(const OpInputList& indices_list_in,
+                           const OpInputList& values_list_in,
+                           const OpInputList& shapes_list_in,
+                           const OpInputList& dense_list_in) {
+    std::vector<std::unique_ptr<ColumnInterface<InternalType>>> columns;
+    const int64 batch_size = CalculateBatchSize(shapes_list_in, dense_list_in);
+    const int64 number_of_columns = shapes_list_in.size();
+
+    std::vector<std::vector<int64>> feature_counts(number_of_columns,
+                                                   std::vector<int64>());
+    std::vector<std::vector<int64>> feature_start_indices(number_of_columns,
+                                                          std::vector<int64>());
+
+    ExtractFeatureData(indices_list_in, batch_size, &feature_counts,
+                       &feature_start_indices);
+
+    for (int i = 0; i < values_list_in.size(); ++i) {
+      columns.emplace_back(new SparseTensorColumn<InternalType>(
+          values_list_in[i], std::move(feature_counts[i]),
+          std::move(feature_start_indices[i])));
+    }
+    for (int i = 0; i < dense_list_in.size(); ++i) {
+      columns.emplace_back(
+          new DenseTensorColumn<InternalType>(dense_list_in[i]));
+    }
+
+    return columns;
+  }
+
+  // Extracts data about the features and populates feature data.
+  void ExtractFeatureData(
+      const OpInputList& indices_list_in, int64 batch_size,
+      std::vector<std::vector<int64>>* feature_counts,
+      std::vector<std::vector<int64>>* feature_start_indices) {
+    gtl::InlinedVector<int64, 8> current_row(indices_list_in.size(), 0);
+    for (int b = 0; b < batch_size; b++) {
+      for (int i = 0; i < indices_list_in.size(); i++) {
+        const auto indices = indices_list_in[i].matrix<int64>();
+        int64 feature_count = 0;
+        int64 start_index = current_row[i];
+        // Loops until we reach next batch index for current feature column.
+        while (current_row[i] < indices_list_in[i].dim_size(0) &&
+               indices(current_row[i], 0) == b) {
+          feature_count++;
+          current_row[i]++;
+        }
+        (*feature_counts)[i].push_back(feature_count);
+        (*feature_start_indices)[i].push_back(start_index);
+      }
+    }
+  }
+
+  // Allocates output tensors with proper size and sets the shape tensor of
+  // the output SparseTensor.
+  // It also output_start_indices which contains the start indices for each
+  // input in the output SparseTensor.
+  void CreateOutputTensors(
+      const std::vector<std::unique_ptr<ColumnInterface<InternalType>>>&
+          columns,
+      int64 batch_size, OpKernelContext* context, Tensor** indices_out,
+      Tensor** values_out, Tensor** shape_out,
+      std::vector<int64>* output_start_indices) {
+    // Calculates dimensions for output tensors.
+    int64 cross_count_total = 0;
+    int64 max_cross_count = 0;
+    for (int64 b = 0; b < batch_size; b++) {
+      // For each input, sets starting indices in output SparseTensor
+      (*output_start_indices)[b] = cross_count_total;
+      const auto cross_count = CrossCountByBatchIndex(columns, b);
+      max_cross_count = std::max(max_cross_count, cross_count);
+      cross_count_total += cross_count;
+    }
+
+    // Allocates tensors.
+    OP_REQUIRES_OK(context,
+                   context->allocate_output(
+                       0, TensorShape({cross_count_total, 2}), indices_out));
+    OP_REQUIRES_OK(context,
+                   context->allocate_output(1, TensorShape({cross_count_total}),
+                                            values_out));
+    OP_REQUIRES_OK(context,
+                   context->allocate_output(2, TensorShape({2}), shape_out));
+
+    // Sets shape.
+    auto shape_vec = (*shape_out)->vec<int64>();
+    shape_vec(0) = batch_size;
+    shape_vec(1) = max_cross_count;
+  }
+
+  // Returns number of crosses for a given batch_index
+  int64 CrossCountByBatchIndex(
+      const std::vector<std::unique_ptr<ColumnInterface<InternalType>>>&
+          columns,
+      int batch_index) {
+    int64 cross_count = 1;
+    for (int i = 0; i < columns.size(); i++) {
+      const auto feature_count = columns[i]->FeatureCount(batch_index);
+      // If one column is missing any feature, there won't be any cross.
+      if (feature_count == 0) {
+        return 0;
+      }
+      cross_count *= feature_count;
+    }
+    return cross_count;
+  }
+  int64 num_buckets_;
+  uint64 hash_key_;
+};
+
+REGISTER_KERNEL_BUILDER(Name("SparseCross")
+                            .Device(DEVICE_CPU)
+                            .TypeConstraint<string>("out_type")
+                            .TypeConstraint<string>("internal_type"),
+                        SparseCrossOp<false, StringPiece>);
+
+REGISTER_KERNEL_BUILDER(Name("SparseCross")
+                            .Device(DEVICE_CPU)
+                            .TypeConstraint<string>("out_type")
+                            .TypeConstraint<int64>("internal_type"),
+                        SparseCrossOp<false, string>);
+
+REGISTER_KERNEL_BUILDER(Name("SparseCross")
+                            .Device(DEVICE_CPU)
+                            .TypeConstraint<int64>("out_type")
+                            .TypeConstraint<string>("internal_type"),
+                        SparseCrossOp<true, int64>);
+
+REGISTER_KERNEL_BUILDER(Name("SparseCross")
+                            .Device(DEVICE_CPU)
+                            .TypeConstraint<int64>("out_type")
+                            .TypeConstraint<int64>("internal_type"),
+                        SparseCrossOp<true, int64>);
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/kernels/sparse_tensor_dense_add_op.cc b/tensorflow/core/kernels/sparse_tensor_dense_add_op.cc
index b5093d59fc0e0f3a3246e18023d48aaefdcf38b4..48f38872e253ee969fb7a923812e6a6f0c15ce6e 100644
--- a/tensorflow/core/kernels/sparse_tensor_dense_add_op.cc
+++ b/tensorflow/core/kernels/sparse_tensor_dense_add_op.cc
@@ -47,16 +47,26 @@ class SparseTensorDenseAddOp : public OpKernel {
                     "Input a_indices should be a matrix but received shape: ",
                     a_indices_t->shape().DebugString()));
     OP_REQUIRES(
-        ctx, TensorShapeUtils::IsVector(a_values_t->shape()) &&
-                 TensorShapeUtils::IsVector(a_shape_t->shape()),
+        ctx,
+        TensorShapeUtils::IsVector(a_values_t->shape()) &&
+            TensorShapeUtils::IsVector(a_shape_t->shape()),
         errors::InvalidArgument("Inputs a_values and a_shape should be vectors "
                                 "but received shapes: ",
                                 a_values_t->shape().DebugString(), " and ",
                                 a_shape_t->shape().DebugString()));
-    OP_REQUIRES(ctx, a_shape_t->NumElements() == b->dims(),
-                errors::InvalidArgument(
-                    "Two operands have different dimensions; received: ",
-                    a_shape_t->NumElements(), " and ", b->dims()));
+    OP_REQUIRES(
+        ctx, a_shape_t->NumElements() == b->dims(),
+        errors::InvalidArgument("Two operands have different ranks; received: ",
+                                a_shape_t->NumElements(), " and ", b->dims()));
+    const auto a_shape_flat = a_shape_t->flat<Index>();
+    for (int i = 0; i < b->dims(); ++i) {
+      OP_REQUIRES(
+          ctx, a_shape_flat(i) == b->dim_size(i),
+          errors::InvalidArgument(
+              "Dimension ", i,
+              " does not equal (no broadcasting is supported): sparse side ",
+              a_shape_flat(i), " vs dense side ", b->dim_size(i)));
+    }
 
     Tensor *out_t;
     OP_REQUIRES_OK(ctx, ctx->allocate_output(0, b->shape(), &out_t));
@@ -82,8 +92,9 @@ class SparseTensorDenseAddOp : public OpKernel {
       NDIMS_CASE(4);
       NDIMS_CASE(5);
       default:
-        OP_REQUIRES(ctx, false, errors::InvalidArgument(
-                                    "Only tensors with ranks between 1 and 5 "
+        OP_REQUIRES(
+            ctx, false,
+            errors::InvalidArgument("Only tensors with ranks between 1 and 5 "
                                     "are currently supported.  Tensor rank: ",
                                     ndims));
 #undef NDIMS_CASE
diff --git a/tensorflow/core/kernels/sparse_tensor_dense_matmul_op.cc b/tensorflow/core/kernels/sparse_tensor_dense_matmul_op.cc
index 30026f222a6a1dd3ab23f2d4bcfda5942fb57d5d..30c57ef287f4c645b198da6ebf6b8554dde4fd12 100644
--- a/tensorflow/core/kernels/sparse_tensor_dense_matmul_op.cc
+++ b/tensorflow/core/kernels/sparse_tensor_dense_matmul_op.cc
@@ -65,7 +65,8 @@ class SparseTensorDenseMatMulOp : public OpKernel {
     OP_REQUIRES(ctx, TensorShapeUtils::IsMatrix(a_indices->shape()),
                 errors::InvalidArgument("Tensor 'a_indices' is not a matrix"));
 
-    OP_REQUIRES(ctx, a_indices->shape().dim_size(0) == a_values->NumElements(),
+    const int64 nnz = a_indices->shape().dim_size(0);
+    OP_REQUIRES(ctx, nnz == a_values->NumElements(),
                 errors::InvalidArgument("Number of rows of a_indices does not "
                                         "match number of entries in a_values"));
 
@@ -89,8 +90,28 @@ class SparseTensorDenseMatMulOp : public OpKernel {
             inner_left, " vs. ", inner_right,
             ".  Did you forget a transpose?  "
             "Dimensions of A: [",
-            a_shape_t(0), ", ", a_shape_t(1), ").  Dimensions of B: ",
-            b->shape().DebugString()));
+            a_shape_t(0), ", ", a_shape_t(1),
+            ").  Dimensions of B: ", b->shape().DebugString()));
+
+    if (std::is_same<Device, GPUDevice>::value) {
+      // The GPU implementation is optimized to use 32 bit indexing, so
+      // give a friendly error to the programmer early on if they
+      // exceed.
+      const int int32max = std::numeric_limits<int>::max();
+      OP_REQUIRES(
+          ctx,
+          (FastBoundsCheck(inner_left, int32max) &&
+           FastBoundsCheck(inner_right, int32max) &&
+           FastBoundsCheck(outer_left, int32max) &&
+           FastBoundsCheck(outer_right, int32max) &&
+           FastBoundsCheck(b->NumElements(), int32max) &&
+           FastBoundsCheck(outer_left * outer_right, int32max) &&
+           FastBoundsCheck(a_values->NumElements(), int32max)),
+          errors::InvalidArgument("Cannot use GPU for > 2^31 entry inputs"));
+      OP_REQUIRES(ctx, FastBoundsCheck(nnz * outer_right, int32max),
+                  errors::InvalidArgument(
+                      "Cannot use GPU when output.shape[1] * nnz(a) > 2^31"));
+    }
 
     TensorShape out_shape({outer_left, outer_right});
     Tensor* out = nullptr;
@@ -111,41 +132,13 @@ class SparseTensorDenseMatMulOp : public OpKernel {
       return;
     }
 
-    Tensor scratch;
-
-    if (std::is_same<Device, GPUDevice>::value) {
-      // The GPU implementation is optimized to use 32 bit indexing, so
-      // give a friendly error to the programmer early on if they exceed.
-      OP_REQUIRES(
-          ctx,
-          FastBoundsCheck(inner_left, std::numeric_limits<int>::max()) &&
-              FastBoundsCheck(inner_right, std::numeric_limits<int>::max()) &&
-              FastBoundsCheck(outer_left, std::numeric_limits<int>::max()) &&
-              FastBoundsCheck(outer_right, std::numeric_limits<int>::max()) &&
-              FastBoundsCheck(b->NumElements(),
-                              std::numeric_limits<int>::max()) &&
-              FastBoundsCheck(out->NumElements(),
-                              std::numeric_limits<int>::max()) &&
-              FastBoundsCheck(a_values->NumElements(),
-                              std::numeric_limits<int>::max()),
-          errors::InvalidArgument("Cannot use GPU for > 2^31 entry inputs"));
-      const int nnz = static_cast<const int>(a_values->NumElements());
-      // Need nnz length vec scratch space on the GPU.
-      OP_REQUIRES_OK(ctx, ctx->allocate_temp(DataTypeToEnum<T>::value,
-                                             TensorShape({nnz}), &scratch));
-    } else {
-      // We don't need scratch space on the CPU.
-      OP_REQUIRES_OK(ctx, ctx->allocate_temp(DataTypeToEnum<T>::value,
-                                             TensorShape({0}), &scratch));
-    }
-
 #define MAYBE_ADJOINT(ADJ_A, ADJ_B)                                        \
   if (adjoint_a_ == ADJ_A && adjoint_b_ == ADJ_B) {                        \
     Status functor_status = functor::SparseTensorDenseMatMulFunctor<       \
         Device, T, Tindices, ADJ_A,                                        \
         ADJ_B>::Compute(ctx->eigen_device<Device>(), out->matrix<T>(),     \
                         a_indices->matrix<Tindices>(), a_values->vec<T>(), \
-                        b->matrix<T>(), scratch.vec<T>());                 \
+                        b->matrix<T>());                                   \
     OP_REQUIRES_OK(ctx, functor_status);                                   \
   }
 
@@ -189,10 +182,9 @@ namespace functor {
   Status SparseTensorDenseMatMulFunctor<                                  \
       GPUDevice, T, Tindices, ADJ_A,                                      \
       ADJ_B>::Compute(const GPUDevice& d, typename TTypes<T>::Matrix out, \
-                      typename TTypes<Tindices>::ConstMatrix a_indices,   \
+                      TTypes<Tindices>::ConstMatrix a_indices,            \
                       typename TTypes<T>::ConstVec a_values,              \
-                      typename TTypes<T>::ConstMatrix b,                  \
-                      typename TTypes<T>::Vec scratch);                   \
+                      typename TTypes<T>::ConstMatrix b);                 \
   extern template struct SparseTensorDenseMatMulFunctor<                  \
       GPUDevice, T, Tindices, ADJ_A, ADJ_B>;
 
@@ -255,8 +247,7 @@ struct SparseTensorDenseMatMulFunctor<CPUDevice, T, Tindices, ADJ_A, ADJ_B> {
   static Status Compute(const CPUDevice& d, typename TTypes<T>::Matrix out,
                         typename TTypes<Tindices>::ConstMatrix a_indices,
                         typename TTypes<T>::ConstVec a_values,
-                        typename TTypes<T>::ConstMatrix b,
-                        typename TTypes<T>::Vec scratch) {
+                        typename TTypes<T>::ConstMatrix b) {
     const std::size_t nnz = a_values.size();
     const std::size_t rhs_right = (ADJ_B ? b.dimension(0) : b.dimension(1));
     const std::size_t lhs_right = (ADJ_B ? b.dimension(1) : b.dimension(0));
diff --git a/tensorflow/core/kernels/sparse_tensor_dense_matmul_op.h b/tensorflow/core/kernels/sparse_tensor_dense_matmul_op.h
index e707743f782c8a00afe2611d8078a3438d24b6af..da131904949763c4b3414f391b57d5d7eaa38bed 100644
--- a/tensorflow/core/kernels/sparse_tensor_dense_matmul_op.h
+++ b/tensorflow/core/kernels/sparse_tensor_dense_matmul_op.h
@@ -28,11 +28,10 @@ namespace functor {
 template <typename Device, typename T, typename Tindices, bool ADJ_A,
           bool ADJ_B>
 struct SparseTensorDenseMatMulFunctor {
-  static EIGEN_ALWAYS_INLINE Status
-  Compute(const Device& d, typename TTypes<T>::Matrix out,
-          typename TTypes<Tindices>::ConstMatrix a_indices,
-          typename TTypes<T>::ConstVec a_values,
-          typename TTypes<T>::ConstMatrix b, typename TTypes<T>::Vec scratch);
+  static EIGEN_ALWAYS_INLINE Status Compute(
+      const Device& d, typename TTypes<T>::Matrix out,
+      typename TTypes<Tindices>::ConstMatrix a_indices,
+      typename TTypes<T>::ConstVec a_values, typename TTypes<T>::ConstMatrix b);
 };
 
 template <typename MATRIX, bool ADJ>
diff --git a/tensorflow/core/kernels/sparse_tensor_dense_matmul_op_gpu.cu.cc b/tensorflow/core/kernels/sparse_tensor_dense_matmul_op_gpu.cu.cc
index 7266e0cf8123cb6ddd7dbcbeeae17c73fb54b477..e261e42e0d3bf43efc3a1328f07b1362f0870dfd 100644
--- a/tensorflow/core/kernels/sparse_tensor_dense_matmul_op_gpu.cu.cc
+++ b/tensorflow/core/kernels/sparse_tensor_dense_matmul_op_gpu.cu.cc
@@ -20,71 +20,45 @@ limitations under the License.
 #include "tensorflow/core/kernels/sparse_tensor_dense_matmul_op.h"
 
 #include "tensorflow/core/framework/register_types.h"
+#include "tensorflow/core/kernels/bounds_check.h"
+#include "tensorflow/core/util/cuda_kernel_helper.h"
 
 namespace tensorflow {
 
 typedef Eigen::GpuDevice GPUDevice;
 
-namespace generator {
-
 template <typename T, typename Tindices, bool ADJ_A, bool ADJ_B>
-class SparseTensorDenseMatMulGPUGenerator {
- public:
-  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE SparseTensorDenseMatMulGPUGenerator(
-      typename TTypes<T, 2>::Tensor32Bit out,
-      typename TTypes<const Tindices, 2>::Tensor32Bit a_indices,
-      typename TTypes<const T, 1>::Tensor32Bit a_values,
-      typename TTypes<const T, 2>::Tensor32Bit b)
-      : out_(out),
-        lhs_index_a_(ADJ_A ? 1 : 0),
-        rhs_index_a_(ADJ_A ? 0 : 1),
-        a_indices_(a_indices),
-        a_values_(a_values),
-        lhs_right_size(ADJ_B ? b.dimension(1) : b.dimension(0)),
-        maybe_adjoint_b_(
-            functor::MaybeAdjoint<typename TTypes<const T, 2>::Tensor32Bit,
-                                  ADJ_B>(b)) {}
-
-  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE T
-  operator()(const Eigen::array<int, 2>& j_and_ix) const {
-#ifdef __CUDA_ARCH__
-    const int j = j_and_ix[0];
-    const int ix = j_and_ix[1];
-    int m = a_indices_(ix, lhs_index_a_);
-    int k = a_indices_(ix, rhs_index_a_);
-    assert(k < lhs_right_size);
-    assert(m < out_.dimension(0));
-    // If asserts are disabled, the caller is violating the sparse
-    // tensor index contract, and so we return invalid results.
-    // Force returning NaNs to try to signal that something is amiss.
-    T b_value;
-    if (k >= lhs_right_size || m >= out_.dimension(0)) {
-      m = 0;
-      k = 0;
-      b_value = std::numeric_limits<T>::quiet_NaN();
-    } else {
-      b_value = maybe_adjoint_b_(k, j);
+__global__ void SparseTensorDenseMatMulKernel(int nnz, int m, int b_rows,
+                                              int b_cols, int p,
+                                              const Tindices* a_indices,
+                                              const T* a_values, const T* b,
+                                              T* out) {
+  // out_{ij} = sum_k {a_ik b_kj}
+  // out = A * B', out_{ij} = sum_k {a_ik (b')_kj}; b'_{kj} = b_{jk}
+  const int n = (ADJ_B) ? b_cols : b_rows;
+  CUDA_1D_KERNEL_LOOP(index, nnz * p) {
+    const int a_ix = index / p;
+    const int j = index % p;
+    const int i = ldg(a_indices + 2 * a_ix + ((ADJ_A) ? 1 : 0));
+    const int k = ldg(a_indices + 2 * a_ix + ((ADJ_A) ? 0 : 1));
+    if (!FastBoundsCheck(i, m)) {
+      continue;  // Nowhere to signal an error :(
+    }
+    // out[i, j]
+    T* out_location = out + i * p + j;
+    if (!FastBoundsCheck(k, n)) {
+      CudaAtomicAdd(out_location, std::numeric_limits<T>::quiet_NaN());
+      continue;
     }
-    atomicAdd(&out_(m, j), a_values_(ix) * b_value);
-#else
-    assert(false && "This should only be run on the device");
-#endif
-    // Return something
-    return T(0);
-  }
 
- private:
-  mutable typename TTypes<T, 2>::Tensor32Bit out_;
-  const int lhs_index_a_;
-  const int rhs_index_a_;
-  typename TTypes<const Tindices, 2>::Tensor32Bit a_indices_;
-  typename TTypes<const T, 1>::Tensor32Bit a_values_;
-  const int lhs_right_size;
-  functor::MaybeAdjoint<typename TTypes<const T, 2>::Tensor32Bit, ADJ_B>
-      maybe_adjoint_b_;
-};
+    // a_value == (ADJ_A) ? a[k, i] : a[i, k]
+    const T a_value = ldg(a_values + a_ix);
 
-}  // namespace generator
+    // b_value == (ADJ_B) ? b[j, k] : b[k, j]
+    const T b_value = ldg(b + ((ADJ_B) ? j * b_cols + k : k * b_cols + j));
+    CudaAtomicAdd(out_location, a_value * b_value);
+  }
+}
 
 namespace functor {
 
@@ -94,51 +68,23 @@ struct SparseTensorDenseMatMulFunctor<GPUDevice, T, Tindices, ADJ_A, ADJ_B> {
   Compute(const GPUDevice& d, typename TTypes<T>::Matrix out,
           typename TTypes<Tindices>::ConstMatrix a_indices,
           typename TTypes<T>::ConstVec a_values,
-          typename TTypes<T>::ConstMatrix b, typename TTypes<T>::Vec scratch) {
-    generator::SparseTensorDenseMatMulGPUGenerator<T, Tindices, ADJ_A, ADJ_B>
-        sparse_tensor_dense_matmul_generator(To32Bit(out), To32Bit(a_indices),
-                                             To32Bit(a_values), To32Bit(b));
-    To32Bit(out).device(d) = To32Bit(out).constant(T(0));
+          typename TTypes<T>::ConstMatrix b) {
+    out.device(d) = out.constant(T(0));
     int nnz = a_values.size();
-    int n = (ADJ_B) ? b.dimension(0) : b.dimension(1);
-
-#if !defined(EIGEN_HAS_INDEX_LIST)
-    Eigen::Tensor<int, 2>::Dimensions matrix_1_by_nnz{{ 1, nnz }};
-    Eigen::array<int, 2> n_by_1{{ n, 1 }};
-    Eigen::array<int, 1> reduce_on_rows{{ 0 }};
-#else
-    Eigen::IndexList<Eigen::type2index<1>, int> matrix_1_by_nnz;
-    matrix_1_by_nnz.set(1, nnz);
-    Eigen::IndexList<int, Eigen::type2index<1> > n_by_1;
-    n_by_1.set(0, n);
-    Eigen::IndexList<Eigen::type2index<0> > reduce_on_rows;
-#endif
-
-    // How this works: the generator iterates over (j, ix) where j
-    // iterates from 0 .. n - 1 and ix iterates from
-    // 0 .. nnz - 1.  A side effect of the generator is to accumulate
-    // the products of values in A and B into the appropriate location
-    // in the dense matrix out.  In order to run the iteration,
-    // we take a smaller variable and broadcast to a size (n, nnz).
-    // This is the scratch variable.  In order to enforce execution,
-    // we have to perform assignment back into scratch (taking the sum).
-    // We don't care what gets assigned to scratch - only the side effect
-    // of the execution in the generator.
-    //
-    // Note it's not sufficient that scratch be a scalar, and to
-    // broadcast it to a matrix.  Eigen splits the computation not
-    // based on the largest intermediate shape (the size of the
-    // broadcast of scratch) but based on the output shape.  So
-    // scratch needs to be a vector at least.
-    //
-    // Note also that only float type is supported because the
-    // atomicAdd operation is only supported for floats in hardware.
-    To32Bit(scratch).device(d) =
-        To32Bit(scratch)
-            .reshape(matrix_1_by_nnz)
-            .broadcast(n_by_1)
-            .generate(sparse_tensor_dense_matmul_generator)
-            .sum(reduce_on_rows);
+    // out = A * B, A is [m x n] and B is [n x p], out is [m x p]
+    int m = out.dimension(0);
+    int p = out.dimension(1);
+    int b_rows = b.dimension(0);
+    int b_cols = b.dimension(1);
+
+    // TODO(ebrevdo): Should this be alpha * nnz instead of
+    // out.size()?  Perhaps p * nnz ?
+    CudaLaunchConfig config = GetCudaLaunchConfig(p * nnz, d);
+
+    SparseTensorDenseMatMulKernel<T, Tindices, ADJ_A, ADJ_B>
+        <<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
+            nnz, m, b_rows, b_cols, p, a_indices.data(), a_values.data(),
+            b.data(), out.data());
 
     return Status::OK();
   }
diff --git a/tensorflow/core/kernels/split_v_op.cc b/tensorflow/core/kernels/split_v_op.cc
index 114b41ae4238c6524498a2e8c23d2a257e92f8d0..4dff1ea046b88734e3383fa2d20091b832b5800e 100644
--- a/tensorflow/core/kernels/split_v_op.cc
+++ b/tensorflow/core/kernels/split_v_op.cc
@@ -87,6 +87,12 @@ class SplitVOpBase : public OpKernel {
     // Special case 1: num_split == 1. Nothing to do.
     if (num_split == 1) {
       context->set_output(0, context->input(0));
+      OP_REQUIRES(
+          context, (*split_sizes_vec)[0] == input_size_split_dim,
+          errors::InvalidArgument("If there is only one output, it must have "
+                                  "the same size as the input. Input size: ",
+                                  input_size_split_dim,
+                                  " output size: ", (*split_sizes_vec)[0]));
       *done = true;
       return;
     }
diff --git a/tensorflow/core/kernels/strided_slice_op.cc b/tensorflow/core/kernels/strided_slice_op.cc
index 24884578ea8af63e27a999d67da32dadadcbd573..20a6adc493af2880d3f9430ca1c09f01c8a2b0d0 100644
--- a/tensorflow/core/kernels/strided_slice_op.cc
+++ b/tensorflow/core/kernels/strided_slice_op.cc
@@ -32,6 +32,7 @@ limitations under the License.
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/kernels/bounds_check.h"
 #include "tensorflow/core/kernels/ops_util.h"
+#include "tensorflow/core/kernels/variable_ops.h"
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/lib/gtl/array_slice.h"
 #include "tensorflow/core/platform/prefetch.h"
@@ -294,8 +295,16 @@ class StridedSliceAssignOp : public OpKernel {
     gtl::InlinedVector<int64, 4> end;
     gtl::InlinedVector<int64, 4> strides;
 
-    context->forward_ref_input_to_ref_output(0, 0);
-    Tensor old_lhs = context->mutable_input(0, true);
+    Tensor old_lhs;
+    if (context->input_dtype(0) == DT_RESOURCE) {
+      Var* v;
+      OP_REQUIRES_OK(context,
+                     LookupResource(context, HandleFromInput(context, 0), &v));
+      old_lhs = *v->tensor();
+    } else {
+      context->forward_ref_input_to_ref_output(0, 0);
+      old_lhs = context->mutable_input(0, true);
+    }
 
     ShapeReadWriteFromTensorShape wrapped_processing_shape(&processing_shape);
     ShapeReadWriteFromTensorShape wrapped_final_shape(&final_shape);
@@ -354,28 +363,35 @@ class StridedSliceAssignOp : public OpKernel {
   int32 ellipsis_mask, new_axis_mask, shrink_axis_mask;
 };
 
-#define REGISTER_STRIDED_SLICE(type)                           \
-  REGISTER_KERNEL_BUILDER(Name("StridedSlice")                 \
-                              .Device(DEVICE_CPU)              \
-                              .TypeConstraint<type>("T")       \
-                              .HostMemory("begin")             \
-                              .HostMemory("end")               \
-                              .HostMemory("strides"),          \
-                          StridedSliceOp<CPUDevice, type>)     \
-  REGISTER_KERNEL_BUILDER(Name("StridedSliceGrad")             \
-                              .Device(DEVICE_CPU)              \
-                              .TypeConstraint<type>("T")       \
-                              .HostMemory("shape")             \
-                              .HostMemory("begin")             \
-                              .HostMemory("end")               \
-                              .HostMemory("strides"),          \
-                          StridedSliceGradOp<CPUDevice, type>) \
-  REGISTER_KERNEL_BUILDER(Name("StridedSliceAssign")           \
-                              .Device(DEVICE_CPU)              \
-                              .TypeConstraint<type>("T")       \
-                              .HostMemory("begin")             \
-                              .HostMemory("end")               \
-                              .HostMemory("strides"),          \
+#define REGISTER_STRIDED_SLICE(type)                             \
+  REGISTER_KERNEL_BUILDER(Name("StridedSlice")                   \
+                              .Device(DEVICE_CPU)                \
+                              .TypeConstraint<type>("T")         \
+                              .HostMemory("begin")               \
+                              .HostMemory("end")                 \
+                              .HostMemory("strides"),            \
+                          StridedSliceOp<CPUDevice, type>)       \
+  REGISTER_KERNEL_BUILDER(Name("StridedSliceGrad")               \
+                              .Device(DEVICE_CPU)                \
+                              .TypeConstraint<type>("T")         \
+                              .HostMemory("shape")               \
+                              .HostMemory("begin")               \
+                              .HostMemory("end")                 \
+                              .HostMemory("strides"),            \
+                          StridedSliceGradOp<CPUDevice, type>)   \
+  REGISTER_KERNEL_BUILDER(Name("StridedSliceAssign")             \
+                              .Device(DEVICE_CPU)                \
+                              .TypeConstraint<type>("T")         \
+                              .HostMemory("begin")               \
+                              .HostMemory("end")                 \
+                              .HostMemory("strides"),            \
+                          StridedSliceAssignOp<CPUDevice, type>) \
+  REGISTER_KERNEL_BUILDER(Name("ResourceStridedSliceAssign")     \
+                              .Device(DEVICE_CPU)                \
+                              .TypeConstraint<type>("T")         \
+                              .HostMemory("begin")               \
+                              .HostMemory("end")                 \
+                              .HostMemory("strides"),            \
                           StridedSliceAssignOp<CPUDevice, type>)
 
 TF_CALL_ALL_TYPES(REGISTER_STRIDED_SLICE);
@@ -385,31 +401,39 @@ REGISTER_STRIDED_SLICE(bfloat16);
 
 #if GOOGLE_CUDA
 
-#define REGISTER_GPU(type)                                     \
-  REGISTER_KERNEL_BUILDER(Name("StridedSlice")                 \
-                              .Device(DEVICE_GPU)              \
-                              .TypeConstraint<type>("T")       \
-                              .HostMemory("begin")             \
-                              .HostMemory("end")               \
-                              .HostMemory("strides")           \
-                              .TypeConstraint<int32>("Index"), \
-                          StridedSliceOp<GPUDevice, type>)     \
-  REGISTER_KERNEL_BUILDER(Name("StridedSliceGrad")             \
-                              .Device(DEVICE_GPU)              \
-                              .TypeConstraint<type>("T")       \
-                              .HostMemory("shape")             \
-                              .HostMemory("begin")             \
-                              .HostMemory("end")               \
-                              .HostMemory("strides")           \
-                              .TypeConstraint<int32>("Index"), \
-                          StridedSliceGradOp<GPUDevice, type>) \
-  REGISTER_KERNEL_BUILDER(Name("StridedSliceAssign")           \
-                              .Device(DEVICE_GPU)              \
-                              .TypeConstraint<type>("T")       \
-                              .HostMemory("begin")             \
-                              .HostMemory("end")               \
-                              .HostMemory("strides")           \
-                              .TypeConstraint<int32>("Index"), \
+#define REGISTER_GPU(type)                                       \
+  REGISTER_KERNEL_BUILDER(Name("StridedSlice")                   \
+                              .Device(DEVICE_GPU)                \
+                              .TypeConstraint<type>("T")         \
+                              .HostMemory("begin")               \
+                              .HostMemory("end")                 \
+                              .HostMemory("strides")             \
+                              .TypeConstraint<int32>("Index"),   \
+                          StridedSliceOp<GPUDevice, type>)       \
+  REGISTER_KERNEL_BUILDER(Name("StridedSliceGrad")               \
+                              .Device(DEVICE_GPU)                \
+                              .TypeConstraint<type>("T")         \
+                              .HostMemory("shape")               \
+                              .HostMemory("begin")               \
+                              .HostMemory("end")                 \
+                              .HostMemory("strides")             \
+                              .TypeConstraint<int32>("Index"),   \
+                          StridedSliceGradOp<GPUDevice, type>)   \
+  REGISTER_KERNEL_BUILDER(Name("StridedSliceAssign")             \
+                              .Device(DEVICE_GPU)                \
+                              .TypeConstraint<type>("T")         \
+                              .HostMemory("begin")               \
+                              .HostMemory("end")                 \
+                              .HostMemory("strides")             \
+                              .TypeConstraint<int32>("Index"),   \
+                          StridedSliceAssignOp<GPUDevice, type>) \
+  REGISTER_KERNEL_BUILDER(Name("ResourceStridedSliceAssign")     \
+                              .Device(DEVICE_GPU)                \
+                              .TypeConstraint<type>("T")         \
+                              .HostMemory("begin")               \
+                              .HostMemory("end")                 \
+                              .HostMemory("strides")             \
+                              .TypeConstraint<int32>("Index"),   \
                           StridedSliceAssignOp<GPUDevice, type>)
 
 TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPU);
@@ -449,36 +473,53 @@ REGISTER_KERNEL_BUILDER(Name("StridedSliceAssign")
                             .HostMemory("end")
                             .HostMemory("strides"),
                         StridedSliceAssignOp<CPUDevice, int32>)
+REGISTER_KERNEL_BUILDER(Name("ResourceStridedSliceAssign")
+                            .Device(DEVICE_GPU)
+                            .TypeConstraint<int32>("T")
+                            .TypeConstraint<int32>("Index")
+                            .HostMemory("ref")
+                            .HostMemory("begin")
+                            .HostMemory("end")
+                            .HostMemory("strides"),
+                        StridedSliceAssignOp<CPUDevice, int32>)
 #undef REGISTER_GPU
 
 #endif  // GOOGLE_CUDA
 
 #ifdef TENSORFLOW_USE_SYCL
-#define REGISTER_SYCL(type)                                    \
-  REGISTER_KERNEL_BUILDER(Name("StridedSlice")                 \
-                              .Device(DEVICE_SYCL)             \
-                              .TypeConstraint<type>("T")       \
-                              .HostMemory("begin")             \
-                              .HostMemory("end")               \
-                              .HostMemory("strides")           \
-                              .TypeConstraint<int32>("Index"), \
-                          StridedSliceOp<SYCLDevice, type>)    \
-  REGISTER_KERNEL_BUILDER(Name("StridedSliceGrad")             \
-                              .Device(DEVICE_SYCL)             \
-                              .TypeConstraint<type>("T")       \
-                              .HostMemory("shape")             \
-                              .HostMemory("begin")             \
-                              .HostMemory("end")               \
-                              .HostMemory("strides")           \
-                              .TypeConstraint<int32>("Index"), \
-                          StridedSliceGradOp<SYCLDevice, type>)\
-  REGISTER_KERNEL_BUILDER(Name("StridedSliceAssign")           \
-                              .Device(DEVICE_SYCL)             \
-                              .TypeConstraint<type>("T")       \
-                              .HostMemory("begin")             \
-                              .HostMemory("end")               \
-                              .HostMemory("strides")           \
-                              .TypeConstraint<int32>("Index"), \
+#define REGISTER_SYCL(type)                                       \
+  REGISTER_KERNEL_BUILDER(Name("StridedSlice")                    \
+                              .Device(DEVICE_SYCL)                \
+                              .TypeConstraint<type>("T")          \
+                              .HostMemory("begin")                \
+                              .HostMemory("end")                  \
+                              .HostMemory("strides")              \
+                              .TypeConstraint<int32>("Index"),    \
+                          StridedSliceOp<SYCLDevice, type>)       \
+  REGISTER_KERNEL_BUILDER(Name("StridedSliceGrad")                \
+                              .Device(DEVICE_SYCL)                \
+                              .TypeConstraint<type>("T")          \
+                              .HostMemory("shape")                \
+                              .HostMemory("begin")                \
+                              .HostMemory("end")                  \
+                              .HostMemory("strides")              \
+                              .TypeConstraint<int32>("Index"),    \
+                          StridedSliceGradOp<SYCLDevice, type>)   \
+  REGISTER_KERNEL_BUILDER(Name("StridedSliceAssign")              \
+                              .Device(DEVICE_SYCL)                \
+                              .TypeConstraint<type>("T")          \
+                              .HostMemory("begin")                \
+                              .HostMemory("end")                  \
+                              .HostMemory("strides")              \
+                              .TypeConstraint<int32>("Index"),    \
+                          StridedSliceAssignOp<SYCLDevice, type>) \
+  REGISTER_KERNEL_BUILDER(Name("ResourceStridedSliceAssign")      \
+                              .Device(DEVICE_SYCL)                \
+                              .TypeConstraint<type>("T")          \
+                              .HostMemory("begin")                \
+                              .HostMemory("end")                  \
+                              .HostMemory("strides")              \
+                              .TypeConstraint<int32>("Index"),    \
                           StridedSliceAssignOp<SYCLDevice, type>)
 
 REGISTER_SYCL(float);
@@ -517,6 +558,15 @@ REGISTER_KERNEL_BUILDER(Name("StridedSliceAssign")
                             .HostMemory("end")
                             .HostMemory("strides"),
                         StridedSliceAssignOp<CPUDevice, int32>)
+REGISTER_KERNEL_BUILDER(Name("ResourceStridedSliceAssign")
+                            .Device(DEVICE_SYCL)
+                            .TypeConstraint<int32>("T")
+                            .TypeConstraint<int32>("Index")
+                            .HostMemory("ref")
+                            .HostMemory("begin")
+                            .HostMemory("end")
+                            .HostMemory("strides"),
+                        StridedSliceAssignOp<CPUDevice, int32>)
 #undef REGISTER_SYCL
 #endif // TENSORFLOW_USE_SYCL
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/training_op_helpers.cc b/tensorflow/core/kernels/training_op_helpers.cc
new file mode 100644
index 0000000000000000000000000000000000000000..11d51188fcc21d5ba23f1583bee452b6ed22babe
--- /dev/null
+++ b/tensorflow/core/kernels/training_op_helpers.cc
@@ -0,0 +1,96 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/kernels/training_op_helpers.h"
+#include "tensorflow/core/kernels/variable_ops.h"
+
+namespace tensorflow {
+
+mutex* GetTrainingVariableMutex(OpKernelContext* ctx, int input) {
+  if (ctx->input_dtype(input) == DT_RESOURCE) {
+    Var* var;
+    if (LookupResource(ctx, HandleFromInput(ctx, input), &var).ok()) {
+      return var->mu();
+    } else {
+      ctx->CtxFailureWithWarning(
+          errors::Internal("Invalid variable reference."));
+      return nullptr;
+    }
+  }
+  return ctx->input_ref_mutex(input);
+}
+
+// MaybeLockVariableInputMutexesInOrder is a helper function to acquire mutexes
+// in address order to mitigate deadlock.  Returns a vector of acquired mutexes.
+// Safe to pass duplicates - will only lock each distinct mutex once.  If
+// do_lock is false, returns immediately.  Note that this silently doesn't lock
+// mutexes for invalid variable references; in all usages this is followed by
+// GetInputTensor which will signal a failure.
+std::vector<mutex_lock> MaybeLockVariableInputMutexesInOrder(
+    OpKernelContext* ctx, bool do_lock, const std::vector<int>& input_ids) {
+  std::vector<mutex_lock> locks;
+  if (!do_lock) {
+    return locks;
+  }
+  std::vector<mutex*> mutexes;
+  std::vector<int> acquire_order;
+  for (auto input : input_ids) {
+    mutex* mutex = GetTrainingVariableMutex(ctx, input);
+    // Only lock each mutex once if duplicates exist (n^2 but n is 2 or 3).
+    if (std::find(mutexes.begin(), mutexes.end(), mutex) == mutexes.end()) {
+      acquire_order.push_back(mutexes.size());
+      mutexes.push_back(mutex);
+    }
+  }
+  std::sort(acquire_order.begin(), acquire_order.end(),
+            [&mutexes](int a, int b) { return mutexes[a] < mutexes[b]; });
+
+  for (auto input : acquire_order) {
+    mutex* mu = GetTrainingVariableMutex(ctx, input);
+    if (mu != nullptr) {
+      locks.emplace_back(*mu);
+    }
+  }
+  return locks;
+}
+
+Status GetInputTensorFromVariable(OpKernelContext* ctx, int input,
+                                  bool lock_held, Tensor* out) {
+  if (ctx->input_dtype(input) == DT_RESOURCE) {
+    Var* var;
+    if (LookupResource(ctx, HandleFromInput(ctx, input), &var).ok()) {
+      if (lock_held) {
+        *out = *var->tensor();
+      } else {
+        mutex_lock ml(*var->mu());
+        *out = *var->tensor();
+      }
+      return Status::OK();
+    } else {
+      return errors::Internal("Invalid variable reference.");
+    }
+  }
+  *out = ctx->mutable_input(input, lock_held);
+  return Status::OK();
+}
+
+void MaybeForwardRefInputToRefOutput(OpKernelContext* ctx, int input,
+                                     int output) {
+  if (ctx->input_dtype(input) != DT_RESOURCE) {
+    ctx->forward_ref_input_to_ref_output(input, output);
+  }
+}
+
+}  // end namespace tensorflow
diff --git a/tensorflow/core/kernels/training_op_helpers.h b/tensorflow/core/kernels/training_op_helpers.h
new file mode 100644
index 0000000000000000000000000000000000000000..f2577d452fa7b008bea04ea599ac269094dbfa00
--- /dev/null
+++ b/tensorflow/core/kernels/training_op_helpers.h
@@ -0,0 +1,36 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_KERNELS_TRAINING_OP_HELPERS_H_
+#define TENSORFLOW_KERNELS_TRAINING_OP_HELPERS_H_
+
+#include "tensorflow/core/framework/op_kernel.h"
+
+namespace tensorflow {
+
+mutex* GetTrainingVariableMutex(OpKernelContext* ctx, int input);
+
+std::vector<mutex_lock> MaybeLockVariableInputMutexesInOrder(
+    OpKernelContext* ctx, bool do_lock, const std::vector<int>& input_ids);
+
+Status GetInputTensorFromVariable(OpKernelContext* ctx, int input,
+                                  bool lock_held, Tensor* out);
+
+void MaybeForwardRefInputToRefOutput(OpKernelContext* ctx, int input,
+                                     int output);
+
+}  // end namespace tensorflow
+
+#endif  // TENSORFLOW_KERNELS_TRAINING_OP_HELPERS_H_
diff --git a/tensorflow/core/kernels/training_ops.cc b/tensorflow/core/kernels/training_ops.cc
index bcdd4d0b8a531dfc5119a50f6032c67b48f04c02..606d7fd57f5863b331ce94da6c71800dba7aae55 100644
--- a/tensorflow/core/kernels/training_ops.cc
+++ b/tensorflow/core/kernels/training_ops.cc
@@ -20,6 +20,7 @@ limitations under the License.
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/kernels/bounds_check.h"
+#include "tensorflow/core/kernels/training_op_helpers.h"
 #include "tensorflow/core/kernels/variable_ops.h"
 
 namespace tensorflow {
@@ -256,12 +257,22 @@ struct ApplyAdamNonCuda {
                   typename TTypes<T>::ConstScalar beta1,
                   typename TTypes<T>::ConstScalar beta2,
                   typename TTypes<T>::ConstScalar epsilon,
-                  typename TTypes<T>::ConstFlat grad) {
+                  typename TTypes<T>::ConstFlat grad,
+                  bool use_nesterov) {
     const T alpha = lr() * Eigen::numext::sqrt(T(1) - beta2_power()) /
                     (T(1) - beta1_power());
+    // beta1 == μ
+    // beta2 == ν
+    // v     == n
+    // var   == θ
+
     m.device(d) += (grad - m) * (T(1) - beta1());
     v.device(d) += (grad.square() - v) * (T(1) - beta2());
-    var.device(d) -= (m * alpha) / (v.sqrt() + epsilon());
+    if (use_nesterov) {
+      var.device(d) -= ((grad * (T(1) - beta1()) + beta1() * m) * alpha) / (v.sqrt() + epsilon());
+    } else {
+      var.device(d) -= (m * alpha) / (v.sqrt() + epsilon());
+    }
   }
 };
 
@@ -306,80 +317,6 @@ struct ApplyCenteredRMSProp<CPUDevice, T> {
 
 }  // namespace functor
 
-mutex* GetMutex(OpKernelContext* ctx, int input) {
-  if (ctx->input_dtype(input) == DT_RESOURCE) {
-    Var* var;
-    if (LookupResource(ctx, HandleFromInput(ctx, input), &var).ok()) {
-      return var->mu();
-    } else {
-      ctx->CtxFailureWithWarning(
-          errors::Internal("Invalid variable reference."));
-      return nullptr;
-    }
-  }
-  return ctx->input_ref_mutex(input);
-}
-
-// MaybeLockMutexesInOrder is a helper function to acquire mutexes in address
-// order to mitigate deadlock.  Returns a vector of acquired mutexes.  Safe to
-// pass duplicates - will only lock each distinct mutex once.  If do_lock is
-// false, returns immediately.  Note that this silently doesn't lock mutexes for
-// invalid variable references; in all usages this is followed by GetInputTensor
-// which will signal a failure.
-std::vector<mutex_lock> MaybeLockMutexesInOrder(
-    OpKernelContext* ctx, bool do_lock, const std::vector<int>& input_ids) {
-  std::vector<mutex_lock> locks;
-  if (!do_lock) {
-    return locks;
-  }
-  std::vector<mutex*> mutexes;
-  std::vector<int> acquire_order;
-  for (auto input : input_ids) {
-    mutex* mutex = GetMutex(ctx, input);
-    // Only lock each mutex once if duplicates exist (n^2 but n is 2 or 3).
-    if (std::find(mutexes.begin(), mutexes.end(), mutex) == mutexes.end()) {
-      acquire_order.push_back(input);
-      mutexes.push_back(mutex);
-    }
-  }
-  std::sort(acquire_order.begin(), acquire_order.end(),
-            [&mutexes](int a, int b) { return mutexes[a] < mutexes[b]; });
-
-  for (auto input : acquire_order) {
-    mutex* mu = GetMutex(ctx, input);
-    if (mu != nullptr) {
-      locks.emplace_back(*mu);
-    }
-  }
-  return locks;
-}
-
-Status GetInputTensor(OpKernelContext* ctx, int input, bool lock_held,
-                      Tensor* out) {
-  if (ctx->input_dtype(input) == DT_RESOURCE) {
-    Var* var;
-    if (LookupResource(ctx, HandleFromInput(ctx, input), &var).ok()) {
-      if (lock_held) {
-        *out = *var->tensor();
-      } else {
-        mutex_lock ml(*var->mu());
-        *out = *var->tensor();
-      }
-      return Status::OK();
-    } else {
-      return errors::Internal("Invalid variable reference.");
-    }
-  }
-  *out = ctx->mutable_input(input, lock_held);
-  return Status::OK();
-}
-
-void MaybeForwardRefInputToRefOutput(OpKernelContext* ctx, int input,
-                                     int output) {
-  if (ctx->input_dtype(input) != DT_RESOURCE) {
-    ctx->forward_ref_input_to_ref_output(input, output);
-  }
-}
 
 template <typename Device, typename T>
 class ApplyGradientDescentOp : public OpKernel {
@@ -389,9 +326,11 @@ class ApplyGradientDescentOp : public OpKernel {
   }
 
   void Compute(OpKernelContext* ctx) override {
-    auto locks = MaybeLockMutexesInOrder(ctx, use_exclusive_lock_, {0});
+    auto locks =
+        MaybeLockVariableInputMutexesInOrder(ctx, use_exclusive_lock_, {0});
     Tensor var;
-    OP_REQUIRES_OK(ctx, GetInputTensor(ctx, 0, use_exclusive_lock_, &var));
+    OP_REQUIRES_OK(
+        ctx, GetInputTensorFromVariable(ctx, 0, use_exclusive_lock_, &var));
 
     OP_REQUIRES(
         ctx, var.IsInitialized(),
@@ -538,7 +477,7 @@ class ApplyAdadeltaOp : public OpKernel {
 
   void Compute(OpKernelContext* ctx) override {
     if (use_exclusive_lock_) {
-      mutex_lock l1(*GetMutex(ctx, 0));
+      mutex_lock l1(*GetTrainingVariableMutex(ctx, 0));
       // Don't try to acquire a lock on the second ref as they share the same
       // mutex.
       //
@@ -559,12 +498,14 @@ class ApplyAdadeltaOp : public OpKernel {
 
   void DoValidate(OpKernelContext* ctx) {
     Tensor var;
-    OP_REQUIRES_OK(ctx, GetInputTensor(ctx, 0, use_exclusive_lock_, &var));
+    OP_REQUIRES_OK(
+        ctx, GetInputTensorFromVariable(ctx, 0, use_exclusive_lock_, &var));
     Tensor accum;
-    OP_REQUIRES_OK(ctx, GetInputTensor(ctx, 1, use_exclusive_lock_, &accum));
+    OP_REQUIRES_OK(
+        ctx, GetInputTensorFromVariable(ctx, 1, use_exclusive_lock_, &accum));
     Tensor accum_update;
-    OP_REQUIRES_OK(ctx,
-                   GetInputTensor(ctx, 2, use_exclusive_lock_, &accum_update));
+    OP_REQUIRES_OK(ctx, GetInputTensorFromVariable(ctx, 2, use_exclusive_lock_,
+                                                   &accum_update));
 
     OP_REQUIRES(
         ctx, var.IsInitialized(),
@@ -611,12 +552,14 @@ class ApplyAdadeltaOp : public OpKernel {
   void DoCompute(OpKernelContext* ctx) {
     const Device& device = ctx->template eigen_device<Device>();
     Tensor var;
-    OP_REQUIRES_OK(ctx, GetInputTensor(ctx, 0, use_exclusive_lock_, &var));
+    OP_REQUIRES_OK(
+        ctx, GetInputTensorFromVariable(ctx, 0, use_exclusive_lock_, &var));
     Tensor accum;
-    OP_REQUIRES_OK(ctx, GetInputTensor(ctx, 1, use_exclusive_lock_, &accum));
+    OP_REQUIRES_OK(
+        ctx, GetInputTensorFromVariable(ctx, 1, use_exclusive_lock_, &accum));
     Tensor accum_update;
-    OP_REQUIRES_OK(ctx,
-                   GetInputTensor(ctx, 2, use_exclusive_lock_, &accum_update));
+    OP_REQUIRES_OK(ctx, GetInputTensorFromVariable(ctx, 2, use_exclusive_lock_,
+                                                   &accum_update));
 
     const Tensor& lr = ctx->input(3);
     const Tensor& rho = ctx->input(4);
@@ -683,7 +626,7 @@ class SparseApplyAdadeltaOp : public OpKernel {
   }
 
   void Compute(OpKernelContext* ctx) override NO_THREAD_SAFETY_ANALYSIS {
-    mutex* mu_var = GetMutex(ctx, 0);
+    mutex* mu_var = GetTrainingVariableMutex(ctx, 0);
     // mu_accum is actually the same mutex as mu_var since currently we use a
     // global mutex.
     //
@@ -692,13 +635,14 @@ class SparseApplyAdadeltaOp : public OpKernel {
       mu_var->lock();
     }
     Tensor var;
-    OP_REQUIRES_OK(ctx, GetInputTensor(ctx, 0, use_exclusive_lock_, &var));
+    OP_REQUIRES_OK(
+        ctx, GetInputTensorFromVariable(ctx, 0, use_exclusive_lock_, &var));
     Tensor accum_grad;
-    OP_REQUIRES_OK(ctx,
-                   GetInputTensor(ctx, 1, use_exclusive_lock_, &accum_grad));
+    OP_REQUIRES_OK(ctx, GetInputTensorFromVariable(ctx, 1, use_exclusive_lock_,
+                                                   &accum_grad));
     Tensor accum_update;
-    OP_REQUIRES_OK(ctx,
-                   GetInputTensor(ctx, 2, use_exclusive_lock_, &accum_update));
+    OP_REQUIRES_OK(ctx, GetInputTensorFromVariable(ctx, 2, use_exclusive_lock_,
+                                                   &accum_update));
     OP_REQUIRES(
         ctx, var.IsInitialized(),
         errors::FailedPrecondition(
@@ -833,9 +777,11 @@ class ApplyProximalGradientDescentOp : public OpKernel {
   }
 
   void Compute(OpKernelContext* ctx) override {
-    auto locks = MaybeLockMutexesInOrder(ctx, use_exclusive_lock_, {0});
+    auto locks =
+        MaybeLockVariableInputMutexesInOrder(ctx, use_exclusive_lock_, {0});
     Tensor var;
-    OP_REQUIRES_OK(ctx, GetInputTensor(ctx, 0, use_exclusive_lock_, &var));
+    OP_REQUIRES_OK(
+        ctx, GetInputTensorFromVariable(ctx, 0, use_exclusive_lock_, &var));
 
     OP_REQUIRES(
         ctx, var.IsInitialized(),
@@ -900,9 +846,11 @@ class SparseApplyProximalGradientDescentOp : public OpKernel {
   }
 
   void Compute(OpKernelContext* ctx) override NO_THREAD_SAFETY_ANALYSIS {
-    auto locks = MaybeLockMutexesInOrder(ctx, use_exclusive_lock_, {0, 1});
+    auto locks =
+        MaybeLockVariableInputMutexesInOrder(ctx, use_exclusive_lock_, {0, 1});
     Tensor var;
-    OP_REQUIRES_OK(ctx, GetInputTensor(ctx, 0, use_exclusive_lock_, &var));
+    OP_REQUIRES_OK(
+        ctx, GetInputTensorFromVariable(ctx, 0, use_exclusive_lock_, &var));
     OP_REQUIRES(ctx, TensorShapeUtils::IsVectorOrHigher(var.shape()),
                 errors::InvalidArgument("var must be at least 1 dimensional"));
 
@@ -1042,11 +990,14 @@ class ApplyAdagradOp : public OpKernel {
   }
 
   void Compute(OpKernelContext* ctx) override {
-    auto locks = MaybeLockMutexesInOrder(ctx, use_exclusive_lock_, {0, 1});
+    auto locks =
+        MaybeLockVariableInputMutexesInOrder(ctx, use_exclusive_lock_, {0, 1});
     Tensor var;
-    OP_REQUIRES_OK(ctx, GetInputTensor(ctx, 0, use_exclusive_lock_, &var));
+    OP_REQUIRES_OK(
+        ctx, GetInputTensorFromVariable(ctx, 0, use_exclusive_lock_, &var));
     Tensor accum;
-    OP_REQUIRES_OK(ctx, GetInputTensor(ctx, 1, use_exclusive_lock_, &accum));
+    OP_REQUIRES_OK(
+        ctx, GetInputTensorFromVariable(ctx, 1, use_exclusive_lock_, &accum));
     OP_REQUIRES(
         ctx, var.IsInitialized(),
         errors::FailedPrecondition(
@@ -1132,11 +1083,14 @@ class ApplyProximalAdagradOp : public OpKernel {
   }
 
   void Compute(OpKernelContext* ctx) override {
-    auto locks = MaybeLockMutexesInOrder(ctx, use_exclusive_lock_, {0, 1});
+    auto locks =
+        MaybeLockVariableInputMutexesInOrder(ctx, use_exclusive_lock_, {0, 1});
     Tensor var;
-    OP_REQUIRES_OK(ctx, GetInputTensor(ctx, 0, use_exclusive_lock_, &var));
+    OP_REQUIRES_OK(
+        ctx, GetInputTensorFromVariable(ctx, 0, use_exclusive_lock_, &var));
     Tensor accum;
-    OP_REQUIRES_OK(ctx, GetInputTensor(ctx, 1, use_exclusive_lock_, &accum));
+    OP_REQUIRES_OK(
+        ctx, GetInputTensorFromVariable(ctx, 1, use_exclusive_lock_, &accum));
     OP_REQUIRES(
         ctx, var.IsInitialized(),
         errors::FailedPrecondition(
@@ -1236,11 +1190,14 @@ class SparseApplyAdagradOp : public OpKernel {
   }
 
   void Compute(OpKernelContext* ctx) override NO_THREAD_SAFETY_ANALYSIS {
-    auto locks = MaybeLockMutexesInOrder(ctx, use_exclusive_lock_, {0, 1});
+    auto locks =
+        MaybeLockVariableInputMutexesInOrder(ctx, use_exclusive_lock_, {0, 1});
     Tensor var;
-    OP_REQUIRES_OK(ctx, GetInputTensor(ctx, 0, use_exclusive_lock_, &var));
+    OP_REQUIRES_OK(
+        ctx, GetInputTensorFromVariable(ctx, 0, use_exclusive_lock_, &var));
     Tensor accum;
-    OP_REQUIRES_OK(ctx, GetInputTensor(ctx, 1, use_exclusive_lock_, &accum));
+    OP_REQUIRES_OK(
+        ctx, GetInputTensorFromVariable(ctx, 1, use_exclusive_lock_, &accum));
     OP_REQUIRES(
         ctx, var.IsInitialized(),
         errors::FailedPrecondition(
@@ -1367,11 +1324,14 @@ class SparseApplyProximalAdagradOp : public OpKernel {
   }
 
   void Compute(OpKernelContext* ctx) override NO_THREAD_SAFETY_ANALYSIS {
-    auto locks = MaybeLockMutexesInOrder(ctx, use_exclusive_lock_, {0, 1});
+    auto locks =
+        MaybeLockVariableInputMutexesInOrder(ctx, use_exclusive_lock_, {0, 1});
     Tensor var;
-    OP_REQUIRES_OK(ctx, GetInputTensor(ctx, 0, use_exclusive_lock_, &var));
+    OP_REQUIRES_OK(
+        ctx, GetInputTensorFromVariable(ctx, 0, use_exclusive_lock_, &var));
     Tensor accum;
-    OP_REQUIRES_OK(ctx, GetInputTensor(ctx, 1, use_exclusive_lock_, &accum));
+    OP_REQUIRES_OK(
+        ctx, GetInputTensorFromVariable(ctx, 1, use_exclusive_lock_, &accum));
     OP_REQUIRES(
         ctx, var.IsInitialized(),
         errors::FailedPrecondition(
@@ -1536,15 +1496,17 @@ class ApplyAdagradDAOp : public OpKernel {
   }
 
   void Compute(OpKernelContext* ctx) override {
-    auto locks = MaybeLockMutexesInOrder(ctx, use_exclusive_lock_, {0, 1});
+    auto locks =
+        MaybeLockVariableInputMutexesInOrder(ctx, use_exclusive_lock_, {0, 1});
     Tensor var;
-    OP_REQUIRES_OK(ctx, GetInputTensor(ctx, 0, use_exclusive_lock_, &var));
-    Tensor gradient_accum;
     OP_REQUIRES_OK(
-        ctx, GetInputTensor(ctx, 1, use_exclusive_lock_, &gradient_accum));
+        ctx, GetInputTensorFromVariable(ctx, 0, use_exclusive_lock_, &var));
+    Tensor gradient_accum;
+    OP_REQUIRES_OK(ctx, GetInputTensorFromVariable(ctx, 1, use_exclusive_lock_,
+                                                   &gradient_accum));
     Tensor gradient_squared_accum;
-    OP_REQUIRES_OK(ctx, GetInputTensor(ctx, 2, use_exclusive_lock_,
-                                       &gradient_squared_accum));
+    OP_REQUIRES_OK(ctx, GetInputTensorFromVariable(ctx, 2, use_exclusive_lock_,
+                                                   &gradient_squared_accum));
     OP_REQUIRES(
         ctx, var.IsInitialized(),
         errors::FailedPrecondition(
@@ -1636,15 +1598,17 @@ class SparseApplyAdagradDAOp : public OpKernel {
   }
 
   void Compute(OpKernelContext* ctx) override NO_THREAD_SAFETY_ANALYSIS {
-    auto locks = MaybeLockMutexesInOrder(ctx, use_exclusive_lock_, {0, 1});
+    auto locks =
+        MaybeLockVariableInputMutexesInOrder(ctx, use_exclusive_lock_, {0, 1});
     Tensor var;
-    OP_REQUIRES_OK(ctx, GetInputTensor(ctx, 0, use_exclusive_lock_, &var));
-    Tensor gradient_accum;
     OP_REQUIRES_OK(
-        ctx, GetInputTensor(ctx, 1, use_exclusive_lock_, &gradient_accum));
+        ctx, GetInputTensorFromVariable(ctx, 0, use_exclusive_lock_, &var));
+    Tensor gradient_accum;
+    OP_REQUIRES_OK(ctx, GetInputTensorFromVariable(ctx, 1, use_exclusive_lock_,
+                                                   &gradient_accum));
     Tensor gradient_squared_accum;
-    OP_REQUIRES_OK(ctx, GetInputTensor(ctx, 2, use_exclusive_lock_,
-                                       &gradient_squared_accum));
+    OP_REQUIRES_OK(ctx, GetInputTensorFromVariable(ctx, 2, use_exclusive_lock_,
+                                                   &gradient_squared_accum));
     OP_REQUIRES(
         ctx, var.IsInitialized(),
         errors::FailedPrecondition(
@@ -1830,14 +1794,18 @@ class ApplyFtrlOp : public OpKernel {
   }
 
   void Compute(OpKernelContext* ctx) override {
-    auto locks = MaybeLockMutexesInOrder(ctx, use_exclusive_lock_, {0, 1, 2});
+    auto locks = MaybeLockVariableInputMutexesInOrder(ctx, use_exclusive_lock_,
+                                                      {0, 1, 2});
 
     Tensor var;
-    OP_REQUIRES_OK(ctx, GetInputTensor(ctx, 0, use_exclusive_lock_, &var));
+    OP_REQUIRES_OK(
+        ctx, GetInputTensorFromVariable(ctx, 0, use_exclusive_lock_, &var));
     Tensor accum;
-    OP_REQUIRES_OK(ctx, GetInputTensor(ctx, 1, use_exclusive_lock_, &accum));
+    OP_REQUIRES_OK(
+        ctx, GetInputTensorFromVariable(ctx, 1, use_exclusive_lock_, &accum));
     Tensor linear;
-    OP_REQUIRES_OK(ctx, GetInputTensor(ctx, 2, use_exclusive_lock_, &linear));
+    OP_REQUIRES_OK(
+        ctx, GetInputTensorFromVariable(ctx, 2, use_exclusive_lock_, &linear));
     OP_REQUIRES(
         ctx, var.IsInitialized(),
         errors::FailedPrecondition(
@@ -1941,13 +1909,17 @@ class SparseApplyFtrlOp : public OpKernel {
   }
 
   void Compute(OpKernelContext* ctx) override NO_THREAD_SAFETY_ANALYSIS {
-    auto locks = MaybeLockMutexesInOrder(ctx, use_exclusive_lock_, {0, 1, 2});
+    auto locks = MaybeLockVariableInputMutexesInOrder(ctx, use_exclusive_lock_,
+                                                      {0, 1, 2});
     Tensor var;
-    OP_REQUIRES_OK(ctx, GetInputTensor(ctx, 0, use_exclusive_lock_, &var));
+    OP_REQUIRES_OK(
+        ctx, GetInputTensorFromVariable(ctx, 0, use_exclusive_lock_, &var));
     Tensor accum;
-    OP_REQUIRES_OK(ctx, GetInputTensor(ctx, 1, use_exclusive_lock_, &accum));
+    OP_REQUIRES_OK(
+        ctx, GetInputTensorFromVariable(ctx, 1, use_exclusive_lock_, &accum));
     Tensor linear;
-    OP_REQUIRES_OK(ctx, GetInputTensor(ctx, 2, use_exclusive_lock_, &linear));
+    OP_REQUIRES_OK(
+        ctx, GetInputTensorFromVariable(ctx, 2, use_exclusive_lock_, &linear));
     OP_REQUIRES(
         ctx, var.IsInitialized(),
         errors::FailedPrecondition(
@@ -2147,12 +2119,15 @@ class ApplyMomentumOp : public OpKernel {
   }
 
   void Compute(OpKernelContext* ctx) override {
-    auto locks = MaybeLockMutexesInOrder(ctx, use_exclusive_lock_, {0, 1});
+    auto locks =
+        MaybeLockVariableInputMutexesInOrder(ctx, use_exclusive_lock_, {0, 1});
 
     Tensor var;
-    OP_REQUIRES_OK(ctx, GetInputTensor(ctx, 0, use_exclusive_lock_, &var));
+    OP_REQUIRES_OK(
+        ctx, GetInputTensorFromVariable(ctx, 0, use_exclusive_lock_, &var));
     Tensor accum;
-    OP_REQUIRES_OK(ctx, GetInputTensor(ctx, 1, use_exclusive_lock_, &accum));
+    OP_REQUIRES_OK(
+        ctx, GetInputTensorFromVariable(ctx, 1, use_exclusive_lock_, &accum));
     OP_REQUIRES(
         ctx, var.IsInitialized(),
         errors::FailedPrecondition(
@@ -2247,12 +2222,15 @@ class SparseApplyMomentumOp : public OpKernel {
   }
 
   void Compute(OpKernelContext* ctx) override NO_THREAD_SAFETY_ANALYSIS {
-    auto locks = MaybeLockMutexesInOrder(ctx, use_exclusive_lock_, {0, 1});
+    auto locks =
+        MaybeLockVariableInputMutexesInOrder(ctx, use_exclusive_lock_, {0, 1});
 
     Tensor var;
-    OP_REQUIRES_OK(ctx, GetInputTensor(ctx, 0, use_exclusive_lock_, &var));
+    OP_REQUIRES_OK(
+        ctx, GetInputTensorFromVariable(ctx, 0, use_exclusive_lock_, &var));
     Tensor accum;
-    OP_REQUIRES_OK(ctx, GetInputTensor(ctx, 1, use_exclusive_lock_, &accum));
+    OP_REQUIRES_OK(
+        ctx, GetInputTensorFromVariable(ctx, 1, use_exclusive_lock_, &accum));
     OP_REQUIRES(
         ctx, var.IsInitialized(),
         errors::FailedPrecondition(
@@ -2357,17 +2335,22 @@ class ApplyAdamOp : public OpKernel {
  public:
   explicit ApplyAdamOp(OpKernelConstruction* ctx) : OpKernel(ctx) {
     OP_REQUIRES_OK(ctx, ctx->GetAttr("use_locking", &use_exclusive_lock_));
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("use_nesterov", &use_nesterov_));
   }
 
   void Compute(OpKernelContext* ctx) override {
-    auto locks = MaybeLockMutexesInOrder(ctx, use_exclusive_lock_, {0, 1, 2});
+    auto locks = MaybeLockVariableInputMutexesInOrder(ctx, use_exclusive_lock_,
+                                                      {0, 1, 2});
 
     Tensor var;
-    OP_REQUIRES_OK(ctx, GetInputTensor(ctx, 0, use_exclusive_lock_, &var));
+    OP_REQUIRES_OK(
+        ctx, GetInputTensorFromVariable(ctx, 0, use_exclusive_lock_, &var));
     Tensor m;
-    OP_REQUIRES_OK(ctx, GetInputTensor(ctx, 1, use_exclusive_lock_, &m));
+    OP_REQUIRES_OK(ctx,
+                   GetInputTensorFromVariable(ctx, 1, use_exclusive_lock_, &m));
     Tensor v;
-    OP_REQUIRES_OK(ctx, GetInputTensor(ctx, 2, use_exclusive_lock_, &v));
+    OP_REQUIRES_OK(ctx,
+                   GetInputTensorFromVariable(ctx, 2, use_exclusive_lock_, &v));
     OP_REQUIRES(
         ctx, var.IsInitialized(),
         errors::FailedPrecondition(
@@ -2427,13 +2410,15 @@ class ApplyAdamOp : public OpKernel {
                                     v.flat<T>(), beta1_power.scalar<T>(),
                                     beta2_power.scalar<T>(), lr.scalar<T>(),
                                     beta1.scalar<T>(), beta2.scalar<T>(),
-                                    epsilon.scalar<T>(), grad.flat<T>());
+                                    epsilon.scalar<T>(), grad.flat<T>(),
+                                    use_nesterov_);
 
     MaybeForwardRefInputToRefOutput(ctx, 0, 0);
   }
 
  private:
   bool use_exclusive_lock_;
+  bool use_nesterov_;
 };
 
 using CPUDevice = Eigen::ThreadPoolDevice;
@@ -2477,7 +2462,8 @@ namespace functor {
       typename TTypes<T>::ConstScalar beta1,                  \
       typename TTypes<T>::ConstScalar beta2,                  \
       typename TTypes<T>::ConstScalar epsilon,                \
-      typename TTypes<T>::ConstFlat grad);                    \
+      typename TTypes<T>::ConstFlat grad,                     \
+      bool use_nesterov);                                     \
   extern template struct ApplyAdam<GPUDevice, T>;
 DECLARE_GPU_SPEC(Eigen::half);
 DECLARE_GPU_SPEC(float);
@@ -2500,14 +2486,18 @@ class ApplyRMSPropOp : public OpKernel {
   }
 
   void Compute(OpKernelContext* ctx) override {
-    auto locks = MaybeLockMutexesInOrder(ctx, use_exclusive_lock_, {0, 1, 2});
+    auto locks = MaybeLockVariableInputMutexesInOrder(ctx, use_exclusive_lock_,
+                                                      {0, 1, 2});
 
     Tensor var;
-    OP_REQUIRES_OK(ctx, GetInputTensor(ctx, 0, use_exclusive_lock_, &var));
+    OP_REQUIRES_OK(
+        ctx, GetInputTensorFromVariable(ctx, 0, use_exclusive_lock_, &var));
     Tensor ms;
-    OP_REQUIRES_OK(ctx, GetInputTensor(ctx, 1, use_exclusive_lock_, &ms));
+    OP_REQUIRES_OK(
+        ctx, GetInputTensorFromVariable(ctx, 1, use_exclusive_lock_, &ms));
     Tensor mom;
-    OP_REQUIRES_OK(ctx, GetInputTensor(ctx, 2, use_exclusive_lock_, &mom));
+    OP_REQUIRES_OK(
+        ctx, GetInputTensorFromVariable(ctx, 2, use_exclusive_lock_, &mom));
 
     OP_REQUIRES(
         ctx, var.IsInitialized(),
@@ -2578,17 +2568,21 @@ class ApplyCenteredRMSPropOp : public OpKernel {
   }
 
   void Compute(OpKernelContext* ctx) override {
-    auto locks =
-        MaybeLockMutexesInOrder(ctx, use_exclusive_lock_, {0, 1, 2, 3});
+    auto locks = MaybeLockVariableInputMutexesInOrder(ctx, use_exclusive_lock_,
+                                                      {0, 1, 2, 3});
 
     Tensor var;
-    OP_REQUIRES_OK(ctx, GetInputTensor(ctx, 0, use_exclusive_lock_, &var));
+    OP_REQUIRES_OK(
+        ctx, GetInputTensorFromVariable(ctx, 0, use_exclusive_lock_, &var));
     Tensor mg;
-    OP_REQUIRES_OK(ctx, GetInputTensor(ctx, 1, use_exclusive_lock_, &mg));
+    OP_REQUIRES_OK(
+        ctx, GetInputTensorFromVariable(ctx, 1, use_exclusive_lock_, &mg));
     Tensor ms;
-    OP_REQUIRES_OK(ctx, GetInputTensor(ctx, 2, use_exclusive_lock_, &ms));
+    OP_REQUIRES_OK(
+        ctx, GetInputTensorFromVariable(ctx, 2, use_exclusive_lock_, &ms));
     Tensor mom;
-    OP_REQUIRES_OK(ctx, GetInputTensor(ctx, 3, use_exclusive_lock_, &mom));
+    OP_REQUIRES_OK(
+        ctx, GetInputTensorFromVariable(ctx, 3, use_exclusive_lock_, &mom));
 
     OP_REQUIRES(
         ctx, var.IsInitialized(),
@@ -2735,14 +2729,18 @@ class SparseApplyRMSPropOp : public OpKernel {
   }
 
   void Compute(OpKernelContext* ctx) override NO_THREAD_SAFETY_ANALYSIS {
-    auto locks = MaybeLockMutexesInOrder(ctx, use_exclusive_lock_, {0, 1, 2});
+    auto locks = MaybeLockVariableInputMutexesInOrder(ctx, use_exclusive_lock_,
+                                                      {0, 1, 2});
 
     Tensor var;
-    OP_REQUIRES_OK(ctx, GetInputTensor(ctx, 0, use_exclusive_lock_, &var));
+    OP_REQUIRES_OK(
+        ctx, GetInputTensorFromVariable(ctx, 0, use_exclusive_lock_, &var));
     Tensor ms;
-    OP_REQUIRES_OK(ctx, GetInputTensor(ctx, 1, use_exclusive_lock_, &ms));
+    OP_REQUIRES_OK(
+        ctx, GetInputTensorFromVariable(ctx, 1, use_exclusive_lock_, &ms));
     Tensor mom;
-    OP_REQUIRES_OK(ctx, GetInputTensor(ctx, 2, use_exclusive_lock_, &mom));
+    OP_REQUIRES_OK(
+        ctx, GetInputTensorFromVariable(ctx, 2, use_exclusive_lock_, &mom));
 
     OP_REQUIRES(
         ctx, var.IsInitialized(),
@@ -2860,17 +2858,21 @@ class SparseApplyCenteredRMSPropOp : public OpKernel {
   }
 
   void Compute(OpKernelContext* ctx) override NO_THREAD_SAFETY_ANALYSIS {
-    auto locks =
-        MaybeLockMutexesInOrder(ctx, use_exclusive_lock_, {0, 1, 2, 3});
+    auto locks = MaybeLockVariableInputMutexesInOrder(ctx, use_exclusive_lock_,
+                                                      {0, 1, 2, 3});
 
     Tensor var;
-    OP_REQUIRES_OK(ctx, GetInputTensor(ctx, 0, use_exclusive_lock_, &var));
+    OP_REQUIRES_OK(
+        ctx, GetInputTensorFromVariable(ctx, 0, use_exclusive_lock_, &var));
     Tensor mg;
-    OP_REQUIRES_OK(ctx, GetInputTensor(ctx, 1, use_exclusive_lock_, &mg));
+    OP_REQUIRES_OK(
+        ctx, GetInputTensorFromVariable(ctx, 1, use_exclusive_lock_, &mg));
     Tensor ms;
-    OP_REQUIRES_OK(ctx, GetInputTensor(ctx, 2, use_exclusive_lock_, &ms));
+    OP_REQUIRES_OK(
+        ctx, GetInputTensorFromVariable(ctx, 2, use_exclusive_lock_, &ms));
     Tensor mom;
-    OP_REQUIRES_OK(ctx, GetInputTensor(ctx, 3, use_exclusive_lock_, &mom));
+    OP_REQUIRES_OK(
+        ctx, GetInputTensorFromVariable(ctx, 3, use_exclusive_lock_, &mom));
 
     OP_REQUIRES(
         ctx, var.IsInitialized(),
diff --git a/tensorflow/core/kernels/training_ops.h b/tensorflow/core/kernels/training_ops.h
index b315463dc360c74e7c346d98ae0268cb4afc455c..0346016849a5478a9bcd457cfbcd808258a2ebee 100644
--- a/tensorflow/core/kernels/training_ops.h
+++ b/tensorflow/core/kernels/training_ops.h
@@ -132,7 +132,8 @@ struct ApplyAdam {
                   typename TTypes<T>::ConstScalar beta1,
                   typename TTypes<T>::ConstScalar beta2,
                   typename TTypes<T>::ConstScalar epsilon,
-                  typename TTypes<T>::ConstFlat grad);
+                  typename TTypes<T>::ConstFlat grad,
+                  bool use_nesterov);
 };
 
 template <typename Device, typename T>
@@ -157,7 +158,6 @@ struct ApplyCenteredRMSProp {
                   typename TTypes<T>::ConstScalar epsilon,
                   typename TTypes<T>::ConstFlat grad);
 };
-
 }  // end namespace functor
 }  // end namespace tensorflow
 
diff --git a/tensorflow/core/kernels/training_ops_gpu.cu.cc b/tensorflow/core/kernels/training_ops_gpu.cu.cc
index f6acdf2422c434432b66086b4fa50a9c8803fe94..c2563c3a49251716e6c82a0c72398232a2cfade0 100644
--- a/tensorflow/core/kernels/training_ops_gpu.cu.cc
+++ b/tensorflow/core/kernels/training_ops_gpu.cu.cc
@@ -109,7 +109,8 @@ struct ApplyAdam<GPUDevice, T> {
                   typename TTypes<T>::ConstScalar beta1,
                   typename TTypes<T>::ConstScalar beta2,
                   typename TTypes<T>::ConstScalar epsilon,
-                  typename TTypes<T>::ConstFlat grad) {
+                  typename TTypes<T>::ConstFlat grad,
+                  bool use_nesterov) {
     Eigen::array<typename TTypes<T>::Tensor::Index, 1> bcast;
     bcast[0] = grad.dimension(0);
     Eigen::Sizes<1> single;
@@ -122,11 +123,26 @@ struct ApplyAdam<GPUDevice, T> {
         v +
         (beta2.constant(one) - beta2).reshape(single).broadcast(bcast) *
             (grad.square() - v);
-    var.device(d) -= (lr * (beta2_power.constant(one) - beta2_power).sqrt() /
-                      (beta1_power.constant(one) - beta1_power))
-                         .reshape(single)
-                         .broadcast(bcast) *
-                     m / (epsilon.reshape(single).broadcast(bcast) + v.sqrt());
+
+    if (use_nesterov) {
+      var.device(d) -= (lr * (beta2_power.constant(one) - beta2_power).sqrt() /
+                        (beta1_power.constant(one) - beta1_power))
+                           .reshape(single)
+                           .broadcast(bcast) *
+                       (m * beta1.reshape(single).broadcast(bcast) +
+                        (beta1.constant(one) - beta1)
+                           .reshape(single)
+                           .broadcast(bcast) *
+                        grad) / (epsilon
+                           .reshape(single)
+                           .broadcast(bcast) + v.sqrt());
+    } else {
+      var.device(d) -= (lr * (beta2_power.constant(one) - beta2_power).sqrt() /
+                        (beta1_power.constant(one) - beta1_power))
+                           .reshape(single)
+                           .broadcast(bcast) *
+                       m / (epsilon.reshape(single).broadcast(bcast) + v.sqrt());
+    }
   }
 };
 
diff --git a/tensorflow/core/kernels/unique_op.cc b/tensorflow/core/kernels/unique_op.cc
index f5d4fcec84ce55a80ed3214fe797f451725932cd..d50e2060acfe64ea09980d0b27639cd6daeb421d 100644
--- a/tensorflow/core/kernels/unique_op.cc
+++ b/tensorflow/core/kernels/unique_op.cc
@@ -13,7 +13,6 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include <unordered_map>
 #include <utility>
 
 #include "tensorflow/core/framework/op_kernel.h"
@@ -21,6 +20,7 @@ limitations under the License.
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/tensor_shape.h"
 #include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/lib/gtl/flatmap.h"
 
 namespace tensorflow {
 
@@ -50,8 +50,7 @@ class UniqueOp : public OpKernel {
                                 {0}, 1, input.shape(), &idx));
     auto idx_vec = idx->template vec<int32>();
 
-    std::unordered_map<T, int32> uniq;
-    uniq.reserve(2 * N);
+    gtl::FlatMap<T, int32> uniq(N);
     for (int64 i = 0, j = 0; i < N; ++i) {
       auto it = uniq.insert(std::make_pair(Tin(i), j));
       idx_vec(i) = it.first->second;
diff --git a/tensorflow/core/kernels/variable_ops.h b/tensorflow/core/kernels/variable_ops.h
index 8c173a4ba30cdb1f1c50d2737f64a8c2a115811e..f0b5796d04a74c6fd39bca64cac2e603a19d46ba 100644
--- a/tensorflow/core/kernels/variable_ops.h
+++ b/tensorflow/core/kernels/variable_ops.h
@@ -76,6 +76,18 @@ class VariableOp : public OpKernel {
     // As long as the resource manager hasn't been cleared the ref we return
     // here is valid because it owns a ref on var.
     ctx->set_output_ref(0, var->mu(), var->tensor());
+    if (ctx->track_allocations() && var->tensor()->IsInitialized()) {
+      AllocatorAttributes attr;
+      attr.set_gpu_compatible(true);
+      attr.set_nic_compatible(true);
+      if (ctx->allocate_on_host(attr)) {
+        ctx->record_host_persistent_memory_allocation(
+            var->tensor()->AllocatedBytes());
+      } else {
+        ctx->record_device_persistent_memory_allocation(
+            var->tensor()->AllocatedBytes());
+      }
+    }
     var->Unref();
   }
 
@@ -115,6 +127,16 @@ class TemporaryVariableOp : public OpKernel {
     OP_REQUIRES_OK(context, rm->Create(context->step_container()->name(),
                                        var_name_, tmp_var));
     context->set_output_ref(0, &tmp_var->mu, &tmp_var->val);
+    if (context->track_allocations()) {
+      AllocatorAttributes attr;
+      if (context->allocate_on_host(attr)) {
+        context->record_host_persistent_memory_allocation(
+            tmp_var->val.AllocatedBytes());
+      } else {
+        context->record_device_persistent_memory_allocation(
+            tmp_var->val.AllocatedBytes());
+      }
+    }
   }
 
  private:
diff --git a/tensorflow/core/kernels/warn_about_ints.cc b/tensorflow/core/kernels/warn_about_ints.cc
new file mode 100644
index 0000000000000000000000000000000000000000..fd0a889c99df47454a5eff1acd646b070d3a4280
--- /dev/null
+++ b/tensorflow/core/kernels/warn_about_ints.cc
@@ -0,0 +1,32 @@
+/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/kernels/warn_about_ints.h"
+
+namespace tensorflow {
+
+void WarnAboutInts(OpKernelConstruction* context) {
+  DataType dtype;
+  OP_REQUIRES_OK(context, context->GetAttr("T", &dtype));
+  if (DataTypeIsInteger(dtype)) {
+    LOG(WARNING) << "Op " << context->def().name() << " of type "
+                 << context->def().op() << " used with integer dtype "
+                 << DataTypeString(dtype)
+                 << ".  This op was registered with integer support "
+                 << "accidentally, and you won't like the result.";
+  }
+}
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/kernels/warn_about_ints.h b/tensorflow/core/kernels/warn_about_ints.h
new file mode 100644
index 0000000000000000000000000000000000000000..20666b230ece61074af576a6f654a658c593a2a8
--- /dev/null
+++ b/tensorflow/core/kernels/warn_about_ints.h
@@ -0,0 +1,29 @@
+/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_KERNELS_WARN_ABOUT_INTS_H_
+#define TENSORFLOW_KERNELS_WARN_ABOUT_INTS_H_
+
+#include "tensorflow/core/framework/op_kernel.h"
+
+namespace tensorflow {
+
+// Warn if a kernel is being created using ints
+// TODO(irving): Remove in TF 2.0 along with the bad op registrations.
+void WarnAboutInts(OpKernelConstruction* context);
+
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_KERNELS_WARN_ABOUT_INTS_H_
diff --git a/tensorflow/core/lib/core/threadpool.cc b/tensorflow/core/lib/core/threadpool.cc
index 2b10ebeaf7cbed4a8466a69898d6d4d6660ed5cb..c8e514df800550abc07ef8394893d8da09e7ed3d 100644
--- a/tensorflow/core/lib/core/threadpool.cc
+++ b/tensorflow/core/lib/core/threadpool.cc
@@ -47,7 +47,7 @@ struct EigenEnvironment {
                    const string& name)
       : env_(env), thread_options_(thread_options), name_(name) {}
 
-  EnvThread* CreateThread(std::function<void()> f) {
+  EnvThread* CreateThread(const std::function<void()>& f) {
     return env_->StartThread(thread_options_, name_, [=]() {
       // Set the processor flag to flush denormals to zero.
       port::ScopedFlushDenormal flush;
diff --git a/tensorflow/core/lib/gif/gif_io.cc b/tensorflow/core/lib/gif/gif_io.cc
index df8887b1c479e6bb70e09826f554e26e5994a2ed..db230b1e56fe4fea491ebbc11560198e8a5aa082 100644
--- a/tensorflow/core/lib/gif/gif_io.cc
+++ b/tensorflow/core/lib/gif/gif_io.cc
@@ -16,6 +16,7 @@ limitations under the License.
 // Functions to read images in GIF format.
 
 #include "tensorflow/core/lib/gif/gif_io.h"
+#include "tensorflow/core/lib/gtl/cleanup.h"
 #include "tensorflow/core/platform/gif.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/mem.h"
@@ -26,7 +27,7 @@ namespace gif {
 
 int input_callback(GifFileType* gif_file, GifByteType* buf, int size) {
   if (gif_file->UserData && memcpy(buf, gif_file->UserData, size)) {
-    gif_file->UserData = ((uint8_t*)gif_file->UserData) + size;
+    gif_file->UserData = reinterpret_cast<uint8_t*>(gif_file->UserData) + size;
     return size;
   }
   return 0;
@@ -37,6 +38,13 @@ uint8* Decode(const void* srcdata, int datasize,
   int error_code = D_GIF_SUCCEEDED;
   GifFileType* gif_file =
       DGifOpen(const_cast<void*>(srcdata), &input_callback, &error_code);
+  const auto cleanup = gtl::MakeCleanup([gif_file]() {
+    int error_code = D_GIF_SUCCEEDED;
+    if (gif_file && DGifCloseFile(gif_file, &error_code) != GIF_OK) {
+      LOG(WARNING) << "Fail to close gif file, reason: "
+                   << GifErrorString(error_code);
+    }
+  });
   if (error_code != D_GIF_SUCCEEDED) {
     LOG(ERROR) << "Fail to open gif file, reason: "
                << GifErrorString(error_code);
@@ -52,12 +60,13 @@ uint8* Decode(const void* srcdata, int datasize,
     return nullptr;
   }
 
-  int num_frames = gif_file->ImageCount;
-  int width = gif_file->SWidth;
-  int height = gif_file->SHeight;
-  int channel = 3;
+  const int num_frames = gif_file->ImageCount;
+  const int width = gif_file->SWidth;
+  const int height = gif_file->SHeight;
+  const int channel = 3;
 
-  uint8* dstdata = allocate_output(num_frames, width, height, channel);
+  uint8* const dstdata = allocate_output(num_frames, width, height, channel);
+  if (!dstdata) return nullptr;
   for (int k = 0; k < num_frames; k++) {
     SavedImage* this_image = &gif_file->SavedImages[k];
     GifImageDesc* img_desc = &this_image->ImageDesc;
@@ -84,10 +93,6 @@ uint8* Decode(const void* srcdata, int datasize,
     }
   }
 
-  if (DGifCloseFile(gif_file, &error_code) != GIF_OK) {
-    LOG(WARNING) << "Fail to close gif file, reason: "
-                 << GifErrorString(error_code);
-  }
   return dstdata;
 }
 
diff --git a/tensorflow/core/lib/gif/testdata/lena.gif b/tensorflow/core/lib/gif/testdata/lena.gif
new file mode 100644
index 0000000000000000000000000000000000000000..12980a3b28af48982e50386f94f9a1c112a10f18
Binary files /dev/null and b/tensorflow/core/lib/gif/testdata/lena.gif differ
diff --git a/tensorflow/core/lib/gtl/inlined_vector_test.cc b/tensorflow/core/lib/gtl/inlined_vector_test.cc
index b957fedc4a04b0709deed5e026c56e5082bf694a..6e3c083f58a7f53e473acd2841a51e0209a2be62 100644
--- a/tensorflow/core/lib/gtl/inlined_vector_test.cc
+++ b/tensorflow/core/lib/gtl/inlined_vector_test.cc
@@ -816,7 +816,7 @@ static void BM_StdVectorFillString(int iters, int len) {
   }
   testing::ItemsProcessed(int64{iters} * len);
   // The purpose of the benchmark is to verify that inlined vector is
-  // efficient when moving is more efficent than copying. To do so, we
+  // efficient when moving is more efficient than copying. To do so, we
   // use strings that are larger than the small string optimization.
   CHECK(!StringRepresentedInline(strings[0]));
 }
diff --git a/tensorflow/core/lib/jpeg/jpeg_mem.cc b/tensorflow/core/lib/jpeg/jpeg_mem.cc
index f9846968afc60328743a6d07da9928fcc8022816..e27904ea12ab9a7f58e2954b1fdb00e074ffa01c 100644
--- a/tensorflow/core/lib/jpeg/jpeg_mem.cc
+++ b/tensorflow/core/lib/jpeg/jpeg_mem.cc
@@ -337,7 +337,8 @@ uint8* UncompressLow(const void* srcdata, FewerArgsForCompiler* argball) {
 uint8* Uncompress(const void* srcdata, int datasize,
                   const UncompressFlags& flags, int64* nwarn,
                   std::function<uint8*(int, int, int)> allocate_output) {
-  FewerArgsForCompiler argball(datasize, flags, nwarn, allocate_output);
+  FewerArgsForCompiler argball(datasize, flags, nwarn,
+                               std::move(allocate_output));
   uint8* const dstdata = UncompressLow(srcdata, &argball);
 
   const float fraction_read =
diff --git a/tensorflow/core/lib/random/random_distributions_test.cc b/tensorflow/core/lib/random/random_distributions_test.cc
index 531ed781095bc1434095bf09f3b6220b7913a1e3..28ff5bf6e8e4d9db6a0c7baef616edba97f56521 100644
--- a/tensorflow/core/lib/random/random_distributions_test.cc
+++ b/tensorflow/core/lib/random/random_distributions_test.cc
@@ -70,7 +70,7 @@ void FillRandomsWithSingles(PhiloxRandom gen,
 //   z_limit: the maximum z-test we would consider the test to pass;
 template <typename T>
 bool CheckSamplesMoments(const std::vector<T>& samples,
-                         std::function<double(int)> theoretical_moments,
+                         const std::function<double(int)>& theoretical_moments,
                          int max_moments, int stride, T z_limit) {
   const T* const samples_data = &samples[0];
   const int samples_size = samples.size();
diff --git a/tensorflow/core/lib/strings/str_util.cc b/tensorflow/core/lib/strings/str_util.cc
index da369ea516a99381b1260033fc5ea16f631c8c65..c68e14f09fbd4a89ad9cd75a8df94144d0cd2c75 100644
--- a/tensorflow/core/lib/strings/str_util.cc
+++ b/tensorflow/core/lib/strings/str_util.cc
@@ -25,7 +25,7 @@ namespace str_util {
 
 static char hex_char[] = "0123456789abcdef";
 
-string CEscape(const string& src) {
+string CEscape(StringPiece src) {
   string dest;
 
   for (unsigned char c : src) {
@@ -258,6 +258,25 @@ void TitlecaseString(string* s, StringPiece delimiters) {
   }
 }
 
+string StringReplace(StringPiece s, StringPiece oldsub, StringPiece newsub,
+                     bool replace_all) {
+  // TODO(jlebar): We could avoid having to shift data around in the string if
+  // we had a StringPiece::find() overload that searched for a StringPiece.
+  string res = s.ToString();
+  size_t pos = 0;
+  while ((pos = res.find(oldsub.data(), pos, oldsub.size())) != string::npos) {
+    res.replace(pos, oldsub.size(), newsub.data(), newsub.size());
+    pos += newsub.size();
+    if (oldsub.empty()) {
+      pos++;  // Match at the beginning of the text and after every byte
+    }
+    if (!replace_all) {
+      break;
+    }
+  }
+  return res;
+}
+
 size_t RemoveLeadingWhitespace(StringPiece* text) {
   size_t count = 0;
   const char* ptr = text->data();
diff --git a/tensorflow/core/lib/strings/str_util.h b/tensorflow/core/lib/strings/str_util.h
index bfecfef6cbb2fb2d46468068f70d694150f1b725..669f0d3c5279b90fe31398410c4a95a053d16fd5 100644
--- a/tensorflow/core/lib/strings/str_util.h
+++ b/tensorflow/core/lib/strings/str_util.h
@@ -30,7 +30,7 @@ namespace str_util {
 
 // Returns a version of 'src' where unprintable characters have been
 // escaped using C-style escape sequences.
-string CEscape(const string& src);
+string CEscape(StringPiece src);
 
 // Copies "source" to "dest", rewriting C-style escape sequences --
 // '\n', '\r', '\\', '\ooo', etc -- to their ASCII equivalents.
@@ -85,6 +85,11 @@ string Uppercase(StringPiece s);
 // set of characters that can be used as word boundaries.
 void TitlecaseString(string* s, StringPiece delimiters);
 
+// Replaces the first occurrence (if replace_all is false) or all occurrences
+// (if replace_all is true) of oldsub in s with newsub.
+string StringReplace(StringPiece s, StringPiece oldsub, StringPiece newsub,
+                     bool replace_all);
+
 // Join functionality
 template <typename T>
 string Join(const T& s, const char* sep);
diff --git a/tensorflow/core/lib/strings/str_util_test.cc b/tensorflow/core/lib/strings/str_util_test.cc
index 486690cf57fa3506724c9f4722d6283ec329dc7d..040f7447e4d2d13a9f679ba92670ee74a866dae3 100644
--- a/tensorflow/core/lib/strings/str_util_test.cc
+++ b/tensorflow/core/lib/strings/str_util_test.cc
@@ -352,4 +352,37 @@ TEST(TitlecaseString, Basic) {
   ASSERT_EQ(s, "Dense");
 }
 
+TEST(StringReplace, Basic) {
+  EXPECT_EQ("XYZ_XYZ_XYZ", str_util::StringReplace("ABC_ABC_ABC", "ABC", "XYZ",
+                                                   /*replace_all=*/true));
+}
+
+TEST(StringReplace, OnlyFirst) {
+  EXPECT_EQ("XYZ_ABC_ABC", str_util::StringReplace("ABC_ABC_ABC", "ABC", "XYZ",
+                                                   /*replace_all=*/false));
+}
+
+TEST(StringReplace, IncreaseLength) {
+  EXPECT_EQ("a b c",
+            str_util::StringReplace("abc", "b", " b ", /*replace_all=*/true));
+}
+
+TEST(StringReplace, IncreaseLengthMultipleMatches) {
+  EXPECT_EQ("a b  b c",
+            str_util::StringReplace("abbc", "b", " b ", /*replace_all=*/true));
+}
+
+TEST(StringReplace, NoChange) {
+  EXPECT_EQ("abc",
+            str_util::StringReplace("abc", "d", "X", /*replace_all=*/true));
+}
+
+TEST(StringReplace, EmptyStringReplaceFirst) {
+  EXPECT_EQ("", str_util::StringReplace("", "a", "X", /*replace_all=*/false));
+}
+
+TEST(StringReplace, EmptyStringReplaceAll) {
+  EXPECT_EQ("", str_util::StringReplace("", "a", "X", /*replace_all=*/true));
+}
+
 }  // namespace tensorflow
diff --git a/tensorflow/core/lib/strings/strcat.cc b/tensorflow/core/lib/strings/strcat.cc
index b078e8cf9450fbd4dc7667924affbe7b59900905..3e864c4f2821a00c89eaf5b7e0a0da275bc770a1 100644
--- a/tensorflow/core/lib/strings/strcat.cc
+++ b/tensorflow/core/lib/strings/strcat.cc
@@ -27,8 +27,6 @@ limitations under the License.
 namespace tensorflow {
 namespace strings {
 
-AlphaNum gEmptyAlphaNum("");
-
 AlphaNum::AlphaNum(const Eigen::half &f)
     : piece_(digits_, strlen(FloatToBuffer(static_cast<float>(f), digits_))) {}
 
diff --git a/tensorflow/core/lib/strings/strcat.h b/tensorflow/core/lib/strings/strcat.h
index 9434b9441107851485fcbaa63dc0939e73a55599..8e35549ed4bdd9afa497011c1f10504b59a0f350 100644
--- a/tensorflow/core/lib/strings/strcat.h
+++ b/tensorflow/core/lib/strings/strcat.h
@@ -144,8 +144,6 @@ class AlphaNum {
   TF_DISALLOW_COPY_AND_ASSIGN(AlphaNum);
 };
 
-extern AlphaNum gEmptyAlphaNum;
-
 // ----------------------------------------------------------------------
 // StrCat()
 //    This merges the given strings or numbers, with no delimiter.  This
diff --git a/tensorflow/core/ops/array_ops.cc b/tensorflow/core/ops/array_ops.cc
index e540ecfa8d989209520060ae7640e2ca87cdb694..b9e56a1742f262f120ee800c0e7aa364be0281a3 100644
--- a/tensorflow/core/ops/array_ops.cc
+++ b/tensorflow/core/ops/array_ops.cc
@@ -209,7 +209,7 @@ The input tensors are all required to have size 1 in the first dimension.
 
 For example:
 
-```prettyprint
+```
 # 'x' is [[1, 4]]
 # 'y' is [[2, 5]]
 # 'z' is [[3, 6]]
@@ -277,7 +277,7 @@ Etc.
 
 For example:
 
-```prettyprint
+```
 # 'x' is [1, 4]
 # 'y' is [2, 5]
 # 'z' is [3, 6]
@@ -432,19 +432,19 @@ Computes offsets of concat inputs within its output.
 
 For example:
 
-```prettyprint
+```
 # 'x' is [2, 2, 7]
 # 'y' is [2, 3, 7]
 # 'z' is [2, 5, 7]
 concat_offset(2, [x, y, z]) => [0, 0, 0], [0, 2, 0], [0, 5, 0]
 ```
 
+This is typically used by gradient computations for a concat operation.
+
 concat_dim: The dimension along which to concatenate.
 shape: The `N` int32 vectors representing shape of tensors being concatenated.
 offset: The `N` int32 vectors representing the starting offset
         of input tensors within the concatenated output.
-
-This is typically used by gradient computations for a concat operation.
 )doc");
 
 // --------------------------------------------------------------------------
@@ -518,7 +518,17 @@ REGISTER_OP("SplitV")
       } else if (rank == 0) {
         // Throw error if input is a scalar.
         return errors::InvalidArgument("Can't split scalars");
-      } else if (size_splits == nullptr || !c->ValueKnown(split_dimension)) {
+      } else if (size_splits == nullptr && c->ValueKnown(split_dimension)) {
+        // If split dimension is known, but the sizes are unknown, then
+        // only the split dimension is unknown
+        output_shape = input;
+        TF_RETURN_IF_ERROR(c->ReplaceDim(output_shape,
+                                         c->Value(split_dimension),
+                                         c->UnknownDim(), &output_shape));
+        for (int i = 0; i < num_outputs; ++i) {
+          c->set_output(i, output_shape);
+        }
+      } else if (size_splits == nullptr && !c->ValueKnown(split_dimension)) {
         // If split dimension or tensor containing the split sizes is unknown,
         // then return unknown shapes of same rank as input.
         output_shape = c->UnknownShapeOfRank(rank);
@@ -540,12 +550,38 @@ REGISTER_OP("SplitV")
           return errors::InvalidArgument(
               "Length of size_splits should be equal to num_outputs");
         }
+        int64_t cumsum_outputs = 0;
+        bool has_neg_one = false;
+        // If the sizes of the splits are known, then
+        // make sure that the sizes add up to the expected
+        // dimension size, with the possibility of a -1.
+        // Specify the full output shapes.
         for (int i = 0; i < num_outputs; ++i) {
           output_shape = c->UnknownShapeOfRank(rank);
           TF_RETURN_IF_ERROR(c->ReplaceDim(input, split_dim,
                                            c->MakeDim(data[i]), &output_shape));
           c->set_output(i, output_shape);
+          if (data[i] == -1 && !has_neg_one)
+            has_neg_one = true;
+          else if (data[i] == -1 && has_neg_one)
+            return errors::InvalidArgument("size_splits can only have one -1");
+          else
+            cumsum_outputs += data[i];
+        }
+        auto split_dim_size = c->Value(c->Dim(input, split_dim));
+        if (has_neg_one) {
+          if (cumsum_outputs < split_dim_size)
+            cumsum_outputs = split_dim_size;
+          else
+            cumsum_outputs = split_dim_size + 1;
         }
+        if (c->ValueKnown(c->Dim(input, split_dim)) &&
+            cumsum_outputs != c->Value(c->Dim(input, split_dim)))
+          return errors::InvalidArgument(
+              "Sum of output sizes must match "
+              "the size of the original Tensor along the split dimension "
+              "or the sum of the positive sizes must be less if it contains a "
+              "-1");
       }
 
       return Status::OK();
@@ -670,7 +706,7 @@ rank 2k with dimensions [D1,..., Dk, D1,..., Dk] where:
 
 For example:
 
-```prettyprint
+```
 # 'diagonal' is [1, 2, 3, 4]
 tf.diag(diagonal) ==> [[1, 0, 0, 0]
                        [0, 2, 0, 0]
@@ -722,7 +758,7 @@ tensor of rank `k` with dimensions `[D1,..., Dk]` where:
 
 For example:
 
-```prettyprint
+```
 # 'input' is [[1, 0, 0, 0]
               [0, 2, 0, 0]
               [0, 0, 3, 0]
@@ -768,7 +804,7 @@ tensor of rank `k+1` with dimensions [I, J, K, ..., N, N]` where:
 
 For example:
 
-```prettyprint
+```
 # 'diagonal' is [[1, 2, 3, 4], [5, 6, 7, 8]]
 
 and diagonal.shape = (2, 4)
@@ -880,7 +916,7 @@ The input must be at least a matrix.
 
 For example:
 
-```prettyprint
+```
 # 'input' is [[[1, 0, 0, 0]
                [0, 2, 0, 0]
                [0, 0, 3, 0]
@@ -927,7 +963,7 @@ The indicator function
 
 For example:
 
-```prettyprint
+```
 # if 'input' is [[ 0,  1,  2, 3]
                  [-1,  0,  1, 2]
                  [-2, -1,  0, 1]
@@ -946,7 +982,7 @@ tf.matrix_band_part(input, 2, 1) ==> [[ 0,  1,  0, 0]
 
 Useful special cases:
 
-```prettyprint
+```
  tf.matrix_band_part(input, 0, -1) ==> Upper triangular part.
  tf.matrix_band_part(input, -1, 0) ==> Lower triangular part.
  tf.matrix_band_part(input, 0, 0) ==> Diagonal.
@@ -968,7 +1004,7 @@ REGISTER_OP("Reverse")
     .Output("output: T")
     .Attr(
         "T: {uint8, int8, int32, int64, bool, half, float, double, complex64, "
-        "complex128}")
+        "complex128, string}")
     .SetShapeFn([](InferenceContext* c) {
       ShapeHandle input = c->input(0);
       ShapeHandle dims;
@@ -998,7 +1034,7 @@ of `tensor` must equal the number of elements in `dims`. In other words:
 
 For example:
 
-```prettyprint
+```
 # tensor 't' is [[[[ 0,  1,  2,  3],
 #                  [ 4,  5,  6,  7],
 #                  [ 8,  9, 10, 11]],
@@ -1045,7 +1081,7 @@ REGISTER_OP("ReverseV2")
     .Attr("Tidx: {int32, int64} = DT_INT32")
     .Attr(
         "T: {uint8, int8, int32, int64, bool, half, float, double, complex64, "
-        "complex128}")
+        "complex128, string}")
     .SetShapeFn([](InferenceContext* c) {
       ShapeHandle input = c->input(0);
       ShapeHandle axis;
@@ -1074,7 +1110,7 @@ once, a InvalidArgument error is raised.
 
 For example:
 
-```prettyprint
+```
 # tensor 't' is [[[[ 0,  1,  2,  3],
 #                  [ 4,  5,  6,  7],
 #                  [ 8,  9, 10, 11]],
@@ -1245,7 +1281,7 @@ This operation creates a tensor of shape `dims` and fills it with `value`.
 
 For example:
 
-```prettyprint
+```
 # Output tensor has shape [2, 3].
 fill([2, 3], 9) ==> [[9, 9, 9]
                      [9, 9, 9]]
@@ -1354,7 +1390,7 @@ out-of-bound indices result in safe but unspecified behavior, which may include
 raising an error.
 
 <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
-<img style="width:100%" src="../../../images/Gather.png" alt>
+<img style="width:100%" src="https://www.tensorflow.org/images/Gather.png" alt>
 </div>
 )doc");
 
@@ -1396,20 +1432,17 @@ REGISTER_OP("GatherNd")
     .Doc(R"doc(
 Gather values or slices from `params` according to `indices`.
 
-`params` is a Tensor of rank `P` and `indices` is a Tensor of rank `Q`.
+`indices` is an integer tensor containing indices into `params`.  The last
+dimension of `indices` can be at most the rank of `params`:
 
-`indices` must be integer tensor, containing indices into `params`.
-It must be shape `[d_0, ..., d_{Q-2}, K]` where `0 < K <= P`.
+    indices.shape[-1] <= params.rank
 
-The innermost dimension of `indices` (with length `K`) corresponds to
-indices into elements (if `K = P`) or slices (if `K < P`) along the `K`th
-dimension of `params`.
+The last dimension of `indices` corresponds to elements
+(if `indices.shape[-1] = params.rank`) or slices
+(if `indices.shape[-1] < params.rank`) along dimension `indices.shape[-1]`
+of `params`.  The output tensor has shape
 
-Produces an output tensor with shape
-
-```
-[d_0, ..., d_{Q-2}, params.shape[K], ..., params.shape[P-1]].
-```
+    indices.shape[:-1] + params.shape[indices.shape[-1]:]
 
 Some examples below.
 
@@ -1488,10 +1521,10 @@ Batched indexing into a 3-tensor:
     output = [['b0', 'b1'], ['d0', 'c1']]
 ```
 
-params: `P-D`.  The tensor from which to gather values.
-indices: `Q-D`.  Index tensor having shape `[d_0, ..., d_{Q-2}, K]`.
-output: `(P+Q-K-1)-D`.  Values from `params` gathered from indices given by
-  `indices`.
+params: The tensor from which to gather values.
+indices: Index tensor.
+output: Values from `params` gathered from indices given by `indices`, with
+  shape `indices.shape[:-1] + params.shape[indices.shape[-1]:]`.
 )doc");
 
 // --------------------------------------------------------------------------
@@ -1509,6 +1542,23 @@ REGISTER_OP("Identity")
 Return a tensor with the same shape and contents as the input tensor or value.
 )Doc");
 
+#ifdef INTEL_MKL
+REGISTER_OP("_MklIdentity")
+    .Input("input: T")
+    .Input("mkl_input: uint8")
+    .Output("output: T")
+    .Output("mkl_output: uint8")
+    .Attr("T: type")
+    .SetShapeFn([](shape_inference::InferenceContext* c) {
+      c->set_output(0, c->input(0));
+      c->set_output_handle_dtype(0, c->input_handle_dtype(0));
+      c->set_output_handle_shape(0, c->input_handle_shape(0));
+      return Status::OK();
+    })
+    .Doc(R"Doc( Mkl implementation of IdentityOp
+)Doc");
+#endif
+
 // --------------------------------------------------------------------------
 REGISTER_OP("RefIdentity")
     .Input("input: Ref(T)")
@@ -1613,7 +1663,7 @@ implied by `shape` must be the same as the number of elements in `tensor`.
 
 For example:
 
-```prettyprint
+```
 # tensor 't' is [1, 2, 3, 4, 5, 6, 7, 8, 9]
 # tensor 't' has shape [9]
 reshape(t, [3, 3]) ==> [[1, 2, 3],
@@ -1700,7 +1750,7 @@ The values must include 0. There can be no duplicate values or negative values.
 
 For example:
 
-```prettyprint
+```
 # tensor `x` is [3, 4, 0, 2, 1]
 invert_permutation(x) ==> [2, 4, 3, 0, 1]
 ```
@@ -1805,7 +1855,7 @@ in the unique output `y`. In other words:
 
 For example:
 
-```prettyprint
+```
 # tensor 'x' is [1, 1, 2, 4, 4, 4, 7, 8, 8]
 y, idx = unique(x)
 y ==> [1, 2, 4, 7, 8]
@@ -1845,7 +1895,7 @@ contains the count of each element of `y` in `x`. In other words:
 
 For example:
 
-```prettyprint
+```
 # tensor 'x' is [1, 1, 2, 4, 4, 4, 7, 8, 8]
 y, idx, count = unique_with_counts(x)
 y ==> [1, 2, 4, 7, 8]
@@ -1890,7 +1940,7 @@ This operation returns a 1-D integer tensor representing the shape of `input`.
 
 For example:
 
-```prettyprint
+```
 # 't' is [[[1, 1, 1], [2, 2, 2]], [[3, 3, 3], [4, 4, 4]]]
 shape(t) ==> [2, 2, 3]
 ```
@@ -1971,7 +2021,7 @@ slice `i`, with the first `seq_lengths[i]` slices along dimension
 
 For example:
 
-```prettyprint
+```
 # Given this:
 batch_dim = 0
 seq_dim = 1
@@ -1993,7 +2043,7 @@ output[3, 2:, :, ...] = input[3, 2:, :, ...]
 
 In contrast, if:
 
-```prettyprint
+```
 # Given this:
 batch_dim = 2
 seq_dim = 0
@@ -2034,7 +2084,7 @@ This operation returns an integer representing the rank of `input`.
 
 For example:
 
-```prettyprint
+```
 # 't' is [[[1, 1, 1], [2, 2, 2]], [[3, 3, 3], [4, 4, 4]]]
 # shape of tensor 't' is [2, 2, 3]
 rank(t) ==> 3
@@ -2060,7 +2110,7 @@ This operation returns an integer representing the number of elements in
 
 For example:
 
-```prettyprint
+```
 # 't' is [[[1, 1,, 1], [2, 2, 2]], [[3, 3, 3], [4, 4, 4]]]]
 size(t) ==> 12
 ```
@@ -2293,7 +2343,7 @@ encoding is best understand by considering a non-trivial example. In
 particular,
 `foo[1, 2:4, None, ..., :-3:-1, :]` will be encoded as
 
-```prettyprint
+```
 begin = [1, 2, x, x, 0, x] # x denotes don't care (usually 0)
 end = [2, 4, x, x, -3, x]
 strides = [1, 1, x, x, -1, 1]
@@ -2435,6 +2485,32 @@ shape must be exactly the shape produced by the slice of `ref`.
 // broadcasting.
 // --------------------------------------------------------------------------
 
+REGISTER_OP("ResourceStridedSliceAssign")
+    .Input("ref: resource")
+    .Input("begin: Index")
+    .Input("end: Index")
+    .Input("strides: Index")
+    .Input("value: T")
+    .Attr("T: type")
+    .Attr("Index: {int32, int64}")
+    .Attr("begin_mask: int = 0")
+    .Attr("end_mask: int = 0")
+    .Attr("ellipsis_mask: int = 0")
+    .Attr("new_axis_mask: int = 0")
+    .Attr("shrink_axis_mask: int = 0")
+    .SetShapeFn(shape_inference::NoOutputs)
+    .Doc(R"doc(
+Assign `value` to the sliced l-value reference of `ref`.
+
+The values of `value` are assigned to the positions in the variable
+`ref` that are selected by the slice parameters. The slice parameters
+`begin, `end`, `strides`, etc. work exactly as in `StridedSlice`.
+
+NOTE this op currently does not support broadcasting and so `value`'s
+shape must be exactly the shape produced by the slice of `ref`.
+
+)doc");
+
 REGISTER_OP("Tile")
     .Input("input: T")
     .Input("multiples: Tmultiples")
@@ -2515,7 +2591,7 @@ the output tensor can vary depending on how many true values there are in
 
 For example:
 
-```prettyprint
+```
 # 'input' tensor is [[True, False]
 #                    [True, False]]
 # 'input' has two true values, so output has two coordinates.
@@ -2619,7 +2695,7 @@ The padded size of each dimension D of the output is:
 
 For example:
 
-```prettyprint
+```
 # 't' is [[1, 1], [2, 2]]
 # 'paddings' is [[1, 1], [2, 2]]
 # rank of 't' is 2
@@ -2658,7 +2734,7 @@ The padded size of each dimension D of the output is:
 
 For example:
 
-```prettyprint
+```
 # 't' is [[1, 2, 3], [4, 5, 6]].
 # 'paddings' is [[1, 1]], [2, 2]].
 # 'mode' is SYMMETRIC.
@@ -2754,7 +2830,7 @@ The folded size of each dimension D of the output is:
 
 For example:
 
-```prettyprint
+```
 # 't' is [[1, 2, 3], [4, 5, 6], [7, 8, 9]].
 # 'paddings' is [[0, 1]], [0, 1]].
 # 'mode' is SYMMETRIC.
@@ -2930,7 +3006,7 @@ which will make the shape `[1, height, width, channels]`.
 
 Other examples:
 
-```prettyprint
+```
 # 't' is a tensor of shape [2]
 shape(expand_dims(t, 0)) ==> [1, 2]
 shape(expand_dims(t, 1)) ==> [2, 1]
@@ -3032,14 +3108,14 @@ dimensions, you can remove specific size 1 dimensions by specifying
 
 For example:
 
-```prettyprint
+```
 # 't' is a tensor of shape [1, 2, 1, 3, 1, 1]
 shape(squeeze(t)) ==> [2, 3]
 ```
 
 Or, to remove specific size 1 dimensions:
 
-```prettyprint
+```
 # 't' is a tensor of shape [1, 2, 1, 3, 1, 1]
 shape(squeeze(t, [2, 4])) ==> [1, 2, 3, 1]
 ```
@@ -3082,14 +3158,14 @@ position of each `out` element in `x`. In other words:
 
 For example, given this input:
 
-```prettyprint
+```
 x = [1, 2, 3, 4, 5, 6]
 y = [1, 3, 5]
 ```
 
 This operation would return:
 
-```prettyprint
+```
 out ==> [2, 4, 6]
 idx ==> [1, 3, 5]
 ```
@@ -3348,34 +3424,34 @@ Some examples:
 (1) For the following input of shape `[1, 2, 2, 1]`, `block_shape = [2, 2]`, and
     `paddings = [[0, 0], [0, 0]]`:
 
-```prettyprint
+```
 x = [[[[1], [2]], [[3], [4]]]]
 ```
 
 The output tensor has shape `[4, 1, 1, 1]` and value:
 
-```prettyprint
+```
 [[[[1]]], [[[2]]], [[[3]]], [[[4]]]]
 ```
 
 (2) For the following input of shape `[1, 2, 2, 3]`, `block_shape = [2, 2]`, and
     `paddings = [[0, 0], [0, 0]]`:
 
-```prettyprint
+```
 x = [[[[1, 2, 3], [4, 5, 6]],
       [[7, 8, 9], [10, 11, 12]]]]
 ```
 
 The output tensor has shape `[4, 1, 1, 3]` and value:
 
-```prettyprint
+```
 [[[1, 2, 3]], [[4, 5, 6]], [[7, 8, 9]], [[10, 11, 12]]]
 ```
 
 (3) For the following input of shape `[1, 4, 4, 1]`, `block_shape = [2, 2]`, and
     `paddings = [[0, 0], [0, 0]]`:
 
-```prettyprint
+```
 x = [[[[1],   [2],  [3],  [4]],
       [[5],   [6],  [7],  [8]],
       [[9],  [10], [11],  [12]],
@@ -3384,7 +3460,7 @@ x = [[[[1],   [2],  [3],  [4]],
 
 The output tensor has shape `[4, 2, 2, 1]` and value:
 
-```prettyprint
+```
 x = [[[[1], [3]], [[9], [11]]],
      [[[2], [4]], [[10], [12]]],
      [[[5], [7]], [[13], [15]]],
@@ -3394,7 +3470,7 @@ x = [[[[1], [3]], [[9], [11]]],
 (4) For the following input of shape `[2, 2, 4, 1]`, block_shape = `[2, 2]`, and
     paddings = `[[0, 0], [2, 0]]`:
 
-```prettyprint
+```
 x = [[[[1],   [2],  [3],  [4]],
       [[5],   [6],  [7],  [8]]],
      [[[9],  [10], [11],  [12]],
@@ -3403,7 +3479,7 @@ x = [[[[1],   [2],  [3],  [4]],
 
 The output tensor has shape `[8, 1, 3, 1]` and value:
 
-```prettyprint
+```
 x = [[[[0], [1], [3]]], [[[0], [9], [11]]],
      [[[0], [2], [4]]], [[[0], [10], [12]]],
      [[[0], [5], [7]]], [[[0], [13], [15]]],
@@ -3477,32 +3553,32 @@ Some examples:
 
 (1) For the following input of shape `[1, 2, 2, 1]` and block_size of 2:
 
-```prettyprint
+```
 x = [[[[1], [2]], [[3], [4]]]]
 ```
 
 The output tensor has shape `[4, 1, 1, 1]` and value:
 
-```prettyprint
+```
 [[[[1]]], [[[2]]], [[[3]]], [[[4]]]]
 ```
 
 (2) For the following input of shape `[1, 2, 2, 3]` and block_size of 2:
 
-```prettyprint
+```
 x = [[[[1, 2, 3], [4, 5, 6]],
       [[7, 8, 9], [10, 11, 12]]]]
 ```
 
 The output tensor has shape `[4, 1, 1, 3]` and value:
 
-```prettyprint
+```
 [[[1, 2, 3]], [[4, 5, 6]], [[7, 8, 9]], [[10, 11, 12]]]
 ```
 
 (3) For the following input of shape `[1, 4, 4, 1]` and block_size of 2:
 
-```prettyprint
+```
 x = [[[[1],   [2],  [3],  [4]],
       [[5],   [6],  [7],  [8]],
       [[9],  [10], [11],  [12]],
@@ -3511,7 +3587,7 @@ x = [[[[1],   [2],  [3],  [4]],
 
 The output tensor has shape `[4, 2, 2, 1]` and value:
 
-```prettyprint
+```
 x = [[[[1], [3]], [[9], [11]]],
      [[[2], [4]], [[10], [12]]],
      [[[5], [7]], [[13], [15]]],
@@ -3520,7 +3596,7 @@ x = [[[[1], [3]], [[9], [11]]],
 
 (4) For the following input of shape `[2, 2, 4, 1]` and block_size of 2:
 
-```prettyprint
+```
 x = [[[[1],   [2],  [3],  [4]],
       [[5],   [6],  [7],  [8]]],
      [[[9],  [10], [11],  [12]],
@@ -3529,7 +3605,7 @@ x = [[[[1],   [2],  [3],  [4]],
 
 The output tensor has shape `[8, 1, 2, 1]` and value:
 
-```prettyprint
+```
 x = [[[[1], [3]]], [[[9], [11]]], [[[2], [4]]], [[[10], [12]]],
      [[[5], [7]]], [[[13], [15]]], [[[6], [8]]], [[[14], [16]]]]
 ```
@@ -3615,26 +3691,26 @@ Some examples:
 (1) For the following input of shape `[4, 1, 1, 1]`, `block_shape = [2, 2]`, and
     `crops = [[0, 0], [0, 0]]`:
 
-```prettyprint
+```
 [[[[1]]], [[[2]]], [[[3]]], [[[4]]]]
 ```
 
 The output tensor has shape `[1, 2, 2, 1]` and value:
 
-```prettyprint
+```
 x = [[[[1], [2]], [[3], [4]]]]
 ```
 
 (2) For the following input of shape `[4, 1, 1, 3]`, `block_shape = [2, 2]`, and
     `crops = [[0, 0], [0, 0]]`:
 
-```prettyprint
+```
 [[[1, 2, 3]], [[4, 5, 6]], [[7, 8, 9]], [[10, 11, 12]]]
 ```
 
 The output tensor has shape `[1, 2, 2, 3]` and value:
 
-```prettyprint
+```
 x = [[[[1, 2, 3], [4, 5, 6]],
       [[7, 8, 9], [10, 11, 12]]]]
 ```
@@ -3642,7 +3718,7 @@ x = [[[[1, 2, 3], [4, 5, 6]],
 (3) For the following input of shape `[4, 2, 2, 1]`, `block_shape = [2, 2]`, and
     `crops = [[0, 0], [0, 0]]`:
 
-```prettyprint
+```
 x = [[[[1], [3]], [[9], [11]]],
      [[[2], [4]], [[10], [12]]],
      [[[5], [7]], [[13], [15]]],
@@ -3651,7 +3727,7 @@ x = [[[[1], [3]], [[9], [11]]],
 
 The output tensor has shape `[1, 4, 4, 1]` and value:
 
-```prettyprint
+```
 x = [[[1],   [2],  [3],  [4]],
      [[5],   [6],  [7],  [8]],
      [[9],  [10], [11],  [12]],
@@ -3661,7 +3737,7 @@ x = [[[1],   [2],  [3],  [4]],
 (4) For the following input of shape `[8, 1, 3, 1]`, `block_shape = [2, 2]`, and
     `crops = [[0, 0], [2, 0]]`:
 
-```prettyprint
+```
 x = [[[[0], [1], [3]]], [[[0], [9], [11]]],
      [[[0], [2], [4]]], [[[0], [10], [12]]],
      [[[0], [5], [7]]], [[[0], [13], [15]]],
@@ -3670,7 +3746,7 @@ x = [[[[0], [1], [3]]], [[[0], [9], [11]]],
 
 The output tensor has shape `[2, 2, 4, 1]` and value:
 
-```prettyprint
+```
 x = [[[[1],   [2],  [3],  [4]],
       [[5],   [6],  [7],  [8]]],
      [[[9],  [10], [11],  [12]],
@@ -3735,32 +3811,32 @@ Some examples:
 
 (1) For the following input of shape `[4, 1, 1, 1]` and block_size of 2:
 
-```prettyprint
+```
 [[[[1]]], [[[2]]], [[[3]]], [[[4]]]]
 ```
 
 The output tensor has shape `[1, 2, 2, 1]` and value:
 
-```prettyprint
+```
 x = [[[[1], [2]], [[3], [4]]]]
 ```
 
 (2) For the following input of shape `[4, 1, 1, 3]` and block_size of 2:
 
-```prettyprint
+```
 [[[1, 2, 3]], [[4, 5, 6]], [[7, 8, 9]], [[10, 11, 12]]]
 ```
 
 The output tensor has shape `[1, 2, 2, 3]` and value:
 
-```prettyprint
+```
 x = [[[[1, 2, 3], [4, 5, 6]],
       [[7, 8, 9], [10, 11, 12]]]]
 ```
 
 (3) For the following input of shape `[4, 2, 2, 1]` and block_size of 2:
 
-```prettyprint
+```
 x = [[[[1], [3]], [[9], [11]]],
      [[[2], [4]], [[10], [12]]],
      [[[5], [7]], [[13], [15]]],
@@ -3769,7 +3845,7 @@ x = [[[[1], [3]], [[9], [11]]],
 
 The output tensor has shape `[1, 4, 4, 1]` and value:
 
-```prettyprint
+```
 x = [[[1],   [2],  [3],  [4]],
      [[5],   [6],  [7],  [8]],
      [[9],  [10], [11],  [12]],
@@ -3778,14 +3854,14 @@ x = [[[1],   [2],  [3],  [4]],
 
 (4) For the following input of shape `[8, 1, 2, 1]` and block_size of 2:
 
-```prettyprint
+```
 x = [[[[1], [3]]], [[[9], [11]]], [[[2], [4]]], [[[10], [12]]],
      [[[5], [7]]], [[[13], [15]]], [[[6], [8]]], [[[14], [16]]]]
 ```
 
 The output tensor has shape `[2, 2, 4, 1]` and value:
 
-```prettyprint
+```
 x = [[[[1], [3]], [[5], [7]]],
      [[[2], [4]], [[10], [12]]],
      [[[5], [7]], [[13], [15]]],
@@ -3851,14 +3927,14 @@ purely convolutional models.
 
 For example, given this input of shape `[1, 2, 2, 1]`, and block_size of 2:
 
-```prettyprint
+```
 x = [[[[1], [2]],
       [[3], [4]]]]
 ```
 
 This operation will output a tensor of shape `[1, 1, 1, 4]`:
 
-```prettyprint
+```
 [[[[1, 2, 3, 4]]]]
 ```
 
@@ -3869,7 +3945,7 @@ The output element shape is `[1, 1, 4]`.
 
 For an input tensor with larger depth, here of shape `[1, 2, 2, 3]`, e.g.
 
-```prettyprint
+```
 x = [[[[1, 2, 3], [4, 5, 6]],
       [[7, 8, 9], [10, 11, 12]]]]
 ```
@@ -3877,13 +3953,13 @@ x = [[[[1, 2, 3], [4, 5, 6]],
 This operation, for block_size of 2, will return the following tensor of shape
 `[1, 1, 1, 12]`
 
-```prettyprint
+```
 [[[[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]]]]
 ```
 
 Similarly, for the following input of shape `[1 4 4 1]`, and a block size of 2:
 
-```prettyprint
+```
 x = [[[[1],   [2],  [5],  [6]],
       [[3],   [4],  [7],  [8]],
       [[9],  [10], [13],  [14]],
@@ -3892,7 +3968,7 @@ x = [[[[1],   [2],  [5],  [6]],
 
 the operator will return the following tensor of shape `[1 2 2 4]`:
 
-```prettyprint
+```
 x = [[[[1, 2, 3, 4],
        [5, 6, 7, 8]],
       [[9, 10, 11, 12],
@@ -3961,14 +4037,14 @@ purely convolutional models.
 
 For example, given this input of shape `[1, 1, 1, 4]`, and a block size of 2:
 
-```prettyprint
+```
 x = [[[[1, 2, 3, 4]]]]
 
 ```
 
 This operation will output a tensor of shape `[1, 2, 2, 1]`:
 
-```prettyprint
+```
    [[[[1], [2]],
      [[3], [4]]]]
 ```
@@ -3980,14 +4056,14 @@ The output element shape is `[2, 2, 1]`.
 
 For an input tensor with larger depth, here of shape `[1, 1, 1, 12]`, e.g.
 
-```prettyprint
+```
 x = [[[[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]]]]
 ```
 
 This operation, for block size of 2, will return the following tensor of shape
 `[1, 2, 2, 3]`
 
-```prettyprint
+```
    [[[[1, 2, 3], [4, 5, 6]],
      [[7, 8, 9], [10, 11, 12]]]]
 
@@ -3995,7 +4071,7 @@ This operation, for block size of 2, will return the following tensor of shape
 
 Similarly, for the following input of shape `[1 2 2 4]`, and a block size of 2:
 
-```prettyprint
+```
 x =  [[[[1, 2, 3, 4],
        [5, 6, 7, 8]],
       [[9, 10, 11, 12],
@@ -4004,7 +4080,7 @@ x =  [[[[1, 2, 3, 4],
 
 the operator will return the following tensor of shape `[1 4 4 1]`:
 
-```prettyprint
+```
 x = [[ [1],   [2],  [5],  [6]],
      [ [3],   [4],  [7],  [8]],
      [ [9],  [10], [13],  [14]],
@@ -4750,37 +4826,35 @@ REGISTER_OP("ScatterNd")
     .Attr("T: type")
     .Attr("Tindices: {int32, int64}")
     .SetShapeFn(ScatterNdShape)
-    .Doc(
-        R"doc(Creates a new tensor by applying sparse `updates` to individual
-values or slices within a zero tensor of the given `shape` tensor according to
+    .Doc(R"doc(
+Scatter `updates` into a new (initially zero) tensor according to `indices`.
+
+Creates a new tensor by applying sparse `updates` to individual
+values or slices within a zero tensor of the given `shape` according to
 indices.  This operator is the inverse of the [tf.gather_nd](#gather_nd)
 operator which extracts values or slices from a given tensor.
 
-TODO(simister): Add a link to Variable.__getitem__ documentation on slice
-syntax.
-
-`shape` is a `TensorShape` with rank `P` and `indices` is a `Tensor` of rank
-`Q`.
+**WARNING**: The order in which updates are applied is nondeterministic, so the
+output will be nondeterministic if `indices` contains duplicates.
 
-`indices` must be integer tensor, containing indices into `shape`.
-It must be shape `[d_0, ..., d_{Q-2}, K]` where `0 < K <= P`.
+`indices` is an integer tensor containing indices into a new tensor of shape
+`shape`.  The last dimension of `indices` can be at most the rank of `shape`:
 
-The innermost dimension of `indices` (with length `K`) corresponds to
-indices into elements (if `K = P`) or slices (if `K < P`) along the `K`th
-dimension of `shape`.
+    indices.shape[-1] <= shape.rank
 
-`updates` is Tensor of rank `Q-1+P-K` with shape:
+The last dimension of `indices` corresponds to indices into elements
+(if `indices.shape[-1] = shape.rank`) or slices
+(if `indices.shape[-1] < shape.rank`) along dimension `indices.shape[-1]` of
+`shape`.  `updates` is a tensor with shape
 
-```
-[d_0, ..., d_{Q-2}, shape[K], ..., shape[P-1]].
-```
+    indices.shape[:-1] + shape[indices.shape[-1]:]
 
 The simplest form of scatter is to insert individual elements in a tensor by
 index. For example, say we want to insert 4 scattered elements in a rank-1
 tensor with 8 elements.
 
 <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
-<img style="width:100%" src="../../images/ScatterNd1.png" alt>
+<img style="width:100%" src="https://www.tensorflow.org/images/ScatterNd1.png" alt>
 </div>
 
 In Python, this scatter operation would look like this:
@@ -4791,7 +4865,7 @@ In Python, this scatter operation would look like this:
     shape = tf.constant([8])
     scatter = tf.scatter_nd(indices, updates, shape)
     with tf.Session() as sess:
-      print sess.run(scatter)
+      print(sess.run(scatter))
 ```
 
 The resulting tensor would look like this:
@@ -4803,7 +4877,7 @@ example, if we wanted to insert two slices in the first dimension of a
 rank-3 tensor with two matrices of new values.
 
 <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
-<img style="width:100%" src="../../images/ScatterNd2.png" alt>
+<img style="width:100%" src="https://www.tensorflow.org/images/ScatterNd2.png" alt>
 </div>
 
 In Python, this scatter operation would look like this:
@@ -4817,7 +4891,7 @@ In Python, this scatter operation would look like this:
     shape = tf.constant([4, 4, 4])
     scatter = tf.scatter_nd(indices, updates, shape)
     with tf.Session() as sess:
-      print sess.run(scatter)
+      print(sess.run(scatter))
 ```
 
 The resulting tensor would look like this:
@@ -4827,11 +4901,9 @@ The resulting tensor would look like this:
      [[5, 5, 5, 5], [6, 6, 6, 6], [7, 7, 7, 7], [8, 8, 8, 8]],
      [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]]]
 
-indices: A Tensor. Must be one of the following types: int32, int64.
-  A tensor of indices into ref.
-updates: A Tensor. Must have the same type as tensor. A tensor of updated values
-  to store in ref.
-shape: A vector. The shape of the resulting tensor.
+indices: Index tensor.
+updates: Updates to scatter into output.
+shape: 1-D. The shape of the resulting tensor.
 output: A new tensor with the given shape and updates applied according
   to the indices.
 )doc");
@@ -4839,6 +4911,7 @@ output: A new tensor with the given shape and updates applied according
 REGISTER_OP("FakeQuantWithMinMaxArgs")
     .Attr("min: float = -6.0")
     .Attr("max: float = 6.0")
+    .Attr("num_bits: int = 8")
     .Input("inputs: float")
     .Output("outputs: float")
     .SetShapeFn(shape_inference::UnchangedShape)
@@ -4848,6 +4921,7 @@ Fake-quantize the 'inputs' tensor, type float to 'outputs' tensor of same type.
 Attributes [min; max] define the clamping range for the 'inputs' data.  Op
 divides this range into 255 steps (total of 256 values), then replaces each
 'inputs' value with the closest of the quantized step values.
+'num_bits' is the bitwidth of the quantization; between 2 and 8, inclusive.
 
 Quantization is called fake since the output is still in floating point.
 )doc");
@@ -4855,6 +4929,7 @@ Quantization is called fake since the output is still in floating point.
 REGISTER_OP("FakeQuantWithMinMaxArgsGradient")
     .Attr("min: float = -6.0")
     .Attr("max: float = 6.0")
+    .Attr("num_bits: int = 8")
     .Input("gradients: float")
     .Input("inputs: float")
     .Output("backprops: float")
@@ -4869,6 +4944,7 @@ backprops: Backpropagated gradients below the FakeQuantWithMinMaxArgs operation:
 )doc");
 
 REGISTER_OP("FakeQuantWithMinMaxVars")
+    .Attr("num_bits: int = 8")
     .Input("inputs: float")
     .Input("min: float")
     .Input("max: float")
@@ -4887,11 +4963,13 @@ and `max` to 'outputs' tensor of same shape as `inputs`.
 [min; max] is the clamping range for the 'inputs' data.  Op divides this range
 into 255 steps (total of 256 values), then replaces each 'inputs' value with the
 closest of the quantized step values.
+'num_bits' is the bitwidth of the quantization; between 2 and 8, inclusive.
 
 This operation has a gradient and thus allows for training `min` and `max` values.
 )doc");
 
 REGISTER_OP("FakeQuantWithMinMaxVarsGradient")
+    .Attr("num_bits: int = 8")
     .Input("gradients: float")
     .Input("inputs: float")
     .Input("min: float")
@@ -4920,6 +4998,7 @@ Compute gradients for a FakeQuantWithMinMaxVars operation.
 gradients: Backpropagated gradients above the FakeQuantWithMinMaxVars operation.
 inputs: Values passed as inputs to the FakeQuantWithMinMaxVars operation.
 min, max: Quantization interval, scalar floats.
+num_bits: The bitwidth of the quantization; between 2 and 8, inclusive.
 backprops_wrt_input: Backpropagated gradients w.r.t. inputs:
   `gradients * (inputs >= min && inputs <= max)`.
 backprop_wrt_min: Backpropagated gradients w.r.t. min parameter:
@@ -4929,6 +5008,7 @@ backprop_wrt_max: Backpropagated gradients w.r.t. max parameter:
 )doc");
 
 REGISTER_OP("FakeQuantWithMinMaxVarsPerChannel")
+    .Attr("num_bits: int = 8")
     .Input("inputs: float")
     .Input("min: float")
     .Input("max: float")
@@ -4955,11 +5035,13 @@ to 'outputs' tensor of same shape as `inputs`.
 [min; max] is the clamping range for the 'inputs' data in the corresponding
 depth channel.  Op divides this range into 255 steps (total of 256 values), then
 replaces each 'inputs' value with the closest of the quantized step values.
+'num_bits' is the bitwidth of the quantization; between 2 and 8, inclusive.
 
 This operation has a gradient and thus allows for training `min` and `max` values.
 )doc");
 
 REGISTER_OP("FakeQuantWithMinMaxVarsPerChannelGradient")
+    .Attr("num_bits: int = 8")
     .Input("gradients: float")
     .Input("inputs: float")
     .Input("min: float")
@@ -4993,6 +5075,7 @@ gradients: Backpropagated gradients above the FakeQuantWithMinMaxVars operation,
 inputs: Values passed as inputs to the FakeQuantWithMinMaxVars operation, shape
   same as `gradients`.
 min, max: Quantization interval, floats of shape `[d]`.
+num_bits: The bitwidth of the quantization; between 2 and 8, inclusive.
 backprops_wrt_input: Backpropagated gradients w.r.t. inputs, shape same as
   `inputs`:
     `gradients * (inputs >= min && inputs <= max)`.
diff --git a/tensorflow/core/ops/candidate_sampling_ops.cc b/tensorflow/core/ops/candidate_sampling_ops.cc
index 037c393574dcbe4ea6bf705b5048b657e97573df..18700be67a667359d7a86d8f81ada383be973a0a 100644
--- a/tensorflow/core/ops/candidate_sampling_ops.cc
+++ b/tensorflow/core/ops/candidate_sampling_ops.cc
@@ -55,6 +55,7 @@ REGISTER_OP("UniformCandidateSampler")
     .Attr("seed: int = 0")
     .Attr("seed2: int = 0")
     .SetShapeFn(CandidateSamplerShapeFn)
+    .SetIsStateful()
     .Doc(R"doc(
 Generates labels for candidate sampling with a uniform distribution.
 
@@ -80,7 +81,7 @@ sampled_expected_count: A vector of length num_sampled, for each sampled
   to occur in a batch of sampled candidates.  If unique=true, then this is a
   probability.
 num_true: Number of true labels per context.
-num_sampled: Number of candidates to randomly sample per batch.
+num_sampled: Number of candidates to randomly sample.
 unique: If unique is true, we sample with rejection, so that all sampled
   candidates in a batch are unique. This requires some approximation to
   estimate the post-rejection sampling probabilities.
@@ -103,6 +104,7 @@ REGISTER_OP("LogUniformCandidateSampler")
     .Attr("seed: int = 0")
     .Attr("seed2: int = 0")
     .SetShapeFn(CandidateSamplerShapeFn)
+    .SetIsStateful()
     .Doc(R"doc(
 Generates labels for candidate sampling with a log-uniform distribution.
 
@@ -129,7 +131,7 @@ sampled_expected_count: A vector of length num_sampled, for each sampled
   to occur in a batch of sampled candidates.  If unique=true, then this is a
   probability.
 num_true: Number of true labels per context.
-num_sampled: Number of candidates to randomly sample per batch.
+num_sampled: Number of candidates to randomly sample.
 unique: If unique is true, we sample with rejection, so that all sampled
   candidates in a batch are unique. This requires some approximation to
   estimate the post-rejection sampling probabilities.
@@ -152,6 +154,7 @@ REGISTER_OP("LearnedUnigramCandidateSampler")
     .Attr("seed: int = 0")
     .Attr("seed2: int = 0")
     .SetShapeFn(CandidateSamplerShapeFn)
+    .SetIsStateful()
     .Doc(R"doc(
 Generates labels for candidate sampling with a learned unigram distribution.
 
@@ -177,7 +180,7 @@ sampled_expected_count: A vector of length num_sampled, for each sampled
   to occur in a batch of sampled candidates.  If unique=true, then this is a
   probability.
 num_true: Number of true labels per context.
-num_sampled: Number of candidates to randomly sample per batch.
+num_sampled: Number of candidates to randomly sample.
 unique: If unique is true, we sample with rejection, so that all sampled
   candidates in a batch are unique. This requires some approximation to
   estimate the post-rejection sampling probabilities.
@@ -200,6 +203,7 @@ REGISTER_OP("ThreadUnsafeUnigramCandidateSampler")
     .Attr("seed: int = 0")
     .Attr("seed2: int = 0")
     .SetShapeFn(CandidateSamplerShapeFn)
+    .SetIsStateful()
     .Doc(R"doc(
 Generates labels for candidate sampling with a learned unigram distribution.
 
@@ -225,7 +229,7 @@ sampled_expected_count: A vector of length num_sampled, for each sampled
   to occur in a batch of sampled candidates.  If unique=true, then this is a
   probability.
 num_true: Number of true labels per context.
-num_sampled: Number of candidates to randomly sample per batch.
+num_sampled: Number of candidates to randomly sample.
 unique: If unique is true, we sample with rejection, so that all sampled
   candidates in a batch are unique. This requires some approximation to
   estimate the post-rejection sampling probabilities.
@@ -254,6 +258,7 @@ REGISTER_OP("FixedUnigramCandidateSampler")
     .Attr("seed: int = 0")
     .Attr("seed2: int = 0")
     .SetShapeFn(CandidateSamplerShapeFn)
+    .SetIsStateful()
     .Doc(R"doc(
 Generates labels for candidate sampling with a learned unigram distribution.
 
@@ -284,7 +289,7 @@ sampled_expected_count: A vector of length num_sampled, for each sampled
   to occur in a batch of sampled candidates.  If unique=true, then this is a
   probability.
 num_true: Number of true labels per context.
-num_sampled: Number of candidates to randomly sample per batch.
+num_sampled: Number of candidates to randomly sample.
 unique: If unique is true, we sample with rejection, so that all sampled
   candidates in a batch are unique. This requires some approximation to
   estimate the post-rejection sampling probabilities.
@@ -329,6 +334,7 @@ REGISTER_OP("AllCandidateSampler")
     .Attr("seed: int = 0")
     .Attr("seed2: int = 0")
     .SetShapeFn(CandidateSamplerShapeFn)
+    .SetIsStateful()
     .Doc(R"doc(
 Generates labels for candidate sampling with a learned unigram distribution.
 
@@ -354,7 +360,7 @@ sampled_expected_count: A vector of length num_sampled, for each sampled
   to occur in a batch of sampled candidates.  If unique=true, then this is a
   probability.
 num_true: Number of true labels per context.
-num_sampled: Number of candidates to produce per batch.
+num_sampled: Number of candidates to produce.
 unique: If unique is true, we sample with rejection, so that all sampled
   candidates in a batch are unique. This requires some approximation to
   estimate the post-rejection sampling probabilities.
diff --git a/tensorflow/core/ops/compat/ops_history.v1.pbtxt b/tensorflow/core/ops/compat/ops_history.v1.pbtxt
index dfec411ca5d74bbc305399cc15bf8c19ff7f7edd..22d9c351164f15e595f6ba980e18eaf531de7946 100644
--- a/tensorflow/core/ops/compat/ops_history.v1.pbtxt
+++ b/tensorflow/core/ops/compat/ops_history.v1.pbtxt
@@ -487,6 +487,56 @@ op {
     }
   }
 }
+op {
+  name: "AllCandidateSampler"
+  input_arg {
+    name: "true_classes"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "sampled_candidates"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "true_expected_count"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "sampled_expected_count"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "num_true"
+    type: "int"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "num_sampled"
+    type: "int"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "unique"
+    type: "bool"
+  }
+  attr {
+    name: "seed"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  attr {
+    name: "seed2"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  is_stateful: true
+}
 op {
   name: "Any"
   input_arg {
@@ -1666,6 +1716,31 @@ op {
     }
   }
 }
+op {
+  name: "Atan2"
+  input_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "z"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+}
 op {
   name: "AudioSpectrogram"
   input_arg {
@@ -1860,17 +1935,68 @@ op {
       list {
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT32
-        type: DT_INT64
-        type: DT_UINT8
-        type: DT_INT16
-        type: DT_INT8
-        type: DT_UINT16
         type: DT_HALF
       }
     }
   }
 }
+op {
+  name: "AvgPool"
+  input_arg {
+    name: "value"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "ksize"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 4
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 4
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+  attr {
+    name: "data_format"
+    type: "string"
+    default_value {
+      s: "NHWC"
+    }
+    allowed_values {
+      list {
+        s: "NHWC"
+        s: "NCHW"
+      }
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+}
 op {
   name: "AvgPool3D"
   input_arg {
@@ -1910,18 +2036,6 @@ op {
       list {
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT64
-        type: DT_INT32
-        type: DT_UINT8
-        type: DT_UINT16
-        type: DT_INT16
-        type: DT_INT8
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
-        type: DT_QINT8
-        type: DT_QUINT8
-        type: DT_QINT32
-        type: DT_HALF
       }
     }
   }
@@ -1978,18 +2092,6 @@ op {
       list {
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT64
-        type: DT_INT32
-        type: DT_UINT8
-        type: DT_UINT16
-        type: DT_INT16
-        type: DT_INT8
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
-        type: DT_QINT8
-        type: DT_QUINT8
-        type: DT_QINT32
-        type: DT_HALF
       }
     }
   }
@@ -2037,18 +2139,6 @@ op {
       list {
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT64
-        type: DT_INT32
-        type: DT_UINT8
-        type: DT_UINT16
-        type: DT_INT16
-        type: DT_INT8
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
-        type: DT_QINT8
-        type: DT_QUINT8
-        type: DT_QINT32
-        type: DT_HALF
       }
     }
   }
@@ -2109,18 +2199,6 @@ op {
       list {
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT64
-        type: DT_INT32
-        type: DT_UINT8
-        type: DT_UINT16
-        type: DT_INT16
-        type: DT_INT8
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
-        type: DT_QINT8
-        type: DT_QUINT8
-        type: DT_QINT32
-        type: DT_HALF
       }
     }
   }
@@ -2242,17 +2320,72 @@ op {
       list {
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT32
-        type: DT_INT64
-        type: DT_UINT8
-        type: DT_INT16
-        type: DT_INT8
-        type: DT_UINT16
         type: DT_HALF
       }
     }
   }
 }
+op {
+  name: "AvgPoolGrad"
+  input_arg {
+    name: "orig_input_shape"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "ksize"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 4
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 4
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+  attr {
+    name: "data_format"
+    type: "string"
+    default_value {
+      s: "NHWC"
+    }
+    allowed_values {
+      list {
+        s: "NHWC"
+        s: "NCHW"
+      }
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+}
 op {
   name: "Barrier"
   output_arg {
@@ -3485,11 +3618,38 @@ op {
   }
 }
 op {
-  name: "CTCBeamSearchDecoder"
+  name: "Bucketize"
   input_arg {
-    name: "inputs"
-    type: DT_FLOAT
-  }
+    name: "input"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type: DT_INT32
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+  attr {
+    name: "boundaries"
+    type: "list(float)"
+  }
+}
+op {
+  name: "CTCBeamSearchDecoder"
+  input_arg {
+    name: "inputs"
+    type: DT_FLOAT
+  }
   input_arg {
     name: "sequence_length"
     type: DT_INT32
@@ -3608,6 +3768,54 @@ op {
     }
   }
 }
+op {
+  name: "CTCLoss"
+  input_arg {
+    name: "inputs"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "labels_indices"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "labels_values"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "sequence_length"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "loss"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "gradient"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "preprocess_collapse_repeated"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "ctc_merge_repeated"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+  attr {
+    name: "ignore_longer_outputs_than_inputs"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+}
 op {
   name: "Cast"
   input_arg {
@@ -4040,7 +4248,6 @@ op {
       list {
         type: DT_HALF
         type: DT_FLOAT
-        type: DT_DOUBLE
       }
     }
   }
@@ -4104,7 +4311,6 @@ op {
       list {
         type: DT_HALF
         type: DT_FLOAT
-        type: DT_DOUBLE
       }
     }
   }
@@ -4168,7 +4374,6 @@ op {
       list {
         type: DT_HALF
         type: DT_FLOAT
-        type: DT_DOUBLE
       }
     }
   }
@@ -4228,18 +4433,6 @@ op {
       list {
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT64
-        type: DT_INT32
-        type: DT_UINT8
-        type: DT_UINT16
-        type: DT_INT16
-        type: DT_INT8
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
-        type: DT_QINT8
-        type: DT_QUINT8
-        type: DT_QINT32
-        type: DT_HALF
       }
     }
   }
@@ -4281,18 +4474,6 @@ op {
       list {
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT64
-        type: DT_INT32
-        type: DT_UINT8
-        type: DT_UINT16
-        type: DT_INT16
-        type: DT_INT8
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
-        type: DT_QINT8
-        type: DT_QUINT8
-        type: DT_QINT32
-        type: DT_HALF
       }
     }
   }
@@ -4351,18 +4532,6 @@ op {
       list {
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT64
-        type: DT_INT32
-        type: DT_UINT8
-        type: DT_UINT16
-        type: DT_INT16
-        type: DT_INT8
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
-        type: DT_QINT8
-        type: DT_QUINT8
-        type: DT_QINT32
-        type: DT_HALF
       }
     }
   }
@@ -4411,18 +4580,6 @@ op {
       list {
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT64
-        type: DT_INT32
-        type: DT_UINT8
-        type: DT_UINT16
-        type: DT_INT16
-        type: DT_INT8
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
-        type: DT_QINT8
-        type: DT_QUINT8
-        type: DT_QINT32
-        type: DT_HALF
       }
     }
   }
@@ -4468,18 +4625,6 @@ op {
       list {
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT64
-        type: DT_INT32
-        type: DT_UINT8
-        type: DT_UINT16
-        type: DT_INT16
-        type: DT_INT8
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
-        type: DT_QINT8
-        type: DT_QUINT8
-        type: DT_QINT32
-        type: DT_HALF
       }
     }
   }
@@ -4538,18 +4683,6 @@ op {
       list {
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT64
-        type: DT_INT32
-        type: DT_UINT8
-        type: DT_UINT16
-        type: DT_INT16
-        type: DT_INT8
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
-        type: DT_QINT8
-        type: DT_QUINT8
-        type: DT_QINT32
-        type: DT_HALF
       }
     }
   }
@@ -4598,18 +4731,6 @@ op {
       list {
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT64
-        type: DT_INT32
-        type: DT_UINT8
-        type: DT_UINT16
-        type: DT_INT16
-        type: DT_INT8
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
-        type: DT_QINT8
-        type: DT_QUINT8
-        type: DT_QINT32
-        type: DT_HALF
       }
     }
   }
@@ -4655,18 +4776,6 @@ op {
       list {
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT64
-        type: DT_INT32
-        type: DT_UINT8
-        type: DT_UINT16
-        type: DT_INT16
-        type: DT_INT8
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
-        type: DT_QINT8
-        type: DT_QUINT8
-        type: DT_QINT32
-        type: DT_HALF
       }
     }
   }
@@ -6596,17 +6705,33 @@ op {
       list {
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT32
-        type: DT_INT64
-        type: DT_UINT8
-        type: DT_INT16
-        type: DT_INT8
-        type: DT_UINT16
         type: DT_HALF
       }
     }
   }
 }
+op {
+  name: "Elu"
+  input_arg {
+    name: "features"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "activations"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+}
 op {
   name: "EluGrad"
   input_arg {
@@ -6628,17 +6753,37 @@ op {
       list {
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT32
-        type: DT_INT64
-        type: DT_UINT8
-        type: DT_INT16
-        type: DT_INT8
-        type: DT_UINT16
         type: DT_HALF
       }
     }
   }
 }
+op {
+  name: "EluGrad"
+  input_arg {
+    name: "gradients"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "outputs"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "backprops"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+}
 op {
   name: "EncodeBase64"
   input_arg {
@@ -7252,17 +7397,13 @@ op {
   }
 }
 op {
-  name: "FakeQuantWithMinMaxArgsGradient"
-  input_arg {
-    name: "gradients"
-    type: DT_FLOAT
-  }
+  name: "FakeQuantWithMinMaxArgs"
   input_arg {
     name: "inputs"
     type: DT_FLOAT
   }
   output_arg {
-    name: "backprops"
+    name: "outputs"
     type: DT_FLOAT
   }
   attr {
@@ -7279,15 +7420,106 @@ op {
       f: 6
     }
   }
+  attr {
+    name: "num_bits"
+    type: "int"
+    default_value {
+      i: 8
+    }
+  }
 }
 op {
-  name: "FakeQuantWithMinMaxVars"
+  name: "FakeQuantWithMinMaxArgsGradient"
   input_arg {
-    name: "inputs"
+    name: "gradients"
     type: DT_FLOAT
   }
   input_arg {
-    name: "min"
+    name: "inputs"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "backprops"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "min"
+    type: "float"
+    default_value {
+      f: -6
+    }
+  }
+  attr {
+    name: "max"
+    type: "float"
+    default_value {
+      f: 6
+    }
+  }
+}
+op {
+  name: "FakeQuantWithMinMaxArgsGradient"
+  input_arg {
+    name: "gradients"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "inputs"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "backprops"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "min"
+    type: "float"
+    default_value {
+      f: -6
+    }
+  }
+  attr {
+    name: "max"
+    type: "float"
+    default_value {
+      f: 6
+    }
+  }
+  attr {
+    name: "num_bits"
+    type: "int"
+    default_value {
+      i: 8
+    }
+  }
+}
+op {
+  name: "FakeQuantWithMinMaxVars"
+  input_arg {
+    name: "inputs"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "min"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "max"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "outputs"
+    type: DT_FLOAT
+  }
+}
+op {
+  name: "FakeQuantWithMinMaxVars"
+  input_arg {
+    name: "inputs"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "min"
     type: DT_FLOAT
   }
   input_arg {
@@ -7298,6 +7530,13 @@ op {
     name: "outputs"
     type: DT_FLOAT
   }
+  attr {
+    name: "num_bits"
+    type: "int"
+    default_value {
+      i: 8
+    }
+  }
 }
 op {
   name: "FakeQuantWithMinMaxVarsGradient"
@@ -7330,6 +7569,63 @@ op {
     type: DT_FLOAT
   }
 }
+op {
+  name: "FakeQuantWithMinMaxVarsGradient"
+  input_arg {
+    name: "gradients"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "inputs"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "min"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "max"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "backprops_wrt_input"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "backprop_wrt_min"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "backprop_wrt_max"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "num_bits"
+    type: "int"
+    default_value {
+      i: 8
+    }
+  }
+}
+op {
+  name: "FakeQuantWithMinMaxVarsPerChannel"
+  input_arg {
+    name: "inputs"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "min"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "max"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "outputs"
+    type: DT_FLOAT
+  }
+}
 op {
   name: "FakeQuantWithMinMaxVarsPerChannel"
   input_arg {
@@ -7348,6 +7644,13 @@ op {
     name: "outputs"
     type: DT_FLOAT
   }
+  attr {
+    name: "num_bits"
+    type: "int"
+    default_value {
+      i: 8
+    }
+  }
 }
 op {
   name: "FakeQuantWithMinMaxVarsPerChannelGradient"
@@ -7380,6 +7683,44 @@ op {
     type: DT_FLOAT
   }
 }
+op {
+  name: "FakeQuantWithMinMaxVarsPerChannelGradient"
+  input_arg {
+    name: "gradients"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "inputs"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "min"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "max"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "backprops_wrt_input"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "backprop_wrt_min"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "backprop_wrt_max"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "num_bits"
+    type: "int"
+    default_value {
+      i: 8
+    }
+  }
+}
 op {
   name: "FakeQueue"
   input_arg {
@@ -7689,6 +8030,108 @@ op {
     }
   }
 }
+op {
+  name: "FixedUnigramCandidateSampler"
+  input_arg {
+    name: "true_classes"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "sampled_candidates"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "true_expected_count"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "sampled_expected_count"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "num_true"
+    type: "int"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "num_sampled"
+    type: "int"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "unique"
+    type: "bool"
+  }
+  attr {
+    name: "range_max"
+    type: "int"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "vocab_file"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "distortion"
+    type: "float"
+    default_value {
+      f: 1
+    }
+  }
+  attr {
+    name: "num_reserved_ids"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  attr {
+    name: "num_shards"
+    type: "int"
+    default_value {
+      i: 1
+    }
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "shard"
+    type: "int"
+    default_value {
+      i: 0
+    }
+    has_minimum: true
+  }
+  attr {
+    name: "unigrams"
+    type: "list(float)"
+    default_value {
+      list {
+      }
+    }
+  }
+  attr {
+    name: "seed"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  attr {
+    name: "seed2"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  is_stateful: true
+}
 op {
   name: "Floor"
   input_arg {
@@ -8052,19 +8495,6 @@ op {
     allowed_values {
       list {
         type: DT_FLOAT
-        type: DT_DOUBLE
-        type: DT_INT64
-        type: DT_INT32
-        type: DT_UINT8
-        type: DT_UINT16
-        type: DT_INT16
-        type: DT_INT8
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
-        type: DT_QINT8
-        type: DT_QUINT8
-        type: DT_QINT32
-        type: DT_HALF
       }
     }
   }
@@ -8138,19 +8568,6 @@ op {
     allowed_values {
       list {
         type: DT_FLOAT
-        type: DT_DOUBLE
-        type: DT_INT64
-        type: DT_INT32
-        type: DT_UINT8
-        type: DT_UINT16
-        type: DT_INT16
-        type: DT_INT8
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
-        type: DT_QINT8
-        type: DT_QUINT8
-        type: DT_QINT32
-        type: DT_HALF
       }
     }
   }
@@ -8199,9 +8616,7 @@ op {
     type: "type"
     allowed_values {
       list {
-        type: DT_HALF
         type: DT_FLOAT
-        type: DT_DOUBLE
       }
     }
   }
@@ -8257,9 +8672,7 @@ op {
     type: "type"
     allowed_values {
       list {
-        type: DT_HALF
         type: DT_FLOAT
-        type: DT_DOUBLE
       }
     }
   }
@@ -8565,6 +8978,43 @@ op {
   }
   is_stateful: true
 }
+op {
+  name: "HashTableV2"
+  output_arg {
+    name: "table_handle"
+    type: DT_RESOURCE
+  }
+  attr {
+    name: "container"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "shared_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "use_node_name_sharing"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "key_dtype"
+    type: "type"
+  }
+  attr {
+    name: "value_dtype"
+    type: "type"
+  }
+  is_stateful: true
+}
 op {
   name: "HistogramSummary"
   input_arg {
@@ -8996,6 +9446,70 @@ op {
     }
   }
 }
+op {
+  name: "InitializeTableFromTextFileV2"
+  input_arg {
+    name: "table_handle"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "filename"
+    type: DT_STRING
+  }
+  attr {
+    name: "key_index"
+    type: "int"
+    has_minimum: true
+    minimum: -2
+  }
+  attr {
+    name: "value_index"
+    type: "int"
+    has_minimum: true
+    minimum: -2
+  }
+  attr {
+    name: "vocab_size"
+    type: "int"
+    default_value {
+      i: -1
+    }
+    has_minimum: true
+    minimum: -1
+  }
+  attr {
+    name: "delimiter"
+    type: "string"
+    default_value {
+      s: "\t"
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "InitializeTableV2"
+  input_arg {
+    name: "table_handle"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "keys"
+    type_attr: "Tkey"
+  }
+  input_arg {
+    name: "values"
+    type_attr: "Tval"
+  }
+  attr {
+    name: "Tkey"
+    type: "type"
+  }
+  attr {
+    name: "Tval"
+    type: "type"
+  }
+  is_stateful: true
+}
 op {
   name: "Inv"
   input_arg {
@@ -9180,22 +9694,33 @@ op {
       list {
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT64
-        type: DT_INT32
-        type: DT_UINT8
-        type: DT_UINT16
-        type: DT_INT16
-        type: DT_INT8
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
-        type: DT_QINT8
-        type: DT_QUINT8
-        type: DT_QINT32
         type: DT_HALF
       }
     }
   }
 }
+op {
+  name: "L2Loss"
+  input_arg {
+    name: "t"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+}
 op {
   name: "LRN"
   input_arg {
@@ -9363,6 +9888,62 @@ op {
     }
   }
 }
+op {
+  name: "LearnedUnigramCandidateSampler"
+  input_arg {
+    name: "true_classes"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "sampled_candidates"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "true_expected_count"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "sampled_expected_count"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "num_true"
+    type: "int"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "num_sampled"
+    type: "int"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "unique"
+    type: "bool"
+  }
+  attr {
+    name: "range_max"
+    type: "int"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "seed"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  attr {
+    name: "seed2"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  is_stateful: true
+}
 op {
   name: "Less"
   input_arg {
@@ -9652,6 +10233,62 @@ op {
     }
   }
 }
+op {
+  name: "LogUniformCandidateSampler"
+  input_arg {
+    name: "true_classes"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "sampled_candidates"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "true_expected_count"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "sampled_expected_count"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "num_true"
+    type: "int"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "num_sampled"
+    type: "int"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "unique"
+    type: "bool"
+  }
+  attr {
+    name: "range_max"
+    type: "int"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "seed"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  attr {
+    name: "seed2"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  is_stateful: true
+}
 op {
   name: "LogicalAnd"
   input_arg {
@@ -9719,6 +10356,30 @@ op {
     type: "type"
   }
 }
+op {
+  name: "LookupTableExportV2"
+  input_arg {
+    name: "table_handle"
+    type: DT_RESOURCE
+  }
+  output_arg {
+    name: "keys"
+    type_attr: "Tkeys"
+  }
+  output_arg {
+    name: "values"
+    type_attr: "Tvalues"
+  }
+  attr {
+    name: "Tkeys"
+    type: "type"
+  }
+  attr {
+    name: "Tvalues"
+    type: "type"
+  }
+  is_stateful: true
+}
 op {
   name: "LookupTableFind"
   input_arg {
@@ -9747,6 +10408,34 @@ op {
     type: "type"
   }
 }
+op {
+  name: "LookupTableFindV2"
+  input_arg {
+    name: "table_handle"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "keys"
+    type_attr: "Tin"
+  }
+  input_arg {
+    name: "default_value"
+    type_attr: "Tout"
+  }
+  output_arg {
+    name: "values"
+    type_attr: "Tout"
+  }
+  attr {
+    name: "Tin"
+    type: "type"
+  }
+  attr {
+    name: "Tout"
+    type: "type"
+  }
+  is_stateful: true
+}
 op {
   name: "LookupTableImport"
   input_arg {
@@ -9771,6 +10460,30 @@ op {
     type: "type"
   }
 }
+op {
+  name: "LookupTableImportV2"
+  input_arg {
+    name: "table_handle"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "keys"
+    type_attr: "Tin"
+  }
+  input_arg {
+    name: "values"
+    type_attr: "Tout"
+  }
+  attr {
+    name: "Tin"
+    type: "type"
+  }
+  attr {
+    name: "Tout"
+    type: "type"
+  }
+  is_stateful: true
+}
 op {
   name: "LookupTableInsert"
   input_arg {
@@ -9795,6 +10508,30 @@ op {
     type: "type"
   }
 }
+op {
+  name: "LookupTableInsertV2"
+  input_arg {
+    name: "table_handle"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "keys"
+    type_attr: "Tin"
+  }
+  input_arg {
+    name: "values"
+    type_attr: "Tout"
+  }
+  attr {
+    name: "Tin"
+    type: "type"
+  }
+  attr {
+    name: "Tout"
+    type: "type"
+  }
+  is_stateful: true
+}
 op {
   name: "LookupTableSize"
   input_arg {
@@ -9807,6 +10544,18 @@ op {
     type: DT_INT64
   }
 }
+op {
+  name: "LookupTableSizeV2"
+  input_arg {
+    name: "table_handle"
+    type: DT_RESOURCE
+  }
+  output_arg {
+    name: "size"
+    type: DT_INT64
+  }
+  is_stateful: true
+}
 op {
   name: "LoopCond"
   input_arg {
@@ -10322,19 +11071,6 @@ op {
     allowed_values {
       list {
         type: DT_FLOAT
-        type: DT_DOUBLE
-        type: DT_INT64
-        type: DT_INT32
-        type: DT_UINT8
-        type: DT_UINT16
-        type: DT_INT16
-        type: DT_INT8
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
-        type: DT_QINT8
-        type: DT_QUINT8
-        type: DT_QINT32
-        type: DT_HALF
       }
     }
   }
@@ -10390,19 +11126,6 @@ op {
     allowed_values {
       list {
         type: DT_FLOAT
-        type: DT_DOUBLE
-        type: DT_INT64
-        type: DT_INT32
-        type: DT_UINT8
-        type: DT_UINT16
-        type: DT_INT16
-        type: DT_INT8
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
-        type: DT_QINT8
-        type: DT_QUINT8
-        type: DT_QINT32
-        type: DT_HALF
       }
     }
   }
@@ -10453,19 +11176,6 @@ op {
     allowed_values {
       list {
         type: DT_FLOAT
-        type: DT_DOUBLE
-        type: DT_INT64
-        type: DT_INT32
-        type: DT_UINT8
-        type: DT_UINT16
-        type: DT_INT16
-        type: DT_INT8
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
-        type: DT_QINT8
-        type: DT_QUINT8
-        type: DT_QINT32
-        type: DT_HALF
       }
     }
   }
@@ -10529,19 +11239,6 @@ op {
     allowed_values {
       list {
         type: DT_FLOAT
-        type: DT_DOUBLE
-        type: DT_INT64
-        type: DT_INT32
-        type: DT_UINT8
-        type: DT_UINT16
-        type: DT_INT16
-        type: DT_INT8
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
-        type: DT_QINT8
-        type: DT_QUINT8
-        type: DT_QINT32
-        type: DT_HALF
       }
     }
   }
@@ -10608,44 +11305,18 @@ op {
     allowed_values {
       list {
         type: DT_FLOAT
-        type: DT_DOUBLE
-        type: DT_INT64
-        type: DT_INT32
-        type: DT_UINT8
-        type: DT_UINT16
-        type: DT_INT16
-        type: DT_INT8
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
-        type: DT_QINT8
-        type: DT_QUINT8
-        type: DT_QINT32
-        type: DT_HALF
       }
-    }
-  }
-  attr {
-    name: "TInput"
-    type: "type"
-    default_value {
-      type: DT_FLOAT
-    }
-    allowed_values {
-      list {
-        type: DT_FLOAT
-        type: DT_DOUBLE
-        type: DT_INT64
-        type: DT_INT32
-        type: DT_UINT8
-        type: DT_UINT16
-        type: DT_INT16
-        type: DT_INT8
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
-        type: DT_QINT8
-        type: DT_QUINT8
-        type: DT_QINT32
-        type: DT_HALF
+    }
+  }
+  attr {
+    name: "TInput"
+    type: "type"
+    default_value {
+      type: DT_FLOAT
+    }
+    allowed_values {
+      list {
+        type: DT_FLOAT
       }
     }
   }
@@ -10709,14 +11380,6 @@ op {
     allowed_values {
       list {
         type: DT_FLOAT
-        type: DT_DOUBLE
-        type: DT_INT32
-        type: DT_INT64
-        type: DT_UINT8
-        type: DT_INT16
-        type: DT_INT8
-        type: DT_UINT16
-        type: DT_HALF
       }
     }
   }
@@ -11797,6 +12460,69 @@ op {
   }
   is_stateful: true
 }
+op {
+  name: "MutableDenseHashTableV2"
+  input_arg {
+    name: "empty_key"
+    type_attr: "key_dtype"
+  }
+  output_arg {
+    name: "table_handle"
+    type: DT_RESOURCE
+  }
+  attr {
+    name: "container"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "shared_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "use_node_name_sharing"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "key_dtype"
+    type: "type"
+  }
+  attr {
+    name: "value_dtype"
+    type: "type"
+  }
+  attr {
+    name: "value_shape"
+    type: "shape"
+    default_value {
+      shape {
+      }
+    }
+  }
+  attr {
+    name: "initial_num_buckets"
+    type: "int"
+    default_value {
+      i: 131072
+    }
+  }
+  attr {
+    name: "max_load_factor"
+    type: "float"
+    default_value {
+      f: 0.8
+    }
+  }
+  is_stateful: true
+}
 op {
   name: "MutableHashTable"
   output_arg {
@@ -11881,6 +12607,88 @@ op {
   }
   is_stateful: true
 }
+op {
+  name: "MutableHashTableOfTensorsV2"
+  output_arg {
+    name: "table_handle"
+    type: DT_RESOURCE
+  }
+  attr {
+    name: "container"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "shared_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "use_node_name_sharing"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "key_dtype"
+    type: "type"
+  }
+  attr {
+    name: "value_dtype"
+    type: "type"
+  }
+  attr {
+    name: "value_shape"
+    type: "shape"
+    default_value {
+      shape {
+      }
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "MutableHashTableV2"
+  output_arg {
+    name: "table_handle"
+    type: DT_RESOURCE
+  }
+  attr {
+    name: "container"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "shared_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "use_node_name_sharing"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "key_dtype"
+    type: "type"
+  }
+  attr {
+    name: "value_dtype"
+    type: "type"
+  }
+  is_stateful: true
+}
 op {
   name: "Neg"
   input_arg {
@@ -17612,6 +18420,79 @@ op {
   }
   is_stateful: true
 }
+op {
+  name: "ResourceStridedSliceAssign"
+  input_arg {
+    name: "ref"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "begin"
+    type_attr: "Index"
+  }
+  input_arg {
+    name: "end"
+    type_attr: "Index"
+  }
+  input_arg {
+    name: "strides"
+    type_attr: "Index"
+  }
+  input_arg {
+    name: "value"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+  attr {
+    name: "Index"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "begin_mask"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  attr {
+    name: "end_mask"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  attr {
+    name: "ellipsis_mask"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  attr {
+    name: "new_axis_mask"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  attr {
+    name: "shrink_axis_mask"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  is_stateful: true
+}
 op {
   name: "Restore"
   input_arg {
@@ -17726,6 +18607,40 @@ op {
     }
   }
 }
+op {
+  name: "Reverse"
+  input_arg {
+    name: "tensor"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "dims"
+    type: DT_BOOL
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_UINT8
+        type: DT_INT8
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_BOOL
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_STRING
+      }
+    }
+  }
+}
 op {
   name: "ReverseSequence"
   input_arg {
@@ -17815,6 +18730,53 @@ op {
     }
   }
 }
+op {
+  name: "ReverseV2"
+  input_arg {
+    name: "tensor"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "axis"
+    type_attr: "Tidx"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "Tidx"
+    type: "type"
+    default_value {
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_UINT8
+        type: DT_INT8
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_BOOL
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_STRING
+      }
+    }
+  }
+}
 op {
   name: "Rint"
   input_arg {
@@ -20811,6 +21773,99 @@ op {
   }
   is_stateful: true
 }
+op {
+  name: "SparseCross"
+  input_arg {
+    name: "indices"
+    type: DT_INT64
+    number_attr: "N"
+  }
+  input_arg {
+    name: "values"
+    type_list_attr: "sparse_types"
+  }
+  input_arg {
+    name: "shapes"
+    type: DT_INT64
+    number_attr: "N"
+  }
+  input_arg {
+    name: "dense_inputs"
+    type_list_attr: "dense_types"
+  }
+  output_arg {
+    name: "output_indices"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "output_values"
+    type_attr: "out_type"
+  }
+  output_arg {
+    name: "output_shape"
+    type: DT_INT64
+  }
+  attr {
+    name: "N"
+    type: "int"
+    has_minimum: true
+  }
+  attr {
+    name: "hashed_output"
+    type: "bool"
+  }
+  attr {
+    name: "num_buckets"
+    type: "int"
+    has_minimum: true
+  }
+  attr {
+    name: "hash_key"
+    type: "int"
+  }
+  attr {
+    name: "sparse_types"
+    type: "list(type)"
+    has_minimum: true
+    allowed_values {
+      list {
+        type: DT_INT64
+        type: DT_STRING
+      }
+    }
+  }
+  attr {
+    name: "dense_types"
+    type: "list(type)"
+    has_minimum: true
+    allowed_values {
+      list {
+        type: DT_INT64
+        type: DT_STRING
+      }
+    }
+  }
+  attr {
+    name: "out_type"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT64
+        type: DT_STRING
+      }
+    }
+  }
+  attr {
+    name: "internal_type"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT64
+        type: DT_STRING
+      }
+    }
+  }
+}
 op {
   name: "SparseDenseCwiseAdd"
   input_arg {
@@ -24151,6 +25206,62 @@ op {
     }
   }
 }
+op {
+  name: "ThreadUnsafeUnigramCandidateSampler"
+  input_arg {
+    name: "true_classes"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "sampled_candidates"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "true_expected_count"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "sampled_expected_count"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "num_true"
+    type: "int"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "num_sampled"
+    type: "int"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "unique"
+    type: "bool"
+  }
+  attr {
+    name: "range_max"
+    type: "int"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "seed"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  attr {
+    name: "seed2"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  is_stateful: true
+}
 op {
   name: "Tile"
   input_arg {
@@ -24490,6 +25601,62 @@ op {
     }
   }
 }
+op {
+  name: "UniformCandidateSampler"
+  input_arg {
+    name: "true_classes"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "sampled_candidates"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "true_expected_count"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "sampled_expected_count"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "num_true"
+    type: "int"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "num_sampled"
+    type: "int"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "unique"
+    type: "bool"
+  }
+  attr {
+    name: "range_max"
+    type: "int"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "seed"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  attr {
+    name: "seed2"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  is_stateful: true
+}
 op {
   name: "Unique"
   input_arg {
diff --git a/tensorflow/core/ops/control_flow_ops.cc b/tensorflow/core/ops/control_flow_ops.cc
index 57224c365d6ed57a450eedf24031a6399ee20229..95f3f0da9db0b0e42fee670b0ff771a6c20324b6 100644
--- a/tensorflow/core/ops/control_flow_ops.cc
+++ b/tensorflow/core/ops/control_flow_ops.cc
@@ -331,7 +331,10 @@ REGISTER_OP("Abort")
     .Attr("exit_without_error: bool = false")
     .SetShapeFn(shape_inference::NoOutputs)
     .Doc(R"doc(
-Raise a exception to abort the process when called. If exit_without_error is true, the process will exit normally, otherwise it will exit with a SIGABORT signal.
+Raise a exception to abort the process when called.
+
+If exit_without_error is true, the process will exit normally,
+otherwise it will exit with a SIGABORT signal.
 
 Returns nothing but an exception.
 
diff --git a/tensorflow/core/ops/ctc_ops.cc b/tensorflow/core/ops/ctc_ops.cc
index c94ce577c0beb8da9cff476a1528c7d67c6df22a..3d8c53393560e9cc31549f6574db72fbfbe35ce9 100644
--- a/tensorflow/core/ops/ctc_ops.cc
+++ b/tensorflow/core/ops/ctc_ops.cc
@@ -31,6 +31,7 @@ REGISTER_OP("CTCLoss")
     .Input("sequence_length: int32")
     .Attr("preprocess_collapse_repeated: bool = false")
     .Attr("ctc_merge_repeated: bool = true")
+    .Attr("ignore_longer_outputs_than_inputs: bool = false")
     .Output("loss: float")
     .Output("gradient: float")
     .SetShapeFn([](InferenceContext* c) {
@@ -75,6 +76,9 @@ preprocess_collapse_repeated: Scalar, if true then repeated labels are
 ctc_merge_repeated: Scalar.  If set to false, *during* CTC calculation
   repeated non-blank labels will not be merged and are interpreted as
   individual labels.  This is a simplified version of CTC.
+ignore_longer_outputs_than_inputs: Scalar. If set to true, during CTC
+  calculation items have longer input sequences than output sequences
+  are ignored by returning zero-gradient for those items.
 loss: A vector (batch) containing log-probabilities.
 gradient: The gradient of `loss`.  3-D, shape:
   `(max_time x batch_size x num_classes)`.
diff --git a/tensorflow/core/ops/data_flow_ops.cc b/tensorflow/core/ops/data_flow_ops.cc
index cb886ae8fa26d8a86a07d2f4191cf4059f205285..c80ff983cfa3e830ddff844a882dd33b36c4d623 100644
--- a/tensorflow/core/ops/data_flow_ops.cc
+++ b/tensorflow/core/ops/data_flow_ops.cc
@@ -101,8 +101,10 @@ For example:
     outputs[1] = [30, 40]
 ```
 
+See `dynamic_stitch` for an example on how to merge partitions back.
+
 <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
-<img style="width:100%" src="../../images/DynamicPartition.png" alt>
+<img style="width:100%" src="https://www.tensorflow.org/images/DynamicPartition.png" alt>
 </div>
 
 partitions: Any shape.  Indices in the range `[0, num_partitions)`.
@@ -189,8 +191,26 @@ For example:
               [51, 52], [61, 62]]
 ```
 
+This method can be used to merge partitions created by `dynamic_partition`
+as illustrated on the following example:
+
+```python
+    # Apply function (increments x_i) on elements for which a certain condition
+    # apply (x_i != -1 in this example).
+    x=tf.constant([0.1, -1., 5.2, 4.3, -1., 7.4])
+    condition_mask=tf.not_equal(x,tf.constant(-1.))
+    partitioned_data = tf.dynamic_partition(
+        x, tf.cast(condition_mask, tf.int32) , 2)
+    partitioned_data[1] = partitioned_data[1] + 1.0
+    condition_indices = tf.dynamic_partition(
+        tf.range(tf.shape(x)[0]), tf.cast(condition_mask, tf.int32) , 2)
+    x = tf.dynamic_stitch(condition_indices, partitioned_data)
+    # Here x=[1.1, -1., 6.2, 5.3, -1, 8.4], the -1. values remain
+    # unchanged.
+```
+
 <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
-<img style="width:100%" src="../../images/DynamicStitch.png" alt>
+<img style="width:100%" src="https://www.tensorflow.org/images/DynamicStitch.png" alt>
 </div>
 )doc");
 
@@ -210,10 +230,29 @@ Status TwoElementVectorInputsAndScalarOutputs(InferenceContext* c) {
   return Status::OK();
 }
 
+Status ScalarAndTwoElementVectorInputsAndScalarOutputs(InferenceContext* c) {
+  ShapeHandle handle;
+  DimensionHandle unused_handle;
+  TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 0, &handle));
+  for (int i = 1; i < c->num_inputs(); ++i) {
+    TF_RETURN_IF_ERROR(c->WithRank(c->input(i), 1, &handle));
+    TF_RETURN_IF_ERROR(c->WithValue(c->Dim(handle, 0), 2, &unused_handle));
+  }
+  for (int i = 0; i < c->num_outputs(); ++i) {
+    c->set_output(i, c->Scalar());
+  }
+  return Status::OK();
+}
+
 Status TwoElementOutput(InferenceContext* c) {
   c->set_output(0, c->Vector(2));
   return Status::OK();
 }
+
+Status ScalarOutput(InferenceContext* c) {
+  c->set_output(0, c->Scalar());
+  return Status::OK();
+}
 }  // namespace
 
 REGISTER_OP("RandomShuffleQueue")
@@ -604,7 +643,17 @@ REGISTER_OP("QueueDequeueV2")
     .Output("components: component_types")
     .Attr("component_types: list(type) >= 1")
     .Attr("timeout_ms: int = -1")
-    .SetShapeFn(shape_inference::UnknownShape)
+    .SetShapeFn([](InferenceContext* c) {
+      if (c->num_outputs() == 1) {
+        c->set_output(0, c->input_handle_shape(0));
+      } else {
+        // TODO(vrv): handle the case of multiple outputs.
+        for (int i = 0; i < c->num_outputs(); ++i) {
+          c->set_output(i, c->UnknownShape());
+        }
+      }
+      return Status::OK();
+    })
     .Doc(R"doc(
 Dequeues a tuple of one or more tensors from the given queue.
 
@@ -631,20 +680,20 @@ REGISTER_OP("QueueDequeueMany")
     .Attr("timeout_ms: int = -1")
     .SetShapeFn(shape_inference::UnknownShape)
     .Doc(R"doc(
-Dequeues n tuples of one or more tensors from the given queue.
+Dequeues `n` tuples of one or more tensors from the given queue.
 
-If the queue is closed and there are fewer than n elements, then an
+If the queue is closed and there are fewer than `n` elements, then an
 OutOfRange error is returned.
 
 This operation concatenates queue-element component tensors along the
 0th dimension to make a single component tensor.  All of the components
-in the dequeued tuple will have size n in the 0th dimension.
+in the dequeued tuple will have size `n` in the 0th dimension.
 
-This operation has k outputs, where k is the number of components in
-the tuples stored in the given queue, and output i is the ith
+This operation has `k` outputs, where `k` is the number of components in
+the tuples stored in the given queue, and output `i` is the ith
 component of the dequeued tuple.
 
-N.B. If the queue is empty, this operation will block until n elements
+N.B. If the queue is empty, this operation will block until `n` elements
 have been dequeued (or 'timeout_ms' elapses, if specified).
 
 handle: The handle to a queue.
@@ -664,20 +713,20 @@ REGISTER_OP("QueueDequeueManyV2")
     .Attr("timeout_ms: int = -1")
     .SetShapeFn(shape_inference::UnknownShape)
     .Doc(R"doc(
-Dequeues n tuples of one or more tensors from the given queue.
+Dequeues `n` tuples of one or more tensors from the given queue.
 
-If the queue is closed and there are fewer than n elements, then an
+If the queue is closed and there are fewer than `n` elements, then an
 OutOfRange error is returned.
 
 This operation concatenates queue-element component tensors along the
 0th dimension to make a single component tensor.  All of the components
-in the dequeued tuple will have size n in the 0th dimension.
+in the dequeued tuple will have size `n` in the 0th dimension.
 
-This operation has k outputs, where k is the number of components in
-the tuples stored in the given queue, and output i is the ith
+This operation has `k` outputs, where `k` is the number of components in
+the tuples stored in the given queue, and output `i` is the ith
 component of the dequeued tuple.
 
-N.B. If the queue is empty, this operation will block until n elements
+N.B. If the queue is empty, this operation will block until `n` elements
 have been dequeued (or 'timeout_ms' elapses, if specified).
 
 handle: The handle to a queue.
@@ -697,24 +746,24 @@ REGISTER_OP("QueueDequeueUpTo")
     .Attr("timeout_ms: int = -1")
     .SetShapeFn(shape_inference::UnknownShape)
     .Doc(R"doc(
-Dequeues n tuples of one or more tensors from the given queue.
+Dequeues `n` tuples of one or more tensors from the given queue.
 
 This operation is not supported by all queues.  If a queue does not support
 DequeueUpTo, then an Unimplemented error is returned.
 
-If the queue is closed and there are more than 0 but less than n elements
-remaining, then instead of returning an OutOfRange error like
-QueueDequeueMany, less than `n` elements are returned immediately.  If the queue
-is closed and there are 0 elements left in the queue, then an OutOfRange
-error is returned just like in QueueDequeueMany.  Otherwise the behavior
-is identical to QueueDequeueMany:
+If the queue is closed and there are more than 0 but less than `n`
+elements remaining, then instead of returning an OutOfRange error like
+QueueDequeueMany, less than `n` elements are returned immediately.  If
+the queue is closed and there are 0 elements left in the queue, then
+an OutOfRange error is returned just like in QueueDequeueMany.
+Otherwise the behavior is identical to QueueDequeueMany:
 
 This operation concatenates queue-element component tensors along the
 0th dimension to make a single component tensor.  All of the components
-in the dequeued tuple will have size n in the 0th dimension.
+in the dequeued tuple will have size `n` in the 0th dimension.
 
-This operation has k outputs, where k is the number of components in
-the tuples stored in the given queue, and output i is the ith
+This operation has k outputs, where `k` is the number of components in
+the tuples stored in the given queue, and output `i` is the ith
 component of the dequeued tuple.
 
 handle: The handle to a queue.
@@ -734,24 +783,24 @@ REGISTER_OP("QueueDequeueUpToV2")
     .Attr("timeout_ms: int = -1")
     .SetShapeFn(shape_inference::UnknownShape)
     .Doc(R"doc(
-Dequeues n tuples of one or more tensors from the given queue.
+Dequeues `n` tuples of one or more tensors from the given queue.
 
 This operation is not supported by all queues.  If a queue does not support
 DequeueUpTo, then an Unimplemented error is returned.
 
-If the queue is closed and there are more than 0 but less than n elements
-remaining, then instead of returning an OutOfRange error like
-QueueDequeueMany, less than `n` elements are returned immediately.  If the queue
-is closed and there are 0 elements left in the queue, then an OutOfRange
-error is returned just like in QueueDequeueMany.  Otherwise the behavior
-is identical to QueueDequeueMany:
+If the queue is closed and there are more than 0 but less than `n`
+elements remaining, then instead of returning an OutOfRange error like
+QueueDequeueMany, less than `n` elements are returned immediately.  If
+the queue is closed and there are 0 elements left in the queue, then
+an OutOfRange error is returned just like in QueueDequeueMany.
+Otherwise the behavior is identical to QueueDequeueMany:
 
 This operation concatenates queue-element component tensors along the
 0th dimension to make a single component tensor.  All of the components
 in the dequeued tuple will have size n in the 0th dimension.
 
-This operation has k outputs, where k is the number of components in
-the tuples stored in the given queue, and output i is the ith
+This operation has `k` outputs, where `k` is the number of components in
+the tuples stored in the given queue, and output `i` is the ith
 component of the dequeued tuple.
 
 handle: The handle to a queue.
@@ -843,8 +892,10 @@ REGISTER_OP("AccumulatorSetGlobalStep")
       return Status::OK();
     })
     .Doc(R"doc(
-Updates the accumulator with a new value for global_step. Logs warning if the
-accumulator's value is already higher than new_global_step.
+Updates the accumulator with a new value for global_step.
+
+Logs warning if the accumulator's value is already higher than
+new_global_step.
 
 handle: The handle to an accumulator.
 new_global_step: The new global_step value to set.
@@ -862,20 +913,22 @@ REGISTER_OP("ConditionalAccumulator")
       return Status::OK();
     })
     .Doc(R"doc(
-A conditional accumulator for aggregating gradients. The accumulator accepts
-gradients marked with local_step greater or equal to the most recent global_step
-known to the accumulator. The average can be extracted from the accumulator,
-provided sufficient gradients have been accumulated. Extracting the average
-automatically resets the aggregate to 0, and increments the global_step recorded
-by the accumulator.
+A conditional accumulator for aggregating gradients.
+
+The accumulator accepts gradients marked with local_step greater or
+equal to the most recent global_step known to the accumulator. The
+average can be extracted from the accumulator, provided sufficient
+gradients have been accumulated. Extracting the average automatically
+resets the aggregate to 0, and increments the global_step recorded by
+the accumulator.
 
 handle: The handle to the accumulator.
 dtype: The type of the value being accumulated.
 shape: The shape of the values, can be [], in which case shape is unknown.
 container: If non-empty, this accumulator is placed in the given container.
   Otherwise, a default container is used.
-shared_name: If non-empty, this accumulator will be shared under the given name
-  across multiple sessions.
+shared_name: If non-empty, this accumulator will be shared under the
+  given name across multiple sessions.
 )doc");
 
 REGISTER_OP("AccumulatorApplyGradient")
@@ -889,8 +942,9 @@ REGISTER_OP("AccumulatorApplyGradient")
       return Status::OK();
     })
     .Doc(R"doc(
-Applies a gradient to a given accumulator. Does not add if local_step is lesser
-than the accumulator's global_step.
+Applies a gradient to a given accumulator.
+
+Does not add if local_step is lesser than the accumulator's global_step.
 
 handle: The handle to a accumulator.
 local_step: The local_step value at which the gradient was computed.
@@ -913,13 +967,13 @@ REGISTER_OP("AccumulatorTakeGradient")
     })
     .Attr("dtype: numbertype")
     .Doc(R"doc(
-Extracts the average gradient in the given ConditionalAccumulator, provided
-that sufficient (i.e., more than num_required) gradients have been accumulated.
-The op blocks until sufficient gradients have been accumulated.
-If the accumulator has already aggregated more than num_required gradients, it
-returns the average of the accumulated gradients.
-Also automatically increments the recorded global_step in the accumulator by 1,
-and resets the aggregate to 0.
+Extracts the average gradient in the given ConditionalAccumulator.
+
+The op blocks until sufficient (i.e., more than num_required)
+gradients have been accumulated.  If the accumulator has already
+aggregated more than num_required gradients, it returns the average of
+the accumulated gradients.  Also automatically increments the recorded
+global_step in the accumulator by 1, and resets the aggregate to 0.
 
 handle: The handle to an accumulator.
 num_required: Number of gradients required before we return an aggregate.
@@ -940,12 +994,14 @@ REGISTER_OP("SparseConditionalAccumulator")
       return Status::OK();
     })
     .Doc(R"doc(
-A conditional accumulator for aggregating sparse gradients. The accumulator
-accepts gradients marked with local_step greater or equal to the most recent
-global_step known to the accumulator. The average can be extracted from the
-accumulator, provided sufficient gradients have been accumulated. Extracting the
-average automatically resets the aggregate to 0, and increments the global_step
-recorded by the accumulator.
+A conditional accumulator for aggregating sparse gradients.
+
+The accumulator accepts gradients marked with local_step greater or
+equal to the most recent global_step known to the accumulator. The
+average can be extracted from the accumulator, provided sufficient
+gradients have been accumulated. Extracting the average automatically
+resets the aggregate to 0, and increments the global_step recorded by
+the accumulator.
 
 handle: The handle to the accumulator.
 dtype: The type of the value being accumulated.
@@ -970,8 +1026,10 @@ REGISTER_OP("SparseAccumulatorApplyGradient")
       return Status::OK();
     })
     .Doc(R"doc(
-Applies a sparse gradient to a given accumulator. Does not add if local_step is
-lesser than the accumulator's global_step.
+Applies a sparse gradient to a given accumulator.
+
+Does not add if local_step is smaller than the accumulator's
+global_step.
 
 handle: The handle to a accumulator.
 local_step: The local_step value at which the sparse gradient was computed.
@@ -1003,13 +1061,14 @@ REGISTER_OP("SparseAccumulatorTakeGradient")
       return shape_inference::UnknownShape(c);
     })
     .Doc(R"doc(
-Extracts the average sparse gradient in the given SparseConditionalAccumulator,
-provided that sufficient (i.e., more than num_required) gradients have been
-accumulated. The op will blocks until sufficient gradients have been
-accumulated. If the accumulator has already aggregated more than num_required
-gradients, it will return its average of the accumulated gradients.
-Also automatically increments the recorded global_step in the accumulator by 1,
-and resets the aggregate to 0.
+Extracts the average sparse gradient in a SparseConditionalAccumulator.
+
+The op will blocks until sufficient (i.e., more than num_required)
+gradients have been accumulated. If the accumulator has already
+aggregated more than num_required gradients, it will return its
+average of the accumulated gradients.  Also automatically increments
+the recorded global_step in the accumulator by 1, and resets the
+aggregate to 0.
 
 handle: The handle to a SparseConditionalAccumulator.
 num_required: Number of gradients required before we return an aggregate.
@@ -1043,7 +1102,10 @@ REGISTER_OP("StackPush")
     .Output("output: T")
     .Attr("T: type")
     .Attr("swap_memory: bool = false")
-    .SetShapeFn(shape_inference::UnknownShape)
+    .SetShapeFn([](shape_inference::InferenceContext* c) {
+      c->set_output(0, c->input(1));
+      return Status::OK();
+    })
     .Doc(R"doc(
 Push an element onto the stack.
 
@@ -1095,8 +1157,9 @@ REGISTER_OP("TensorArrayV3")
       return Status::OK();
     })
     .Doc(R"doc(
-An array of Tensors of given size, with data written via Write and read
-via Read or Pack.
+An array of Tensors of given size.
+
+Write data via Write and read via Read or Pack.
 
 handle: The handle to the TensorArray.
 flow: A scalar used to control gradient flow.
@@ -1412,8 +1475,10 @@ REGISTER_OP("TensorArrayCloseV3")
       return Status::OK();
     })
     .Doc(R"doc(
-Delete the TensorArray from its resource container.  This enables
-the user to close and release the resource in the middle of a step/run.
+Delete the TensorArray from its resource container.
+
+This enables the user to close and release the resource in the middle
+of a step/run.
 
 handle: The handle to a TensorArray (output of TensorArray or TensorArrayGrad).
 )doc");
@@ -1844,311 +1909,6 @@ size: The number of incomplete elements (i.e. those with some of their value
 
 // --------------------------------------------------------------------------
 
-REGISTER_OP("LookupTableFind")
-    .Input("table_handle: Ref(string)")
-    .Input("keys: Tin")
-    .Input("default_value: Tout")
-    .Output("values: Tout")
-    .Attr("Tin: type")
-    .Attr("Tout: type")
-    .SetShapeFn([](InferenceContext* c) {
-      ShapeHandle handle;
-      TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 1, &handle));
-      DimensionHandle unused_dim;
-      TF_RETURN_IF_ERROR(c->WithValue(c->Dim(handle, 0), 2, &unused_dim));
-
-      // Default value must be scalar or vector.
-      ShapeHandle unused;
-      TF_RETURN_IF_ERROR(c->WithRankAtMost(c->input(2), 1, &unused));
-      c->set_output(0, c->UnknownShape());
-      return Status::OK();
-    })
-    .Doc(R"doc(
-Looks up keys in a table, outputs the corresponding values.
-
-The tensor `keys` must of the same type as the keys of the table.
-The output `values` is of the type of the table values.
-
-The scalar `default_value` is the value output for keys not present in the
-table. It must also be of the same type as the table values.
-
-table_handle: Handle to the table.
-keys:  Any shape.  Keys to look up.
-values: Same shape as `keys`.  Values found in the table, or `default_values`
-   for missing keys.
-)doc");
-
-REGISTER_OP("LookupTableInsert")
-    .Input("table_handle: Ref(string)")
-    .Input("keys: Tin")
-    .Input("values: Tout")
-    .Attr("Tin: type")
-    .Attr("Tout: type")
-    .SetShapeFn([](InferenceContext* c) {
-      ShapeHandle handle;
-      TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 1, &handle));
-      DimensionHandle unused_dim;
-      TF_RETURN_IF_ERROR(c->WithValue(c->Dim(handle, 0), 2, &unused_dim));
-
-      // TODO: Validate keys and values shape.
-      return Status::OK();
-    })
-    .Doc(R"doc(
-Updates the table to associates keys with values.
-
-The tensor `keys` must be of the same type as the keys of the table.
-The tensor `values` must be of the type of the table values.
-
-table_handle: Handle to the table.
-keys:  Any shape.  Keys to look up.
-values: Values to associate with keys.
-)doc");
-
-REGISTER_OP("LookupTableSize")
-    .Input("table_handle: Ref(string)")
-    .Output("size: int64")
-    .SetShapeFn(TwoElementVectorInputsAndScalarOutputs)
-    .Doc(R"doc(
-Computes the number of elements in the given table.
-
-table_handle: Handle to the table.
-size: Scalar that contains number of elements in the table.
-)doc");
-
-REGISTER_OP("LookupTableExport")
-    .Input("table_handle: Ref(string)")
-    .Output("keys: Tkeys")
-    .Output("values: Tvalues")
-    .Attr("Tkeys: type")
-    .Attr("Tvalues: type")
-    .SetShapeFn([](InferenceContext* c) {
-      ShapeHandle handle;
-      TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 1, &handle));
-      DimensionHandle unused_dim;
-      TF_RETURN_IF_ERROR(c->WithValue(c->Dim(handle, 0), 2, &unused_dim));
-
-      ShapeHandle values = c->UnknownShape();
-      TF_RETURN_IF_ERROR(c->WithRankAtLeast(values, 1, &values));
-      ShapeHandle keys = c->Vector(c->Dim(values, 0));
-      c->set_output(0, keys);
-      c->set_output(1, values);
-      return Status::OK();
-    })
-    .Doc(R"doc(
-Outputs all keys and values in the table.
-
-table_handle: Handle to the table.
-keys: Vector of all keys present in the table.
-values: Tensor of all values in the table. Indexed in parallel with `keys`.
-)doc");
-
-REGISTER_OP("LookupTableImport")
-    .Input("table_handle: Ref(string)")
-    .Input("keys: Tin")
-    .Input("values: Tout")
-    .Attr("Tin: type")
-    .Attr("Tout: type")
-    .SetShapeFn([](InferenceContext* c) {
-      ShapeHandle handle;
-      TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 1, &handle));
-      DimensionHandle unused_dim;
-      TF_RETURN_IF_ERROR(c->WithValue(c->Dim(handle, 0), 2, &unused_dim));
-
-      // TODO: Validate keys and values shape.
-      return Status::OK();
-    })
-    .Doc(R"doc(
-Replaces the contents of the table with the specified keys and values.
-
-The tensor `keys` must be of the same type as the keys of the table.
-The tensor `values` must be of the type of the table values.
-
-table_handle: Handle to the table.
-keys:  Any shape.  Keys to look up.
-values: Values to associate with keys.
-)doc");
-
-REGISTER_OP("HashTable")
-    .Output("table_handle: Ref(string)")
-    .Attr("container: string = ''")
-    .Attr("shared_name: string = ''")
-    .Attr("use_node_name_sharing: bool = false")
-    .Attr("key_dtype: type")
-    .Attr("value_dtype: type")
-    .SetIsStateful()
-    .SetShapeFn(TwoElementOutput)
-    .Doc(R"doc(
-Creates a non-initialized hash table.
-
-This op creates a hash table, specifying the type of its keys and values.
-Before using the table you will have to initialize it.  After initialization the
-table will be immutable.
-
-table_handle: Handle to a table.
-container: If non-empty, this table is placed in the given container.
-  Otherwise, a default container is used.
-shared_name: If non-empty, this table is shared under the given name across
-  multiple sessions.
-use_node_name_sharing: If true and shared_name is empty, the table is shared
-  using the node name.
-key_dtype: Type of the table keys.
-value_dtype: Type of the table values.
-)doc");
-
-REGISTER_OP("MutableHashTable")
-    .Output("table_handle: Ref(string)")
-    .Attr("container: string = ''")
-    .Attr("shared_name: string = ''")
-    .Attr("use_node_name_sharing: bool = false")
-    .Attr("key_dtype: type")
-    .Attr("value_dtype: type")
-    .SetIsStateful()
-    .SetShapeFn(TwoElementOutput)
-    .Doc(R"doc(
-Creates an empty hash table.
-
-This op creates a mutable hash table, specifying the type of its keys and
-values. Each value must be a scalar. Data can be inserted into the table using
-the insert operations. It does not support the initialization operation.
-
-table_handle: Handle to a table.
-container: If non-empty, this table is placed in the given container.
-  Otherwise, a default container is used.
-shared_name: If non-empty, this table is shared under the given name across
-  multiple sessions.
-use_node_name_sharing: If true and shared_name is empty, the table is shared
-  using the node name.
-key_dtype: Type of the table keys.
-value_dtype: Type of the table values.
-)doc");
-
-REGISTER_OP("MutableHashTableOfTensors")
-    .Output("table_handle: Ref(string)")
-    .Attr("container: string = ''")
-    .Attr("shared_name: string = ''")
-    .Attr("use_node_name_sharing: bool = false")
-    .Attr("key_dtype: type")
-    .Attr("value_dtype: type")
-    .Attr("value_shape: shape = {}")
-    .SetIsStateful()
-    .SetShapeFn(TwoElementOutput)
-    .Doc(R"doc(
-Creates an empty hash table.
-
-This op creates a mutable hash table, specifying the type of its keys and
-values. Each value must be a vector. Data can be inserted into the table using
-the insert operations. It does not support the initialization operation.
-
-table_handle: Handle to a table.
-container: If non-empty, this table is placed in the given container.
-  Otherwise, a default container is used.
-shared_name: If non-empty, this table is shared under the given name across
-  multiple sessions.
-key_dtype: Type of the table keys.
-value_dtype: Type of the table values.
-)doc");
-
-REGISTER_OP("MutableDenseHashTable")
-    .Input("empty_key: key_dtype")
-    .Output("table_handle: Ref(string)")
-    .Attr("container: string = ''")
-    .Attr("shared_name: string = ''")
-    .Attr("use_node_name_sharing: bool = false")
-    .Attr("key_dtype: type")
-    .Attr("value_dtype: type")
-    .Attr("value_shape: shape = {}")
-    .Attr("initial_num_buckets: int = 131072")  // 2^17
-    .Attr("max_load_factor: float = 0.8")
-    .SetIsStateful()
-    .SetShapeFn(TwoElementOutput)
-    .Doc(R"doc(
-Creates an empty hash table that uses tensors as the backing store. It uses
-"open addressing" with quadratic reprobing to resolve collisions.
-
-This op creates a mutable hash table, specifying the type of its keys and
-values. Each value must be a scalar. Data can be inserted into the table using
-the insert operations. It does not support the initialization operation.
-
-empty_key: The key used to represent empty key buckets internally. Must not
-  be used in insert or lookup operations.
-table_handle: Handle to a table.
-container: If non-empty, this table is placed in the given container.
-  Otherwise, a default container is used.
-shared_name: If non-empty, this table is shared under the given name across
-  multiple sessions.
-key_dtype: Type of the table keys.
-value_dtype: Type of the table values.
-value_shape: The shape of each value.
-initial_num_buckets: The initial number of hash table buckets. Must be a power
-  to 2.
-max_load_factor: The maximum ratio between number of entries and number of
-  buckets before growing the table. Must be between 0 and 1.
-)doc");
-
-REGISTER_OP("InitializeTable")
-    .Input("table_handle: Ref(string)")
-    .Input("keys: Tkey")
-    .Input("values: Tval")
-    .Attr("Tkey: type")
-    .Attr("Tval: type")
-    .SetShapeFn([](InferenceContext* c) {
-      ShapeHandle handle;
-      TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 1, &handle));
-      DimensionHandle unused_dim;
-      TF_RETURN_IF_ERROR(c->WithValue(c->Dim(handle, 0), 2, &unused_dim));
-
-      ShapeHandle keys;
-      TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 1, &keys));
-      TF_RETURN_IF_ERROR(c->Merge(keys, c->input(2), &keys));
-      return Status::OK();
-    })
-    .Doc(R"doc(
-Table initializer that takes two tensors for keys and values respectively.
-
-table_handle: Handle to a table which will be initialized.
-keys: Keys of type Tkey.
-values: Values of type Tval.
-)doc");
-
-REGISTER_OP("InitializeTableFromTextFile")
-    .Input("table_handle: Ref(string)")
-    .Input("filename: string")
-    .Attr("key_index: int >= -2")
-    .Attr("value_index: int >= -2")
-    .Attr("vocab_size: int >= -1 = -1")
-    .Attr("delimiter: string = '\t'")
-    .SetShapeFn([](InferenceContext* c) {
-      ShapeHandle handle;
-      TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 1, &handle));
-      DimensionHandle unused_dim;
-      TF_RETURN_IF_ERROR(c->WithValue(c->Dim(handle, 0), 2, &unused_dim));
-
-      TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 0, &handle));
-      return Status::OK();
-    })
-    .Doc(R"doc(
-Initializes a table from a text file.
-
-It inserts one key-value pair into the table for each line of the file.
-The key and value is extracted from the whole line content, elements from the
-split line based on `delimiter` or the line number (starting from zero).
-Where to extract the key and value from a line is specified by `key_index` and
-`value_index`.
-
-- A value of -1 means use the line number(starting from zero), expects `int64`.
-- A value of -2 means use the whole line content, expects `string`.
-- A value >= 0 means use the index (starting at zero) of the split line based
-  on `delimiter`.
-
-table_handle: Handle to a table which will be initialized.
-filename: Filename of a vocabulary text file.
-key_index: Column index in a line to get the table `key` values from.
-value_index: Column index that represents information of a line to get the table
-  `value` values from.
-vocab_size: Number of elements of the file, use -1 if unknown.
-delimiter: Delimiter to separate fields in a line.
-)doc");
-
 REGISTER_OP("GetSessionHandle")
     .Input("value: T")
     .Output("handle: string")
@@ -2213,15 +1973,16 @@ REGISTER_OP("Stage")
     .SetShapeFn(shape_inference::UnknownShape)
     .SetIsStateful()
     .Doc(R"doc(
-Stage values similar to a lightweight Enqueue.  The basic functionality of this
-Op is similar to a queue with many fewer capabilities and options.  This Op is
-optimized for performance.
+Stage values similar to a lightweight Enqueue.
+
+The basic functionality of this Op is similar to a queue with many
+fewer capabilities and options.  This Op is optimized for performance.
 
 values: a list of tensors
 container: If non-empty, this queue is placed in the given container. Otherwise,
   a default container is used.
 shared_name: It is necessary to match this name to the matching Unstage Op.
-    )doc");
+)doc");
 
 REGISTER_OP("Unstage")
     .Output("values: dtypes")
@@ -2231,10 +1992,11 @@ REGISTER_OP("Unstage")
     .SetShapeFn(shape_inference::UnknownShape)
     .SetIsStateful()
     .Doc(R"doc(
-Op is similar to a lightweight Dequeue.  The basic funtionality is similar to
-dequeue with many fewer capabilities and options.  This Op is optimized for
-performance.
-    )doc");
+Op is similar to a lightweight Dequeue.
+
+The basic funtionality is similar to dequeue with many fewer
+capabilities and options.  This Op is optimized for performance.
+)doc");
 
 REGISTER_OP("RecordInput")
     .Output("records: string")
diff --git a/tensorflow/core/ops/image_ops.cc b/tensorflow/core/ops/image_ops.cc
index 41a3aa0c38bd624e95b04eb4557d5d9767971ea4..bbfdb3475871c3fc690ccd97886cdad7ef44d108 100644
--- a/tensorflow/core/ops/image_ops.cc
+++ b/tensorflow/core/ops/image_ops.cc
@@ -349,6 +349,9 @@ The attr `ratio` allows downscaling the image by an integer factor during
 decoding.  Allowed values are: 1, 2, 4, and 8.  This is much faster than
 downscaling the image later.
 
+This op also supports decoding PNGs and non-animated GIFs since the interface is
+the same, though it is cleaner to use `tf.image.decode_image`.
+
 contents: 0-D.  The JPEG-encoded image.
 channels: Number of color channels for the decoded image.
 ratio: Downscaling ratio.
@@ -525,6 +528,9 @@ Accepted values are:
 If needed, the PNG-encoded image is transformed to match the requested number
 of color channels.
 
+This op also supports decoding JPEGs and non-animated GIFs since the interface
+is the same, though it is cleaner to use `tf.image.decode_image`.
+
 contents: 0-D.  The PNG-encoded image.
 channels: Number of color channels for the decoded image.
 image: 3-D with shape `[height, width, channels]`.
@@ -576,7 +582,10 @@ Decode the first frame of a GIF-encoded image to a uint8 tensor.
 GIF with frame or transparency compression are not supported
 convert animated GIF from compressed to uncompressed by:
 
-convert $src.gif -coalesce $dst.gif
+    convert $src.gif -coalesce $dst.gif
+
+This op also supports decoding JPEGs and PNGs, though it is cleaner to use
+`tf.image.decode_image`.
 
 contents: 0-D.  The GIF-encoded image.
 image: 4-D with shape `[num_frames, height, width, 3]`. RGB order
@@ -963,11 +972,50 @@ method: A string specifying the interpolation method. Only 'bilinear' is
 // --------------------------------------------------------------------------
 
 REGISTER_OP("NonMaxSuppression")
+  .Input("boxes: float")
+  .Input("scores: float")
+  .Input("max_output_size: int32")
+  .Output("selected_indices: int32")
+  .Attr("iou_threshold: float = 0.5")
+  .SetShapeFn([](InferenceContext* c) {
+      c->set_output(0, c->Vector(c->UnknownDim()));
+      return Status::OK();
+    })
+  .Doc(R"doc(
+Greedily selects a subset of bounding boxes in descending order of score,
+pruning away boxes that have high intersection-over-union (IOU) overlap
+with previously selected boxes.  Bounding boxes are supplied as
+[y1, x1, y2, x2], where (y1, x1) and (y2, x2) are the coordinates of any
+diagonal pair of box corners and the coordinates can be provided as normalized
+(i.e., lying in the interval [0, 1]) or absolute.  Note that this algorithm
+is agnostic to where the origin is in the coordinate system.  Note that this
+algorithm is invariant to orthogonal transformations and translations
+of the coordinate system; thus translating or reflections of the coordinate
+system result in the same boxes being selected by the algorithm.
+The output of this operation is a set of integers indexing into the input
+collection of bounding boxes representing the selected boxes.  The bounding
+box coordinates corresponding to the selected indices can then be obtained
+using the `tf.gather operation`.  For example:
+  selected_indices = tf.image.non_max_suppression(
+      boxes, scores, max_output_size, iou_threshold)
+  selected_boxes = tf.gather(boxes, selected_indices)
+boxes: A 2-D float tensor of shape `[num_boxes, 4]`.
+scores: A 1-D float tensor of shape `[num_boxes]` representing a single
+  score corresponding to each box (each row of boxes).
+max_output_size: A scalar integer tensor representing the maximum number of
+  boxes to be selected by non max suppression.
+iou_threshold: A float representing the threshold for deciding whether boxes
+  overlap too much with respect to IOU.
+selected_indices: A 1-D integer tensor of shape `[M]` representing the selected
+  indices from the boxes tensor, where `M <= max_output_size`.
+)doc");
+
+REGISTER_OP("NonMaxSuppressionV2")
     .Input("boxes: float")
     .Input("scores: float")
     .Input("max_output_size: int32")
+    .Input("iou_threshold: float")
     .Output("selected_indices: int32")
-    .Attr("iou_threshold: float = 0.5")
     .SetShapeFn([](InferenceContext* c) {
       c->set_output(0, c->Vector(c->UnknownDim()));
       return Status::OK();
@@ -989,7 +1037,7 @@ collection of bounding boxes representing the selected boxes.  The bounding
 box coordinates corresponding to the selected indices can then be obtained
 using the `tf.gather operation`.  For example:
 
-  selected_indices = tf.image.non_max_suppression(
+  selected_indices = tf.image.non_max_suppression_v2(
       boxes, scores, max_output_size, iou_threshold)
   selected_boxes = tf.gather(boxes, selected_indices)
 
@@ -998,8 +1046,8 @@ scores: A 1-D float tensor of shape `[num_boxes]` representing a single
   score corresponding to each box (each row of boxes).
 max_output_size: A scalar integer tensor representing the maximum number of
   boxes to be selected by non max suppression.
-iou_threshold: A float representing the threshold for deciding whether boxes
-  overlap too much with respect to IOU.
+iou_threshold: A 0-D float tensor representing the threshold for deciding whether
+  boxes overlap too much with respect to IOU.
 selected_indices: A 1-D integer tensor of shape `[M]` representing the selected
   indices from the boxes tensor, where `M <= max_output_size`.
 )doc");
diff --git a/tensorflow/core/ops/lookup_ops.cc b/tensorflow/core/ops/lookup_ops.cc
new file mode 100644
index 0000000000000000000000000000000000000000..dac02dad8bb861fee0e16e0acb0c8e17688e05fb
--- /dev/null
+++ b/tensorflow/core/ops/lookup_ops.cc
@@ -0,0 +1,670 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/framework/common_shape_fns.h"
+#include "tensorflow/core/framework/op.h"
+#include "tensorflow/core/framework/op_def_builder.h"
+#include "tensorflow/core/framework/shape_inference.h"
+
+namespace tensorflow {
+
+using shape_inference::DimensionHandle;
+using shape_inference::InferenceContext;
+using shape_inference::ShapeHandle;
+
+// --------------------------------------------------------------------------
+
+namespace {
+Status TwoElementVectorInputsAndScalarOutputs(InferenceContext* c) {
+  ShapeHandle handle;
+  DimensionHandle unused_handle;
+  for (int i = 0; i < c->num_inputs(); ++i) {
+    TF_RETURN_IF_ERROR(c->WithRank(c->input(i), 1, &handle));
+    TF_RETURN_IF_ERROR(c->WithValue(c->Dim(handle, 0), 2, &unused_handle));
+  }
+  for (int i = 0; i < c->num_outputs(); ++i) {
+    c->set_output(i, c->Scalar());
+  }
+  return Status::OK();
+}
+
+Status ScalarAndTwoElementVectorInputsAndScalarOutputs(InferenceContext* c) {
+  ShapeHandle handle;
+  DimensionHandle unused_handle;
+  TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 0, &handle));
+  for (int i = 1; i < c->num_inputs(); ++i) {
+    TF_RETURN_IF_ERROR(c->WithRank(c->input(i), 1, &handle));
+    TF_RETURN_IF_ERROR(c->WithValue(c->Dim(handle, 0), 2, &unused_handle));
+  }
+  for (int i = 0; i < c->num_outputs(); ++i) {
+    c->set_output(i, c->Scalar());
+  }
+  return Status::OK();
+}
+
+Status TwoElementOutput(InferenceContext* c) {
+  c->set_output(0, c->Vector(2));
+  return Status::OK();
+}
+
+Status ScalarOutput(InferenceContext* c) {
+  c->set_output(0, c->Scalar());
+  return Status::OK();
+}
+}  // namespace
+
+REGISTER_OP("LookupTableFind")
+    .Input("table_handle: Ref(string)")
+    .Input("keys: Tin")
+    .Input("default_value: Tout")
+    .Output("values: Tout")
+    .Attr("Tin: type")
+    .Attr("Tout: type")
+    .SetShapeFn([](InferenceContext* c) {
+      ShapeHandle handle;
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 1, &handle));
+      DimensionHandle unused_dim;
+      TF_RETURN_IF_ERROR(c->WithValue(c->Dim(handle, 0), 2, &unused_dim));
+
+      // Default value must be scalar or vector.
+      ShapeHandle unused;
+      TF_RETURN_IF_ERROR(c->WithRankAtMost(c->input(2), 1, &unused));
+      c->set_output(0, c->UnknownShape());
+      return Status::OK();
+    })
+    .Doc(R"doc(
+Looks up keys in a table, outputs the corresponding values.
+
+The tensor `keys` must of the same type as the keys of the table.
+The output `values` is of the type of the table values.
+
+The scalar `default_value` is the value output for keys not present in the
+table. It must also be of the same type as the table values.
+
+table_handle: Handle to the table.
+keys:  Any shape.  Keys to look up.
+values: Same shape as `keys`.  Values found in the table, or `default_values`
+   for missing keys.
+)doc");
+
+REGISTER_OP("LookupTableFindV2")
+    .Input("table_handle: resource")
+    .Input("keys: Tin")
+    .Input("default_value: Tout")
+    .Output("values: Tout")
+    .Attr("Tin: type")
+    .Attr("Tout: type")
+    .SetShapeFn([](InferenceContext* c) {
+      ShapeHandle handle;
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 0, &handle));
+
+      // Default value must be scalar or vector.
+      ShapeHandle unused;
+      TF_RETURN_IF_ERROR(c->WithRankAtMost(c->input(2), 1, &unused));
+      c->set_output(0, c->UnknownShape());
+      return Status::OK();
+    })
+    .Doc(R"doc(
+Looks up keys in a table, outputs the corresponding values.
+
+The tensor `keys` must of the same type as the keys of the table.
+The output `values` is of the type of the table values.
+
+The scalar `default_value` is the value output for keys not present in the
+table. It must also be of the same type as the table values.
+
+table_handle: Handle to the table.
+keys:  Any shape.  Keys to look up.
+values: Same shape as `keys`.  Values found in the table, or `default_values`
+   for missing keys.
+)doc");
+
+REGISTER_OP("LookupTableInsert")
+    .Input("table_handle: Ref(string)")
+    .Input("keys: Tin")
+    .Input("values: Tout")
+    .Attr("Tin: type")
+    .Attr("Tout: type")
+    .SetShapeFn([](InferenceContext* c) {
+      ShapeHandle handle;
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 1, &handle));
+      DimensionHandle unused_dim;
+      TF_RETURN_IF_ERROR(c->WithValue(c->Dim(handle, 0), 2, &unused_dim));
+
+      // TODO(ebrevdo): Validate keys and values shape.
+      return Status::OK();
+    })
+    .Doc(R"doc(
+Updates the table to associates keys with values.
+
+The tensor `keys` must be of the same type as the keys of the table.
+The tensor `values` must be of the type of the table values.
+
+table_handle: Handle to the table.
+keys:  Any shape.  Keys to look up.
+values: Values to associate with keys.
+)doc");
+
+REGISTER_OP("LookupTableInsertV2")
+    .Input("table_handle: resource")
+    .Input("keys: Tin")
+    .Input("values: Tout")
+    .Attr("Tin: type")
+    .Attr("Tout: type")
+    .SetShapeFn([](InferenceContext* c) {
+      ShapeHandle handle;
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 0, &handle));
+
+      // TODO: Validate keys and values shape.
+      return Status::OK();
+    })
+    .Doc(R"doc(
+Updates the table to associates keys with values.
+
+The tensor `keys` must be of the same type as the keys of the table.
+The tensor `values` must be of the type of the table values.
+
+table_handle: Handle to the table.
+keys:  Any shape.  Keys to look up.
+values: Values to associate with keys.
+)doc");
+
+REGISTER_OP("LookupTableSize")
+    .Input("table_handle: Ref(string)")
+    .Output("size: int64")
+    .SetShapeFn(TwoElementVectorInputsAndScalarOutputs)
+    .Doc(R"doc(
+Computes the number of elements in the given table.
+
+table_handle: Handle to the table.
+size: Scalar that contains number of elements in the table.
+)doc");
+
+REGISTER_OP("LookupTableSizeV2")
+    .Input("table_handle: resource")
+    .Output("size: int64")
+    .SetShapeFn(ScalarAndTwoElementVectorInputsAndScalarOutputs)
+    .Doc(R"doc(
+Computes the number of elements in the given table.
+
+table_handle: Handle to the table.
+size: Scalar that contains number of elements in the table.
+)doc");
+
+REGISTER_OP("LookupTableExport")
+    .Input("table_handle: Ref(string)")
+    .Output("keys: Tkeys")
+    .Output("values: Tvalues")
+    .Attr("Tkeys: type")
+    .Attr("Tvalues: type")
+    .SetShapeFn([](InferenceContext* c) {
+      ShapeHandle handle;
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 1, &handle));
+      DimensionHandle unused_dim;
+      TF_RETURN_IF_ERROR(c->WithValue(c->Dim(handle, 0), 2, &unused_dim));
+
+      ShapeHandle values = c->UnknownShape();
+      TF_RETURN_IF_ERROR(c->WithRankAtLeast(values, 1, &values));
+      ShapeHandle keys = c->Vector(c->Dim(values, 0));
+      c->set_output(0, keys);
+      c->set_output(1, values);
+      return Status::OK();
+    })
+    .Doc(R"doc(
+Outputs all keys and values in the table.
+
+table_handle: Handle to the table.
+keys: Vector of all keys present in the table.
+values: Tensor of all values in the table. Indexed in parallel with `keys`.
+)doc");
+
+REGISTER_OP("LookupTableExportV2")
+    .Input("table_handle: resource")
+    .Output("keys: Tkeys")
+    .Output("values: Tvalues")
+    .Attr("Tkeys: type")
+    .Attr("Tvalues: type")
+    .SetShapeFn([](InferenceContext* c) {
+      ShapeHandle handle;
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 0, &handle));
+
+      ShapeHandle values = c->UnknownShape();
+      TF_RETURN_IF_ERROR(c->WithRankAtLeast(values, 1, &values));
+      ShapeHandle keys = c->Vector(c->Dim(values, 0));
+      c->set_output(0, keys);
+      c->set_output(1, values);
+      return Status::OK();
+    })
+    .Doc(R"doc(
+Outputs all keys and values in the table.
+
+table_handle: Handle to the table.
+keys: Vector of all keys present in the table.
+values: Tensor of all values in the table. Indexed in parallel with `keys`.
+)doc");
+
+REGISTER_OP("LookupTableImport")
+    .Input("table_handle: Ref(string)")
+    .Input("keys: Tin")
+    .Input("values: Tout")
+    .Attr("Tin: type")
+    .Attr("Tout: type")
+    .SetShapeFn([](InferenceContext* c) {
+      ShapeHandle handle;
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 1, &handle));
+      DimensionHandle unused_dim;
+      TF_RETURN_IF_ERROR(c->WithValue(c->Dim(handle, 0), 2, &unused_dim));
+
+      // TODO(ebrevdo): Validate keys and values shape.
+      return Status::OK();
+    })
+    .Doc(R"doc(
+Replaces the contents of the table with the specified keys and values.
+
+The tensor `keys` must be of the same type as the keys of the table.
+The tensor `values` must be of the type of the table values.
+
+table_handle: Handle to the table.
+keys:  Any shape.  Keys to look up.
+values: Values to associate with keys.
+)doc");
+
+REGISTER_OP("LookupTableImportV2")
+    .Input("table_handle: resource")
+    .Input("keys: Tin")
+    .Input("values: Tout")
+    .Attr("Tin: type")
+    .Attr("Tout: type")
+    .SetShapeFn([](InferenceContext* c) {
+      ShapeHandle handle;
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 0, &handle));
+
+      // TODO: Validate keys and values shape.
+      return Status::OK();
+    })
+    .Doc(R"doc(
+Replaces the contents of the table with the specified keys and values.
+
+The tensor `keys` must be of the same type as the keys of the table.
+The tensor `values` must be of the type of the table values.
+
+table_handle: Handle to the table.
+keys:  Any shape.  Keys to look up.
+values: Values to associate with keys.
+)doc");
+
+REGISTER_OP("HashTable")
+    .Output("table_handle: Ref(string)")
+    .Attr("container: string = ''")
+    .Attr("shared_name: string = ''")
+    .Attr("use_node_name_sharing: bool = false")
+    .Attr("key_dtype: type")
+    .Attr("value_dtype: type")
+    .SetIsStateful()
+    .SetShapeFn(TwoElementOutput)
+    .Doc(R"doc(
+Creates a non-initialized hash table.
+
+This op creates a hash table, specifying the type of its keys and values.
+Before using the table you will have to initialize it.  After initialization the
+table will be immutable.
+
+table_handle: Handle to a table.
+container: If non-empty, this table is placed in the given container.
+  Otherwise, a default container is used.
+shared_name: If non-empty, this table is shared under the given name across
+  multiple sessions.
+use_node_name_sharing: If true and shared_name is empty, the table is shared
+  using the node name.
+key_dtype: Type of the table keys.
+value_dtype: Type of the table values.
+)doc");
+
+REGISTER_OP("HashTableV2")
+    .Output("table_handle: resource")
+    .Attr("container: string = ''")
+    .Attr("shared_name: string = ''")
+    .Attr("use_node_name_sharing: bool = false")
+    .Attr("key_dtype: type")
+    .Attr("value_dtype: type")
+    .SetIsStateful()
+    .SetShapeFn(ScalarOutput)
+    .Doc(R"doc(
+Creates a non-initialized hash table.
+
+This op creates a hash table, specifying the type of its keys and values.
+Before using the table you will have to initialize it.  After initialization the
+table will be immutable.
+
+table_handle: Handle to a table.
+container: If non-empty, this table is placed in the given container.
+  Otherwise, a default container is used.
+shared_name: If non-empty, this table is shared under the given name across
+  multiple sessions.
+use_node_name_sharing: If true and shared_name is empty, the table is shared
+  using the node name.
+key_dtype: Type of the table keys.
+value_dtype: Type of the table values.
+)doc");
+
+REGISTER_OP("MutableHashTable")
+    .Output("table_handle: Ref(string)")
+    .Attr("container: string = ''")
+    .Attr("shared_name: string = ''")
+    .Attr("use_node_name_sharing: bool = false")
+    .Attr("key_dtype: type")
+    .Attr("value_dtype: type")
+    .SetIsStateful()
+    .SetShapeFn(TwoElementOutput)
+    .Doc(R"doc(
+Creates an empty hash table.
+
+This op creates a mutable hash table, specifying the type of its keys and
+values. Each value must be a scalar. Data can be inserted into the table using
+the insert operations. It does not support the initialization operation.
+
+table_handle: Handle to a table.
+container: If non-empty, this table is placed in the given container.
+  Otherwise, a default container is used.
+shared_name: If non-empty, this table is shared under the given name across
+  multiple sessions.
+use_node_name_sharing: If true and shared_name is empty, the table is shared
+  using the node name.
+key_dtype: Type of the table keys.
+value_dtype: Type of the table values.
+)doc");
+
+REGISTER_OP("MutableHashTableV2")
+    .Output("table_handle: resource")
+    .Attr("container: string = ''")
+    .Attr("shared_name: string = ''")
+    .Attr("use_node_name_sharing: bool = false")
+    .Attr("key_dtype: type")
+    .Attr("value_dtype: type")
+    .SetIsStateful()
+    .SetShapeFn(ScalarOutput)
+    .Doc(R"doc(
+Creates an empty hash table.
+
+This op creates a mutable hash table, specifying the type of its keys and
+values. Each value must be a scalar. Data can be inserted into the table using
+the insert operations. It does not support the initialization operation.
+
+table_handle: Handle to a table.
+container: If non-empty, this table is placed in the given container.
+  Otherwise, a default container is used.
+shared_name: If non-empty, this table is shared under the given name across
+  multiple sessions.
+use_node_name_sharing: If true and shared_name is empty, the table is shared
+  using the node name.
+key_dtype: Type of the table keys.
+value_dtype: Type of the table values.
+)doc");
+
+REGISTER_OP("MutableHashTableOfTensors")
+    .Output("table_handle: Ref(string)")
+    .Attr("container: string = ''")
+    .Attr("shared_name: string = ''")
+    .Attr("use_node_name_sharing: bool = false")
+    .Attr("key_dtype: type")
+    .Attr("value_dtype: type")
+    .Attr("value_shape: shape = {}")
+    .SetIsStateful()
+    .SetShapeFn(TwoElementOutput)
+    .Doc(R"doc(
+Creates an empty hash table.
+
+This op creates a mutable hash table, specifying the type of its keys and
+values. Each value must be a vector. Data can be inserted into the table using
+the insert operations. It does not support the initialization operation.
+
+table_handle: Handle to a table.
+container: If non-empty, this table is placed in the given container.
+  Otherwise, a default container is used.
+shared_name: If non-empty, this table is shared under the given name across
+  multiple sessions.
+key_dtype: Type of the table keys.
+value_dtype: Type of the table values.
+)doc");
+
+REGISTER_OP("MutableHashTableOfTensorsV2")
+    .Output("table_handle: resource")
+    .Attr("container: string = ''")
+    .Attr("shared_name: string = ''")
+    .Attr("use_node_name_sharing: bool = false")
+    .Attr("key_dtype: type")
+    .Attr("value_dtype: type")
+    .Attr("value_shape: shape = {}")
+    .SetIsStateful()
+    .SetShapeFn(ScalarOutput)
+    .Doc(R"doc(
+Creates an empty hash table.
+
+This op creates a mutable hash table, specifying the type of its keys and
+values. Each value must be a vector. Data can be inserted into the table using
+the insert operations. It does not support the initialization operation.
+
+table_handle: Handle to a table.
+container: If non-empty, this table is placed in the given container.
+  Otherwise, a default container is used.
+shared_name: If non-empty, this table is shared under the given name across
+  multiple sessions.
+key_dtype: Type of the table keys.
+value_dtype: Type of the table values.
+)doc");
+
+REGISTER_OP("MutableDenseHashTable")
+    .Input("empty_key: key_dtype")
+    .Output("table_handle: Ref(string)")
+    .Attr("container: string = ''")
+    .Attr("shared_name: string = ''")
+    .Attr("use_node_name_sharing: bool = false")
+    .Attr("key_dtype: type")
+    .Attr("value_dtype: type")
+    .Attr("value_shape: shape = {}")
+    .Attr("initial_num_buckets: int = 131072")  // 2^17
+    .Attr("max_load_factor: float = 0.8")
+    .SetIsStateful()
+    .SetShapeFn(TwoElementOutput)
+    .Doc(R"doc(
+Creates an empty hash table that uses tensors as the backing store.
+
+It uses "open addressing" with quadratic reprobing to resolve
+collisions.
+
+This op creates a mutable hash table, specifying the type of its keys and
+values. Each value must be a scalar. Data can be inserted into the table using
+the insert operations. It does not support the initialization operation.
+
+empty_key: The key used to represent empty key buckets internally. Must not
+  be used in insert or lookup operations.
+table_handle: Handle to a table.
+container: If non-empty, this table is placed in the given container.
+  Otherwise, a default container is used.
+shared_name: If non-empty, this table is shared under the given name across
+  multiple sessions.
+key_dtype: Type of the table keys.
+value_dtype: Type of the table values.
+value_shape: The shape of each value.
+initial_num_buckets: The initial number of hash table buckets. Must be a power
+  to 2.
+max_load_factor: The maximum ratio between number of entries and number of
+  buckets before growing the table. Must be between 0 and 1.
+)doc");
+
+REGISTER_OP("MutableDenseHashTableV2")
+    .Input("empty_key: key_dtype")
+    .Output("table_handle: resource")
+    .Attr("container: string = ''")
+    .Attr("shared_name: string = ''")
+    .Attr("use_node_name_sharing: bool = false")
+    .Attr("key_dtype: type")
+    .Attr("value_dtype: type")
+    .Attr("value_shape: shape = {}")
+    .Attr("initial_num_buckets: int = 131072")  // 2^17
+    .Attr("max_load_factor: float = 0.8")
+    .SetIsStateful()
+    .SetShapeFn(ScalarOutput)
+    .Doc(R"doc(
+Creates an empty hash table that uses tensors as the backing store.
+
+It uses "open addressing" with quadratic reprobing to resolve
+collisions.
+
+This op creates a mutable hash table, specifying the type of its keys and
+values. Each value must be a scalar. Data can be inserted into the table using
+the insert operations. It does not support the initialization operation.
+
+empty_key: The key used to represent empty key buckets internally. Must not
+  be used in insert or lookup operations.
+table_handle: Handle to a table.
+container: If non-empty, this table is placed in the given container.
+  Otherwise, a default container is used.
+shared_name: If non-empty, this table is shared under the given name across
+  multiple sessions.
+key_dtype: Type of the table keys.
+value_dtype: Type of the table values.
+value_shape: The shape of each value.
+initial_num_buckets: The initial number of hash table buckets. Must be a power
+  to 2.
+max_load_factor: The maximum ratio between number of entries and number of
+  buckets before growing the table. Must be between 0 and 1.
+)doc");
+
+REGISTER_OP("InitializeTable")
+    .Input("table_handle: Ref(string)")
+    .Input("keys: Tkey")
+    .Input("values: Tval")
+    .Attr("Tkey: type")
+    .Attr("Tval: type")
+    .SetShapeFn([](InferenceContext* c) {
+      ShapeHandle handle;
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 1, &handle));
+      DimensionHandle unused_dim;
+      TF_RETURN_IF_ERROR(c->WithValue(c->Dim(handle, 0), 2, &unused_dim));
+
+      ShapeHandle keys;
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 1, &keys));
+      TF_RETURN_IF_ERROR(c->Merge(keys, c->input(2), &keys));
+      return Status::OK();
+    })
+    .Doc(R"doc(
+Table initializer that takes two tensors for keys and values respectively.
+
+table_handle: Handle to a table which will be initialized.
+keys: Keys of type Tkey.
+values: Values of type Tval.
+)doc");
+
+REGISTER_OP("InitializeTableV2")
+    .Input("table_handle: resource")
+    .Input("keys: Tkey")
+    .Input("values: Tval")
+    .Attr("Tkey: type")
+    .Attr("Tval: type")
+    .SetShapeFn([](InferenceContext* c) {
+      ShapeHandle handle;
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 0, &handle));
+
+      ShapeHandle keys;
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 1, &keys));
+      TF_RETURN_IF_ERROR(c->Merge(keys, c->input(2), &keys));
+      return Status::OK();
+    })
+    .Doc(R"doc(
+Table initializer that takes two tensors for keys and values respectively.
+
+table_handle: Handle to a table which will be initialized.
+keys: Keys of type Tkey.
+values: Values of type Tval.
+)doc");
+
+REGISTER_OP("InitializeTableFromTextFile")
+    .Input("table_handle: Ref(string)")
+    .Input("filename: string")
+    .Attr("key_index: int >= -2")
+    .Attr("value_index: int >= -2")
+    .Attr("vocab_size: int >= -1 = -1")
+    .Attr("delimiter: string = '\t'")
+    .SetShapeFn([](InferenceContext* c) {
+      ShapeHandle handle;
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 1, &handle));
+      DimensionHandle unused_dim;
+      TF_RETURN_IF_ERROR(c->WithValue(c->Dim(handle, 0), 2, &unused_dim));
+
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 0, &handle));
+      return Status::OK();
+    })
+    .Doc(R"doc(
+Initializes a table from a text file.
+
+It inserts one key-value pair into the table for each line of the file.
+The key and value is extracted from the whole line content, elements from the
+split line based on `delimiter` or the line number (starting from zero).
+Where to extract the key and value from a line is specified by `key_index` and
+`value_index`.
+
+- A value of -1 means use the line number(starting from zero), expects `int64`.
+- A value of -2 means use the whole line content, expects `string`.
+- A value >= 0 means use the index (starting at zero) of the split line based
+  on `delimiter`.
+
+table_handle: Handle to a table which will be initialized.
+filename: Filename of a vocabulary text file.
+key_index: Column index in a line to get the table `key` values from.
+value_index: Column index that represents information of a line to get the table
+  `value` values from.
+vocab_size: Number of elements of the file, use -1 if unknown.
+delimiter: Delimiter to separate fields in a line.
+)doc");
+
+REGISTER_OP("InitializeTableFromTextFileV2")
+    .Input("table_handle: resource")
+    .Input("filename: string")
+    .Attr("key_index: int >= -2")
+    .Attr("value_index: int >= -2")
+    .Attr("vocab_size: int >= -1 = -1")
+    .Attr("delimiter: string = '\t'")
+    .SetShapeFn([](InferenceContext* c) {
+      ShapeHandle handle;
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 0, &handle));
+
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 0, &handle));
+      return Status::OK();
+    })
+    .Doc(R"doc(
+Initializes a table from a text file.
+
+It inserts one key-value pair into the table for each line of the file.
+The key and value is extracted from the whole line content, elements from the
+split line based on `delimiter` or the line number (starting from zero).
+Where to extract the key and value from a line is specified by `key_index` and
+`value_index`.
+
+- A value of -1 means use the line number(starting from zero), expects `int64`.
+- A value of -2 means use the whole line content, expects `string`.
+- A value >= 0 means use the index (starting at zero) of the split line based
+  on `delimiter`.
+
+table_handle: Handle to a table which will be initialized.
+filename: Filename of a vocabulary text file.
+key_index: Column index in a line to get the table `key` values from.
+value_index: Column index that represents information of a line to get the table
+  `value` values from.
+vocab_size: Number of elements of the file, use -1 if unknown.
+delimiter: Delimiter to separate fields in a line.
+)doc");
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/ops/math_ops.cc b/tensorflow/core/ops/math_ops.cc
index 8e762274c942505822a9a2bb97a712324e93aee6..28c4ec643e588acb7068a9184237c24e0f0fd81e 100644
--- a/tensorflow/core/ops/math_ops.cc
+++ b/tensorflow/core/ops/math_ops.cc
@@ -595,7 +595,9 @@ REGISTER_OP("Mod")
     .Attr("T: {int32, int64, float, double}")
     .SetShapeFn(shape_inference::BroadcastBinaryOpShapeFn)
     .Doc(R"doc(
-Returns element-wise remainder of division.
+Returns element-wise remainder of division. This emulates C semantics in that
+the result here is consistent with a truncating divide. E.g. `truncate(x / y) *
+y + truncate_mod(x, y) = x`.
 
 *NOTE*: `Mod` supports broadcasting. More about broadcasting
 [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
@@ -623,12 +625,11 @@ REGISTER_OP("TruncateMod")
     .Attr("T: {int32, int64, float, double}")
     .SetShapeFn(shape_inference::BroadcastBinaryOpShapeFn)
     .Doc(R"doc(
-Returns element-wise remainder of division. This emulates C semantics where
+Returns element-wise remainder of division. This emulates C semantics in that
+the result here is consistent with a truncating divide. E.g. `truncate(x / y) *
+y + truncate_mod(x, y) = x`.
 
-true, this follows C semantics in that the result here is consistent
-with a flooring divide. E.g. `floor(x / y) * y + mod(x, y) = x`.
-
-*NOTE*: `Mod` supports broadcasting. More about broadcasting
+*NOTE*: `TruncateMod` supports broadcasting. More about broadcasting
 [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
 )doc");
 
@@ -662,13 +663,12 @@ Compute the upper regularized incomplete Gamma function `Q(a, x)`.
 
 The upper regularized incomplete Gamma function is defined as:
 
-```
-Q(a, x) = Gamma(a, x) / Gamma(a) = 1 - P(a, x)
-```
+\\(Q(a, x) = Gamma(a, x) / Gamma(a) = 1 - P(a, x)\\)
+
 where
-```
-Gamma(a, x) = int_{x}^{\infty} t^{a-1} exp(-t) dt
-```
+
+\\(Gamma(a, x) = int_{x}^{\infty} t^{a-1} exp(-t) dt\\)
+
 is the upper incomplete Gama function.
 
 Note, above `P(a, x)` (`Igamma`) is the lower regularized complete
@@ -686,13 +686,13 @@ Compute the lower regularized incomplete Gamma function `Q(a, x)`.
 
 The lower regularized incomplete Gamma function is defined as:
 
-```
-P(a, x) = gamma(a, x) / Gamma(a) = 1 - Q(a, x)
-```
+
+\\(P(a, x) = gamma(a, x) / Gamma(a) = 1 - Q(a, x)\\)
+
 where
-```
-gamma(a, x) = int_{0}^{x} t^{a-1} exp(-t) dt
-```
+
+\\(gamma(a, x) = int_{0}^{x} t^{a-1} exp(-t) dt\\)
+
 is the lower incomplete Gamma function.
 
 Note, above `Q(a, x)` (`Igammac`) is the upper regularized complete
@@ -710,9 +710,9 @@ Compute the Hurwitz zeta function \\(\zeta(x, q)\\).
 
 The Hurwitz zeta function is defined as:
 
-```
-\zeta(x, q) = \sum_{n=0}^{\infty} (q + n)^{-x}
-```
+
+\\(\zeta(x, q) = \sum_{n=0}^{\infty} (q + n)^{-x}\\)
+
 )doc");
 
 REGISTER_OP("Polygamma")
@@ -726,9 +726,9 @@ Compute the polygamma function \\(\psi^{(n)}(x)\\).
 
 The polygamma function is defined as:
 
-```
-\psi^{(n)}(x) = \frac{d^n}{dx^n} \psi(x)
-```
+
+\\(\psi^{(n)}(x) = \frac{d^n}{dx^n} \psi(x)\\)
+
 where \\(\psi(x)\\) is the digamma function.
 )doc");
 
@@ -790,14 +790,14 @@ Compute the regularized incomplete beta integral \\(I_x(a, b)\\).
 
 The regularized incomplete beta integral is defined as:
 
-```
-I_x(a, b) = \frac{B(x; a, b)}{B(a, b)}
-```
+
+\\(I_x(a, b) = \frac{B(x; a, b)}{B(a, b)}\\)
+
 where
 
-```
-B(x; a, b) = \int_0^x t^{a-1} (1 - t)^{b-1} dt
-```
+
+\\(B(x; a, b) = \int_0^x t^{a-1} (1 - t)^{b-1} dt\\)
+
 
 is the incomplete beta function and \\(B(a, b)\\) is the *complete*
 beta function.
@@ -1271,6 +1271,8 @@ REGISTER_OP("ArgMax")
     .Doc(R"doc(
 Returns the index with the largest value across dimensions of a tensor.
 
+Note that in case of ties the identity of the return value is not guaranteed.
+
 dimension: int32, 0 <= dimension < rank(input).  Describes which dimension
   of the input Tensor to reduce across. For vectors, use dimension = 0.
 )doc");
@@ -1285,6 +1287,8 @@ REGISTER_OP("ArgMin")
     .Doc(R"doc(
 Returns the index with the smallest value across dimensions of a tensor.
 
+Note that in case of ties the identity of the return value is not guaranteed.
+
 dimension: int32, 0 <= dimension < rank(input).  Describes which dimension
   of the input Tensor to reduce across. For vectors, use dimension = 0.
 )doc");
@@ -2371,4 +2375,35 @@ output_max: the computed max output.
 
 )doc");
 
+// --------------------------------------------------------------------------
+
+REGISTER_OP("Bucketize")
+    .Input("input: T")
+    .Output("output: int32")
+    .Attr("T: {int32, int64, float, double}")
+    .Attr("boundaries: list(float)")
+    .SetShapeFn(shape_inference::UnchangedShape)
+    .Doc(R"doc(
+Bucketizes 'input' based on 'boundaries'.
+
+For example, if the inputs are
+    boundaries = [0, 10, 100]
+    input = [[-5, 10000]
+             [150,   10]
+             [5,    100]]
+
+then the output will be
+    output = [[0, 3]
+              [3, 2]
+              [1, 3]]
+
+input: Any shape of Tensor contains with int or float type.
+boundaries: A sorted list of floats gives the boundary of the buckets.
+output: Same shape with 'input', each value of input replaced with bucket index.
+
+@compatibility(numpy)
+Equivalent to np.digitize.
+@end_compatibility
+)doc");
+
 }  // namespace tensorflow
diff --git a/tensorflow/core/ops/nn_ops.cc b/tensorflow/core/ops/nn_ops.cc
index 932113bf2c4521e17d6de4ce9c1e9587cf3ba748..3e58669e30e3e43838a0455c30ed73721735db80 100644
--- a/tensorflow/core/ops/nn_ops.cc
+++ b/tensorflow/core/ops/nn_ops.cc
@@ -89,7 +89,7 @@ REGISTER_OP("AvgPool")
     .Attr("strides: list(int) >= 4")
     .Attr(GetPaddingAttrString())
     .Attr(GetConvnetDataFormatAttrString())
-    .Attr("T: realnumbertype")
+    .Attr("T: {half, float, double}")
     .SetShapeFn(shape_inference::AvgPoolShape)
     .Doc(R"doc(
 Performs average pooling on the input.
@@ -117,7 +117,7 @@ REGISTER_OP("AvgPoolGrad")
     .Attr("strides: list(int) >= 4")
     .Attr(GetPaddingAttrString())
     .Attr(GetConvnetDataFormatAttrString())
-    .Attr("T: realnumbertype")
+    .Attr("T: {half, float, double}")
     .SetShapeFn([](InferenceContext* c) {
       // NOTE(mrry): We could in principle work out the shape from the
       // gradients and the attrs, but if we do not know orig_input_shape
@@ -272,7 +272,7 @@ REGISTER_OP("FusedBatchNorm")
     .Output("batch_variance: T")
     .Output("reserve_space_1: T")
     .Output("reserve_space_2: T")
-    .Attr("T: numbertype")
+    .Attr("T: {float}")
     .Attr("epsilon: float = 0.0001")
     .Attr("data_format: string = 'NHWC'")
     .Attr("is_training: bool = true")
@@ -348,7 +348,7 @@ REGISTER_OP("FusedBatchNormGrad")
     .Output("offset_backprop: T")
     .Output("reserve_space_3: T")
     .Output("reserve_space_4: T")
-    .Attr("T: numbertype")
+    .Attr("T: {float}")
     .Attr("epsilon: float = 0.0001")
     .Attr("data_format: string = 'NHWC'")
     .Attr("is_training: bool = true")
@@ -504,7 +504,7 @@ REGISTER_OP("Conv2D")
     .Input("input: T")
     .Input("filter: T")
     .Output("output: T")
-    .Attr("T: {half, float, double}")
+    .Attr("T: {half, float}")
     .Attr("strides: list(int)")
     .Attr("use_cudnn_on_gpu: bool = true")
     .Attr(GetPaddingAttrString())
@@ -557,7 +557,7 @@ REGISTER_OP("Conv2DBackpropInput")
     .Input("filter: T")
     .Input("out_backprop: T")
     .Output("output: T")
-    .Attr("T: {half, float, double}")
+    .Attr("T: {half, float}")
     .Attr("strides: list(int)")
     .Attr("use_cudnn_on_gpu: bool = true")
     .Attr(GetPaddingAttrString())
@@ -599,7 +599,7 @@ REGISTER_OP("Conv2DBackpropFilter")
     .Input("filter_sizes: int32")
     .Input("out_backprop: T")
     .Output("output: T")
-    .Attr("T: {half, float, double}")
+    .Attr("T: {half, float}")
     .Attr("strides: list(int)")
     .Attr("use_cudnn_on_gpu: bool = true")
     .Attr(GetPaddingAttrString())
@@ -735,7 +735,7 @@ REGISTER_OP("FusedResizeAndPadConv2D")
     .Input("paddings: int32")
     .Input("filter: T")
     .Output("output: T")
-    .Attr("T: {half, float, double}")
+    .Attr("T: {float}")
     .Attr("resize_align_corners: bool = false")
     .Attr(GetMirrorPadModeAttrString())
     .Attr("strides: list(int)")
@@ -777,7 +777,7 @@ REGISTER_OP("FusedPadConv2D")
     .Input("paddings: int32")
     .Input("filter: T")
     .Output("output: T")
-    .Attr("T: {half, float, double}")
+    .Attr("T: {float}")
     .Attr(GetMirrorPadModeAttrString())
     .Attr("strides: list(int)")
     .Attr(GetPaddingAttrString())
@@ -939,7 +939,7 @@ REGISTER_OP("Conv3D")
     .Input("input: T")
     .Input("filter: T")
     .Output("output: T")
-    .Attr("T: numbertype")
+    .Attr("T: {float, double}")
     .Attr("strides: list(int) >= 5")
     .Attr(GetPaddingAttrString())
     .Attr(GetConvnet3dDataFormatAttrString())
@@ -971,7 +971,7 @@ REGISTER_OP("Conv3DBackpropInput")
     .Input("filter: T")
     .Input("out_backprop: T")
     .Output("output: T")
-    .Attr("T: numbertype")
+    .Attr("T: {float, double}")
     .Attr("strides: list(int) >= 5")
     .Attr(GetPaddingAttrString())
     .Deprecated(10, "Use Conv3DBackpropInputV2")
@@ -997,7 +997,7 @@ REGISTER_OP("Conv3DBackpropFilter")
     .Input("filter: T")
     .Input("out_backprop: T")
     .Output("output: T")
-    .Attr("T: numbertype")
+    .Attr("T: {float, double}")
     .Attr("strides: list(int) >= 5")
     .Attr(GetPaddingAttrString())
     .Deprecated(10, "Use Conv3DBackpropFilterV2")
@@ -1026,7 +1026,7 @@ REGISTER_OP("Conv3DBackpropInputV2")
     .Input("filter: T")
     .Input("out_backprop: T")
     .Output("output: T")
-    .Attr("T: numbertype")
+    .Attr("T: {float, double}")
     .Attr("strides: list(int) >= 5")
     .Attr(GetPaddingAttrString())
     .Attr(GetConvnet3dDataFormatAttrString())
@@ -1063,7 +1063,7 @@ REGISTER_OP("Conv3DBackpropFilterV2")
     .Input("filter_sizes: int32")
     .Input("out_backprop: T")
     .Output("output: T")
-    .Attr("T: numbertype")
+    .Attr("T: {float, double}")
     .Attr("strides: list(int) >= 5")
     .Attr(GetPaddingAttrString())
     .Attr(GetConvnet3dDataFormatAttrString())
@@ -1104,7 +1104,7 @@ REGISTER_OP("AvgPool3D")
     .Attr("strides: list(int) >= 5")
     .Attr(GetPaddingAttrString())
     .Attr(GetConvnet3dDataFormatAttrString())
-    .Attr("T: numbertype")
+    .Attr("T: {float, double}")
     .SetShapeFn(shape_inference::Pool3DShape)
     .Doc(R"doc(
 Performs 3D average pooling on the input.
@@ -1131,7 +1131,7 @@ REGISTER_OP("AvgPool3DGrad")
     .Attr("strides: list(int) >= 5")
     .Attr(GetPaddingAttrString())
     .Attr(GetConvnet3dDataFormatAttrString())
-    .Attr("T: numbertype")
+    .Attr("T: {float, double}")
     .SetShapeFn([](InferenceContext* c) {
       ShapeHandle s;
       TF_RETURN_IF_ERROR(c->MakeShapeFromShapeTensor(0, &s));
@@ -1166,7 +1166,7 @@ REGISTER_OP("MaxPool3D")
     .Attr("strides: list(int) >= 5")
     .Attr(GetPaddingAttrString())
     .Attr(GetConvnet3dDataFormatAttrString())
-    .Attr("T: numbertype")
+    .Attr("T: {float}")
     .SetShapeFn(shape_inference::Pool3DShape)
     .Doc(R"doc(
 Performs 3D max pooling on the input.
@@ -1190,12 +1190,12 @@ REGISTER_OP("MaxPool3DGrad")
     .Input("orig_output: TInput")
     .Input("grad: T")
     .Output("output: T")
-    .Attr("ksize: list(int) >= 5 ")
+    .Attr("ksize: list(int) >= 5")
     .Attr("strides: list(int) >= 5")
     .Attr(GetPaddingAttrString())
     .Attr(GetConvnet3dDataFormatAttrString())
-    .Attr("T: numbertype = DT_FLOAT")
-    .Attr("TInput: numbertype = DT_FLOAT")
+    .Attr("T: {float} = DT_FLOAT")
+    .Attr("TInput: {float} = DT_FLOAT")
     .SetShapeFn([](InferenceContext* c) {
       return UnchangedShapeWithRank(c, 5);
     })
@@ -1226,7 +1226,7 @@ REGISTER_OP("MaxPool3DGradGrad")
     .Attr("strides: list(int) >= 5")
     .Attr(GetPaddingAttrString())
     .Attr(GetConvnet3dDataFormatAttrString())
-    .Attr("T: realnumbertype")
+    .Attr("T: {float}")
     .SetShapeFn([](InferenceContext* c) {
       TF_RETURN_IF_ERROR(shape_inference::Pool3DShape(c));
       ShapeHandle unused;
@@ -1260,7 +1260,7 @@ data_format: The data format of the input and output data. With the
 REGISTER_OP("L2Loss")
     .Input("t: T")
     .Output("output: T")
-    .Attr("T: numbertype")
+    .Attr("T: {half, float, double}")
     .SetShapeFn(shape_inference::ScalarShape)
     .Doc(R"doc(
 L2 Loss.
@@ -1748,7 +1748,7 @@ backprops: The gradients:
 REGISTER_OP("Elu")
     .Input("features: T")
     .Output("activations: T")
-    .Attr("T: realnumbertype")
+    .Attr("T: {half, float, double}")
     .SetShapeFn(shape_inference::UnchangedShape)
     .Doc(R"doc(
 Computes exponential linear: `exp(features) - 1` if < 0, `features` otherwise.
@@ -1761,7 +1761,7 @@ REGISTER_OP("EluGrad")
     .Input("gradients: T")
     .Input("outputs: T")
     .Output("backprops: T")
-    .Attr("T: realnumbertype")
+    .Attr("T: {half, float, double}")
     .SetShapeFn(shape_inference::MergeBothInputsShapeFn)
     .Doc(R"doc(
 Computes gradients for the exponential linear (Elu) operation.
diff --git a/tensorflow/core/ops/ops.pbtxt b/tensorflow/core/ops/ops.pbtxt
index bd90336e6db9507d02d757c786f665386ba51471..cd43881a463d97799367ac4719c402a38e65ed1d 100644
--- a/tensorflow/core/ops/ops.pbtxt
+++ b/tensorflow/core/ops/ops.pbtxt
@@ -15,8 +15,8 @@ op {
       b: false
     }
   }
-  summary: "Raise a exception to abort the process when called. If exit_without_error is true, the process will exit normally, otherwise it will exit with a SIGABORT signal."
-  description: "Returns nothing but an exception."
+  summary: "Raise a exception to abort the process when called."
+  description: "If exit_without_error is true, the process will exit normally,\notherwise it will exit with a SIGABORT signal.\n\nReturns nothing but an exception."
 }
 op {
   name: "Abs"
@@ -85,8 +85,8 @@ op {
       }
     }
   }
-  summary: "Applies a gradient to a given accumulator. Does not add if local_step is lesser"
-  description: "than the accumulator\'s global_step."
+  summary: "Applies a gradient to a given accumulator."
+  description: "Does not add if local_step is lesser than the accumulator\'s global_step."
 }
 op {
   name: "AccumulatorNumAccumulated"
@@ -116,8 +116,8 @@ op {
     description: "The new global_step value to set."
     type: DT_INT64
   }
-  summary: "Updates the accumulator with a new value for global_step. Logs warning if the"
-  description: "accumulator\'s value is already higher than new_global_step."
+  summary: "Updates the accumulator with a new value for global_step."
+  description: "Logs warning if the accumulator\'s value is already higher than\nnew_global_step."
 }
 op {
   name: "AccumulatorTakeGradient"
@@ -160,8 +160,8 @@ op {
       }
     }
   }
-  summary: "Extracts the average gradient in the given ConditionalAccumulator, provided"
-  description: "that sufficient (i.e., more than num_required) gradients have been accumulated.\nThe op blocks until sufficient gradients have been accumulated.\nIf the accumulator has already aggregated more than num_required gradients, it\nreturns the average of the accumulated gradients.\nAlso automatically increments the recorded global_step in the accumulator by 1,\nand resets the aggregate to 0."
+  summary: "Extracts the average gradient in the given ConditionalAccumulator."
+  description: "The op blocks until sufficient (i.e., more than num_required)\ngradients have been accumulated.  If the accumulator has already\naggregated more than num_required gradients, it returns the average of\nthe accumulated gradients.  Also automatically increments the recorded\nglobal_step in the accumulator by 1, and resets the aggregate to 0."
 }
 op {
   name: "Acos"
@@ -538,7 +538,7 @@ op {
   attr {
     name: "num_sampled"
     type: "int"
-    description: "Number of candidates to produce per batch."
+    description: "Number of candidates to produce."
     has_minimum: true
     minimum: 1
   }
@@ -565,6 +565,7 @@ op {
   }
   summary: "Generates labels for candidate sampling with a learned unigram distribution."
   description: "See explanations of candidate sampling and the data formats at\ngo/candidate-sampling.\n\nFor each batch, this op picks a single set of sampled candidate labels.\n\nThe advantages of sampling candidates per-batch are simplicity and the\npossibility of efficient dense matrix multiplication. The disadvantage is that\nthe sampled candidates must be chosen independently of the context and of the\ntrue labels."
+  is_stateful: true
 }
 op {
   name: "Any"
@@ -1551,6 +1552,7 @@ op {
     }
   }
   summary: "Returns the index with the largest value across dimensions of a tensor."
+  description: "Note that in case of ties the identity of the return value is not guaranteed."
 }
 op {
   name: "ArgMin"
@@ -1603,6 +1605,7 @@ op {
     }
   }
   summary: "Returns the index with the smallest value across dimensions of a tensor."
+  description: "Note that in case of ties the identity of the return value is not guaranteed."
 }
 op {
   name: "AsString"
@@ -1903,6 +1906,33 @@ op {
   }
   summary: "Computes atan of x element-wise."
 }
+op {
+  name: "Atan2"
+  input_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "z"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+  summary: "Computes arctangent of `y/x` element-wise, respecting signs of the arguments."
+  description: "This is the angle \\( \\theta \\in [-\\pi, \\pi] \\) such that\n\\[ x = r \\cos(\\theta) \\]\nand\n\\[ y = r \\sin(\\theta) \\]\nwhere \\(r = \\sqrt(x^2 + y^2) \\)."
+}
 op {
   name: "AudioSpectrogram"
   input_arg {
@@ -2066,15 +2096,9 @@ op {
     type: "type"
     allowed_values {
       list {
+        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT32
-        type: DT_INT64
-        type: DT_UINT8
-        type: DT_INT16
-        type: DT_INT8
-        type: DT_UINT16
-        type: DT_HALF
       }
     }
   }
@@ -2139,18 +2163,6 @@ op {
       list {
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT64
-        type: DT_INT32
-        type: DT_UINT8
-        type: DT_UINT16
-        type: DT_INT16
-        type: DT_INT8
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
-        type: DT_QINT8
-        type: DT_QUINT8
-        type: DT_QINT32
-        type: DT_HALF
       }
     }
   }
@@ -2219,18 +2231,6 @@ op {
       list {
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT64
-        type: DT_INT32
-        type: DT_UINT8
-        type: DT_UINT16
-        type: DT_INT16
-        type: DT_INT8
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
-        type: DT_QINT8
-        type: DT_QUINT8
-        type: DT_QINT32
-        type: DT_HALF
       }
     }
   }
@@ -2297,15 +2297,9 @@ op {
     type: "type"
     allowed_values {
       list {
+        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT32
-        type: DT_INT64
-        type: DT_UINT8
-        type: DT_INT16
-        type: DT_INT8
-        type: DT_UINT16
-        type: DT_HALF
       }
     }
   }
@@ -3247,7 +3241,7 @@ op {
   }
   output_arg {
     name: "output"
-    description: "4-D with shape `[batch, height, width, depth]`, where:\n\n      height = height_pad - crop_top - crop_bottom\n      width = width_pad - crop_left - crop_right\n\nThe attr `block_size` must be greater than one. It indicates the block size.\n\nSome examples:\n\n(1) For the following input of shape `[4, 1, 1, 1]` and block_size of 2:\n\n```prettyprint\n[[[[1]]], [[[2]]], [[[3]]], [[[4]]]]\n```\n\nThe output tensor has shape `[1, 2, 2, 1]` and value:\n\n```prettyprint\nx = [[[[1], [2]], [[3], [4]]]]\n```\n\n(2) For the following input of shape `[4, 1, 1, 3]` and block_size of 2:\n\n```prettyprint\n[[[1, 2, 3]], [[4, 5, 6]], [[7, 8, 9]], [[10, 11, 12]]]\n```\n\nThe output tensor has shape `[1, 2, 2, 3]` and value:\n\n```prettyprint\nx = [[[[1, 2, 3], [4, 5, 6]],\n      [[7, 8, 9], [10, 11, 12]]]]\n```\n\n(3) For the following input of shape `[4, 2, 2, 1]` and block_size of 2:\n\n```prettyprint\nx = [[[[1], [3]], [[9], [11]]],\n     [[[2], [4]], [[10], [12]]],\n     [[[5], [7]], [[13], [15]]],\n     [[[6], [8]], [[14], [16]]]]\n```\n\nThe output tensor has shape `[1, 4, 4, 1]` and value:\n\n```prettyprint\nx = [[[1],   [2],  [3],  [4]],\n     [[5],   [6],  [7],  [8]],\n     [[9],  [10], [11],  [12]],\n     [[13], [14], [15],  [16]]]\n```\n\n(4) For the following input of shape `[8, 1, 2, 1]` and block_size of 2:\n\n```prettyprint\nx = [[[[1], [3]]], [[[9], [11]]], [[[2], [4]]], [[[10], [12]]],\n     [[[5], [7]]], [[[13], [15]]], [[[6], [8]]], [[[14], [16]]]]\n```\n\nThe output tensor has shape `[2, 2, 4, 1]` and value:\n\n```prettyprint\nx = [[[[1], [3]], [[5], [7]]],\n     [[[2], [4]], [[10], [12]]],\n     [[[5], [7]], [[13], [15]]],\n     [[[6], [8]], [[14], [16]]]]\n```"
+    description: "4-D with shape `[batch, height, width, depth]`, where:\n\n      height = height_pad - crop_top - crop_bottom\n      width = width_pad - crop_left - crop_right\n\nThe attr `block_size` must be greater than one. It indicates the block size.\n\nSome examples:\n\n(1) For the following input of shape `[4, 1, 1, 1]` and block_size of 2:\n\n```\n[[[[1]]], [[[2]]], [[[3]]], [[[4]]]]\n```\n\nThe output tensor has shape `[1, 2, 2, 1]` and value:\n\n```\nx = [[[[1], [2]], [[3], [4]]]]\n```\n\n(2) For the following input of shape `[4, 1, 1, 3]` and block_size of 2:\n\n```\n[[[1, 2, 3]], [[4, 5, 6]], [[7, 8, 9]], [[10, 11, 12]]]\n```\n\nThe output tensor has shape `[1, 2, 2, 3]` and value:\n\n```\nx = [[[[1, 2, 3], [4, 5, 6]],\n      [[7, 8, 9], [10, 11, 12]]]]\n```\n\n(3) For the following input of shape `[4, 2, 2, 1]` and block_size of 2:\n\n```\nx = [[[[1], [3]], [[9], [11]]],\n     [[[2], [4]], [[10], [12]]],\n     [[[5], [7]], [[13], [15]]],\n     [[[6], [8]], [[14], [16]]]]\n```\n\nThe output tensor has shape `[1, 4, 4, 1]` and value:\n\n```\nx = [[[1],   [2],  [3],  [4]],\n     [[5],   [6],  [7],  [8]],\n     [[9],  [10], [11],  [12]],\n     [[13], [14], [15],  [16]]]\n```\n\n(4) For the following input of shape `[8, 1, 2, 1]` and block_size of 2:\n\n```\nx = [[[[1], [3]]], [[[9], [11]]], [[[2], [4]]], [[[10], [12]]],\n     [[[5], [7]]], [[[13], [15]]], [[[6], [8]]], [[[14], [16]]]]\n```\n\nThe output tensor has shape `[2, 2, 4, 1]` and value:\n\n```\nx = [[[[1], [3]], [[5], [7]]],\n     [[[2], [4]], [[10], [12]]],\n     [[[5], [7]], [[13], [15]]],\n     [[[6], [8]], [[14], [16]]]]\n```"
     type_attr: "T"
   }
   attr {
@@ -3290,7 +3284,7 @@ op {
   }
   input_arg {
     name: "crops"
-    description: "2-D with shape `[M, 2]`, all values must be >= 0.\n  `crops[i] = [crop_start, crop_end]` specifies the amount to crop from input\n  dimension `i + 1`, which corresponds to spatial dimension `i`.  It is\n  required that\n  `crop_start[i] + crop_end[i] <= block_shape[i] * input_shape[i + 1]`.\n\nThis operation is equivalent to the following steps:\n\n1. Reshape `input` to `reshaped` of shape:\n     [block_shape[0], ..., block_shape[M-1],\n      batch / prod(block_shape),\n      input_shape[1], ..., input_shape[N-1]]\n\n2. Permute dimensions of `reshaped` to produce `permuted` of shape\n     [batch / prod(block_shape),\n\n      input_shape[1], block_shape[0],\n      ...,\n      input_shape[M], block_shape[M-1],\n\n      input_shape[M+1], ..., input_shape[N-1]]\n\n3. Reshape `permuted` to produce `reshaped_permuted` of shape\n     [batch / prod(block_shape),\n\n      input_shape[1] * block_shape[0],\n      ...,\n      input_shape[M] * block_shape[M-1],\n\n      input_shape[M+1],\n      ...,\n      input_shape[N-1]]\n\n4. Crop the start and end of dimensions `[1, ..., M]` of\n   `reshaped_permuted` according to `crops` to produce the output of shape:\n     [batch / prod(block_shape),\n\n      input_shape[1] * block_shape[0] - crops[0,0] - crops[0,1],\n      ...,\n      input_shape[M] * block_shape[M-1] - crops[M-1,0] - crops[M-1,1],\n\n      input_shape[M+1], ..., input_shape[N-1]]\n\nSome examples:\n\n(1) For the following input of shape `[4, 1, 1, 1]`, `block_shape = [2, 2]`, and\n    `crops = [[0, 0], [0, 0]]`:\n\n```prettyprint\n[[[[1]]], [[[2]]], [[[3]]], [[[4]]]]\n```\n\nThe output tensor has shape `[1, 2, 2, 1]` and value:\n\n```prettyprint\nx = [[[[1], [2]], [[3], [4]]]]\n```\n\n(2) For the following input of shape `[4, 1, 1, 3]`, `block_shape = [2, 2]`, and\n    `crops = [[0, 0], [0, 0]]`:\n\n```prettyprint\n[[[1, 2, 3]], [[4, 5, 6]], [[7, 8, 9]], [[10, 11, 12]]]\n```\n\nThe output tensor has shape `[1, 2, 2, 3]` and value:\n\n```prettyprint\nx = [[[[1, 2, 3], [4, 5, 6]],\n      [[7, 8, 9], [10, 11, 12]]]]\n```\n\n(3) For the following input of shape `[4, 2, 2, 1]`, `block_shape = [2, 2]`, and\n    `crops = [[0, 0], [0, 0]]`:\n\n```prettyprint\nx = [[[[1], [3]], [[9], [11]]],\n     [[[2], [4]], [[10], [12]]],\n     [[[5], [7]], [[13], [15]]],\n     [[[6], [8]], [[14], [16]]]]\n```\n\nThe output tensor has shape `[1, 4, 4, 1]` and value:\n\n```prettyprint\nx = [[[1],   [2],  [3],  [4]],\n     [[5],   [6],  [7],  [8]],\n     [[9],  [10], [11],  [12]],\n     [[13], [14], [15],  [16]]]\n```\n\n(4) For the following input of shape `[8, 1, 3, 1]`, `block_shape = [2, 2]`, and\n    `crops = [[0, 0], [2, 0]]`:\n\n```prettyprint\nx = [[[[0], [1], [3]]], [[[0], [9], [11]]],\n     [[[0], [2], [4]]], [[[0], [10], [12]]],\n     [[[0], [5], [7]]], [[[0], [13], [15]]],\n     [[[0], [6], [8]]], [[[0], [14], [16]]]]\n```\n\nThe output tensor has shape `[2, 2, 4, 1]` and value:\n\n```prettyprint\nx = [[[[1],   [2],  [3],  [4]],\n      [[5],   [6],  [7],  [8]]],\n     [[[9],  [10], [11],  [12]],\n      [[13], [14], [15],  [16]]]]\n```"
+    description: "2-D with shape `[M, 2]`, all values must be >= 0.\n  `crops[i] = [crop_start, crop_end]` specifies the amount to crop from input\n  dimension `i + 1`, which corresponds to spatial dimension `i`.  It is\n  required that\n  `crop_start[i] + crop_end[i] <= block_shape[i] * input_shape[i + 1]`.\n\nThis operation is equivalent to the following steps:\n\n1. Reshape `input` to `reshaped` of shape:\n     [block_shape[0], ..., block_shape[M-1],\n      batch / prod(block_shape),\n      input_shape[1], ..., input_shape[N-1]]\n\n2. Permute dimensions of `reshaped` to produce `permuted` of shape\n     [batch / prod(block_shape),\n\n      input_shape[1], block_shape[0],\n      ...,\n      input_shape[M], block_shape[M-1],\n\n      input_shape[M+1], ..., input_shape[N-1]]\n\n3. Reshape `permuted` to produce `reshaped_permuted` of shape\n     [batch / prod(block_shape),\n\n      input_shape[1] * block_shape[0],\n      ...,\n      input_shape[M] * block_shape[M-1],\n\n      input_shape[M+1],\n      ...,\n      input_shape[N-1]]\n\n4. Crop the start and end of dimensions `[1, ..., M]` of\n   `reshaped_permuted` according to `crops` to produce the output of shape:\n     [batch / prod(block_shape),\n\n      input_shape[1] * block_shape[0] - crops[0,0] - crops[0,1],\n      ...,\n      input_shape[M] * block_shape[M-1] - crops[M-1,0] - crops[M-1,1],\n\n      input_shape[M+1], ..., input_shape[N-1]]\n\nSome examples:\n\n(1) For the following input of shape `[4, 1, 1, 1]`, `block_shape = [2, 2]`, and\n    `crops = [[0, 0], [0, 0]]`:\n\n```\n[[[[1]]], [[[2]]], [[[3]]], [[[4]]]]\n```\n\nThe output tensor has shape `[1, 2, 2, 1]` and value:\n\n```\nx = [[[[1], [2]], [[3], [4]]]]\n```\n\n(2) For the following input of shape `[4, 1, 1, 3]`, `block_shape = [2, 2]`, and\n    `crops = [[0, 0], [0, 0]]`:\n\n```\n[[[1, 2, 3]], [[4, 5, 6]], [[7, 8, 9]], [[10, 11, 12]]]\n```\n\nThe output tensor has shape `[1, 2, 2, 3]` and value:\n\n```\nx = [[[[1, 2, 3], [4, 5, 6]],\n      [[7, 8, 9], [10, 11, 12]]]]\n```\n\n(3) For the following input of shape `[4, 2, 2, 1]`, `block_shape = [2, 2]`, and\n    `crops = [[0, 0], [0, 0]]`:\n\n```\nx = [[[[1], [3]], [[9], [11]]],\n     [[[2], [4]], [[10], [12]]],\n     [[[5], [7]], [[13], [15]]],\n     [[[6], [8]], [[14], [16]]]]\n```\n\nThe output tensor has shape `[1, 4, 4, 1]` and value:\n\n```\nx = [[[1],   [2],  [3],  [4]],\n     [[5],   [6],  [7],  [8]],\n     [[9],  [10], [11],  [12]],\n     [[13], [14], [15],  [16]]]\n```\n\n(4) For the following input of shape `[8, 1, 3, 1]`, `block_shape = [2, 2]`, and\n    `crops = [[0, 0], [2, 0]]`:\n\n```\nx = [[[[0], [1], [3]]], [[[0], [9], [11]]],\n     [[[0], [2], [4]]], [[[0], [10], [12]]],\n     [[[0], [5], [7]]], [[[0], [13], [15]]],\n     [[[0], [6], [8]]], [[[0], [14], [16]]]]\n```\n\nThe output tensor has shape `[2, 2, 4, 1]` and value:\n\n```\nx = [[[[1],   [2],  [3],  [4]],\n      [[5],   [6],  [7],  [8]]],\n     [[[9],  [10], [11],  [12]],\n      [[13], [14], [15],  [16]]]]\n```"
     type_attr: "Tcrops"
   }
   output_arg {
@@ -3359,7 +3353,7 @@ op {
     }
   }
   summary: "Compute the regularized incomplete beta integral \\\\(I_x(a, b)\\\\)."
-  description: "The regularized incomplete beta integral is defined as:\n\n```\nI_x(a, b) = \\frac{B(x; a, b)}{B(a, b)}\n```\nwhere\n\n```\nB(x; a, b) = \\int_0^x t^{a-1} (1 - t)^{b-1} dt\n```\n\nis the incomplete beta function and \\\\(B(a, b)\\\\) is the *complete*\nbeta function."
+  description: "The regularized incomplete beta integral is defined as:\n\n\n\\\\(I_x(a, b) = \\frac{B(x; a, b)}{B(a, b)}\\\\)\n\nwhere\n\n\n\\\\(B(x; a, b) = \\int_0^x t^{a-1} (1 - t)^{b-1} dt\\\\)\n\n\nis the incomplete beta function and \\\\(B(a, b)\\\\) is the *complete*\nbeta function."
 }
 op {
   name: "BiasAdd"
@@ -3668,6 +3662,38 @@ op {
   summary: "Return the reduction indices for computing gradients of s0 op s1 with broadcast."
   description: "This is typically used by gradient computations for a broadcasting operation."
 }
+op {
+  name: "Bucketize"
+  input_arg {
+    name: "input"
+    description: "Any shape of Tensor contains with int or float type."
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    description: "Same shape with \'input\', each value of input replaced with bucket index.\n\n@compatibility(numpy)\nEquivalent to np.digitize.\n@end_compatibility"
+    type: DT_INT32
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+  attr {
+    name: "boundaries"
+    type: "list(float)"
+    description: "A sorted list of floats gives the boundary of the buckets."
+  }
+  summary: "Bucketizes \'input\' based on \'boundaries\'."
+  description: "For example, if the inputs are\n    boundaries = [0, 10, 100]\n    input = [[-5, 10000]\n             [150,   10]\n             [5,    100]]\n\nthen the output will be\n    output = [[0, 3]\n              [3, 2]\n              [1, 3]]"
+}
 op {
   name: "CTCBeamSearchDecoder"
   input_arg {
@@ -3819,6 +3845,14 @@ op {
     }
     description: "Scalar.  If set to false, *during* CTC calculation\nrepeated non-blank labels will not be merged and are interpreted as\nindividual labels.  This is a simplified version of CTC."
   }
+  attr {
+    name: "ignore_longer_outputs_than_inputs"
+    type: "bool"
+    default_value {
+      b: false
+    }
+    description: "Scalar. If set to true, during CTC\ncalculation items have longer input sequences than output sequences\nare ignored by returning zero-gradient for those items."
+  }
   summary: "Calculates the CTC Loss (log probability) for each batch entry.  Also calculates"
   description: "the gradient.  This class performs the softmax operation for you, so inputs\nshould be e.g. linear projections of outputs by an LSTM."
 }
@@ -4127,7 +4161,7 @@ op {
   }
   output_arg {
     name: "offset"
-    description: "The `N` int32 vectors representing the starting offset\n        of input tensors within the concatenated output.\n\nThis is typically used by gradient computations for a concat operation."
+    description: "The `N` int32 vectors representing the starting offset\nof input tensors within the concatenated output."
     type: DT_INT32
     number_attr: "N"
   }
@@ -4138,7 +4172,7 @@ op {
     minimum: 2
   }
   summary: "Computes offsets of concat inputs within its output."
-  description: "For example:\n\n```prettyprint\n# \'x\' is [2, 2, 7]\n# \'y\' is [2, 3, 7]\n# \'z\' is [2, 5, 7]\nconcat_offset(2, [x, y, z]) => [0, 0, 0], [0, 2, 0], [0, 5, 0]\n```"
+  description: "For example:\n\n```\n# \'x\' is [2, 2, 7]\n# \'y\' is [2, 3, 7]\n# \'z\' is [2, 5, 7]\nconcat_offset(2, [x, y, z]) => [0, 0, 0], [0, 2, 0], [0, 5, 0]\n```\n\nThis is typically used by gradient computations for a concat operation."
 }
 op {
   name: "ConcatV2"
@@ -4233,10 +4267,10 @@ op {
     default_value {
       s: ""
     }
-    description: "If non-empty, this accumulator will be shared under the given name\nacross multiple sessions."
+    description: "If non-empty, this accumulator will be shared under the\ngiven name across multiple sessions."
   }
-  summary: "A conditional accumulator for aggregating gradients. The accumulator accepts"
-  description: "gradients marked with local_step greater or equal to the most recent global_step\nknown to the accumulator. The average can be extracted from the accumulator,\nprovided sufficient gradients have been accumulated. Extracting the average\nautomatically resets the aggregate to 0, and increments the global_step recorded\nby the accumulator."
+  summary: "A conditional accumulator for aggregating gradients."
+  description: "The accumulator accepts gradients marked with local_step greater or\nequal to the most recent global_step known to the accumulator. The\naverage can be extracted from the accumulator, provided sufficient\ngradients have been accumulated. Extracting the average automatically\nresets the aggregate to 0, and increments the global_step recorded by\nthe accumulator."
   is_stateful: true
 }
 op {
@@ -4311,7 +4345,6 @@ op {
       list {
         type: DT_HALF
         type: DT_FLOAT
-        type: DT_DOUBLE
       }
     }
   }
@@ -4384,7 +4417,6 @@ op {
       list {
         type: DT_HALF
         type: DT_FLOAT
-        type: DT_DOUBLE
       }
     }
   }
@@ -4456,7 +4488,6 @@ op {
       list {
         type: DT_HALF
         type: DT_FLOAT
-        type: DT_DOUBLE
       }
     }
   }
@@ -4522,18 +4553,6 @@ op {
       list {
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT64
-        type: DT_INT32
-        type: DT_UINT8
-        type: DT_UINT16
-        type: DT_INT16
-        type: DT_INT8
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
-        type: DT_QINT8
-        type: DT_QUINT8
-        type: DT_QINT32
-        type: DT_HALF
       }
     }
   }
@@ -4600,18 +4619,6 @@ op {
       list {
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT64
-        type: DT_INT32
-        type: DT_UINT8
-        type: DT_UINT16
-        type: DT_INT16
-        type: DT_INT8
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
-        type: DT_QINT8
-        type: DT_QUINT8
-        type: DT_QINT32
-        type: DT_HALF
       }
     }
   }
@@ -4667,18 +4674,6 @@ op {
       list {
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT64
-        type: DT_INT32
-        type: DT_UINT8
-        type: DT_UINT16
-        type: DT_INT16
-        type: DT_INT8
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
-        type: DT_QINT8
-        type: DT_QUINT8
-        type: DT_QINT32
-        type: DT_HALF
       }
     }
   }
@@ -4744,18 +4739,6 @@ op {
       list {
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT64
-        type: DT_INT32
-        type: DT_UINT8
-        type: DT_UINT16
-        type: DT_INT16
-        type: DT_INT8
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
-        type: DT_QINT8
-        type: DT_QUINT8
-        type: DT_QINT32
-        type: DT_HALF
       }
     }
   }
@@ -4811,18 +4794,6 @@ op {
       list {
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT64
-        type: DT_INT32
-        type: DT_UINT8
-        type: DT_UINT16
-        type: DT_INT16
-        type: DT_INT8
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
-        type: DT_QINT8
-        type: DT_QUINT8
-        type: DT_QINT32
-        type: DT_HALF
       }
     }
   }
@@ -5565,7 +5536,7 @@ op {
     type: DT_UINT8
   }
   summary: "Decode the first frame of a GIF-encoded image to a uint8 tensor."
-  description: "GIF with frame or transparency compression are not supported\nconvert animated GIF from compressed to uncompressed by:\n\nconvert $src.gif -coalesce $dst.gif"
+  description: "GIF with frame or transparency compression are not supported\nconvert animated GIF from compressed to uncompressed by:\n\n    convert $src.gif -coalesce $dst.gif\n\nThis op also supports decoding JPEGs and PNGs, though it is cleaner to use\n`tf.image.decode_image`."
 }
 op {
   name: "DecodeJSONExample"
@@ -5643,7 +5614,7 @@ op {
     description: "string specifying a hint about the algorithm used for\ndecompression.  Defaults to \"\" which maps to a system-specific\ndefault.  Currently valid values are [\"INTEGER_FAST\",\n\"INTEGER_ACCURATE\"].  The hint may be ignored (e.g., the internal\njpeg library changes to a version that does not have that specific\noption.)"
   }
   summary: "Decode a JPEG-encoded image to a uint8 tensor."
-  description: "The attr `channels` indicates the desired number of color channels for the\ndecoded image.\n\nAccepted values are:\n\n*   0: Use the number of channels in the JPEG-encoded image.\n*   1: output a grayscale image.\n*   3: output an RGB image.\n\nIf needed, the JPEG-encoded image is transformed to match the requested number\nof color channels.\n\nThe attr `ratio` allows downscaling the image by an integer factor during\ndecoding.  Allowed values are: 1, 2, 4, and 8.  This is much faster than\ndownscaling the image later."
+  description: "The attr `channels` indicates the desired number of color channels for the\ndecoded image.\n\nAccepted values are:\n\n*   0: Use the number of channels in the JPEG-encoded image.\n*   1: output a grayscale image.\n*   3: output an RGB image.\n\nIf needed, the JPEG-encoded image is transformed to match the requested number\nof color channels.\n\nThe attr `ratio` allows downscaling the image by an integer factor during\ndecoding.  Allowed values are: 1, 2, 4, and 8.  This is much faster than\ndownscaling the image later.\n\nThis op also supports decoding PNGs and non-animated GIFs since the interface is\nthe same, though it is cleaner to use `tf.image.decode_image`."
 }
 op {
   name: "DecodePng"
@@ -5679,7 +5650,7 @@ op {
     }
   }
   summary: "Decode a PNG-encoded image to a uint8 or uint16 tensor."
-  description: "The attr `channels` indicates the desired number of color channels for the\ndecoded image.\n\nAccepted values are:\n\n*   0: Use the number of channels in the PNG-encoded image.\n*   1: output a grayscale image.\n*   3: output an RGB image.\n*   4: output an RGBA image.\n\nIf needed, the PNG-encoded image is transformed to match the requested number\nof color channels."
+  description: "The attr `channels` indicates the desired number of color channels for the\ndecoded image.\n\nAccepted values are:\n\n*   0: Use the number of channels in the PNG-encoded image.\n*   1: output a grayscale image.\n*   3: output an RGB image.\n*   4: output an RGBA image.\n\nIf needed, the PNG-encoded image is transformed to match the requested number\nof color channels.\n\nThis op also supports decoding JPEGs and non-animated GIFs since the interface\nis the same, though it is cleaner to use `tf.image.decode_image`."
 }
 op {
   name: "DecodeRaw"
@@ -5908,7 +5879,7 @@ op {
     minimum: 2
   }
   summary: "DepthToSpace for tensors of type T."
-  description: "Rearranges data from depth into blocks of spatial data.\nThis is the reverse transformation of SpaceToDepth. More specifically,\nthis op outputs a copy of the input tensor where values from the `depth`\ndimension are moved in spatial blocks to the `height` and `width` dimensions.\nThe attr `block_size` indicates the input block size and how the data is moved.\n\n  * Chunks of data of size `block_size * block_size` from depth are rearranged\n    into non-overlapping blocks of size `block_size x block_size`\n  * The width the output tensor is `input_depth * block_size`, whereas the\n    height is `input_height * block_size`.\n  * The depth of the input tensor must be divisible by\n    `block_size * block_size`.\n\nThat is, assuming the input is in the shape:\n`[batch, height, width, depth]`,\nthe shape of the output will be:\n`[batch, height*block_size, width*block_size, depth/(block_size*block_size)]`\n\nThis operation requires that the input tensor be of rank 4, and that\n`block_size` be >=1 and that `block_size * block_size` be a divisor of the\ninput depth.\n\nThis operation is useful for resizing the activations between convolutions\n(but keeping all data), e.g. instead of pooling. It is also useful for training\npurely convolutional models.\n\nFor example, given this input of shape `[1, 1, 1, 4]`, and a block size of 2:\n\n```prettyprint\nx = [[[[1, 2, 3, 4]]]]\n\n```\n\nThis operation will output a tensor of shape `[1, 2, 2, 1]`:\n\n```prettyprint\n   [[[[1], [2]],\n     [[3], [4]]]]\n```\n\nHere, the input has a batch of 1 and each batch element has shape `[1, 1, 4]`,\nthe corresponding output will have 2x2 elements and will have a depth of\n1 channel (1 = `4 / (block_size * block_size)`).\nThe output element shape is `[2, 2, 1]`.\n\nFor an input tensor with larger depth, here of shape `[1, 1, 1, 12]`, e.g.\n\n```prettyprint\nx = [[[[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]]]]\n```\n\nThis operation, for block size of 2, will return the following tensor of shape\n`[1, 2, 2, 3]`\n\n```prettyprint\n   [[[[1, 2, 3], [4, 5, 6]],\n     [[7, 8, 9], [10, 11, 12]]]]\n\n```\n\nSimilarly, for the following input of shape `[1 2 2 4]`, and a block size of 2:\n\n```prettyprint\nx =  [[[[1, 2, 3, 4],\n       [5, 6, 7, 8]],\n      [[9, 10, 11, 12],\n       [13, 14, 15, 16]]]]\n```\n\nthe operator will return the following tensor of shape `[1 4 4 1]`:\n\n```prettyprint\nx = [[ [1],   [2],  [5],  [6]],\n     [ [3],   [4],  [7],  [8]],\n     [ [9],  [10], [13],  [14]],\n     [ [11], [12], [15],  [16]]]\n\n```"
+  description: "Rearranges data from depth into blocks of spatial data.\nThis is the reverse transformation of SpaceToDepth. More specifically,\nthis op outputs a copy of the input tensor where values from the `depth`\ndimension are moved in spatial blocks to the `height` and `width` dimensions.\nThe attr `block_size` indicates the input block size and how the data is moved.\n\n  * Chunks of data of size `block_size * block_size` from depth are rearranged\n    into non-overlapping blocks of size `block_size x block_size`\n  * The width the output tensor is `input_depth * block_size`, whereas the\n    height is `input_height * block_size`.\n  * The depth of the input tensor must be divisible by\n    `block_size * block_size`.\n\nThat is, assuming the input is in the shape:\n`[batch, height, width, depth]`,\nthe shape of the output will be:\n`[batch, height*block_size, width*block_size, depth/(block_size*block_size)]`\n\nThis operation requires that the input tensor be of rank 4, and that\n`block_size` be >=1 and that `block_size * block_size` be a divisor of the\ninput depth.\n\nThis operation is useful for resizing the activations between convolutions\n(but keeping all data), e.g. instead of pooling. It is also useful for training\npurely convolutional models.\n\nFor example, given this input of shape `[1, 1, 1, 4]`, and a block size of 2:\n\n```\nx = [[[[1, 2, 3, 4]]]]\n\n```\n\nThis operation will output a tensor of shape `[1, 2, 2, 1]`:\n\n```\n   [[[[1], [2]],\n     [[3], [4]]]]\n```\n\nHere, the input has a batch of 1 and each batch element has shape `[1, 1, 4]`,\nthe corresponding output will have 2x2 elements and will have a depth of\n1 channel (1 = `4 / (block_size * block_size)`).\nThe output element shape is `[2, 2, 1]`.\n\nFor an input tensor with larger depth, here of shape `[1, 1, 1, 12]`, e.g.\n\n```\nx = [[[[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]]]]\n```\n\nThis operation, for block size of 2, will return the following tensor of shape\n`[1, 2, 2, 3]`\n\n```\n   [[[[1, 2, 3], [4, 5, 6]],\n     [[7, 8, 9], [10, 11, 12]]]]\n\n```\n\nSimilarly, for the following input of shape `[1 2 2 4]`, and a block size of 2:\n\n```\nx =  [[[[1, 2, 3, 4],\n       [5, 6, 7, 8]],\n      [[9, 10, 11, 12],\n       [13, 14, 15, 16]]]]\n```\n\nthe operator will return the following tensor of shape `[1 4 4 1]`:\n\n```\nx = [[ [1],   [2],  [5],  [6]],\n     [ [3],   [4],  [7],  [8]],\n     [ [9],  [10], [13],  [14]],\n     [ [11], [12], [15],  [16]]]\n\n```"
 }
 op {
   name: "DepthwiseConv2dNative"
@@ -6221,7 +6192,7 @@ op {
     }
   }
   summary: "Returns a diagonal tensor with a given diagonal values."
-  description: "Given a `diagonal`, this operation returns a tensor with the `diagonal` and\neverything else padded with zeros. The diagonal is computed as follows:\n\nAssume `diagonal` has dimensions [D1,..., Dk], then the output is a tensor of\nrank 2k with dimensions [D1,..., Dk, D1,..., Dk] where:\n\n`output[i1,..., ik, i1,..., ik] = diagonal[i1, ..., ik]` and 0 everywhere else.\n\nFor example:\n\n```prettyprint\n# \'diagonal\' is [1, 2, 3, 4]\ntf.diag(diagonal) ==> [[1, 0, 0, 0]\n                       [0, 2, 0, 0]\n                       [0, 0, 3, 0]\n                       [0, 0, 0, 4]]\n```"
+  description: "Given a `diagonal`, this operation returns a tensor with the `diagonal` and\neverything else padded with zeros. The diagonal is computed as follows:\n\nAssume `diagonal` has dimensions [D1,..., Dk], then the output is a tensor of\nrank 2k with dimensions [D1,..., Dk, D1,..., Dk] where:\n\n`output[i1,..., ik, i1,..., ik] = diagonal[i1, ..., ik]` and 0 everywhere else.\n\nFor example:\n\n```\n# \'diagonal\' is [1, 2, 3, 4]\ntf.diag(diagonal) ==> [[1, 0, 0, 0]\n                       [0, 2, 0, 0]\n                       [0, 0, 3, 0]\n                       [0, 0, 0, 4]]\n```"
 }
 op {
   name: "DiagPart"
@@ -6250,7 +6221,7 @@ op {
     }
   }
   summary: "Returns the diagonal part of the tensor."
-  description: "This operation returns a tensor with the `diagonal` part\nof the `input`. The `diagonal` part is computed as follows:\n\nAssume `input` has dimensions `[D1,..., Dk, D1,..., Dk]`, then the output is a\ntensor of rank `k` with dimensions `[D1,..., Dk]` where:\n\n`diagonal[i1,..., ik] = input[i1, ..., ik, i1,..., ik]`.\n\nFor example:\n\n```prettyprint\n# \'input\' is [[1, 0, 0, 0]\n              [0, 2, 0, 0]\n              [0, 0, 3, 0]\n              [0, 0, 0, 4]]\n\ntf.diag_part(input) ==> [1, 2, 3, 4]\n```"
+  description: "This operation returns a tensor with the `diagonal` part\nof the `input`. The `diagonal` part is computed as follows:\n\nAssume `input` has dimensions `[D1,..., Dk, D1,..., Dk]`, then the output is a\ntensor of rank `k` with dimensions `[D1,..., Dk]` where:\n\n`diagonal[i1,..., ik] = input[i1, ..., ik, i1,..., ik]`.\n\nFor example:\n\n```\n# \'input\' is [[1, 0, 0, 0]\n              [0, 2, 0, 0]\n              [0, 0, 3, 0]\n              [0, 0, 0, 4]]\n\ntf.diag_part(input) ==> [1, 2, 3, 4]\n```"
 }
 op {
   name: "Digamma"
@@ -6567,7 +6538,7 @@ op {
     type: "type"
   }
   summary: "Partitions `data` into `num_partitions` tensors using indices from `partitions`."
-  description: "For each index tuple `js` of size `partitions.ndim`, the slice `data[js, ...]`\nbecomes part of `outputs[partitions[js]]`.  The slices with `partitions[js] = i`\nare placed in `outputs[i]` in lexicographic order of `js`, and the first\ndimension of `outputs[i]` is the number of entries in `partitions` equal to `i`.\nIn detail,\n\n```python\n    outputs[i].shape = [sum(partitions == i)] + data.shape[partitions.ndim:]\n\n    outputs[i] = pack([data[js, ...] for js if partitions[js] == i])\n```\n\n`data.shape` must start with `partitions.shape`.\n\nFor example:\n\n```python\n    # Scalar partitions.\n    partitions = 1\n    num_partitions = 2\n    data = [10, 20]\n    outputs[0] = []  # Empty with shape [0, 2]\n    outputs[1] = [[10, 20]]\n\n    # Vector partitions.\n    partitions = [0, 0, 1, 1, 0]\n    num_partitions = 2\n    data = [10, 20, 30, 40, 50]\n    outputs[0] = [10, 20, 50]\n    outputs[1] = [30, 40]\n```\n\n<div style=\"width:70%; margin:auto; margin-bottom:10px; margin-top:20px;\">\n<img style=\"width:100%\" src=\"../../images/DynamicPartition.png\" alt>\n</div>"
+  description: "For each index tuple `js` of size `partitions.ndim`, the slice `data[js, ...]`\nbecomes part of `outputs[partitions[js]]`.  The slices with `partitions[js] = i`\nare placed in `outputs[i]` in lexicographic order of `js`, and the first\ndimension of `outputs[i]` is the number of entries in `partitions` equal to `i`.\nIn detail,\n\n```python\n    outputs[i].shape = [sum(partitions == i)] + data.shape[partitions.ndim:]\n\n    outputs[i] = pack([data[js, ...] for js if partitions[js] == i])\n```\n\n`data.shape` must start with `partitions.shape`.\n\nFor example:\n\n```python\n    # Scalar partitions.\n    partitions = 1\n    num_partitions = 2\n    data = [10, 20]\n    outputs[0] = []  # Empty with shape [0, 2]\n    outputs[1] = [[10, 20]]\n\n    # Vector partitions.\n    partitions = [0, 0, 1, 1, 0]\n    num_partitions = 2\n    data = [10, 20, 30, 40, 50]\n    outputs[0] = [10, 20, 50]\n    outputs[1] = [30, 40]\n```\n\nSee `dynamic_stitch` for an example on how to merge partitions back.\n\n<div style=\"width:70%; margin:auto; margin-bottom:10px; margin-top:20px;\">\n<img style=\"width:100%\" src=\"https://www.tensorflow.org/images/DynamicPartition.png\" alt>\n</div>"
 }
 op {
   name: "DynamicStitch"
@@ -6596,7 +6567,7 @@ op {
     type: "type"
   }
   summary: "Interleave the values from the `data` tensors into a single tensor."
-  description: "Builds a merged tensor such that\n\n```python\n    merged[indices[m][i, ..., j], ...] = data[m][i, ..., j, ...]\n```\n\nFor example, if each `indices[m]` is scalar or vector, we have\n\n```python\n    # Scalar indices:\n    merged[indices[m], ...] = data[m][...]\n\n    # Vector indices:\n    merged[indices[m][i], ...] = data[m][i, ...]\n```\n\nEach `data[i].shape` must start with the corresponding `indices[i].shape`,\nand the rest of `data[i].shape` must be constant w.r.t. `i`.  That is, we\nmust have `data[i].shape = indices[i].shape + constant`.  In terms of this\n`constant`, the output shape is\n\n    merged.shape = [max(indices)] + constant\n\nValues are merged in order, so if an index appears in both `indices[m][i]` and\n`indices[n][j]` for `(m,i) < (n,j)` the slice `data[n][j]` will appear in the\nmerged result.\n\nFor example:\n\n```python\n    indices[0] = 6\n    indices[1] = [4, 1]\n    indices[2] = [[5, 2], [0, 3]]\n    data[0] = [61, 62]\n    data[1] = [[41, 42], [11, 12]]\n    data[2] = [[[51, 52], [21, 22]], [[1, 2], [31, 32]]]\n    merged = [[1, 2], [11, 12], [21, 22], [31, 32], [41, 42],\n              [51, 52], [61, 62]]\n```\n\n<div style=\"width:70%; margin:auto; margin-bottom:10px; margin-top:20px;\">\n<img style=\"width:100%\" src=\"../../images/DynamicStitch.png\" alt>\n</div>"
+  description: "Builds a merged tensor such that\n\n```python\n    merged[indices[m][i, ..., j], ...] = data[m][i, ..., j, ...]\n```\n\nFor example, if each `indices[m]` is scalar or vector, we have\n\n```python\n    # Scalar indices:\n    merged[indices[m], ...] = data[m][...]\n\n    # Vector indices:\n    merged[indices[m][i], ...] = data[m][i, ...]\n```\n\nEach `data[i].shape` must start with the corresponding `indices[i].shape`,\nand the rest of `data[i].shape` must be constant w.r.t. `i`.  That is, we\nmust have `data[i].shape = indices[i].shape + constant`.  In terms of this\n`constant`, the output shape is\n\n    merged.shape = [max(indices)] + constant\n\nValues are merged in order, so if an index appears in both `indices[m][i]` and\n`indices[n][j]` for `(m,i) < (n,j)` the slice `data[n][j]` will appear in the\nmerged result.\n\nFor example:\n\n```python\n    indices[0] = 6\n    indices[1] = [4, 1]\n    indices[2] = [[5, 2], [0, 3]]\n    data[0] = [61, 62]\n    data[1] = [[41, 42], [11, 12]]\n    data[2] = [[[51, 52], [21, 22]], [[1, 2], [31, 32]]]\n    merged = [[1, 2], [11, 12], [21, 22], [31, 32], [41, 42],\n              [51, 52], [61, 62]]\n```\n\nThis method can be used to merge partitions created by `dynamic_partition`\nas illustrated on the following example:\n\n```python\n    # Apply function (increments x_i) on elements for which a certain condition\n    # apply (x_i != -1 in this example).\n    x=tf.constant([0.1, -1., 5.2, 4.3, -1., 7.4])\n    condition_mask=tf.not_equal(x,tf.constant(-1.))\n    partitioned_data = tf.dynamic_partition(\n        x, tf.cast(condition_mask, tf.int32) , 2)\n    partitioned_data[1] = partitioned_data[1] + 1.0\n    condition_indices = tf.dynamic_partition(\n        tf.range(tf.shape(x)[0]), tf.cast(condition_mask, tf.int32) , 2)\n    x = tf.dynamic_stitch(condition_indices, partitioned_data)\n    # Here x=[1.1, -1., 6.2, 5.3, -1, 8.4], the -1. values remain\n    # unchanged.\n```\n\n<div style=\"width:70%; margin:auto; margin-bottom:10px; margin-top:20px;\">\n<img style=\"width:100%\" src=\"https://www.tensorflow.org/images/DynamicStitch.png\" alt>\n</div>"
 }
 op {
   name: "EditDistance"
@@ -6665,15 +6636,9 @@ op {
     type: "type"
     allowed_values {
       list {
+        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT32
-        type: DT_INT64
-        type: DT_UINT8
-        type: DT_INT16
-        type: DT_INT8
-        type: DT_UINT16
-        type: DT_HALF
       }
     }
   }
@@ -6702,15 +6667,9 @@ op {
     type: "type"
     allowed_values {
       list {
+        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT32
-        type: DT_INT64
-        type: DT_UINT8
-        type: DT_INT16
-        type: DT_INT8
-        type: DT_UINT16
-        type: DT_HALF
       }
     }
   }
@@ -7100,7 +7059,7 @@ op {
     }
   }
   summary: "Inserts a dimension of 1 into a tensor\'s shape."
-  description: "Given a tensor `input`, this operation inserts a dimension of 1 at the\ndimension index `dim` of `input`\'s shape. The dimension index `dim` starts at\nzero; if you specify a negative number for `dim` it is counted backward from\nthe end.\n\nThis operation is useful if you want to add a batch dimension to a single\nelement. For example, if you have a single image of shape `[height, width,\nchannels]`, you can make it a batch of 1 image with `expand_dims(image, 0)`,\nwhich will make the shape `[1, height, width, channels]`.\n\nOther examples:\n\n```prettyprint\n# \'t\' is a tensor of shape [2]\nshape(expand_dims(t, 0)) ==> [1, 2]\nshape(expand_dims(t, 1)) ==> [2, 1]\nshape(expand_dims(t, -1)) ==> [2, 1]\n\n# \'t2\' is a tensor of shape [2, 3, 5]\nshape(expand_dims(t2, 0)) ==> [1, 2, 3, 5]\nshape(expand_dims(t2, 2)) ==> [2, 3, 1, 5]\nshape(expand_dims(t2, 3)) ==> [2, 3, 5, 1]\n```\n\nThis operation requires that:\n\n`-1-input.dims() <= dim <= input.dims()`\n\nThis operation is related to `squeeze()`, which removes dimensions of\nsize 1."
+  description: "Given a tensor `input`, this operation inserts a dimension of 1 at the\ndimension index `dim` of `input`\'s shape. The dimension index `dim` starts at\nzero; if you specify a negative number for `dim` it is counted backward from\nthe end.\n\nThis operation is useful if you want to add a batch dimension to a single\nelement. For example, if you have a single image of shape `[height, width,\nchannels]`, you can make it a batch of 1 image with `expand_dims(image, 0)`,\nwhich will make the shape `[1, height, width, channels]`.\n\nOther examples:\n\n```\n# \'t\' is a tensor of shape [2]\nshape(expand_dims(t, 0)) ==> [1, 2]\nshape(expand_dims(t, 1)) ==> [2, 1]\nshape(expand_dims(t, -1)) ==> [2, 1]\n\n# \'t2\' is a tensor of shape [2, 3, 5]\nshape(expand_dims(t2, 0)) ==> [1, 2, 3, 5]\nshape(expand_dims(t2, 2)) ==> [2, 3, 1, 5]\nshape(expand_dims(t2, 3)) ==> [2, 3, 5, 1]\n```\n\nThis operation requires that:\n\n`-1-input.dims() <= dim <= input.dims()`\n\nThis operation is related to `squeeze()`, which removes dimensions of\nsize 1."
 }
 op {
   name: "Expm1"
@@ -7420,8 +7379,15 @@ op {
       f: 6
     }
   }
+  attr {
+    name: "num_bits"
+    type: "int"
+    default_value {
+      i: 8
+    }
+  }
   summary: "Fake-quantize the \'inputs\' tensor, type float to \'outputs\' tensor of same type."
-  description: "Attributes [min; max] define the clamping range for the \'inputs\' data.  Op\ndivides this range into 255 steps (total of 256 values), then replaces each\n\'inputs\' value with the closest of the quantized step values.\n\nQuantization is called fake since the output is still in floating point."
+  description: "Attributes [min; max] define the clamping range for the \'inputs\' data.  Op\ndivides this range into 255 steps (total of 256 values), then replaces each\n\'inputs\' value with the closest of the quantized step values.\n\'num_bits\' is the bitwidth of the quantization; between 2 and 8, inclusive.\n\nQuantization is called fake since the output is still in floating point."
 }
 op {
   name: "FakeQuantWithMinMaxArgsGradient"
@@ -7454,6 +7420,13 @@ op {
       f: 6
     }
   }
+  attr {
+    name: "num_bits"
+    type: "int"
+    default_value {
+      i: 8
+    }
+  }
   summary: "Compute gradients for a FakeQuantWithMinMaxArgs operation."
 }
 op {
@@ -7474,8 +7447,15 @@ op {
     name: "outputs"
     type: DT_FLOAT
   }
+  attr {
+    name: "num_bits"
+    type: "int"
+    default_value {
+      i: 8
+    }
+  }
   summary: "Fake-quantize the \'inputs\' tensor of type float via global float scalars `min`"
-  description: "and `max` to \'outputs\' tensor of same shape as `inputs`.\n\n[min; max] is the clamping range for the \'inputs\' data.  Op divides this range\ninto 255 steps (total of 256 values), then replaces each \'inputs\' value with the\nclosest of the quantized step values.\n\nThis operation has a gradient and thus allows for training `min` and `max` values."
+  description: "and `max` to \'outputs\' tensor of same shape as `inputs`.\n\n[min; max] is the clamping range for the \'inputs\' data.  Op divides this range\ninto 255 steps (total of 256 values), then replaces each \'inputs\' value with the\nclosest of the quantized step values.\n\'num_bits\' is the bitwidth of the quantization; between 2 and 8, inclusive.\n\nThis operation has a gradient and thus allows for training `min` and `max` values."
 }
 op {
   name: "FakeQuantWithMinMaxVarsGradient"
@@ -7512,6 +7492,14 @@ op {
     description: "Backpropagated gradients w.r.t. max parameter:\n`sum(gradients * (inputs > max))`."
     type: DT_FLOAT
   }
+  attr {
+    name: "num_bits"
+    type: "int"
+    default_value {
+      i: 8
+    }
+    description: "The bitwidth of the quantization; between 2 and 8, inclusive."
+  }
   summary: "Compute gradients for a FakeQuantWithMinMaxVars operation."
 }
 op {
@@ -7532,8 +7520,15 @@ op {
     name: "outputs"
     type: DT_FLOAT
   }
+  attr {
+    name: "num_bits"
+    type: "int"
+    default_value {
+      i: 8
+    }
+  }
   summary: "Fake-quantize the \'inputs\' tensor of type float and one of the shapes: `[d]`,"
-  description: "`[b, d]` `[b, h, w, d]` via per-channel floats `min` and `max` of shape `[d]`\nto \'outputs\' tensor of same shape as `inputs`.\n\n[min; max] is the clamping range for the \'inputs\' data in the corresponding\ndepth channel.  Op divides this range into 255 steps (total of 256 values), then\nreplaces each \'inputs\' value with the closest of the quantized step values.\n\nThis operation has a gradient and thus allows for training `min` and `max` values."
+  description: "`[b, d]` `[b, h, w, d]` via per-channel floats `min` and `max` of shape `[d]`\nto \'outputs\' tensor of same shape as `inputs`.\n\n[min; max] is the clamping range for the \'inputs\' data in the corresponding\ndepth channel.  Op divides this range into 255 steps (total of 256 values), then\nreplaces each \'inputs\' value with the closest of the quantized step values.\n\'num_bits\' is the bitwidth of the quantization; between 2 and 8, inclusive.\n\nThis operation has a gradient and thus allows for training `min` and `max` values."
 }
 op {
   name: "FakeQuantWithMinMaxVarsPerChannelGradient"
@@ -7570,6 +7565,14 @@ op {
     description: "Backpropagated gradients w.r.t. max parameter, shape `[d]`:\n`sum_per_d(gradients * (inputs > max))`."
     type: DT_FLOAT
   }
+  attr {
+    name: "num_bits"
+    type: "int"
+    default_value {
+      i: 8
+    }
+    description: "The bitwidth of the quantization; between 2 and 8, inclusive."
+  }
   summary: "Compute gradients for a FakeQuantWithMinMaxVarsPerChannel operation."
 }
 op {
@@ -7607,7 +7610,7 @@ op {
     type: "type"
   }
   summary: "Creates a tensor filled with a scalar value."
-  description: "This operation creates a tensor of shape `dims` and fills it with `value`.\n\nFor example:\n\n```prettyprint\n# Output tensor has shape [2, 3].\nfill([2, 3], 9) ==> [[9, 9, 9]\n                     [9, 9, 9]]\n```"
+  description: "This operation creates a tensor of shape `dims` and fills it with `value`.\n\nFor example:\n\n```\n# Output tensor has shape [2, 3].\nfill([2, 3], 9) ==> [[9, 9, 9]\n                     [9, 9, 9]]\n```"
 }
 op {
   name: "FixedLengthRecordReader"
@@ -7752,7 +7755,7 @@ op {
   attr {
     name: "num_sampled"
     type: "int"
-    description: "Number of candidates to randomly sample per batch."
+    description: "Number of candidates to randomly sample."
     has_minimum: true
     minimum: 1
   }
@@ -7838,6 +7841,7 @@ op {
   }
   summary: "Generates labels for candidate sampling with a learned unigram distribution."
   description: "A unigram sampler could use a fixed unigram distribution read from a\nfile or passed in as an in-memory array instead of building up the distribution\nfrom data on the fly. There is also an option to skew the distribution by\napplying a distortion power to the weights.\n\nThe vocabulary file should be in CSV-like format, with the last field\nbeing the weight associated with the word.\n\nFor each batch, this op picks a single set of sampled candidate labels.\n\nThe advantages of sampling candidates per-batch are simplicity and the\npossibility of efficient dense matrix multiplication. The disadvantage is that\nthe sampled candidates must be chosen independently of the context and of the\ntrue labels."
+  is_stateful: true
 }
 op {
   name: "Floor"
@@ -8258,19 +8262,6 @@ op {
     allowed_values {
       list {
         type: DT_FLOAT
-        type: DT_DOUBLE
-        type: DT_INT64
-        type: DT_INT32
-        type: DT_UINT8
-        type: DT_UINT16
-        type: DT_INT16
-        type: DT_INT8
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
-        type: DT_QINT8
-        type: DT_QUINT8
-        type: DT_QINT32
-        type: DT_HALF
       }
     }
   }
@@ -8360,19 +8351,6 @@ op {
     allowed_values {
       list {
         type: DT_FLOAT
-        type: DT_DOUBLE
-        type: DT_INT64
-        type: DT_INT32
-        type: DT_UINT8
-        type: DT_UINT16
-        type: DT_INT16
-        type: DT_INT8
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
-        type: DT_QINT8
-        type: DT_QUINT8
-        type: DT_QINT32
-        type: DT_HALF
       }
     }
   }
@@ -8429,9 +8407,7 @@ op {
     type: "type"
     allowed_values {
       list {
-        type: DT_HALF
         type: DT_FLOAT
-        type: DT_DOUBLE
       }
     }
   }
@@ -8495,9 +8471,7 @@ op {
     type: "type"
     allowed_values {
       list {
-        type: DT_HALF
         type: DT_FLOAT
-        type: DT_DOUBLE
       }
     }
   }
@@ -8574,23 +8548,23 @@ op {
     }
   }
   summary: "Gather slices from `params` according to `indices`."
-  description: "`indices` must be an integer tensor of any dimension (usually 0-D or 1-D).\nProduces an output tensor with shape `indices.shape + params.shape[1:]` where:\n\n```python\n    # Scalar indices\n    output[:, ..., :] = params[indices, :, ... :]\n\n    # Vector indices\n    output[i, :, ..., :] = params[indices[i], :, ... :]\n\n    # Higher rank indices\n    output[i, ..., j, :, ... :] = params[indices[i, ..., j], :, ..., :]\n```\n\nIf `indices` is a permutation and `len(indices) == params.shape[0]` then\nthis operation will permute `params` accordingly.\n\n`validate_indices`: DEPRECATED. If this operation is assigned to CPU, values in\n`indices` are always validated to be within range. If assigned to GPU,\nout-of-bound indices result in safe but unspecified behavior, which may include\nraising an error.\n\n<div style=\"width:70%; margin:auto; margin-bottom:10px; margin-top:20px;\">\n<img style=\"width:100%\" src=\"../../../images/Gather.png\" alt>\n</div>"
+  description: "`indices` must be an integer tensor of any dimension (usually 0-D or 1-D).\nProduces an output tensor with shape `indices.shape + params.shape[1:]` where:\n\n```python\n    # Scalar indices\n    output[:, ..., :] = params[indices, :, ... :]\n\n    # Vector indices\n    output[i, :, ..., :] = params[indices[i], :, ... :]\n\n    # Higher rank indices\n    output[i, ..., j, :, ... :] = params[indices[i, ..., j], :, ..., :]\n```\n\nIf `indices` is a permutation and `len(indices) == params.shape[0]` then\nthis operation will permute `params` accordingly.\n\n`validate_indices`: DEPRECATED. If this operation is assigned to CPU, values in\n`indices` are always validated to be within range. If assigned to GPU,\nout-of-bound indices result in safe but unspecified behavior, which may include\nraising an error.\n\n<div style=\"width:70%; margin:auto; margin-bottom:10px; margin-top:20px;\">\n<img style=\"width:100%\" src=\"https://www.tensorflow.org/images/Gather.png\" alt>\n</div>"
 }
 op {
   name: "GatherNd"
   input_arg {
     name: "params"
-    description: "`P-D`.  The tensor from which to gather values."
+    description: "The tensor from which to gather values."
     type_attr: "Tparams"
   }
   input_arg {
     name: "indices"
-    description: "`Q-D`.  Index tensor having shape `[d_0, ..., d_{Q-2}, K]`."
+    description: "Index tensor."
     type_attr: "Tindices"
   }
   output_arg {
     name: "output"
-    description: "`(P+Q-K-1)-D`.  Values from `params` gathered from indices given by\n`indices`."
+    description: "Values from `params` gathered from indices given by `indices`, with\nshape `indices.shape[:-1] + params.shape[indices.shape[-1]:]`."
     type_attr: "Tparams"
   }
   attr {
@@ -8608,7 +8582,7 @@ op {
     }
   }
   summary: "Gather values or slices from `params` according to `indices`."
-  description: "`params` is a Tensor of rank `P` and `indices` is a Tensor of rank `Q`.\n\n`indices` must be integer tensor, containing indices into `params`.\nIt must be shape `[d_0, ..., d_{Q-2}, K]` where `0 < K <= P`.\n\nThe innermost dimension of `indices` (with length `K`) corresponds to\nindices into elements (if `K = P`) or slices (if `K < P`) along the `K`th\ndimension of `params`.\n\nProduces an output tensor with shape\n\n```\n[d_0, ..., d_{Q-2}, params.shape[K], ..., params.shape[P-1]].\n```\n\nSome examples below.\n\nSimple indexing into a matrix:\n\n```python\n    indices = [[0, 0], [1, 1]]\n    params = [[\'a\', \'b\'], [\'c\', \'d\']]\n    output = [\'a\', \'d\']\n```\n\nSlice indexing into a matrix:\n\n```python\n    indices = [[1], [0]]\n    params = [[\'a\', \'b\'], [\'c\', \'d\']]\n    output = [[\'c\', \'d\'], [\'a\', \'b\']]\n```\n\nIndexing into a 3-tensor:\n\n```python\n    indices = [[1]]\n    params = [[[\'a0\', \'b0\'], [\'c0\', \'d0\']],\n              [[\'a1\', \'b1\'], [\'c1\', \'d1\']]]\n    output = [[[\'a1\', \'b1\'], [\'c1\', \'d1\']]]\n\n\n    indices = [[0, 1], [1, 0]]\n    params = [[[\'a0\', \'b0\'], [\'c0\', \'d0\']],\n              [[\'a1\', \'b1\'], [\'c1\', \'d1\']]]\n    output = [[\'c0\', \'d0\'], [\'a1\', \'b1\']]\n\n\n    indices = [[0, 0, 1], [1, 0, 1]]\n    params = [[[\'a0\', \'b0\'], [\'c0\', \'d0\']],\n              [[\'a1\', \'b1\'], [\'c1\', \'d1\']]]\n    output = [\'b0\', \'b1\']\n```\n\nBatched indexing into a matrix:\n\n```python\n    indices = [[[0, 0]], [[0, 1]]]\n    params = [[\'a\', \'b\'], [\'c\', \'d\']]\n    output = [[\'a\'], [\'b\']]\n```\n\nBatched slice indexing into a matrix:\n\n```python\n    indices = [[[1]], [[0]]]\n    params = [[\'a\', \'b\'], [\'c\', \'d\']]\n    output = [[[\'c\', \'d\']], [[\'a\', \'b\']]]\n```\n\nBatched indexing into a 3-tensor:\n\n```python\n    indices = [[[1]], [[0]]]\n    params = [[[\'a0\', \'b0\'], [\'c0\', \'d0\']],\n              [[\'a1\', \'b1\'], [\'c1\', \'d1\']]]\n    output = [[[[\'a1\', \'b1\'], [\'c1\', \'d1\']]],\n              [[[\'a0\', \'b0\'], [\'c0\', \'d0\']]]]\n\n    indices = [[[0, 1], [1, 0]], [[0, 0], [1, 1]]]\n    params = [[[\'a0\', \'b0\'], [\'c0\', \'d0\']],\n              [[\'a1\', \'b1\'], [\'c1\', \'d1\']]]\n    output = [[[\'c0\', \'d0\'], [\'a1\', \'b1\']],\n              [[\'a0\', \'b0\'], [\'c1\', \'d1\']]]\n\n\n    indices = [[[0, 0, 1], [1, 0, 1]], [[0, 1, 1], [1, 1, 0]]]\n    params = [[[\'a0\', \'b0\'], [\'c0\', \'d0\']],\n              [[\'a1\', \'b1\'], [\'c1\', \'d1\']]]\n    output = [[\'b0\', \'b1\'], [\'d0\', \'c1\']]\n```"
+  description: "`indices` is an integer tensor containing indices into `params`.  The last\ndimension of `indices` can be at most the rank of `params`:\n\n    indices.shape[-1] <= params.rank\n\nThe last dimension of `indices` corresponds to elements\n(if `indices.shape[-1] = params.rank`) or slices\n(if `indices.shape[-1] < params.rank`) along dimension `indices.shape[-1]`\nof `params`.  The output tensor has shape\n\n    indices.shape[:-1] + params.shape[indices.shape[-1]:]\n\nSome examples below.\n\nSimple indexing into a matrix:\n\n```python\n    indices = [[0, 0], [1, 1]]\n    params = [[\'a\', \'b\'], [\'c\', \'d\']]\n    output = [\'a\', \'d\']\n```\n\nSlice indexing into a matrix:\n\n```python\n    indices = [[1], [0]]\n    params = [[\'a\', \'b\'], [\'c\', \'d\']]\n    output = [[\'c\', \'d\'], [\'a\', \'b\']]\n```\n\nIndexing into a 3-tensor:\n\n```python\n    indices = [[1]]\n    params = [[[\'a0\', \'b0\'], [\'c0\', \'d0\']],\n              [[\'a1\', \'b1\'], [\'c1\', \'d1\']]]\n    output = [[[\'a1\', \'b1\'], [\'c1\', \'d1\']]]\n\n\n    indices = [[0, 1], [1, 0]]\n    params = [[[\'a0\', \'b0\'], [\'c0\', \'d0\']],\n              [[\'a1\', \'b1\'], [\'c1\', \'d1\']]]\n    output = [[\'c0\', \'d0\'], [\'a1\', \'b1\']]\n\n\n    indices = [[0, 0, 1], [1, 0, 1]]\n    params = [[[\'a0\', \'b0\'], [\'c0\', \'d0\']],\n              [[\'a1\', \'b1\'], [\'c1\', \'d1\']]]\n    output = [\'b0\', \'b1\']\n```\n\nBatched indexing into a matrix:\n\n```python\n    indices = [[[0, 0]], [[0, 1]]]\n    params = [[\'a\', \'b\'], [\'c\', \'d\']]\n    output = [[\'a\'], [\'b\']]\n```\n\nBatched slice indexing into a matrix:\n\n```python\n    indices = [[[1]], [[0]]]\n    params = [[\'a\', \'b\'], [\'c\', \'d\']]\n    output = [[[\'c\', \'d\']], [[\'a\', \'b\']]]\n```\n\nBatched indexing into a 3-tensor:\n\n```python\n    indices = [[[1]], [[0]]]\n    params = [[[\'a0\', \'b0\'], [\'c0\', \'d0\']],\n              [[\'a1\', \'b1\'], [\'c1\', \'d1\']]]\n    output = [[[[\'a1\', \'b1\'], [\'c1\', \'d1\']]],\n              [[[\'a0\', \'b0\'], [\'c0\', \'d0\']]]]\n\n    indices = [[[0, 1], [1, 0]], [[0, 0], [1, 1]]]\n    params = [[[\'a0\', \'b0\'], [\'c0\', \'d0\']],\n              [[\'a1\', \'b1\'], [\'c1\', \'d1\']]]\n    output = [[[\'c0\', \'d0\'], [\'a1\', \'b1\']],\n              [[\'a0\', \'b0\'], [\'c1\', \'d1\']]]\n\n\n    indices = [[[0, 0, 1], [1, 0, 1]], [[0, 1, 1], [1, 1, 0]]]\n    params = [[[\'a0\', \'b0\'], [\'c0\', \'d0\']],\n              [[\'a1\', \'b1\'], [\'c1\', \'d1\']]]\n    output = [[\'b0\', \'b1\'], [\'d0\', \'c1\']]\n```"
 }
 op {
   name: "GetSessionHandle"
@@ -8809,13 +8783,58 @@ op {
   is_stateful: true
 }
 op {
-  name: "HistogramSummary"
-  input_arg {
-    name: "tag"
-    description: "Scalar.  Tag to use for the `Summary.Value`."
-    type: DT_STRING
-  }
-  input_arg {
+  name: "HashTableV2"
+  output_arg {
+    name: "table_handle"
+    description: "Handle to a table."
+    type: DT_RESOURCE
+  }
+  attr {
+    name: "container"
+    type: "string"
+    default_value {
+      s: ""
+    }
+    description: "If non-empty, this table is placed in the given container.\nOtherwise, a default container is used."
+  }
+  attr {
+    name: "shared_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+    description: "If non-empty, this table is shared under the given name across\nmultiple sessions."
+  }
+  attr {
+    name: "use_node_name_sharing"
+    type: "bool"
+    default_value {
+      b: false
+    }
+    description: "If true and shared_name is empty, the table is shared\nusing the node name."
+  }
+  attr {
+    name: "key_dtype"
+    type: "type"
+    description: "Type of the table keys."
+  }
+  attr {
+    name: "value_dtype"
+    type: "type"
+    description: "Type of the table values."
+  }
+  summary: "Creates a non-initialized hash table."
+  description: "This op creates a hash table, specifying the type of its keys and values.\nBefore using the table you will have to initialize it.  After initialization the\ntable will be immutable."
+  is_stateful: true
+}
+op {
+  name: "HistogramSummary"
+  input_arg {
+    name: "tag"
+    description: "Scalar.  Tag to use for the `Summary.Value`."
+    type: DT_STRING
+  }
+  input_arg {
     name: "values"
     description: "Any shape. Values to use to build the histogram."
     type_attr: "T"
@@ -9049,7 +9068,7 @@ op {
     }
   }
   summary: "Compute the lower regularized incomplete Gamma function `Q(a, x)`."
-  description: "The lower regularized incomplete Gamma function is defined as:\n\n```\nP(a, x) = gamma(a, x) / Gamma(a) = 1 - Q(a, x)\n```\nwhere\n```\ngamma(a, x) = int_{0}^{x} t^{a-1} exp(-t) dt\n```\nis the lower incomplete Gamma function.\n\nNote, above `Q(a, x)` (`Igammac`) is the upper regularized complete\nGamma function."
+  description: "The lower regularized incomplete Gamma function is defined as:\n\n\n\\\\(P(a, x) = gamma(a, x) / Gamma(a) = 1 - Q(a, x)\\\\)\n\nwhere\n\n\\\\(gamma(a, x) = int_{0}^{x} t^{a-1} exp(-t) dt\\\\)\n\nis the lower incomplete Gamma function.\n\nNote, above `Q(a, x)` (`Igammac`) is the upper regularized complete\nGamma function."
 }
 op {
   name: "Igammac"
@@ -9076,7 +9095,7 @@ op {
     }
   }
   summary: "Compute the upper regularized incomplete Gamma function `Q(a, x)`."
-  description: "The upper regularized incomplete Gamma function is defined as:\n\n```\nQ(a, x) = Gamma(a, x) / Gamma(a) = 1 - P(a, x)\n```\nwhere\n```\nGamma(a, x) = int_{x}^{\\infty} t^{a-1} exp(-t) dt\n```\nis the upper incomplete Gama function.\n\nNote, above `P(a, x)` (`Igamma`) is the lower regularized complete\nGamma function."
+  description: "The upper regularized incomplete Gamma function is defined as:\n\n\\\\(Q(a, x) = Gamma(a, x) / Gamma(a) = 1 - P(a, x)\\\\)\n\nwhere\n\n\\\\(Gamma(a, x) = int_{x}^{\\infty} t^{a-1} exp(-t) dt\\\\)\n\nis the upper incomplete Gama function.\n\nNote, above `P(a, x)` (`Igamma`) is the lower regularized complete\nGamma function."
 }
 op {
   name: "Imag"
@@ -9318,6 +9337,82 @@ op {
   summary: "Initializes a table from a text file."
   description: "It inserts one key-value pair into the table for each line of the file.\nThe key and value is extracted from the whole line content, elements from the\nsplit line based on `delimiter` or the line number (starting from zero).\nWhere to extract the key and value from a line is specified by `key_index` and\n`value_index`.\n\n- A value of -1 means use the line number(starting from zero), expects `int64`.\n- A value of -2 means use the whole line content, expects `string`.\n- A value >= 0 means use the index (starting at zero) of the split line based\n  on `delimiter`."
 }
+op {
+  name: "InitializeTableFromTextFileV2"
+  input_arg {
+    name: "table_handle"
+    description: "Handle to a table which will be initialized."
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "filename"
+    description: "Filename of a vocabulary text file."
+    type: DT_STRING
+  }
+  attr {
+    name: "key_index"
+    type: "int"
+    description: "Column index in a line to get the table `key` values from."
+    has_minimum: true
+    minimum: -2
+  }
+  attr {
+    name: "value_index"
+    type: "int"
+    description: "Column index that represents information of a line to get the table\n`value` values from."
+    has_minimum: true
+    minimum: -2
+  }
+  attr {
+    name: "vocab_size"
+    type: "int"
+    default_value {
+      i: -1
+    }
+    description: "Number of elements of the file, use -1 if unknown."
+    has_minimum: true
+    minimum: -1
+  }
+  attr {
+    name: "delimiter"
+    type: "string"
+    default_value {
+      s: "\t"
+    }
+    description: "Delimiter to separate fields in a line."
+  }
+  summary: "Initializes a table from a text file."
+  description: "It inserts one key-value pair into the table for each line of the file.\nThe key and value is extracted from the whole line content, elements from the\nsplit line based on `delimiter` or the line number (starting from zero).\nWhere to extract the key and value from a line is specified by `key_index` and\n`value_index`.\n\n- A value of -1 means use the line number(starting from zero), expects `int64`.\n- A value of -2 means use the whole line content, expects `string`.\n- A value >= 0 means use the index (starting at zero) of the split line based\n  on `delimiter`."
+  is_stateful: true
+}
+op {
+  name: "InitializeTableV2"
+  input_arg {
+    name: "table_handle"
+    description: "Handle to a table which will be initialized."
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "keys"
+    description: "Keys of type Tkey."
+    type_attr: "Tkey"
+  }
+  input_arg {
+    name: "values"
+    description: "Values of type Tval."
+    type_attr: "Tval"
+  }
+  attr {
+    name: "Tkey"
+    type: "type"
+  }
+  attr {
+    name: "Tval"
+    type: "type"
+  }
+  summary: "Table initializer that takes two tensors for keys and values respectively."
+  is_stateful: true
+}
 op {
   name: "Inv"
   input_arg {
@@ -9410,7 +9505,7 @@ op {
     }
   }
   summary: "Computes the inverse permutation of a tensor."
-  description: "This operation computes the inverse of an index permutation. It takes a 1-D\ninteger tensor `x`, which represents the indices of a zero-based array, and\nswaps each value with its index position. In other words, for an output tensor\n`y` and an input tensor `x`, this operation computes the following:\n\n`y[x[i]] = i for i in [0, 1, ..., len(x) - 1]`\n\nThe values must include 0. There can be no duplicate values or negative values.\n\nFor example:\n\n```prettyprint\n# tensor `x` is [3, 4, 0, 2, 1]\ninvert_permutation(x) ==> [2, 4, 3, 0, 1]\n```"
+  description: "This operation computes the inverse of an index permutation. It takes a 1-D\ninteger tensor `x`, which represents the indices of a zero-based array, and\nswaps each value with its index position. In other words, for an output tensor\n`y` and an input tensor `x`, this operation computes the following:\n\n`y[x[i]] = i for i in [0, 1, ..., len(x) - 1]`\n\nThe values must include 0. There can be no duplicate values or negative values.\n\nFor example:\n\n```\n# tensor `x` is [3, 4, 0, 2, 1]\ninvert_permutation(x) ==> [2, 4, 3, 0, 1]\n```"
 }
 op {
   name: "IsFinite"
@@ -9522,20 +9617,9 @@ op {
     type: "type"
     allowed_values {
       list {
+        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT64
-        type: DT_INT32
-        type: DT_UINT8
-        type: DT_UINT16
-        type: DT_INT16
-        type: DT_INT8
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
-        type: DT_QINT8
-        type: DT_QUINT8
-        type: DT_QINT32
-        type: DT_HALF
       }
     }
   }
@@ -9702,7 +9786,7 @@ op {
   attr {
     name: "num_sampled"
     type: "int"
-    description: "Number of candidates to randomly sample per batch."
+    description: "Number of candidates to randomly sample."
     has_minimum: true
     minimum: 1
   }
@@ -9736,6 +9820,7 @@ op {
   }
   summary: "Generates labels for candidate sampling with a learned unigram distribution."
   description: "See explanations of candidate sampling and the data formats at\ngo/candidate-sampling.\n\nFor each batch, this op picks a single set of sampled candidate labels.\n\nThe advantages of sampling candidates per-batch are simplicity and the\npossibility of efficient dense matrix multiplication. The disadvantage is that\nthe sampled candidates must be chosen independently of the context and of the\ntrue labels."
+  is_stateful: true
 }
 op {
   name: "Less"
@@ -9916,7 +10001,7 @@ op {
     }
   }
   summary: "Computes the difference between two lists of numbers or strings."
-  description: "Given a list `x` and a list `y`, this operation returns a list `out` that\nrepresents all values that are in `x` but not in `y`. The returned list `out`\nis sorted in the same order that the numbers appear in `x` (duplicates are\npreserved). This operation also returns a list `idx` that represents the\nposition of each `out` element in `x`. In other words:\n\n`out[i] = x[idx[i]] for i in [0, 1, ..., len(out) - 1]`\n\nFor example, given this input:\n\n```prettyprint\nx = [1, 2, 3, 4, 5, 6]\ny = [1, 3, 5]\n```\n\nThis operation would return:\n\n```prettyprint\nout ==> [2, 4, 6]\nidx ==> [1, 3, 5]\n```"
+  description: "Given a list `x` and a list `y`, this operation returns a list `out` that\nrepresents all values that are in `x` but not in `y`. The returned list `out`\nis sorted in the same order that the numbers appear in `x` (duplicates are\npreserved). This operation also returns a list `idx` that represents the\nposition of each `out` element in `x`. In other words:\n\n`out[i] = x[idx[i]] for i in [0, 1, ..., len(out) - 1]`\n\nFor example, given this input:\n\n```\nx = [1, 2, 3, 4, 5, 6]\ny = [1, 3, 5]\n```\n\nThis operation would return:\n\n```\nout ==> [2, 4, 6]\nidx ==> [1, 3, 5]\n```"
 }
 op {
   name: "Log"
@@ -10028,7 +10113,7 @@ op {
   attr {
     name: "num_sampled"
     type: "int"
-    description: "Number of candidates to randomly sample per batch."
+    description: "Number of candidates to randomly sample."
     has_minimum: true
     minimum: 1
   }
@@ -10062,6 +10147,7 @@ op {
   }
   summary: "Generates labels for candidate sampling with a log-uniform distribution."
   description: "See explanations of candidate sampling and the data formats at\ngo/candidate-sampling.\n\nFor each batch, this op picks a single set of sampled candidate labels.\n\nThe advantages of sampling candidates per-batch are simplicity and the\npossibility of efficient dense matrix multiplication. The disadvantage is that\nthe sampled candidates must be chosen independently of the context and of the\ntrue labels."
+  is_stateful: true
 }
 op {
   name: "LogicalAnd"
@@ -10139,6 +10225,34 @@ op {
   }
   summary: "Outputs all keys and values in the table."
 }
+op {
+  name: "LookupTableExportV2"
+  input_arg {
+    name: "table_handle"
+    description: "Handle to the table."
+    type: DT_RESOURCE
+  }
+  output_arg {
+    name: "keys"
+    description: "Vector of all keys present in the table."
+    type_attr: "Tkeys"
+  }
+  output_arg {
+    name: "values"
+    description: "Tensor of all values in the table. Indexed in parallel with `keys`."
+    type_attr: "Tvalues"
+  }
+  attr {
+    name: "Tkeys"
+    type: "type"
+  }
+  attr {
+    name: "Tvalues"
+    type: "type"
+  }
+  summary: "Outputs all keys and values in the table."
+  is_stateful: true
+}
 op {
   name: "LookupTableFind"
   input_arg {
@@ -10172,6 +10286,39 @@ op {
   summary: "Looks up keys in a table, outputs the corresponding values."
   description: "The tensor `keys` must of the same type as the keys of the table.\nThe output `values` is of the type of the table values.\n\nThe scalar `default_value` is the value output for keys not present in the\ntable. It must also be of the same type as the table values."
 }
+op {
+  name: "LookupTableFindV2"
+  input_arg {
+    name: "table_handle"
+    description: "Handle to the table."
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "keys"
+    description: "Any shape.  Keys to look up."
+    type_attr: "Tin"
+  }
+  input_arg {
+    name: "default_value"
+    type_attr: "Tout"
+  }
+  output_arg {
+    name: "values"
+    description: "Same shape as `keys`.  Values found in the table, or `default_values`\nfor missing keys."
+    type_attr: "Tout"
+  }
+  attr {
+    name: "Tin"
+    type: "type"
+  }
+  attr {
+    name: "Tout"
+    type: "type"
+  }
+  summary: "Looks up keys in a table, outputs the corresponding values."
+  description: "The tensor `keys` must of the same type as the keys of the table.\nThe output `values` is of the type of the table values.\n\nThe scalar `default_value` is the value output for keys not present in the\ntable. It must also be of the same type as the table values."
+  is_stateful: true
+}
 op {
   name: "LookupTableImport"
   input_arg {
@@ -10201,6 +10348,35 @@ op {
   summary: "Replaces the contents of the table with the specified keys and values."
   description: "The tensor `keys` must be of the same type as the keys of the table.\nThe tensor `values` must be of the type of the table values."
 }
+op {
+  name: "LookupTableImportV2"
+  input_arg {
+    name: "table_handle"
+    description: "Handle to the table."
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "keys"
+    description: "Any shape.  Keys to look up."
+    type_attr: "Tin"
+  }
+  input_arg {
+    name: "values"
+    description: "Values to associate with keys."
+    type_attr: "Tout"
+  }
+  attr {
+    name: "Tin"
+    type: "type"
+  }
+  attr {
+    name: "Tout"
+    type: "type"
+  }
+  summary: "Replaces the contents of the table with the specified keys and values."
+  description: "The tensor `keys` must be of the same type as the keys of the table.\nThe tensor `values` must be of the type of the table values."
+  is_stateful: true
+}
 op {
   name: "LookupTableInsert"
   input_arg {
@@ -10230,6 +10406,35 @@ op {
   summary: "Updates the table to associates keys with values."
   description: "The tensor `keys` must be of the same type as the keys of the table.\nThe tensor `values` must be of the type of the table values."
 }
+op {
+  name: "LookupTableInsertV2"
+  input_arg {
+    name: "table_handle"
+    description: "Handle to the table."
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "keys"
+    description: "Any shape.  Keys to look up."
+    type_attr: "Tin"
+  }
+  input_arg {
+    name: "values"
+    description: "Values to associate with keys."
+    type_attr: "Tout"
+  }
+  attr {
+    name: "Tin"
+    type: "type"
+  }
+  attr {
+    name: "Tout"
+    type: "type"
+  }
+  summary: "Updates the table to associates keys with values."
+  description: "The tensor `keys` must be of the same type as the keys of the table.\nThe tensor `values` must be of the type of the table values."
+  is_stateful: true
+}
 op {
   name: "LookupTableSize"
   input_arg {
@@ -10245,6 +10450,21 @@ op {
   }
   summary: "Computes the number of elements in the given table."
 }
+op {
+  name: "LookupTableSizeV2"
+  input_arg {
+    name: "table_handle"
+    description: "Handle to the table."
+    type: DT_RESOURCE
+  }
+  output_arg {
+    name: "size"
+    description: "Scalar that contains number of elements in the table."
+    type: DT_INT64
+  }
+  summary: "Computes the number of elements in the given table."
+  is_stateful: true
+}
 op {
   name: "LoopCond"
   input_arg {
@@ -10349,7 +10569,7 @@ op {
     type: "type"
   }
   summary: "Copy a tensor setting everything outside a central band in each innermost matrix"
-  description: "to zero.\n\nThe `band` part is computed as follows:\nAssume `input` has `k` dimensions `[I, J, K, ..., M, N]`, then the output is a\ntensor with the same shape where\n\n`band[i, j, k, ..., m, n] = in_band(m, n) * input[i, j, k, ..., m, n]`.\n\nThe indicator function\n\n`in_band(m, n) = (num_lower < 0 || (m-n) <= num_lower)) &&\n                 (num_upper < 0 || (n-m) <= num_upper)`.\n\nFor example:\n\n```prettyprint\n# if \'input\' is [[ 0,  1,  2, 3]\n                 [-1,  0,  1, 2]\n                 [-2, -1,  0, 1]\n                 [-3, -2, -1, 0]],\n\ntf.matrix_band_part(input, 1, -1) ==> [[ 0,  1,  2, 3]\n                                       [-1,  0,  1, 2]\n                                       [ 0, -1,  0, 1]\n                                       [ 0,  0, -1, 0]],\n\ntf.matrix_band_part(input, 2, 1) ==> [[ 0,  1,  0, 0]\n                                      [-1,  0,  1, 0]\n                                      [-2, -1,  0, 1]\n                                      [ 0, -2, -1, 0]]\n```\n\nUseful special cases:\n\n```prettyprint\n tf.matrix_band_part(input, 0, -1) ==> Upper triangular part.\n tf.matrix_band_part(input, -1, 0) ==> Lower triangular part.\n tf.matrix_band_part(input, 0, 0) ==> Diagonal.\n```"
+  description: "to zero.\n\nThe `band` part is computed as follows:\nAssume `input` has `k` dimensions `[I, J, K, ..., M, N]`, then the output is a\ntensor with the same shape where\n\n`band[i, j, k, ..., m, n] = in_band(m, n) * input[i, j, k, ..., m, n]`.\n\nThe indicator function\n\n`in_band(m, n) = (num_lower < 0 || (m-n) <= num_lower)) &&\n                 (num_upper < 0 || (n-m) <= num_upper)`.\n\nFor example:\n\n```\n# if \'input\' is [[ 0,  1,  2, 3]\n                 [-1,  0,  1, 2]\n                 [-2, -1,  0, 1]\n                 [-3, -2, -1, 0]],\n\ntf.matrix_band_part(input, 1, -1) ==> [[ 0,  1,  2, 3]\n                                       [-1,  0,  1, 2]\n                                       [ 0, -1,  0, 1]\n                                       [ 0,  0, -1, 0]],\n\ntf.matrix_band_part(input, 2, 1) ==> [[ 0,  1,  0, 0]\n                                      [-1,  0,  1, 0]\n                                      [-2, -1,  0, 1]\n                                      [ 0, -2, -1, 0]]\n```\n\nUseful special cases:\n\n```\n tf.matrix_band_part(input, 0, -1) ==> Upper triangular part.\n tf.matrix_band_part(input, -1, 0) ==> Lower triangular part.\n tf.matrix_band_part(input, 0, 0) ==> Diagonal.\n```"
 }
 op {
   name: "MatrixDeterminant"
@@ -10393,7 +10613,7 @@ op {
     type: "type"
   }
   summary: "Returns a batched diagonal tensor with a given batched diagonal values."
-  description: "Given a `diagonal`, this operation returns a tensor with the `diagonal` and\neverything else padded with zeros. The diagonal is computed as follows:\n\nAssume `diagonal` has `k` dimensions `[I, J, K, ..., N]`, then the output is a\ntensor of rank `k+1` with dimensions [I, J, K, ..., N, N]` where:\n\n`output[i, j, k, ..., m, n] = 1{m=n} * diagonal[i, j, k, ..., n]`.\n\nFor example:\n\n```prettyprint\n# \'diagonal\' is [[1, 2, 3, 4], [5, 6, 7, 8]]\n\nand diagonal.shape = (2, 4)\n\ntf.matrix_diag(diagonal) ==> [[[1, 0, 0, 0]\n                                     [0, 2, 0, 0]\n                                     [0, 0, 3, 0]\n                                     [0, 0, 0, 4]],\n                                    [[5, 0, 0, 0]\n                                     [0, 6, 0, 0]\n                                     [0, 0, 7, 0]\n                                     [0, 0, 0, 8]]]\n\nwhich has shape (2, 4, 4)\n```"
+  description: "Given a `diagonal`, this operation returns a tensor with the `diagonal` and\neverything else padded with zeros. The diagonal is computed as follows:\n\nAssume `diagonal` has `k` dimensions `[I, J, K, ..., N]`, then the output is a\ntensor of rank `k+1` with dimensions [I, J, K, ..., N, N]` where:\n\n`output[i, j, k, ..., m, n] = 1{m=n} * diagonal[i, j, k, ..., n]`.\n\nFor example:\n\n```\n# \'diagonal\' is [[1, 2, 3, 4], [5, 6, 7, 8]]\n\nand diagonal.shape = (2, 4)\n\ntf.matrix_diag(diagonal) ==> [[[1, 0, 0, 0]\n                                     [0, 2, 0, 0]\n                                     [0, 0, 3, 0]\n                                     [0, 0, 0, 4]],\n                                    [[5, 0, 0, 0]\n                                     [0, 6, 0, 0]\n                                     [0, 0, 7, 0]\n                                     [0, 0, 0, 8]]]\n\nwhich has shape (2, 4, 4)\n```"
 }
 op {
   name: "MatrixDiagPart"
@@ -10412,7 +10632,7 @@ op {
     type: "type"
   }
   summary: "Returns the batched diagonal part of a batched tensor."
-  description: "This operation returns a tensor with the `diagonal` part\nof the batched `input`. The `diagonal` part is computed as follows:\n\nAssume `input` has `k` dimensions `[I, J, K, ..., M, N]`, then the output is a\ntensor of rank `k - 1` with dimensions `[I, J, K, ..., min(M, N)]` where:\n\n`diagonal[i, j, k, ..., n] = input[i, j, k, ..., n, n]`.\n\nThe input must be at least a matrix.\n\nFor example:\n\n```prettyprint\n# \'input\' is [[[1, 0, 0, 0]\n               [0, 2, 0, 0]\n               [0, 0, 3, 0]\n               [0, 0, 0, 4]],\n              [[5, 0, 0, 0]\n               [0, 6, 0, 0]\n               [0, 0, 7, 0]\n               [0, 0, 0, 8]]]\n\nand input.shape = (2, 4, 4)\n\ntf.matrix_diag_part(input) ==> [[1, 2, 3, 4], [5, 6, 7, 8]]\n\nwhich has shape (2, 4)\n```"
+  description: "This operation returns a tensor with the `diagonal` part\nof the batched `input`. The `diagonal` part is computed as follows:\n\nAssume `input` has `k` dimensions `[I, J, K, ..., M, N]`, then the output is a\ntensor of rank `k - 1` with dimensions `[I, J, K, ..., min(M, N)]` where:\n\n`diagonal[i, j, k, ..., n] = input[i, j, k, ..., n, n]`.\n\nThe input must be at least a matrix.\n\nFor example:\n\n```\n# \'input\' is [[[1, 0, 0, 0]\n               [0, 2, 0, 0]\n               [0, 0, 3, 0]\n               [0, 0, 0, 4]],\n              [[5, 0, 0, 0]\n               [0, 6, 0, 0]\n               [0, 0, 7, 0]\n               [0, 0, 0, 8]]]\n\nand input.shape = (2, 4, 4)\n\ntf.matrix_diag_part(input) ==> [[1, 2, 3, 4], [5, 6, 7, 8]]\n\nwhich has shape (2, 4)\n```"
 }
 op {
   name: "MatrixInverse"
@@ -10791,19 +11011,6 @@ op {
     allowed_values {
       list {
         type: DT_FLOAT
-        type: DT_DOUBLE
-        type: DT_INT64
-        type: DT_INT32
-        type: DT_UINT8
-        type: DT_UINT16
-        type: DT_INT16
-        type: DT_INT8
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
-        type: DT_QINT8
-        type: DT_QUINT8
-        type: DT_QINT32
-        type: DT_HALF
       }
     }
   }
@@ -10878,19 +11085,6 @@ op {
     allowed_values {
       list {
         type: DT_FLOAT
-        type: DT_DOUBLE
-        type: DT_INT64
-        type: DT_INT32
-        type: DT_UINT8
-        type: DT_UINT16
-        type: DT_INT16
-        type: DT_INT8
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
-        type: DT_QINT8
-        type: DT_QUINT8
-        type: DT_QINT32
-        type: DT_HALF
       }
     }
   }
@@ -10903,19 +11097,6 @@ op {
     allowed_values {
       list {
         type: DT_FLOAT
-        type: DT_DOUBLE
-        type: DT_INT64
-        type: DT_INT32
-        type: DT_UINT8
-        type: DT_UINT16
-        type: DT_INT16
-        type: DT_INT8
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
-        type: DT_QINT8
-        type: DT_QUINT8
-        type: DT_QINT32
-        type: DT_HALF
       }
     }
   }
@@ -10988,14 +11169,6 @@ op {
     allowed_values {
       list {
         type: DT_FLOAT
-        type: DT_DOUBLE
-        type: DT_INT32
-        type: DT_INT64
-        type: DT_UINT8
-        type: DT_INT16
-        type: DT_INT8
-        type: DT_UINT16
-        type: DT_HALF
       }
     }
   }
@@ -11752,7 +11925,7 @@ op {
     }
   }
   summary: "Pads a tensor with mirrored values."
-  description: "This operation pads a `input` with mirrored values according to the `paddings`\nyou specify. `paddings` is an integer tensor with shape `[n, 2]`, where n is\nthe rank of `input`. For each dimension D of `input`, `paddings[D, 0]` indicates\nhow many values to add before the contents of `input` in that dimension, and\n`paddings[D, 1]` indicates how many values to add after the contents of `input`\nin that dimension. Both `paddings[D, 0]` and `paddings[D, 1]` must be no greater\nthan `input.dim_size(D)` (or `input.dim_size(D) - 1`) if `copy_border` is true\n(if false, respectively).\n\nThe padded size of each dimension D of the output is:\n\n`paddings(D, 0) + input.dim_size(D) + paddings(D, 1)`\n\nFor example:\n\n```prettyprint\n# \'t\' is [[1, 2, 3], [4, 5, 6]].\n# \'paddings\' is [[1, 1]], [2, 2]].\n# \'mode\' is SYMMETRIC.\n# rank of \'t\' is 2.\npad(t, paddings) ==> [[2, 1, 1, 2, 3, 3, 2]\n                      [2, 1, 1, 2, 3, 3, 2]\n                      [5, 4, 4, 5, 6, 6, 5]\n                      [5, 4, 4, 5, 6, 6, 5]]\n```"
+  description: "This operation pads a `input` with mirrored values according to the `paddings`\nyou specify. `paddings` is an integer tensor with shape `[n, 2]`, where n is\nthe rank of `input`. For each dimension D of `input`, `paddings[D, 0]` indicates\nhow many values to add before the contents of `input` in that dimension, and\n`paddings[D, 1]` indicates how many values to add after the contents of `input`\nin that dimension. Both `paddings[D, 0]` and `paddings[D, 1]` must be no greater\nthan `input.dim_size(D)` (or `input.dim_size(D) - 1`) if `copy_border` is true\n(if false, respectively).\n\nThe padded size of each dimension D of the output is:\n\n`paddings(D, 0) + input.dim_size(D) + paddings(D, 1)`\n\nFor example:\n\n```\n# \'t\' is [[1, 2, 3], [4, 5, 6]].\n# \'paddings\' is [[1, 1]], [2, 2]].\n# \'mode\' is SYMMETRIC.\n# rank of \'t\' is 2.\npad(t, paddings) ==> [[2, 1, 1, 2, 3, 3, 2]\n                      [2, 1, 1, 2, 3, 3, 2]\n                      [5, 4, 4, 5, 6, 6, 5]\n                      [5, 4, 4, 5, 6, 6, 5]]\n```"
 }
 op {
   name: "MirrorPadGrad"
@@ -11800,7 +11973,7 @@ op {
     }
   }
   summary: "Gradient op for `MirrorPad` op. This op folds a mirror-padded tensor."
-  description: "This operation folds the padded areas of `input` by `MirrorPad` according to the\n`paddings` you specify. `paddings` must be the same as `paddings` argument\ngiven to the corresponding `MirrorPad` op.\n\nThe folded size of each dimension D of the output is:\n\n`input.dim_size(D) - paddings(D, 0) - paddings(D, 1)`\n\nFor example:\n\n```prettyprint\n# \'t\' is [[1, 2, 3], [4, 5, 6], [7, 8, 9]].\n# \'paddings\' is [[0, 1]], [0, 1]].\n# \'mode\' is SYMMETRIC.\n# rank of \'t\' is 2.\npad(t, paddings) ==> [[ 1,  5]\n                      [11, 28]]\n```"
+  description: "This operation folds the padded areas of `input` by `MirrorPad` according to the\n`paddings` you specify. `paddings` must be the same as `paddings` argument\ngiven to the corresponding `MirrorPad` op.\n\nThe folded size of each dimension D of the output is:\n\n`input.dim_size(D) - paddings(D, 0) - paddings(D, 1)`\n\nFor example:\n\n```\n# \'t\' is [[1, 2, 3], [4, 5, 6], [7, 8, 9]].\n# \'paddings\' is [[0, 1]], [0, 1]].\n# \'mode\' is SYMMETRIC.\n# rank of \'t\' is 2.\npad(t, paddings) ==> [[ 1,  5]\n                      [11, 28]]\n```"
 }
 op {
   name: "Mod"
@@ -11828,8 +12001,8 @@ op {
       }
     }
   }
-  summary: "Returns element-wise remainder of division."
-  description: "*NOTE*: `Mod` supports broadcasting. More about broadcasting\n[here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)"
+  summary: "Returns element-wise remainder of division. This emulates C semantics in that"
+  description: "the result here is consistent with a truncating divide. E.g. `truncate(x / y) *\ny + truncate_mod(x, y) = x`.\n\n*NOTE*: `Mod` supports broadcasting. More about broadcasting\n[here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)"
 }
 op {
   name: "Mul"
@@ -11894,40 +12067,230 @@ op {
     description: "If either seed or seed2 is set to be non-zero, the internal random number\ngenerator is seeded by the given seed.  Otherwise, a random seed is used."
   }
   attr {
-    name: "seed2"
-    type: "int"
+    name: "seed2"
+    type: "int"
+    default_value {
+      i: 0
+    }
+    description: "A second seed to avoid seed collision."
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_HALF
+      }
+    }
+  }
+  summary: "Draws samples from a multinomial distribution."
+  is_stateful: true
+}
+op {
+  name: "MutableDenseHashTable"
+  input_arg {
+    name: "empty_key"
+    description: "The key used to represent empty key buckets internally. Must not\nbe used in insert or lookup operations."
+    type_attr: "key_dtype"
+  }
+  output_arg {
+    name: "table_handle"
+    description: "Handle to a table."
+    type: DT_STRING
+    is_ref: true
+  }
+  attr {
+    name: "container"
+    type: "string"
+    default_value {
+      s: ""
+    }
+    description: "If non-empty, this table is placed in the given container.\nOtherwise, a default container is used."
+  }
+  attr {
+    name: "shared_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+    description: "If non-empty, this table is shared under the given name across\nmultiple sessions."
+  }
+  attr {
+    name: "use_node_name_sharing"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "key_dtype"
+    type: "type"
+    description: "Type of the table keys."
+  }
+  attr {
+    name: "value_dtype"
+    type: "type"
+    description: "Type of the table values."
+  }
+  attr {
+    name: "value_shape"
+    type: "shape"
+    default_value {
+      shape {
+      }
+    }
+    description: "The shape of each value."
+  }
+  attr {
+    name: "initial_num_buckets"
+    type: "int"
+    default_value {
+      i: 131072
+    }
+    description: "The initial number of hash table buckets. Must be a power\nto 2."
+  }
+  attr {
+    name: "max_load_factor"
+    type: "float"
+    default_value {
+      f: 0.8
+    }
+    description: "The maximum ratio between number of entries and number of\nbuckets before growing the table. Must be between 0 and 1."
+  }
+  summary: "Creates an empty hash table that uses tensors as the backing store."
+  description: "It uses \"open addressing\" with quadratic reprobing to resolve\ncollisions.\n\nThis op creates a mutable hash table, specifying the type of its keys and\nvalues. Each value must be a scalar. Data can be inserted into the table using\nthe insert operations. It does not support the initialization operation."
+  is_stateful: true
+}
+op {
+  name: "MutableDenseHashTableV2"
+  input_arg {
+    name: "empty_key"
+    description: "The key used to represent empty key buckets internally. Must not\nbe used in insert or lookup operations."
+    type_attr: "key_dtype"
+  }
+  output_arg {
+    name: "table_handle"
+    description: "Handle to a table."
+    type: DT_RESOURCE
+  }
+  attr {
+    name: "container"
+    type: "string"
+    default_value {
+      s: ""
+    }
+    description: "If non-empty, this table is placed in the given container.\nOtherwise, a default container is used."
+  }
+  attr {
+    name: "shared_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+    description: "If non-empty, this table is shared under the given name across\nmultiple sessions."
+  }
+  attr {
+    name: "use_node_name_sharing"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "key_dtype"
+    type: "type"
+    description: "Type of the table keys."
+  }
+  attr {
+    name: "value_dtype"
+    type: "type"
+    description: "Type of the table values."
+  }
+  attr {
+    name: "value_shape"
+    type: "shape"
+    default_value {
+      shape {
+      }
+    }
+    description: "The shape of each value."
+  }
+  attr {
+    name: "initial_num_buckets"
+    type: "int"
+    default_value {
+      i: 131072
+    }
+    description: "The initial number of hash table buckets. Must be a power\nto 2."
+  }
+  attr {
+    name: "max_load_factor"
+    type: "float"
+    default_value {
+      f: 0.8
+    }
+    description: "The maximum ratio between number of entries and number of\nbuckets before growing the table. Must be between 0 and 1."
+  }
+  summary: "Creates an empty hash table that uses tensors as the backing store."
+  description: "It uses \"open addressing\" with quadratic reprobing to resolve\ncollisions.\n\nThis op creates a mutable hash table, specifying the type of its keys and\nvalues. Each value must be a scalar. Data can be inserted into the table using\nthe insert operations. It does not support the initialization operation."
+  is_stateful: true
+}
+op {
+  name: "MutableHashTable"
+  output_arg {
+    name: "table_handle"
+    description: "Handle to a table."
+    type: DT_STRING
+    is_ref: true
+  }
+  attr {
+    name: "container"
+    type: "string"
+    default_value {
+      s: ""
+    }
+    description: "If non-empty, this table is placed in the given container.\nOtherwise, a default container is used."
+  }
+  attr {
+    name: "shared_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+    description: "If non-empty, this table is shared under the given name across\nmultiple sessions."
+  }
+  attr {
+    name: "use_node_name_sharing"
+    type: "bool"
     default_value {
-      i: 0
+      b: false
     }
-    description: "A second seed to avoid seed collision."
+    description: "If true and shared_name is empty, the table is shared\nusing the node name."
   }
   attr {
-    name: "T"
+    name: "key_dtype"
     type: "type"
-    allowed_values {
-      list {
-        type: DT_FLOAT
-        type: DT_DOUBLE
-        type: DT_INT32
-        type: DT_INT64
-        type: DT_UINT8
-        type: DT_INT16
-        type: DT_INT8
-        type: DT_UINT16
-        type: DT_HALF
-      }
-    }
+    description: "Type of the table keys."
   }
-  summary: "Draws samples from a multinomial distribution."
+  attr {
+    name: "value_dtype"
+    type: "type"
+    description: "Type of the table values."
+  }
+  summary: "Creates an empty hash table."
+  description: "This op creates a mutable hash table, specifying the type of its keys and\nvalues. Each value must be a scalar. Data can be inserted into the table using\nthe insert operations. It does not support the initialization operation."
   is_stateful: true
 }
 op {
-  name: "MutableDenseHashTable"
-  input_arg {
-    name: "empty_key"
-    description: "The key used to represent empty key buckets internally. Must not\nbe used in insert or lookup operations."
-    type_attr: "key_dtype"
-  }
+  name: "MutableHashTableOfTensors"
   output_arg {
     name: "table_handle"
     description: "Handle to a table."
@@ -11974,35 +12337,17 @@ op {
       shape {
       }
     }
-    description: "The shape of each value."
-  }
-  attr {
-    name: "initial_num_buckets"
-    type: "int"
-    default_value {
-      i: 131072
-    }
-    description: "The initial number of hash table buckets. Must be a power\nto 2."
-  }
-  attr {
-    name: "max_load_factor"
-    type: "float"
-    default_value {
-      f: 0.8
-    }
-    description: "The maximum ratio between number of entries and number of\nbuckets before growing the table. Must be between 0 and 1."
   }
-  summary: "Creates an empty hash table that uses tensors as the backing store. It uses"
-  description: "\"open addressing\" with quadratic reprobing to resolve collisions.\n\nThis op creates a mutable hash table, specifying the type of its keys and\nvalues. Each value must be a scalar. Data can be inserted into the table using\nthe insert operations. It does not support the initialization operation."
+  summary: "Creates an empty hash table."
+  description: "This op creates a mutable hash table, specifying the type of its keys and\nvalues. Each value must be a vector. Data can be inserted into the table using\nthe insert operations. It does not support the initialization operation."
   is_stateful: true
 }
 op {
-  name: "MutableHashTable"
+  name: "MutableHashTableOfTensorsV2"
   output_arg {
     name: "table_handle"
     description: "Handle to a table."
-    type: DT_STRING
-    is_ref: true
+    type: DT_RESOURCE
   }
   attr {
     name: "container"
@@ -12026,7 +12371,6 @@ op {
     default_value {
       b: false
     }
-    description: "If true and shared_name is empty, the table is shared\nusing the node name."
   }
   attr {
     name: "key_dtype"
@@ -12038,17 +12382,24 @@ op {
     type: "type"
     description: "Type of the table values."
   }
+  attr {
+    name: "value_shape"
+    type: "shape"
+    default_value {
+      shape {
+      }
+    }
+  }
   summary: "Creates an empty hash table."
-  description: "This op creates a mutable hash table, specifying the type of its keys and\nvalues. Each value must be a scalar. Data can be inserted into the table using\nthe insert operations. It does not support the initialization operation."
+  description: "This op creates a mutable hash table, specifying the type of its keys and\nvalues. Each value must be a vector. Data can be inserted into the table using\nthe insert operations. It does not support the initialization operation."
   is_stateful: true
 }
 op {
-  name: "MutableHashTableOfTensors"
+  name: "MutableHashTableV2"
   output_arg {
     name: "table_handle"
     description: "Handle to a table."
-    type: DT_STRING
-    is_ref: true
+    type: DT_RESOURCE
   }
   attr {
     name: "container"
@@ -12072,6 +12423,7 @@ op {
     default_value {
       b: false
     }
+    description: "If true and shared_name is empty, the table is shared\nusing the node name."
   }
   attr {
     name: "key_dtype"
@@ -12083,16 +12435,8 @@ op {
     type: "type"
     description: "Type of the table values."
   }
-  attr {
-    name: "value_shape"
-    type: "shape"
-    default_value {
-      shape {
-      }
-    }
-  }
   summary: "Creates an empty hash table."
-  description: "This op creates a mutable hash table, specifying the type of its keys and\nvalues. Each value must be a vector. Data can be inserted into the table using\nthe insert operations. It does not support the initialization operation."
+  description: "This op creates a mutable hash table, specifying the type of its keys and\nvalues. Each value must be a scalar. Data can be inserted into the table using\nthe insert operations. It does not support the initialization operation."
   is_stateful: true
 }
 op {
@@ -12380,7 +12724,7 @@ op {
     description: "Dimension along which to pack.  Negative values wrap around, so the\nvalid range is `[-(R+1), R+1)`."
   }
   summary: "Packs a list of `N` rank-`R` tensors into one rank-`(R+1)` tensor."
-  description: "Packs the `N` tensors in `values` into a tensor with rank one higher than each\ntensor in `values`, by packing them along the `axis` dimension.\nGiven a list of tensors of shape `(A, B, C)`;\n\nif `axis == 0` then the `output` tensor will have the shape `(N, A, B, C)`.\nif `axis == 1` then the `output` tensor will have the shape `(A, N, B, C)`.\nEtc.\n\nFor example:\n\n```prettyprint\n# \'x\' is [1, 4]\n# \'y\' is [2, 5]\n# \'z\' is [3, 6]\npack([x, y, z]) => [[1, 4], [2, 5], [3, 6]]  # Pack along first dim.\npack([x, y, z], axis=1) => [[1, 2, 3], [4, 5, 6]]\n```\n\nThis is the opposite of `unpack`."
+  description: "Packs the `N` tensors in `values` into a tensor with rank one higher than each\ntensor in `values`, by packing them along the `axis` dimension.\nGiven a list of tensors of shape `(A, B, C)`;\n\nif `axis == 0` then the `output` tensor will have the shape `(N, A, B, C)`.\nif `axis == 1` then the `output` tensor will have the shape `(A, N, B, C)`.\nEtc.\n\nFor example:\n\n```\n# \'x\' is [1, 4]\n# \'y\' is [2, 5]\n# \'z\' is [3, 6]\npack([x, y, z]) => [[1, 4], [2, 5], [3, 6]]  # Pack along first dim.\npack([x, y, z], axis=1) => [[1, 2, 3], [4, 5, 6]]\n```\n\nThis is the opposite of `unpack`."
 }
 op {
   name: "Pad"
@@ -12414,7 +12758,7 @@ op {
     }
   }
   summary: "Pads a tensor with zeros."
-  description: "This operation pads a `input` with zeros according to the `paddings` you\nspecify. `paddings` is an integer tensor with shape `[Dn, 2]`, where n is the\nrank of `input`. For each dimension D of `input`, `paddings[D, 0]` indicates\nhow many zeros to add before the contents of `input` in that dimension, and\n`paddings[D, 1]` indicates how many zeros to add after the contents of `input`\nin that dimension.\n\nThe padded size of each dimension D of the output is:\n\n`paddings(D, 0) + input.dim_size(D) + paddings(D, 1)`\n\nFor example:\n\n```prettyprint\n# \'t\' is [[1, 1], [2, 2]]\n# \'paddings\' is [[1, 1], [2, 2]]\n# rank of \'t\' is 2\npad(t, paddings) ==> [[0, 0, 0, 0, 0, 0]\n                      [0, 0, 1, 1, 0, 0]\n                      [0, 0, 2, 2, 0, 0]\n                      [0, 0, 0, 0, 0, 0]]\n```"
+  description: "This operation pads a `input` with zeros according to the `paddings` you\nspecify. `paddings` is an integer tensor with shape `[Dn, 2]`, where n is the\nrank of `input`. For each dimension D of `input`, `paddings[D, 0]` indicates\nhow many zeros to add before the contents of `input` in that dimension, and\n`paddings[D, 1]` indicates how many zeros to add after the contents of `input`\nin that dimension.\n\nThe padded size of each dimension D of the output is:\n\n`paddings(D, 0) + input.dim_size(D) + paddings(D, 1)`\n\nFor example:\n\n```\n# \'t\' is [[1, 1], [2, 2]]\n# \'paddings\' is [[1, 1], [2, 2]]\n# rank of \'t\' is 2\npad(t, paddings) ==> [[0, 0, 0, 0, 0, 0]\n                      [0, 0, 1, 1, 0, 0]\n                      [0, 0, 2, 2, 0, 0]\n                      [0, 0, 0, 0, 0, 0]]\n```"
 }
 op {
   name: "PaddingFIFOQueue"
@@ -12550,7 +12894,7 @@ op {
     description: "the final shape of the result; should be equal to the shapes of any input\nbut with the number of input values in the first dimension."
   }
   summary: "Concatenates a list of `N` tensors along the first dimension."
-  description: "The input tensors are all required to have size 1 in the first dimension.\n\nFor example:\n\n```prettyprint\n# \'x\' is [[1, 4]]\n# \'y\' is [[2, 5]]\n# \'z\' is [[3, 6]]\nparallel_concat([x, y, z]) => [[1, 4], [2, 5], [3, 6]]  # Pack along first dim.\n```\n\nThe difference between concat and parallel_concat is that concat requires all\nof the inputs be computed before the operation will begin but doesn\'t require\nthat the input shapes be known during graph construction.  Parallel concat\nwill copy pieces of the input into the output as they become available, in\nsome situations this can provide a performance benefit."
+  description: "The input tensors are all required to have size 1 in the first dimension.\n\nFor example:\n\n```\n# \'x\' is [[1, 4]]\n# \'y\' is [[2, 5]]\n# \'z\' is [[3, 6]]\nparallel_concat([x, y, z]) => [[1, 4], [2, 5], [3, 6]]  # Pack along first dim.\n```\n\nThe difference between concat and parallel_concat is that concat requires all\nof the inputs be computed before the operation will begin but doesn\'t require\nthat the input shapes be known during graph construction.  Parallel concat\nwill copy pieces of the input into the output as they become available, in\nsome situations this can provide a performance benefit."
 }
 op {
   name: "ParameterizedTruncatedNormal"
@@ -13035,7 +13379,7 @@ op {
     }
   }
   summary: "Compute the polygamma function \\\\(\\psi^{(n)}(x)\\\\)."
-  description: "The polygamma function is defined as:\n\n```\n\\psi^{(n)}(x) = \\frac{d^n}{dx^n} \\psi(x)\n```\nwhere \\\\(\\psi(x)\\\\) is the digamma function."
+  description: "The polygamma function is defined as:\n\n\n\\\\(\\psi^{(n)}(x) = \\frac{d^n}{dx^n} \\psi(x)\\\\)\n\nwhere \\\\(\\psi(x)\\\\) is the digamma function."
 }
 op {
   name: "Pow"
@@ -14810,8 +15154,8 @@ op {
     }
     description: "If the queue has fewer than n elements, this operation\nwill block for up to timeout_ms milliseconds.\nNote: This option is not supported yet."
   }
-  summary: "Dequeues n tuples of one or more tensors from the given queue."
-  description: "If the queue is closed and there are fewer than n elements, then an\nOutOfRange error is returned.\n\nThis operation concatenates queue-element component tensors along the\n0th dimension to make a single component tensor.  All of the components\nin the dequeued tuple will have size n in the 0th dimension.\n\nThis operation has k outputs, where k is the number of components in\nthe tuples stored in the given queue, and output i is the ith\ncomponent of the dequeued tuple.\n\nN.B. If the queue is empty, this operation will block until n elements\nhave been dequeued (or \'timeout_ms\' elapses, if specified)."
+  summary: "Dequeues `n` tuples of one or more tensors from the given queue."
+  description: "If the queue is closed and there are fewer than `n` elements, then an\nOutOfRange error is returned.\n\nThis operation concatenates queue-element component tensors along the\n0th dimension to make a single component tensor.  All of the components\nin the dequeued tuple will have size `n` in the 0th dimension.\n\nThis operation has `k` outputs, where `k` is the number of components in\nthe tuples stored in the given queue, and output `i` is the ith\ncomponent of the dequeued tuple.\n\nN.B. If the queue is empty, this operation will block until `n` elements\nhave been dequeued (or \'timeout_ms\' elapses, if specified)."
 }
 op {
   name: "QueueDequeueManyV2"
@@ -14845,8 +15189,8 @@ op {
     }
     description: "If the queue has fewer than n elements, this operation\nwill block for up to timeout_ms milliseconds.\nNote: This option is not supported yet."
   }
-  summary: "Dequeues n tuples of one or more tensors from the given queue."
-  description: "If the queue is closed and there are fewer than n elements, then an\nOutOfRange error is returned.\n\nThis operation concatenates queue-element component tensors along the\n0th dimension to make a single component tensor.  All of the components\nin the dequeued tuple will have size n in the 0th dimension.\n\nThis operation has k outputs, where k is the number of components in\nthe tuples stored in the given queue, and output i is the ith\ncomponent of the dequeued tuple.\n\nN.B. If the queue is empty, this operation will block until n elements\nhave been dequeued (or \'timeout_ms\' elapses, if specified)."
+  summary: "Dequeues `n` tuples of one or more tensors from the given queue."
+  description: "If the queue is closed and there are fewer than `n` elements, then an\nOutOfRange error is returned.\n\nThis operation concatenates queue-element component tensors along the\n0th dimension to make a single component tensor.  All of the components\nin the dequeued tuple will have size `n` in the 0th dimension.\n\nThis operation has `k` outputs, where `k` is the number of components in\nthe tuples stored in the given queue, and output `i` is the ith\ncomponent of the dequeued tuple.\n\nN.B. If the queue is empty, this operation will block until `n` elements\nhave been dequeued (or \'timeout_ms\' elapses, if specified)."
   is_stateful: true
 }
 op {
@@ -14882,8 +15226,8 @@ op {
     }
     description: "If the queue has fewer than n elements, this operation\nwill block for up to timeout_ms milliseconds.\nNote: This option is not supported yet."
   }
-  summary: "Dequeues n tuples of one or more tensors from the given queue."
-  description: "This operation is not supported by all queues.  If a queue does not support\nDequeueUpTo, then an Unimplemented error is returned.\n\nIf the queue is closed and there are more than 0 but less than n elements\nremaining, then instead of returning an OutOfRange error like\nQueueDequeueMany, less than `n` elements are returned immediately.  If the queue\nis closed and there are 0 elements left in the queue, then an OutOfRange\nerror is returned just like in QueueDequeueMany.  Otherwise the behavior\nis identical to QueueDequeueMany:\n\nThis operation concatenates queue-element component tensors along the\n0th dimension to make a single component tensor.  All of the components\nin the dequeued tuple will have size n in the 0th dimension.\n\nThis operation has k outputs, where k is the number of components in\nthe tuples stored in the given queue, and output i is the ith\ncomponent of the dequeued tuple."
+  summary: "Dequeues `n` tuples of one or more tensors from the given queue."
+  description: "This operation is not supported by all queues.  If a queue does not support\nDequeueUpTo, then an Unimplemented error is returned.\n\nIf the queue is closed and there are more than 0 but less than `n`\nelements remaining, then instead of returning an OutOfRange error like\nQueueDequeueMany, less than `n` elements are returned immediately.  If\nthe queue is closed and there are 0 elements left in the queue, then\nan OutOfRange error is returned just like in QueueDequeueMany.\nOtherwise the behavior is identical to QueueDequeueMany:\n\nThis operation concatenates queue-element component tensors along the\n0th dimension to make a single component tensor.  All of the components\nin the dequeued tuple will have size `n` in the 0th dimension.\n\nThis operation has k outputs, where `k` is the number of components in\nthe tuples stored in the given queue, and output `i` is the ith\ncomponent of the dequeued tuple."
 }
 op {
   name: "QueueDequeueUpToV2"
@@ -14917,8 +15261,8 @@ op {
     }
     description: "If the queue has fewer than n elements, this operation\nwill block for up to timeout_ms milliseconds.\nNote: This option is not supported yet."
   }
-  summary: "Dequeues n tuples of one or more tensors from the given queue."
-  description: "This operation is not supported by all queues.  If a queue does not support\nDequeueUpTo, then an Unimplemented error is returned.\n\nIf the queue is closed and there are more than 0 but less than n elements\nremaining, then instead of returning an OutOfRange error like\nQueueDequeueMany, less than `n` elements are returned immediately.  If the queue\nis closed and there are 0 elements left in the queue, then an OutOfRange\nerror is returned just like in QueueDequeueMany.  Otherwise the behavior\nis identical to QueueDequeueMany:\n\nThis operation concatenates queue-element component tensors along the\n0th dimension to make a single component tensor.  All of the components\nin the dequeued tuple will have size n in the 0th dimension.\n\nThis operation has k outputs, where k is the number of components in\nthe tuples stored in the given queue, and output i is the ith\ncomponent of the dequeued tuple."
+  summary: "Dequeues `n` tuples of one or more tensors from the given queue."
+  description: "This operation is not supported by all queues.  If a queue does not support\nDequeueUpTo, then an Unimplemented error is returned.\n\nIf the queue is closed and there are more than 0 but less than `n`\nelements remaining, then instead of returning an OutOfRange error like\nQueueDequeueMany, less than `n` elements are returned immediately.  If\nthe queue is closed and there are 0 elements left in the queue, then\nan OutOfRange error is returned just like in QueueDequeueMany.\nOtherwise the behavior is identical to QueueDequeueMany:\n\nThis operation concatenates queue-element component tensors along the\n0th dimension to make a single component tensor.  All of the components\nin the dequeued tuple will have size n in the 0th dimension.\n\nThis operation has `k` outputs, where `k` is the number of components in\nthe tuples stored in the given queue, and output `i` is the ith\ncomponent of the dequeued tuple."
   is_stateful: true
 }
 op {
@@ -15774,7 +16118,7 @@ op {
     type: "type"
   }
   summary: "Returns the rank of a tensor."
-  description: "This operation returns an integer representing the rank of `input`.\n\nFor example:\n\n```prettyprint\n# \'t\' is [[[1, 1, 1], [2, 2, 2]], [[3, 3, 3], [4, 4, 4]]]\n# shape of tensor \'t\' is [2, 2, 3]\nrank(t) ==> 3\n```\n\n**Note**: The rank of a tensor is not the same as the rank of a matrix. The rank\nof a tensor is the number of indices required to uniquely select each element\nof the tensor. Rank is also known as \"order\", \"degree\", or \"ndims.\""
+  description: "This operation returns an integer representing the rank of `input`.\n\nFor example:\n\n```\n# \'t\' is [[[1, 1, 1], [2, 2, 2]], [[3, 3, 3], [4, 4, 4]]]\n# shape of tensor \'t\' is [2, 2, 3]\nrank(t) ==> 3\n```\n\n**Note**: The rank of a tensor is not the same as the rank of a matrix. The rank\nof a tensor is the number of indices required to uniquely select each element\nof the tensor. Rank is also known as \"order\", \"degree\", or \"ndims.\""
 }
 op {
   name: "ReadFile"
@@ -16745,7 +17089,7 @@ op {
     }
   }
   summary: "Reshapes a tensor."
-  description: "Given `tensor`, this operation returns a tensor that has the same values\nas `tensor` with shape `shape`.\n\nIf one component of `shape` is the special value -1, the size of that dimension\nis computed so that the total size remains constant.  In particular, a `shape`\nof `[-1]` flattens into 1-D.  At most one component of `shape` can be -1.\n\nIf `shape` is 1-D or higher, then the operation returns a tensor with shape\n`shape` filled with the values of `tensor`. In this case, the number of elements\nimplied by `shape` must be the same as the number of elements in `tensor`.\n\nFor example:\n\n```prettyprint\n# tensor \'t\' is [1, 2, 3, 4, 5, 6, 7, 8, 9]\n# tensor \'t\' has shape [9]\nreshape(t, [3, 3]) ==> [[1, 2, 3],\n                        [4, 5, 6],\n                        [7, 8, 9]]\n\n# tensor \'t\' is [[[1, 1], [2, 2]],\n#                [[3, 3], [4, 4]]]\n# tensor \'t\' has shape [2, 2, 2]\nreshape(t, [2, 4]) ==> [[1, 1, 2, 2],\n                        [3, 3, 4, 4]]\n\n# tensor \'t\' is [[[1, 1, 1],\n#                 [2, 2, 2]],\n#                [[3, 3, 3],\n#                 [4, 4, 4]],\n#                [[5, 5, 5],\n#                 [6, 6, 6]]]\n# tensor \'t\' has shape [3, 2, 3]\n# pass \'[-1]\' to flatten \'t\'\nreshape(t, [-1]) ==> [1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 5, 5, 5, 6, 6, 6]\n\n# -1 can also be used to infer the shape\n\n# -1 is inferred to be 9:\nreshape(t, [2, -1]) ==> [[1, 1, 1, 2, 2, 2, 3, 3, 3],\n                         [4, 4, 4, 5, 5, 5, 6, 6, 6]]\n# -1 is inferred to be 2:\nreshape(t, [-1, 9]) ==> [[1, 1, 1, 2, 2, 2, 3, 3, 3],\n                         [4, 4, 4, 5, 5, 5, 6, 6, 6]]\n# -1 is inferred to be 3:\nreshape(t, [ 2, -1, 3]) ==> [[[1, 1, 1],\n                              [2, 2, 2],\n                              [3, 3, 3]],\n                             [[4, 4, 4],\n                              [5, 5, 5],\n                              [6, 6, 6]]]\n\n# tensor \'t\' is [7]\n# shape `[]` reshapes to a scalar\nreshape(t, []) ==> 7\n```"
+  description: "Given `tensor`, this operation returns a tensor that has the same values\nas `tensor` with shape `shape`.\n\nIf one component of `shape` is the special value -1, the size of that dimension\nis computed so that the total size remains constant.  In particular, a `shape`\nof `[-1]` flattens into 1-D.  At most one component of `shape` can be -1.\n\nIf `shape` is 1-D or higher, then the operation returns a tensor with shape\n`shape` filled with the values of `tensor`. In this case, the number of elements\nimplied by `shape` must be the same as the number of elements in `tensor`.\n\nFor example:\n\n```\n# tensor \'t\' is [1, 2, 3, 4, 5, 6, 7, 8, 9]\n# tensor \'t\' has shape [9]\nreshape(t, [3, 3]) ==> [[1, 2, 3],\n                        [4, 5, 6],\n                        [7, 8, 9]]\n\n# tensor \'t\' is [[[1, 1], [2, 2]],\n#                [[3, 3], [4, 4]]]\n# tensor \'t\' has shape [2, 2, 2]\nreshape(t, [2, 4]) ==> [[1, 1, 2, 2],\n                        [3, 3, 4, 4]]\n\n# tensor \'t\' is [[[1, 1, 1],\n#                 [2, 2, 2]],\n#                [[3, 3, 3],\n#                 [4, 4, 4]],\n#                [[5, 5, 5],\n#                 [6, 6, 6]]]\n# tensor \'t\' has shape [3, 2, 3]\n# pass \'[-1]\' to flatten \'t\'\nreshape(t, [-1]) ==> [1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 5, 5, 5, 6, 6, 6]\n\n# -1 can also be used to infer the shape\n\n# -1 is inferred to be 9:\nreshape(t, [2, -1]) ==> [[1, 1, 1, 2, 2, 2, 3, 3, 3],\n                         [4, 4, 4, 5, 5, 5, 6, 6, 6]]\n# -1 is inferred to be 2:\nreshape(t, [-1, 9]) ==> [[1, 1, 1, 2, 2, 2, 3, 3, 3],\n                         [4, 4, 4, 5, 5, 5, 6, 6, 6]]\n# -1 is inferred to be 3:\nreshape(t, [ 2, -1, 3]) ==> [[[1, 1, 1],\n                              [2, 2, 2],\n                              [3, 3, 3]],\n                             [[4, 4, 4],\n                              [5, 5, 5],\n                              [6, 6, 6]]]\n\n# tensor \'t\' is [7]\n# shape `[]` reshapes to a scalar\nreshape(t, []) ==> 7\n```"
 }
 op {
   name: "ResizeArea"
@@ -18528,6 +18872,81 @@ op {
   description: "Note that in dense implementation of this algorithm, ms and mom will\nupdate even if the grad is zero, but in this sparse implementation, ms\nand mom will not update in iterations during which the grad is zero.\n\nmean_square = decay * mean_square + (1-decay) * gradient ** 2\nDelta = learning_rate * gradient / sqrt(mean_square + epsilon)\n\nms <- rho * ms_{t-1} + (1-rho) * grad * grad\nmom <- momentum * mom_{t-1} + lr * grad / sqrt(ms + epsilon)\nvar <- var - mom"
   is_stateful: true
 }
+op {
+  name: "ResourceStridedSliceAssign"
+  input_arg {
+    name: "ref"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "begin"
+    type_attr: "Index"
+  }
+  input_arg {
+    name: "end"
+    type_attr: "Index"
+  }
+  input_arg {
+    name: "strides"
+    type_attr: "Index"
+  }
+  input_arg {
+    name: "value"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+  attr {
+    name: "Index"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "begin_mask"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  attr {
+    name: "end_mask"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  attr {
+    name: "ellipsis_mask"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  attr {
+    name: "new_axis_mask"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  attr {
+    name: "shrink_axis_mask"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  summary: "Assign `value` to the sliced l-value reference of `ref`."
+  description: "The values of `value` are assigned to the positions in the variable\n`ref` that are selected by the slice parameters. The slice parameters\n`begin, `end`, `strides`, etc. work exactly as in `StridedSlice`.\n\nNOTE this op currently does not support broadcasting and so `value`\'s\nshape must be exactly the shape produced by the slice of `ref`."
+  is_stateful: true
+}
 op {
   name: "Restore"
   input_arg {
@@ -18663,11 +19082,12 @@ op {
         type: DT_DOUBLE
         type: DT_COMPLEX64
         type: DT_COMPLEX128
+        type: DT_STRING
       }
     }
   }
   summary: "Reverses specific dimensions of a tensor."
-  description: "Given a `tensor`, and a `bool` tensor `dims` representing the dimensions\nof `tensor`, this operation reverses each dimension i of `tensor` where\n`dims[i]` is `True`.\n\n`tensor` can have up to 8 dimensions. The number of dimensions\nof `tensor` must equal the number of elements in `dims`. In other words:\n\n`rank(tensor) = size(dims)`\n\nFor example:\n\n```prettyprint\n# tensor \'t\' is [[[[ 0,  1,  2,  3],\n#                  [ 4,  5,  6,  7],\n#                  [ 8,  9, 10, 11]],\n#                 [[12, 13, 14, 15],\n#                  [16, 17, 18, 19],\n#                  [20, 21, 22, 23]]]]\n# tensor \'t\' shape is [1, 2, 3, 4]\n\n# \'dims\' is [False, False, False, True]\nreverse(t, dims) ==> [[[[ 3,  2,  1,  0],\n                        [ 7,  6,  5,  4],\n                        [ 11, 10, 9, 8]],\n                       [[15, 14, 13, 12],\n                        [19, 18, 17, 16],\n                        [23, 22, 21, 20]]]]\n\n# \'dims\' is [False, True, False, False]\nreverse(t, dims) ==> [[[[12, 13, 14, 15],\n                        [16, 17, 18, 19],\n                        [20, 21, 22, 23]\n                       [[ 0,  1,  2,  3],\n                        [ 4,  5,  6,  7],\n                        [ 8,  9, 10, 11]]]]\n\n# \'dims\' is [False, False, True, False]\nreverse(t, dims) ==> [[[[8, 9, 10, 11],\n                        [4, 5, 6, 7],\n                        [0, 1, 2, 3]]\n                       [[20, 21, 22, 23],\n                        [16, 17, 18, 19],\n                        [12, 13, 14, 15]]]]\n```"
+  description: "Given a `tensor`, and a `bool` tensor `dims` representing the dimensions\nof `tensor`, this operation reverses each dimension i of `tensor` where\n`dims[i]` is `True`.\n\n`tensor` can have up to 8 dimensions. The number of dimensions\nof `tensor` must equal the number of elements in `dims`. In other words:\n\n`rank(tensor) = size(dims)`\n\nFor example:\n\n```\n# tensor \'t\' is [[[[ 0,  1,  2,  3],\n#                  [ 4,  5,  6,  7],\n#                  [ 8,  9, 10, 11]],\n#                 [[12, 13, 14, 15],\n#                  [16, 17, 18, 19],\n#                  [20, 21, 22, 23]]]]\n# tensor \'t\' shape is [1, 2, 3, 4]\n\n# \'dims\' is [False, False, False, True]\nreverse(t, dims) ==> [[[[ 3,  2,  1,  0],\n                        [ 7,  6,  5,  4],\n                        [ 11, 10, 9, 8]],\n                       [[15, 14, 13, 12],\n                        [19, 18, 17, 16],\n                        [23, 22, 21, 20]]]]\n\n# \'dims\' is [False, True, False, False]\nreverse(t, dims) ==> [[[[12, 13, 14, 15],\n                        [16, 17, 18, 19],\n                        [20, 21, 22, 23]\n                       [[ 0,  1,  2,  3],\n                        [ 4,  5,  6,  7],\n                        [ 8,  9, 10, 11]]]]\n\n# \'dims\' is [False, False, True, False]\nreverse(t, dims) ==> [[[[8, 9, 10, 11],\n                        [4, 5, 6, 7],\n                        [0, 1, 2, 3]]\n                       [[20, 21, 22, 23],\n                        [16, 17, 18, 19],\n                        [12, 13, 14, 15]]]]\n```"
 }
 op {
   name: "ReverseSequence"
@@ -18717,7 +19137,7 @@ op {
     }
   }
   summary: "Reverses variable length slices."
-  description: "This op first slices `input` along the dimension `batch_dim`, and for each\nslice `i`, reverses the first `seq_lengths[i]` elements along\nthe dimension `seq_dim`.\n\nThe elements of `seq_lengths` must obey `seq_lengths[i] <= input.dims[seq_dim]`,\nand `seq_lengths` must be a vector of length `input.dims[batch_dim]`.\n\nThe output slice `i` along dimension `batch_dim` is then given by input\nslice `i`, with the first `seq_lengths[i]` slices along dimension\n`seq_dim` reversed.\n\nFor example:\n\n```prettyprint\n# Given this:\nbatch_dim = 0\nseq_dim = 1\ninput.dims = (4, 8, ...)\nseq_lengths = [7, 2, 3, 5]\n\n# then slices of input are reversed on seq_dim, but only up to seq_lengths:\noutput[0, 0:7, :, ...] = input[0, 7:0:-1, :, ...]\noutput[1, 0:2, :, ...] = input[1, 2:0:-1, :, ...]\noutput[2, 0:3, :, ...] = input[2, 3:0:-1, :, ...]\noutput[3, 0:5, :, ...] = input[3, 5:0:-1, :, ...]\n\n# while entries past seq_lens are copied through:\noutput[0, 7:, :, ...] = input[0, 7:, :, ...]\noutput[1, 2:, :, ...] = input[1, 2:, :, ...]\noutput[2, 3:, :, ...] = input[2, 3:, :, ...]\noutput[3, 2:, :, ...] = input[3, 2:, :, ...]\n```\n\nIn contrast, if:\n\n```prettyprint\n# Given this:\nbatch_dim = 2\nseq_dim = 0\ninput.dims = (8, ?, 4, ...)\nseq_lengths = [7, 2, 3, 5]\n\n# then slices of input are reversed on seq_dim, but only up to seq_lengths:\noutput[0:7, :, 0, :, ...] = input[7:0:-1, :, 0, :, ...]\noutput[0:2, :, 1, :, ...] = input[2:0:-1, :, 1, :, ...]\noutput[0:3, :, 2, :, ...] = input[3:0:-1, :, 2, :, ...]\noutput[0:5, :, 3, :, ...] = input[5:0:-1, :, 3, :, ...]\n\n# while entries past seq_lens are copied through:\noutput[7:, :, 0, :, ...] = input[7:, :, 0, :, ...]\noutput[2:, :, 1, :, ...] = input[2:, :, 1, :, ...]\noutput[3:, :, 2, :, ...] = input[3:, :, 2, :, ...]\noutput[2:, :, 3, :, ...] = input[2:, :, 3, :, ...]\n```"
+  description: "This op first slices `input` along the dimension `batch_dim`, and for each\nslice `i`, reverses the first `seq_lengths[i]` elements along\nthe dimension `seq_dim`.\n\nThe elements of `seq_lengths` must obey `seq_lengths[i] <= input.dims[seq_dim]`,\nand `seq_lengths` must be a vector of length `input.dims[batch_dim]`.\n\nThe output slice `i` along dimension `batch_dim` is then given by input\nslice `i`, with the first `seq_lengths[i]` slices along dimension\n`seq_dim` reversed.\n\nFor example:\n\n```\n# Given this:\nbatch_dim = 0\nseq_dim = 1\ninput.dims = (4, 8, ...)\nseq_lengths = [7, 2, 3, 5]\n\n# then slices of input are reversed on seq_dim, but only up to seq_lengths:\noutput[0, 0:7, :, ...] = input[0, 7:0:-1, :, ...]\noutput[1, 0:2, :, ...] = input[1, 2:0:-1, :, ...]\noutput[2, 0:3, :, ...] = input[2, 3:0:-1, :, ...]\noutput[3, 0:5, :, ...] = input[3, 5:0:-1, :, ...]\n\n# while entries past seq_lens are copied through:\noutput[0, 7:, :, ...] = input[0, 7:, :, ...]\noutput[1, 2:, :, ...] = input[1, 2:, :, ...]\noutput[2, 3:, :, ...] = input[2, 3:, :, ...]\noutput[3, 2:, :, ...] = input[3, 2:, :, ...]\n```\n\nIn contrast, if:\n\n```\n# Given this:\nbatch_dim = 2\nseq_dim = 0\ninput.dims = (8, ?, 4, ...)\nseq_lengths = [7, 2, 3, 5]\n\n# then slices of input are reversed on seq_dim, but only up to seq_lengths:\noutput[0:7, :, 0, :, ...] = input[7:0:-1, :, 0, :, ...]\noutput[0:2, :, 1, :, ...] = input[2:0:-1, :, 1, :, ...]\noutput[0:3, :, 2, :, ...] = input[3:0:-1, :, 2, :, ...]\noutput[0:5, :, 3, :, ...] = input[5:0:-1, :, 3, :, ...]\n\n# while entries past seq_lens are copied through:\noutput[7:, :, 0, :, ...] = input[7:, :, 0, :, ...]\noutput[2:, :, 1, :, ...] = input[2:, :, 1, :, ...]\noutput[3:, :, 2, :, ...] = input[3:, :, 2, :, ...]\noutput[2:, :, 3, :, ...] = input[2:, :, 3, :, ...]\n```"
 }
 op {
   name: "ReverseV2"
@@ -18764,11 +19184,12 @@ op {
         type: DT_DOUBLE
         type: DT_COMPLEX64
         type: DT_COMPLEX128
+        type: DT_STRING
       }
     }
   }
   summary: "Reverses specific dimensions of a tensor."
-  description: "NOTE `tf.reverse` has now changed behavior in preparation for 1.0.\n`tf.reverse_v2` is currently an alias that will be deprecated before TF 1.0.\n\nGiven a `tensor`, and a `int32` tensor `axis` representing the set of\ndimensions of `tensor` to reverse. This operation reverses each dimension\n`i` for which there exists `j` s.t. `axis[j] == i`.\n\n`tensor` can have up to 8 dimensions. The number of dimensions specified\nin `axis` may be 0 or more entries. If an index is specified more than\nonce, a InvalidArgument error is raised.\n\nFor example:\n\n```prettyprint\n# tensor \'t\' is [[[[ 0,  1,  2,  3],\n#                  [ 4,  5,  6,  7],\n#                  [ 8,  9, 10, 11]],\n#                 [[12, 13, 14, 15],\n#                  [16, 17, 18, 19],\n#                  [20, 21, 22, 23]]]]\n# tensor \'t\' shape is [1, 2, 3, 4]\n\n# \'dims\' is [3] or \'dims\' is -1\nreverse(t, dims) ==> [[[[ 3,  2,  1,  0],\n                        [ 7,  6,  5,  4],\n                        [ 11, 10, 9, 8]],\n                       [[15, 14, 13, 12],\n                        [19, 18, 17, 16],\n                        [23, 22, 21, 20]]]]\n\n# \'dims\' is \'[1]\' (or \'dims\' is \'[-3]\')\nreverse(t, dims) ==> [[[[12, 13, 14, 15],\n                        [16, 17, 18, 19],\n                        [20, 21, 22, 23]\n                       [[ 0,  1,  2,  3],\n                        [ 4,  5,  6,  7],\n                        [ 8,  9, 10, 11]]]]\n\n# \'dims\' is \'[2]\' (or \'dims\' is \'[-2]\')\nreverse(t, dims) ==> [[[[8, 9, 10, 11],\n                        [4, 5, 6, 7],\n                        [0, 1, 2, 3]]\n                       [[20, 21, 22, 23],\n                        [16, 17, 18, 19],\n                        [12, 13, 14, 15]]]]\n```"
+  description: "NOTE `tf.reverse` has now changed behavior in preparation for 1.0.\n`tf.reverse_v2` is currently an alias that will be deprecated before TF 1.0.\n\nGiven a `tensor`, and a `int32` tensor `axis` representing the set of\ndimensions of `tensor` to reverse. This operation reverses each dimension\n`i` for which there exists `j` s.t. `axis[j] == i`.\n\n`tensor` can have up to 8 dimensions. The number of dimensions specified\nin `axis` may be 0 or more entries. If an index is specified more than\nonce, a InvalidArgument error is raised.\n\nFor example:\n\n```\n# tensor \'t\' is [[[[ 0,  1,  2,  3],\n#                  [ 4,  5,  6,  7],\n#                  [ 8,  9, 10, 11]],\n#                 [[12, 13, 14, 15],\n#                  [16, 17, 18, 19],\n#                  [20, 21, 22, 23]]]]\n# tensor \'t\' shape is [1, 2, 3, 4]\n\n# \'dims\' is [3] or \'dims\' is -1\nreverse(t, dims) ==> [[[[ 3,  2,  1,  0],\n                        [ 7,  6,  5,  4],\n                        [ 11, 10, 9, 8]],\n                       [[15, 14, 13, 12],\n                        [19, 18, 17, 16],\n                        [23, 22, 21, 20]]]]\n\n# \'dims\' is \'[1]\' (or \'dims\' is \'[-3]\')\nreverse(t, dims) ==> [[[[12, 13, 14, 15],\n                        [16, 17, 18, 19],\n                        [20, 21, 22, 23]\n                       [[ 0,  1,  2,  3],\n                        [ 4,  5,  6,  7],\n                        [ 8,  9, 10, 11]]]]\n\n# \'dims\' is \'[2]\' (or \'dims\' is \'[-2]\')\nreverse(t, dims) ==> [[[[8, 9, 10, 11],\n                        [4, 5, 6, 7],\n                        [0, 1, 2, 3]]\n                       [[20, 21, 22, 23],\n                        [16, 17, 18, 19],\n                        [12, 13, 14, 15]]]]\n```"
 }
 op {
   name: "Rint"
@@ -19173,7 +19594,7 @@ op {
     description: "If True, the addition will be protected by a lock;\notherwise the behavior is undefined, but may exhibit less contention."
   }
   summary: "Adds sparse updates to a variable reference."
-  description: "This operation computes\n\n    # Scalar indices\n    ref[indices, ...] += updates[...]\n\n    # Vector indices (for each i)\n    ref[indices[i], ...] += updates[i, ...]\n\n    # High rank indices (for each i, ..., j)\n    ref[indices[i, ..., j], ...] += updates[i, ..., j, ...]\n\nThis operation outputs `ref` after the update is done.\nThis makes it easier to chain operations that need to use the reset value.\n\nDuplicate entries are handled correctly: if multiple `indices` reference\nthe same location, their contributions add.\n\nRequires `updates.shape = indices.shape + ref.shape[1:]`.\n\n<div style=\"width:70%; margin:auto; margin-bottom:10px; margin-top:20px;\">\n<img style=\"width:100%\" src=\"../../images/ScatterAdd.png\" alt>\n</div>"
+  description: "This operation computes\n\n    # Scalar indices\n    ref[indices, ...] += updates[...]\n\n    # Vector indices (for each i)\n    ref[indices[i], ...] += updates[i, ...]\n\n    # High rank indices (for each i, ..., j)\n    ref[indices[i, ..., j], ...] += updates[i, ..., j, ...]\n\nThis operation outputs `ref` after the update is done.\nThis makes it easier to chain operations that need to use the reset value.\n\nDuplicate entries are handled correctly: if multiple `indices` reference\nthe same location, their contributions add.\n\nRequires `updates.shape = indices.shape + ref.shape[1:]`.\n\n<div style=\"width:70%; margin:auto; margin-bottom:10px; margin-top:20px;\">\n<img style=\"width:100%\" src=\"https://www.tensorflow.org/images/ScatterAdd.png\" alt>\n</div>"
 }
 op {
   name: "ScatterDiv"
@@ -19313,17 +19734,17 @@ op {
   name: "ScatterNd"
   input_arg {
     name: "indices"
-    description: "A Tensor. Must be one of the following types: int32, int64.\nA tensor of indices into ref."
+    description: "Index tensor."
     type_attr: "Tindices"
   }
   input_arg {
     name: "updates"
-    description: "A Tensor. Must have the same type as tensor. A tensor of updated values\nto store in ref."
+    description: "Updates to scatter into output."
     type_attr: "T"
   }
   input_arg {
     name: "shape"
-    description: "A vector. The shape of the resulting tensor."
+    description: "1-D. The shape of the resulting tensor."
     type_attr: "Tindices"
   }
   output_arg {
@@ -19345,8 +19766,8 @@ op {
       }
     }
   }
-  summary: "Creates a new tensor by applying sparse `updates` to individual"
-  description: "values or slices within a zero tensor of the given `shape` tensor according to\nindices.  This operator is the inverse of the [tf.gather_nd](#gather_nd)\noperator which extracts values or slices from a given tensor.\n\nTODO(simister): Add a link to Variable.__getitem__ documentation on slice\nsyntax.\n\n`shape` is a `TensorShape` with rank `P` and `indices` is a `Tensor` of rank\n`Q`.\n\n`indices` must be integer tensor, containing indices into `shape`.\nIt must be shape `[d_0, ..., d_{Q-2}, K]` where `0 < K <= P`.\n\nThe innermost dimension of `indices` (with length `K`) corresponds to\nindices into elements (if `K = P`) or slices (if `K < P`) along the `K`th\ndimension of `shape`.\n\n`updates` is Tensor of rank `Q-1+P-K` with shape:\n\n```\n[d_0, ..., d_{Q-2}, shape[K], ..., shape[P-1]].\n```\n\nThe simplest form of scatter is to insert individual elements in a tensor by\nindex. For example, say we want to insert 4 scattered elements in a rank-1\ntensor with 8 elements.\n\n<div style=\"width:70%; margin:auto; margin-bottom:10px; margin-top:20px;\">\n<img style=\"width:100%\" src=\"../../images/ScatterNd1.png\" alt>\n</div>\n\nIn Python, this scatter operation would look like this:\n\n```python\n    indices = tf.constant([[4], [3], [1], [7]])\n    updates = tf.constant([9, 10, 11, 12])\n    shape = tf.constant([8])\n    scatter = tf.scatter_nd(indices, updates, shape)\n    with tf.Session() as sess:\n      print sess.run(scatter)\n```\n\nThe resulting tensor would look like this:\n\n    [0, 11, 0, 10, 9, 0, 0, 12]\n\nWe can also, insert entire slices of a higher rank tensor all at once. For\nexample, if we wanted to insert two slices in the first dimension of a\nrank-3 tensor with two matrices of new values.\n\n<div style=\"width:70%; margin:auto; margin-bottom:10px; margin-top:20px;\">\n<img style=\"width:100%\" src=\"../../images/ScatterNd2.png\" alt>\n</div>\n\nIn Python, this scatter operation would look like this:\n\n```python\n    indices = tf.constant([[0], [2]])\n    updates = tf.constant([[[5, 5, 5, 5], [6, 6, 6, 6],\n                            [7, 7, 7, 7], [8, 8, 8, 8]],\n                           [[5, 5, 5, 5], [6, 6, 6, 6],\n                            [7, 7, 7, 7], [8, 8, 8, 8]]])\n    shape = tf.constant([4, 4, 4])\n    scatter = tf.scatter_nd(indices, updates, shape)\n    with tf.Session() as sess:\n      print sess.run(scatter)\n```\n\nThe resulting tensor would look like this:\n\n    [[[5, 5, 5, 5], [6, 6, 6, 6], [7, 7, 7, 7], [8, 8, 8, 8]],\n     [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]],\n     [[5, 5, 5, 5], [6, 6, 6, 6], [7, 7, 7, 7], [8, 8, 8, 8]],\n     [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]]]"
+  summary: "Scatter `updates` into a new (initially zero) tensor according to `indices`."
+  description: "Creates a new tensor by applying sparse `updates` to individual\nvalues or slices within a zero tensor of the given `shape` according to\nindices.  This operator is the inverse of the [tf.gather_nd](#gather_nd)\noperator which extracts values or slices from a given tensor.\n\n**WARNING**: The order in which updates are applied is nondeterministic, so the\noutput will be nondeterministic if `indices` contains duplicates.\n\n`indices` is an integer tensor containing indices into a new tensor of shape\n`shape`.  The last dimension of `indices` can be at most the rank of `shape`:\n\n    indices.shape[-1] <= shape.rank\n\nThe last dimension of `indices` corresponds to indices into elements\n(if `indices.shape[-1] = shape.rank`) or slices\n(if `indices.shape[-1] < shape.rank`) along dimension `indices.shape[-1]` of\n`shape`.  `updates` is a tensor with shape\n\n    indices.shape[:-1] + shape[indices.shape[-1]:]\n\nThe simplest form of scatter is to insert individual elements in a tensor by\nindex. For example, say we want to insert 4 scattered elements in a rank-1\ntensor with 8 elements.\n\n<div style=\"width:70%; margin:auto; margin-bottom:10px; margin-top:20px;\">\n<img style=\"width:100%\" src=\"https://www.tensorflow.org/images/ScatterNd1.png\" alt>\n</div>\n\nIn Python, this scatter operation would look like this:\n\n```python\n    indices = tf.constant([[4], [3], [1], [7]])\n    updates = tf.constant([9, 10, 11, 12])\n    shape = tf.constant([8])\n    scatter = tf.scatter_nd(indices, updates, shape)\n    with tf.Session() as sess:\n      print(sess.run(scatter))\n```\n\nThe resulting tensor would look like this:\n\n    [0, 11, 0, 10, 9, 0, 0, 12]\n\nWe can also, insert entire slices of a higher rank tensor all at once. For\nexample, if we wanted to insert two slices in the first dimension of a\nrank-3 tensor with two matrices of new values.\n\n<div style=\"width:70%; margin:auto; margin-bottom:10px; margin-top:20px;\">\n<img style=\"width:100%\" src=\"https://www.tensorflow.org/images/ScatterNd2.png\" alt>\n</div>\n\nIn Python, this scatter operation would look like this:\n\n```python\n    indices = tf.constant([[0], [2]])\n    updates = tf.constant([[[5, 5, 5, 5], [6, 6, 6, 6],\n                            [7, 7, 7, 7], [8, 8, 8, 8]],\n                           [[5, 5, 5, 5], [6, 6, 6, 6],\n                            [7, 7, 7, 7], [8, 8, 8, 8]]])\n    shape = tf.constant([4, 4, 4])\n    scatter = tf.scatter_nd(indices, updates, shape)\n    with tf.Session() as sess:\n      print(sess.run(scatter))\n```\n\nThe resulting tensor would look like this:\n\n    [[[5, 5, 5, 5], [6, 6, 6, 6], [7, 7, 7, 7], [8, 8, 8, 8]],\n     [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]],\n     [[5, 5, 5, 5], [6, 6, 6, 6], [7, 7, 7, 7], [8, 8, 8, 8]],\n     [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]]]"
 }
 op {
   name: "ScatterNdAdd"
@@ -19596,7 +20017,7 @@ op {
     description: "If True, the subtraction will be protected by a lock;\notherwise the behavior is undefined, but may exhibit less contention."
   }
   summary: "Subtracts sparse updates to a variable reference."
-  description: "```python\n    # Scalar indices\n    ref[indices, ...] -= updates[...]\n\n    # Vector indices (for each i)\n    ref[indices[i], ...] -= updates[i, ...]\n\n    # High rank indices (for each i, ..., j)\n    ref[indices[i, ..., j], ...] -= updates[i, ..., j, ...]\n```\n\nThis operation outputs `ref` after the update is done.\nThis makes it easier to chain operations that need to use the reset value.\n\nDuplicate entries are handled correctly: if multiple `indices` reference\nthe same location, their (negated) contributions add.\n\nRequires `updates.shape = indices.shape + ref.shape[1:]`.\n\n<div style=\"width:70%; margin:auto; margin-bottom:10px; margin-top:20px;\">\n<img style=\"width:100%\" src=\"../../images/ScatterSub.png\" alt>\n</div>"
+  description: "```python\n    # Scalar indices\n    ref[indices, ...] -= updates[...]\n\n    # Vector indices (for each i)\n    ref[indices[i], ...] -= updates[i, ...]\n\n    # High rank indices (for each i, ..., j)\n    ref[indices[i, ..., j], ...] -= updates[i, ..., j, ...]\n```\n\nThis operation outputs `ref` after the update is done.\nThis makes it easier to chain operations that need to use the reset value.\n\nDuplicate entries are handled correctly: if multiple `indices` reference\nthe same location, their (negated) contributions add.\n\nRequires `updates.shape = indices.shape + ref.shape[1:]`.\n\n<div style=\"width:70%; margin:auto; margin-bottom:10px; margin-top:20px;\">\n<img style=\"width:100%\" src=\"https://www.tensorflow.org/images/ScatterSub.png\" alt>\n</div>"
 }
 op {
   name: "ScatterUpdate"
@@ -19645,7 +20066,7 @@ op {
     description: "If True, the assignment will be protected by a lock;\notherwise the behavior is undefined, but may exhibit less contention."
   }
   summary: "Applies sparse updates to a variable reference."
-  description: "This operation computes\n\n```python\n    # Scalar indices\n    ref[indices, ...] = updates[...]\n\n    # Vector indices (for each i)\n    ref[indices[i], ...] = updates[i, ...]\n\n    # High rank indices (for each i, ..., j)\n    ref[indices[i, ..., j], ...] = updates[i, ..., j, ...]\n```\n\nThis operation outputs `ref` after the update is done.\nThis makes it easier to chain operations that need to use the reset value.\n\nIf values in `ref` is to be updated more than once, because there are\nduplicate entries in `indices`, the order at which the updates happen\nfor each value is undefined.\n\nRequires `updates.shape = indices.shape + ref.shape[1:]`.\n\n<div style=\"width:70%; margin:auto; margin-bottom:10px; margin-top:20px;\">\n<img style=\"width:100%\" src=\"../../images/ScatterUpdate.png\" alt>\n</div>"
+  description: "This operation computes\n\n```python\n    # Scalar indices\n    ref[indices, ...] = updates[...]\n\n    # Vector indices (for each i)\n    ref[indices[i], ...] = updates[i, ...]\n\n    # High rank indices (for each i, ..., j)\n    ref[indices[i, ..., j], ...] = updates[i, ..., j, ...]\n```\n\nThis operation outputs `ref` after the update is done.\nThis makes it easier to chain operations that need to use the reset value.\n\nIf values in `ref` is to be updated more than once, because there are\nduplicate entries in `indices`, the order at which the updates happen\nfor each value is undefined.\n\nRequires `updates.shape = indices.shape + ref.shape[1:]`.\n\n<div style=\"width:70%; margin:auto; margin-bottom:10px; margin-top:20px;\">\n<img style=\"width:100%\" src=\"https://www.tensorflow.org/images/ScatterUpdate.png\" alt>\n</div>"
 }
 op {
   name: "SdcaFprint"
@@ -19801,7 +20222,7 @@ op {
     minimum: 1
   }
   summary: "Distributed version of Stochastic Dual Coordinate Ascent (SDCA) optimizer for"
-  description: "linear models with L1 + L2 regularization. As global optimization objective is\nstrongly-convex, the optimizer optimizes the dual objective at each step. The\noptimizer applies each update one example at a time. Examples are sampled\nuniformly, and the optimizer is learning rate free and enjoys linear convergence\nrate.\n\nProximal Stochastic Dual Coordinate Ascent, Shalev-Shwartz, Shai; Zhang, Tong.\n2012 arXiv1211.2717S: http://arxiv.org/pdf/1211.2717v1.pdf\n\n  Loss objective = \\sum f_{i}(wx_{i}) + (l2 / 2) * |w|^2 + l1 * |w|\n\nAdding vs. Averaging in Distributed Primal-Dual Optimization.\nChenxin Ma, Virginia Smith, Martin Jaggi, Michael I. Jordan, Peter Richtarik,\nMartin Takac http://arxiv.org/abs/1502.03508\n\nStochastic Dual Coordinate Ascent with Adaptive Probabilities\nDominik Csiba, Zheng Qu, Peter Richtarik https://arxiv.org/abs/1502.08053"
+  description: "linear models with L1 + L2 regularization. As global optimization objective is\nstrongly-convex, the optimizer optimizes the dual objective at each step. The\noptimizer applies each update one example at a time. Examples are sampled\nuniformly, and the optimizer is learning rate free and enjoys linear convergence\nrate.\n\n[Proximal Stochastic Dual Coordinate Ascent](http://arxiv.org/pdf/1211.2717v1.pdf).<br>\nShai Shalev-Shwartz, Tong Zhang. 2012\n\n$$Loss Objective = \\sum f_{i} (wx_{i}) + (l2 / 2) * |w|^2 + l1 * |w|$$\n\n[Adding vs. Averaging in Distributed Primal-Dual Optimization](http://arxiv.org/abs/1502.03508).<br>\nChenxin Ma, Virginia Smith, Martin Jaggi, Michael I. Jordan,\nPeter Richtarik, Martin Takac. 2015\n\n[Stochastic Dual Coordinate Ascent with Adaptive Probabilities](https://arxiv.org/abs/1502.08053).<br>\nDominik Csiba, Zheng Qu, Peter Richtarik. 2015"
 }
 op {
   name: "SdcaShrinkL1"
@@ -20297,7 +20718,7 @@ op {
     }
   }
   summary: "Returns the shape of a tensor."
-  description: "This operation returns a 1-D integer tensor representing the shape of `input`.\n\nFor example:\n\n```prettyprint\n# \'t\' is [[[1, 1, 1], [2, 2, 2]], [[3, 3, 3], [4, 4, 4]]]\nshape(t) ==> [2, 2, 3]\n```"
+  description: "This operation returns a 1-D integer tensor representing the shape of `input`.\n\nFor example:\n\n```\n# \'t\' is [[[1, 1, 1], [2, 2, 2]], [[3, 3, 3], [4, 4, 4]]]\nshape(t) ==> [2, 2, 3]\n```"
 }
 op {
   name: "ShapeN"
@@ -20511,7 +20932,7 @@ op {
     }
   }
   summary: "Returns the size of a tensor."
-  description: "This operation returns an integer representing the number of elements in\n`input`.\n\nFor example:\n\n```prettyprint\n# \'t\' is [[[1, 1,, 1], [2, 2, 2]], [[3, 3, 3], [4, 4, 4]]]]\nsize(t) ==> 12\n```"
+  description: "This operation returns an integer representing the number of elements in\n`input`.\n\nFor example:\n\n```\n# \'t\' is [[[1, 1,, 1], [2, 2, 2]], [[3, 3, 3], [4, 4, 4]]]]\nsize(t) ==> 12\n```"
 }
 op {
   name: "Skipgram"
@@ -20829,7 +21250,7 @@ op {
   }
   input_arg {
     name: "paddings"
-    description: "2-D tensor of non-negative integers with shape `[2, 2]`. It specifies\n  the padding of the input with zeros across the spatial dimensions as follows:\n\n      paddings = [[pad_top, pad_bottom], [pad_left, pad_right]]\n\n  The effective spatial dimensions of the zero-padded input tensor will be:\n\n      height_pad = pad_top + height + pad_bottom\n      width_pad = pad_left + width + pad_right\n\nThe attr `block_size` must be greater than one. It indicates the block size.\n\n  * Non-overlapping blocks of size `block_size x block size` in the height and\n    width dimensions are rearranged into the batch dimension at each location.\n  * The batch of the output tensor is `batch * block_size * block_size`.\n  * Both height_pad and width_pad must be divisible by block_size.\n\nThe shape of the output will be:\n\n    [batch*block_size*block_size, height_pad/block_size, width_pad/block_size,\n     depth]\n\nSome examples:\n\n(1) For the following input of shape `[1, 2, 2, 1]` and block_size of 2:\n\n```prettyprint\nx = [[[[1], [2]], [[3], [4]]]]\n```\n\nThe output tensor has shape `[4, 1, 1, 1]` and value:\n\n```prettyprint\n[[[[1]]], [[[2]]], [[[3]]], [[[4]]]]\n```\n\n(2) For the following input of shape `[1, 2, 2, 3]` and block_size of 2:\n\n```prettyprint\nx = [[[[1, 2, 3], [4, 5, 6]],\n      [[7, 8, 9], [10, 11, 12]]]]\n```\n\nThe output tensor has shape `[4, 1, 1, 3]` and value:\n\n```prettyprint\n[[[1, 2, 3]], [[4, 5, 6]], [[7, 8, 9]], [[10, 11, 12]]]\n```\n\n(3) For the following input of shape `[1, 4, 4, 1]` and block_size of 2:\n\n```prettyprint\nx = [[[[1],   [2],  [3],  [4]],\n      [[5],   [6],  [7],  [8]],\n      [[9],  [10], [11],  [12]],\n      [[13], [14], [15],  [16]]]]\n```\n\nThe output tensor has shape `[4, 2, 2, 1]` and value:\n\n```prettyprint\nx = [[[[1], [3]], [[9], [11]]],\n     [[[2], [4]], [[10], [12]]],\n     [[[5], [7]], [[13], [15]]],\n     [[[6], [8]], [[14], [16]]]]\n```\n\n(4) For the following input of shape `[2, 2, 4, 1]` and block_size of 2:\n\n```prettyprint\nx = [[[[1],   [2],  [3],  [4]],\n      [[5],   [6],  [7],  [8]]],\n     [[[9],  [10], [11],  [12]],\n      [[13], [14], [15],  [16]]]]\n```\n\nThe output tensor has shape `[8, 1, 2, 1]` and value:\n\n```prettyprint\nx = [[[[1], [3]]], [[[9], [11]]], [[[2], [4]]], [[[10], [12]]],\n     [[[5], [7]]], [[[13], [15]]], [[[6], [8]]], [[[14], [16]]]]\n```\n\nAmong others, this operation is useful for reducing atrous convolution into\nregular convolution."
+    description: "2-D tensor of non-negative integers with shape `[2, 2]`. It specifies\n  the padding of the input with zeros across the spatial dimensions as follows:\n\n      paddings = [[pad_top, pad_bottom], [pad_left, pad_right]]\n\n  The effective spatial dimensions of the zero-padded input tensor will be:\n\n      height_pad = pad_top + height + pad_bottom\n      width_pad = pad_left + width + pad_right\n\nThe attr `block_size` must be greater than one. It indicates the block size.\n\n  * Non-overlapping blocks of size `block_size x block size` in the height and\n    width dimensions are rearranged into the batch dimension at each location.\n  * The batch of the output tensor is `batch * block_size * block_size`.\n  * Both height_pad and width_pad must be divisible by block_size.\n\nThe shape of the output will be:\n\n    [batch*block_size*block_size, height_pad/block_size, width_pad/block_size,\n     depth]\n\nSome examples:\n\n(1) For the following input of shape `[1, 2, 2, 1]` and block_size of 2:\n\n```\nx = [[[[1], [2]], [[3], [4]]]]\n```\n\nThe output tensor has shape `[4, 1, 1, 1]` and value:\n\n```\n[[[[1]]], [[[2]]], [[[3]]], [[[4]]]]\n```\n\n(2) For the following input of shape `[1, 2, 2, 3]` and block_size of 2:\n\n```\nx = [[[[1, 2, 3], [4, 5, 6]],\n      [[7, 8, 9], [10, 11, 12]]]]\n```\n\nThe output tensor has shape `[4, 1, 1, 3]` and value:\n\n```\n[[[1, 2, 3]], [[4, 5, 6]], [[7, 8, 9]], [[10, 11, 12]]]\n```\n\n(3) For the following input of shape `[1, 4, 4, 1]` and block_size of 2:\n\n```\nx = [[[[1],   [2],  [3],  [4]],\n      [[5],   [6],  [7],  [8]],\n      [[9],  [10], [11],  [12]],\n      [[13], [14], [15],  [16]]]]\n```\n\nThe output tensor has shape `[4, 2, 2, 1]` and value:\n\n```\nx = [[[[1], [3]], [[9], [11]]],\n     [[[2], [4]], [[10], [12]]],\n     [[[5], [7]], [[13], [15]]],\n     [[[6], [8]], [[14], [16]]]]\n```\n\n(4) For the following input of shape `[2, 2, 4, 1]` and block_size of 2:\n\n```\nx = [[[[1],   [2],  [3],  [4]],\n      [[5],   [6],  [7],  [8]]],\n     [[[9],  [10], [11],  [12]],\n      [[13], [14], [15],  [16]]]]\n```\n\nThe output tensor has shape `[8, 1, 2, 1]` and value:\n\n```\nx = [[[[1], [3]]], [[[9], [11]]], [[[2], [4]]], [[[10], [12]]],\n     [[[5], [7]]], [[[13], [15]]], [[[6], [8]]], [[[14], [16]]]]\n```\n\nAmong others, this operation is useful for reducing atrous convolution into\nregular convolution."
     type_attr: "Tpaddings"
   }
   output_arg {
@@ -20876,7 +21297,7 @@ op {
   }
   input_arg {
     name: "paddings"
-    description: "2-D with shape `[M, 2]`, all values must be >= 0.\n  `paddings[i] = [pad_start, pad_end]` specifies the padding for input dimension\n  `i + 1`, which corresponds to spatial dimension `i`.  It is required that\n  `block_shape[i]` divides `input_shape[i + 1] + pad_start + pad_end`.\n\nThis operation is equivalent to the following steps:\n\n1. Zero-pad the start and end of dimensions `[1, ..., M]` of the\n   input according to `paddings` to produce `padded` of shape `padded_shape`.\n\n2. Reshape `padded` to `reshaped_padded` of shape:\n\n     [batch] +\n     [padded_shape[1] / block_shape[0],\n       block_shape[0],\n      ...,\n      padded_shape[M] / block_shape[M-1],\n      block_shape[M-1]] +\n     remaining_shape\n\n3. Permute dimensions of `reshaped_padded` to produce\n   `permuted_reshaped_padded` of shape:\n\n     block_shape +\n     [batch] +\n     [padded_shape[1] / block_shape[0],\n      ...,\n      padded_shape[M] / block_shape[M-1]] +\n     remaining_shape\n\n4. Reshape `permuted_reshaped_padded` to flatten `block_shape` into the batch\n   dimension, producing an output tensor of shape:\n\n     [batch * prod(block_shape)] +\n     [padded_shape[1] / block_shape[0],\n      ...,\n      padded_shape[M] / block_shape[M-1]] +\n     remaining_shape\n\nSome examples:\n\n(1) For the following input of shape `[1, 2, 2, 1]`, `block_shape = [2, 2]`, and\n    `paddings = [[0, 0], [0, 0]]`:\n\n```prettyprint\nx = [[[[1], [2]], [[3], [4]]]]\n```\n\nThe output tensor has shape `[4, 1, 1, 1]` and value:\n\n```prettyprint\n[[[[1]]], [[[2]]], [[[3]]], [[[4]]]]\n```\n\n(2) For the following input of shape `[1, 2, 2, 3]`, `block_shape = [2, 2]`, and\n    `paddings = [[0, 0], [0, 0]]`:\n\n```prettyprint\nx = [[[[1, 2, 3], [4, 5, 6]],\n      [[7, 8, 9], [10, 11, 12]]]]\n```\n\nThe output tensor has shape `[4, 1, 1, 3]` and value:\n\n```prettyprint\n[[[1, 2, 3]], [[4, 5, 6]], [[7, 8, 9]], [[10, 11, 12]]]\n```\n\n(3) For the following input of shape `[1, 4, 4, 1]`, `block_shape = [2, 2]`, and\n    `paddings = [[0, 0], [0, 0]]`:\n\n```prettyprint\nx = [[[[1],   [2],  [3],  [4]],\n      [[5],   [6],  [7],  [8]],\n      [[9],  [10], [11],  [12]],\n      [[13], [14], [15],  [16]]]]\n```\n\nThe output tensor has shape `[4, 2, 2, 1]` and value:\n\n```prettyprint\nx = [[[[1], [3]], [[9], [11]]],\n     [[[2], [4]], [[10], [12]]],\n     [[[5], [7]], [[13], [15]]],\n     [[[6], [8]], [[14], [16]]]]\n```\n\n(4) For the following input of shape `[2, 2, 4, 1]`, block_shape = `[2, 2]`, and\n    paddings = `[[0, 0], [2, 0]]`:\n\n```prettyprint\nx = [[[[1],   [2],  [3],  [4]],\n      [[5],   [6],  [7],  [8]]],\n     [[[9],  [10], [11],  [12]],\n      [[13], [14], [15],  [16]]]]\n```\n\nThe output tensor has shape `[8, 1, 3, 1]` and value:\n\n```prettyprint\nx = [[[[0], [1], [3]]], [[[0], [9], [11]]],\n     [[[0], [2], [4]]], [[[0], [10], [12]]],\n     [[[0], [5], [7]]], [[[0], [13], [15]]],\n     [[[0], [6], [8]]], [[[0], [14], [16]]]]\n```\n\nAmong others, this operation is useful for reducing atrous convolution into\nregular convolution."
+    description: "2-D with shape `[M, 2]`, all values must be >= 0.\n  `paddings[i] = [pad_start, pad_end]` specifies the padding for input dimension\n  `i + 1`, which corresponds to spatial dimension `i`.  It is required that\n  `block_shape[i]` divides `input_shape[i + 1] + pad_start + pad_end`.\n\nThis operation is equivalent to the following steps:\n\n1. Zero-pad the start and end of dimensions `[1, ..., M]` of the\n   input according to `paddings` to produce `padded` of shape `padded_shape`.\n\n2. Reshape `padded` to `reshaped_padded` of shape:\n\n     [batch] +\n     [padded_shape[1] / block_shape[0],\n       block_shape[0],\n      ...,\n      padded_shape[M] / block_shape[M-1],\n      block_shape[M-1]] +\n     remaining_shape\n\n3. Permute dimensions of `reshaped_padded` to produce\n   `permuted_reshaped_padded` of shape:\n\n     block_shape +\n     [batch] +\n     [padded_shape[1] / block_shape[0],\n      ...,\n      padded_shape[M] / block_shape[M-1]] +\n     remaining_shape\n\n4. Reshape `permuted_reshaped_padded` to flatten `block_shape` into the batch\n   dimension, producing an output tensor of shape:\n\n     [batch * prod(block_shape)] +\n     [padded_shape[1] / block_shape[0],\n      ...,\n      padded_shape[M] / block_shape[M-1]] +\n     remaining_shape\n\nSome examples:\n\n(1) For the following input of shape `[1, 2, 2, 1]`, `block_shape = [2, 2]`, and\n    `paddings = [[0, 0], [0, 0]]`:\n\n```\nx = [[[[1], [2]], [[3], [4]]]]\n```\n\nThe output tensor has shape `[4, 1, 1, 1]` and value:\n\n```\n[[[[1]]], [[[2]]], [[[3]]], [[[4]]]]\n```\n\n(2) For the following input of shape `[1, 2, 2, 3]`, `block_shape = [2, 2]`, and\n    `paddings = [[0, 0], [0, 0]]`:\n\n```\nx = [[[[1, 2, 3], [4, 5, 6]],\n      [[7, 8, 9], [10, 11, 12]]]]\n```\n\nThe output tensor has shape `[4, 1, 1, 3]` and value:\n\n```\n[[[1, 2, 3]], [[4, 5, 6]], [[7, 8, 9]], [[10, 11, 12]]]\n```\n\n(3) For the following input of shape `[1, 4, 4, 1]`, `block_shape = [2, 2]`, and\n    `paddings = [[0, 0], [0, 0]]`:\n\n```\nx = [[[[1],   [2],  [3],  [4]],\n      [[5],   [6],  [7],  [8]],\n      [[9],  [10], [11],  [12]],\n      [[13], [14], [15],  [16]]]]\n```\n\nThe output tensor has shape `[4, 2, 2, 1]` and value:\n\n```\nx = [[[[1], [3]], [[9], [11]]],\n     [[[2], [4]], [[10], [12]]],\n     [[[5], [7]], [[13], [15]]],\n     [[[6], [8]], [[14], [16]]]]\n```\n\n(4) For the following input of shape `[2, 2, 4, 1]`, block_shape = `[2, 2]`, and\n    paddings = `[[0, 0], [2, 0]]`:\n\n```\nx = [[[[1],   [2],  [3],  [4]],\n      [[5],   [6],  [7],  [8]]],\n     [[[9],  [10], [11],  [12]],\n      [[13], [14], [15],  [16]]]]\n```\n\nThe output tensor has shape `[8, 1, 3, 1]` and value:\n\n```\nx = [[[[0], [1], [3]]], [[[0], [9], [11]]],\n     [[[0], [2], [4]]], [[[0], [10], [12]]],\n     [[[0], [5], [7]]], [[[0], [13], [15]]],\n     [[[0], [6], [8]]], [[[0], [14], [16]]]]\n```\n\nAmong others, this operation is useful for reducing atrous convolution into\nregular convolution."
     type_attr: "Tpaddings"
   }
   output_arg {
@@ -20938,7 +21359,7 @@ op {
     minimum: 2
   }
   summary: "SpaceToDepth for tensors of type T."
-  description: "Rearranges blocks of spatial data, into depth. More specifically,\nthis op outputs a copy of the input tensor where values from the `height`\nand `width` dimensions are moved to the `depth` dimension.\nThe attr `block_size` indicates the input block size and how the data is moved.\n\n  * Non-overlapping blocks of size `block_size x block size` are rearranged\n    into depth at each location.\n  * The depth of the output tensor is `input_depth * block_size * block_size`.\n  * The input tensor\'s height and width must be divisible by block_size.\n\nThat is, assuming the input is in the shape:\n`[batch, height, width, depth]`,\nthe shape of the output will be:\n`[batch, height/block_size, width/block_size, depth*block_size*block_size]`\n\nThis operation requires that the input tensor be of rank 4, and that\n`block_size` be >=1 and a divisor of both the input `height` and `width`.\n\nThis operation is useful for resizing the activations between convolutions\n(but keeping all data), e.g. instead of pooling. It is also useful for training\npurely convolutional models.\n\nFor example, given this input of shape `[1, 2, 2, 1]`, and block_size of 2:\n\n```prettyprint\nx = [[[[1], [2]],\n      [[3], [4]]]]\n```\n\nThis operation will output a tensor of shape `[1, 1, 1, 4]`:\n\n```prettyprint\n[[[[1, 2, 3, 4]]]]\n```\n\nHere, the input has a batch of 1 and each batch element has shape `[2, 2, 1]`,\nthe corresponding output will have a single element (i.e. width and height are\nboth 1) and will have a depth of 4 channels (1 * block_size * block_size).\nThe output element shape is `[1, 1, 4]`.\n\nFor an input tensor with larger depth, here of shape `[1, 2, 2, 3]`, e.g.\n\n```prettyprint\nx = [[[[1, 2, 3], [4, 5, 6]],\n      [[7, 8, 9], [10, 11, 12]]]]\n```\n\nThis operation, for block_size of 2, will return the following tensor of shape\n`[1, 1, 1, 12]`\n\n```prettyprint\n[[[[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]]]]\n```\n\nSimilarly, for the following input of shape `[1 4 4 1]`, and a block size of 2:\n\n```prettyprint\nx = [[[[1],   [2],  [5],  [6]],\n      [[3],   [4],  [7],  [8]],\n      [[9],  [10], [13],  [14]],\n      [[11], [12], [15],  [16]]]]\n```\n\nthe operator will return the following tensor of shape `[1 2 2 4]`:\n\n```prettyprint\nx = [[[[1, 2, 3, 4],\n       [5, 6, 7, 8]],\n      [[9, 10, 11, 12],\n       [13, 14, 15, 16]]]]\n```"
+  description: "Rearranges blocks of spatial data, into depth. More specifically,\nthis op outputs a copy of the input tensor where values from the `height`\nand `width` dimensions are moved to the `depth` dimension.\nThe attr `block_size` indicates the input block size and how the data is moved.\n\n  * Non-overlapping blocks of size `block_size x block size` are rearranged\n    into depth at each location.\n  * The depth of the output tensor is `input_depth * block_size * block_size`.\n  * The input tensor\'s height and width must be divisible by block_size.\n\nThat is, assuming the input is in the shape:\n`[batch, height, width, depth]`,\nthe shape of the output will be:\n`[batch, height/block_size, width/block_size, depth*block_size*block_size]`\n\nThis operation requires that the input tensor be of rank 4, and that\n`block_size` be >=1 and a divisor of both the input `height` and `width`.\n\nThis operation is useful for resizing the activations between convolutions\n(but keeping all data), e.g. instead of pooling. It is also useful for training\npurely convolutional models.\n\nFor example, given this input of shape `[1, 2, 2, 1]`, and block_size of 2:\n\n```\nx = [[[[1], [2]],\n      [[3], [4]]]]\n```\n\nThis operation will output a tensor of shape `[1, 1, 1, 4]`:\n\n```\n[[[[1, 2, 3, 4]]]]\n```\n\nHere, the input has a batch of 1 and each batch element has shape `[2, 2, 1]`,\nthe corresponding output will have a single element (i.e. width and height are\nboth 1) and will have a depth of 4 channels (1 * block_size * block_size).\nThe output element shape is `[1, 1, 4]`.\n\nFor an input tensor with larger depth, here of shape `[1, 2, 2, 3]`, e.g.\n\n```\nx = [[[[1, 2, 3], [4, 5, 6]],\n      [[7, 8, 9], [10, 11, 12]]]]\n```\n\nThis operation, for block_size of 2, will return the following tensor of shape\n`[1, 1, 1, 12]`\n\n```\n[[[[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]]]]\n```\n\nSimilarly, for the following input of shape `[1 4 4 1]`, and a block size of 2:\n\n```\nx = [[[[1],   [2],  [5],  [6]],\n      [[3],   [4],  [7],  [8]],\n      [[9],  [10], [13],  [14]],\n      [[11], [12], [15],  [16]]]]\n```\n\nthe operator will return the following tensor of shape `[1 2 2 4]`:\n\n```\nx = [[[[1, 2, 3, 4],\n       [5, 6, 7, 8]],\n      [[9, 10, 11, 12],\n       [13, 14, 15, 16]]]]\n```"
 }
 op {
   name: "SparseAccumulatorApplyGradient"
@@ -20996,8 +21417,8 @@ op {
     type: "bool"
     description: "Boolean indicating whether gradient_shape is unknown, in which\ncase the input is ignored during validation."
   }
-  summary: "Applies a sparse gradient to a given accumulator. Does not add if local_step is"
-  description: "lesser than the accumulator\'s global_step."
+  summary: "Applies a sparse gradient to a given accumulator."
+  description: "Does not add if local_step is smaller than the accumulator\'s\nglobal_step."
 }
 op {
   name: "SparseAccumulatorTakeGradient"
@@ -21050,8 +21471,8 @@ op {
       }
     }
   }
-  summary: "Extracts the average sparse gradient in the given SparseConditionalAccumulator,"
-  description: "provided that sufficient (i.e., more than num_required) gradients have been\naccumulated. The op will blocks until sufficient gradients have been\naccumulated. If the accumulator has already aggregated more than num_required\ngradients, it will return its average of the accumulated gradients.\nAlso automatically increments the recorded global_step in the accumulator by 1,\nand resets the aggregate to 0."
+  summary: "Extracts the average sparse gradient in a SparseConditionalAccumulator."
+  description: "The op will blocks until sufficient (i.e., more than num_required)\ngradients have been accumulated. If the accumulator has already\naggregated more than num_required gradients, it will return its\naverage of the accumulated gradients.  Also automatically increments\nthe recorded global_step in the accumulator by 1, and resets the\naggregate to 0."
 }
 op {
   name: "SparseAdd"
@@ -22136,10 +22557,115 @@ op {
     }
     description: "If non-empty, this accumulator will be shared under the given name\nacross multiple sessions."
   }
-  summary: "A conditional accumulator for aggregating sparse gradients. The accumulator"
-  description: "accepts gradients marked with local_step greater or equal to the most recent\nglobal_step known to the accumulator. The average can be extracted from the\naccumulator, provided sufficient gradients have been accumulated. Extracting the\naverage automatically resets the aggregate to 0, and increments the global_step\nrecorded by the accumulator."
+  summary: "A conditional accumulator for aggregating sparse gradients."
+  description: "The accumulator accepts gradients marked with local_step greater or\nequal to the most recent global_step known to the accumulator. The\naverage can be extracted from the accumulator, provided sufficient\ngradients have been accumulated. Extracting the average automatically\nresets the aggregate to 0, and increments the global_step recorded by\nthe accumulator."
   is_stateful: true
 }
+op {
+  name: "SparseCross"
+  input_arg {
+    name: "indices"
+    description: "2-D.  Indices of each input `SparseTensor`."
+    type: DT_INT64
+    number_attr: "N"
+  }
+  input_arg {
+    name: "values"
+    description: "1-D.   values of each `SparseTensor`."
+    type_list_attr: "sparse_types"
+  }
+  input_arg {
+    name: "shapes"
+    description: "1-D.   Shapes of each `SparseTensor`."
+    type: DT_INT64
+    number_attr: "N"
+  }
+  input_arg {
+    name: "dense_inputs"
+    description: "2-D.    Columns represented by dense `Tensor`."
+    type_list_attr: "dense_types"
+  }
+  output_arg {
+    name: "output_indices"
+    description: "2-D.  Indices of the concatenated `SparseTensor`."
+    type: DT_INT64
+  }
+  output_arg {
+    name: "output_values"
+    description: "1-D.  Non-empty values of the concatenated or hashed\n`SparseTensor`."
+    type_attr: "out_type"
+  }
+  output_arg {
+    name: "output_shape"
+    description: "1-D.  Shape of the concatenated `SparseTensor`."
+    type: DT_INT64
+  }
+  attr {
+    name: "N"
+    type: "int"
+    has_minimum: true
+  }
+  attr {
+    name: "hashed_output"
+    type: "bool"
+    description: "If true, returns the hash of the cross instead of the string.\nThis will allow us avoiding string manipulations."
+  }
+  attr {
+    name: "num_buckets"
+    type: "int"
+    description: "It is used if hashed_output is true.\noutput = hashed_value%num_buckets if num_buckets > 0 else hashed_value."
+    has_minimum: true
+  }
+  attr {
+    name: "hash_key"
+    type: "int"
+    description: "Specify the hash_key that will be used by the `FingerprintCat64`\nfunction to combine the crosses fingerprints."
+  }
+  attr {
+    name: "sparse_types"
+    type: "list(type)"
+    has_minimum: true
+    allowed_values {
+      list {
+        type: DT_INT64
+        type: DT_STRING
+      }
+    }
+  }
+  attr {
+    name: "dense_types"
+    type: "list(type)"
+    has_minimum: true
+    allowed_values {
+      list {
+        type: DT_INT64
+        type: DT_STRING
+      }
+    }
+  }
+  attr {
+    name: "out_type"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT64
+        type: DT_STRING
+      }
+    }
+  }
+  attr {
+    name: "internal_type"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT64
+        type: DT_STRING
+      }
+    }
+  }
+  summary: "Generates sparse cross from a list of sparse and dense tensors."
+  description: "The op takes two lists, one of 2D `SparseTensor` and one of 2D `Tensor`, each\nrepresenting features of one feature column. It outputs a 2D `SparseTensor` with\nthe batchwise crosses of these features.\n\nFor example, if the inputs are\n\n    inputs[0]: SparseTensor with shape = [2, 2]\n    [0, 0]: \"a\"\n    [1, 0]: \"b\"\n    [1, 1]: \"c\"\n\n    inputs[1]: SparseTensor with shape = [2, 1]\n    [0, 0]: \"d\"\n    [1, 0]: \"e\"\n\n    inputs[2]: Tensor [[\"f\"], [\"g\"]]\n\nthen the output will be\n\n    shape = [2, 2]\n    [0, 0]: \"a_X_d_X_f\"\n    [1, 0]: \"b_X_e_X_g\"\n    [1, 1]: \"c_X_e_X_g\"\n\nif hashed_output=true then the output will be\n\n    shape = [2, 2]\n    [0, 0]: FingerprintCat64(\n                Fingerprint64(\"f\"), FingerprintCat64(\n                    Fingerprint64(\"d\"), Fingerprint64(\"a\")))\n    [1, 0]: FingerprintCat64(\n                Fingerprint64(\"g\"), FingerprintCat64(\n                    Fingerprint64(\"e\"), Fingerprint64(\"b\")))\n    [1, 1]: FingerprintCat64(\n                Fingerprint64(\"g\"), FingerprintCat64(\n                    Fingerprint64(\"e\"), Fingerprint64(\"c\")))"
+}
 op {
   name: "SparseDenseCwiseAdd"
   input_arg {
@@ -23550,7 +24076,7 @@ op {
     has_minimum: true
   }
   summary: "Removes dimensions of size 1 from the shape of a tensor."
-  description: "Given a tensor `input`, this operation returns a tensor of the same type with\nall dimensions of size 1 removed. If you don\'t want to remove all size 1\ndimensions, you can remove specific size 1 dimensions by specifying\n`squeeze_dims`.\n\nFor example:\n\n```prettyprint\n# \'t\' is a tensor of shape [1, 2, 1, 3, 1, 1]\nshape(squeeze(t)) ==> [2, 3]\n```\n\nOr, to remove specific size 1 dimensions:\n\n```prettyprint\n# \'t\' is a tensor of shape [1, 2, 1, 3, 1, 1]\nshape(squeeze(t, [2, 4])) ==> [1, 2, 3, 1]\n```"
+  description: "Given a tensor `input`, this operation returns a tensor of the same type with\nall dimensions of size 1 removed. If you don\'t want to remove all size 1\ndimensions, you can remove specific size 1 dimensions by specifying\n`squeeze_dims`.\n\nFor example:\n\n```\n# \'t\' is a tensor of shape [1, 2, 1, 3, 1, 1]\nshape(squeeze(t)) ==> [2, 3]\n```\n\nOr, to remove specific size 1 dimensions:\n\n```\n# \'t\' is a tensor of shape [1, 2, 1, 3, 1, 1]\nshape(squeeze(t, [2, 4])) ==> [1, 2, 3, 1]\n```"
 }
 op {
   name: "Stack"
@@ -23667,8 +24193,8 @@ op {
     }
     description: "It is necessary to match this name to the matching Unstage Op."
   }
-  summary: "Stage values similar to a lightweight Enqueue.  The basic functionality of this"
-  description: "Op is similar to a queue with many fewer capabilities and options.  This Op is\noptimized for performance."
+  summary: "Stage values similar to a lightweight Enqueue."
+  description: "The basic functionality of this Op is similar to a queue with many\nfewer capabilities and options.  This Op is optimized for performance."
   is_stateful: true
 }
 op {
@@ -23912,7 +24438,7 @@ op {
     description: "a bitmask where bit `i` implies that the `i`th\nspecification should shrink the dimensionality. begin and end\nmust imply a slice of size 1 in the dimension. For example in\npython one might do `foo[:, 3, :]` which would result in\n`shrink_axis_mask` being 2."
   }
   summary: "Return a strided slice from `input`."
-  description: "Note, most python users will want to use the Python `Tensor.__getitem__`\nor `Variable.__getitem__` rather than this op directly.\n\nThe goal of this op is to produce a new tensor with a subset of\nthe elements from the `n` dimensional `input` tensor. The subset is chosen using\na sequence of `m` sparse range specifications encoded into the arguments\nof this function. Note, in some cases\n`m` could be equal to `n`, but this need not be the case. Each\nrange specification entry can be one of the following:\n\n- An ellipsis (...). Ellipses are used to imply zero or more\n  dimensions of full-dimension selection and are produced using\n  `ellipsis_mask`. For example, `foo[...]` is the identity slice.\n\n- A new axis. This is used to insert a new shape=1 dimension and is\n  produced using `new_axis_mask`. For example, `foo[:, ...]` where\n  `foo` is shape `(3, 4)` produces a `(1, 3, 4)` tensor.\n\n\n- A range `begin:end:stride`. This is used to specify how much to choose from\n  a given dimension. `stride` can be any integer but 0.  `begin` is an integer\n  which represents the index of the first value to select while `end` represents\n  the index of the last value to select. The number of values selected in each\n  dimension is `end - begin` if `stride > 0` and `begin - end` if `stride < 0`.\n  `begin` and `end` can be negative where `-1` is the last element, `-2` is\n  the second to last. `begin_mask` controls whether to replace the explicitly\n  given `begin` with an implicit effective value of `0` if `stride > 0` and\n  `-1` if `stride < 0`. `end_mask` is analogous but produces the number\n  required to create the largest open interval. For example, given a shape\n  `(3,)` tensor `foo[:]`, the effective `begin` and `end` are `0` and `3`. Do\n  not assume this is equivalent to `foo[0:-1]` which has an effective `begin`\n  and `end` of `0` and `2`. Another example is `foo[-2::-1]` which reverses the\n  first dimension of a tensor while dropping the last two (in the original\n  order elements). For example `foo = [1,2,3,4]; foo[-2::-1]` is `[4,3]`.\n\n- A single index. This is used to keep only elements that have a given\n  index. For example (`foo[2, :]` on a shape `(5,6)` tensor produces a\n  shape `(6,)` tensor. This is encoded in `begin` and `end` and\n  `shrink_axis_mask`.\n\nEach conceptual range specification is encoded in the op\'s argument. This\nencoding is best understand by considering a non-trivial example. In\nparticular,\n`foo[1, 2:4, None, ..., :-3:-1, :]` will be encoded as\n\n```prettyprint\nbegin = [1, 2, x, x, 0, x] # x denotes don\'t care (usually 0)\nend = [2, 4, x, x, -3, x]\nstrides = [1, 1, x, x, -1, 1]\nbegin_mask = 1<<4 | 1 << 5 = 48\nend_mask = 1<<5 = 32\nellipsis_mask = 1<<3 = 8\nnew_axis_mask = 1<<2 4\nshrink_axis_mask = 1<<0\n```\n\nIn this case if `foo.shape` is (5, 5, 5, 5, 5, 5) the final shape of\nthe slice becomes (2, 1, 5, 5, 2, 5).\nLet us walk step by step through each argument specification.\n\n1.  The first argument in the example slice is turned into `begin = 1` and\n`end = begin + 1 = 2`. To disambiguate from the original spec `2:4` we\nalso set the appropriate bit in `shrink_axis_mask`.\n\n2. `2:4` is contributes 2, 4, 1 to begin, end, and stride. All masks have\nzero bits contributed.\n\n3. None is a synonym for `tf.newaxis`. This means insert a dimension of size 1\ndimension in the final shape. Dummy values are contributed to begin,\nend and stride, while the new_axis_mask bit is set.\n\n4. `...` grab the full ranges from as many dimensions as needed to\nfully specify a slice for every dimension of the input shape.\n\n5. `:-3:-1` shows the use of negative indices. A negative index `i` associated\nwith a dimension that has shape `s` is converted to a positive index\n`s + i`. So `-1` becomes `s-1` (i.e. the last element). This conversion\nis done internally so begin, end and strides receive x, -3, and -1.\nThe appropriate begin_mask bit is set to indicate the start range is the\nfull range (ignoring the x).\n\n6. `:` indicates that the entire contents of the corresponding dimension\nis selected. This is equivalent to `::` or `0::1`. begin, end, and strides\nreceive 0, 0, and 1, respectively. The appropriate bits in `begin_mask` and\n`end_mask` are also set.\n\n*Requirements*:\n  `0 != strides[i] for i in [0, m)`\n  `ellipsis_mask must be a power of two (only one ellipsis)`"
+  description: "Note, most python users will want to use the Python `Tensor.__getitem__`\nor `Variable.__getitem__` rather than this op directly.\n\nThe goal of this op is to produce a new tensor with a subset of\nthe elements from the `n` dimensional `input` tensor. The subset is chosen using\na sequence of `m` sparse range specifications encoded into the arguments\nof this function. Note, in some cases\n`m` could be equal to `n`, but this need not be the case. Each\nrange specification entry can be one of the following:\n\n- An ellipsis (...). Ellipses are used to imply zero or more\n  dimensions of full-dimension selection and are produced using\n  `ellipsis_mask`. For example, `foo[...]` is the identity slice.\n\n- A new axis. This is used to insert a new shape=1 dimension and is\n  produced using `new_axis_mask`. For example, `foo[:, ...]` where\n  `foo` is shape `(3, 4)` produces a `(1, 3, 4)` tensor.\n\n\n- A range `begin:end:stride`. This is used to specify how much to choose from\n  a given dimension. `stride` can be any integer but 0.  `begin` is an integer\n  which represents the index of the first value to select while `end` represents\n  the index of the last value to select. The number of values selected in each\n  dimension is `end - begin` if `stride > 0` and `begin - end` if `stride < 0`.\n  `begin` and `end` can be negative where `-1` is the last element, `-2` is\n  the second to last. `begin_mask` controls whether to replace the explicitly\n  given `begin` with an implicit effective value of `0` if `stride > 0` and\n  `-1` if `stride < 0`. `end_mask` is analogous but produces the number\n  required to create the largest open interval. For example, given a shape\n  `(3,)` tensor `foo[:]`, the effective `begin` and `end` are `0` and `3`. Do\n  not assume this is equivalent to `foo[0:-1]` which has an effective `begin`\n  and `end` of `0` and `2`. Another example is `foo[-2::-1]` which reverses the\n  first dimension of a tensor while dropping the last two (in the original\n  order elements). For example `foo = [1,2,3,4]; foo[-2::-1]` is `[4,3]`.\n\n- A single index. This is used to keep only elements that have a given\n  index. For example (`foo[2, :]` on a shape `(5,6)` tensor produces a\n  shape `(6,)` tensor. This is encoded in `begin` and `end` and\n  `shrink_axis_mask`.\n\nEach conceptual range specification is encoded in the op\'s argument. This\nencoding is best understand by considering a non-trivial example. In\nparticular,\n`foo[1, 2:4, None, ..., :-3:-1, :]` will be encoded as\n\n```\nbegin = [1, 2, x, x, 0, x] # x denotes don\'t care (usually 0)\nend = [2, 4, x, x, -3, x]\nstrides = [1, 1, x, x, -1, 1]\nbegin_mask = 1<<4 | 1 << 5 = 48\nend_mask = 1<<5 = 32\nellipsis_mask = 1<<3 = 8\nnew_axis_mask = 1<<2 4\nshrink_axis_mask = 1<<0\n```\n\nIn this case if `foo.shape` is (5, 5, 5, 5, 5, 5) the final shape of\nthe slice becomes (2, 1, 5, 5, 2, 5).\nLet us walk step by step through each argument specification.\n\n1.  The first argument in the example slice is turned into `begin = 1` and\n`end = begin + 1 = 2`. To disambiguate from the original spec `2:4` we\nalso set the appropriate bit in `shrink_axis_mask`.\n\n2. `2:4` is contributes 2, 4, 1 to begin, end, and stride. All masks have\nzero bits contributed.\n\n3. None is a synonym for `tf.newaxis`. This means insert a dimension of size 1\ndimension in the final shape. Dummy values are contributed to begin,\nend and stride, while the new_axis_mask bit is set.\n\n4. `...` grab the full ranges from as many dimensions as needed to\nfully specify a slice for every dimension of the input shape.\n\n5. `:-3:-1` shows the use of negative indices. A negative index `i` associated\nwith a dimension that has shape `s` is converted to a positive index\n`s + i`. So `-1` becomes `s-1` (i.e. the last element). This conversion\nis done internally so begin, end and strides receive x, -3, and -1.\nThe appropriate begin_mask bit is set to indicate the start range is the\nfull range (ignoring the x).\n\n6. `:` indicates that the entire contents of the corresponding dimension\nis selected. This is equivalent to `::` or `0::1`. begin, end, and strides\nreceive 0, 0, and 1, respectively. The appropriate bits in `begin_mask` and\n`end_mask` are also set.\n\n*Requirements*:\n  `0 != strides[i] for i in [0, m)`\n  `ellipsis_mask must be a power of two (only one ellipsis)`"
 }
 op {
   name: "StridedSliceAssign"
@@ -24780,8 +25306,8 @@ op {
     description: "The handle to a TensorArray (output of TensorArray or TensorArrayGrad)."
     type: DT_RESOURCE
   }
-  summary: "Delete the TensorArray from its resource container.  This enables"
-  description: "the user to close and release the resource in the middle of a step/run."
+  summary: "Delete the TensorArray from its resource container."
+  description: "This enables the user to close and release the resource in the middle\nof a step/run."
   is_stateful: true
 }
 op {
@@ -25572,8 +26098,8 @@ op {
     }
     description: "Overrides the name used for the temporary tensor_array\nresource. Default value is the name of the \'TensorArray\' op (which\nis guaranteed unique)."
   }
-  summary: "An array of Tensors of given size, with data written via Write and read"
-  description: "via Read or Pack."
+  summary: "An array of Tensors of given size."
+  description: "Write data via Write and read via Read or Pack."
   is_stateful: true
 }
 op {
@@ -25813,7 +26339,7 @@ op {
   attr {
     name: "num_sampled"
     type: "int"
-    description: "Number of candidates to randomly sample per batch."
+    description: "Number of candidates to randomly sample."
     has_minimum: true
     minimum: 1
   }
@@ -25847,6 +26373,7 @@ op {
   }
   summary: "Generates labels for candidate sampling with a learned unigram distribution."
   description: "See explanations of candidate sampling and the data formats at\ngo/candidate-sampling.\n\nFor each batch, this op picks a single set of sampled candidate labels.\n\nThe advantages of sampling candidates per-batch are simplicity and the\npossibility of efficient dense matrix multiplication. The disadvantage is that\nthe sampled candidates must be chosen independently of the context and of the\ntrue labels."
+  is_stateful: true
 }
 op {
   name: "Tile"
@@ -26110,8 +26637,8 @@ op {
       }
     }
   }
-  summary: "Returns element-wise remainder of division. This emulates C semantics where"
-  description: "true, this follows C semantics in that the result here is consistent\nwith a flooring divide. E.g. `floor(x / y) * y + mod(x, y) = x`.\n\n*NOTE*: `Mod` supports broadcasting. More about broadcasting\n[here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)"
+  summary: "Returns element-wise remainder of division. This emulates C semantics in that"
+  description: "the result here is consistent with a truncating divide. E.g. `truncate(x / y) *\ny + truncate_mod(x, y) = x`.\n\n*NOTE*: `TruncateMod` supports broadcasting. More about broadcasting\n[here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)"
 }
 op {
   name: "TruncatedNormal"
@@ -26199,7 +26726,7 @@ op {
   attr {
     name: "num_sampled"
     type: "int"
-    description: "Number of candidates to randomly sample per batch."
+    description: "Number of candidates to randomly sample."
     has_minimum: true
     minimum: 1
   }
@@ -26233,6 +26760,7 @@ op {
   }
   summary: "Generates labels for candidate sampling with a uniform distribution."
   description: "See explanations of candidate sampling and the data formats at\ngo/candidate-sampling.\n\nFor each batch, this op picks a single set of sampled candidate labels.\n\nThe advantages of sampling candidates per-batch are simplicity and the\npossibility of efficient dense matrix multiplication. The disadvantage is that\nthe sampled candidates must be chosen independently of the context and of the\ntrue labels."
+  is_stateful: true
 }
 op {
   name: "Unique"
@@ -26269,7 +26797,7 @@ op {
     }
   }
   summary: "Finds unique elements in a 1-D tensor."
-  description: "This operation returns a tensor `y` containing all of the unique elements of `x`\nsorted in the same order that they occur in `x`. This operation also returns a\ntensor `idx` the same size as `x` that contains the index of each value of `x`\nin the unique output `y`. In other words:\n\n`y[idx[i]] = x[i] for i in [0, 1,...,rank(x) - 1]`\n\nFor example:\n\n```prettyprint\n# tensor \'x\' is [1, 1, 2, 4, 4, 4, 7, 8, 8]\ny, idx = unique(x)\ny ==> [1, 2, 4, 7, 8]\nidx ==> [0, 0, 1, 2, 2, 2, 3, 4, 4]\n```"
+  description: "This operation returns a tensor `y` containing all of the unique elements of `x`\nsorted in the same order that they occur in `x`. This operation also returns a\ntensor `idx` the same size as `x` that contains the index of each value of `x`\nin the unique output `y`. In other words:\n\n`y[idx[i]] = x[i] for i in [0, 1,...,rank(x) - 1]`\n\nFor example:\n\n```\n# tensor \'x\' is [1, 1, 2, 4, 4, 4, 7, 8, 8]\ny, idx = unique(x)\ny ==> [1, 2, 4, 7, 8]\nidx ==> [0, 0, 1, 2, 2, 2, 3, 4, 4]\n```"
 }
 op {
   name: "UniqueWithCounts"
@@ -26311,7 +26839,7 @@ op {
     }
   }
   summary: "Finds unique elements in a 1-D tensor."
-  description: "This operation returns a tensor `y` containing all of the unique elements of `x`\nsorted in the same order that they occur in `x`. This operation also returns a\ntensor `idx` the same size as `x` that contains the index of each value of `x`\nin the unique output `y`. Finally, it returns a third tensor `count` that\ncontains the count of each element of `y` in `x`. In other words:\n\n`y[idx[i]] = x[i] for i in [0, 1,...,rank(x) - 1]`\n\nFor example:\n\n```prettyprint\n# tensor \'x\' is [1, 1, 2, 4, 4, 4, 7, 8, 8]\ny, idx, count = unique_with_counts(x)\ny ==> [1, 2, 4, 7, 8]\nidx ==> [0, 0, 1, 2, 2, 2, 3, 4, 4]\ncount ==> [2, 1, 3, 1, 2]\n```"
+  description: "This operation returns a tensor `y` containing all of the unique elements of `x`\nsorted in the same order that they occur in `x`. This operation also returns a\ntensor `idx` the same size as `x` that contains the index of each value of `x`\nin the unique output `y`. Finally, it returns a third tensor `count` that\ncontains the count of each element of `y` in `x`. In other words:\n\n`y[idx[i]] = x[i] for i in [0, 1,...,rank(x) - 1]`\n\nFor example:\n\n```\n# tensor \'x\' is [1, 1, 2, 4, 4, 4, 7, 8, 8]\ny, idx, count = unique_with_counts(x)\ny ==> [1, 2, 4, 7, 8]\nidx ==> [0, 0, 1, 2, 2, 2, 3, 4, 4]\ncount ==> [2, 1, 3, 1, 2]\n```"
 }
 op {
   name: "Unpack"
@@ -26477,8 +27005,8 @@ op {
       s: ""
     }
   }
-  summary: "Op is similar to a lightweight Dequeue.  The basic funtionality is similar to"
-  description: "dequeue with many fewer capabilities and options.  This Op is optimized for\nperformance."
+  summary: "Op is similar to a lightweight Dequeue."
+  description: "The basic funtionality is similar to dequeue with many fewer\ncapabilities and options.  This Op is optimized for performance."
   is_stateful: true
 }
 op {
@@ -26562,7 +27090,7 @@ op {
     type: DT_INT64
   }
   summary: "Returns locations of true values in a boolean tensor."
-  description: "This operation returns the coordinates of true elements in `input`. The\ncoordinates are returned in a 2-D tensor where the first dimension (rows)\nrepresents the number of true elements, and the second dimension (columns)\nrepresents the coordinates of the true elements. Keep in mind, the shape of\nthe output tensor can vary depending on how many true values there are in\n`input`. Indices are output in row-major order.\n\nFor example:\n\n```prettyprint\n# \'input\' tensor is [[True, False]\n#                    [True, False]]\n# \'input\' has two true values, so output has two coordinates.\n# \'input\' has rank of 2, so coordinates have two indices.\nwhere(input) ==> [[0, 0],\n                  [1, 0]]\n\n# `input` tensor is [[[True, False]\n#                     [True, False]]\n#                    [[False, True]\n#                     [False, True]]\n#                    [[False, False]\n#                     [False, True]]]\n# \'input\' has 5 true values, so output has 5 coordinates.\n# \'input\' has rank of 3, so coordinates have three indices.\nwhere(input) ==> [[0, 0, 0],\n                  [0, 1, 0],\n                  [1, 0, 1],\n                  [1, 1, 1],\n                  [2, 1, 1]]\n```"
+  description: "This operation returns the coordinates of true elements in `input`. The\ncoordinates are returned in a 2-D tensor where the first dimension (rows)\nrepresents the number of true elements, and the second dimension (columns)\nrepresents the coordinates of the true elements. Keep in mind, the shape of\nthe output tensor can vary depending on how many true values there are in\n`input`. Indices are output in row-major order.\n\nFor example:\n\n```\n# \'input\' tensor is [[True, False]\n#                    [True, False]]\n# \'input\' has two true values, so output has two coordinates.\n# \'input\' has rank of 2, so coordinates have two indices.\nwhere(input) ==> [[0, 0],\n                  [1, 0]]\n\n# `input` tensor is [[[True, False]\n#                     [True, False]]\n#                    [[False, True]\n#                     [False, True]]\n#                    [[False, False]\n#                     [False, True]]]\n# \'input\' has 5 true values, so output has 5 coordinates.\n# \'input\' has rank of 3, so coordinates have three indices.\nwhere(input) ==> [[0, 0, 0],\n                  [0, 1, 0],\n                  [1, 0, 1],\n                  [1, 1, 1],\n                  [2, 1, 1]]\n```"
 }
 op {
   name: "WholeFileReader"
@@ -26676,5 +27204,5 @@ op {
     }
   }
   summary: "Compute the Hurwitz zeta function \\\\(\\zeta(x, q)\\\\)."
-  description: "The Hurwitz zeta function is defined as:\n\n```\n\\zeta(x, q) = \\sum_{n=0}^{\\infty} (q + n)^{-x}\n```"
+  description: "The Hurwitz zeta function is defined as:\n\n\n\\\\(\\zeta(x, q) = \\sum_{n=0}^{\\infty} (q + n)^{-x}\\\\)"
 }
diff --git a/tensorflow/core/ops/resource_variable_ops.cc b/tensorflow/core/ops/resource_variable_ops.cc
index c190b81dde3a346a08bf82c5b4644bf02e5c6d23..c060aa6be91d6675a6c2acdadd532151e509fcdf 100644
--- a/tensorflow/core/ops/resource_variable_ops.cc
+++ b/tensorflow/core/ops/resource_variable_ops.cc
@@ -295,7 +295,7 @@ the same location, their contributions add.
 Requires `updates.shape = indices.shape + ref.shape[1:]`.
 
 <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
-<img style="width:100%" src="../../images/ScatterAdd.png" alt>
+<img style="width:100%" src="https://www.tensorflow.org/images/ScatterAdd.png" alt>
 </div>
 
 resource: Should be from a `Variable` node.
diff --git a/tensorflow/core/ops/sdca_ops.cc b/tensorflow/core/ops/sdca_ops.cc
index 2029ed7de22fc26bddfc2a8e71eb132e6fe9358c..dea75a1af83456f730a6c98cc40fd26d02ca2fda 100644
--- a/tensorflow/core/ops/sdca_ops.cc
+++ b/tensorflow/core/ops/sdca_ops.cc
@@ -72,17 +72,17 @@ optimizer applies each update one example at a time. Examples are sampled
 uniformly, and the optimizer is learning rate free and enjoys linear convergence
 rate.
 
-Proximal Stochastic Dual Coordinate Ascent, Shalev-Shwartz, Shai; Zhang, Tong.
-2012 arXiv1211.2717S: http://arxiv.org/pdf/1211.2717v1.pdf
+[Proximal Stochastic Dual Coordinate Ascent](http://arxiv.org/pdf/1211.2717v1.pdf).<br>
+Shai Shalev-Shwartz, Tong Zhang. 2012
 
-  Loss objective = \sum f_{i}(wx_{i}) + (l2 / 2) * |w|^2 + l1 * |w|
+$$Loss Objective = \sum f_{i} (wx_{i}) + (l2 / 2) * |w|^2 + l1 * |w|$$
 
-Adding vs. Averaging in Distributed Primal-Dual Optimization.
-Chenxin Ma, Virginia Smith, Martin Jaggi, Michael I. Jordan, Peter Richtarik,
-Martin Takac http://arxiv.org/abs/1502.03508
+[Adding vs. Averaging in Distributed Primal-Dual Optimization](http://arxiv.org/abs/1502.03508).<br>
+Chenxin Ma, Virginia Smith, Martin Jaggi, Michael I. Jordan,
+Peter Richtarik, Martin Takac. 2015
 
-Stochastic Dual Coordinate Ascent with Adaptive Probabilities
-Dominik Csiba, Zheng Qu, Peter Richtarik https://arxiv.org/abs/1502.08053
+[Stochastic Dual Coordinate Ascent with Adaptive Probabilities](https://arxiv.org/abs/1502.08053).<br>
+Dominik Csiba, Zheng Qu, Peter Richtarik. 2015
 
 loss_type: Type of the primal loss. Currently SdcaSolver supports logistic,
   squared and hinge losses.
@@ -105,7 +105,7 @@ example_weights: a vector which contains the weight associated with each
 example_labels: a vector which contains the label/target associated with each
   example.
 sparse_indices: a list of vectors where each value is the indices which has
-  corresponding weights in sparse_weights. This field maybe ommitted for the
+  corresponding weights in sparse_weights. This field maybe omitted for the
   dense approach.
 sparse_weights: a list of vectors where each value is the weight associated with
   a sparse feature group.
diff --git a/tensorflow/core/ops/sparse_ops.cc b/tensorflow/core/ops/sparse_ops.cc
index b90f7a5dfb8874eae6a60e2c23e8941514316f84..9bbf37cfc2cae4bdfdd61f71f43a2ced501a1503 100644
--- a/tensorflow/core/ops/sparse_ops.cc
+++ b/tensorflow/core/ops/sparse_ops.cc
@@ -456,6 +456,84 @@ concat_dim: Dimension to concatenate along. Must be in range [-rank, rank),
     where rank is the number of dimensions in each input `SparseTensor`.
 )doc");
 
+REGISTER_OP("SparseCross")
+    .Input("indices: N * int64")
+    .Input("values: sparse_types")
+    .Input("shapes: N * int64")
+    .Input("dense_inputs: dense_types")
+    .Output("output_indices: int64")
+    .Output("output_values: out_type")
+    .Output("output_shape: int64")
+    .Attr("N: int >= 0")
+    .Attr("hashed_output: bool")
+    .Attr("num_buckets: int >= 0")
+    .Attr("hash_key: int")
+    .Attr("sparse_types: list({int64, string}) >= 0")
+    .Attr("dense_types: list({int64, string}) >= 0")
+    .Attr("out_type: {int64, string}")
+    .Attr("internal_type: {int64, string}")
+    .SetShapeFn([](shape_inference::InferenceContext* c) {
+      c->set_output(0, c->Matrix(c->UnknownDim(), 2));
+      c->set_output(1, c->Vector(c->UnknownDim()));
+      c->set_output(2, c->Vector(2));
+      return Status::OK();
+    })
+    .Doc(R"doc(
+Generates sparse cross from a list of sparse and dense tensors.
+
+The op takes two lists, one of 2D `SparseTensor` and one of 2D `Tensor`, each
+representing features of one feature column. It outputs a 2D `SparseTensor` with
+the batchwise crosses of these features.
+
+For example, if the inputs are
+
+    inputs[0]: SparseTensor with shape = [2, 2]
+    [0, 0]: "a"
+    [1, 0]: "b"
+    [1, 1]: "c"
+
+    inputs[1]: SparseTensor with shape = [2, 1]
+    [0, 0]: "d"
+    [1, 0]: "e"
+
+    inputs[2]: Tensor [["f"], ["g"]]
+
+then the output will be
+
+    shape = [2, 2]
+    [0, 0]: "a_X_d_X_f"
+    [1, 0]: "b_X_e_X_g"
+    [1, 1]: "c_X_e_X_g"
+
+if hashed_output=true then the output will be
+
+    shape = [2, 2]
+    [0, 0]: FingerprintCat64(
+                Fingerprint64("f"), FingerprintCat64(
+                    Fingerprint64("d"), Fingerprint64("a")))
+    [1, 0]: FingerprintCat64(
+                Fingerprint64("g"), FingerprintCat64(
+                    Fingerprint64("e"), Fingerprint64("b")))
+    [1, 1]: FingerprintCat64(
+                Fingerprint64("g"), FingerprintCat64(
+                    Fingerprint64("e"), Fingerprint64("c")))
+
+indices: 2-D.  Indices of each input `SparseTensor`.
+values: 1-D.   values of each `SparseTensor`.
+shapes: 1-D.   Shapes of each `SparseTensor`.
+dense_inputs: 2-D.    Columns represented by dense `Tensor`.
+hashed_output: If true, returns the hash of the cross instead of the string.
+  This will allow us avoiding string manipulations.
+num_buckets: It is used if hashed_output is true.
+  output = hashed_value%num_buckets if num_buckets > 0 else hashed_value.
+hash_key: Specify the hash_key that will be used by the `FingerprintCat64`
+  function to combine the crosses fingerprints.
+output_indices: 2-D.  Indices of the concatenated `SparseTensor`.
+output_values: 1-D.  Non-empty values of the concatenated or hashed
+  `SparseTensor`.
+output_shape: 1-D.  Shape of the concatenated `SparseTensor`.
+)doc");
+
 REGISTER_OP("SparseSplit")
     .Input("split_dim: int64")
     .Input("indices: int64")
diff --git a/tensorflow/core/ops/state_ops.cc b/tensorflow/core/ops/state_ops.cc
index cfb3ea7141172e1429051c7bea88c918969d0124..0890d5fc7c77ac4e930f69680345d17ef9bff364 100644
--- a/tensorflow/core/ops/state_ops.cc
+++ b/tensorflow/core/ops/state_ops.cc
@@ -288,7 +288,7 @@ for each value is undefined.
 Requires `updates.shape = indices.shape + ref.shape[1:]`.
 
 <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
-<img style="width:100%" src="../../images/ScatterUpdate.png" alt>
+<img style="width:100%" src="https://www.tensorflow.org/images/ScatterUpdate.png" alt>
 </div>
 
 ref: Should be from a `Variable` node.
@@ -332,7 +332,7 @@ the same location, their contributions add.
 Requires `updates.shape = indices.shape + ref.shape[1:]`.
 
 <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
-<img style="width:100%" src="../../images/ScatterAdd.png" alt>
+<img style="width:100%" src="https://www.tensorflow.org/images/ScatterAdd.png" alt>
 </div>
 
 ref: Should be from a `Variable` node.
@@ -376,7 +376,7 @@ the same location, their (negated) contributions add.
 Requires `updates.shape = indices.shape + ref.shape[1:]`.
 
 <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
-<img style="width:100%" src="../../images/ScatterSub.png" alt>
+<img style="width:100%" src="https://www.tensorflow.org/images/ScatterSub.png" alt>
 </div>
 
 ref: Should be from a `Variable` node.
diff --git a/tensorflow/core/ops/training_ops.cc b/tensorflow/core/ops/training_ops.cc
index f683f25f7e7af7debed02484e6ee56d9943083e1..ff7b4e8ec81f9d4792cd8c876086cd5031d63c6c 100644
--- a/tensorflow/core/ops/training_ops.cc
+++ b/tensorflow/core/ops/training_ops.cc
@@ -1174,6 +1174,7 @@ REGISTER_OP("ApplyAdam")
     .Output("out: Ref(T)")
     .Attr("T: numbertype")
     .Attr("use_locking: bool = false")
+    .Attr("use_nesterov: bool = false")
     .SetShapeFn([](InferenceContext* c) {
       return ApplyAdamShapeFn(c, false /* sparse */);
     })
@@ -1199,6 +1200,7 @@ out: Same as "var".
 use_locking: If `True`, updating of the var, m, and v tensors will be protected
   by a lock; otherwise the behavior is undefined, but may exhibit less
   contention.
+use_nesterov: If `True`, uses the nesterov update.
 )doc");
 
 REGISTER_OP("ResourceApplyAdam")
@@ -1214,6 +1216,7 @@ REGISTER_OP("ResourceApplyAdam")
     .Input("grad: T")
     .Attr("T: numbertype")
     .Attr("use_locking: bool = false")
+    .Attr("use_nesterov: bool = false")
     .SetShapeFn([](InferenceContext* c) {
       return ApplyAdamShapeFn(c, false /* sparse */);
     })
@@ -1238,6 +1241,7 @@ grad: The gradient.
 use_locking: If `True`, updating of the var, m, and v tensors will be protected
   by a lock; otherwise the behavior is undefined, but may exhibit less
   contention.
+use_nesterov: If `True`, uses the nesterov update.
 )doc");
 
 static Status ApplyRMSPropShapeFn(InferenceContext* c, bool sparse) {
diff --git a/tensorflow/core/ops/training_ops_test.cc b/tensorflow/core/ops/training_ops_test.cc
index 9c3489211c8eed4013555e5aee7d53817e43e8a1..da66fbe4ba432d5a29470d67cef3dfd07b56034b 100644
--- a/tensorflow/core/ops/training_ops_test.cc
+++ b/tensorflow/core/ops/training_ops_test.cc
@@ -21,9 +21,9 @@ limitations under the License.
 namespace tensorflow {
 
 // Used for testing the grad+indices handling for SparseApplyXYZ tests.
-static void TestGradAndIndicesErrorHandling(ShapeInferenceTestOp op,
+static void TestGradAndIndicesErrorHandling(const ShapeInferenceTestOp& op,
                                             string shape_spec_middle,
-                                            string shape_spec_end = "") {
+                                            const string& shape_spec_end = "") {
   auto shape_spec = [&shape_spec_middle, shape_spec_end](
       const char* var_spec, const char* grad_indices_spec) {
     return strings::StrCat(var_spec, ";", shape_spec_middle, ";",
diff --git a/tensorflow/core/platform/cloud/gcs_file_system.cc b/tensorflow/core/platform/cloud/gcs_file_system.cc
index 5ee3099673f158e4736c43795027f4680de9c963..97e4c207d86fc6c8c12595be2de8707e9b3832f6 100644
--- a/tensorflow/core/platform/cloud/gcs_file_system.cc
+++ b/tensorflow/core/platform/cloud/gcs_file_system.cc
@@ -52,6 +52,8 @@ constexpr int kGetChildrenDefaultPageSize = 1000;
 constexpr uint64 kUploadRetryDelayMicros = 1000000L;
 // The HTTP response code "308 Resume Incomplete".
 constexpr uint64 HTTP_CODE_RESUME_INCOMPLETE = 308;
+// The environment variable that overrides the size of the readahead buffer.
+constexpr char kReadaheadBufferSize[] = "GCS_READAHEAD_BUFFER_SIZE_BYTES";
 
 // The file statistics returned by Stat() for directories.
 const FileStatistics DIRECTORY_STAT(0, 0, true);
@@ -585,7 +587,16 @@ class GcsReadOnlyMemoryRegion : public ReadOnlyMemoryRegion {
 
 GcsFileSystem::GcsFileSystem()
     : auth_provider_(new GoogleAuthProvider()),
-      http_request_factory_(new HttpRequest::Factory()) {}
+      http_request_factory_(new HttpRequest::Factory()) {
+  // Apply the sys env override for the readahead buffer size if it's provided.
+  const char* readahead_buffer_size = std::getenv(kReadaheadBufferSize);
+  if (readahead_buffer_size) {
+    uint64 value;
+    if (strings::safe_strtou64(readahead_buffer_size, &value)) {
+      read_ahead_bytes_ = value;
+    }
+  }
+}
 
 GcsFileSystem::GcsFileSystem(
     std::unique_ptr<AuthProvider> auth_provider,
diff --git a/tensorflow/core/platform/cloud/gcs_file_system.h b/tensorflow/core/platform/cloud/gcs_file_system.h
index 6a6437f070e348dc9b0c98c5550dceedeecfa13d..18d2de482bb27298bea7f45ad8c6f167fab6c286 100644
--- a/tensorflow/core/platform/cloud/gcs_file_system.h
+++ b/tensorflow/core/platform/cloud/gcs_file_system.h
@@ -74,6 +74,7 @@ class GcsFileSystem : public FileSystem {
 
   Status DeleteRecursively(const string& dirname, int64* undeleted_files,
                            int64* undeleted_dirs) override;
+  size_t get_readahead_buffer_size() const { return read_ahead_bytes_; }
 
  private:
   /// \brief Checks if the bucket exists. Returns OK if the check succeeded.
@@ -112,7 +113,7 @@ class GcsFileSystem : public FileSystem {
 
   // The number of bytes to read ahead for buffering purposes in the
   // RandomAccessFile implementation. Defaults to 256Mb.
-  const size_t read_ahead_bytes_ = 256 * 1024 * 1024;
+  size_t read_ahead_bytes_ = 256 * 1024 * 1024;
 
   // The initial delay for exponential backoffs when retrying failed calls.
   const int64 initial_retry_delay_usec_ = 1000000L;
diff --git a/tensorflow/core/platform/cloud/gcs_file_system_test.cc b/tensorflow/core/platform/cloud/gcs_file_system_test.cc
index fc79f3be110569d959dd33d50bd3fc37fe83327e..c3a8678fbc6fce15354a2b50a742f02413c46ace 100644
--- a/tensorflow/core/platform/cloud/gcs_file_system_test.cc
+++ b/tensorflow/core/platform/cloud/gcs_file_system_test.cc
@@ -1617,5 +1617,14 @@ TEST(GcsFileSystemTest, DeleteRecursively_NotAFolder) {
   EXPECT_EQ(1, undeleted_dirs);
 }
 
+TEST(GcsFileSystemTest, OverrideReadaheadBufferSize) {
+  GcsFileSystem fs1;
+  EXPECT_EQ(256 * 1024 * 1024, fs1.get_readahead_buffer_size());
+
+  setenv("GCS_READAHEAD_BUFFER_SIZE_BYTES", "123456789", 1);
+  GcsFileSystem fs2;
+  EXPECT_EQ(123456789L, fs2.get_readahead_buffer_size());
+}
+
 }  // namespace
 }  // namespace tensorflow
diff --git a/tensorflow/core/platform/cloud/google_auth_provider.cc b/tensorflow/core/platform/cloud/google_auth_provider.cc
index 6f29d4597f18dce3813c69f86f1cdab3eca561ff..f70b431b6524cc566fa9c20fbdd5c4d555b501c6 100644
--- a/tensorflow/core/platform/cloud/google_auth_provider.cc
+++ b/tensorflow/core/platform/cloud/google_auth_provider.cc
@@ -67,7 +67,7 @@ constexpr char kGceTokenUrl[] =
 // The authentication token scope to request.
 constexpr char kOAuthScope[] = "https://www.googleapis.com/auth/cloud-platform";
 
-// The default intial delay between retries with exponential backoff.
+// The default initial delay between retries with exponential backoff.
 constexpr int kInitialRetryDelayUsec = 500000;  // 0.5 sec
 
 /// Returns whether the given path points to a readable file.
diff --git a/tensorflow/core/platform/cloud/http_request.cc b/tensorflow/core/platform/cloud/http_request.cc
index 825741f61409eb762a0f7ea6c4a8d559c17d525d..2d0141e50e786b8275272cce29be269c6fe8afe0 100644
--- a/tensorflow/core/platform/cloud/http_request.cc
+++ b/tensorflow/core/platform/cloud/http_request.cc
@@ -35,6 +35,10 @@ constexpr uint32 kRequestTimeoutSeconds = 3600;  // 1 hour
 // Timeout for the connection phase.
 constexpr uint32 kConnectTimeoutSeconds = 120;  // 2 minutes
 
+// The maximum period of request inactivity, after which the request
+// is terminated.
+constexpr uint64 kInactivityTimeoutSeconds = 60;  // 1 minute
+
 // Proxy to the real libcurl implementation.
 class LibCurlProxy : public LibCurl {
  public:
@@ -75,6 +79,13 @@ class LibCurlProxy : public LibCurl {
     return ::curl_easy_setopt(curl, option, param);
   }
 
+  CURLcode curl_easy_setopt(CURL* curl, CURLoption option,
+                            int (*param)(void* clientp, curl_off_t dltotal,
+                                         curl_off_t dlnow, curl_off_t ultotal,
+                                         curl_off_t ulnow)) override {
+    return ::curl_easy_setopt(curl, option, param);
+  }
+
   CURLcode curl_easy_perform(CURL* curl) override {
     return ::curl_easy_perform(curl);
   }
@@ -111,7 +122,8 @@ class LibCurlProxy : public LibCurl {
 
 HttpRequest::HttpRequest() : HttpRequest(LibCurlProxy::Load()) {}
 
-HttpRequest::HttpRequest(LibCurl* libcurl) : libcurl_(libcurl) {
+HttpRequest::HttpRequest(LibCurl* libcurl, Env* env)
+    : libcurl_(libcurl), env_(env) {
   default_response_buffer_.reserve(CURL_MAX_WRITE_SIZE);
 }
 
@@ -152,6 +164,12 @@ Status HttpRequest::Init() {
   libcurl_->curl_easy_setopt(curl_, CURLOPT_HTTP_VERSION,
                              CURL_HTTP_VERSION_2_0);
 
+  // Set up the progress meter.
+  libcurl_->curl_easy_setopt(curl_, CURLOPT_NOPROGRESS, 0ULL);
+  libcurl_->curl_easy_setopt(curl_, CURLOPT_XFERINFODATA, this);
+  libcurl_->curl_easy_setopt(curl_, CURLOPT_XFERINFOFUNCTION,
+                             &HttpRequest::ProgressCallback);
+
   // If response buffer is not set, libcurl will print results to stdout,
   // so we always set it.
   is_initialized_ = true;
@@ -470,4 +488,31 @@ string HttpRequest::GetResponseHeader(const string& name) const {
 
 uint64 HttpRequest::GetResponseCode() const { return response_code_; }
 
+// Cancels the transmission if no progress has been made for too long.
+int HttpRequest::ProgressCallback(void* this_object, curl_off_t dltotal,
+                                  curl_off_t dlnow, curl_off_t ultotal,
+                                  curl_off_t ulnow) {
+  auto that = reinterpret_cast<HttpRequest*>(this_object);
+  const auto now = that->env_->NowSeconds();
+  const auto current_progress = dlnow + ulnow;
+  if (that->last_progress_timestamp_ == 0 ||
+      current_progress > that->last_progress_bytes_) {
+    // This is the first time the callback is called or some progress
+    // was made since the last tick.
+    that->last_progress_timestamp_ = now;
+    that->last_progress_bytes_ = current_progress;
+    return 0;
+  }
+
+  if (now - that->last_progress_timestamp_ > kInactivityTimeoutSeconds) {
+    LOG(ERROR) << "The transmission has been stuck at " << current_progress
+               << " bytes for " << now - that->last_progress_timestamp_
+               << " seconds and will be aborted.";
+    return 1;  // Will abort the request.
+  }
+
+  // No progress was made since the last call, but we should wait a bit longer.
+  return 0;
+}
+
 }  // namespace tensorflow
diff --git a/tensorflow/core/platform/cloud/http_request.h b/tensorflow/core/platform/cloud/http_request.h
index 5365c45ca9bed2264a8cfc44ebc8ef39ceb1f6b9..afcbb9f35cfba478746b7e9beff6135aba32fa1d 100644
--- a/tensorflow/core/platform/cloud/http_request.h
+++ b/tensorflow/core/platform/cloud/http_request.h
@@ -23,6 +23,7 @@ limitations under the License.
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/lib/core/stringpiece.h"
+#include "tensorflow/core/platform/env.h"
 #include "tensorflow/core/platform/macros.h"
 #include "tensorflow/core/platform/protobuf.h"
 #include "tensorflow/core/platform/types.h"
@@ -50,7 +51,9 @@ class HttpRequest {
   };
 
   HttpRequest();
-  explicit HttpRequest(LibCurl* libcurl);
+  explicit HttpRequest(LibCurl* libcurl)
+      : HttpRequest(libcurl, Env::Default()) {}
+  HttpRequest(LibCurl* libcurl, Env* env);
   virtual ~HttpRequest();
 
   virtual Status Init();
@@ -123,11 +126,16 @@ class HttpRequest {
   /// A header callback in the form which can be accepted by libcurl.
   static size_t HeaderCallback(const void* ptr, size_t size, size_t nmemb,
                                void* this_object);
+  /// A progress meter callback in the form which can be accepted by libcurl.
+  static int ProgressCallback(void* this_object, curl_off_t dltotal,
+                              curl_off_t dlnow, curl_off_t ultotal,
+                              curl_off_t ulnow);
   Status CheckInitialized() const;
   Status CheckMethodNotSet() const;
   Status CheckNotSent() const;
 
   LibCurl* libcurl_;
+  Env* env_;
 
   FILE* put_body_ = nullptr;
 
@@ -144,6 +152,12 @@ class HttpRequest {
   std::unordered_map<string, string> response_headers_;
   uint64 response_code_ = 0;
 
+  // The timestamp of the last activity related to the request execution, in
+  // seconds since epoch.
+  uint64 last_progress_timestamp_ = 0;
+  // The last progress in terms of bytes transmitted.
+  curl_off_t last_progress_bytes_ = 0;
+
   // Members to enforce the usage flow.
   bool is_initialized_ = false;
   bool is_uri_set_ = false;
@@ -173,6 +187,10 @@ class LibCurl {
   virtual CURLcode curl_easy_setopt(CURL* curl, CURLoption option,
                                     size_t (*param)(const void*, size_t, size_t,
                                                     void*)) = 0;
+  virtual CURLcode curl_easy_setopt(
+      CURL* curl, CURLoption option,
+      int (*param)(void* clientp, curl_off_t dltotal, curl_off_t dlnow,
+                   curl_off_t ultotal, curl_off_t ulnow)) = 0;
   virtual CURLcode curl_easy_perform(CURL* curl) = 0;
   virtual CURLcode curl_easy_getinfo(CURL* curl, CURLINFO info,
                                      uint64* value) = 0;
diff --git a/tensorflow/core/platform/cloud/http_request_test.cc b/tensorflow/core/platform/cloud/http_request_test.cc
index b918a3a8cd84200133869229f6bce5a7cf115371..6d66dfdee18c4251944189aca36da0d08d86d51a 100644
--- a/tensorflow/core/platform/cloud/http_request_test.cc
+++ b/tensorflow/core/platform/cloud/http_request_test.cc
@@ -25,11 +25,26 @@ namespace {
 
 const string kTestContent = "random original scratch content";
 
+class FakeEnv : public EnvWrapper {
+ public:
+  FakeEnv() : EnvWrapper(Env::Default()) {}
+
+  uint64 NowSeconds() override { return now_; }
+  uint64 now_ = 10000;
+};
+
 // A fake proxy that pretends to be libcurl.
 class FakeLibCurl : public LibCurl {
  public:
   FakeLibCurl(const string& response_content, uint64 response_code)
       : response_content_(response_content), response_code_(response_code) {}
+  FakeLibCurl(const string& response_content, uint64 response_code,
+              std::vector<std::tuple<uint64, curl_off_t>> progress_ticks,
+              FakeEnv* env)
+      : response_content_(response_content),
+        response_code_(response_code),
+        progress_ticks_(std::move(progress_ticks)),
+        env_(env) {}
   FakeLibCurl(const string& response_content, uint64 response_code,
               const std::vector<string>& response_headers)
       : response_content_(response_content),
@@ -86,6 +101,9 @@ class FakeLibCurl : public LibCurl {
       case CURLOPT_READDATA:
         read_data_ = reinterpret_cast<FILE*>(param);
         break;
+      case CURLOPT_XFERINFODATA:
+        progress_data_ = param;
+        break;
       default:
         break;
     }
@@ -112,6 +130,13 @@ class FakeLibCurl : public LibCurl {
     }
     return CURLE_OK;
   }
+  CURLcode curl_easy_setopt(CURL* curl, CURLoption option,
+                            int (*param)(void* clientp, curl_off_t dltotal,
+                                         curl_off_t dlnow, curl_off_t ultotal,
+                                         curl_off_t ulnow)) override {
+    progress_callback_ = param;
+    return CURLE_OK;
+  }
   CURLcode curl_easy_perform(CURL* curl) override {
     if (read_data_) {
       char buffer[3];
@@ -134,6 +159,12 @@ class FakeLibCurl : public LibCurl {
       strncpy(error_buffer_, curl_easy_perform_error_message_.c_str(),
               curl_easy_perform_error_message_.size() + 1);
     }
+    for (const auto& tick : progress_ticks_) {
+      env_->now_ = std::get<0>(tick);
+      if (progress_callback_(progress_data_, 0, std::get<1>(tick), 0, 0)) {
+        return CURLE_ABORTED_BY_CALLBACK;
+      }
+    }
     return curl_easy_perform_result_;
   }
   CURLcode curl_easy_getinfo(CURL* curl, CURLINFO info,
@@ -212,10 +243,17 @@ class FakeLibCurl : public LibCurl {
   FILE* read_data_ = nullptr;
   size_t (*read_callback_)(void* ptr, size_t size, size_t nmemb,
                            FILE* userdata) = &fread;
+  int (*progress_callback_)(void* clientp, curl_off_t dltotal, curl_off_t dlnow,
+                            curl_off_t ultotal, curl_off_t ulnow) = nullptr;
+  void* progress_data_ = nullptr;
   // Outcome of performing the request.
   string posted_content_;
   CURLcode curl_easy_perform_result_ = CURLE_OK;
   string curl_easy_perform_error_message_;
+  // A vector of <timestamp, progress in bytes> pairs that represent the
+  // progress of a transmission.
+  std::vector<std::tuple<uint64, curl_off_t>> progress_ticks_;
+  FakeEnv* env_ = nullptr;
 };
 
 TEST(HttpRequestTest, GetRequest) {
@@ -547,5 +585,44 @@ TEST(HttpRequestTest, ErrorReturnsNoResponse) {
   EXPECT_EQ("", string(scratch.begin(), scratch.end()));
 }
 
+TEST(HttpRequestTest, ProgressIsOk) {
+  // Imitate a steady progress.
+  FakeEnv env;
+  FakeLibCurl libcurl(
+      "test", 200,
+      {
+          std::make_tuple(100, 0) /* timestamp 100, 0 bytes */,
+          std::make_tuple(110, 0) /* timestamp 110, 0 bytes */,
+          std::make_tuple(200, 100) /* timestamp 200, 100 bytes */
+      },
+      &env);
+  HttpRequest http_request(&libcurl, &env);
+  TF_EXPECT_OK(http_request.Init());
+  TF_EXPECT_OK(http_request.SetUri("http://www.testuri.com"));
+  TF_EXPECT_OK(http_request.Send());
+}
+
+TEST(HttpRequestTest, ProgressIsStuck) {
+  // Imitate a transmission that got stuck for more than a minute.
+  FakeEnv env;
+  FakeLibCurl libcurl(
+      "test", 200,
+      {
+          std::make_tuple(100, 10) /* timestamp 100, 10 bytes */,
+          std::make_tuple(130, 10) /* timestamp 130, 10 bytes */,
+          std::make_tuple(170, 10) /* timestamp 170, 10 bytes */
+      },
+      &env);
+  HttpRequest http_request(&libcurl, &env);
+  TF_EXPECT_OK(http_request.Init());
+  TF_EXPECT_OK(http_request.SetUri("http://www.testuri.com"));
+  auto status = http_request.Send();
+  EXPECT_EQ(error::UNAVAILABLE, status.code());
+  EXPECT_EQ(
+      "Error executing an HTTP request (HTTP response code 200, "
+      "error code 42, error message '')",
+      status.error_message());
+}
+
 }  // namespace
 }  // namespace tensorflow
diff --git a/tensorflow/core/platform/cloud/oauth_client.cc b/tensorflow/core/platform/cloud/oauth_client.cc
index a3d5b9a6e4522780f65c6092dc8cbfa93e61b364..97d6617a040ae7d9c21c0c05b714a9f27a900aaa 100644
--- a/tensorflow/core/platform/cloud/oauth_client.cc
+++ b/tensorflow/core/platform/cloud/oauth_client.cc
@@ -43,7 +43,8 @@ constexpr char kJwtType[] = "JWT";
 constexpr char kGrantType[] =
     "urn%3Aietf%3Aparams%3Aoauth%3Agrant-type%3Ajwt-bearer";
 
-Status ReadJsonValue(Json::Value json, const string& name, Json::Value* value) {
+Status ReadJsonValue(const Json::Value& json, const string& name,
+                     Json::Value* value) {
   if (!value) {
     return errors::FailedPrecondition("'value' cannot be nullptr.");
   }
@@ -55,7 +56,8 @@ Status ReadJsonValue(Json::Value json, const string& name, Json::Value* value) {
   return Status::OK();
 }
 
-Status ReadJsonString(Json::Value json, const string& name, string* value) {
+Status ReadJsonString(const Json::Value& json, const string& name,
+                      string* value) {
   Json::Value json_value;
   TF_RETURN_IF_ERROR(ReadJsonValue(json, name, &json_value));
   if (!json_value.isString()) {
@@ -66,7 +68,7 @@ Status ReadJsonString(Json::Value json, const string& name, string* value) {
   return Status::OK();
 }
 
-Status ReadJsonInt(Json::Value json, const string& name, int64* value) {
+Status ReadJsonInt(const Json::Value& json, const string& name, int64* value) {
   Json::Value json_value;
   TF_RETURN_IF_ERROR(ReadJsonValue(json, name, &json_value));
   if (!json_value.isIntegral()) {
diff --git a/tensorflow/core/platform/default/build_config/BUILD b/tensorflow/core/platform/default/build_config/BUILD
index 8bc412c5d8fba2fda900d3d6cbabc130e6de5538..9e3d5f354db4a224bd4015dc1437260b31c8face 100644
--- a/tensorflow/core/platform/default/build_config/BUILD
+++ b/tensorflow/core/platform/default/build_config/BUILD
@@ -58,6 +58,22 @@ cc_library(
     ],
 )
 
+# Dummy stream executor cuda plugins.
+cc_library(
+    name = "cublas_plugin",
+    srcs = [],
+)
+
+cc_library(
+    name = "cufft_plugin",
+    srcs = [],
+)
+
+cc_library(
+    name = "cudnn_plugin",
+    srcs = [],
+)
+
 # OSX framework for device driver access
 cc_library(
     name = "IOKit",
diff --git a/tensorflow/core/platform/env.cc b/tensorflow/core/platform/env.cc
index d7299636168a1d696da70c3e491396999ab9f06a..2fdd989c9b97497c94bb035472df910a701b2692 100644
--- a/tensorflow/core/platform/env.cc
+++ b/tensorflow/core/platform/env.cc
@@ -14,6 +14,7 @@ limitations under the License.
 ==============================================================================*/
 
 #include <deque>
+#include <utility>
 #include <vector>
 #if defined(__APPLE__)
 #include <mach-o/dyld.h>
@@ -95,7 +96,7 @@ Status Env::GetRegisteredFileSystemSchemes(std::vector<string>* schemes) {
 
 Status Env::RegisterFileSystem(const string& scheme,
                                FileSystemRegistry::Factory factory) {
-  return file_system_registry_->Register(scheme, factory);
+  return file_system_registry_->Register(scheme, std::move(factory));
 }
 
 Status Env::NewRandomAccessFile(const string& fname,
diff --git a/tensorflow/core/platform/file_system.cc b/tensorflow/core/platform/file_system.cc
index 3d7553e6da11aeff3e46073030eab1fdd29b001a..2abda4571458965c588b3e7bff7adc236ab2b71d 100644
--- a/tensorflow/core/platform/file_system.cc
+++ b/tensorflow/core/platform/file_system.cc
@@ -37,7 +37,7 @@ constexpr int kNumThreads = 8;
 
 // Run a function in parallel using a ThreadPool, but skip the ThreadPool
 // on the iOS platform due to its problems with more than a few threads.
-void ForEach(int first, int last, std::function<void(int)> f) {
+void ForEach(int first, int last, const std::function<void(int)>& f) {
 #if TARGET_OS_IPHONE
   for (int i = first; i < last; i++) {
     f(i);
diff --git a/tensorflow/core/protobuf/cluster.proto b/tensorflow/core/protobuf/cluster.proto
new file mode 100644
index 0000000000000000000000000000000000000000..33c87eefe022eee98ba7543e6623a04df540ffc9
--- /dev/null
+++ b/tensorflow/core/protobuf/cluster.proto
@@ -0,0 +1,82 @@
+/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+syntax = "proto3";
+
+package tensorflow;
+option cc_enable_arenas = true;
+option java_outer_classname = "ClusterProtos";
+option java_multiple_files = true;
+option java_package = "org.tensorflow.distruntime";
+
+// This file contains protos to be used when defining a TensorFlow
+// cluster.
+//
+// EXAMPLES
+// --------
+//
+// 1. A single-process cluster, containing "/job:local/task:0".
+//
+//    Cluster:
+//      job { name: 'local' tasks { key: 0 value: 'localhost:2222' } }
+//
+//    Server:
+//      cluster { $CLUSTER } job_name: 'local' task_index: 0
+//
+// 2. A two-process cluster, containing "/job:local/task:{0,1}".
+//
+//    Cluster:
+//      job { name: 'local' tasks { key: 0 value: 'localhost:2222' }
+//                          tasks { key: 1 value: 'localhost:2223' } }
+//
+//    Servers:
+//      cluster { $CLUSTER } job_name: 'local' task_index: 0
+//      cluster { $CLUSTER } job_name: 'local' task_index: 1
+//
+// 3. A two-job cluster, containing "/job:worker/task:{0,1,2}" and
+//    "/job:ps/task:{0,1}".
+//
+//    Cluster:
+//      job { name: 'worker' tasks { key: 0 value: 'worker1:2222' }
+//                           tasks { key: 1 value: 'worker2:2222' }
+//                           tasks { key: 2 value: 'worker3:2222' } }
+//      job { name: 'ps'     tasks { key: 0 value: 'ps0:2222' }
+//                           tasks { key: 1 value: 'ps1:2222' } }
+//
+//    Servers:
+//      cluster { $CLUSTER } job_name: 'worker' task_index: 0
+//      cluster { $CLUSTER } job_name: 'worker' task_index: 1
+//      cluster { $CLUSTER } job_name: 'worker' task_index: 2
+//      cluster { $CLUSTER } job_name: 'ps'     task_index: 0
+//      cluster { $CLUSTER } job_name: 'ps'     task_index: 1
+
+// Defines a single job in a TensorFlow cluster.
+message JobDef {
+  // The name of this job.
+  string name = 1;
+
+  // Mapping from task ID to "hostname:port" string.
+  //
+  // If the `name` field contains "worker", and the `tasks` map contains a
+  // mapping from 7 to "example.org:2222", then the device prefix
+  // "/job:worker/task:7" will be assigned to "example.org:2222".
+  map<int32, string> tasks = 2;
+}
+
+// Defines a TensorFlow cluster as a set of jobs.
+message ClusterDef {
+  // The jobs that comprise the cluster.
+  repeated JobDef job = 1;
+}
diff --git a/tensorflow/core/protobuf/config.proto b/tensorflow/core/protobuf/config.proto
index 5c0f7232ebda65245fd4aaa83bd22f88f8f2fb3c..630f47633f87d1dfddb6eddbb18ea13a3575ddc4 100644
--- a/tensorflow/core/protobuf/config.proto
+++ b/tensorflow/core/protobuf/config.proto
@@ -10,6 +10,7 @@ import "tensorflow/core/framework/cost_graph.proto";
 import "tensorflow/core/framework/graph.proto";
 import "tensorflow/core/framework/step_stats.proto";
 import "tensorflow/core/protobuf/debug.proto";
+import "tensorflow/core/protobuf/cluster.proto";
 import "tensorflow/core/protobuf/rewriter_config.proto";
 
 message GPUOptions {
@@ -259,6 +260,11 @@ message ConfigProto {
 
   // Options that apply when this session uses the distributed runtime.
   RPCOptions rpc_options = 13;
+
+  // Optional list of all workers to use in this session.
+  ClusterDef cluster_def = 14;
+
+  // Next: 15
 };
 
 // Options for a single Run() call.
diff --git a/tensorflow/core/protobuf/control_flow.proto b/tensorflow/core/protobuf/control_flow.proto
index 24f42322c0fe858641f37462cd9a1475b1e48ab2..48f503225447c26f8959ba379656361292052b44 100644
--- a/tensorflow/core/protobuf/control_flow.proto
+++ b/tensorflow/core/protobuf/control_flow.proto
@@ -61,6 +61,9 @@ message WhileContextDef {
   // List of names for exit tensors.
   repeated string loop_exit_names = 8;
 
+  // List of names for enter tensors.
+  repeated string loop_enter_names = 10;
+
   // Values and external values in control flow context.
   ValuesDef values_def = 9;
 }
diff --git a/tensorflow/core/protobuf/device_properties.proto b/tensorflow/core/protobuf/device_properties.proto
new file mode 100644
index 0000000000000000000000000000000000000000..9b1497c710d40c4c5a989f80ae0d98ee2a2dc3a8
--- /dev/null
+++ b/tensorflow/core/protobuf/device_properties.proto
@@ -0,0 +1,51 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+syntax = "proto3";
+
+package tensorflow;
+option cc_enable_arenas = true;
+option java_outer_classname = "DevicePropertiesProtos";
+
+message DeviceProperties {
+  // Device type (CPU, GPU, ...)
+  string type = 1;
+  // Vendor (Intel, nvidia, ...)
+  string vendor = 2;
+  // Model (Haswell, K40, ...)
+  string model = 3;
+  // Core Frequency in Mhz
+  int64 frequency = 4;
+  // Number of cores
+  int64 num_cores = 5;
+  // Version of the tools and libraries used with this device (e.g. gcc 4.9,
+  // cudnn 5.1)
+  map<string, string> environment = 6;
+  // Number of registers per core.
+  int64 num_registers = 7;
+  // L1 cache size in bytes
+  int64 l1_cache_size = 8;
+  // L2 cache size in bytes
+  int64 l2_cache_size = 9;
+  // L3 cache size in bytes
+  int64 l3_cache_size = 10;
+  // Shared memory size per multiprocessor in bytes. This field is
+  // applicable to GPUs only.
+  int64 shared_memory_size_per_multiprocessor = 11;
+  // Memory size in bytes
+  int64 memory_size = 12;
+  // Memory bandwidth in KB/s
+  int64 bandwidth = 13;
+}
diff --git a/tensorflow/core/protobuf/master.proto b/tensorflow/core/protobuf/master.proto
index de91b6133e4924463ab2bbf1a5b43a1996cd13f8..e607b1c42a5b4515f3f181240460be3edaab6db9 100644
--- a/tensorflow/core/protobuf/master.proto
+++ b/tensorflow/core/protobuf/master.proto
@@ -38,6 +38,9 @@ message CreateSessionRequest {
 
   // Configuration options.
   ConfigProto config = 2;
+
+  // The target string used from the client's perspective.
+  string target = 3;
 }
 
 message CreateSessionResponse {
diff --git a/tensorflow/core/protobuf/tensorflow_server.proto b/tensorflow/core/protobuf/tensorflow_server.proto
index c4077bd98e452f9ed2338ada29bfd5400ebbeff3..6199e707e5ad034d9888daa49c13bd87b02b171c 100644
--- a/tensorflow/core/protobuf/tensorflow_server.proto
+++ b/tensorflow/core/protobuf/tensorflow_server.proto
@@ -16,6 +16,7 @@ limitations under the License.
 syntax = "proto3";
 
 import "tensorflow/core/protobuf/config.proto";
+import "tensorflow/core/protobuf/cluster.proto";
 
 package tensorflow;
 option cc_enable_arenas = true;
@@ -23,69 +24,6 @@ option java_outer_classname = "ServerProtos";
 option java_multiple_files = true;
 option java_package = "org.tensorflow.distruntime";
 
-// This file contains protos to be used when defining a TensorFlow
-// cluster, and a server within that cluster.
-//
-// EXAMPLES
-// --------
-//
-// 1. A single-process cluster, containing "/job:local/task:0".
-//
-//    Cluster:
-//      job { name: 'local' tasks { key: 0 value: 'localhost:2222' } }
-//
-//    Server:
-//      cluster { $CLUSTER } job_name: 'local' task_index: 0
-//
-// 2. A two-process cluster, containing "/job:local/task:{0,1}".
-//
-//    Cluster:
-//      job { name: 'local' tasks { key: 0 value: 'localhost:2222' }
-//                          tasks { key: 1 value: 'localhost:2223' } }
-//
-//    Servers:
-//      cluster { $CLUSTER } job_name: 'local' task_index: 0
-//      cluster { $CLUSTER } job_name: 'local' task_index: 1
-//
-// 3. A two-job cluster, containing "/job:worker/task:{0,1,2}" and
-//    "/job:ps/task:{0,1}".
-//
-//    Cluster:
-//      job { name: 'worker' tasks { key: 0 value: 'worker1:2222' }
-//                           tasks { key: 1 value: 'worker2:2222' }
-//                           tasks { key: 2 value: 'worker3:2222' } }
-//      job { name: 'ps'     tasks { key: 0 value: 'ps0:2222' }
-//                           tasks { key: 1 value: 'ps1:2222' } }
-//
-//    Servers:
-//      cluster { $CLUSTER } job_name: 'worker' task_index: 0
-//      cluster { $CLUSTER } job_name: 'worker' task_index: 1
-//      cluster { $CLUSTER } job_name: 'worker' task_index: 2
-//      cluster { $CLUSTER } job_name: 'ps'     task_index: 0
-//      cluster { $CLUSTER } job_name: 'ps'     task_index: 1
-
-// Defines a single job in a TensorFlow cluster.
-message JobDef {
-  // The name of this job.
-  string name = 1;
-
-  // Mapping from task ID to "hostname:port" string.
-  //
-  // If the `name` field contains "worker", and the `tasks` map contains a
-  // mapping from 7 to "example.org:2222", then the device prefix
-  // "/job:worker/task:7" will be assigned to "example.org:2222".
-  //
-  // NOTE(mrry): Currently, only a dense task ID space starting at 0 is
-  // supported.
-  map<int32, string> tasks = 2;
-}
-
-// Defines a TensorFlow cluster as a set of jobs.
-message ClusterDef {
-  // The jobs that comprise the cluster.
-  repeated JobDef job = 1;
-}
-
 // Defines the configuration of a single TensorFlow server.
 message ServerDef {
   // The cluster of which this server is a member.
diff --git a/tensorflow/core/protobuf/worker.proto b/tensorflow/core/protobuf/worker.proto
index e3af1119e96b0522512e6287cf736d70e2fb7659..cf05aece39a1b9c23fe1c4597177655659182e15 100644
--- a/tensorflow/core/protobuf/worker.proto
+++ b/tensorflow/core/protobuf/worker.proto
@@ -28,6 +28,7 @@ import "tensorflow/core/framework/device_attributes.proto";
 import "tensorflow/core/framework/graph.proto";
 import "tensorflow/core/framework/tensor.proto";
 import "tensorflow/core/protobuf/config.proto";
+import "tensorflow/core/protobuf/debug.proto";
 import "tensorflow/core/protobuf/named_tensor.proto";
 import "tensorflow/core/protobuf/tensorflow_server.proto";
 
@@ -92,6 +93,9 @@ message RegisterGraphRequest {
 
   // Configuration options for the session in which this graph was created.
   GraphOptions graph_options = 4;
+
+  // Field(s) used by TensorFlow Debugger (tfdbg).
+  DebugOptions debug_options = 5;
 }
 
 message RegisterGraphResponse {
@@ -115,6 +119,10 @@ message RegisterGraphResponse {
 ////////////////////////////////////////////////////////////////////////////////
 
 message DeregisterGraphRequest {
+  // The session_handle used when registering the graph. If session_handle is
+  // empty, a single global namespace is used.
+  string session_handle = 2;
+
   // REQUIRED: graph_handle must be returned by a RegisterGraph call
   // to the same WorkerService.
   string graph_handle = 1;
@@ -163,6 +171,12 @@ message ExecutorOpts {
 };
 
 message RunGraphRequest {
+  // session_handle is the the master-generated unique id for this session.
+  // If session_handle is non-empty, it must be the same as used when
+  // registering the graph. If it is empty, a single global namespace is used to
+  // search for the graph_handle.
+  string session_handle = 8;
+
   // REQUIRED: graph_handle must be returned by a RegisterGraph call
   // to the same WorkerService.
   string graph_handle = 1;
@@ -189,6 +203,8 @@ message RunGraphRequest {
   bool is_partial = 6;
   // True if this is the last partial run request in a sequence of requests.
   bool is_last_partial_run = 7;
+
+  // Next: 9
 }
 
 message RunGraphResponse {
diff --git a/tensorflow/core/util/ctc/ctc_loss_calculator.h b/tensorflow/core/util/ctc/ctc_loss_calculator.h
index eacadd65af8aaf00ff206406092f25ca6711ead8..567bad38c33b5c4b633646a47a600e4406164444 100644
--- a/tensorflow/core/util/ctc/ctc_loss_calculator.h
+++ b/tensorflow/core/util/ctc/ctc_loss_calculator.h
@@ -65,7 +65,8 @@ class CTCLossCalculator {
   Status CalculateLoss(const VectorIn& seq_len, const LabelSequences& labels,
                        const std::vector<MatrixIn>& inputs,
                        bool preprocess_collapse_repeated,
-                       bool ctc_merge_repeated, VectorOut* loss,
+                       bool ctc_merge_repeated,
+                       bool ignore_longer_outputs_than_inputs, VectorOut* loss,
                        std::vector<MatrixOut>* gradients,
                        DeviceBase::CpuWorkerThreads* workers = nullptr) const;
 
@@ -90,7 +91,8 @@ class CTCLossCalculator {
   // batch.  Return value:
   //    max_{b in batch_size} l_primes[b].size()
   template <typename Vector>
-  Status PopulateLPrimes(bool preprocess_collapse_repeated, int batch_size,
+  Status PopulateLPrimes(bool preprocess_collapse_repeated,
+                         bool ignore_longer_outputs_than_inputs, int batch_size,
                          int num_classes, const Vector& seq_len,
                          const LabelSequences& labels, size_t* max_u_prime,
                          LabelSequences* l_primes) const;
@@ -108,7 +110,8 @@ template <typename VectorIn, typename VectorOut, typename MatrixIn,
 Status CTCLossCalculator::CalculateLoss(
     const VectorIn& seq_len, const LabelSequences& labels,
     const std::vector<MatrixIn>& inputs, bool preprocess_collapse_repeated,
-    bool ctc_merge_repeated, VectorOut* loss, std::vector<MatrixOut>* gradients,
+    bool ctc_merge_repeated, bool ignore_longer_outputs_than_inputs,
+    VectorOut* loss, std::vector<MatrixOut>* gradients,
     DeviceBase::CpuWorkerThreads* workers) const {
   auto num_time_steps = inputs.size();
 
@@ -155,20 +158,31 @@ Status CTCLossCalculator::CalculateLoss(
   // and calculate the maximum necessary allocation size.
   LabelSequences l_primes(batch_size);
   size_t max_u_prime = 0;
-  Status l_p_ret =
-      PopulateLPrimes(preprocess_collapse_repeated, batch_size, num_classes,
-                      seq_len, labels, &max_u_prime, &l_primes);
+  Status l_p_ret = PopulateLPrimes(
+      preprocess_collapse_repeated, ignore_longer_outputs_than_inputs,
+      batch_size, num_classes, seq_len, labels, &max_u_prime, &l_primes);
   if (!l_p_ret.ok()) {
     return l_p_ret;
   }
 
   // Process each item in a batch in parallel, using at most kMaxThreads.
-  auto ComputeLossAndGradients = [this, num_classes, &l_primes, &seq_len,
-                                  &inputs, requires_backprop,
-                                  ctc_merge_repeated, &loss, &gradients](
-      int64 start_row, int64 limit_row) {
+  auto ComputeLossAndGradients = [this, num_classes, &labels, &l_primes,
+                                  &seq_len, &inputs, requires_backprop,
+                                  ctc_merge_repeated,
+                                  ignore_longer_outputs_than_inputs, &loss,
+                                  &gradients](int64 start_row,
+                                              int64 limit_row) {
     for (int b = start_row; b < limit_row; b++) {
-      if (seq_len(b) == 0) {
+      // Return zero gradient for empty sequences or sequences with labels
+      // longer than input, which is not supported by CTC.
+      if (seq_len(b) == 0 ||
+          (ignore_longer_outputs_than_inputs &&
+           labels[b].size() > seq_len(b) - this->output_delay_)) {
+        VLOG(1) << "The sequence length is either zero or shorter than the "
+                   "target output (CTC works only with shorter target sequence "
+                   "than input sequence). You can turn this into a warning by "
+                   "using the flag ignore_longer_outputs_than_inputs - "
+                << b << ": " << str_util::Join(labels[b], " ");
         continue;
       }
 
@@ -263,12 +277,11 @@ Status CTCLossCalculator::CalculateLoss(
 }
 
 template <typename Vector>
-Status CTCLossCalculator::PopulateLPrimes(bool preprocess_collapse_repeated,
-                                          int batch_size, int num_classes,
-                                          const Vector& seq_len,
-                                          const LabelSequences& labels,
-                                          size_t* max_u_prime,
-                                          LabelSequences* l_primes) const {
+Status CTCLossCalculator::PopulateLPrimes(
+    bool preprocess_collapse_repeated, bool ignore_longer_outputs_than_inputs,
+    int batch_size, int num_classes, const Vector& seq_len,
+    const LabelSequences& labels, size_t* max_u_prime,
+    LabelSequences* l_primes) const {
   // labels is a Label array of size batch_size
   if (labels.size() != batch_size) {
     return errors::InvalidArgument("labels.size() != batch_size: ",
@@ -311,9 +324,6 @@ Status CTCLossCalculator::PopulateLPrimes(bool preprocess_collapse_repeated,
       }
     }
 
-    // Make sure there is enough time to output the target indices.
-    int time = seq_len(b) - output_delay_;
-    int required_time = label.size();
     for (int l_i : l) {
       if (l_i < 0) {
         return errors::InvalidArgument(
@@ -325,14 +335,19 @@ Status CTCLossCalculator::PopulateLPrimes(bool preprocess_collapse_repeated,
             num_classes, ", batch: ", b, " labels: ", str_util::Join(l, ","));
       }
     }
-    if (required_time > time) {
-      return errors::InvalidArgument(
-          "Not enough time for target transition sequence ("
-          "required: ",
-          required_time, ", available: ", time,
-          "), skipping data instance in batch: ", b);
+    if (!ignore_longer_outputs_than_inputs) {
+      // Make sure there is enough time to output the target indices.
+      int time = seq_len(b) - output_delay_;
+      int required_time = label.size();
+      if (required_time > time) {
+        return errors::InvalidArgument(
+            "Not enough time for target transition sequence ("
+            "required: ",
+            required_time, ", available: ", time, ")", b,
+            "You can turn this error into a warning by using the flag "
+            "ignore_longer_outputs_than_inputs");
+      }
     }
-
     // Target indices with blanks before each index and a blank at the end.
     // Length U' = 2U + 1.
     // Convert l to l_prime
diff --git a/tensorflow/core/util/cuda_kernel_helper.h b/tensorflow/core/util/cuda_kernel_helper.h
index 8a3f6c587ed91f51835032465f74eb5bfa27e97f..46ea68687c7e7c7ce495992d59762dac188b7857 100644
--- a/tensorflow/core/util/cuda_kernel_helper.h
+++ b/tensorflow/core/util/cuda_kernel_helper.h
@@ -63,6 +63,28 @@ inline CudaLaunchConfig GetCudaLaunchConfig(int work_element_count,
   return config;
 }
 
+// Calculate the Cuda launch config we should use for a kernel launch. This
+// variant takes the resource limits of func into account to maximize occupancy.
+template <typename DeviceFunc>
+inline CudaLaunchConfig GetCudaLaunchConfig(int work_element_count,
+                                            const GPUDevice& d, DeviceFunc func,
+                                            size_t dynamic_shared_memory_size) {
+  int block_count = 0;
+  int thread_per_block = 0;
+  cudaOccupancyMaxPotentialBlockSize(&block_count, &thread_per_block, func,
+                                     dynamic_shared_memory_size,
+                                     work_element_count);
+  block_count =
+      std::min(block_count,
+               (work_element_count + thread_per_block - 1) / thread_per_block);
+
+  CudaLaunchConfig config;
+  config.virtual_thread_count = work_element_count;
+  config.thread_per_block = thread_per_block;
+  config.block_count = block_count;
+  return config;
+}
+
 struct Cuda2DLaunchConfig {
   dim3 virtual_thread_count;
   dim3 thread_per_block;
diff --git a/tensorflow/core/util/mkl_util.h b/tensorflow/core/util/mkl_util.h
index 897b174eff278bd5be4234fbc20e14feca0b80dc..6a37256ea9f0827488d10bfbee1faa454e1825a8 100644
--- a/tensorflow/core/util/mkl_util.h
+++ b/tensorflow/core/util/mkl_util.h
@@ -542,8 +542,8 @@ inline int64 GetMklTensorDim(const MklShape& mkl_shape, char dimension) {
   return mkl_shape.dim_size(index);
 }
 
-inline void CopyMklTensorInToOut(OpKernelContext* context, int idx_in,
-                                 int idx_out) {
+inline void CopyMklTensorInToOut(OpKernelContext* context,
+                                 int idx_in, int idx_out) {
   int num_inputs = context->num_inputs();
   int num_outputs = context->num_outputs();
   int idx_data_in = GetTensorDataIndex(idx_in, num_inputs);
@@ -563,8 +563,9 @@ inline void CopyMklTensorInToOut(OpKernelContext* context, int idx_in,
   context->set_output(idx_meta_out, meta_output);
 }
 
-inline void CopyTFTensorInToOut(OpKernelContext* context, int idx_in,
-                                int idx_out, const TensorShape& shape) {
+inline void CopyTfTensorInToOutWithShape(OpKernelContext* context,
+                                         int idx_in, int idx_out,
+                                         const TensorShape& shape) {
   int num_inputs = context->num_inputs();
   int num_outputs = context->num_outputs();
   int idx_data_in = GetTensorDataIndex(idx_in, num_inputs);
@@ -580,6 +581,41 @@ inline void CopyTFTensorInToOut(OpKernelContext* context, int idx_in,
   context->set_output(idx_data_out, output);
 }
 
+inline void FowardTfTensorInToOut(OpKernelContext* context,
+                                  int idx_in, int idx_out) {
+  int num_inputs = context->num_inputs();
+  int num_outputs = context->num_outputs();
+  int idx_data_in = GetTensorDataIndex(idx_in, num_inputs);
+  int idx_data_out = GetTensorDataIndex(idx_out, num_outputs);
+
+  MklShape mkl_shape_output;
+  mkl_shape_output.SetMklTensor(false);
+  AllocateOutputSetMklShape(context, idx_out, mkl_shape_output);
+  if (IsRefType(context->input_dtype(idx_data_in))) {
+    context->forward_ref_input_to_ref_output(idx_data_in, idx_data_out);
+  } else {
+    context->set_output(idx_data_out, context->input(idx_data_in));
+  }
+}
+
+inline void ForwarMklTensorInToOut(OpKernelContext* context,
+                                   int idx_in, int idx_out) {
+  int num_inputs = context->num_inputs();
+  int num_outputs = context->num_outputs();
+  int idx_data_in = GetTensorDataIndex(idx_in, num_inputs);
+  int idx_meta_in = GetTensorMetaDataIndex(idx_in, num_inputs);
+  int idx_data_out = GetTensorDataIndex(idx_out, num_outputs);
+  int idx_meta_out = GetTensorMetaDataIndex(idx_out, num_outputs);
+
+  if (IsRefType(context->input_dtype(idx_data_in))) {
+    context->forward_ref_input_to_ref_output(idx_data_in, idx_data_out);
+    context->forward_ref_input_to_ref_output(idx_meta_in, idx_meta_out);
+  } else {
+    context->set_output(idx_data_out, context->input(idx_data_in));
+    context->set_output(idx_meta_out, context->input(idx_meta_in));
+  }
+}
+
 namespace mkl_op_registry {
 static const char* kMklOpLabel = "MklOp";
 static const char* kMklOpLabelPattern = "label='MklOp'";
diff --git a/tensorflow/core/util/stat_summarizer.cc b/tensorflow/core/util/stat_summarizer.cc
index 8300ba370ebf839f4d7ea43881baa6e33aba3973..40ae85e44b0a18f5c113fa25f49237ea3266ba78 100644
--- a/tensorflow/core/util/stat_summarizer.cc
+++ b/tensorflow/core/util/stat_summarizer.cc
@@ -50,20 +50,23 @@ void StatSummarizer::Validate(const Detail* detail,
       }
       const auto& stored = detail->outputs[slot];
       const auto& current = output.tensor_description();
-      bool do_shapes_match = true;
-      if (stored.shape().dim_size() != current.shape().dim_size()) {
-        do_shapes_match = false;
-      } else {
+
+      bool do_tensors_match =
+          (stored.dtype() == current.dtype()) &&
+          (stored.shape().dim_size() == current.shape().dim_size());
+
+      if (do_tensors_match) {
         for (int i = 0; i < stored.shape().dim_size(); ++i) {
           if (stored.shape().dim(i).size() != current.shape().dim(i).size()) {
-            do_shapes_match = false;
+            do_tensors_match = false;
+            break;
           }
         }
+      }
 
-        if ((stored.dtype() != current.dtype()) || !do_shapes_match) {
-          LOG(WARNING) << "Output tensor changed between runs for '"
-                       << ns.node_name();
-        }
+      if (!do_tensors_match) {
+        LOG(WARNING) << "Output tensor changed between runs for '"
+                     << ns.node_name();
       }
     }
   }
diff --git a/tensorflow/core/util/tensor_bundle/tensor_bundle.h b/tensorflow/core/util/tensor_bundle/tensor_bundle.h
index 3ca243966f7ed3be294e6b2fa54d4570ad0f5af4..b0bddf7e423391c91ed79ef311866630bcf53c79 100644
--- a/tensorflow/core/util/tensor_bundle/tensor_bundle.h
+++ b/tensorflow/core/util/tensor_bundle/tensor_bundle.h
@@ -212,6 +212,10 @@ class BundleReader {
 
   // Looks up the slices of the tensor keyed by "key".  On OK, "slices"
   // is non-empty if and only if the tensor is a partitioned tensor.
+  //
+  // Warning - there is no guaranteed ordering for the returned slices, so
+  // a slice with a larger start index in some dimension could come before
+  // another slice with a smaller start index in the same dimension.
   // REQUIRES: status().ok()
   Status LookupTensorSlices(StringPiece key, std::vector<TensorSlice>* slices)
       TF_MUST_USE_RESULT;
diff --git a/tensorflow/core/util/tensor_slice_reader.cc b/tensorflow/core/util/tensor_slice_reader.cc
index e750b130b9e4f9687a8c43c73fe00def4166f235..cd4903471963e703f5ef2b7654a3c40418cbbb08 100644
--- a/tensorflow/core/util/tensor_slice_reader.cc
+++ b/tensorflow/core/util/tensor_slice_reader.cc
@@ -102,7 +102,8 @@ TensorSliceReader::TensorSliceReader(const string& filepattern)
 
 TensorSliceReader::TensorSliceReader(const string& filepattern,
                                      OpenTableFunction open_function)
-    : TensorSliceReader(filepattern, open_function, kLoadAllShards) {}
+    : TensorSliceReader(filepattern, std::move(open_function), kLoadAllShards) {
+}
 
 TensorSliceReader::TensorSliceReader(const string& filepattern,
                                      OpenTableFunction open_function,
diff --git a/tensorflow/core/util/tensor_slice_reader_cache.cc b/tensorflow/core/util/tensor_slice_reader_cache.cc
index cbd2922f543a9b3dea16612bfdc3dc136368b04d..0f009d7de57a3cf1471c1ba694d3a771bc00635c 100644
--- a/tensorflow/core/util/tensor_slice_reader_cache.cc
+++ b/tensorflow/core/util/tensor_slice_reader_cache.cc
@@ -15,6 +15,8 @@ limitations under the License.
 
 #include "tensorflow/core/util/tensor_slice_reader_cache.h"
 
+#include <utility>
+
 #include "tensorflow/core/lib/gtl/stl_util.h"
 #include "tensorflow/core/platform/logging.h"
 
@@ -36,7 +38,8 @@ const TensorSliceReader* TensorSliceReaderCacheWrapper::GetReader(
   if (!cache_) {
     cache_ = new TensorSliceReaderCache;
   }
-  return cache_->GetReader(filepattern, open_function, preferred_shard);
+  return cache_->GetReader(filepattern, std::move(open_function),
+                           preferred_shard);
 }
 
 TensorSliceReaderCache::TensorSliceReaderCache() {}
diff --git a/tensorflow/core/util/tensor_slice_reader_test.cc b/tensorflow/core/util/tensor_slice_reader_test.cc
index 854569788617e85427bac98ca941c3e0bbc5afab..f4859262e12c3560703eb7daa83c970d352eb2d7 100644
--- a/tensorflow/core/util/tensor_slice_reader_test.cc
+++ b/tensorflow/core/util/tensor_slice_reader_test.cc
@@ -13,6 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
+#include <utility>
+
 #include "tensorflow/core/util/tensor_slice_reader.h"
 
 #include "tensorflow/core/framework/types.h"
@@ -48,8 +50,9 @@ namespace {
 //
 // We assume this is a row-major matrix.
 
-void SimpleFloatHelper(TensorSliceWriter::CreateBuilderFunction create_function,
-                       TensorSliceReader::OpenTableFunction open_function) {
+void SimpleFloatHelper(
+    const TensorSliceWriter::CreateBuilderFunction& create_function,
+    TensorSliceReader::OpenTableFunction open_function) {
   const string fname_base = io::JoinPath(testing::TmpDir(), "float_checkpoint");
 
   TensorShape shape({4, 5});
@@ -108,7 +111,7 @@ void SimpleFloatHelper(TensorSliceWriter::CreateBuilderFunction create_function,
 
   // Now we need to read the tensor slices
   const string filepattern = strings::StrCat(fname_base, "_*");
-  TensorSliceReader reader(filepattern, open_function);
+  TensorSliceReader reader(filepattern, std::move(open_function));
   TF_EXPECT_OK(reader.status());
   EXPECT_EQ(2, reader.num_files());
 
@@ -171,9 +174,10 @@ TEST(TensorSliceReaderTest, SimpleFloat) {
 }
 
 template <typename T, typename U>
-void SimpleIntXHelper(TensorSliceWriter::CreateBuilderFunction create_function,
-                      TensorSliceReader::OpenTableFunction open_function,
-                      const string& checkpoint_file) {
+void SimpleIntXHelper(
+    const TensorSliceWriter::CreateBuilderFunction& create_function,
+    TensorSliceReader::OpenTableFunction open_function,
+    const string& checkpoint_file) {
   const string fname_base = io::JoinPath(testing::TmpDir(), checkpoint_file);
 
   TensorShape shape({4, 5});
@@ -232,7 +236,7 @@ void SimpleIntXHelper(TensorSliceWriter::CreateBuilderFunction create_function,
 
   // Now we need to read the tensor slices
   const string filepattern = strings::StrCat(fname_base, "_*");
-  TensorSliceReader reader(filepattern, open_function);
+  TensorSliceReader reader(filepattern, std::move(open_function));
   TF_EXPECT_OK(reader.status());
   EXPECT_EQ(2, reader.num_files());
 
@@ -304,8 +308,8 @@ TEST_SIMPLE_INT(int8, int32)
 TEST_SIMPLE_INT(uint8, int32)
 
 void CachedTensorSliceReaderTesterHelper(
-    TensorSliceWriter::CreateBuilderFunction create_function,
-    TensorSliceReader::OpenTableFunction open_function) {
+    const TensorSliceWriter::CreateBuilderFunction& create_function,
+    const TensorSliceReader::OpenTableFunction& open_function) {
   const string fname_base = io::JoinPath(testing::TmpDir(), "float_checkpoint");
 
   TensorShape shape({4, 5});
diff --git a/tensorflow/docs_src/api_guides/python/contrib.distributions.bijector.md b/tensorflow/docs_src/api_guides/python/contrib.distributions.bijector.md
deleted file mode 100644
index 16a47bfd8b111740e7995c06587df98566adbce4..0000000000000000000000000000000000000000
--- a/tensorflow/docs_src/api_guides/python/contrib.distributions.bijector.md
+++ /dev/null
@@ -1,33 +0,0 @@
-# Random variable transformations (contrib)
-[TOC]
-
-Bijector Ops.
-
-An API for invertible, differentiable transformations of random variables.
-
-## Background
-
-Differentiable, bijective transformations of continuous random variables alter
-the calculations made in the cumulative/probability distribution functions and
-sample function.  This module provides a standard interface for making these
-manipulations.
-
-For more details and examples, see the `Bijector` docstring.
-
-To apply a `Bijector`, use `distributions.TransformedDistribution`.
-
-## Bijectors
-
-*   @{tf.contrib.distributions.bijector.Affine}
-*   @{tf.contrib.distributions.bijector.AffineLinearOperator}
-*   @{tf.contrib.distributions.bijector.Bijector}
-*   @{tf.contrib.distributions.bijector.Chain}
-*   @{tf.contrib.distributions.bijector.CholeskyOuterProduct}
-*   @{tf.contrib.distributions.bijector.Exp}
-*   @{tf.contrib.distributions.bijector.Identity}
-*   @{tf.contrib.distributions.bijector.Inline}
-*   @{tf.contrib.distributions.bijector.Invert}
-*   @{tf.contrib.distributions.bijector.PowerTransform}
-*   @{tf.contrib.distributions.bijector.SigmoidCentered}
-*   @{tf.contrib.distributions.bijector.SoftmaxCentered}
-*   @{tf.contrib.distributions.bijector.Softplus}
diff --git a/tensorflow/docs_src/api_guides/python/contrib.distributions.bijectors.md b/tensorflow/docs_src/api_guides/python/contrib.distributions.bijectors.md
new file mode 100644
index 0000000000000000000000000000000000000000..0ce187b329bce38fe096f2640a09cc93c71f9543
--- /dev/null
+++ b/tensorflow/docs_src/api_guides/python/contrib.distributions.bijectors.md
@@ -0,0 +1,33 @@
+# Random variable transformations (contrib)
+[TOC]
+
+Bijector Ops.
+
+An API for invertible, differentiable transformations of random variables.
+
+## Background
+
+Differentiable, bijective transformations of continuous random variables alter
+the calculations made in the cumulative/probability distribution functions and
+sample function.  This module provides a standard interface for making these
+manipulations.
+
+For more details and examples, see the `Bijector` docstring.
+
+To apply a `Bijector`, use `distributions.TransformedDistribution`.
+
+## Bijectors
+
+*   @{tf.contrib.distributions.bijectors.Affine}
+*   @{tf.contrib.distributions.bijectors.AffineLinearOperator}
+*   @{tf.contrib.distributions.bijectors.Bijector}
+*   @{tf.contrib.distributions.bijectors.Chain}
+*   @{tf.contrib.distributions.bijectors.CholeskyOuterProduct}
+*   @{tf.contrib.distributions.bijectors.Exp}
+*   @{tf.contrib.distributions.bijectors.Identity}
+*   @{tf.contrib.distributions.bijectors.Inline}
+*   @{tf.contrib.distributions.bijectors.Invert}
+*   @{tf.contrib.distributions.bijectors.PowerTransform}
+*   @{tf.contrib.distributions.bijectors.SigmoidCentered}
+*   @{tf.contrib.distributions.bijectors.SoftmaxCentered}
+*   @{tf.contrib.distributions.bijectors.Softplus}
diff --git a/tensorflow/docs_src/api_guides/python/contrib.distributions.md b/tensorflow/docs_src/api_guides/python/contrib.distributions.md
index 2b43e1281d73068750fbdabaa19618d470d4e803..7a3d509b75198461430195aa70a336f94b7f8cfa 100644
--- a/tensorflow/docs_src/api_guides/python/contrib.distributions.md
+++ b/tensorflow/docs_src/api_guides/python/contrib.distributions.md
@@ -76,7 +76,7 @@ representing the posterior or posterior predictive.
 
 ## Kullback-Leibler Divergence
 
-*   @{tf.contrib.distributions.kl}
+*   @{tf.contrib.distributions.kl_divergence}
 *   @{tf.contrib.distributions.RegisterKL}
 
 ## Utilities
diff --git a/tensorflow/docs_src/api_guides/python/contrib.graph_editor.md b/tensorflow/docs_src/api_guides/python/contrib.graph_editor.md
index f6116240792455585f76c5a7e0498c89b51707da..de4f126507930331d348cc795bd03b9971778d07 100644
--- a/tensorflow/docs_src/api_guides/python/contrib.graph_editor.md
+++ b/tensorflow/docs_src/api_guides/python/contrib.graph_editor.md
@@ -137,16 +137,16 @@ which to operate must always be given explicitly. This is the reason why
 
 ## Module: reroute
 
-*   @{tf.contrib.graph_editor.reroute.swap_ts}
-*   @{tf.contrib.graph_editor.reroute.reroute_ts}
-*   @{tf.contrib.graph_editor.reroute.swap_inputs}
-*   @{tf.contrib.graph_editor.reroute.reroute_inputs}
-*   @{tf.contrib.graph_editor.reroute.swap_outputs}
-*   @{tf.contrib.graph_editor.reroute.reroute_outputs}
-*   @{tf.contrib.graph_editor.reroute.swap_ios}
-*   @{tf.contrib.graph_editor.reroute.reroute_ios}
-*   @{tf.contrib.graph_editor.reroute.remove_control_inputs}
-*   @{tf.contrib.graph_editor.reroute.add_control_inputs}
+*   @{tf.contrib.graph_editor.swap_ts}
+*   @{tf.contrib.graph_editor.reroute_ts}
+*   @{tf.contrib.graph_editor.swap_inputs}
+*   @{tf.contrib.graph_editor.reroute_inputs}
+*   @{tf.contrib.graph_editor.swap_outputs}
+*   @{tf.contrib.graph_editor.reroute_outputs}
+*   @{tf.contrib.graph_editor.swap_ios}
+*   @{tf.contrib.graph_editor.reroute_ios}
+*   @{tf.contrib.graph_editor.remove_control_inputs}
+*   @{tf.contrib.graph_editor.add_control_inputs}
 
 ## Module: edit
 
diff --git a/tensorflow/docs_src/api_guides/python/contrib.integrate.md b/tensorflow/docs_src/api_guides/python/contrib.integrate.md
index e6b730b2035a7aec4ff612e7721ac18a2d642508..e95b5a2e68685fc4828eb64fbc3e363d8a1add31 100644
--- a/tensorflow/docs_src/api_guides/python/contrib.integrate.md
+++ b/tensorflow/docs_src/api_guides/python/contrib.integrate.md
@@ -33,7 +33,7 @@ plt.plot(x, z)
 ```
 
 <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
-<img style="width:100%" src="../../images/lorenz_attractor.png" alt>
+<img style="width:100%" src="https://www.tensorflow.org/images/lorenz_attractor.png" alt>
 </div>
 
 ## Ops
diff --git a/tensorflow/docs_src/api_guides/python/contrib.layers.md b/tensorflow/docs_src/api_guides/python/contrib.layers.md
index a829c0a02cab2f63852d562a1e38152eec2f19bd..d4cda3a25454cd1db344c10ed07ada520bd45da9 100644
--- a/tensorflow/docs_src/api_guides/python/contrib.layers.md
+++ b/tensorflow/docs_src/api_guides/python/contrib.layers.md
@@ -20,7 +20,6 @@ common machine learning algorithms.
 *   @{tf.contrib.layers.flatten}
 *   @{tf.contrib.layers.fully_connected}
 *   @{tf.contrib.layers.layer_norm}
-*   @{tf.contrib.layers.linear}
 *   @{tf.contrib.layers.max_pool2d}
 *   @{tf.contrib.layers.one_hot_encoding}
 *   @{tf.nn.relu}
diff --git a/tensorflow/docs_src/api_guides/python/contrib.linalg.md b/tensorflow/docs_src/api_guides/python/contrib.linalg.md
index efc2d76ef1ef042a55e5a483976bfb1b8e4764f4..b2c7fcf6bbac58ea782c73d9651c0554d2ba1e8f 100644
--- a/tensorflow/docs_src/api_guides/python/contrib.linalg.md
+++ b/tensorflow/docs_src/api_guides/python/contrib.linalg.md
@@ -21,7 +21,7 @@ Subclasses of `LinearOperator` provide a access to common methods on a
 *   @{tf.contrib.linalg.LinearOperatorDiag}
 *   @{tf.contrib.linalg.LinearOperatorIdentity}
 *   @{tf.contrib.linalg.LinearOperatorScaledIdentity}
-*   @{tf.contrib.linalg.LinearOperatorMatrix}
+*   @{tf.contrib.linalg.LinearOperatorFullMatrix}
 *   @{tf.contrib.linalg.LinearOperatorTriL}
 *   @{tf.contrib.linalg.LinearOperatorUDVHUpdate}
 
diff --git a/tensorflow/docs_src/api_guides/python/contrib.losses.md b/tensorflow/docs_src/api_guides/python/contrib.losses.md
index cb93f9d549a76ced233693666b1fe3186bfcd435..8c289dd55631a94546aeab129edf4d530eecaeda 100644
--- a/tensorflow/docs_src/api_guides/python/contrib.losses.md
+++ b/tensorflow/docs_src/api_guides/python/contrib.losses.md
@@ -13,8 +13,8 @@ of samples in the batch and `d1` ... `dN` are the remaining dimensions.
 It is common, when training with multiple loss functions, to adjust the relative
 strengths of individual losses. This is performed by rescaling the losses via
 a `weight` parameter passed to the loss functions. For example, if we were
-training with both log_loss and sum_of_squares_loss, and we wished that the
-log_loss penalty be twice as severe as the sum_of_squares_loss, we would
+training with both log_loss and mean_square_error, and we wished that the
+log_loss penalty be twice as severe as the mean_square_error, we would
 implement this as:
 
 ```python
@@ -22,7 +22,7 @@ implement this as:
   tf.contrib.losses.log(predictions, labels, weight=2.0)
 
   # Uses default weight of 1.0
-  tf.contrib.losses.sum_of_squares(predictions, labels)
+  tf.contrib.losses.mean_square_error(predictions, labels)
 
   # All the losses are collected into the `GraphKeys.LOSSES` collection.
   losses = tf.get_collection(tf.GraphKeys.LOSSES)
@@ -74,7 +74,7 @@ these predictions.
   predictions = MyModelPredictions(images)
 
   weight = tf.cast(tf.greater(depths, 0), tf.float32)
-  loss  = tf.contrib.losses.sum_of_squares(predictions, depths, weight)
+  loss  = tf.contrib.losses.mean_square_error(predictions, depths, weight)
 ```
 
 Note that when using weights for the losses, the final average is computed
@@ -100,7 +100,7 @@ weighted average over the individual prediction errors:
 
   weight = MyComplicatedWeightingFunction(labels)
   weight = tf.div(weight, tf.size(weight))
-  loss = tf.contrib.losses.sum_of_squares(predictions, depths, weight)
+  loss = tf.contrib.losses.mean_square_error(predictions, depths, weight)
 ```
 
 @{tf.contrib.losses.absolute_difference}
@@ -118,9 +118,4 @@ weighted average over the individual prediction errors:
 @{tf.contrib.losses.softmax_cross_entropy}
 @{tf.contrib.losses.sparse_softmax_cross_entropy}
 
-The following are deprecated in favor of `mean_pairwise_squared_error` and
-`mean_squared_error`.
-@{tf.contrib.losses.sum_of_pairwise_squares}
-@{tf.contrib.losses.sum_of_squares}
-
 
diff --git a/tensorflow/docs_src/api_guides/python/index.md b/tensorflow/docs_src/api_guides/python/index.md
index 177f19bc80da0a91beb2ba225b13ac724222c19c..19d50926d8821c17350d57909a8830c4cf00ba0a 100644
--- a/tensorflow/docs_src/api_guides/python/index.md
+++ b/tensorflow/docs_src/api_guides/python/index.md
@@ -40,7 +40,7 @@
 *   [Losses (contrib)](contrib.losses.md)
 *   [Metrics (contrib)](contrib.metrics.md)
 *   [Optimization (contrib)](contrib.opt.md)
-*   [Random variable transformations (contrib)](contrib.distributions.bijector.md)
+*   [Random variable transformations (contrib)](contrib.distributions.bijectors.md)
 *   [RNN and Cells (contrib)](contrib.rnn.md)
 *   [Seq2seq Library (contrib)](contrib.seq2seq.md)
 *   [Staging (contrib)](contrib.staging.md)
diff --git a/tensorflow/docs_src/community/documentation.md b/tensorflow/docs_src/community/documentation.md
index be18d3e8be3dec50682971f4b23eae833e51c6b2..31a10d1f15d437810a9acb04d34bdae97195223c 100644
--- a/tensorflow/docs_src/community/documentation.md
+++ b/tensorflow/docs_src/community/documentation.md
@@ -4,12 +4,24 @@ We welcome contributions to the Tensorflow documentation from the community.
 This document explains how you can contribute to that documentation. In
 particular, this document explains the following:
 
-- Where the documentation is located.
-- How to make conformant edits.
-- How to build and test your documentation changes before you submit them.
+* Where the documentation is located.
+* How to make conformant edits.
+* How to build and test your documentation changes before you submit them.
 
-You can view Tensorflow documentation on tensorflow.org, and you can view and
-edit the raw files on Github.
+You can view Tensorflow documentation on https://www.tensorflow.org, and you
+can view and edit the raw files on Github. We're publishing our docs on Github
+so everybody can contribute. Whatever gets checked in tensorflow/docs_src will
+be published soon after on https://www.tensorflow.org. 
+
+Republishing TensorFlow documentation in different forms is absolutely allowed,
+but we are unlikely to accept other documentation formats (or the tooling to
+generate them) into our repository. If you do choose to republish our
+documentation in another form, please be sure to include:
+
+* The version of the API this represents (i.e. r1.0, master, etc.)
+* The commit or version from which the documentation was generated
+* Where to get the latest documentation (that is, https://www.tensorflow.org)
+* The Apache 2.0 license.
 
 ## A Note on Versions
 
@@ -166,7 +178,7 @@ tensorflow`).  Run the following command to compile TensorFlow and generate the
 documentation in the `/tmp/tfdocs` dir:
 
     bazel run tools/docs:generate -- \
-              --src_dir=`pwd`/tensorflow/docs_src/ \
+              --src_dir="$(pwd)/docs_src/" \
               --output_dir=/tmp/tfdocs/
 
 Note: You must set `src_dir` and `output_dir` to absolute file paths.
diff --git a/tensorflow/docs_src/community/style_guide.md b/tensorflow/docs_src/community/style_guide.md
index 767e33c3d07c031525a40fbb4d605437a83bbde0..f90a6cf938dcbdc83971a68cf28ae8722d4537fb 100644
--- a/tensorflow/docs_src/community/style_guide.md
+++ b/tensorflow/docs_src/community/style_guide.md
@@ -162,9 +162,9 @@ operation.
   - `reuse`: `bool` indicator if the variable should be reused if
              it's present in the scope.
 
-* Layers that behave differently during training should have:
-  - `is_training`: `bool` to indicate if a training graph is been built.
-
+* Layers that behave differently during training should take:
+  - `is_training`: `bool` indicator to conditionally choose different 
+                   computation paths (e.g. using `tf.cond`) during execution.
 
 Example:
 
diff --git a/tensorflow/docs_src/deploy/distributed.md b/tensorflow/docs_src/deploy/distributed.md
index cdfb4672fa023ff2996cbe2def9dcc7006c9b69e..99390f7416c87ea76fae1469797f53073ef77aca 100644
--- a/tensorflow/docs_src/deploy/distributed.md
+++ b/tensorflow/docs_src/deploy/distributed.md
@@ -178,7 +178,7 @@ simplify the work of specifying a replicated model. Possible approaches include:
   values for the current parameters, compute gradients in parallel, and then
   apply them together. It is compatible with in-graph replication (e.g. using
   gradient averaging as in the
-  [CIFAR-10 multi-GPU trainer](https://www.tensorflow.org/code/tensorflow_models/tutorials/image/cifar10/cifar10_multi_gpu_train.py)),
+  [CIFAR-10 multi-GPU trainer](https://github.com/tensorflow/models/tree/master/tutorials/image/cifar10/cifar10_multi_gpu_train.py)),
   and between-graph replication (e.g. using the
   @{tf.train.SyncReplicasOptimizer}).
 
diff --git a/tensorflow/docs_src/extend/adding_an_op.md b/tensorflow/docs_src/extend/adding_an_op.md
index f54f79cbf4a9a799669d772e5144303e130d0a8f..a8c28e98c9b20c35f30192907b84abdbf4860c81 100644
--- a/tensorflow/docs_src/extend/adding_an_op.md
+++ b/tensorflow/docs_src/extend/adding_an_op.md
@@ -42,7 +42,7 @@ To incorporate your custom op you'll need to:
     Python @{tf.test.compute_gradient_error$gradient checker}.
     See
     [`relu_op_test.py`](https://www.tensorflow.org/code/tensorflow/python/kernel_tests/relu_op_test.py) as
-    an example that does tests the forward functions of Relu-like operators and
+    an example that tests the forward functions of Relu-like operators and
     their gradients.
 
 PREREQUISITES:
@@ -152,6 +152,163 @@ REGISTER_KERNEL_BUILDER(Name("ZeroOut").Device(DEVICE_CPU), ZeroOutOp);
 >   Consider using a [`ResourceMgr`](https://www.tensorflow.org/code/tensorflow/core/framework/resource_mgr.h)
 >   to keep track of op state.
 
+### Multi-threaded CPU kernels
+
+To write a multi-threaded CPU kernel, the Shard function in
+[`work_sharder.h`](https://www.tensorflow.org/code/tensorflow/core/framework/work_sharder.h)
+can be used. This function shards a computation function across the
+threads configured to be used for intra-op threading (see
+intra_op_parallelism_threads in
+[`config.proto`](https://www.tensorflow.org/code/tensorflow/core/protobuf/config.proto)).
+
+### GPU kernels
+
+A GPU kernel is implemented in two parts: the OpKernel and the CUDA kernel and
+its launch code.
+
+Sometimes the OpKernel implementation is common between a CPU and GPU kernel,
+such as around inspecting inputs and allocating outputs.  In that case, a
+suggested implementation is to:
+
+1. Define the OpKernel templated on the Device and the primitive type of the
+   tensor.
+2. To do the actual computation of the output, the Compute function calls a
+    templated functor struct.
+3. The specialization of that functor for the CPUDevice is defined in the same
+   file, but the specialization for the GPUDevice is defined in a .cu.cc file,
+   since it will be compiled with the CUDA compiler.
+
+<!--zippy-->
+
+Expand this to see the example implementation.
+
+```c++
+// example.h
+#ifndef KERNEL_EXAMPLE_H_
+#define KERNEL_EXAMPLE_H_
+
+template <typename Device, typename T>
+struct ExampleFunctor {
+  void operator()(const Device& d, int size, const T* in, T* out);
+};
+
+#endif KERNEL_EXAMPLE_H_
+```
+
+```c++
+// example.cc
+#define EIGEN_USE_THREADS
+#include "example.h"
+#include "tensorflow/core/framework/op_kernel.h"
+
+using namespace tensorflow;
+
+using CPUDevice = Eigen::ThreadPoolDevice;
+using GPUDevice = Eigen::GpuDevice;
+
+// CPU specialization of actual computation.
+template <typename T>
+struct ExampleFunctor<CPUDevice, T> {
+  void operator()(const CPUDevice& d, int size, const T* in, T* out) {
+    for (int i = 0; i < size; ++i) {
+      out[i] = 2 * in[i];
+    }
+  }
+};
+
+// OpKernel definition.
+// template parameter <T> is the datatype of the tensors.
+template <typename Device, typename T>
+class ExampleOp : public OpKernel {
+ public:
+  explicit ExampleOp(OpKernelConstruction* context) : OpKernel(context) {}
+
+  void Compute(OpKernelContext* context) override {
+    // Grab the input tensor
+    const Tensor& input_tensor = context->input(0);
+
+    // Create an output tensor
+    Tensor* output_tensor = NULL;
+    OP_REQUIRES_OK(context, context->allocate_output(0, input_tensor.shape(),
+                                                     &output_tensor));
+
+    // Do the computation.
+    OP_REQUIRES(context, input_tensor.NumElements() <= tensorflow::kint32max,
+                errors::InvalidArgument("Too many elements in tensor"));
+    ExampleFunctor<Device, T>()(
+        context->eigen_device<Device>(),
+        static_cast<int>(input_tensor.NumElements()),
+        input_tensor.flat<T>().data(),
+        output_tensor->flat<T>().data());
+  }
+};
+
+// Register the CPU kernels.
+#define REGISTER_CPU(T)                                          \
+  REGISTER_KERNEL_BUILDER(                                       \
+      Name("Example").Device(DEVICE_CPU).TypeConstraint<T>("T"), \
+      ExampleOp<CPUDevice, T>);
+REGISTER_CPU(float);
+REGISTER_CPU(int32);
+
+// Register the GPU kernels.
+#ifdef GOOGLE_CUDA
+#define REGISTER_GPU(T)                                          \
+  REGISTER_KERNEL_BUILDER(                                       \
+      Name("Example").Device(DEVICE_GPU).TypeConstraint<T>("T"), \
+      ExampleOp<GPUDevice, T>);
+REGISTER_GPU(float);
+REGISTER_GPU(int32);
+#endif  // GOOGLE_CUDA
+```
+
+```c++
+#ifdef GOOGLE_CUDA
+
+#define EIGEN_USE_GPU
+#define EIGEN_USE_THREADS
+
+#include "example.h"
+#include "tensorflow/core/util/cuda_kernel_helper.h"
+
+using namespace tensorflow;
+
+#define EIGEN_USE_GPU
+
+// Define the CUDA kernel.
+template <typename T>
+__global__ void ExampleCudaKernel(const int size, const T* in, T* out) {
+  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < size;
+       i += blockDim.x * gridDim.x) {
+    out[i] = 2 * ldg(in + i);
+  }
+}
+
+// Define the GPU implementation that launches the CUDA kernel.
+template <typename T>
+struct ExampleFunctor<GPUDevice, T> {
+  void operator()(const GPUDevice& d, int size, const T* in, T* out) {
+    // Launch the cuda kernel.
+    //
+    // See core/util/cuda_kernel_helper.h for example of computing
+    // block count and thread_per_block count.
+    int block_count = 1024;
+    int thread_per_block = 20;
+    ExampleCudaKernel<T>
+        <<<block_count, thread_per_block, 0, d.stream()>>>(size, in, out);
+  }
+};
+
+// Instantiate functors for the types of OpKernels registered.
+typedef Eigen::GpuDevice GPUDevice;
+template struct ExampleFunctor<GPUDevice, float>;
+template struct ExampleFunctor<GPUDevice, int32>;
+
+#endif  // GOOGLE_CUDA
+```
+
+<!--endzippy-->
+
 ## Build the op library
 ### Compile the op using your system compiler (TensorFlow binary installation)
 
@@ -160,7 +317,7 @@ or `clang` available on your system. The binary PIP package installs the header
 files and the library that you need to compile your op in locations that are
 system specific. However, the TensorFlow python library provides the
 `get_include` function to get the header directory.
-Here is the output of this function on a Ubuntu machine.
+Here is the output of this function on an Ubuntu machine.
 
 ```bash
 $ python
@@ -188,7 +345,7 @@ building the `.so` file.
 >   the older ABI. If you compile your op library with `gcc>=5`, add
 >   `-D_GLIBCXX_USE_CXX11_ABI=0` to the command line to make the library
 >   compatible with the older abi.
->   Furthermore if you are using TensorFlow package created from source remember to add `-cxxopt="-D_GLIBCXX_USE_CXX11_ABI=0"`
+>   Furthermore if you are using TensorFlow package created from source remember to add `--cxxopt="-D_GLIBCXX_USE_CXX11_ABI=0"`
 >   as bazel command to compile the Python package.
 
 ### Compile the op using bazel (TensorFlow source installation)
@@ -225,7 +382,7 @@ TensorFlow Python API provides the
 load the dynamic library and register the op with the TensorFlow
 framework. `load_op_library` returns a Python module that contains the Python
 wrappers for the op and the kernel. Thus, once you have built the op, you can
-do the following to run it from Python :
+do the following to run it from Python:
 
 ```python
 import tensorflow as tf
@@ -1058,6 +1215,8 @@ you'll need to specify the path explicitly in the second (g++) command above.
 For example, add `-L /usr/local/cuda-8.0/lib64/` if your CUDA is installed in
 `/usr/local/cuda-8.0`.
 
+>   Note in some linux settings, additional options to `nvcc` compiling step are needed. Add `-D_MWAITXINTRIN_H_INCLUDED` to the `nvcc` command line to avoid errors from `mwaitxintrin.h`.
+
 ### Implement the gradient in Python {#implement-gradient}
 
 Given a graph of ops, TensorFlow uses automatic differentiation
diff --git a/tensorflow/docs_src/extend/architecture.md b/tensorflow/docs_src/extend/architecture.md
index 42721eb488c70fdc5b350f7721623dc342720b01..21816502acec7abfca670cac1bceda3e29144b53 100644
--- a/tensorflow/docs_src/extend/architecture.md
+++ b/tensorflow/docs_src/extend/architecture.md
@@ -25,7 +25,7 @@ The TensorFlow runtime is a cross-platform library. Figure 1 illustrates its
 general architecture. A C API separates user level code in different languages
 from the core runtime.
 
-![TensorFlow Layers](../images/layers.png){: width="300"}
+![TensorFlow Layers](https://www.tensorflow.org/images/layers.png){: width="300"}
 
 **Figure 1**
 
@@ -57,7 +57,7 @@ Other tasks send updates to these parameters as they work on optimizing the
 parameters. This particular division of labor between tasks is not required, but
 it is common for distributed training.
 
-![TensorFlow Architecture Diagram](../images/diag1.svg){: width="500"}
+![TensorFlow Architecture Diagram](https://www.tensorflow.org/images/diag1.svg){: width="500"}
 
 **Figure 2**
 
@@ -91,7 +91,7 @@ In Figure 3, the client has built a graph that applies weights (w) to a
 feature vector (x), adds a bias term (b) and saves the result in a variable
 (s).
 
-![TensorFlow Architecture Diagram: Client](../images/graph_client.svg){: width="700"}
+![TensorFlow Architecture Diagram: Client](https://www.tensorflow.org/images/graph_client.svg){: width="700"}
 
 **Figure 3**
 
@@ -114,7 +114,7 @@ a step, it applies standard optimizations such as common subexpression
 elimination and constant folding. It then coordinates execution of the
 optimized subgraphs across a set of tasks.
 
-![TensorFlow Architecture Diagram: Master](../images/graph_master_cln.svg){: width="700"}
+![TensorFlow Architecture Diagram: Master](https://www.tensorflow.org/images/graph_master_cln.svg){: width="700"}
 
 **Figure 4**
 
@@ -123,7 +123,7 @@ Figure 5 shows a possible partition of our example graph. The distributed
 master has grouped the model parameters in order to place them together on the
 parameter server.
 
-![Partitioned Graph](../images/graph_split1.svg){: width="700"}
+![Partitioned Graph](https://www.tensorflow.org/images/graph_split1.svg){: width="700"}
 
 **Figure 5**
 
@@ -132,14 +132,14 @@ Where graph edges are cut by the partition, the distributed master inserts
 send and receive nodes to pass information between the distributed tasks
 (Figure 6).
 
-![Partitioned Graph](../images/graph_split2.svg){: width="700"}
+![Partitioned Graph](https://www.tensorflow.org/images/graph_split2.svg){: width="700"}
 
 **Figure 6**
 
 
 The distributed master then ships the graph pieces to the distributed tasks.
 
-![Partitioned Graph](../images/graph_workers_cln.svg){: width="700"}
+![Partitioned Graph](https://www.tensorflow.org/images/graph_workers_cln.svg){: width="700"}
 
 **Figure 7**
 
@@ -181,7 +181,7 @@ We also have preliminary support for NVIDIA's NCCL library for multi-GPU
 communication (see [`tf.contrib.nccl`](
 https://www.tensorflow.org/code/tensorflow/contrib/nccl/python/ops/nccl_ops.py)).
 
-![Partitioned Graph](../images/graph_send_recv.svg){: width="700"}
+![Partitioned Graph](https://www.tensorflow.org/images/graph_send_recv.svg){: width="700"}
 
 **Figure 8**
 
diff --git a/tensorflow/docs_src/extend/estimators.md b/tensorflow/docs_src/extend/estimators.md
index 28f62e01ab020a3f5194a5e423c57f68e37f145a..f972ee5f50ba001347252a4ec8c054ed701aa6db 100644
--- a/tensorflow/docs_src/extend/estimators.md
+++ b/tensorflow/docs_src/extend/estimators.md
@@ -37,14 +37,17 @@ measurements. You'll learn how to do the following:
 ## Prerequisites
 
 This tutorial assumes you already know tf.contrib.learn API basics, such as
-feature columns and `fit()` operations. If you've never used tf.contrib.learn
-before, or need a refresher, you should first review the following tutorials:
+feature columns, input functions, and `fit()`/`evaluate()`/`predict()`
+operations. If you've never used tf.contrib.learn before, or need a refresher,
+you should first review the following tutorials:
 
 *   @{$tflearn$tf.contrib.learn Quickstart}: Quick introduction to
     training a neural network using tf.contrib.learn.
 *   @{$wide$TensorFlow Linear Model Tutorial}: Introduction to
     feature columns, and an overview on building a linear classifier in
     tf.contrib.learn.
+*   @{$input_fn$Building Input Functions with tf.contrib.learn}: Overview of how
+    to construct an input_fn to preprocess and feed data into your models.
 
 ## An Abalone Age Predictor {#abalone-predictor}
 
@@ -72,7 +75,7 @@ for abalone:
 
 The label to predict is number of rings, as a proxy for abalone age.
 
-![Abalone shell](../images/abalone_shell.jpg) **[“Abalone
+![Abalone shell](https://www.tensorflow.org/abalone_shell.jpg) **[“Abalone
 shell”](https://www.flickr.com/photos/thenickster/16641048623/) (by [Nicki Dugan
 Pogue](https://www.flickr.com/photos/thenickster/), CC BY-SA 2.0)**
 
@@ -239,7 +242,7 @@ nn = tf.contrib.learn.Estimator(
 *   `params`: An optional dict of hyperparameters (e.g., learning rate, dropout)
     that will be passed into the `model_fn`.
 
-NOTE: Just like `tf.contrib.learn`'s predefined regressors and classifiers, the
+Note: Just like `tf.contrib.learn`'s predefined regressors and classifiers, the
 `Estimator` initializer also accepts the general configuration arguments
 `model_dir` and `config`.
 
@@ -252,7 +255,7 @@ code (highlighted in bold below), right after the logging configuration:
 <strong># Learning rate for the model
 LEARNING_RATE = 0.001</strong></code></pre>
 
-NOTE: Here, `LEARNING_RATE` is set to `0.001`, but you can tune this value as
+Note: Here, `LEARNING_RATE` is set to `0.001`, but you can tune this value as
 needed to achieve the best results during model training.
 
 Then, add the following code to `main()`, which creates the dict `model_params`
@@ -576,7 +579,7 @@ required arguments:
         algorithm
         (@{tf.train.RMSPropOptimizer})
 
-NOTE: The `optimize_loss` function supports additional optional arguments to
+Note: The `optimize_loss` function supports additional optional arguments to
 further configure the optimizer, such as for implementing decay. See the
 @{tf.contrib.layers.optimize_loss$API docs} for more info.
 
@@ -654,15 +657,30 @@ Add the following code to the end of `main()` to fit the neural network to the
 training data and evaluate accuracy:
 
 ```python
+def get_train_inputs():
+  x = tf.constant(training_set.data)
+  y = tf.constant(training_set.target)
+  return x, y
+
 # Fit
-nn.fit(x=training_set.data, y=training_set.target, steps=5000)
+nn.fit(input_fn=get_train_inputs, steps=5000)
+
+def get_test_inputs():
+  x = tf.constant(test_set.data)
+  y = tf.constant(test_set.target)
+  return x, y
 
 # Score accuracy
-ev = nn.evaluate(x=test_set.data, y=test_set.target, steps=1)
+ev = nn.evaluate(input_fn=get_test_inputs, steps=1)
 print("Loss: %s" % ev["loss"])
 print("Root Mean Squared Error: %s" % ev["rmse"])
 ```
 
+Note: The above code uses input functions to feed feature (`x`) and label (`y`)
+`Tensor`s into the model for both training (`get_train_inputs()`) and evaluation
+(`get_test_inputs()`). To learn more about input functions, see the tutorial
+@{$input_fn$Building Input Functions with tf.contrib.learn}.
+
 Then run the code. You should see output like the following:
 
 ```none
diff --git a/tensorflow/docs_src/get_started/embedding_viz.md b/tensorflow/docs_src/get_started/embedding_viz.md
index f512d5d809b24fbed148e41115f4c0975d581975..84245b11bea455c230c6c299706f6899479d4413 100644
--- a/tensorflow/docs_src/get_started/embedding_viz.md
+++ b/tensorflow/docs_src/get_started/embedding_viz.md
@@ -21,7 +21,7 @@ interested in word embeddings,
 gives a good introduction.
 
 <video autoplay loop style="max-width: 100%;">
-  <source src="../images/embedding-mnist.mp4" type="video/mp4">
+  <source src="https://www.tensorflow.org/images/embedding-mnist.mp4" type="video/mp4">
   Sorry, your browser doesn't support HTML5 video in MP4 format.
 </video>
 
@@ -173,7 +173,7 @@ last data point in the bottom right:
 
 Note in the example above that the last row doesn't have to be filled. For a
 concrete example of a sprite, see
-[this sprite image](../images/mnist_10k_sprite.png) of 10,000 MNIST digits
+[this sprite image](https://www.tensorflow.org/images/mnist_10k_sprite.png) of 10,000 MNIST digits
 (100x100).
 
 Note: We currently support sprites up to 8192px X 8192px.
@@ -247,7 +247,7 @@ further analysis on their own with the "Isolate Points" button in the Inspector
 pane on the right hand side.
 
 
-![Selection of nearest neighbors](../images/embedding-nearest-points.png "Selection of nearest neighbors")
+![Selection of nearest neighbors](https://www.tensorflow.org/images/embedding-nearest-points.png "Selection of nearest neighbors")
 *Selection of the nearest neighbors of “important” in a word embedding dataset.*
 
 The combination of filtering with custom projection can be powerful. Below, we filtered
@@ -260,10 +260,10 @@ You can see that on the right side we have “ideas”, “science”, “perspe
 <table width="100%;">
   <tr>
     <td style="width: 30%;">
-      <img src="../images/embedding-custom-controls.png" alt="Custom controls panel" title="Custom controls panel" />
+      <img src="https://www.tensorflow.org/images/embedding-custom-controls.png" alt="Custom controls panel" title="Custom controls panel" />
     </td>
     <td style="width: 70%;">
-      <img src="../images/embedding-custom-projection.png" alt="Custom projection" title="Custom projection" />
+      <img src="https://www.tensorflow.org/images/embedding-custom-projection.png" alt="Custom projection" title="Custom projection" />
     </td>
   </tr>
   <tr>
@@ -284,4 +284,4 @@ projection) as a small file. The Projector can then be pointed to a set of one
 or more of these files, producing the panel below. Other users can then walk
 through a sequence of bookmarks.
 
-<img src="../images/embedding-bookmark.png" alt="Bookmark panel" style="width:300px;">
+<img src="https://www.tensorflow.org/images/embedding-bookmark.png" alt="Bookmark panel" style="width:300px;">
diff --git a/tensorflow/docs_src/get_started/get_started.md b/tensorflow/docs_src/get_started/get_started.md
index 6bee7529d0a787dec79ee4dfb6db58f90e3a2c0e..00cc10cd347143f23b44549672c9bb7f56eaaac6 100644
--- a/tensorflow/docs_src/get_started/get_started.md
+++ b/tensorflow/docs_src/get_started/get_started.md
@@ -123,7 +123,7 @@ TensorFlow provides a utility called TensorBoard that can display a picture of
 the computational graph. Here is a screenshot showing how TensorBoard
 visualizes the graph:
 
-![TensorBoard screenshot](../images/getting_started_add.png)
+![TensorBoard screenshot](https://www.tensorflow.org/images/getting_started_add.png)
 
 As it stands, this graph is not especially interesting because it always
 produces a constant result. A graph can be parameterized to accept external
@@ -154,7 +154,7 @@ resulting in the output
 
 In TensorBoard, the graph looks like this:
 
-![TensorBoard screenshot](../images/getting_started_adder.png)
+![TensorBoard screenshot](https://www.tensorflow.org/images/getting_started_adder.png)
 
 We can make the computational graph more complex by adding another operation.
 For example,
@@ -170,7 +170,7 @@ produces the output
 
 The preceding computational graph would look as follows in TensorBoard:
 
-![TensorBoard screenshot](../images/getting_started_triple.png)
+![TensorBoard screenshot](https://www.tensorflow.org/images/getting_started_triple.png)
 
 In machine learning we will typically want a model that can take arbitrary
 inputs, such as the one above.  To make the model trainable, we need to be able
@@ -336,7 +336,7 @@ program your loss will not be exactly the same, because the model is initialized
 with random values.
 
 This more complicated program can still be visualized in TensorBoard
-![TensorBoard final model visualization](../images/getting_started_final.png)
+![TensorBoard final model visualization](https://www.tensorflow.org/images/getting_started_final.png)
 
 ## `tf.contrib.learn`
 
@@ -372,25 +372,36 @@ features = [tf.contrib.layers.real_valued_column("x", dimension=1)]
 estimator = tf.contrib.learn.LinearRegressor(feature_columns=features)
 
 # TensorFlow provides many helper methods to read and set up data sets.
-# Here we use `numpy_input_fn`. We have to tell the function how many batches
+# Here we use two data sets: one for training and one for evaluation
+# We have to tell the function how many batches
 # of data (num_epochs) we want and how big each batch should be.
-x = np.array([1., 2., 3., 4.])
-y = np.array([0., -1., -2., -3.])
-input_fn = tf.contrib.learn.io.numpy_input_fn({"x":x}, y, batch_size=4,
+x_train = np.array([1., 2., 3., 4.])
+y_train = np.array([0., -1., -2., -3.])
+x_eval = np.array([2., 5., 8., 1.])
+y_eval = np.array([-1.01, -4.1, -7, 0.])
+input_fn = tf.contrib.learn.io.numpy_input_fn({"x":x_train}, y_train,
+                                              batch_size=4,
                                               num_epochs=1000)
+eval_input_fn = tf.contrib.learn.io.numpy_input_fn(
+    {"x":x_eval}, y_eval, batch_size=4, num_epochs=1000)
 
-# We can invoke 1000 training steps by invoking the `fit` method and passing the
+# We can invoke 1000 training steps by invoking the  method and passing the
 # training data set.
 estimator.fit(input_fn=input_fn, steps=1000)
 
-# Here we evaluate how well our model did. In a real example, we would want
-# to use a separate validation and testing data set to avoid overfitting.
-print(estimator.evaluate(input_fn=input_fn))
+# Here we evaluate how well our model did.
+train_loss = estimator.evaluate(input_fn=input_fn)
+eval_loss = estimator.evaluate(input_fn=eval_input_fn)
+print("train loss: %r"% train_loss)
+print("eval loss: %r"% eval_loss)
 ```
 When run, it produces
 ```
-    {'global_step': 1000, 'loss': 1.9650059e-11}
+    train loss: {'global_step': 1000, 'loss': 4.3049088e-08}
+    eval loss: {'global_step': 1000, 'loss': 0.0025487561}
 ```
+Notice how our eval data has a higher loss, but it is still close to zero.
+That means we are learning properly.
 
 ### A custom model
 
@@ -432,19 +443,25 @@ def model(features, labels, mode):
       train_op=train)
 
 estimator = tf.contrib.learn.Estimator(model_fn=model)
-# define our data set
-x = np.array([1., 2., 3., 4.])
-y = np.array([0., -1., -2., -3.])
-input_fn = tf.contrib.learn.io.numpy_input_fn({"x": x}, y, 4, num_epochs=1000)
+# define our data sets
+x_train = np.array([1., 2., 3., 4.])
+y_train = np.array([0., -1., -2., -3.])
+x_eval = np.array([2., 5., 8., 1.])
+y_eval = np.array([-1.01, -4.1, -7, 0.])
+input_fn = tf.contrib.learn.io.numpy_input_fn({"x": x_train}, y_train, 4, num_epochs=1000)
 
 # train
 estimator.fit(input_fn=input_fn, steps=1000)
-# evaluate our model
-print(estimator.evaluate(input_fn=input_fn, steps=10))
+# Here we evaluate how well our model did. 
+train_loss = estimator.evaluate(input_fn=input_fn)
+eval_loss = estimator.evaluate(input_fn=eval_input_fn)
+print("train loss: %r"% train_loss)
+print("eval loss: %r"% eval_loss)
 ```
 When run, it produces
-```python
-{'loss': 5.9819476e-11, 'global_step': 1000}
+```
+train loss: {'global_step': 1000, 'loss': 4.9380226e-11}
+eval loss: {'global_step': 1000, 'loss': 0.01010081}
 ```
 
 Notice how the contents of the custom `model()` function are very similar
diff --git a/tensorflow/docs_src/get_started/graph_viz.md b/tensorflow/docs_src/get_started/graph_viz.md
index b69103299ea151a948954b9598a36f9a4a12f969..06ec427b757d6a34270b646341786bc8925473d5 100644
--- a/tensorflow/docs_src/get_started/graph_viz.md
+++ b/tensorflow/docs_src/get_started/graph_viz.md
@@ -2,7 +2,7 @@
 
 TensorFlow computation graphs are powerful but complicated. The graph visualization can help you understand and debug them. Here's an example of the visualization at work.
 
-![Visualization of a TensorFlow graph](../images/graph_vis_animation.gif "Visualization of a TensorFlow graph")
+![Visualization of a TensorFlow graph](https://www.tensorflow.org/images/graph_vis_animation.gif "Visualization of a TensorFlow graph")
 *Visualization of a TensorFlow graph.*
 
 To see your own graph, run TensorBoard pointing it to the log directory of the job, click on the graph tab on the top pane and select the appropriate run using the menu at the upper left corner. For in depth information on how to run TensorBoard and make sure you are logging all the necessary information, see @{$summaries_and_tensorboard$TensorBoard: Visualizing Learning}.
@@ -43,10 +43,10 @@ expanded states.
 <table width="100%;">
   <tr>
     <td style="width: 50%;">
-      <img src="../images/pool1_collapsed.png" alt="Unexpanded name scope" title="Unexpanded name scope" />
+      <img src="https://www.tensorflow.org/images/pool1_collapsed.png" alt="Unexpanded name scope" title="Unexpanded name scope" />
     </td>
     <td style="width: 50%;">
-      <img src="../images/pool1_expanded.png" alt="Expanded name scope" title="Expanded name scope" />
+      <img src="https://www.tensorflow.org/images/pool1_expanded.png" alt="Expanded name scope" title="Expanded name scope" />
     </td>
   </tr>
   <tr>
@@ -87,10 +87,10 @@ and the auxiliary area.
 <table width="100%;">
   <tr>
     <td style="width: 50%;">
-      <img src="../images/conv_1.png" alt="conv_1 is part of the main graph" title="conv_1 is part of the main graph" />
+      <img src="https://www.tensorflow.org/images/conv_1.png" alt="conv_1 is part of the main graph" title="conv_1 is part of the main graph" />
     </td>
     <td style="width: 50%;">
-      <img src="../images/save.png" alt="save is extracted as auxiliary node" title="save is extracted as auxiliary node" />
+      <img src="https://www.tensorflow.org/images/save.png" alt="save is extracted as auxiliary node" title="save is extracted as auxiliary node" />
     </td>
   </tr>
   <tr>
@@ -114,10 +114,10 @@ specific set of nodes.
 <table width="100%;">
   <tr>
     <td style="width: 50%;">
-      <img src="../images/series.png" alt="Sequence of nodes" title="Sequence of nodes" />
+      <img src="https://www.tensorflow.org/images/series.png" alt="Sequence of nodes" title="Sequence of nodes" />
     </td>
     <td style="width: 50%;">
-      <img src="../images/series_expanded.png" alt="Expanded sequence of nodes" title="Expanded sequence of nodes" />
+      <img src="https://www.tensorflow.org/images/series_expanded.png" alt="Expanded sequence of nodes" title="Expanded sequence of nodes" />
     </td>
   </tr>
   <tr>
@@ -135,15 +135,15 @@ for constants and summary nodes. To summarize, here's a table of node symbols:
 
 Symbol | Meaning
 --- | ---
-![Name scope](../images/namespace_node.png "Name scope") | *High-level* node representing a name scope. Double-click to expand a high-level node.
-![Sequence of unconnected nodes](../images/horizontal_stack.png "Sequence of unconnected nodes") | Sequence of numbered nodes that are not connected to each other.
-![Sequence of connected nodes](../images/vertical_stack.png "Sequence of connected nodes") | Sequence of numbered nodes that are connected to each other.
-![Operation node](../images/op_node.png "Operation node") | An individual operation node.
-![Constant node](../images/constant.png "Constant node") | A constant.
-![Summary node](../images/summary.png "Summary node") | A summary node.
-![Data flow edge](../images/dataflow_edge.png "Data flow edge") | Edge showing the data flow between operations.
-![Control dependency edge](../images/control_edge.png "Control dependency edge") | Edge showing the control dependency between operations.
-![Reference edge](../images/reference_edge.png "Reference edge") | A reference edge showing that the outgoing operation node can mutate the incoming tensor.
+![Name scope](https://www.tensorflow.org/images/namespace_node.png "Name scope") | *High-level* node representing a name scope. Double-click to expand a high-level node.
+![Sequence of unconnected nodes](https://www.tensorflow.org/images/horizontal_stack.png "Sequence of unconnected nodes") | Sequence of numbered nodes that are not connected to each other.
+![Sequence of connected nodes](https://www.tensorflow.org/images/vertical_stack.png "Sequence of connected nodes") | Sequence of numbered nodes that are connected to each other.
+![Operation node](https://www.tensorflow.org/images/op_node.png "Operation node") | An individual operation node.
+![Constant node](https://www.tensorflow.org/images/constant.png "Constant node") | A constant.
+![Summary node](https://www.tensorflow.org/images/summary.png "Summary node") | A summary node.
+![Data flow edge](https://www.tensorflow.org/images/dataflow_edge.png "Data flow edge") | Edge showing the data flow between operations.
+![Control dependency edge](https://www.tensorflow.org/images/control_edge.png "Control dependency edge") | Edge showing the control dependency between operations.
+![Reference edge](https://www.tensorflow.org/images/reference_edge.png "Reference edge") | A reference edge showing that the outgoing operation node can mutate the incoming tensor.
 
 ## Interaction {#interaction}
 
@@ -161,10 +161,10 @@ right corner of the visualization.
 <table width="100%;">
   <tr>
     <td style="width: 50%;">
-      <img src="../images/infocard.png" alt="Info card of a name scope" title="Info card of a name scope" />
+      <img src="https://www.tensorflow.org/images/infocard.png" alt="Info card of a name scope" title="Info card of a name scope" />
     </td>
     <td style="width: 50%;">
-      <img src="../images/infocard_op.png" alt="Info card of operation node" title="Info card of operation node" />
+      <img src="https://www.tensorflow.org/images/infocard_op.png" alt="Info card of operation node" title="Info card of operation node" />
     </td>
   </tr>
   <tr>
@@ -207,10 +207,10 @@ The images below give an illustration for a piece of a real-life graph.
 <table width="100%;">
   <tr>
     <td style="width: 50%;">
-      <img src="../images/colorby_structure.png" alt="Color by structure" title="Color by structure" />
+      <img src="https://www.tensorflow.org/images/colorby_structure.png" alt="Color by structure" title="Color by structure" />
     </td>
     <td style="width: 50%;">
-      <img src="../images/colorby_device.png" alt="Color by device" title="Color by device" />
+      <img src="https://www.tensorflow.org/images/colorby_device.png" alt="Color by device" title="Color by device" />
     </td>
   </tr>
   <tr>
@@ -233,7 +233,7 @@ The images below show the CIFAR-10 model with tensor shape information:
 <table width="100%;">
   <tr>
     <td style="width: 100%;">
-      <img src="../images/tensor_shapes.png" alt="CIFAR-10 model with tensor shape information" title="CIFAR-10 model with tensor shape information" />
+      <img src="https://www.tensorflow.org/images/tensor_shapes.png" alt="CIFAR-10 model with tensor shape information" title="CIFAR-10 model with tensor shape information" />
     </td>
   </tr>
   <tr>
@@ -303,13 +303,13 @@ tensor output sizes.
 <table width="100%;">
   <tr style="height: 380px">
     <td>
-      <img src="../images/colorby_compute_time.png" alt="Color by compute time" title="Color by compute time"/>
+      <img src="https://www.tensorflow.org/images/colorby_compute_time.png" alt="Color by compute time" title="Color by compute time"/>
     </td>
     <td>
-      <img src="../images/run_metadata_graph.png" alt="Run metadata graph" title="Run metadata graph" />
+      <img src="https://www.tensorflow.org/images/run_metadata_graph.png" alt="Run metadata graph" title="Run metadata graph" />
     </td>
     <td>
-      <img src="../images/run_metadata_infocard.png" alt="Run metadata info card" title="Run metadata info card" />
+      <img src="https://www.tensorflow.org/images/run_metadata_infocard.png" alt="Run metadata info card" title="Run metadata info card" />
     </td>
   </tr>
 </table>
diff --git a/tensorflow/docs_src/get_started/input_fn.md b/tensorflow/docs_src/get_started/input_fn.md
index 74ed5fbebff26c3c31af533acaabab82d6a8bc05..a053617b5895bd2a92784e64b4dfd6f1ac35ab53 100644
--- a/tensorflow/docs_src/get_started/input_fn.md
+++ b/tensorflow/docs_src/get_started/input_fn.md
@@ -12,7 +12,7 @@ When training a neural network using tf.contrib.learn, it's possible to pass
 your feature and target data directly into your `fit`, `evaluate`, or `predict`
 operations. Here's an example taken from the @{$tflearn$tf.contrib.learn quickstart tutorial}:
 
-```py
+```python
 training_set = tf.contrib.learn.datasets.base.load_csv_with_header(
     filename=IRIS_TRAINING, target_dtype=np.int, features_dtype=np.float32)
 test_set = tf.contrib.learn.datasets.base.load_csv_with_header(
diff --git a/tensorflow/docs_src/get_started/leftnav_files b/tensorflow/docs_src/get_started/leftnav_files
index d688d27ae2ce119f9edd27a184c1e1e8b8c1f40c..812f248d3ebfdf7439d9324b47825c2facf951c2 100644
--- a/tensorflow/docs_src/get_started/leftnav_files
+++ b/tensorflow/docs_src/get_started/leftnav_files
@@ -9,3 +9,4 @@ monitors.md
 summaries_and_tensorboard.md
 embedding_viz.md
 graph_viz.md
+tensorboard_histograms.md
diff --git a/tensorflow/docs_src/get_started/mnist/beginners.md b/tensorflow/docs_src/get_started/mnist/beginners.md
index b9a85f3f676c6d51a19fa475dc56c6604a34c0ea..624d91647484bb0adf85b47179c2ac686ffc890f 100644
--- a/tensorflow/docs_src/get_started/mnist/beginners.md
+++ b/tensorflow/docs_src/get_started/mnist/beginners.md
@@ -15,7 +15,7 @@ MNIST is a simple computer vision dataset. It consists of images of handwritten
 digits like these:
 
 <div style="width:40%; margin:auto; margin-bottom:10px; margin-top:20px;">
-<img style="width:100%" src="../../images/MNIST.png">
+<img style="width:100%" src="https://www.tensorflow.org/images/MNIST.png">
 </div>
 
 It also includes labels for each image, telling us which digit it is. For
@@ -88,7 +88,7 @@ Each image is 28 pixels by 28 pixels. We can interpret this as a big array of
 numbers:
 
 <div style="width:50%; margin:auto; margin-bottom:10px; margin-top:20px;">
-<img style="width:100%" src="../../images/MNIST-Matrix.png">
+<img style="width:100%" src="https://www.tensorflow.org/images/MNIST-Matrix.png">
 </div>
 
 We can flatten this array into a vector of 28x28 = 784 numbers. It doesn't
@@ -110,7 +110,7 @@ Each entry in the tensor is a pixel intensity between 0 and 1, for a particular
 pixel in a particular image.
 
 <div style="width:40%; margin:auto; margin-bottom:10px; margin-top:20px;">
-<img style="width:100%" src="../../images/mnist-train-xs.png">
+<img style="width:100%" src="https://www.tensorflow.org/images/mnist-train-xs.png">
 </div>
 
 Each image in MNIST has a corresponding label, a number between 0 and 9
@@ -124,7 +124,7 @@ vector which is 1 in the \\(n\\)th dimension. For example, 3 would be
 `[55000, 10]` array of floats.
 
 <div style="width:40%; margin:auto; margin-bottom:10px; margin-top:20px;">
-<img style="width:100%" src="../../images/mnist-train-ys.png">
+<img style="width:100%" src="https://www.tensorflow.org/images/mnist-train-ys.png">
 </div>
 
 We're now ready to actually make our model!
@@ -157,7 +157,7 @@ classes. Red represents negative weights, while blue represents positive
 weights.
 
 <div style="width:40%; margin:auto; margin-bottom:10px; margin-top:20px;">
-<img style="width:100%" src="../../images/softmax-weights.png">
+<img style="width:100%" src="https://www.tensorflow.org/images/softmax-weights.png">
 </div>
 
 We also add some extra evidence called a bias. Basically, we want to be able
@@ -202,13 +202,13 @@ although with a lot more \\(x\\)s. For each output, we compute a weighted sum of
 the \\(x\\)s, add a bias, and then apply softmax.
 
 <div style="width:55%; margin:auto; margin-bottom:10px; margin-top:20px;">
-<img style="width:100%" src="../../images/softmax-regression-scalargraph.png">
+<img style="width:100%" src="https://www.tensorflow.org/images/softmax-regression-scalargraph.png">
 </div>
 
 If we write that out as equations, we get:
 
 <div style="width:52%; margin-left:25%; margin-bottom:10px; margin-top:20px;">
-<img style="width:100%" src="../../images/softmax-regression-scalarequation.png"
+<img style="width:100%" src="https://www.tensorflow.org/images/softmax-regression-scalarequation.png"
    alt="[y1, y2, y3] = softmax(W11*x1 + W12*x2 + W13*x3 + b1,  W21*x1 + W22*x2 + W23*x3 + b2,  W31*x1 + W32*x2 + W33*x3 + b3)">
 </div>
 
@@ -217,7 +217,7 @@ and vector addition. This is helpful for computational efficiency. (It's also
 a useful way to think.)
 
 <div style="width:50%; margin:auto; margin-bottom:10px; margin-top:20px;">
-<img style="width:100%" src="../../images/softmax-regression-vectorequation.png"
+<img style="width:100%" src="https://www.tensorflow.org/images/softmax-regression-vectorequation.png"
  alt="[y1, y2, y3] = softmax([[W11, W12, W13], [W21, W22, W23], [W31, W32, W33]]*[x1, x2, x3] + [b1, b2, b3])">
 </div>
 
@@ -362,7 +362,7 @@ minimize. Then it can apply your choice of optimization algorithm to modify the
 variables and reduce the loss.
 
 ```python
-train_step = tf.train.GradientDescentOptimizer(0.5).minimize(cross_entropy)
+train_step = tf.train.GradientDescentOptimizer(0.05).minimize(cross_entropy)
 ```
 
 In this case, we ask TensorFlow to minimize `cross_entropy` using the
diff --git a/tensorflow/docs_src/get_started/mnist/mechanics.md b/tensorflow/docs_src/get_started/mnist/mechanics.md
index b55a5c19ff9db1b407924d52b11079b4b16f5ff8..48d9a395f2859e81cf9627f37ce2677f3479ce22 100644
--- a/tensorflow/docs_src/get_started/mnist/mechanics.md
+++ b/tensorflow/docs_src/get_started/mnist/mechanics.md
@@ -34,7 +34,7 @@ MNIST is a classic problem in machine learning. The problem is to look at
 greyscale 28x28 pixel images of handwritten digits and determine which digit
 the image represents, for all the digits from zero to nine.
 
-![MNIST Digits](../../images/mnist_digits.png "MNIST Digits")
+![MNIST Digits](https://www.tensorflow.org/images/mnist_digits.png "MNIST Digits")
 
 For more information, refer to [Yann LeCun's MNIST page](http://yann.lecun.com/exdb/mnist/)
 or [Chris Olah's visualizations of MNIST](http://colah.github.io/posts/2014-10-Visualizing-MNIST/).
@@ -90,7 +90,7 @@ loss.
 and apply gradients.
 
 <div style="width:95%; margin:auto; margin-bottom:10px; margin-top:20px;">
-  <img style="width:100%" src="../../images/mnist_subgraph.png">
+  <img style="width:100%" src="https://www.tensorflow.org/images/mnist_subgraph.png">
 </div>
 
 ### Inference
@@ -384,7 +384,7 @@ summary_writer.add_summary(summary_str, step)
 When the events files are written, TensorBoard may be run against the training
 folder to display the values from the summaries.
 
-![MNIST TensorBoard](../../images/mnist_tensorboard.png "MNIST TensorBoard")
+![MNIST TensorBoard](https://www.tensorflow.org/images/mnist_tensorboard.png "MNIST TensorBoard")
 
 **NOTE**: For more info about how to build and run Tensorboard, please see the accompanying tutorial @{$summaries_and_tensorboard$Tensorboard: Visualizing Learning}.
 
diff --git a/tensorflow/docs_src/get_started/monitors.md b/tensorflow/docs_src/get_started/monitors.md
index 7db88c89812af50be6a52e110db4f9bc60aac2d0..d9c605b013cca5e4bad21fd7167a0cca345c3251 100644
--- a/tensorflow/docs_src/get_started/monitors.md
+++ b/tensorflow/docs_src/get_started/monitors.md
@@ -65,7 +65,7 @@ if __name__ == "__main__":
 
 Copy the above code into a file, and download the corresponding
 [training](http://download.tensorflow.org/data/iris_training.csv) and
-@{tf.test} data sets to the same
+[test](http://download.tensorflow.org/data/iris_test.csv) data sets to the same
 directory.
 
 In the following sections, you'll progressively make updates to the above code
@@ -401,6 +401,6 @@ Then navigate to `http://0.0.0.0:`*`<port_number>`* in your browser, where
 If you click on the accuracy field, you'll see an image like the following,
 which shows accuracy plotted against step count:
 
-![Accuracy over step count in TensorBoard](../images/validation_monitor_tensorboard_accuracy.png "Accuracy over step count in TensorBoard")
+![Accuracy over step count in TensorBoard](https://www.tensorflow.org/images/validation_monitor_tensorboard_accuracy.png "Accuracy over step count in TensorBoard")
 
 For more on using TensorBoard, see @{$summaries_and_tensorboard$TensorBoard: Visualizing Learning} and @{$graph_viz$TensorBoard: Graph Visualization}.
diff --git a/tensorflow/docs_src/get_started/summaries_and_tensorboard.md b/tensorflow/docs_src/get_started/summaries_and_tensorboard.md
index 6e06c9e41e4c16c370584f4402d42238adddebae..45d43e7a6e76ef9adc95cf2ebe5fe346de22caee 100644
--- a/tensorflow/docs_src/get_started/summaries_and_tensorboard.md
+++ b/tensorflow/docs_src/get_started/summaries_and_tensorboard.md
@@ -8,7 +8,7 @@ your TensorFlow graph, plot quantitative metrics about the execution of your
 graph, and show additional data like images that pass through it. When
 TensorBoard is fully configured, it looks like this:
 
-![MNIST TensorBoard](../images/mnist_tensorboard.png "MNIST TensorBoard")
+![MNIST TensorBoard](https://www.tensorflow.org/images/mnist_tensorboard.png "MNIST TensorBoard")
 
 <div class="video-wrapper">
   <iframe class="devsite-embedded-youtube-video" data-video-id="eBbEDRsCmv4"
diff --git a/tensorflow/docs_src/get_started/tensorboard_histograms.md b/tensorflow/docs_src/get_started/tensorboard_histograms.md
new file mode 100644
index 0000000000000000000000000000000000000000..b3dd13497eb598d7e86efae5529396bd472edc31
--- /dev/null
+++ b/tensorflow/docs_src/get_started/tensorboard_histograms.md
@@ -0,0 +1,243 @@
+# TensorBoard Histogram Dashboard
+
+The TensorBoard Histogram Dashboard displays how the distribution of some
+`Tensor` in your TensorFlow graph has changed over time. It does this by showing
+many histograms visualizations of your tensor at different points in time.
+
+## A Basic Example
+
+Let's start with a simple case: a normally-distributed variable, where the mean
+shifts over time.
+TensorFlow has an op
+[`tf.random_normal`](https://www.tensorflow.org/api_docs/python/tf/random_normal)
+which is perfect for this purpose. As is usually the case with TensorBoard, we
+will ingest data using a summary op; in this case,
+['tf.summary.histogram'](https://www.tensorflow.org/api_docs/python/tf/summary/histogram).
+For a primer on how summaries work, please see the general
+[TensorBoard tutorial](https://www.tensorflow.org/get_started/summaries_and_tensorboard).
+
+Here is a code snippet that will generate some histogram summaries containing
+normally distributed data, where the mean of the distribution increases over
+time.
+
+```python
+import tensorflow as tf
+
+k = tf.placeholder(tf.float32)
+
+# Make a normal distribution, with a shifting mean
+mean_moving_normal = tf.random_normal(shape=[1000], mean=(5*k), stddev=1)
+# Record that distribution into a histogram summary
+tf.summary.histogram("normal/moving_mean", mean_moving_normal)
+
+# Setup a session and summary writer
+sess = tf.Session()
+writer = tf.summary.FileWriter("/tmp/histogram_example")
+
+# Setup a loop and write the summaries to disk
+N = 400
+for step in range(N):
+  k_val = step/float(N)
+  summ = sess.run(summaries, feed_dict={k: k_val})
+  writer.add_summary(summ, global_step=step)
+```
+
+Once that code runs, we can load the data into TensorBoard via the command line:
+
+
+```sh
+tensorboard --logdir=/tmp/histogram_example
+```
+
+Once TensorBoard is running, load it in Chrome or Firefox and navigate to the
+Histogram Dashboard. Then we can see a histogram visualization for our normally
+distributed data.
+
+![](https://www.tensorflow.org/images/tensorboard/histogram_dashboard/1_moving_mean.png)
+
+`tf.summary.histogram` takes an arbitrarily sized and shaped Tensor, and
+compresses it into a histogram data structure consisting of many bins with
+widths and counts. For example, let's say we want to organize the numbers
+`[0.5, 1.1, 1.3, 2.2, 2.9, 2.99]` into bins. We could make three bins:
+* a bin
+containing everything from 0 to 1 (it would contain one element, 0.5),
+* a bin
+containing everything from 1-2 (it would contain two elements, 1.1 and 1.3),
+* a bin containing everything from 2-3 (it would contain three elements: 2.2,
+2.9 and 2.99).
+
+TensorFlow uses a similar approach to create bins, but unlike in our example, it
+doesn't create integer bins. For large, sparse datasets, that might result in
+many thousands of bins.
+Instead, [the bins are exponentially distributed, with many bins close to 0 and
+comparatively few bins for very large numbers.](https://github.com/tensorflow/tensorflow/blob/c8b59c046895fa5b6d79f73e0b5817330fcfbfc1/tensorflow/core/lib/histogram/histogram.cc#L28)
+However, visualizing exponentially-distributed bins is tricky; if height is used
+to encode count, then wider bins take more space, even if they have the same
+number of elements. Conversely, encoding count in the area makes height
+comparisons impossible. Instead, the histograms [resample the data](https://github.com/tensorflow/tensorflow/blob/17c47804b86e340203d451125a721310033710f1/tensorflow/tensorboard/components/tf_backend/backend.ts#L400)
+into uniform bins. This can lead to unfortunate artifacts in some cases.
+
+Each slice in the histogram visualizer displays a single histogram.
+The slices are organized by step;
+older slices (e.g. step 0) are further "back" and darker, while newer slices
+(e.g. step 400) are close to the foreground, and lighter in color.
+The y-axis on the right shows the step number.
+
+You can mouse over the histogram to see tooltips with some more detailed
+information. For example, in the following image we can see that the histogram
+at timestep 176 has a bin centered at 2.25 with 177 elements in that bin.
+
+![](https://www.tensorflow.org/images/tensorboard/histogram_dashboard/2_moving_mean_tooltip.png)
+
+Also, you may note that the histogram slices are not always evenly spaced in
+step count or time. This is because TensorBoard uses
+[reservoir sampling](https://en.wikipedia.org/wiki/Reservoir_sampling) to keep a
+subset of all the histograms, to save on memory. Reservoir sampling guarantees
+that every sample has an equal likelihood of being included, but because it is
+a randomized algorithm, the samples chosen don't occur at even steps.
+
+## Overlay Mode
+
+There is a control on the left of the dashboard that allows you to toggle the
+histogram mode from "offset" to "overlay":
+
+![](https://www.tensorflow.org/images/tensorboard/histogram_dashboard/3_overlay_offset.png)
+
+In "offset" mode, the visualization rotates 45 degrees, so that the individual
+histogram slices are no longer spread out in time, but instead are all plotted
+on the same y-axis.
+
+![](https://www.tensorflow.org/images/tensorboard/histogram_dashboard/4_overlay.png)
+Now, each slice is a separate line on the chart, and the y-axis shows the item
+count within each bucket. Darker lines are older, earlier steps, and lighter
+lines are more recent, later steps. Once again, you can mouse over the chart to
+see some additional information.
+
+![](https://www.tensorflow.org/images/tensorboard/histogram_dashboard/5_overlay_tooltips.png)
+
+In general, the overlay visualization is useful if you want to directly compare
+the counts of different histograms.
+
+## Multimodal Distributions
+
+The Histogram Dashboard is great for visualizing multimodal
+distributions. Let's construct a simple bimodal distribution by concatenating
+the outputs from two different normal distributions. The code will look like
+this:
+
+```python
+import tensorflow as tf
+
+k = tf.placeholder(tf.float32)
+
+# Make a normal distribution, with a shifting mean
+mean_moving_normal = tf.random_normal(shape=[1000], mean=(5*k), stddev=1)
+# Record that distribution into a histogram summary
+tf.summary.histogram("normal/moving_mean", mean_moving_normal)
+
+# Make a normal distribution with shrinking variance
+variance_shrinking_normal = tf.random_normal(shape=[1000], mean=0, stddev=1-(k))
+# Record that distribution too
+tf.summary.histogram("normal/shrinking_variance", variance_shrinking_normal)
+
+# Let's combine both of those distributions into one dataset
+normal_combined = tf.concat([mean_moving_normal, variance_shrinking_normal], 0)
+# We add another histogram summary to record the combined distribution
+tf.summary.histogram("normal/bimodal", normal_combined)
+
+summaries = tf.summary.merge_all()
+
+# Setup a session and summary writer
+sess = tf.Session()
+writer = tf.summary.FileWriter("/tmp/histogram_example")
+
+# Setup a loop and write the summaries to disk
+N = 400
+for step in range(N):
+  k_val = step/float(N)
+  summ = sess.run(summaries, feed_dict={k: k_val})
+  writer.add_summary(summ, global_step=step)
+```
+
+You already remember our "moving mean" normal distribution from the example
+above. Now we also have a "shrinking variance" distribution. Side-by-side, they
+look like this:
+![](https://www.tensorflow.org/images/tensorboard/histogram_dashboard/6_two_distributions.png)
+
+When we concatenate them, we get a chart that clearly reveals the divergent,
+bimodal structure:
+![](https://www.tensorflow.org/images/tensorboard/histogram_dashboard/7_bimodal.png)
+
+## Some more distributions
+
+Just for fun, let's generate and visualize a few more distributions, and then
+combine them all into one chart. Here's the code we'll use:
+
+```python
+import tensorflow as tf
+
+k = tf.placeholder(tf.float32)
+
+# Make a normal distribution, with a shifting mean
+mean_moving_normal = tf.random_normal(shape=[1000], mean=(5*k), stddev=1)
+# Record that distribution into a histogram summary
+tf.summary.histogram("normal/moving_mean", mean_moving_normal)
+
+# Make a normal distribution with shrinking variance
+variance_shrinking_normal = tf.random_normal(shape=[1000], mean=0, stddev=1-(k))
+# Record that distribution too
+tf.summary.histogram("normal/shrinking_variance", variance_shrinking_normal)
+
+# Let's combine both of those distributions into one dataset
+normal_combined = tf.concat([mean_moving_normal, variance_shrinking_normal], 0)
+# We add another histogram summary to record the combined distribution
+tf.summary.histogram("normal/bimodal", normal_combined)
+
+# Add a gamma distribution
+gamma = tf.random_gamma(shape=[1000], alpha=k)
+tf.summary.histogram("gamma", gamma)
+
+# And a poisson distribution
+poisson = tf.random_poisson(shape=[1000], lam=k)
+tf.summary.histogram("poisson", poisson)
+
+# And a uniform distribution
+uniform = tf.random_uniform(shape=[1000], maxval=k*10)
+tf.summary.histogram("uniform", uniform)
+
+# Finally, combine everything together!
+all_distributions = [mean_moving_normal, variance_shrinking_normal,
+                     gamma, poisson, uniform]
+all_combined = tf.concat(all_distributions, 0)
+tf.summary.histogram("all_combined", all_combined)
+
+summaries = tf.summary.merge_all()
+
+# Setup a session and summary writer
+sess = tf.Session()
+writer = tf.summary.FileWriter("/tmp/histogram_example")
+
+# Setup a loop and write the summaries to disk
+N = 400
+for step in range(N):
+  k_val = step/float(N)
+  summ = sess.run(summaries, feed_dict={k: k_val})
+  writer.add_summary(summ, global_step=step)
+```
+### Gamma Distribution
+![](https://www.tensorflow.org/images/tensorboard/histogram_dashboard/8_gamma.png)
+
+### Uniform Distribution
+![](https://www.tensorflow.org/images/tensorboard/histogram_dashboard/9_uniform.png)
+
+### Poisson Distribution
+![](https://www.tensorflow.org/images/tensorboard/histogram_dashboard/10_poisson.png)
+The poisson distribution is defined over the integers. So, all of the values
+being generated are perfect integers. The histogram compression moves the data
+into floating-point bins, causing the visualization to show little
+bumps over the integer values rather than perfect spikes.
+
+### All Together Now
+Finally, we can concatenate all of the data into one funny-looking curve.
+![](https://www.tensorflow.org/images/tensorboard/histogram_dashboard/11_all_combined.png)
+
diff --git a/tensorflow/docs_src/get_started/tflearn.md b/tensorflow/docs_src/get_started/tflearn.md
index 4a893e4a45bf43cf3c1d58d3025b4f70d20921d7..ed21969b3e9fe98428c19da6baa822a5395abd3e 100644
--- a/tensorflow/docs_src/get_started/tflearn.md
+++ b/tensorflow/docs_src/get_started/tflearn.md
@@ -118,7 +118,7 @@ The [Iris data set](https://en.wikipedia.org/wiki/Iris_flower_data_set) contains
 150 rows of data, comprising 50 samples from each of three related Iris species:
 *Iris setosa*, *Iris virginica*, and *Iris versicolor*.
 
-![Petal geometry compared for three iris species: Iris setosa, Iris virginica, and Iris versicolor](../images/iris_three_species.jpg) **From left to right,
+![Petal geometry compared for three iris species: Iris setosa, Iris virginica, and Iris versicolor](https://www.tensorflow.org/images/iris_three_species.jpg) **From left to right,
 [*Iris setosa*](https://commons.wikimedia.org/w/index.php?curid=170298) (by
 [Radomil](https://commons.wikimedia.org/wiki/User:Radomil), CC BY-SA 3.0),
 [*Iris versicolor*](https://commons.wikimedia.org/w/index.php?curid=248095) (by
@@ -278,7 +278,7 @@ Then, the code creates a `DNNClassifier` model using the following arguments:
 
 The `tf.contrib.learn` API uses input functions, which create the TensorFlow
 operations that generate data for the model. In this case, the data is small
-enough that it can be stored in @{tf.constant TensorFlow constants}. The
+enough that it can be stored in @{tf.constant$TensorFlow constants}. The
 following code produces the simplest possible input pipeline:
 
 ```python
diff --git a/tensorflow/docs_src/install/install_java.md b/tensorflow/docs_src/install/install_java.md
index 111b046689eb91ec2c864f274128434f29ffc8ff..72d0c7b1ff49a674eac5de1a35379d8452e793f9 100644
--- a/tensorflow/docs_src/install/install_java.md
+++ b/tensorflow/docs_src/install/install_java.md
@@ -50,7 +50,7 @@ As an example, these steps will create a Maven project that uses TensorFlow:
          <project>
              <modelVersion>4.0.0</modelVersion>
              <groupId>org.myorg</groupId>
-             <artifactId>label-image</artifactId>
+             <artifactId>hellotf</artifactId>
              <version>1.0-SNAPSHOT</version>
              <properties>
                <exec.mainClass>HelloTF</exec.mainClass>
@@ -106,7 +106,7 @@ As an example, these steps will create a Maven project that uses TensorFlow:
 
 
 The preceding command should output <tt>Hello from <i>version</i></tt>. If it
-does, you've succesfully set up TensorFlow for Java and are ready to use it in
+does, you've successfully set up TensorFlow for Java and are ready to use it in
 Maven projects. If not, check
 [Stack Overflow](http://stackoverflow.com/questions/tagged/tensorflow)
 for possible solutions.  You can skip reading the rest of this document.
@@ -211,15 +211,20 @@ two files are available to the JVM:
   * the downloaded `.jar` file
   * the extracted JNI library
 
-For example, the following command line executes the `HelloTF` program:
+For example, the following command line executes the `HelloTF` program on Linux
+and Mac OS X:
 
 <pre><b>java -cp libtensorflow-1.1.0.jar:. -Djava.library.path=./jni HelloTF</b></pre>
 
+And the following comand line executes the `HelloTF` program on Windows:
+
+<pre><b>java -cp libtensorflow-1.1.0.jar;. -Djava.library.path=jni HelloTF</b></pre>
+
 If the program prints <tt>Hello from <i>version</i></tt>, you've successfully
 installed TensorFlow for Java and are ready to use the API.  If the program
 outputs something else, check
-[Stack Overflow](http://stackoverflow.com/questions/tagged/tensorflow)
-for possible solutions.
+[Stack Overflow](http://stackoverflow.com/questions/tagged/tensorflow) for
+possible solutions.
 
 
 ### Advanced Example
diff --git a/tensorflow/docs_src/install/install_mac.md b/tensorflow/docs_src/install/install_mac.md
index 85b4dcacb2a7ffadbde51a0ebd2bf3d05cbb5af1..73e143b145b86a66f3d6be7f5c04a9a69933041c 100644
--- a/tensorflow/docs_src/install/install_mac.md
+++ b/tensorflow/docs_src/install/install_mac.md
@@ -2,55 +2,7 @@
 
 This guide explains how to install TensorFlow on Mac OS X.
 
-## Determine which TensorFlow to install
-
-You must choose the type of TensorFlow to install.  Your choices are as follows:
-
-  * **TensorFlow with CPU support only**. If your system does not have a
-    NVIDIA CUDA® GPU, you should install this version. Note that TensorFlow
-    with CPU support is typically easier to install than TensorFlow with
-    GPU support. Therefore, even if you have an NVIDIA CUDA GPU, we recommend
-    installing this version first as a diagnostic step just in case you run
-    into problems installing TensorFlow with GPU support.
-  * **TensorFlow with GPU support**. TensorFlow programs typically run
-    significantly faster on a GPU than on a CPU. Therefore, if your system has
-    a NVIDIA CUDA GPU meeting the prerequisites shown below and you need
-    to run performance-critical applications, you should ultimately
-    install this version.
-
-
-### Requirements to run TensorFlow with GPU support
-
-If you are installing TensorFlow with GPU support using one of the mechanisms
-described in this guide, then the following NVIDIA software must be
-installed on your system:
-
-
-  * CUDA Toolkit 8.0. For details, see
-    [NVIDIA's documentation](http://docs.nvidia.com/cuda/cuda-installation-guide-mac-os-x).
-    Ensure that you append the relevant CUDA pathnames to the
-    `LD_LIBRARY_PATH` environment variable as described in the
-    NVIDIA documentation.
-  * The NVIDIA drivers associated with CUDA Toolkit 8.0.
-  * cuDNN v5.1. For details, see
-    [NVIDIA's documentation](https://developer.nvidia.com/cudnn).
-    Ensure that you create the `CUDA_HOME` environment variable as described in
-    the NVIDIA documentation.
-  * GPU card with CUDA Compute Capability 3.0 or higher.  See
-    [NVIDIA documentation](https://developer.nvidia.com/cuda-gpus)
-    for a list of supported GPU cards.
-
-If you have an earlier version of the preceding packages, please upgrade to
-the specified versions. If upgrading is not possible, you may still run
-TensorFlow with GPU support, but only if you do both of the following:
-
-  * Install TensorFlow from sources as described
-    @{$install_sources$here}.
-  * Install or upgrade to at least the following NVIDIA versions:
-    * CUDA toolkit 7.0 or greater
-    * cuDNN v3 or greater
-    * GPU card with CUDA Compute Capability 3.0 or higher.
-
+Note: As of version 1.2, TensorFlow no longer provides GPU support on Mac OS X.
 
 ## Determine how to install TensorFlow
 
@@ -88,10 +40,6 @@ large (hundreds of MBs). You might choose the Docker installation if you are
 incorporating TensorFlow into a larger application architecture that
 already uses Docker.
 
-Important: Docker currently does not support TensorFlow with GPU support
-on Mac OS; that is, on Mac OS, Docker only supports TensorFlow with
-CPU support.
-
 In Anaconda, you may use conda to create a virtual environment.
 However, within Anaconda, we recommend installing TensorFlow with the
 `pip install` command, not with the `conda install` command.
@@ -139,8 +87,6 @@ Take the following steps to install TensorFlow with Virtualenv:
 
      <pre> $ <b>pip install --upgrade tensorflow</b>      # for Python 2.7
      $ <b>pip3 install --upgrade tensorflow</b>     # for Python 3.n
-     $ <b>pip install --upgrade tensorflow-gpu</b>  # for Python 2.7 and GPU
-     $ <b>pip3 install --upgrade tensorflow-gpu</b> # for Python 3.n and GPU </pre>
 
      If the preceding command succeed, skip Step 6. If it failed,
      perform Step 6.
@@ -154,19 +100,19 @@ Take the following steps to install TensorFlow with Virtualenv:
 
      where <i>tfBinaryURL</i> identifies the URL
      of the TensorFlow Python package. The appropriate value of
-     <i>tfBinaryURL</i> depends on the operating system,
-     Python version, and GPU support. Find the appropriate value for
+     <i>tfBinaryURL</i> depends on the operating system and
+     Python version. Find the appropriate value for
      <i>tfBinaryURL</i> for your system
      [here](#the_url_of_the_tensorflow_python_package).
      For example, if you are installing TensorFlow for Mac OS X,
-     Python 2.7, and CPU-only support, the command to install
+     Python 2.7, the command to install
      TensorFlow in the active Virtualenv is as follows:
 
      <pre> $ <b>pip3 install --upgrade \
-     https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.1.0-py2-none-any.whl</b></pre>
+     https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.1.0rc2-py2-none-any.whl</b></pre>
 
 If you encounter installation problems, see
-[Common Installation Problems](#CommonInstallationProblems).
+[Common Installation Problems](#common-installation-problems).
 
 
 ### Next Steps
@@ -263,10 +209,8 @@ take the following steps:
 
   1. Install TensorFlow by invoking **one** of the following commands:
 
-     <pre> $ <b>pip install tensorflow</b>      # Python 2.7; CPU support (no GPU support)
-     $ <b>pip3 install tensorflow</b>     # Python 3.n; CPU support (no GPU support)
-     $ <b>pip install tensorflow-gpu</b>  # Python 2.7;  GPU support
-     $ <b>pip3 install tensorflow-gpu</b> # Python 3.n; GPU support </pre>
+     <pre> $ <b>pip install tensorflow</b>      # Python 2.7; CPU support
+     $ <b>pip3 install tensorflow</b>     # Python 3.n; CPU support
 
      If the preceding command runs to completion, you should now
      [validate your installation](#ValidateYourInstallation).
@@ -279,17 +223,17 @@ take the following steps:
 
      where <i>tfBinaryURL</i> identifies the URL of the TensorFlow Python
      package. The appropriate value of <i>tfBinaryURL</i> depends on the
-     operating system, Python version, and GPU support. Find the appropriate
+     operating system and Python version. Find the appropriate
      value for <i>tfBinaryURL</i>
      [here](#the_url_of_the_tensorflow_python_package).  For example, if
-     you are installing TensorFlow for Mac OS, Python 2.7, and CPU-only
-     support, issue the following command:
+     you are installing TensorFlow for Mac OS and Python 2.7
+     issue the following command:
 
      <pre> $ <b>sudo pip3 install --upgrade \
-     https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.1.0-py2-none-any.whl</b> </pre>
+     https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.1.0rc2-py2-none-any.whl</b> </pre>
 
      If the preceding command fails, see
-     [Common installation problems](#CommonInstallationProblems).
+     [installation problems](#common-installation-problems).
 
 
 
@@ -320,9 +264,6 @@ Follow these steps to install TensorFlow through Docker.
 
 The remainder of this section explains how to launch a Docker container.
 
-**Note**: You may only launch a Docker container with CPU support.
-(Docker doesn't provide GPU support on Mac OS.)
-
 To launch a Docker container that holds the TensorFlow binary image,
 enter a command of the following format:
 
@@ -398,7 +339,7 @@ Take the following steps to install TensorFlow in an Anaconda environment:
      TensorFlow for Python 2.7:
 
      <pre> (tensorflow)$ <b>pip install --ignore-installed --upgrade \
-     https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.1.0-py2-none-any.whl</b></pre>
+     https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.1.0rc2-py2-none-any.whl</b></pre>
 
 
 <a name="ValidateYourInstallation"></a>
@@ -468,17 +409,6 @@ the `tensorflow` tag.
 <table>
 <tr> <th>Stack Overflow Link</th> <th>Error Message</th> </tr>
 
-<tr>
-  <td><a href="https://stackoverflow.com/q/36159194">36159194</a></td>
-  <td><pre>ImportError: libcudart.so.<i>Version</i>: cannot open shared object file:
-  No such file or directory</pre></td>
-</tr>
-
-<tr>
-  <td><a href="https://stackoverflow.com/q/41991101">41991101</a></td>
-  <td><pre>ImportError: libcudnn.<i>Version</i>: cannot open shared object file:
-  No such file or directory</pre></td>
-</tr>
 
 <tr>
   <td><a href="http://stackoverflow.com/q/42006320">42006320</a></td>
@@ -544,17 +474,6 @@ ImportError: cannot import name 'descriptor'</pre>
   </td>
 </tr>
 
-<tr>
-  <td><a href="http://stackoverflow.com/q/42073336">42073336</a></td>
-  <td>An <tt>import tensorflow</tt> statement triggers the following error:
-<pre>
->>> import tensorflow as tf
-I tensorflow/stream_executor/dso_loader.cc:108] successfully opened CUDA library libcublas.dylib locally
-I tensorflow/stream_executor/dso_loader.cc:108] successfully opened CUDA library libcudnn.dylib locally
-I tensorflow/stream_executor/dso_loader.cc:108] successfully opened CUDA library libcufft.dylib locally
-"import tensorflow" terminated by signal SIGSEGV (Address boundary error)
-</pre></td>
-</tr>
 
 <tr>
   <td><a href="http://stackoverflow.com/q/42075397">42075397</a></td>
@@ -572,15 +491,6 @@ Terminal window to review and agree to the Xcode license agreements.
 RuntimeError: Broken toolchain: cannot link a simple C program</pre>
 </td>
 
-<tr>
-  <td><a href="http://stackoverflow.com/questions/42376790/">42376790</a></td>
-  <td>After installing for GPU, an `import tensorflow` statement
-      triggers the following error:
-<pre>tensorflow/stream_executor/dso_loader.cc:135] successfully opened CUDA
-  library libcublas.8.0.dylib locally
-  Segmentation fault: 11
-</pre></td>
-</tr>
 
 </table>
 
@@ -595,45 +505,24 @@ The value you specify depends on three factors:
 
   * operating system
   * Python version
-  * CPU only vs. GPU support
 
 This section documents the relevant values for Mac OS installations.
 
 ### Python 2.7
 
-CPU only:
 
 <pre>
-https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.1.0-py2-none-any.whl
+https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.1.0rc2-py2-none-any.whl
 </pre>
 
-GPU support:
-
-<pre>
-https://storage.googleapis.com/tensorflow/mac/gpu/tensorflow_gpu-1.1.0-py2-none-any.whl
-</pre>
-
-Requires CUDA toolkit 8.0 and CuDNN v5. For other versions, see
-[Installing TensorFlow from Sources](install_sources.md).
-
 
 ### Python 3.4, 3.5, or 3.6
 
-CPU only:
 
 <pre>
-https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.1.0-py3-none-any.whl
+https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.1.0rc2-py3-none-any.whl
 </pre>
 
-GPU support:
-
-<pre>
-https://storage.googleapis.com/tensorflow/mac/gpu/tensorflow_gpu-1.1.0-py3-none-any.whl
-</pre>
-
-Requires CUDA toolkit 8.0 and CuDNN v5. For other versions, see
-[Installing TensorFlow from Sources](install_sources.md).
-
 
 
 <a name="Protobuf31"></a>
diff --git a/tensorflow/docs_src/performance/benchmarks.md b/tensorflow/docs_src/performance/benchmarks.md
new file mode 100644
index 0000000000000000000000000000000000000000..47ab028e2058b5d7f722604ecc3eeb9753270ead
--- /dev/null
+++ b/tensorflow/docs_src/performance/benchmarks.md
@@ -0,0 +1,414 @@
+# Benchmarks
+
+## Overview
+
+A selection of image classification models were tested across multiple platforms
+to create a point of reference for the TensorFlow community. The
+[Methodology](#methodology) section details how the tests were executed and has
+links to the scripts used.
+
+## Results for image classification models
+
+InceptionV3 ([arXiv:1512.00567](https://arxiv.org/abs/1512.00567)), ResNet-50
+([arXiv:1512.03385](https://arxiv.org/abs/1512.03385)), ResNet-152
+([arXiv:1512.03385](https://arxiv.org/abs/1512.03385)), VGG16
+([arXiv:1409.1556](https://arxiv.org/abs/1409.1556)), and
+[AlexNet](http://papers.nips.cc/paper/4824-imagenet-classification-with-deep-convolutional-neural-networks.pdf)
+were tested using the [ImageNet](http://www.image-net.org/) data set. Tests were
+run on Google Compute Engine, Amazon Elastic Compute Cloud (Amazon EC2), and an
+NVIDIA® DGX-1™. Most of the tests were run with both synthetic and real data.
+Testing with synthetic data was done by using a `tf.Variable` set to the same
+shape as the data expected by each model for ImageNet. We believe it is
+important to include real data measurements when benchmarking a platform. This
+load tests both the underlying hardware and the framework at preparing data for
+actual training. We start with synthetic data to remove disk I/O as a variable
+and to set a baseline. Real data is then used to verify that the TensorFlow
+input pipeline and the underlying disk I/O are saturating the compute units.
+
+### Training with NVIDIA® DGX-1™ (NVIDIA® Tesla® P100)
+
+<div style="width:95%; margin:auto; margin-bottom:10px; margin-top:20px;">
+  <img style="width:80%" src="../images/perf_summary_p100_single_server.png">
+</div>
+
+Details and additional results are in the [Details for NVIDIA® DGX-1™ (NVIDIA®
+Tesla® P100)](#details_for_nvidia_dgx-1tm_nvidia_tesla_p100) section.
+
+### Training with NVIDIA® Tesla® K80
+
+<div style="width:95%; margin:auto; margin-bottom:10px; margin-top:20px;">
+  <img style="width:80%" src="../images/perf_summary_k80_single_server.png">
+</div>
+
+Details and additional results are in the [Details for Google Compute Engine
+(NVIDIA® Tesla® K80)](#details_for_google_compute_engine_nvidia_tesla_k80) and
+[Details for Amazon EC2 (NVIDIA® Tesla®
+K80)](#details_for_amazon_ec2_nvidia_tesla_k80) sections.
+
+### Distributed training with NVIDIA® Tesla® K80
+
+<div style="width:95%; margin:auto; margin-bottom:10px; margin-top:20px;">
+  <img style="width:80%" src="../images/perf_summary_k80_aws_distributed.png">
+</div>
+
+Details and additional results are in the [Details for Amazon EC2 Distributed
+(NVIDIA® Tesla® K80)](#details_for_amazon_ec2_distributed_nvidia_tesla_k80)
+section.
+
+### Compare synthetic with real data training
+
+**NVIDIA® Tesla® P100**
+
+<div style="width:95%; margin:auto; margin-bottom:10px; margin-top:20px;">
+  <img style="width:35%" src="../images/perf_summary_p100_data_compare_inceptionv3.png">
+  <img style="width:35%" src="../images/perf_summary_p100_data_compare_resnet50.png">
+</div>
+
+**NVIDIA® Tesla® K80**
+
+<div style="width:95%; margin:auto; margin-bottom:10px; margin-top:20px;">
+  <img style="width:35%" src="../images/perf_summary_k80_data_compare_inceptionv3.png">
+  <img style="width:35%" src="../images/perf_summary_k80_data_compare_resnet50.png">
+</div>
+
+## Details for NVIDIA® DGX-1™ (NVIDIA® Tesla® P100)
+
+### Environment
+
+*   **Instance type**: NVIDIA® DGX-1™
+*   **GPU:** 8x NVIDIA® Tesla® P100
+*   **OS:** Ubuntu 16.04 LTS with tests run via Docker
+*   **CUDA / cuDNN:** 8.0 / 5.1
+*   **TensorFlow GitHub hash:** b1e174e
+*   **Benchmark GitHub hash:** 9165a70
+*   **Build Command:** `bazel build -c opt --copt=-march="haswell" --config=cuda
+    //tensorflow/tools/pip_package:build_pip_package`
+*   **Disk:** Local SSD
+*   **DataSet:** ImageNet
+*   **Test Date:** May 2017
+
+Batch size and optimizer used for each model are listed in the table below. In
+addition to the batch sizes listed in the table, InceptionV3, ResNet-50,
+ResNet-152, and VGG16 were tested with a batch size of 32. Those results are in
+the *other results* section.
+
+Options            | InceptionV3 | ResNet-50 | ResNet-152 | Alexnet | VGG16
+------------------ | ----------- | --------- | ---------- | ------- | -----
+Batch size per GPU | 64          | 64        | 64         | 512     | 64
+Optimizer          | sgd         | sgd       | sgd        | sgd     | sgd
+
+Configuration used for each model.
+
+Model       | variable_update        | local_parameter_device
+----------- | ---------------------- | ----------------------
+InceptionV3 | parameter_server       | cpu
+ResNet50    | parameter_server       | cpu
+ResNet152   | parameter_server       | cpu
+AlexNet     | replicated (with NCCL) | n/a
+VGG16       | replicated (with NCCL) | n/a
+
+### Results
+
+<div style="width:95%; margin:auto; margin-bottom:10px; margin-top:20px;">
+  <img style="width:80%" src="../images/perf_summary_p100_single_server.png">
+</div>
+
+<div style="width:95%; margin:auto; margin-bottom:10px; margin-top:20px;">
+  <img style="width:35%" src="../images/perf_dgx1_synth_p100_single_server_scaling.png">
+  <img style="width:35%" src="../images/perf_dgx1_real_p100_single_server_scaling.png">
+</div>
+
+**Training synthetic data**
+
+GPUs | InceptionV3 | ResNet-50 | ResNet-152 | Alexnet | VGG16
+---- | ----------- | --------- | ---------- | ------- | -----
+1    | 142         | 219       | 91.8       | 2987    | 154
+2    | 284         | 422       | 181        | 5658    | 295
+4    | 569         | 852       | 356        | 10509   | 584
+8    | 1131        | 1734      | 716        | 17822   | 1081
+
+**Training real data**
+
+GPUs | InceptionV3 | ResNet-50 | ResNet-152 | Alexnet | VGG16
+---- | ----------- | --------- | ---------- | ------- | -----
+1    | 142         | 218       | 91.4       | 2890    | 154
+2    | 278         | 425       | 179        | 4448    | 284
+4    | 551         | 853       | 359        | 7105    | 534
+8    | 1079        | 1630      | 708        | N/A     | 898
+
+Training AlexNet with real data on 8 GPUs was excluded from the graph and table
+above due to it maxing out the input pipeline.
+
+### Other Results
+
+The results below are all with a batch size of 32.
+
+**Training synthetic data**
+
+GPUs | InceptionV3 | ResNet-50 | ResNet-152 | VGG16
+---- | ----------- | --------- | ---------- | -----
+1    | 128         | 195       | 82.7       | 144
+2    | 259         | 368       | 160        | 281
+4    | 520         | 768       | 317        | 549
+8    | 995         | 1485      | 632        | 820
+
+**Training real data**
+
+GPUs | InceptionV3 | ResNet-50 | ResNet-152 | VGG16
+---- | ----------- | --------- | ---------- | -----
+1    | 130         | 193       | 82.4       | 144
+2    | 257         | 369       | 159        | 253
+4    | 507         | 760       | 317        | 457
+8    | 966         | 1410      | 609        | 690
+
+## Details for Google Compute Engine (NVIDIA® Tesla® K80)
+
+### Environment
+
+*   **Instance type**: n1-standard-32-k80x8
+*   **GPU:** 8x NVIDIA® Tesla® K80
+*   **OS:** Ubuntu 16.04 LTS
+*   **CUDA / cuDNN:** 8.0 / 5.1
+*   **TensorFlow GitHub hash:** b1e174e
+*   **Benchmark GitHub hash:** 9165a70
+*   **Build Command:** `bazel build -c opt --copt=-march="haswell" --config=cuda
+    //tensorflow/tools/pip_package:build_pip_package`
+*   **Disk:** 1.7 TB Shared SSD persistent disk (800 MB/s)
+*   **DataSet:** ImageNet
+*   **Test Date:** May 2017
+
+Batch size and optimizer used for each model are listed in the table below. In
+addition to the batch sizes listed in the table, InceptionV3 and ResNet-50 were
+tested with a batch size of 32. Those results are in the *other results*
+section.
+
+Options            | InceptionV3 | ResNet-50 | ResNet-152 | Alexnet | VGG16
+------------------ | ----------- | --------- | ---------- | ------- | -----
+Batch size per GPU | 64          | 64        | 32         | 512     | 32
+Optimizer          | sgd         | sgd       | sgd        | sgd     | sgd
+
+The configuration used for each model was `variable_update` equal to
+`parameter_server` and `local_parameter_device` equal to `cpu`.
+
+### Results
+
+<div style="width:95%; margin:auto; margin-bottom:10px; margin-top:20px;">
+  <img style="width:35%" src="../images/perf_gce_synth_k80_single_server_scaling.png">
+  <img style="width:35%" src="../images/perf_gce_real_k80_single_server_scaling.png">
+</div>
+
+**Training synthetic data**
+
+GPUs | InceptionV3 | ResNet-50 | ResNet-152 | Alexnet | VGG16
+---- | ----------- | --------- | ---------- | ------- | -----
+1    | 30.5        | 51.9      | 20.0       | 656     | 35.4
+2    | 57.8        | 99.0      | 38.2       | 1209    | 64.8
+4    | 116         | 195       | 75.8       | 2328    | 120
+8    | 227         | 387       | 148        | 4640    | 234
+
+**Training real data**
+
+GPUs | InceptionV3 | ResNet-50 | ResNet-152 | Alexnet | VGG16
+---- | ----------- | --------- | ---------- | ------- | -----
+1    | 30.6        | 51.2      | 20.0       | 639     | 34.2
+2    | 58.4        | 98.8      | 38.3       | 1136    | 62.9
+4    | 115         | 194       | 75.4       | 2067    | 118
+8    | 225         | 381       | 148        | 4056    | 230
+
+### Other Results
+
+**Training synthetic data**
+
+GPUs | InceptionV3 (batch size 32) | ResNet-50 (batch size 32)
+---- | --------------------------- | -------------------------
+1    | 29.3                        | 49.5
+2    | 55.0                        | 95.4
+4    | 109                         | 183
+8    | 216                         | 362
+
+**Training real data**
+
+GPUs | InceptionV3 (batch size 32) | ResNet-50 (batch size 32)
+---- | --------------------------- | -------------------------
+1    | 29.5                        | 49.3
+2    | 55.4                        | 95.3
+4    | 110                         | 186
+8    | 216                         | 359
+
+## Details for Amazon EC2 (NVIDIA® Tesla® K80)
+
+### Environment
+
+*   **Instance type**: p2.8xlarge
+*   **GPU:** 8x NVIDIA® Tesla® K80
+*   **OS:** Ubuntu 16.04 LTS
+*   **CUDA / cuDNN:** 8.0 / 5.1
+*   **TensorFlow GitHub hash:** b1e174e
+*   **Benchmark GitHub hash:** 9165a70
+*   **Build Command:** `bazel build -c opt --copt=-march="haswell" --config=cuda
+    //tensorflow/tools/pip_package:build_pip_package`
+*   **Disk:** 1TB Amazon EFS (burst 100 MiB/sec for 12 hours, continuous 50
+    MiB/sec)
+*   **DataSet:** ImageNet
+*   **Test Date:** May 2017
+
+Batch size and optimizer used for each model are listed in the table below. In
+addition to the batch sizes listed in the table, InceptionV3 and ResNet-50 were
+tested with a batch size of 32. Those results are in the *other results*
+section.
+
+Options            | InceptionV3 | ResNet-50 | ResNet-152 | Alexnet | VGG16
+------------------ | ----------- | --------- | ---------- | ------- | -----
+Batch size per GPU | 64          | 64        | 32         | 512     | 32
+Optimizer          | sgd         | sgd       | sgd        | sgd     | sgd
+
+Configuration used for each model.
+
+Model       | variable_update           | local_parameter_device
+----------- | ------------------------- | ----------------------
+InceptionV3 | parameter_server          | cpu
+ResNet-50   | replicated (without NCCL) | gpu
+ResNet-152  | replicated (without NCCL) | gpu
+AlexNet     | parameter_server          | gpu
+VGG16       | parameter_server          | gpu
+
+### Results
+
+<div style="width:95%; margin:auto; margin-bottom:10px; margin-top:20px;">
+  <img style="width:35%" src="../images/perf_aws_synth_k80_single_server_scaling.png">
+  <img style="width:35%" src="../images/perf_aws_real_k80_single_server_scaling.png">
+</div>
+
+**Training synthetic data**
+
+GPUs | InceptionV3 | ResNet-50 | ResNet-152 | Alexnet | VGG16
+---- | ----------- | --------- | ---------- | ------- | -----
+1    | 30.8        | 51.5      | 19.7       | 684     | 36.3
+2    | 58.7        | 98.0      | 37.6       | 1244    | 69.4
+4    | 117         | 195       | 74.9       | 2479    | 141
+8    | 230         | 384       | 149        | 4853    | 260
+
+**Training real data**
+
+GPUs | InceptionV3 | ResNet-50 | ResNet-152 | Alexnet | VGG16
+---- | ----------- | --------- | ---------- | ------- | -----
+1    | 30.5        | 51.3      | 19.7       | 674     | 36.3
+2    | 59.0        | 94.9      | 38.2       | 1227    | 67.5
+4    | 118         | 188       | 75.2       | 2201    | 136
+8    | 228         | 373       | 149        | N/A     | 242
+
+Training AlexNet with real data on 8 GPUs was excluded from the graph and table
+above due to our EFS setup not providing enough throughput.
+
+### Other Results
+
+**Training synthetic data**
+
+GPUs | InceptionV3 (batch size 32) | ResNet-50 (batch size 32)
+---- | --------------------------- | -------------------------
+1    | 29.9                        | 49.0
+2    | 57.5                        | 94.1
+4    | 114                         | 184
+8    | 216                         | 355
+
+**Training real data**
+
+GPUs | InceptionV3 (batch size 32) | ResNet-50 (batch size 32)
+---- | --------------------------- | -------------------------
+1    | 30.0                        | 49.1
+2    | 57.5                        | 95.1
+4    | 113                         | 185
+8    | 212                         | 353
+
+## Details for Amazon EC2 Distributed (NVIDIA® Tesla® K80)
+
+### Environment
+
+*   **Instance type**: p2.8xlarge
+*   **GPU:** 8x NVIDIA® Tesla® K80
+*   **OS:** Ubuntu 16.04 LTS
+*   **CUDA / cuDNN:** 8.0 / 5.1
+*   **TensorFlow GitHub hash:** b1e174e
+*   **Benchmark GitHub hash:** 9165a70
+*   **Build Command:** `bazel build -c opt --copt=-march="haswell" --config=cuda
+    //tensorflow/tools/pip_package:build_pip_package`
+*   **Disk:** 1.0 TB EFS (burst 100 MB/sec for 12 hours, continuous 50 MB/sec)
+*   **DataSet:** ImageNet
+*   **Test Date:** May 2017
+
+The batch size and optimizer used for the tests are listed in the table. In
+addition to the batch sizes listed in the table, InceptionV3 and ResNet-50 were
+tested with a batch size of 32. Those results are in the *other results*
+section.
+
+Options            | InceptionV3 | ResNet-50 | ResNet-152
+------------------ | ----------- | --------- | ----------
+Batch size per GPU | 64          | 64        | 32
+Optimizer          | sgd         | sgd       | sgd
+
+Configuration used for each model.
+
+Model       | variable_update        | local_parameter_device | cross_replica_sync
+----------- | ---------------------- | ---------------------- | ------------------
+InceptionV3 | distributed_replicated | n/a                    | True
+ResNet-50   | distributed_replicated | n/a                    | True
+ResNet-152  | distributed_replicated | n/a                    | True
+
+To simplify server setup, EC2 instances (p2.8xlarge) running worker servers also
+ran parameter servers. Equal numbers of parameter servers and worker servers were
+used with the following exceptions:
+
+*   InceptionV3: 8 instances / 6 parameter servers
+*   ResNet-50: (batch size 32) 8 instances / 4 parameter servers
+*   ResNet-152: 8 instances / 4 parameter servers
+
+### Results
+
+<div style="width:95%; margin:auto; margin-bottom:10px; margin-top:20px;">
+  <img style="width:80%" src="../images/perf_summary_k80_aws_distributed.png">
+</div>
+
+<div style="width:95%; margin:auto; margin-bottom:10px; margin-top:20px;">
+  <img style="width:70%" src="../images/perf_aws_synth_k80_distributed_scaling.png">
+</div>
+
+**Training synthetic data**
+
+GPUs | InceptionV3 | ResNet-50 | ResNet-152
+---- | ----------- | --------- | ----------
+1    | 29.7        | 52.4      | 19.4
+8    | 229         | 378       | 146
+16   | 459         | 751       | 291
+32   | 902         | 1388      | 565
+64   | 1783        | 2744      | 981
+
+### Other Results
+
+<div style="width:95%; margin:auto; margin-bottom:10px; margin-top:20px;">
+  <img style="width:50%" src="../images/perf_aws_synth_k80_multi_server_batch32.png">
+</div>
+
+**Training synthetic data**
+
+GPUs | InceptionV3 (batch size 32) | ResNet-50 (batch size 32)
+---- | --------------------------- | -------------------------
+1    | 29.2                        | 48.4
+8    | 219                         | 333
+16   | 427                         | 667
+32   | 820                         | 1180
+64   | 1608                        | 2315
+
+## Methodology
+
+This
+[script](https://github.com/tensorflow/benchmarks/tree/master/scripts/tf_cnn_benchmarks)
+was run on the various platforms to generate the above results.
+@{$performance_models$High-Performance Models} details techniques in the script
+along with examples of how to execute the script.
+
+In order to create results that are as repeatable as possible, each test was run
+5 times and then the times were averaged together. GPUs are run in their default
+state on the given platform. For NVIDIA® Tesla® K80 this means leaving on [GPU
+Boost](https://devblogs.nvidia.com/parallelforall/increase-performance-gpu-boost-k80-autoboost/).
+For each test, 10 warmup steps are done and then the next 100 steps are
+averaged.
diff --git a/tensorflow/docs_src/performance/index.md b/tensorflow/docs_src/performance/index.md
index 0ff4d2ee0041ee142c65a9975b55c89387612a26..7c1cd152d372cdf0f4506b17b15cc8a816088bd7 100644
--- a/tensorflow/docs_src/performance/index.md
+++ b/tensorflow/docs_src/performance/index.md
@@ -2,11 +2,19 @@
 
 Performance is often a significant issue when training a machine learning
 model.  This section explains various ways to optimize performance.  Start
-your investigation with the following guide:
+your investigation with the @{$performance_guide$Performance Guide} and then go
+deeper with techniques detailed in @{$performance_models$High-Performance Models}:
 
-  * @{$performance_guide$Performance}, which contains a collection of best
+  * @{$performance_guide$Performance Guide}, which contains a collection of best
     practices for optimizing your TensorFlow code.
 
+  * @{$performance_models$High-Performance Models}, which contains a collection
+    of advanced techniques to build highly scalable models targeting different
+    system types and network topologies.
+
+  * @{$benchmarks$Benchmarks}, which contains a collection of benchmark
+    results.
+
 XLA (Accelerated Linear Algebra) is an experimental compiler for linear
 algebra that optimizes TensorFlow computations. The following guides explore
 XLA:
diff --git a/tensorflow/docs_src/performance/leftnav_files b/tensorflow/docs_src/performance/leftnav_files
index 0f30cc7fa5c8a6a5d2501b75dba0a86365ab5aae..d22847322084d584a4ddc713486109ede838fee8 100644
--- a/tensorflow/docs_src/performance/leftnav_files
+++ b/tensorflow/docs_src/performance/leftnav_files
@@ -1,4 +1,8 @@
 performance_guide.md
+performance_models.md
+benchmarks.md
+quantization.md
+>>>
 xla/index.md
 xla/broadcasting.md
 xla/developing_new_backend.md
@@ -6,4 +10,3 @@ xla/jit.md
 xla/operation_semantics.md
 xla/shapes.md
 xla/tfcompile.md
-quantization.md
diff --git a/tensorflow/docs_src/performance/performance_guide.md b/tensorflow/docs_src/performance/performance_guide.md
index 8a1bba883aeee93c7702c936c6130d51cc552457..07c5d3087f35e6a3dbe7369006d1a4d84517e9e4 100644
--- a/tensorflow/docs_src/performance/performance_guide.md
+++ b/tensorflow/docs_src/performance/performance_guide.md
@@ -1,8 +1,10 @@
-# Performance
+# Performance Guide
 
 This guide contains a collection of best practices for optimizing your
 TensorFlow code. The best practices apply to both new and experienced
-Tensorflow users.
+Tensorflow users.  As a complement to the best practices in this document, the
+@{$performance_models$High-Performance Models} document links to example code
+and details for creating models that scale on a variety of hardware.
 
 ## Best Practices
 While optimizing implementations of different types of models can be different,
@@ -73,7 +75,7 @@ Unless for a special circumstance or for example code, do not feed data
 into the session from Python variables, e.g. `dictionary`.
 
 ```python
-# This will result in poor performance.
+# Using feed_dict often results in suboptimal performance when using large inputs.
 sess.run(train_step, feed_dict={x: batch_xs, y_: batch_ys})
 ```
 
@@ -141,3 +143,4 @@ bn = tf.contrib.layers.batch_norm(
 The non-fused batch norm does computations using several individual Ops. Fused
 batch norm combines the individual operations into a single kernel, which runs
 faster.
+
diff --git a/tensorflow/docs_src/performance/performance_models.md b/tensorflow/docs_src/performance/performance_models.md
new file mode 100644
index 0000000000000000000000000000000000000000..f384a5111461585010f33a68922f6423f20a9fff
--- /dev/null
+++ b/tensorflow/docs_src/performance/performance_models.md
@@ -0,0 +1,422 @@
+# High-Performance Models
+
+This document and accompanying
+[scripts](https://github.com/tensorflow/benchmarks/tree/master/scripts/tf_cnn_benchmarks)
+detail how to build highly scalable models that target a variety of system types
+and network topologies. The techniques in this document utilize some low-level
+TensorFlow Python primitives. In the future, many of these techniques will be
+incorporated into high-level APIs.
+
+## Input Pipeline
+
+The @{$performance_guide$Performance Guide} explains how to identify possible
+input pipeline issues and best practices. We found that using @{tf.FIFOQueue}
+and @{tf.train.queue_runner} could not saturate multiple current generation GPUs
+when using large inputs and processing with higher samples per second, such
+as training ImageNet with [AlexNet](http://papers.nips.cc/paper/4824-imagenet-classification-with-deep-convolutional-neural-networks.pdf).
+This is due to the use of Python threads as its underlying implementation. The
+overhead of Python threads is too large.
+
+Another approach, which we have implemented in the
+[scripts](https://github.com/tensorflow/benchmarks/tree/master/scripts/tf_cnn_benchmarks),
+is to build an input pipeline using the native parallelism in TensorFlow. Our
+implementation is made up of 3 stages:
+
+*   I/O reads: Choose and read image files from disk.
+*   Image Processing: Decode image records into images, preprocess, and organize
+    into mini-batches.
+*   CPU-to-GPU Data Transfer: Transfer images from CPU to GPU.
+
+The dominant part of each stage is executed in parallel with the other stages
+using `data_flow_ops.StagingArea`. `StagingArea` is a queue-like operator
+similar to @{tf.FIFOQueue}. The difference is that `StagingArea` offers simpler
+functionality and can be executed on both CPU and GPU in parallel with other
+stages. Breaking the input pipeline into 3 stages that operate independently in
+parallel is scalable and takes full advantage of large multi-core environments.
+The rest of this section details the stages followed by details about using
+`data_flow_ops.StagingArea`.
+
+### Parallelize I/O Reads
+
+`data_flow_ops.RecordInput` is used to parallelize reading from disk. Given a
+list of input files representing TFRecords, `RecordInput` continuously reads
+records using background threads. The records are placed into its own large
+internal pool and when it has loaded at least half of its capacity, it produces
+output tensors.
+
+This op has its own internal threads that are dominated by I/O time that consume
+minimal CPU, which allows it to run smoothly in parallel with the rest of the
+model.
+
+### Parallelize Image Processing
+
+After images are read from `RecordInput` they are passed as tensors to the image
+processing pipeline. To make the image processing pipeline easier to explain,
+assume that the input pipeline is targeting 8 GPUs with a batch size of 256 (32
+per GPU).
+
+256 records are read and processed individually in parallel. This starts with
+256 independent `RecordInput` read ops in the graph. Each read op is followed by
+an identical set of ops for image preprocessing that are considered independent
+and executed in parallel. The image preprocessing ops include operations such as
+image decoding, distortion, and resizing.
+
+Once the images are through preprocessing, they are concatenated together into 8
+tensors each with a batch-size of 32. Rather than using @{tf.concat} for this
+purpose, which is implemented as a single op that waits for all the inputs to be
+ready before concatenating them together, @{tf.parallel_stack} is used.
+@{tf.parallel_stack} allocates an uninitialized tensor as an output, and each
+input tensor is written to its designated portion of the output tensor as soon
+as the input is available.
+
+When all the input tensors are finished, the output tensor is passed along in
+the graph. This effectively hides all the memory latency with the long tail of
+producing all the input tensors.
+
+### Parallelize CPU-to-GPU Data Transfer
+
+Continuing with the assumption that the target is 8 GPUs with a batch size of
+256 (32 per GPU). Once the input images are processed and concatenated together
+by the CPU, we have 8 tensors each with a batch-size of 32.
+
+TensorFlow enables tensors from one device to be used on any other device
+directly. TensorFlow inserts implicit copies to make the tensors available on
+any devices where they are used. The runtime schedules the copy between devices
+to run before the tensors are actually used. However, if the copy cannot finish
+in time, the computation that needs those tensors will stall and result in
+decreased performance.
+
+In this implementation, `data_flow_ops.StagingArea` is used to explicitly
+schedule the copy in parallel. The end result is that when computation starts on
+the GPU, all the tensors are already available.
+
+### Software Pipelining
+
+With all the stages capable of being driven by different processors,
+`data_flow_ops.StagingArea` is used between them so they run in parallel.
+`StagingArea` is a queue-like operator similar to @{tf.FIFOQueue} that offers
+simpler functionalities that can be executed on both CPU and GPU.
+
+Before the model starts running all the stages, the input pipeline stages are
+warmed up to prime the staging buffers in between with one set of data.
+During each run step, one set of data is read from the staging buffers at
+the beginning of each stage, and one set is pushed at the end.
+
+For example: if there are three stages: A, B and C. There are two staging areas
+in between: S1 and S2. During the warm up, we run:
+
+```
+Warm up:
+Step 1: A0
+Step 2: A1  B0
+
+Actual execution:
+Step 3: A2  B1  C0
+Step 4: A3  B2  C1
+Step 5: A4  B3  C2
+```
+
+After the warm up, S1 and S2 each have one set of data in them. For each step of
+the actual execution, one set of data is consumed from each staging area, and
+one set is added to each.
+
+Benefits of using this scheme:
+
+*   All stages are non-blocking, since the staging areas always have one set of
+    data after the warm up.
+*   Each stage can run in parallel since they can all start immediately.
+*   The staging buffers have a fixed memory overhead. They will have at most one
+    extra set of data.
+*   Only a single`session.run()` call is needed to run all stages of the step,
+    which makes profiling and debugging much easier.
+
+## Best Practices in Building High-Performance Models
+
+Collected below are a couple of additional best practices that can improve
+performance and increase the flexiblity of models.
+
+### Build the model with both NHWC and NCHW
+
+Most TensorFlow operations used by a CNN support both NHWC and NCHW data format.
+On GPU, NCHW is faster. But on CPU, NHWC is sometimes faster.
+
+Building a model to support both data formats keeps the model flexible and
+capable of operating optimally regardless of platform. Most TensorFlow
+operations used by a CNN support both NHWC and NCHW data formats. The benchmark
+script was written to support both NCHW and NHWC. NCHW should always be used
+when training with GPUs. NHWC is sometimes faster on CPU. A flexible model can
+be trained on GPUs using NCHW with inference done on CPU using NHWC with the
+weights obtained from training.
+
+### Use Fused Batch-Normalization
+
+The default batch-normalization in TensorFlow is implemented as composite
+operations. This is very general, but often leads to suboptimal performance. An
+alternative is to use fused batch-normalization which often has much better
+performance on GPU. Below is an example of using @{tf.contrib.layers.batch_norm}
+to implement fused batch-normalization.
+
+```python
+bn = tf.contrib.layers.batch_norm(
+          input_layer, fused=True, data_format='NCHW'
+          scope=scope)
+```
+
+## Variable Distribution and Gradient Aggregation
+
+During training, training variable values are updated using aggregated gradients
+and deltas. In the benchmark script, we demonstrate that with the flexible and
+general-purpose TensorFlow primitives, a diverse range of high-performance
+distribution and aggregation schemes can be built.
+
+Three examples of variable distribution and aggregation were included in the
+script:
+
+*   `parameter_server` where each replica of the training model reads the
+    variables from a parameter server and updates the variable independently.
+    When each model needs the variables, they are copied over through the
+    standard implicit copies added by the TensorFlow runtime. The example
+    [script](https://github.com/tensorflow/benchmarks/tree/master/scripts/tf_cnn_benchmarks)
+    illustrates using this method for local training, distributed synchronous
+    training, and distributed asynchronous training.
+*   `replicated` places an identical copy of each training variable on each
+    GPU. The forward and backward computation can start immediately as the
+    variable data is immediately available. Gradients are accumulated across all
+    GPUs, and the aggregated total is applied to each GPU's copy of the
+    variables to keep them in sync.
+*   `distributed_replicated` places an identical copy of the training parameters
+    on each GPU along with a master copy on the parameter servers. The forward
+    and backward computation can start immediately as the variable data is
+    immediately available. Gradients are accumulated across all GPUs on each
+    server and then the per-server aggregated gradients are applied to the
+    master copy. After all workers do this, each worker updates its copy of the
+    variable from the master copy.
+
+Below are additional details about each approach.
+
+### Parameter Server Variables
+
+The most common way trainable variables are managed in TensorFlow models is
+parameter server mode.
+
+In a distributed system, each worker process runs the same model, and parameter
+server processes own the master copies of the variables. When a worker needs a
+variable from a parameter server, it refers to it directly. The TensorFlow
+runtime adds implicit copies to the graph to make the variable value available
+on the computation device that needs it. When a gradient is computed on a
+worker, it is sent to the parameter server that owns the particular variable,
+and the corresponding optimizer is used to update the variable.
+
+There are some techniques to improve throughput:
+
+*   The variables are spread among parameter servers based on their size, for
+    load balancing.
+*   When each worker has multiple GPUs, gradients are accumulated across the
+    GPUs and a single aggregated gradient is sent to the parameter server. This
+    reduces the network bandwidth and the amount of work done by the parameter
+    servers.
+
+For coordinating between workers, a very common mode is async updates, where
+each worker updates the master copy of the variables without synchronizing with
+other workers. In our model, we demonstrate that it is fairly easy to introduce
+synchronization across workers so updates for all workers are finished in one
+step before the next step can start.
+
+The parameter server method can also be used for local training, In this case,
+instead of spreading the master copies of variables across parameters servers,
+they are either on the CPU or spread across the available GPUs.
+
+Due to the simple nature of this setup, this architecture has gained a lot of
+popularity within the community.
+
+This mode can be used in the script by passing
+`--variable_update=parameter_server`.
+
+<div style="width:100%; margin:auto; margin-bottom:10px; margin-top:20px;">
+  <img style="width:100%" alt="parameter_server mode in distributed training"
+   src="../images/perf_parameter_server_mode_doc.png">
+</div>
+
+### Replicated Variables
+
+In this design, each GPU on the server has its own copy of each variable. The
+values are kept in sync across GPUs by applying the fully aggregated gradient to
+each GPU's copy of the variable.
+
+The variables and data are available at the start of training, so the forward
+pass of training can start immediately. Gradients are aggregated across the
+devices and the fully aggregated gradient is then applied to each local copy.
+
+Gradient aggregation across the server can be done in different ways:
+
+*   Using standard TensorFlow operations to accumulate the total on a single
+    device (CPU or GPU) and then copy it back to all GPUs.
+*   Using NVIDIA® NCCL, described below in the NCCL section.
+
+This mode can be used in the script by passing `--variable_update=replicated`.
+
+### Replicated Variables in Distributed Training
+
+The replicated method for variables can be extended to distributed training. One
+way to do this like the replicated mode: aggregate the gradients fully across
+the cluster and apply them to each local copy of the variable. This may be shown
+in a future version of this scripts; the scripts do present a different
+variation, described here.
+
+In this mode, in addition to each GPU's copy of the variables, a master copy is
+stored on the parameter servers. As with the replicated mode, training can start
+immediately using the local copies of the variables.
+
+As the gradients of the weights become available, they are sent back to the
+parameter servers and all local copies are updated:
+
+1.  All the gradients from the GPU on the same worker are aggregated together.
+2.  Aggregated gradients from each worker are sent to the parameter server that
+    owns the variable, where the specified optimizer is used to update the
+    master copy of the variable.
+3.  Each worker updates its local copy of the variable from the master. In the
+    example model, this is done with a cross-replica barrier that waits for all
+    the workers to finish updating the variables, and fetches the new variable
+    only after the barrier has been released by all replicas. Once the copy
+    finishes for all variables, this marks the end of a training step, and a new
+    step can start.
+
+Although this sounds similar to the standard use of parameter servers, the
+performance is often better in many cases. This is largely due to the fact the
+computation can happen without any delay, and much of the copy latency of early
+gradients can be hidden by later computation layers.
+
+This mode can be used in the script by passing
+`--variable_update=distributed_replicated`.
+
+
+<div style="width:100%; margin:auto; margin-bottom:10px; margin-top:20px;">
+  <img style="width:100%" alt="distributed_replicated mode"
+   src="../images/perf_distributed_replicated_mode_doc.png">
+</div>
+
+#### NCCL
+
+In order to broadcast variables and aggregate gradients across different GPUs
+within the same host machine, we can use the default TensorFlow implicit copy
+mechanism.
+
+However, we can instead use the optional NCCL (@{tf.contrib.nccl}) support. NCCL
+is an NVIDIA® library that can efficiently broadcast and aggregate data across
+different GPUs. It schedules a cooperating kernel on each GPU that knows how to
+best utilize the underlying hardware topology; this kernel uses a single SM of
+the GPU.
+
+In our experiment, we demonstrate that although NCCL often leads to much faster
+data aggregation by itself, it doesn't necessarily lead to faster training. Our
+hypothesis is that the implicit copies are essentially free since they go to the
+copy engine on GPU, as long as its latency can be hidden by the main computation
+itself. Although NCCL can transfer data faster, it takes one SM away, and adds
+more pressure to the underlying L2 cache. Our results show that for 8-GPUs, NCCL
+often leads to better performance. However, for fewer GPUs, the implicit copies
+often perform better.
+
+#### Staged Variables
+
+We further introduce a staged-variable mode where we use staging areas for both
+the variable reads, and their updates. Similar to software pipelining of the
+input pipeline, this can hide the data copy latency. If the computation time
+takes longer than the copy and aggregation, the copy itself becomes essentially
+free.
+
+The downside is that all the weights read are from the previous training step.
+So it is a different algorithm from SGD. But it is possible to improve its
+convergence by adjusting learning rate and other hyperparameters.
+
+## Executing the script
+
+This section lists the core command line arguments and a few basic examples for
+executing the main script
+([tf_cnn_benchmarks.py](https://github.com/tensorflow/benchmarks/tree/master/scripts/tf_cnn_benchmarks/tf_cnn_benchmarks.py)).
+
+> Note: `tf_cnn_benchmarks.py` uses the config `force_gpu_compatible`,
+> which was introduced after TensorFlow 1.1. Until TensorFlow 1.2 is released
+> building from source is advised.
+
+#### Base command line arguments
+
+*   **`model`**: Model to use, e.g. `resnet50`, `inception3`, `vgg16`, and
+    `alexnet`.
+*   **`num_gpus`**: Number of GPUs to use.
+*   **`data_dir`**: Path to data to process. If not set, synthetic data is used.
+    To use Imagenet data use these
+    [instructions(https://github.com/tensorflow/models/tree/master/inception#getting-started)
+    as a starting point.
+*   **`batch_size`**: Batch size for each GPU.
+*   **`variable_update`**: The method for managing variables: `parameter_server`
+    ,`replicated`, `distributed_replicated`, `independent`
+*   **`local_parameter_device`**: Device to use as parameter server: `cpu` or
+    `gpu`.
+
+#### Single instance examples
+
+```bash
+# VGG16 training ImageNet with 8 GPUs using arguments that optimize for
+# Google Compute Engine.
+python tf_cnn_benchmarks.py --local_parameter_device=cpu --num_gpus=8 \
+--batch_size=32 --model=vgg16 --data_dir=/home/ubuntu/imagenet/train \
+--variable_update=parameter_server --nodistortions
+
+# VGG16 training synthetic ImageNet data with 8 GPUs using arguments that
+# optimize for the NVIDIA DGX-1.
+python tf_cnn_benchmarks.py --local_parameter_device=gpu --num_gpus=8 \
+--batch_size=64 --model=vgg16 --variable_update=replicated --use_nccl=True
+
+# VGG16 training ImageNet data with 8 GPUs using arguments that optimize for
+# Amazon EC2.
+python tf_cnn_benchmarks.py --local_parameter_device=gpu --num_gpus=8 \
+--batch_size=64 --model=vgg16 --variable_update=parameter_server
+
+# ResNet-50 training ImageNet data with 8 GPUs using arguments that optimize for
+# Amazon EC2.
+python tf_cnn_benchmarks.py --local_parameter_device=gpu --num_gpus=8 \
+--batch_size=64 --model=resnet50 --variable_update=replicated --use_nccl=False
+
+```
+
+#### Distributed command line arguments
+
+*   **`ps_hosts`**: Comma separated list of hosts to use as parameter servers
+    in the format of ```<host>:port```, e.g. ```10.0.0.2:50000```.
+*   **`worker_hosts`**: Comma separated list of hosts to use as workers in the
+    format of ```<host>:port```, e.g. ```10.0.0.2:50001```.
+*   **`task_index`**: Index of the host in the list of `ps_hosts` or
+    `worker_hosts` being started.
+*   **`job_name`**: Type of job, e.g `ps` or `worker`
+
+#### Distributed examples
+
+Below is an example of training ResNet-50 on 2 hosts: host_0 (10.0.0.1) and
+host_1 (10.0.0.2). The example uses synthetic data. To use real data pass the
+`--data_dir` argument.
+
+```bash
+# Run the following commands on host_0 (10.0.0.1):
+python tf_cnn_benchmarks.py --local_parameter_device=gpu --num_gpus=8 \
+--batch_size=64 --model=resnet50 --variable_update=distributed_replicated \
+--job_name=worker --ps_hosts=10.0.0.1:50000,10.0.0.2:50000 \
+--worker_hosts=10.0.0.1:50001,10.0.0.2:50001 --task_index=0
+
+python tf_cnn_benchmarks.py --local_parameter_device=gpu --num_gpus=8 \
+--batch_size=64 --model=resnet50 --variable_update=distributed_replicated \
+--job_name=ps --ps_hosts=10.0.0.1:50000,10.0.0.2:50000 \
+--worker_hosts=10.0.0.1:50001,10.0.0.2:50001 --task_index=0
+
+
+# Run the following commands on host_1 (10.0.0.2):
+python tf_cnn_benchmarks.py --local_parameter_device=gpu --num_gpus=8 \
+--batch_size=64 --model=resnet50 --variable_update=distributed_replicated \
+--job_name=worker --ps_hosts=10.0.0.1:50000,10.0.0.2:50000 \
+--worker_hosts=10.0.0.1:50001,10.0.0.2:50001 --task_index=1
+
+python tf_cnn_benchmarks.py --local_parameter_device=gpu --num_gpus=8 \
+--batch_size=64 --model=resnet50 --variable_update=distributed_replicated \
+--job_name=ps --ps_hosts=10.0.0.1:50000,10.0.0.2:50000 \
+--worker_hosts=10.0.0.1:50001,10.0.0.2:50001 --task_index=1
+
+```
diff --git a/tensorflow/docs_src/performance/quantization.md b/tensorflow/docs_src/performance/quantization.md
index ad23bab443d30d638480fa6a9be85e9b7a3837a7..49c25027fc9502f9ad37819930817cd2ecf3cd65 100644
--- a/tensorflow/docs_src/performance/quantization.md
+++ b/tensorflow/docs_src/performance/quantization.md
@@ -143,13 +143,13 @@ conversion functions before and after to move the data between float and
 eight-bit. Below is an example of what they look like. First here's the original
 Relu operation, with float inputs and outputs:
 
-![Relu Diagram](https://www.tensorflow.org/../images/quantization0.png)
+![Relu Diagram](https://www.tensorflow.org/images/quantization0.png)
 
 Then, this is the equivalent converted subgraph, still with float inputs and
 outputs, but with internal conversions so the calculations are done in eight
 bit.
 
-![Converted Diagram](https://www.tensorflow.org/../images/quantization1.png)
+![Converted Diagram](https://www.tensorflow.org/images/quantization1.png)
 
 The min and max operations actually look at the values in the input float
 tensor, and then feeds them into the Dequantize operation that converts the
@@ -162,7 +162,7 @@ operations that all have float equivalents, then there will be a lot of adjacent
 Dequantize/Quantize ops. This stage spots that pattern, recognizes that they
 cancel each other out, and removes them, like this:
 
-![Stripping Diagram](https://www.tensorflow.org/../images/quantization2.png)
+![Stripping Diagram](https://www.tensorflow.org/images/quantization2.png)
 
 Applied on a large scale to models where all of the operations have quantized
 equivalents, this gives a graph where all of the tensor calculations are done in
diff --git a/tensorflow/docs_src/performance/xla/index.md b/tensorflow/docs_src/performance/xla/index.md
index 9c23e79845d3e58b57edeb11a92a62b0e136a83a..d2c184332797c3ec5d17833813fdba97d930c3f1 100644
--- a/tensorflow/docs_src/performance/xla/index.md
+++ b/tensorflow/docs_src/performance/xla/index.md
@@ -62,7 +62,7 @@ well as the NVIDIA GPU backend are in the TensorFlow source tree.
 The following diagram shows the compilation process in XLA:
 
 <div style="width:95%; margin:auto; margin-bottom:10px; margin-top:20px;">
-  <img src="../../images/how-does-xla-work.png">
+  <img src="https://www.tensorflow.org/images/how-does-xla-work.png">
 </div>
 
 XLA comes with several optimizations and analyses that are target-independent,
diff --git a/tensorflow/docs_src/performance/xla/jit.md b/tensorflow/docs_src/performance/xla/jit.md
index 4d2a643b7f837e31d485cb72806d8d80429d9ad2..d4dc3e57c8fb5ec2a979b6ba7ebe2a3b6c3a5f94 100644
--- a/tensorflow/docs_src/performance/xla/jit.md
+++ b/tensorflow/docs_src/performance/xla/jit.md
@@ -124,7 +124,7 @@ open the timeline file created when the script finishes: `timeline.ctf.json`.
 The rendered timeline should look similar to the picture below with multiple
 green boxes labeled `MatMul`, possibly across multiple CPUs.
 <div style="width:95%; margin:auto; margin-bottom:10px; margin-top:20px;">
-  <img style="width:100%" src="../../images/jit_timeline_gpu.png">
+  <img style="width:100%" src="https://www.tensorflow.org/images/jit_timeline_gpu.png">
 </div>
 
 ### Step #3 Run with XLA
@@ -139,7 +139,7 @@ TF_XLA_FLAGS=--xla_generate_hlo_graph=.* python mnist_softmax_xla.py
 Open the timeline file created (`timeline.ctf.json`).  The rendered timeline
 should look similar to the picture below with one long bar labeled `_XlaLaunch`.
 <div style="width:95%; margin:auto; margin-bottom:10px; margin-top:20px;">
-  <img style="width:100%" src="../../images/jit_timeline_gpu_xla.png">
+  <img style="width:100%" src="https://www.tensorflow.org/images/jit_timeline_gpu_xla.png">
 </div>
 
 To understand what is happening in `_XlaLaunch`, look at the console output for
@@ -165,5 +165,5 @@ dot -Tpng hlo_graph_80.dot -o hlo_graph_80.png
 
 The result will look like the following:
 <div style="width:95%; margin:auto; margin-bottom:10px; margin-top:20px;">
-  <img style="width:100%" src="../../images/jit_gpu_xla_graph.png">
+  <img style="width:100%" src="https://www.tensorflow.org/images/jit_gpu_xla_graph.png">
 </div>
diff --git a/tensorflow/docs_src/performance/xla/operation_semantics.md b/tensorflow/docs_src/performance/xla/operation_semantics.md
index d5eeb5d7d5f307f8eaa7d5c08f5c11043f068c00..424c994e72d0c44966f164a798f8ebdddf86999a 100644
--- a/tensorflow/docs_src/performance/xla/operation_semantics.md
+++ b/tensorflow/docs_src/performance/xla/operation_semantics.md
@@ -178,7 +178,7 @@ Concat({a, b}, 0)
 
 Diagram:
 <div style="width:95%; margin:auto; margin-bottom:10px; margin-top:20px;">
-  <img style="width:100%" src="../../images/ops_concatenate.png">
+  <img style="width:100%" src="https://www.tensorflow.org/images/ops_concatenate.png">
 </div>
 
 ## ConvertElementType
@@ -707,7 +707,7 @@ are all 0. Figure below shows examples of different `edge_padding` and
 `interior_padding` values for a two dimensional array.
 
 <div style="width:95%; margin:auto; margin-bottom:10px; margin-top:20px;">
-  <img style="width:100%" src="../../images/ops_pad.png">
+  <img style="width:100%" src="https://www.tensorflow.org/images/ops_pad.png">
 </div>
 
 ## Reduce
@@ -781,13 +781,13 @@ Here's an example of reducing a 2D array (matrix). The shape has rank 2,
 dimension 0 of size 2 and dimension 1 of size 3:
 
 <div style="width:95%; margin:auto; margin-bottom:10px; margin-top:20px;">
-  <img style="width:35%" src="../../images/ops_2d_matrix.png">
+  <img style="width:35%" src="https://www.tensorflow.org/images/ops_2d_matrix.png">
 </div>
 
 Results of reducing dimensions 0 or 1 with an "add" function:
 
 <div style="width:95%; margin:auto; margin-bottom:10px; margin-top:20px;">
-  <img style="width:35%" src="../../images/ops_reduce_from_2d_matrix.png">
+  <img style="width:35%" src="https://www.tensorflow.org/images/ops_reduce_from_2d_matrix.png">
 </div>
 
 Note that both reduction results are 1D arrays. The diagram shows one as column
@@ -798,7 +798,7 @@ size 4, dimension 1 of size 2 and dimension 2 of size 3. For simplicity, the
 values 1 to 6 are replicated across dimension 0.
 
 <div style="width:95%; margin:auto; margin-bottom:10px; margin-top:20px;">
-  <img style="width:35%" src="../../images/ops_reduce_from_3d_matrix.png">
+  <img style="width:35%" src="https://www.tensorflow.org/images/ops_reduce_from_3d_matrix.png">
 </div>
 
 Similarly to the 2D example, we can reduce just one dimension. If we reduce
@@ -890,7 +890,7 @@ builder.ReduceWindow(
 ```
 
 <div style="width:95%; margin:auto; margin-bottom:10px; margin-top:20px;">
-  <img style="width:35%" src="../../images/ops_reduce_window.png">
+  <img style="width:35%" src="https://www.tensorflow.org/images/ops_reduce_window.png">
 </div>
 
 Stride of 1 in a dimension specifies that the position of a window in the
@@ -902,7 +902,7 @@ are the same as though the input came in with the dimensions it has after
 padding.
 
 <div style="width:95%; margin:auto; margin-bottom:10px; margin-top:20px;">
-  <img style="width:75%" src="../../images/ops_reduce_window_stride.png">
+  <img style="width:75%" src="https://www.tensorflow.org/images/ops_reduce_window_stride.png">
 </div>
 
 The evaluation order of the reduction function is arbitrary and may be
@@ -1144,7 +1144,7 @@ addition `scatter` function produces the output element of value 8 (2 + 6).
 
 <div style="width:95%; margin:auto; margin-bottom:10px; margin-top:20px;">
   <img style="width:100%"
-    src="../../images/ops_scatter_to_selected_window_element.png">
+    src="https://www.tensorflow.org/images/ops_scatter_to_selected_window_element.png">
 </div>
 
 The evaluation order of the `scatter` function is arbitrary and may be
@@ -1482,5 +1482,5 @@ while (result(0) < 1000) {
 ```
 
 <div style="width:95%; margin:auto; margin-bottom:10px; margin-top:20px;">
-  <img style="width:100%" src="../../images/ops_while.png">
+  <img style="width:100%" src="https://www.tensorflow.org/images/ops_while.png">
 </div>
diff --git a/tensorflow/docs_src/programmers_guide/debugger.md b/tensorflow/docs_src/programmers_guide/debugger.md
index 7801fadb47554ce14a4726b97138e67be288f25e..78819969b71a88e772813a73f4d1907fccbddec9 100644
--- a/tensorflow/docs_src/programmers_guide/debugger.md
+++ b/tensorflow/docs_src/programmers_guide/debugger.md
@@ -24,7 +24,7 @@ This code trains a simple NN for MNIST digit image recognition. Notice that the
 accuracy increases slightly after the first training step, but then gets stuck
 at a low (near-chance) level:
 
-![debug_mnist training fails](../images/tfdbg_screenshot_mnist_symptom.png)
+![debug_mnist training fails](https://www.tensorflow.org/images/tfdbg_screenshot_mnist_symptom.png)
 
 Scratching your head, you suspect that certain nodes in the training graph
 generated bad numeric values such as `inf`s and `nan`s. The computation-graph
@@ -89,7 +89,7 @@ The debug wrapper session will prompt you when it is about to execute the first
 `run()` call, with information regarding the fetched tensor and feed
 dictionaries displayed on the screen.
 
-![tfdbg run-start UI](../images/tfdbg_screenshot_run_start.png)
+![tfdbg run-start UI](https://www.tensorflow.org/images/tfdbg_screenshot_run_start.png)
 
 This is what we refer to as the *run-start UI*. If the screen size is
 too small to display the content of the message in its entirety, you can resize
@@ -108,7 +108,7 @@ intermediate tensors from the run. (These tensors can also be obtained by
 running the command `lt` after you executed `run`.) This is called the
 **run-end UI**:
 
-![tfdbg run-end UI: accuracy](../images/tfdbg_screenshot_run_end_accuracy.png)
+![tfdbg run-end UI: accuracy](https://www.tensorflow.org/images/tfdbg_screenshot_run_end_accuracy.png)
 
 ### tfdbg CLI Frequently-Used Commands
 
@@ -181,7 +181,7 @@ screen with a red-colored title line indicating **tfdbg** stopped immediately
 after a `run()` call generated intermediate tensors that passed the specified
 filter `has_inf_or_nan`:
 
-![tfdbg run-end UI: infs and nans](../images/tfdbg_screenshot_run_end_inf_nan.png)
+![tfdbg run-end UI: infs and nans](https://www.tensorflow.org/images/tfdbg_screenshot_run_end_inf_nan.png)
 
 As the screen display indicates, the `has_inf_or_nan` filter is first passed
 during the fourth `run()` call: an [Adam optimizer](https://arxiv.org/abs/1412.6980)
@@ -220,7 +220,7 @@ item on the top or entering the equivalent command:
 tfdbg> ni cross_entropy/Log
 ```
 
-![tfdbg run-end UI: infs and nans](../images/tfdbg_screenshot_run_end_node_info.png)
+![tfdbg run-end UI: infs and nans](https://www.tensorflow.org/images/tfdbg_screenshot_run_end_node_info.png)
 
 You can see that this node has the op type `Log`
 and that its input is the node `softmax/Softmax`. Run the following command to
@@ -263,7 +263,7 @@ simply click the underlined line numbers in the stack trace output of the
 `ni -t <op_name>` commands, or use the `ps` (or `print_source`) command such as:
 `ps /path/to/source.py`. See the screenshot below for an example of `ps` output:
 
-![tfdbg run-end UI: annotated Python source file](../images/tfdbg_screenshot_run_end_annotated_source.png)
+![tfdbg run-end UI: annotated Python source file](https://www.tensorflow.org/images/tfdbg_screenshot_run_end_annotated_source.png)
 
 Apply a value clipping on the input to @{tf.log}
 to resolve this problem:
diff --git a/tensorflow/docs_src/programmers_guide/index.md b/tensorflow/docs_src/programmers_guide/index.md
index 309b39451fd11d7185359209a41f0a9dbb8efdb0..acdca2bad4f4765173a239b3e10ed9a700e4b637 100644
--- a/tensorflow/docs_src/programmers_guide/index.md
+++ b/tensorflow/docs_src/programmers_guide/index.md
@@ -39,6 +39,11 @@ trained graph.  The following guide details `MetaGraph` objects:
 
   * @{$meta_graph$Exporting and Importing a MetaGraph}.
 
+`SavedModel` is the universal serialization format for Tensorflow models. TensorFlow provides SavedModel CLI (command-line interface) as a tool to inspect and execute a MetaGraph in a SavedModel. The detailed usages and examples are
+documented in the following guide:
+
+  * @{$saved_model_cli$SavedModel CLI (Command-Line Interface)}.
+
 To learn about the TensorFlow versioning scheme, consult the following two
 guides:
 
diff --git a/tensorflow/docs_src/programmers_guide/leftnav_files b/tensorflow/docs_src/programmers_guide/leftnav_files
index d397917219097084c7ab23070986a9769ae37a4b..322e11cbd697ab427bc4857647234e2a9014ae6a 100644
--- a/tensorflow/docs_src/programmers_guide/leftnav_files
+++ b/tensorflow/docs_src/programmers_guide/leftnav_files
@@ -8,6 +8,7 @@ supervisor.md
 debugger.md
 tfdbg-tflearn.md
 meta_graph.md
+saved_model_cli.md
 version_semantics.md
 data_versions.md
 faq.md
diff --git a/tensorflow/docs_src/programmers_guide/reading_data.md b/tensorflow/docs_src/programmers_guide/reading_data.md
index 7c3a37417d7b220653167472aa250f2273bf2d5e..088724337e485d65b5263fcc8d89134431c389d8 100644
--- a/tensorflow/docs_src/programmers_guide/reading_data.md
+++ b/tensorflow/docs_src/programmers_guide/reading_data.md
@@ -133,7 +133,7 @@ uses a file format where each record is represented using a fixed number of
 bytes: 1 byte for the label followed by 3072 bytes of image data. Once you have
 a uint8 tensor, standard operations can slice out each piece and reformat as
 needed. For CIFAR-10, you can see how to do the reading and decoding in
-[`tensorflow_models/tutorials/image/cifar10/cifar10_input.py`](https://www.tensorflow.org/code/tensorflow_models/tutorials/image/cifar10/cifar10_input.py)
+[`tensorflow_models/tutorials/image/cifar10/cifar10_input.py`](https://github.com/tensorflow/models/tree/master/tutorials/image/cifar10/cifar10_input.py)
 and described in
 @{$deep_cnn#prepare-the-data$this tutorial}.
 
@@ -170,7 +170,7 @@ You can then do any preprocessing of these examples you want. This would be any
 processing that doesn't depend on trainable parameters. Examples include
 normalization of your data, picking a random slice, adding noise or distortions,
 etc.  See
-[`tensorflow_models/tutorials/image/cifar10/cifar10_input.py`](https://www.tensorflow.org/code/tensorflow_models/tutorials/image/cifar10/cifar10_input.py)
+[`tensorflow_models/tutorials/image/cifar10/cifar10_input.py`](https://github.com/tensorflow/models/tree/master/tutorials/image/cifar10/cifar10_input.py)
 for an example.
 
 ### Batching
@@ -309,7 +309,7 @@ operations, so that our training loop can dequeue examples from the example
 queue.
 
 <div style="width:70%; margin-left:12%; margin-bottom:10px; margin-top:20px;">
-<img style="width:100%" src="../images/AnimatedFileQueues.gif">
+<img style="width:100%" src="https://www.tensorflow.org/images/AnimatedFileQueues.gif">
 </div>
 
 The helpers in `tf.train` that create these queues and enqueuing operations add
diff --git a/tensorflow/docs_src/programmers_guide/saved_model_cli.md b/tensorflow/docs_src/programmers_guide/saved_model_cli.md
new file mode 100644
index 0000000000000000000000000000000000000000..eb9e60e42ed4346fe78e16eabf8401c34e87c17e
--- /dev/null
+++ b/tensorflow/docs_src/programmers_guide/saved_model_cli.md
@@ -0,0 +1,251 @@
+# SavedModel CLI (Command-Line Interface)
+
+[`SavedModel`](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/python/saved_model/README.md)
+is a universal serialization format for Tensorflow. It provides a
+language-neutral format to save machine-learned models and enables higher-level
+systems and tools to produce, consume and transform TensorFlow models.
+
+We provide SavedModel CLI(command-line interface) as a tool to inspect and
+execute a [`MetaGraph`](https://www.tensorflow.org/programmers_guide/meta_graph)
+in a SavedModel. You can inspect for example, what
+[`SignatureDefs`](https://github.com/tensorflow/serving/blob/master/tensorflow_serving/g3doc/signature_defs.md),
+including their input and output tensors, are in the model without writing any
+code. This can be useful in situations such as when you want to quickly check
+your input dtype and shape match with the model. Moreover, if you want to test
+out the model, it also allows you to do a sanity check by passing in sample
+inputs in the format of for example, python expressions, and fetch the outputs
+simply through command line.
+
+## Get SavedModel CLI
+
+If TensorFlow is installed on your system through pip, the `saved_model_cli`
+binary can be invoked directly from command line.
+
+To build the binary from source, run the following command:
+
+```
+$bazel build tensorflow/python/tools:saved_model_cli
+```
+
+## Commands
+
+SavedModel CLI allows users to both show and run computations on a
+[`MetaGraphDef`](https://www.tensorflow.org/code/tensorflow/core/protobuf/meta_graph.proto)
+in a SavedModel. These are done through `show` and `run` commands. We will
+explain the usages of both commands with detailed examples. SavedModel CLI will
+also display this information with `-h` option.
+
+### `show` command
+
+A SavedModel contains one or more MetaGraphs, identified by their tag-sets. Each
+MetaGraph contains both a TensorFlow GraphDef as well as associated metadata
+necessary for running computation in a graph. In order to serve a model, you
+might wonder what kind of SignatureDefs are in each model, and what are their
+inputs and outputs etc. The `show` command let you examine the content of the
+SavedModel in a hierarchical order.
+
+```
+usage: saved_model_cli show [-h] --dir DIR [--all]
+[--tag_set TAG_SET] [--signature_def SIGNATURE_DEF_KEY]
+```
+
+#### Examples
+
+To show all available MetaGraphDef tag-sets in the SavedModel:
+
+```
+$saved_model_cli show --dir /tmp/saved_model_dir
+The given SavedModel contains the following tag-sets:
+serve
+serve, gpu
+```
+
+To show all available SignatureDef keys in a MetaGraphDef:
+
+```
+$saved_model_cli show --dir /tmp/saved_model_dir --tag_set serve
+The given SavedModel MetaGraphDef contains SignatureDefs with the following keys:
+SignatureDef key: "classify_x2_to_y3"
+SignatureDef key: "classify_x_to_y"
+SignatureDef key: "regress_x2_to_y3"
+SignatureDef key: "regress_x_to_y"
+SignatureDef key: "regress_x_to_y2"
+SignatureDef key: "serving_default"
+```
+
+For a MetaGraphDef with multiple tags in the tag-set, all tags must be passed
+in, separated by ',':
+
+```
+$saved_model_cli show --dir /tmp/saved_model_dir --tag_set serve,gpu
+```
+
+To show all inputs and outputs TensorInfo for a specific SignatureDef, pass in
+the SignatureDef key to `signature_def` option. This is very useful when you
+want to know the tensor key value, dtype and shape of the input tensors for
+executing the computation graph later.
+
+```
+$saved_model_cli show --dir \
+/tmp/saved_model_dir --tag_set serve --signature_def serving_default
+The given SavedModel SignatureDef contains the following input(s):
+inputs['x'] tensor_info:
+    dtype: DT_FLOAT
+    shape: (-1, 1)
+    name: x:0
+The given SavedModel SignatureDef contains the following output(s):
+outputs['y'] tensor_info:
+    dtype: DT_FLOAT
+    shape: (-1, 1)
+    name: y:0
+Method name is: tensorflow/serving/predict
+```
+
+To show all available information in the SavedModel, use `--all` option:
+
+```
+$saved_model_cli show --dir /tmp/saved_model_dir --all
+MetaGraphDef with tag-set: 'serve' contains the following SignatureDefs:
+
+signature_def['classify_x2_to_y3']:
+The given SavedModel SignatureDef contains the following input(s):
+inputs['inputs'] tensor_info:
+    dtype: DT_FLOAT
+    shape: (-1, 1)
+    name: x2:0
+The given SavedModel SignatureDef contains the following output(s):
+outputs['scores'] tensor_info:
+    dtype: DT_FLOAT
+    shape: (-1, 1)
+    name: y3:0
+Method name is: tensorflow/serving/classify
+
+...
+
+signature_def['serving_default']:
+The given SavedModel SignatureDef contains the following input(s):
+inputs['x'] tensor_info:
+    dtype: DT_FLOAT
+    shape: (-1, 1)
+    name: x:0
+The given SavedModel SignatureDef contains the following output(s):
+outputs['y'] tensor_info:
+    dtype: DT_FLOAT
+    shape: (-1, 1)
+    name: y:0
+Method name is: tensorflow/serving/predict
+```
+
+### `run` command
+
+SavedModel CLI also allows you to run a graph computation by passing in inputs,
+displaying, and saving the outputs.
+
+```
+usage: saved_model_cli run [-h] --dir DIR --tag_set TAG_SET --signature_def
+                           SIGNATURE_DEF_KEY [--inputs INPUTS]
+                           [--input_exprs INPUT_EXPRS] [--outdir OUTDIR]
+                           [--overwrite] [--tf_debug]
+```
+
+Tensor keys are used to specify which input we are passing in the values for.
+There are two ways to pass inputs to the model. With '--inputs' option, you can
+pass in numpy ndarray by files. With '--input_exprs' option, you can use python
+expressions as inputs.
+
+#### Input By File
+
+To pass in inputs by files, use `--inputs` option in the format of
+`<input_key>=<filename>`, or `<input_key>=<filename>[<variable_name>]`. Each
+input is separated by semicolon. File specified by `filename` will be loaded
+using `numpy.load`. Inputs can be loaded from only `.npy`, `.npz` or pickle
+files. The `variable_name` key is optional depending on the input file type as
+descripted in more details below.
+
+When loading from a `.npy` file, which always contains a numpy ndarray, the
+content will be directly assigned to the specified input tensor. If a
+`variable_name` is specified, it will be ignored and a warning will be issued.
+
+When loading from a `.npz` zip file, user can specify which variable within the
+zip file to load for the input tensor key with `variable_name`. If nothing is
+specified, SavedModel CLI will check that only one file is included in the zip
+file and load it for the specified input tensor key.
+
+When loading from a pickle file, if no `variable_name` is specified in the
+square brackets, whatever that is inside the pickle file will be passed to the
+specified input tensor key. Else SavedModel CLI will assume a dictionary is
+stored in the pickle file and the value corresponding to the variable_name will
+be used.
+
+#### Input By Python Expression
+
+To pass in inputs by python expressions, use `--input_exprs` option. `numpy`
+module is available as `np`. For example, `input_key=np.ones((32, 32, 3))` or
+`input_key=[[1], [2], [3]]`. This can be useful for when you don't have data
+files lying around, but still want to sanity check the model with some simple
+inputs that match the dtype and shape of the model signature.
+
+#### Save Output
+
+By default, SavedModel CLI will print outputs to console. If a directory is
+passed to `--outdir` option, the outputs will be saved as npy files named after
+output tensor keys under the given directory. Use `--overwite` to overwrite
+existing output files.
+
+#### TensorFlow Debugger (tfdbg) Integration
+
+If `--tf_debug` option is set, SavedModel CLI will use TensorFlow Debugger
+(tfdbg) to watch the intermediate Tensors and runtime GraphDefs while running
+the SavedModel.
+
+#### Examples
+
+If we have a simple model that adds `x1` and `x2` to get output `y`, where all
+tensors are of shape `(-1, 1)`, and we have two `npz` files. File
+`/tmp/my_data1.npy` contains a numpy ndarray `[[1], [2], [3]]`, file
+`/tmp/my_data2.npy` contains another numpy ndarray `[[0.5], [0.5], [0.5]]`. Now
+let's run these two `npy` files through the model to get `y`:
+
+```
+$saved_model_cli run --dir /tmp/saved_model_dir --tag_set serve \
+--signature_def x1_x2_to_y --inputs x1=/tmp/my_data1.npy;x2=/tmp/my_data2.npy \
+--outdir /tmp/out
+Result for output key y:
+[[ 1.5]
+ [ 2.5]
+ [ 3.5]]
+```
+
+Similarly, we can run input tensors from `npz` file and pickle file, as well as
+overwrite the previous output file:
+
+```
+$saved_model_cli run --dir /tmp/saved_model_dir --tag_set serve \
+--signature_def x1_x2_to_y \
+--inputs x1=/tmp/my_data1.npz[x];x2=/tmp/my_data2.pkl --outdir /tmp/out \
+--overwrite
+Result for output key y:
+[[ 1.5]
+ [ 2.5]
+ [ 3.5]]
+```
+
+You can also use python expression instead of input file. Here we replace input
+`x2` with a python expression:
+
+```
+$saved_model_cli run --dir /tmp/saved_model_dir --tag_set serve \
+--signature_def x1_x2_to_y --inputs x1=/tmp/my_data1.npz[x] \
+--input_exprs 'x2=np.ones((3,1))'
+Result for output key y:
+[[ 2]
+ [ 3]
+ [ 4]]
+```
+
+To run model with TensorFlow Debugger on:
+
+```
+$saved_model_cli run --dir /tmp/saved_model_dir --tag_set serve \
+--signature_def serving_default --inputs x=/tmp/data.npz[x] --tf_debug
+```
diff --git a/tensorflow/docs_src/programmers_guide/supervisor.md b/tensorflow/docs_src/programmers_guide/supervisor.md
index 82ed1c2cf76679f2b3cc86807fba882941722e6e..55a090df5898673cec7812021b1feea9606d6376 100644
--- a/tensorflow/docs_src/programmers_guide/supervisor.md
+++ b/tensorflow/docs_src/programmers_guide/supervisor.md
@@ -362,8 +362,8 @@ following keyword arguments to the `Supervisor()` constructor:
    If not specified, the supervisor uses the first op in the
    `tf.GraphKeys.LOCAL_INIT_OP` collection.  If the collection is empty the
    supervisor adds an op to initialize all the tables and local variables in
-   the graph by calling `tf.initialize_all_tables()` and
-   `tf.initialize_all_local_variables()`.
+   the graph by calling `tf.tables_initializer()` and
+   `tf.local_variables_initializer()`.
 
    Pass `None` to not use a local init op.
 
diff --git a/tensorflow/docs_src/programmers_guide/threading_and_queues.md b/tensorflow/docs_src/programmers_guide/threading_and_queues.md
index 1999cf6941095b5cd658c49965dc384e7d58ff6b..835e8060466428ddbb82f9ef6d1b78c76a0c9890 100644
--- a/tensorflow/docs_src/programmers_guide/threading_and_queues.md
+++ b/tensorflow/docs_src/programmers_guide/threading_and_queues.md
@@ -14,7 +14,7 @@ that takes an item off the queue, adds one to that item, and puts it back on the
 end of the queue. Slowly, the numbers on the queue increase.
 
 <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
-<img style="width:100%" src="../images/IncremeterFifoQueue.gif">
+<img style="width:100%" src="https://www.tensorflow.org/images/IncremeterFifoQueue.gif">
 </div>
 
 `Enqueue`, `EnqueueMany`, and `Dequeue` are special nodes. They take a pointer
diff --git a/tensorflow/docs_src/programmers_guide/variable_scope.md b/tensorflow/docs_src/programmers_guide/variable_scope.md
index 5084acbab97b0d87ca3232b44b0e07e92e8a6ff4..f4d2b3f37b875f589e2de69d8681a09e90f99360 100644
--- a/tensorflow/docs_src/programmers_guide/variable_scope.md
+++ b/tensorflow/docs_src/programmers_guide/variable_scope.md
@@ -5,7 +5,7 @@ in the way described in the @{$variables$Variables HowTo}.
 But when building complex models you often need to share large sets of
 variables and you might want to initialize all of them in one place.
 This tutorial shows how this can be done using `tf.variable_scope()` and
-the `tf.get_variable()`.
+`tf.get_variable()`.
 
 ## The Problem
 
@@ -368,6 +368,6 @@ sequence-to-sequence models.
 
 File | What's in it?
 --- | ---
-`models/tutorials/image/cifar10/cifar10.py` | Model for detecting objects in images.
-`models/tutorials/rnn/rnn_cell.py` | Cell functions for recurrent neural networks.
-`models/tutorials/rnn/seq2seq.py` | Functions for building sequence-to-sequence models.
+`tutorials/image/cifar10/cifar10.py` | Model for detecting objects in images.
+`tutorials/rnn/rnn_cell.py` | Cell functions for recurrent neural networks.
+`tutorials/rnn/seq2seq.py` | Functions for building sequence-to-sequence models.
diff --git a/tensorflow/docs_src/programmers_guide/variables.md b/tensorflow/docs_src/programmers_guide/variables.md
index 04bfca5f3bdc8dad9e4dedf3ce8691ae01eb1f44..e8d1e519f0b8fd05039b107a5501ea0da7cc29a6 100644
--- a/tensorflow/docs_src/programmers_guide/variables.md
+++ b/tensorflow/docs_src/programmers_guide/variables.md
@@ -62,7 +62,7 @@ with tf.device("/job:ps/task:7"):
   v = tf.Variable(...)
 ```
 
-**N.B.** Operations that mutate a variable, such as
+**NOTE** Operations that mutate a variable, such as
 @{tf.Variable.assign} and the parameter
 update operations in a
 @{tf.train.Optimizer} *must* run on
diff --git a/tensorflow/docs_src/programmers_guide/version_semantics.md b/tensorflow/docs_src/programmers_guide/version_semantics.md
index 7903053d5e38e3072949d51ab0dbdc99ff293f68..47fc582387a2850fd24dc220fdca88806073be84 100644
--- a/tensorflow/docs_src/programmers_guide/version_semantics.md
+++ b/tensorflow/docs_src/programmers_guide/version_semantics.md
@@ -102,6 +102,13 @@ backward incompatible ways between minor releases. These include:
     optimizer. This may break code relying on the wrong behavior for
     convergence. We will note such changes in the release notes.
 
+*   **Error messages:** We reserve the right to change the text of error
+    messages. In addition, the type of an error may change unless the type is
+    specified in the documentation. For example, a function that says in some
+    condition it will raise an `InvalidArgument` exception, it will continue to
+    raise `InvalidArgument`, but the human-readable message contents can change.
+
+
 Furthermore, any API methods marked "deprecated" in the 1.0 release can
 be deleted in any subsequent minor release.
 
diff --git a/tensorflow/docs_src/tutorials/deep_cnn.md b/tensorflow/docs_src/tutorials/deep_cnn.md
index ba3fbe12804630dc88c3da2f787f9d29b102ed94..f60c8fd7701edb40f97efdc6dc0f970b6aec3cda 100644
--- a/tensorflow/docs_src/tutorials/deep_cnn.md
+++ b/tensorflow/docs_src/tutorials/deep_cnn.md
@@ -83,21 +83,21 @@ for details.  It consists of 1,068,298 learnable parameters and requires about
 ## Code Organization
 
 The code for this tutorial resides in
-[`tensorflow_models/tutorials/image/cifar10/`](https://www.tensorflow.org/code/tensorflow_models/tutorials/image/cifar10/).
+[`models/tutorials/image/cifar10/`](https://github.com/tensorflow/models/tree/master/tutorials/image/cifar10/).
 
 File | Purpose
 --- | ---
-[`cifar10_input.py`](https://www.tensorflow.org/code/tensorflow_models/tutorials/image/cifar10/cifar10_input.py) | Reads the native CIFAR-10 binary file format.
-[`cifar10.py`](https://www.tensorflow.org/code/tensorflow_models/tutorials/image/cifar10/cifar10.py) | Builds the CIFAR-10 model.
-[`cifar10_train.py`](https://www.tensorflow.org/code/tensorflow_models/tutorials/image/cifar10/cifar10_train.py) | Trains a CIFAR-10 model on a CPU or GPU.
-[`cifar10_multi_gpu_train.py`](https://www.tensorflow.org/code/tensorflow_models/tutorials/image/cifar10/cifar10_multi_gpu_train.py) | Trains a CIFAR-10 model on multiple GPUs.
-[`cifar10_eval.py`](https://www.tensorflow.org/code/tensorflow_models/tutorials/image/cifar10/cifar10_eval.py) | Evaluates the predictive performance of a CIFAR-10 model.
+[`cifar10_input.py`](https://github.com/tensorflow/models/tree/master/tutorials/image/cifar10/cifar10_input.py) | Reads the native CIFAR-10 binary file format.
+[`cifar10.py`](https://github.com/tensorflow/models/tree/master/tutorials/image/cifar10/cifar10.py) | Builds the CIFAR-10 model.
+[`cifar10_train.py`](https://github.com/tensorflow/models/tree/master/tutorials/image/cifar10/cifar10_train.py) | Trains a CIFAR-10 model on a CPU or GPU.
+[`cifar10_multi_gpu_train.py`](https://github.com/tensorflow/models/tree/master/tutorials/image/cifar10/cifar10_multi_gpu_train.py) | Trains a CIFAR-10 model on multiple GPUs.
+[`cifar10_eval.py`](https://github.com/tensorflow/models/tree/master/tutorials/image/cifar10/cifar10_eval.py) | Evaluates the predictive performance of a CIFAR-10 model.
 
 
 ## CIFAR-10 Model
 
 The CIFAR-10 network is largely contained in
-[`cifar10.py`](https://www.tensorflow.org/code/tensorflow_models/tutorials/image/cifar10/cifar10.py).
+[`cifar10.py`](https://github.com/tensorflow/models/tree/master/tutorials/image/cifar10/cifar10.py).
 The complete training
 graph contains roughly 765 operations. We find that we can make the code most
 reusable by constructing the graph with the following modules:
@@ -141,7 +141,7 @@ so that we may visualize them in @{$summaries_and_tensorboard$TensorBoard}.
 This is a good practice to verify that inputs are built correctly.
 
 <div style="width:50%; margin:auto; margin-bottom:10px; margin-top:20px;">
-  <img style="width:70%" src="../images/cifar_image_summary.png">
+  <img style="width:70%" src="https://www.tensorflow.org/images/cifar_image_summary.png">
 </div>
 
 Reading images from disk and distorting them can use a non-trivial amount of
@@ -170,7 +170,7 @@ Layer Name | Description
 Here is a graph generated from TensorBoard describing the inference operation:
 
 <div style="width:15%; margin:auto; margin-bottom:10px; margin-top:20px;">
-  <img style="width:100%" src="../images/cifar_graph.png">
+  <img style="width:100%" src="https://www.tensorflow.org/images/cifar_graph.png">
 </div>
 
 > **EXERCISE**: The output of `inference` are un-normalized logits. Try editing
@@ -205,7 +205,7 @@ loss and all these weight decay terms, as returned by the `loss()` function.
 
 We visualize it in TensorBoard with a @{tf.summary.scalar}:
 
-![CIFAR-10 Loss](../images/cifar_loss.png "CIFAR-10 Total Loss")
+![CIFAR-10 Loss](https://www.tensorflow.org/images/cifar_loss.png "CIFAR-10 Total Loss")
 
 We train the model using standard
 [gradient descent](https://en.wikipedia.org/wiki/Gradient_descent)
@@ -214,7 +214,7 @@ with a learning rate that
 @{tf.train.exponential_decay$exponentially decays}
 over time.
 
-![CIFAR-10 Learning Rate Decay](../images/cifar_lr_decay.png "CIFAR-10 Learning Rate Decay")
+![CIFAR-10 Learning Rate Decay](https://www.tensorflow.org/images/cifar_lr_decay.png "CIFAR-10 Learning Rate Decay")
 
 The `train()` function adds the operations needed to minimize the objective by
 calculating the gradient and updating the learned variables (see
@@ -295,8 +295,8 @@ For instance, we can watch how the distribution of activations and degree of
 sparsity in `local3` features evolve during training:
 
 <div style="width:100%; margin:auto; margin-bottom:10px; margin-top:20px; display: flex; flex-direction: row">
-  <img style="flex-grow:1; flex-shrink:1;" src="../images/cifar_sparsity.png">
-  <img style="flex-grow:1; flex-shrink:1;" src="../images/cifar_activations.png">
+  <img style="flex-grow:1; flex-shrink:1;" src="https://www.tensorflow.org/images/cifar_sparsity.png">
+  <img style="flex-grow:1; flex-shrink:1;" src="https://www.tensorflow.org/images/cifar_activations.png">
 </div>
 
 Individual loss functions, as well as the total loss, are particularly
@@ -378,7 +378,7 @@ processing a batch of data.
 Here is a diagram of this model:
 
 <div style="width:40%; margin:auto; margin-bottom:10px; margin-top:20px;">
-  <img style="width:100%" src="../images/Parallelism.png">
+  <img style="width:100%" src="https://www.tensorflow.org/images/Parallelism.png">
 </div>
 
 Note that each GPU computes inference as well as the gradients for a unique
diff --git a/tensorflow/docs_src/tutorials/image_recognition.md b/tensorflow/docs_src/tutorials/image_recognition.md
index bf03427fc5b011c96cabeff3a0b5c081e1b9309c..88ae451cd5365d878d8b4fea83cebe4c6ff57c91 100644
--- a/tensorflow/docs_src/tutorials/image_recognition.md
+++ b/tensorflow/docs_src/tutorials/image_recognition.md
@@ -36,7 +36,7 @@ images into [1000 classes], like "Zebra", "Dalmatian", and "Dishwasher".
 For example, here are the results from [AlexNet] classifying some images:
 
 <div style="width:50%; margin:auto; margin-bottom:10px; margin-top:20px;">
-<img style="width:100%" src="../images/AlexClassification.png">
+<img style="width:100%" src="https://www.tensorflow.org/images/AlexClassification.png">
 </div>
 
 To compare models, we examine how often the model fails to predict the
@@ -75,7 +75,7 @@ Start by cloning the [TensorFlow models repo](https://github.com/tensorflow/mode
 The above command will classify a supplied image of a panda bear.
 
 <div style="width:15%; margin:auto; margin-bottom:10px; margin-top:20px;">
-  <img style="width:100%" src="../images/cropped_panda.jpg">
+  <img style="width:100%" src="https://www.tensorflow.org/images/cropped_panda.jpg">
 </div>
 
 If the model runs correctly, the script will produce the following output:
@@ -137,7 +137,7 @@ score of 0.8.
 
 
 <div style="width:45%; margin:auto; margin-bottom:10px; margin-top:20px;">
-  <img style="width:100%" src="../images/grace_hopper.jpg">
+  <img style="width:100%" src="https://www.tensorflow.org/images/grace_hopper.jpg">
 </div>
 
 Next, try it out on your own images by supplying the --image= argument, e.g.
diff --git a/tensorflow/docs_src/tutorials/image_retraining.md b/tensorflow/docs_src/tutorials/image_retraining.md
index c42bb8a023eaac7fdb5ae17c6940e760d8cbee51..a65b5845cf52a9e723dec004cac8e95508d271f8 100644
--- a/tensorflow/docs_src/tutorials/image_retraining.md
+++ b/tensorflow/docs_src/tutorials/image_retraining.md
@@ -18,7 +18,7 @@ to help control the training process.
 
 ## Training on Flowers
 
-![Daisies by Kelly Sikkema](../images/daisies.jpg)
+![Daisies by Kelly Sikkema](https://www.tensorflow.org/images/daisies.jpg)
 [Image by Kelly Sikkema](https://www.flickr.com/photos/95072945@N05/9922116524/)
 
 Before you start any training, you'll need a set of images to teach the network
@@ -174,7 +174,7 @@ you do that and pass the root folder of the subdirectories as the argument to
 Here's what the folder structure of the flowers archive looks like, to give you
 and example of the kind of layout the script is looking for:
 
-![Folder Structure](../images/folder_structure.png)
+![Folder Structure](https://www.tensorflow.org/images/folder_structure.png)
 
 In practice it may take some work to get the accuracy you want. I'll try to
 guide you through some of the common problems you might encounter below.
diff --git a/tensorflow/docs_src/tutorials/layers.md b/tensorflow/docs_src/tutorials/layers.md
index 2550bd3e4287a9abb4ac25b11fb4ca779e233875..aa8e2cc83991cb15e593198a45b34406f4230c9b 100644
--- a/tensorflow/docs_src/tutorials/layers.md
+++ b/tensorflow/docs_src/tutorials/layers.md
@@ -7,7 +7,7 @@ activation functions, and applying dropout regularization. In this tutorial,
 you'll learn how to use `layers` to build a convolutional neural network model
 to recognize the handwritten digits in the MNIST data set.
 
-![handwritten digits 0–9 from the MNIST data set](../images/mnist_0-9.png)
+![handwritten digits 0–9 from the MNIST data set](https://www.tensorflow.org/images/mnist_0-9.png)
 
 **The [MNIST dataset](http://yann.lecun.com/exdb/mnist/) comprises 60,000
 training examples and 10,000 test examples of the handwritten digits 0–9,
@@ -316,11 +316,11 @@ of 2, which indicates that the subregions extracted by the filter should be
 separated by 2 pixels in both the width and height dimensions (for a 2x2 filter,
 this means that none of the regions extracted will overlap). If you want to set
 different stride values for width and height, you can instead specify a tuple or
-list (e.g., `stride=[3,6]`).
+list (e.g., `stride=[3, 6]`).
 
 Our output tensor produced by `max_pooling2d()` (`pool1`) has a shape of
-<code>[<em>batch_size</em>, 14, 14, 1]</code>: the 2x2 filter reduces width and
-height by 50%.
+<code>[<em>batch_size</em>, 14, 14, 32]</code>: the 2x2 filter reduces width and
+height by 50% each.
 
 ### Convolutional Layer #2 and Pooling Layer #2
 
diff --git a/tensorflow/docs_src/tutorials/mandelbrot.md b/tensorflow/docs_src/tutorials/mandelbrot.md
index 7d8abbdcba67fd86c7e26e735cfa30bb637989c6..1c0a548129c22f2c57107061bd7eda6239eabdb8 100755
--- a/tensorflow/docs_src/tutorials/mandelbrot.md
+++ b/tensorflow/docs_src/tutorials/mandelbrot.md
@@ -109,7 +109,7 @@ Let's see what we've got.
 DisplayFractal(ns.eval())
 ```
 
-![jpeg](../images/mandelbrot_output.jpg)
+![jpeg](https://www.tensorflow.org/images/mandelbrot_output.jpg)
 
 Not bad!
 
diff --git a/tensorflow/docs_src/tutorials/pdes.md b/tensorflow/docs_src/tutorials/pdes.md
index ec6915074ba7392967a8a72a67d7c54ff8d981ae..425e8d7084e7f2505b7a3013b431345b72b38cf0 100755
--- a/tensorflow/docs_src/tutorials/pdes.md
+++ b/tensorflow/docs_src/tutorials/pdes.md
@@ -93,7 +93,7 @@ for n in range(40):
 DisplayArray(u_init, rng=[-0.1, 0.1])
 ```
 
-![jpeg](../images/pde_output_1.jpg)
+![jpeg](https://www.tensorflow.org/images/pde_output_1.jpg)
 
 
 Now let's specify the details of the differential equation.
diff --git a/tensorflow/docs_src/tutorials/seq2seq.md b/tensorflow/docs_src/tutorials/seq2seq.md
index a3db3e51cfd1cdba489c14d9c1b50f1512ce240f..6ffe3e8b037a8e21b38cded7e3b0d617b4ddb212 100644
--- a/tensorflow/docs_src/tutorials/seq2seq.md
+++ b/tensorflow/docs_src/tutorials/seq2seq.md
@@ -40,7 +40,7 @@ networks (RNNs): an *encoder* that processes the input and a *decoder* that
 generates the output. This basic architecture is depicted below.
 
 <div style="width:80%; margin:auto; margin-bottom:10px; margin-top:20px;">
-<img style="width:100%" src="../images/basic_seq2seq.png" />
+<img style="width:100%" src="https://www.tensorflow.org/images/basic_seq2seq.png" />
 </div>
 
 Each box in the picture above represents a cell of the RNN, most commonly
@@ -62,7 +62,7 @@ decoding step. A multi-layer sequence-to-sequence network with LSTM cells and
 attention mechanism in the decoder looks like this.
 
 <div style="width:80%; margin:auto; margin-bottom:10px; margin-top:20px;">
-<img style="width:100%" src="../images/attention_seq2seq.png" />
+<img style="width:100%" src="https://www.tensorflow.org/images/attention_seq2seq.png" />
 </div>
 
 ## TensorFlow seq2seq library
diff --git a/tensorflow/docs_src/tutorials/wide.md b/tensorflow/docs_src/tutorials/wide.md
index e530e6e1ac9347a5cd49ffadabd7851912a38df5..ce820099037757d79b79fe89c8c0d1ef200400e7 100644
--- a/tensorflow/docs_src/tutorials/wide.md
+++ b/tensorflow/docs_src/tutorials/wide.md
@@ -17,22 +17,19 @@ To try the code for this tutorial:
 2.  Download [the tutorial code](https://www.tensorflow.org/code/tensorflow/examples/learn/wide_n_deep_tutorial.py).
 
 3.  Install the pandas data analysis library. tf.learn doesn't require pandas, but it does support it, and this tutorial uses pandas. To install pandas:
-    1. Get `pip`:
 
-       ```shell
-       # Ubuntu/Linux 64-bit
-       $ sudo apt-get install python-pip python-dev
+    a. Get `pip`:
 
-       # Mac OS X
-       $ sudo easy_install pip
-       $ sudo easy_install --upgrade six
-       ```
+        # Ubuntu/Linux 64-bit
+        $ sudo apt-get install python-pip python-dev
 
-    2. Use `pip` to install pandas:
+        # Mac OS X
+        $ sudo easy_install pip
+        $ sudo easy_install --upgrade six
 
-       ```shell
-       $ sudo pip install pandas
-       ```
+    b. Use `pip` to install pandas:
+
+        $ sudo pip install pandas
 
     If you have trouble installing pandas, consult the
     [instructions](http://pandas.pydata.org/pandas-docs/stable/install.html)
@@ -41,9 +38,7 @@ To try the code for this tutorial:
 4. Execute the tutorial code with the following command to train the linear
 model described in this tutorial:
 
-   ```shell
-   $ python wide_n_deep_tutorial.py --model_type=wide
-   ```
+        $ python wide_n_deep_tutorial.py --model_type=wide
 
 Read on to find out how this code builds its linear model.
 
diff --git a/tensorflow/docs_src/tutorials/wide_and_deep.md b/tensorflow/docs_src/tutorials/wide_and_deep.md
index bae934b3f4c1d578fcb3f2dd8a0308634772da69..77c905fd51369ec63bb47b3cdb7dc58f862a6410 100644
--- a/tensorflow/docs_src/tutorials/wide_and_deep.md
+++ b/tensorflow/docs_src/tutorials/wide_and_deep.md
@@ -17,8 +17,7 @@ large-scale regression and classification problems with sparse input features
 you're interested in learning more about how Wide & Deep Learning works, please
 check out our [research paper](http://arxiv.org/abs/1606.07792).
 
-![Wide & Deep Spectrum of Models]
-(../images/wide_n_deep.svg "Wide & Deep")
+![Wide & Deep Spectrum of Models](https://www.tensorflow.org/images/wide_n_deep.svg "Wide & Deep")
 
 The figure above shows a comparison of a wide model (logistic regression with
 sparse features and transformations), a deep model (feed-forward neural network
@@ -44,22 +43,19 @@ To try the code for this tutorial:
 2.  Download [the tutorial code](https://www.tensorflow.org/code/tensorflow/examples/learn/wide_n_deep_tutorial.py).
 
 3.  Install the pandas data analysis library. tf.learn doesn't require pandas, but it does support it, and this tutorial uses pandas. To install pandas:
-    1. Get `pip`:
 
-       ```bsh
-       # Ubuntu/Linux 64-bit
-       $ sudo apt-get install python-pip python-dev
+    a. Get `pip`:
 
-       # Mac OS X
-       $ sudo easy_install pip
-       $ sudo easy_install --upgrade six
-      ```
+        # Ubuntu/Linux 64-bit
+        $ sudo apt-get install python-pip python-dev
 
-    2. Use `pip` to install pandas:
+        # Mac OS X
+        $ sudo easy_install pip
+        $ sudo easy_install --upgrade six
 
-       ```bsh
-       $ sudo pip install pandas
-       ```
+    b. Use `pip` to install pandas:
+
+        $ sudo pip install pandas
 
     If you have trouble installing pandas, consult the
     [instructions](http://pandas.pydata.org/pandas-docs/stable/install.html)
@@ -68,9 +64,7 @@ To try the code for this tutorial:
 4. Execute the tutorial code with the following command to train the linear
 model described in this tutorial:
 
-   ```shell
-   $ python wide_n_deep_tutorial.py --model_type=wide_n_deep
-   ```
+        $ python wide_n_deep_tutorial.py --model_type=wide_n_deep
 
 Read on to find out how this code builds its linear model.
 
diff --git a/tensorflow/docs_src/tutorials/word2vec.md b/tensorflow/docs_src/tutorials/word2vec.md
index 3845e67496c0cff76515faf6d04f808c62dc042f..348e069ed6d1c9cf7d6b020f1f9f95be6871b3d7 100644
--- a/tensorflow/docs_src/tutorials/word2vec.md
+++ b/tensorflow/docs_src/tutorials/word2vec.md
@@ -23,7 +23,7 @@ straight in, feel free to look at the minimalistic implementation in
 This basic example contains the code needed to download some data, train on it a
 bit and visualize the result. Once you get comfortable with reading and running
 the basic version, you can graduate to
-[tensorflow_models/tutorials/embedding/word2vec.py](https://www.tensorflow.org/code/tensorflow_models/tutorials/embedding/word2vec.py)
+[tensorflow_models/tutorials/embedding/word2vec.py](https://github.com/tensorflow/models/tree/master/tutorials/embedding/word2vec.py)
 which is a more serious implementation that showcases some more advanced
 TensorFlow principles about how to efficiently use threads to move data into a
 text model, how to checkpoint during training, etc.
@@ -51,7 +51,7 @@ means that we may need more data in order to successfully train statistical
 models.  Using vector representations can overcome some of these obstacles.
 
 <div style="width:100%; margin:auto; margin-bottom:10px; margin-top:20px;">
-<img style="width:100%" src="../images/audio-image-text.png" alt>
+<img style="width:100%" src="https://www.tensorflow.org/images/audio-image-text.png" alt>
 </div>
 
 [Vector space models](https://en.wikipedia.org/wiki/Vector_space_model) (VSMs)
@@ -125,7 +125,7 @@ probability using the score for all other \\(V\\) words \\(w'\\) in the current
 context \\(h\\), *at every training step*.
 
 <div style="width:60%; margin:auto; margin-bottom:10px; margin-top:20px;">
-<img style="width:100%" src="../images/softmax-nplm.png" alt>
+<img style="width:100%" src="https://www.tensorflow.org/images/softmax-nplm.png" alt>
 </div>
 
 On the other hand, for feature learning in word2vec we do not need a full
@@ -136,7 +136,7 @@ same context. We illustrate this below for a CBOW model. For skip-gram the
 direction is simply inverted.
 
 <div style="width:60%; margin:auto; margin-bottom:10px; margin-top:20px;">
-<img style="width:100%" src="../images/nce-nplm.png" alt>
+<img style="width:100%" src="https://www.tensorflow.org/images/nce-nplm.png" alt>
 </div>
 
 Mathematically, the objective (for each example) is to maximize
@@ -233,7 +233,7 @@ below (see also for example
 [Mikolov et al., 2013](http://www.aclweb.org/anthology/N13-1090)).
 
 <div style="width:100%; margin:auto; margin-bottom:10px; margin-top:20px;">
-<img style="width:100%" src="../images/linear-relationships.png" alt>
+<img style="width:100%" src="https://www.tensorflow.org/images/linear-relationships.png" alt>
 </div>
 
 This explains why these vectors are also useful as features for many canonical
@@ -335,13 +335,13 @@ After training has finished we can visualize the learned embeddings using
 t-SNE.
 
 <div style="width:100%; margin:auto; margin-bottom:10px; margin-top:20px;">
-<img style="width:100%" src="../images/tsne.png" alt>
+<img style="width:100%" src="https://www.tensorflow.org/images/tsne.png" alt>
 </div>
 
 Et voila! As expected, words that are similar end up clustering nearby each
 other. For a more heavyweight implementation of word2vec that showcases more of
 the advanced features of TensorFlow, see the implementation in
-[tensorflow_models/tutorials/embedding/word2vec.py](https://www.tensorflow.org/code/tensorflow_models/tutorials/embedding/word2vec.py).
+[tensorflow_models/tutorials/embedding/word2vec.py](https://github.com/tensorflow/models/tree/master/tutorials/embedding/word2vec.py).
 
 ## Evaluating Embeddings: Analogical Reasoning
 
@@ -357,7 +357,7 @@ Download the dataset for this task from
 
 To see how we do this evaluation, have a look at the `build_eval_graph()` and
 `eval()` functions in
-[tensorflow_models/tutorials/embedding/word2vec.py](https://www.tensorflow.org/code/tensorflow_models/tutorials/embedding/word2vec.py).
+[tensorflow_models/tutorials/embedding/word2vec.py](https://github.com/tensorflow/models/tree/master/tutorials/embedding/word2vec.py).
 
 The choice of hyperparameters can strongly influence the accuracy on this task.
 To achieve state-of-the-art performance on this task requires training over a
@@ -385,13 +385,13 @@ your model is seriously bottlenecked on input data, you may want to implement a
 custom data reader for your problem, as described in
 @{$new_data_formats$New Data Formats}.  For the case of Skip-Gram
 modeling, we've actually already done this for you as an example in
-[tensorflow_models/tutorials/embedding/word2vec.py](https://www.tensorflow.org/code/tensorflow_models/tutorials/embedding/word2vec.py).
+[tensorflow_models/tutorials/embedding/word2vec.py](https://github.com/tensorflow/models/tree/master/tutorials/embedding/word2vec.py).
 
 If your model is no longer I/O bound but you want still more performance, you
 can take things further by writing your own TensorFlow Ops, as described in
 @{$adding_an_op$Adding a New Op}.  Again we've provided an
 example of this for the Skip-Gram case
-[tensorflow_models/tutorials/embedding/word2vec_optimized.py](https://www.tensorflow.org/code/tensorflow_models/tutorials/embedding/word2vec_optimized.py).
+[tensorflow_models/tutorials/embedding/word2vec_optimized.py](https://github.com/tensorflow/models/tree/master/tutorials/embedding/word2vec_optimized.py).
 Feel free to benchmark these against each other to measure performance
 improvements at each stage.
 
diff --git a/tensorflow/examples/android/README.md b/tensorflow/examples/android/README.md
index 0414566b98470cc8b37b46f4bbf090278e13dbc4..08fb20e99536b180132de81b8e216f1d208c2c05 100644
--- a/tensorflow/examples/android/README.md
+++ b/tensorflow/examples/android/README.md
@@ -28,9 +28,9 @@ on API >= 14 devices.
         using Deep Neural Networks](https://arxiv.org/abs/1312.2249) to
         localize and track people in the camera preview in real-time.
 3. [TF Stylize](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/examples/android/src/org/tensorflow/demo/StylizeActivity.java):
-        Uses a model based on [A Learned Representation For Artistic Style]
-        (https://arxiv.org/abs/1610.07629) to restyle the camera preview image
-        to that of a number of different artists.
+        Uses a model based on [A Learned Representation For Artistic
+        Style](https://arxiv.org/abs/1610.07629) to restyle the camera preview 
+        image to that of a number of different artists.
 
 <img src="sample_images/classify1.jpg" width="30%"><img src="sample_images/stylize1.jpg" width="30%"><img src="sample_images/detect1.jpg" width="30%">
 
diff --git a/tensorflow/examples/android/build.gradle b/tensorflow/examples/android/build.gradle
index 4f241027f4b6e80089be2bca179a8d4d4f565032..5a173b129be1e5d0598bd48848749ed1ab0446c3 100644
--- a/tensorflow/examples/android/build.gradle
+++ b/tensorflow/examples/android/build.gradle
@@ -7,20 +7,46 @@
 // libraries back into the appropriate directory.
 //
 // Alternatively, experimental support for Makefile builds is provided by
-// setting buildWithMake below to true. This will allow building the demo
+// setting nativeBuildSystem below to 'makefile'. This will allow building the demo
 // on Windows machines, but note that full equivalence with the Bazel
 // build is not yet guaranteed. See comments below for caveats and tips
 // for speeding up the build, such as as enabling ccache.
-
-// Set to true to build with make.
 // NOTE: Running a make build will cause subsequent Bazel builds to *fail*
 // unless the contrib/makefile/downloads/ and gen/ dirs are deleted afterwards.
-def buildWithMake = false
+
+// The cmake build only creates libtensorflow_demo.so. In this situation,
+// libtensorflow_inference.so will be acquired via the tensorflow.aar dependency.
+
+// It is necessary to customize Gradle's build directory, as otherwise
+// it will conflict with the BUILD file used by Bazel on case-insensitive OSs.
+project.buildDir = 'gradleBuild'
+getProject().setBuildDir('gradleBuild')
+
+buildscript {
+    repositories {
+        jcenter()
+    }
+
+    dependencies {
+        classpath 'com.android.tools.build:gradle:2.3.0'
+        classpath 'org.apache.httpcomponents:httpclient:4.5.2'
+    }
+}
+
+allprojects {
+    repositories {
+        jcenter()
+    }
+}
+
+// set to 'bazel', 'cmake', 'makefile', 'none'
+def nativeBuildSystem = 'bazel'
 
 // Controls output directory in APK and CPU type for Bazel builds.
 // NOTE: Does not affect the Makefile build target API (yet), which currently
 // assumes armeabi-v7a. If building with make, changing this will require
 // editing the Makefile as well.
+// The CMake build has only been tested with armeabi-v7a; others may not work.
 def cpuType = 'armeabi-v7a'
 
 // Output directory in the local directory for packaging into the APK.
@@ -30,62 +56,66 @@ def nativeOutDir = 'libs/' + cpuType
 def nativeBuildRule = 'buildNativeBazel'
 def demoLibPath = '../../../bazel-bin/tensorflow/examples/android/libtensorflow_demo.so'
 def inferenceLibPath = '../../../bazel-bin/tensorflow/contrib/android/libtensorflow_inference.so'
-if (buildWithMake) {
+if (nativeBuildSystem == 'makefile') {
     nativeBuildRule = 'buildNativeMake'
     demoLibPath = '../../../tensorflow/contrib/makefile/gen/lib/libtensorflow_demo.so'
     inferenceLibPath = '../../../tensorflow/contrib/makefile/gen/lib/libtensorflow_inference.so'
 }
 
-// Defines the NDK location for Makefile builds. Does *not* affect Bazel builds.
-// Override with your absolute NDK location if this fails to get the location
-// automatically.
-def makeNdkRoot = System.getenv('NDK_ROOT')
-
 // If building with Bazel, this is the location of the bazel binary.
 // NOTE: Bazel does not yet support building for Android on Windows,
 // so in this case the Makefile build must be used as described above.
 def bazelLocation = '/usr/local/bin/bazel'
 
-project.buildDir = 'gradleBuild'
-getProject().setBuildDir('gradleBuild')
-
 // import DownloadModels task
 project.ext.ASSET_DIR = projectDir.toString() + '/assets'
 project.ext.TMP_DIR   = project.buildDir.toString() + '/downloads'
 
-buildscript {
-    repositories {
-        jcenter()
-    }
-
-    dependencies {
-        classpath 'com.android.tools.build:gradle:2.3.0'
-    }
-}
-
 apply plugin: 'com.android.application'
 
 android {
     compileSdkVersion 23
     buildToolsVersion "25.0.2"
 
+    if (nativeBuildSystem == 'cmake') {
+        defaultConfig {
+            applicationId = 'com.tensorflow.demo'
+            minSdkVersion 21
+            targetSdkVersion 23
+            ndk {
+                abiFilters "${cpuType}"
+            }
+            externalNativeBuild {
+                cmake {
+                    arguments '-DANDROID_TOOLCHAIN=gcc', '-DANDROID_STL=gnustl_static'
+                }
+            }
+        }
+        externalNativeBuild {
+            cmake {
+                path './jni/CMakeLists.txt'
+            }
+        }
+    }
+
     lintOptions {
         abortOnError false
     }
 
     sourceSets {
         main {
-            // TensorFlow Java API sources.
-            java {
-                srcDir '../../java/src/main/java'
-                exclude '**/examples/**'
+            if (nativeBuildSystem == 'bazel' || nativeBuildSystem == 'makefile') {
+                // TensorFlow Java API sources.
+                java {
+                    srcDir '../../java/src/main/java'
+                    exclude '**/examples/**'
+                }
+
+                // Android TensorFlow wrappers, etc.
+                java {
+                    srcDir '../../contrib/android/java'
+                }
             }
-
-            // Android TensorFlow wrappers, etc.
-            java {
-                srcDir '../../contrib/android/java'
-            }
-
             // Android demo app sources.
             java {
                 srcDir 'src'
@@ -115,7 +145,7 @@ task buildNativeBazel(type: Exec) {
 }
 
 task buildNativeMake(type: Exec) {
-    environment "NDK_ROOT", makeNdkRoot
+    environment "NDK_ROOT", android.ndkDirectory
     // Tip: install ccache and uncomment the following to speed up
     // builds significantly.
     // environment "CC_PREFIX", 'ccache'
@@ -138,13 +168,14 @@ task copyNativeLibs(type: Copy) {
     fileMode 0644
 }
 
-
 tasks.whenTaskAdded { task ->
-    if (task.name == 'assembleDebug') {
-        task.dependsOn 'copyNativeLibs'
-    }
-    if (task.name == 'assembleRelease') {
-        task.dependsOn 'copyNativeLibs'
+    if (nativeBuildSystem == 'bazel' || nativeBuildSystem == 'makefile') {
+        if (task.name == 'assembleDebug') {
+            task.dependsOn 'copyNativeLibs'
+        }
+        if (task.name == 'assembleRelease') {
+            task.dependsOn 'copyNativeLibs'
+        }
     }
 }
 
@@ -152,3 +183,9 @@ tasks.whenTaskAdded { task ->
 // place them in the "assets" directory and comment out this line.
 apply from: "download-models.gradle"
 
+
+dependencies {
+    if (nativeBuildSystem == 'cmake' || nativeBuildSystem == 'none') {
+        compile 'org.tensorflow:tensorflow-android:1.2.0-preview'
+    }
+}
diff --git a/tensorflow/examples/android/jni/CMakeLists.txt b/tensorflow/examples/android/jni/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..c8238464f579ea4ecaed6946a8554674325cd32f
--- /dev/null
+++ b/tensorflow/examples/android/jni/CMakeLists.txt
@@ -0,0 +1,51 @@
+#
+# Copyright (C) 2016 The Android Open Source Project
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+project(TENSORFLOW_DEMO)
+cmake_minimum_required(VERSION 3.4.1)
+
+set(CMAKE_VERBOSE_MAKEFILE on)
+
+get_filename_component(TF_SRC_ROOT ${CMAKE_SOURCE_DIR}/../../../..  ABSOLUTE)
+get_filename_component(SAMPLE_SRC_DIR  ${CMAKE_SOURCE_DIR}/..  ABSOLUTE)
+
+if (ANDROID_ABI MATCHES "^armeabi-v7a$")
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mfloat-abi=softfp -mfpu=neon")
+elseif(ANDROID_ABI MATCHES "^arm64-v8a")
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O2 -ftree-vectorize")
+endif()
+
+set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DSTANDALONE_DEMO_LIB \
+                    -std=c++11 -fno-exceptions -fno-rtti -O2 -Wno-narrowing \
+                    -fPIE")
+set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} \
+                              -Wl,--allow-multiple-definition \
+                              -Wl,--whole-archive -fPIE -v")
+
+file(GLOB_RECURSE tensorflow_demo_sources ${SAMPLE_SRC_DIR}/jni/*.*)
+add_library(tensorflow_demo SHARED
+            ${tensorflow_demo_sources})
+target_include_directories(tensorflow_demo PRIVATE
+                           ${TF_SRC_ROOT}
+                           ${CMAKE_SOURCE_DIR})
+
+target_link_libraries(tensorflow_demo
+                      android
+                      log
+                      jnigraphics
+                      m
+                      atomic
+                      z)
diff --git a/tensorflow/examples/android/src/org/tensorflow/demo/ClassifierActivity.java b/tensorflow/examples/android/src/org/tensorflow/demo/ClassifierActivity.java
index b26a2316782dfbcde73c75556b99e624e836549d..bc391269255f64cb17bdc3f7ff65f801b0c60e67 100644
--- a/tensorflow/examples/android/src/org/tensorflow/demo/ClassifierActivity.java
+++ b/tensorflow/examples/android/src/org/tensorflow/demo/ClassifierActivity.java
@@ -194,13 +194,12 @@ public class ClassifierActivity extends CameraActivity implements OnImageAvailab
           yuvBytes[0],
           yuvBytes[1],
           yuvBytes[2],
-          rgbBytes,
           previewWidth,
           previewHeight,
           yRowStride,
           uvRowStride,
           uvPixelStride,
-          false);
+          rgbBytes);
 
       image.close();
     } catch (final Exception e) {
diff --git a/tensorflow/examples/android/src/org/tensorflow/demo/DetectorActivity.java b/tensorflow/examples/android/src/org/tensorflow/demo/DetectorActivity.java
index 206a99f3e3d5bde2f09f6fc5d5ebded97f787f0a..5800f80651bdbd07b3a861299421501cf47b1716 100644
--- a/tensorflow/examples/android/src/org/tensorflow/demo/DetectorActivity.java
+++ b/tensorflow/examples/android/src/org/tensorflow/demo/DetectorActivity.java
@@ -124,7 +124,7 @@ public class DetectorActivity extends CameraActivity implements OnImageAvailable
     borderedText = new BorderedText(textSizePx);
     borderedText.setTypeface(Typeface.MONOSPACE);
 
-    tracker = new MultiBoxTracker(getResources().getDisplayMetrics());
+    tracker = new MultiBoxTracker(this);
 
     if (USE_YOLO) {
       detector =
@@ -273,13 +273,12 @@ public class DetectorActivity extends CameraActivity implements OnImageAvailable
           yuvBytes[0],
           yuvBytes[1],
           yuvBytes[2],
-          rgbBytes,
           previewWidth,
           previewHeight,
           yRowStride,
           uvRowStride,
           uvPixelStride,
-          false);
+          rgbBytes);
 
       image.close();
     } catch (final Exception e) {
diff --git a/tensorflow/examples/android/src/org/tensorflow/demo/StylizeActivity.java b/tensorflow/examples/android/src/org/tensorflow/demo/StylizeActivity.java
index 7634be5c020d93225f29308b11358f5b84f8ee74..7afe2bf5412694c94a0e5b6d575e0a73e42dcb72 100644
--- a/tensorflow/examples/android/src/org/tensorflow/demo/StylizeActivity.java
+++ b/tensorflow/examples/android/src/org/tensorflow/demo/StylizeActivity.java
@@ -65,10 +65,6 @@ import org.tensorflow.demo.R;
  * Artistic Style" (https://arxiv.org/abs/1610.07629)
  */
 public class StylizeActivity extends CameraActivity implements OnImageAvailableListener {
-  static {
-    System.loadLibrary("tensorflow_demo");
-  }
-
   private static final Logger LOGGER = new Logger();
 
   private static final String MODEL_FILE = "file:///android_asset/stylize_quantized.pb";
@@ -509,17 +505,17 @@ public class StylizeActivity extends CameraActivity implements OnImageAvailableL
       final int yRowStride = planes[0].getRowStride();
       final int uvRowStride = planes[1].getRowStride();
       final int uvPixelStride = planes[1].getPixelStride();
+
       ImageUtils.convertYUV420ToARGB8888(
           yuvBytes[0],
           yuvBytes[1],
           yuvBytes[2],
-          rgbBytes,
           previewWidth,
           previewHeight,
           yRowStride,
           uvRowStride,
           uvPixelStride,
-          false);
+          rgbBytes);
 
       image.close();
     } catch (final Exception e) {
diff --git a/tensorflow/examples/android/src/org/tensorflow/demo/TensorFlowImageClassifier.java b/tensorflow/examples/android/src/org/tensorflow/demo/TensorFlowImageClassifier.java
index f660178ebeb1993a86879c309c50acd60dc2b2a4..5756bd6b64f47018e53081c83fb5c62004f87474 100644
--- a/tensorflow/examples/android/src/org/tensorflow/demo/TensorFlowImageClassifier.java
+++ b/tensorflow/examples/android/src/org/tensorflow/demo/TensorFlowImageClassifier.java
@@ -32,10 +32,6 @@ import org.tensorflow.contrib.android.TensorFlowInferenceInterface;
 
 /** A classifier specialized to label images using TensorFlow. */
 public class TensorFlowImageClassifier implements Classifier {
-  static {
-    System.loadLibrary("tensorflow_demo");
-  }
-
   private static final String TAG = "TensorFlowImageClassifier";
 
   // Only return this many results with at least this confidence.
diff --git a/tensorflow/examples/android/src/org/tensorflow/demo/TensorFlowMultiBoxDetector.java b/tensorflow/examples/android/src/org/tensorflow/demo/TensorFlowMultiBoxDetector.java
index f3e7114335ff2367e3b1e4ae58073145710c8fea..1dcf9f55efe810345e1e8280dd8f22098c61a7b3 100644
--- a/tensorflow/examples/android/src/org/tensorflow/demo/TensorFlowMultiBoxDetector.java
+++ b/tensorflow/examples/android/src/org/tensorflow/demo/TensorFlowMultiBoxDetector.java
@@ -41,10 +41,6 @@ import org.tensorflow.demo.env.Logger;
 public class TensorFlowMultiBoxDetector implements Classifier {
   private static final Logger LOGGER = new Logger();
 
-  static {
-    System.loadLibrary("tensorflow_demo");
-  }
-
   // Only return this many results with at least this confidence.
   private static final int MAX_RESULTS = Integer.MAX_VALUE;
 
diff --git a/tensorflow/examples/android/src/org/tensorflow/demo/TensorFlowYoloDetector.java b/tensorflow/examples/android/src/org/tensorflow/demo/TensorFlowYoloDetector.java
index 174723071da9979de6c7d8b004ffa64689af471b..b7e36a2379d264403f4894537ee4a810cbd3f78b 100644
--- a/tensorflow/examples/android/src/org/tensorflow/demo/TensorFlowYoloDetector.java
+++ b/tensorflow/examples/android/src/org/tensorflow/demo/TensorFlowYoloDetector.java
@@ -31,10 +31,6 @@ import org.tensorflow.demo.env.SplitTimer;
 public class TensorFlowYoloDetector implements Classifier {
   private static final Logger LOGGER = new Logger();
 
-  static {
-    System.loadLibrary("tensorflow_demo");
-  }
-
   // Only return this many results with at least this confidence.
   private static final int MAX_RESULTS = 5;
 
diff --git a/tensorflow/examples/android/src/org/tensorflow/demo/env/ImageUtils.java b/tensorflow/examples/android/src/org/tensorflow/demo/env/ImageUtils.java
index db929e5e087545b0ea190f80e945376c41a4b37e..5f2ff9164cc7ad4055359e16fd5dfdd4a67786a2 100644
--- a/tensorflow/examples/android/src/org/tensorflow/demo/env/ImageUtils.java
+++ b/tensorflow/examples/android/src/org/tensorflow/demo/env/ImageUtils.java
@@ -27,6 +27,14 @@ import java.io.FileOutputStream;
 public class ImageUtils {
   @SuppressWarnings("unused")
   private static final Logger LOGGER = new Logger();
+  
+  static {
+    try {
+      System.loadLibrary("tensorflow_demo");
+    } catch (UnsatisfiedLinkError e) {
+      LOGGER.w("Native library not found, native RGB -> YUV conversion may be unavailable.");
+    }
+  }
 
   /**
    * Utility method to compute the allocated size in bytes of a YUV420SP image
@@ -83,10 +91,84 @@ public class ImageUtils {
     }
   }
 
+  // This value is 2 ^ 18 - 1, and is used to clamp the RGB values before their ranges
+  // are normalized to eight bits.
+  static final int kMaxChannelValue = 262143;
+
+  // Always prefer the native implementation if available.
+  private static boolean useNativeConversion = true;
+
+  public static void convertYUV420ToARGB8888(
+      byte[] yData,
+      byte[] uData,
+      byte[] vData,
+      int width,
+      int height,
+      int yRowStride,
+      int uvRowStride,
+      int uvPixelStride,
+      int[] out) {
+    if (useNativeConversion) {
+      try {
+        convertYUV420ToARGB8888(
+            yData, uData, vData, out, width, height, yRowStride, uvRowStride, uvPixelStride, false);
+        return;
+      } catch (UnsatisfiedLinkError e) {
+        LOGGER.w("Native YUV -> RGB implementation not found, falling back to Java implementation");
+        useNativeConversion = false;
+      }
+    }
+
+    int i = 0;
+    for (int y = 0; y < height; y++) {
+      int pY = yRowStride * y;
+      int uv_row_start = uvRowStride * (y >> 1);
+      int pUV = uv_row_start;
+      int pV = uv_row_start;
+
+      for (int x = 0; x < width; x++) {
+        int uv_offset = pUV + (x >> 1) * uvPixelStride;
+        out[i++] =
+            YUV2RGB(
+                convertByteToInt(yData, pY + x),
+                convertByteToInt(uData, uv_offset),
+                convertByteToInt(vData, uv_offset));
+      }
+    }
+  }
+
+  private static int convertByteToInt(byte[] arr, int pos) {
+    return arr[pos] & 0xFF;
+  }
+
+  private static int YUV2RGB(int nY, int nU, int nV) {
+    nY -= 16;
+    nU -= 128;
+    nV -= 128;
+    if (nY < 0) nY = 0;
+
+    // This is the floating point equivalent. We do the conversion in integer
+    // because some Android devices do not have floating point in hardware.
+    // nR = (int)(1.164 * nY + 2.018 * nU);
+    // nG = (int)(1.164 * nY - 0.813 * nV - 0.391 * nU);
+    // nB = (int)(1.164 * nY + 1.596 * nV);
+
+    final int foo = 1192 * nY;
+    int nR = foo + 1634 * nV;
+    int nG = foo - 833 * nV - 400 * nU;
+    int nB = foo + 2066 * nU;
+
+    nR = Math.min(kMaxChannelValue, Math.max(0, nR));
+    nG = Math.min(kMaxChannelValue, Math.max(0, nG));
+    nB = Math.min(kMaxChannelValue, Math.max(0, nB));
+
+    return 0xff000000 | ((nR << 6) & 0x00ff0000) | ((nG >> 2) & 0x0000FF00) | ((nB >> 10) & 0xff);
+  }
+
   /**
-   * Converts YUV420 semi-planar data to ARGB 8888 data using the supplied width
-   * and height. The input and output must already be allocated and non-null.
-   * For efficiency, no error checking is performed.
+   * Converts YUV420 semi-planar data to ARGB 8888 data using the supplied width and height. The
+   * input and output must already be allocated and non-null. For efficiency, no error checking is
+   * performed.
    *
    * @param input The array of YUV 4:2:0 input data.
    * @param output A pre-allocated array for the ARGB 8:8:8:8 output data.
diff --git a/tensorflow/examples/android/src/org/tensorflow/demo/tracking/MultiBoxTracker.java b/tensorflow/examples/android/src/org/tensorflow/demo/tracking/MultiBoxTracker.java
index 49c91d600da4df62a69bfb88ed0b9cb21584fb55..91d1f9feb184f2b145089ed8a410561842b93906 100644
--- a/tensorflow/examples/android/src/org/tensorflow/demo/tracking/MultiBoxTracker.java
+++ b/tensorflow/examples/android/src/org/tensorflow/demo/tracking/MultiBoxTracker.java
@@ -15,6 +15,7 @@ limitations under the License.
 
 package org.tensorflow.demo.tracking;
 
+import android.content.Context;
 import android.graphics.Canvas;
 import android.graphics.Color;
 import android.graphics.Matrix;
@@ -24,9 +25,9 @@ import android.graphics.Paint.Join;
 import android.graphics.Paint.Style;
 import android.graphics.RectF;
 import android.text.TextUtils;
-import android.util.DisplayMetrics;
 import android.util.Pair;
 import android.util.TypedValue;
+import android.widget.Toast;
 import java.util.LinkedList;
 import java.util.List;
 import java.util.Queue;
@@ -69,6 +70,7 @@ public class MultiBoxTracker {
 
   private static class TrackedRecognition {
     ObjectTracker.TrackedObject trackedObject;
+    RectF location;
     float detectionConfidence;
     int color;
     String title;
@@ -87,8 +89,10 @@ public class MultiBoxTracker {
   private int frameHeight;
 
   private int sensorOrientation;
+  private Context context;
 
-  public MultiBoxTracker(final DisplayMetrics metrics) {
+  public MultiBoxTracker(final Context context) {
+    this.context = context;
     for (final int color : COLORS) {
       availableColors.add(color);
     }
@@ -100,7 +104,9 @@ public class MultiBoxTracker {
     boxPaint.setStrokeJoin(Join.ROUND);
     boxPaint.setStrokeMiter(100);
 
-    textSizePx = TypedValue.applyDimension(TypedValue.COMPLEX_UNIT_DIP, TEXT_SIZE_DIP, metrics);
+    textSizePx =
+        TypedValue.applyDimension(
+            TypedValue.COMPLEX_UNIT_DIP, TEXT_SIZE_DIP, context.getResources().getDisplayMetrics());
     borderedText = new BorderedText(textSizePx);
   }
 
@@ -152,10 +158,6 @@ public class MultiBoxTracker {
   }
 
   public synchronized void draw(final Canvas canvas) {
-    if (objectTracker == null) {
-      return;
-    }
-
     // TODO(andrewharp): This may not work for non-90 deg rotations.
     final float multiplier =
         Math.min(canvas.getWidth() / (float) frameHeight, canvas.getHeight() / (float) frameWidth);
@@ -168,9 +170,11 @@ public class MultiBoxTracker {
             sensorOrientation,
             false);
     for (final TrackedRecognition recognition : trackedObjects) {
-      final ObjectTracker.TrackedObject trackedObject = recognition.trackedObject;
+      final RectF trackedPos =
+          (objectTracker != null)
+              ? recognition.trackedObject.getTrackedPositionInPreviewFrame()
+              : new RectF(recognition.location);
 
-      final RectF trackedPos = trackedObject.getTrackedPositionInPreviewFrame();
       getFrameToCanvasMatrix().mapRect(trackedPos);
       boxPaint.setColor(recognition.color);
 
@@ -185,6 +189,8 @@ public class MultiBoxTracker {
     }
   }
 
+  private boolean initialized = false;
+
   public synchronized void onFrame(
       final int w,
       final int h,
@@ -192,7 +198,7 @@ public class MultiBoxTracker {
       final int sensorOrienation,
       final byte[] frame,
       final long timestamp) {
-    if (objectTracker == null) {
+    if (objectTracker == null && !initialized) {
       ObjectTracker.clearInstance();
 
       logger.i("Initializing ObjectTracker: %dx%d", w, h);
@@ -200,6 +206,19 @@ public class MultiBoxTracker {
       frameWidth = w;
       frameHeight = h;
       this.sensorOrientation = sensorOrienation;
+      initialized = true;
+
+      if (objectTracker == null) {
+        String message =
+            "Object tracking support not found. "
+                + "See tensorflow/examples/android/README.md for details.";
+        Toast.makeText(context, message, Toast.LENGTH_LONG).show();
+        logger.e(message);
+      }
+    }
+
+    if (objectTracker == null) {
+      return;
     }
 
     objectTracker.nextFrame(frame, null, timestamp, null, true);
@@ -255,7 +274,20 @@ public class MultiBoxTracker {
     }
 
     if (objectTracker == null) {
-      logger.w("No ObjectTracker, can't track anything!");
+      trackedObjects.clear();
+      for (final Pair<Float, Recognition> potential : rectsToTrack) {
+        final TrackedRecognition trackedRecognition = new TrackedRecognition();
+        trackedRecognition.detectionConfidence = potential.first;
+        trackedRecognition.location = new RectF(potential.second.getLocation());
+        trackedRecognition.trackedObject = null;
+        trackedRecognition.title = potential.second.getTitle();
+        trackedRecognition.color = COLORS[trackedObjects.size()];
+        trackedObjects.add(trackedRecognition);
+
+        if (trackedObjects.size() >= COLORS.length) {
+          break;
+        }
+      }
       return;
     }
 
diff --git a/tensorflow/examples/android/src/org/tensorflow/demo/tracking/ObjectTracker.java b/tensorflow/examples/android/src/org/tensorflow/demo/tracking/ObjectTracker.java
index 82de634baff6f9e80cb7aeb45ee98258953321f7..69f202b56816b5db1c3122471798970f32ddb98a 100644
--- a/tensorflow/examples/android/src/org/tensorflow/demo/tracking/ObjectTracker.java
+++ b/tensorflow/examples/android/src/org/tensorflow/demo/tracking/ObjectTracker.java
@@ -48,7 +48,18 @@ import org.tensorflow.demo.env.Size;
  * ObjectTracker still exists.
  */
 public class ObjectTracker {
-  private final Logger logger = new Logger();
+  private static final Logger LOGGER = new Logger();
+
+  private static boolean libraryFound = false;
+
+  static {
+    try {
+      System.loadLibrary("tensorflow_demo");
+      libraryFound = true;
+    } catch (UnsatisfiedLinkError e) {
+      LOGGER.e("libtensorflow_demo.so not found, tracking unavailable");
+    }
+  }
 
   private static final boolean DRAW_TEXT = false;
 
@@ -194,6 +205,13 @@ public class ObjectTracker {
 
   public static synchronized ObjectTracker getInstance(
       final int frameWidth, final int frameHeight, final int rowStride, final boolean alwaysTrack) {
+    if (!libraryFound) {
+      LOGGER.e(
+          "Native object tracking support not found. "
+              + "See tensorflow/examples/android/README.md for details.");
+      return null;
+    }
+
     if (instance == null) {
       instance = new ObjectTracker(frameWidth, frameHeight, rowStride, alwaysTrack);
       instance.init();
@@ -519,7 +537,7 @@ public class ObjectTracker {
       checkValidObject();
       synchronized (ObjectTracker.this) {
         if (lastExternalPositionTime > timestamp) {
-          logger.w("Tried to use older position time!");
+          LOGGER.w("Tried to use older position time!");
           return;
         }
         final RectF externalPosition = downscaleRect(position);
@@ -640,8 +658,4 @@ public class ObjectTracker {
 
   protected static native void downsampleImageNative(
       int width, int height, int rowStride, byte[] input, int factor, byte[] output);
-
-  static {
-    System.loadLibrary("tensorflow_demo");
-  }
 }
diff --git a/tensorflow/examples/benchmark/BUILD b/tensorflow/examples/benchmark/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..c4bb0a5bd952ea175a4fd2444a3d632dc13445de
--- /dev/null
+++ b/tensorflow/examples/benchmark/BUILD
@@ -0,0 +1,31 @@
+# Description:
+# Examples of adding a benchmark to TensorFlow.
+
+load(
+    "//tensorflow/tools/test:performance.bzl",
+    "tf_py_logged_benchmark",
+)
+
+licenses(["notice"])  # Apache 2.0
+
+exports_files(["LICENSE"])
+
+py_test(
+    name = "sample_benchmark",
+    srcs = ["sample_benchmark.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        "//tensorflow:tensorflow_py",
+    ],
+)
+
+tf_py_logged_benchmark(
+    name = "sample_logged_benchmark",
+    target = "//tensorflow/examples/benchmark:sample_benchmark",
+)
+
+filegroup(
+    name = "all_files",
+    srcs = glob(["**/*"]),
+    visibility = ["//tensorflow:__subpackages__"],
+)
diff --git a/tensorflow/examples/benchmark/sample_benchmark.py b/tensorflow/examples/benchmark/sample_benchmark.py
new file mode 100644
index 0000000000000000000000000000000000000000..e98d7a2b5f09c08f8796d982e218081ca248de58
--- /dev/null
+++ b/tensorflow/examples/benchmark/sample_benchmark.py
@@ -0,0 +1,50 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Sample TensorFlow benchmark."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import time
+
+import tensorflow as tf
+
+
+# Define a class that extends from tf.test.Benchmark.
+class SampleBenchmark(tf.test.Benchmark):
+
+  # Note: benchmark method name must start with `benchmark`.
+  def benchmarkSum(self):
+    with tf.Session() as sess:
+      x = tf.constant(10)
+      y = tf.constant(5)
+      result = tf.add(x, y)
+
+      iters = 100
+      start_time = time.time()
+      for _ in range(iters):
+        sess.run(result)
+      total_wall_time = time.time() - start_time
+
+      # Call report_benchmark to report a metric value.
+      self.report_benchmark(
+          name="sum_wall_time",
+          # This value should always be per iteration.
+          wall_time=total_wall_time/iters,
+          iters=iters)
+
+
+if __name__ == "__main__":
+  tf.test.main()
diff --git a/tensorflow/examples/how_tos/reading_data/fully_connected_reader.py b/tensorflow/examples/how_tos/reading_data/fully_connected_reader.py
index a9e73def6a9ca0c86e18a452d4ebdaee82141061..a9ed02dd1a60ad79c2943212155bad864a750a99 100644
--- a/tensorflow/examples/how_tos/reading_data/fully_connected_reader.py
+++ b/tensorflow/examples/how_tos/reading_data/fully_connected_reader.py
@@ -17,7 +17,8 @@
 
 This version is like fully_connected_feed.py but uses data converted
 to a TFRecords file containing tf.train.Example protocol buffers.
-See tensorflow/g3doc/how_tos/reading_data.md#reading-from-files
+See:
+https://www.tensorflow.org/programmers_guide/reading_data#reading_from_files
 for context.
 
 YOU MUST run convert_to_records before running this (but you only need to
diff --git a/tensorflow/examples/image_retraining/retrain.py b/tensorflow/examples/image_retraining/retrain.py
index 5f4b6bed48219b5061e23479b669abcc8106a296..6c1b40b442b0bf877592146c1eda206586dd9e9f 100644
--- a/tensorflow/examples/image_retraining/retrain.py
+++ b/tensorflow/examples/image_retraining/retrain.py
@@ -369,9 +369,12 @@ def create_bottleneck_file(bottleneck_path, image_lists, label_name, index,
   if not gfile.Exists(image_path):
     tf.logging.fatal('File does not exist %s', image_path)
   image_data = gfile.FastGFile(image_path, 'rb').read()
-  bottleneck_values = run_bottleneck_on_image(sess, image_data,
-                                              jpeg_data_tensor,
-                                              bottleneck_tensor)
+  try:
+    bottleneck_values = run_bottleneck_on_image(
+        sess, image_data, jpeg_data_tensor, bottleneck_tensor)
+  except:
+    raise RuntimeError('Error during processing file %s' % image_path)
+
   bottleneck_string = ','.join(str(x) for x in bottleneck_values)
   with open(bottleneck_path, 'w') as bottleneck_file:
     bottleneck_file.write(bottleneck_string)
diff --git a/tensorflow/examples/label_image/main.cc b/tensorflow/examples/label_image/main.cc
index 8e3f69a6d629cf42dfab9fda2263877b519dcf9b..d98a5c31ab3adb75f258d902bac6e286941e6bb2 100644
--- a/tensorflow/examples/label_image/main.cc
+++ b/tensorflow/examples/label_image/main.cc
@@ -30,8 +30,12 @@ limitations under the License.
 // the top of the main() function.
 //
 // The googlenet_graph.pb file included by default is created from Inception.
+//
+// Note that, for GIF inputs, to reuse existing code, only single-frame ones
+// are supported.
 
 #include <fstream>
+#include <utility>
 #include <vector>
 
 #include "tensorflow/cc/ops/const_op.h"
@@ -62,7 +66,7 @@ using tensorflow::int32;
 // Takes a file name, and loads a list of labels from it, one per line, and
 // returns a vector of the strings. It pads with empty strings so the length
 // of the result is a multiple of 16, because our model expects that.
-Status ReadLabelsFile(string file_name, std::vector<string>* result,
+Status ReadLabelsFile(const string& file_name, std::vector<string>* result,
                       size_t* found_label_count) {
   std::ifstream file(file_name);
   if (!file) {
@@ -84,7 +88,7 @@ Status ReadLabelsFile(string file_name, std::vector<string>* result,
 
 // Given an image file name, read in the data, try to decode it as an image,
 // resize it to the requested size, and then scale the values as desired.
-Status ReadTensorFromImageFile(string file_name, const int input_height,
+Status ReadTensorFromImageFile(const string& file_name, const int input_height,
                                const int input_width, const float input_mean,
                                const float input_std,
                                std::vector<Tensor>* out_tensors) {
@@ -102,7 +106,10 @@ Status ReadTensorFromImageFile(string file_name, const int input_height,
     image_reader = DecodePng(root.WithOpName("png_reader"), file_reader,
                              DecodePng::Channels(wanted_channels));
   } else if (tensorflow::StringPiece(file_name).ends_with(".gif")) {
-    image_reader = DecodeGif(root.WithOpName("gif_reader"), file_reader);
+    // gif decoder returns 4-D tensor, remove the first dim
+    image_reader = Squeeze(root.WithOpName("squeeze_first_dim"),
+                           DecodeGif(root.WithOpName("gif_reader"),
+                                     file_reader));
   } else {
     // Assume if it's neither a PNG nor a GIF then it must be a JPEG.
     image_reader = DecodeJpeg(root.WithOpName("jpeg_reader"), file_reader,
@@ -138,7 +145,7 @@ Status ReadTensorFromImageFile(string file_name, const int input_height,
 
 // Reads a model graph definition from disk, and creates a session object you
 // can use to run it.
-Status LoadGraph(string graph_file_name,
+Status LoadGraph(const string& graph_file_name,
                  std::unique_ptr<tensorflow::Session>* session) {
   tensorflow::GraphDef graph_def;
   Status load_graph_status =
@@ -185,7 +192,7 @@ Status GetTopLabels(const std::vector<Tensor>& outputs, int how_many_labels,
 // Given the output of a model run, and the name of a file containing the labels
 // this prints out the top five highest-scoring values.
 Status PrintTopLabels(const std::vector<Tensor>& outputs,
-                      string labels_file_name) {
+                      const string& labels_file_name) {
   std::vector<string> labels;
   size_t label_count;
   Status read_labels_status =
@@ -307,11 +314,11 @@ int main(int argc, char* argv[]) {
   }
 
   // This is for automated testing to make sure we get the expected result with
-  // the default settings. We know that label 866 (military uniform) should be
+  // the default settings. We know that label 653 (military uniform) should be
   // the top label for the Admiral Hopper image.
   if (self_test) {
     bool expected_matches;
-    Status check_status = CheckTopLabel(outputs, 866, &expected_matches);
+    Status check_status = CheckTopLabel(outputs, 653, &expected_matches);
     if (!check_status.ok()) {
       LOG(ERROR) << "Running check failed: " << check_status;
       return -1;
diff --git a/tensorflow/examples/multibox_detector/main.cc b/tensorflow/examples/multibox_detector/main.cc
index 0d6875671b1192ed8f09e3672440e83c7092937d..e38704fd98cea6928231f2fc2bc989705ae46bb4 100644
--- a/tensorflow/examples/multibox_detector/main.cc
+++ b/tensorflow/examples/multibox_detector/main.cc
@@ -159,7 +159,7 @@ Status SaveImage(const Tensor& tensor, const string& file_path) {
 
 // Reads a model graph definition from disk, and creates a session object you
 // can use to run it.
-Status LoadGraph(string graph_file_name,
+Status LoadGraph(const string& graph_file_name,
                  std::unique_ptr<tensorflow::Session>* session) {
   tensorflow::GraphDef graph_def;
   Status load_graph_status =
diff --git a/tensorflow/examples/tutorials/estimators/abalone.py b/tensorflow/examples/tutorials/estimators/abalone.py
index 932ce8a8b25f0b82d61a2ec3e5ea0b980994e1e4..3c0ea2e409076671b282253d22f99516bfa99ffc 100644
--- a/tensorflow/examples/tutorials/estimators/abalone.py
+++ b/tensorflow/examples/tutorials/estimators/abalone.py
@@ -134,12 +134,22 @@ def main(unused_argv):
 
   # Instantiate Estimator
   nn = tf.contrib.learn.Estimator(model_fn=model_fn, params=model_params)
-
+  
+  def get_train_inputs():
+    x = tf.constant(training_set.data)
+    y = tf.constant(training_set.target)
+    return x, y
+  
   # Fit
-  nn.fit(x=training_set.data, y=training_set.target, steps=5000)
+  nn.fit(input_fn=get_train_inputs, steps=5000)
 
   # Score accuracy
-  ev = nn.evaluate(x=test_set.data, y=test_set.target, steps=1)
+  def get_test_inputs():
+    x = tf.constant(test_set.data)
+    y = tf.constant(test_set.target)
+    return x, y
+  
+  ev = nn.evaluate(input_fn=get_test_inputs, steps=1)
   print("Loss: %s" % ev["loss"])
   print("Root Mean Squared Error: %s" % ev["rmse"])
 
diff --git a/tensorflow/go/README.md b/tensorflow/go/README.md
index e32c21ca72056f1150aaa59ff5903d0054f7d14e..a1b4255292b0908fd5f022ce641967ba1b30f75c 100644
--- a/tensorflow/go/README.md
+++ b/tensorflow/go/README.md
@@ -9,24 +9,22 @@ Construct and execute TensorFlow graphs in Go.
 > (`github.com/tensorflow/tensorflow/tensorflow/go`).
 
 ## Quickstart
-
 1.  Download and extract the TensorFlow C library, preferably into `/usr/local`.
     GPU-enabled versions require CUDA 8.0 and cuDNN 5.1. For other versions, the
     TensorFlow C library will have to be built from source (see below).
 
     -   Linux:
-        [CPU-only](https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-cpu-linux-x86_64-1.0.0.tar.gz),
-        [GPU-enabled](https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-gpu-linux-x86_64-1.0.0.tar.gz)
+        [CPU-only](https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-cpu-linux-x86_64-1.1.0.tar.gz),
+        [GPU-enabled](https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-gpu-linux-x86_64-1.1.0.tar.gz)
     -   OS X
-        [CPU-only](https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-cpu-darwin-x86_64-1.0.0.tar.gz),
-        [GPU-enabled](https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-gpu-darwin-x86_64-1.0.0.tar.gz)
+        [CPU-only](https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-cpu-darwin-x86_64-1.1.0.tar.gz),
 
     The following shell snippet downloads and extracts into `/usr/local`:
 
     ```sh
     TF_TYPE="cpu" # Set to "gpu" for GPU support
     curl -L \
-      "https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-${TF_TYPE}-$(go env GOOS)-x86_64-1.0.0.tar.gz" |
+      "https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-${TF_TYPE}-$(go env GOOS)-x86_64-1.1.0.tar.gz" |
     sudo tar -C /usr/local -xz
     ```
 
@@ -41,20 +39,7 @@ Construct and execute TensorFlow graphs in Go.
 
 ### Installing into locations other than `/usr/local`
 
-The TensorFlow C library (`libtensorflow.so`) needs to be available at build
-time (e.g., `go build`) and run time (`go test` or executing binaries). If the
-library has not been extracted into `/usr/local`, then it needs to be made
-available through the `LIBRARY_PATH` environment variable at build time and the
-`LD_LIBRARY_PATH` environment variable (`DYLD_LIBRARY_PATH` on OS X) at run
-time.
-
-For example, if the TensorFlow C library was extracted into `/dir`, then:
-
-```sh
-export LIBRARY_PATH=/dir/lib
-export LD_LIBRARY_PATH=/dir/lib   # For Linux
-export DYLD_LIBRARY_PATH=/dir/lib # For OS X
-```
+Refer to [Installing TensorFlow for Go](https://www.tensorflow.org/install/install_go)
 
 ## Building the TensorFlow C library from source
 
diff --git a/tensorflow/go/graph.go b/tensorflow/go/graph.go
index e65619e80b54a7285b5e1cecafc55cfbe8a72117..46c600eab17c6c467d0b3a3312f848541f382e80 100644
--- a/tensorflow/go/graph.go
+++ b/tensorflow/go/graph.go
@@ -185,11 +185,11 @@ func (g *Graph) AddOperation(args OpSpec) (*Operation, error) {
 			return nil, fmt.Errorf("%v (memory will be leaked)", err)
 		}
 	}
-	op := &Operation{
-		c: C.TF_FinishOperation(cdesc, status.c),
-		g: g,
+	c := C.TF_FinishOperation(cdesc, status.c)
+	if err := status.Err(); err != nil {
+		return nil, err
 	}
-	return op, status.Err()
+	return &Operation{c, g}, nil
 }
 
 func setAttr(cdesc *C.TF_OperationDescription, status *status, name string, value interface{}) error {
diff --git a/tensorflow/go/lib.go b/tensorflow/go/lib.go
index 551cfa0b019a6f864555b3a0473be41d6ade3abe..2800eded60b75ecf3bcf09312f4a8bedbcbbae92 100644
--- a/tensorflow/go/lib.go
+++ b/tensorflow/go/lib.go
@@ -18,14 +18,4 @@ package tensorflow
 
 // #cgo LDFLAGS: -ltensorflow
 // #cgo CFLAGS: -I${SRCDIR}/../../
-//
-// // TODO(ashankar): Remove this after TensorFlow 1.1 has been released.
-// // Till then, the TensorFlow C API binary releases do not contain
-// // the TF_DeletePRunHandle symbol. We work around that by
-// // implementing the equivalent in session.cpp
-// extern void tfDeletePRunHandle(const char*);
 import "C"
-
-func deletePRunHandle(h *C.char) {
-	C.tfDeletePRunHandle(h)
-}
diff --git a/tensorflow/go/op/op_test.go b/tensorflow/go/op/op_test.go
index 65877dca96bc1c38ca70116867195450cf72e763..2451ba360699a7ac24f64209339e7b4f92ffb548 100644
--- a/tensorflow/go/op/op_test.go
+++ b/tensorflow/go/op/op_test.go
@@ -19,6 +19,7 @@ limitations under the License.
 package op
 
 import (
+	"strings"
 	"testing"
 
 	tf "github.com/tensorflow/tensorflow/tensorflow/go"
@@ -33,3 +34,27 @@ func TestPlaceholder(t *testing.T) {
 		t.Fatal(err)
 	}
 }
+
+func TestAddOperationFailure(t *testing.T) {
+	// Inspired from https://github.com/tensorflow/tensorflow/issues/9931
+	s := NewScope()
+
+	resize := ResizeArea(s, Placeholder(s, tf.Float), Const(s, []int64{80, 80}))
+	if err := s.Err(); err == nil {
+		t.Fatal("ResizeArea expects an int32 Tensor for size, should fail when an int64 is provided")
+	}
+	// And any use of resize should panic with an error message more informative than SIGSEGV
+	defer func() {
+		r := recover()
+		if r == nil {
+			return
+		}
+		s, ok := r.(string)
+		if ok && strings.Contains(s, "see Scope.Err() for details") {
+			return
+		}
+		t.Errorf("Expected panic string to Scope.Err(), found %T: %q", r, r)
+	}()
+	_ = resize.Shape()
+	t.Errorf("resize.Shape() should have paniced since the underlying Operation was not created")
+}
diff --git a/tensorflow/go/op/wrappers.go b/tensorflow/go/op/wrappers.go
index 4dc4b40e97937f91e5354c02798019937511f356..9c67c6cd4a0c93a5ec65304743249b571a54d09d 100644
--- a/tensorflow/go/op/wrappers.go
+++ b/tensorflow/go/op/wrappers.go
@@ -57,7 +57,7 @@ func makeOutputList(op *tf.Operation, start int, output string) ([]tf.Output, in
 // Requires `updates.shape = indices.shape + ref.shape[1:]`.
 //
 // <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
-// <img style="width:100%" src="../../images/ScatterAdd.png" alt>
+// <img style="width:100%" src="https://www.tensorflow.org/images/ScatterAdd.png" alt>
 // </div>
 //
 // Arguments:
@@ -195,6 +195,19 @@ func VarHandleOp(scope *Scope, dtype tf.DataType, shape tf.Shape, optional ...Va
 	return op.Output(0)
 }
 
+// FakeQuantWithMinMaxVarsPerChannelGradientAttr is an optional argument to FakeQuantWithMinMaxVarsPerChannelGradient.
+type FakeQuantWithMinMaxVarsPerChannelGradientAttr func(optionalAttr)
+
+// FakeQuantWithMinMaxVarsPerChannelGradientNumBits sets the optional num_bits attribute to value.
+//
+// value: The bitwidth of the quantization; between 2 and 8, inclusive.
+// If not specified, defaults to 8
+func FakeQuantWithMinMaxVarsPerChannelGradientNumBits(value int64) FakeQuantWithMinMaxVarsPerChannelGradientAttr {
+	return func(m optionalAttr) {
+		m["num_bits"] = value
+	}
+}
+
 // Compute gradients for a FakeQuantWithMinMaxVarsPerChannel operation.
 //
 // Arguments:
@@ -211,20 +224,36 @@ func VarHandleOp(scope *Scope, dtype tf.DataType, shape tf.Shape, optional ...Va
 //   `gradients * (inputs >= min && inputs <= max)`.Backpropagated gradients w.r.t. min parameter, shape `[d]`:
 // `sum_per_d(gradients * (inputs < min))`.Backpropagated gradients w.r.t. max parameter, shape `[d]`:
 // `sum_per_d(gradients * (inputs > max))`.
-func FakeQuantWithMinMaxVarsPerChannelGradient(scope *Scope, gradients tf.Output, inputs tf.Output, min tf.Output, max tf.Output) (backprops_wrt_input tf.Output, backprop_wrt_min tf.Output, backprop_wrt_max tf.Output) {
+func FakeQuantWithMinMaxVarsPerChannelGradient(scope *Scope, gradients tf.Output, inputs tf.Output, min tf.Output, max tf.Output, optional ...FakeQuantWithMinMaxVarsPerChannelGradientAttr) (backprops_wrt_input tf.Output, backprop_wrt_min tf.Output, backprop_wrt_max tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
 		Type: "FakeQuantWithMinMaxVarsPerChannelGradient",
 		Input: []tf.Input{
 			gradients, inputs, min, max,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0), op.Output(1), op.Output(2)
 }
 
+// FakeQuantWithMinMaxVarsAttr is an optional argument to FakeQuantWithMinMaxVars.
+type FakeQuantWithMinMaxVarsAttr func(optionalAttr)
+
+// FakeQuantWithMinMaxVarsNumBits sets the optional num_bits attribute to value.
+// If not specified, defaults to 8
+func FakeQuantWithMinMaxVarsNumBits(value int64) FakeQuantWithMinMaxVarsAttr {
+	return func(m optionalAttr) {
+		m["num_bits"] = value
+	}
+}
+
 // Fake-quantize the 'inputs' tensor of type float via global float scalars `min`
 //
 // and `max` to 'outputs' tensor of same shape as `inputs`.
@@ -232,17 +261,23 @@ func FakeQuantWithMinMaxVarsPerChannelGradient(scope *Scope, gradients tf.Output
 // [min; max] is the clamping range for the 'inputs' data.  Op divides this range
 // into 255 steps (total of 256 values), then replaces each 'inputs' value with the
 // closest of the quantized step values.
+// 'num_bits' is the bitwidth of the quantization; between 2 and 8, inclusive.
 //
 // This operation has a gradient and thus allows for training `min` and `max` values.
-func FakeQuantWithMinMaxVars(scope *Scope, inputs tf.Output, min tf.Output, max tf.Output) (outputs tf.Output) {
+func FakeQuantWithMinMaxVars(scope *Scope, inputs tf.Output, min tf.Output, max tf.Output, optional ...FakeQuantWithMinMaxVarsAttr) (outputs tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
 		Type: "FakeQuantWithMinMaxVars",
 		Input: []tf.Input{
 			inputs, min, max,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
@@ -653,14 +688,14 @@ func ExtractImagePatches(scope *Scope, images tf.Output, ksizes []int64, strides
 //
 // For example, given this input of shape `[1, 1, 1, 4]`, and a block size of 2:
 //
-// ```prettyprint
+// ```
 // x = [[[[1, 2, 3, 4]]]]
 //
 // ```
 //
 // This operation will output a tensor of shape `[1, 2, 2, 1]`:
 //
-// ```prettyprint
+// ```
 //    [[[[1], [2]],
 //      [[3], [4]]]]
 // ```
@@ -672,14 +707,14 @@ func ExtractImagePatches(scope *Scope, images tf.Output, ksizes []int64, strides
 //
 // For an input tensor with larger depth, here of shape `[1, 1, 1, 12]`, e.g.
 //
-// ```prettyprint
+// ```
 // x = [[[[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]]]]
 // ```
 //
 // This operation, for block size of 2, will return the following tensor of shape
 // `[1, 2, 2, 3]`
 //
-// ```prettyprint
+// ```
 //    [[[[1, 2, 3], [4, 5, 6]],
 //      [[7, 8, 9], [10, 11, 12]]]]
 //
@@ -687,7 +722,7 @@ func ExtractImagePatches(scope *Scope, images tf.Output, ksizes []int64, strides
 //
 // Similarly, for the following input of shape `[1 2 2 4]`, and a block size of 2:
 //
-// ```prettyprint
+// ```
 // x =  [[[[1, 2, 3, 4],
 //        [5, 6, 7, 8]],
 //       [[9, 10, 11, 12],
@@ -696,7 +731,7 @@ func ExtractImagePatches(scope *Scope, images tf.Output, ksizes []int64, strides
 //
 // the operator will return the following tensor of shape `[1 4 4 1]`:
 //
-// ```prettyprint
+// ```
 // x = [[ [1],   [2],  [5],  [6]],
 //      [ [3],   [4],  [7],  [8]],
 //      [ [9],  [10], [13],  [14]],
@@ -784,26 +819,26 @@ func DepthToSpace(scope *Scope, input tf.Output, block_size int64) (output tf.Ou
 // (1) For the following input of shape `[4, 1, 1, 1]`, `block_shape = [2, 2]`, and
 //     `crops = [[0, 0], [0, 0]]`:
 //
-// ```prettyprint
+// ```
 // [[[[1]]], [[[2]]], [[[3]]], [[[4]]]]
 // ```
 //
 // The output tensor has shape `[1, 2, 2, 1]` and value:
 //
-// ```prettyprint
+// ```
 // x = [[[[1], [2]], [[3], [4]]]]
 // ```
 //
 // (2) For the following input of shape `[4, 1, 1, 3]`, `block_shape = [2, 2]`, and
 //     `crops = [[0, 0], [0, 0]]`:
 //
-// ```prettyprint
+// ```
 // [[[1, 2, 3]], [[4, 5, 6]], [[7, 8, 9]], [[10, 11, 12]]]
 // ```
 //
 // The output tensor has shape `[1, 2, 2, 3]` and value:
 //
-// ```prettyprint
+// ```
 // x = [[[[1, 2, 3], [4, 5, 6]],
 //       [[7, 8, 9], [10, 11, 12]]]]
 // ```
@@ -811,7 +846,7 @@ func DepthToSpace(scope *Scope, input tf.Output, block_size int64) (output tf.Ou
 // (3) For the following input of shape `[4, 2, 2, 1]`, `block_shape = [2, 2]`, and
 //     `crops = [[0, 0], [0, 0]]`:
 //
-// ```prettyprint
+// ```
 // x = [[[[1], [3]], [[9], [11]]],
 //      [[[2], [4]], [[10], [12]]],
 //      [[[5], [7]], [[13], [15]]],
@@ -820,7 +855,7 @@ func DepthToSpace(scope *Scope, input tf.Output, block_size int64) (output tf.Ou
 //
 // The output tensor has shape `[1, 4, 4, 1]` and value:
 //
-// ```prettyprint
+// ```
 // x = [[[1],   [2],  [3],  [4]],
 //      [[5],   [6],  [7],  [8]],
 //      [[9],  [10], [11],  [12]],
@@ -830,7 +865,7 @@ func DepthToSpace(scope *Scope, input tf.Output, block_size int64) (output tf.Ou
 // (4) For the following input of shape `[8, 1, 3, 1]`, `block_shape = [2, 2]`, and
 //     `crops = [[0, 0], [2, 0]]`:
 //
-// ```prettyprint
+// ```
 // x = [[[[0], [1], [3]]], [[[0], [9], [11]]],
 //      [[[0], [2], [4]]], [[[0], [10], [12]]],
 //      [[[0], [5], [7]]], [[[0], [13], [15]]],
@@ -839,7 +874,7 @@ func DepthToSpace(scope *Scope, input tf.Output, block_size int64) (output tf.Ou
 //
 // The output tensor has shape `[2, 2, 4, 1]` and value:
 //
-// ```prettyprint
+// ```
 // x = [[[[1],   [2],  [3],  [4]],
 //       [[5],   [6],  [7],  [8]]],
 //      [[[9],  [10], [11],  [12]],
@@ -897,32 +932,32 @@ func BatchToSpaceND(scope *Scope, input tf.Output, block_shape tf.Output, crops
 //
 // (1) For the following input of shape `[1, 2, 2, 1]` and block_size of 2:
 //
-// ```prettyprint
+// ```
 // x = [[[[1], [2]], [[3], [4]]]]
 // ```
 //
 // The output tensor has shape `[4, 1, 1, 1]` and value:
 //
-// ```prettyprint
+// ```
 // [[[[1]]], [[[2]]], [[[3]]], [[[4]]]]
 // ```
 //
 // (2) For the following input of shape `[1, 2, 2, 3]` and block_size of 2:
 //
-// ```prettyprint
+// ```
 // x = [[[[1, 2, 3], [4, 5, 6]],
 //       [[7, 8, 9], [10, 11, 12]]]]
 // ```
 //
 // The output tensor has shape `[4, 1, 1, 3]` and value:
 //
-// ```prettyprint
+// ```
 // [[[1, 2, 3]], [[4, 5, 6]], [[7, 8, 9]], [[10, 11, 12]]]
 // ```
 //
 // (3) For the following input of shape `[1, 4, 4, 1]` and block_size of 2:
 //
-// ```prettyprint
+// ```
 // x = [[[[1],   [2],  [3],  [4]],
 //       [[5],   [6],  [7],  [8]],
 //       [[9],  [10], [11],  [12]],
@@ -931,7 +966,7 @@ func BatchToSpaceND(scope *Scope, input tf.Output, block_shape tf.Output, crops
 //
 // The output tensor has shape `[4, 2, 2, 1]` and value:
 //
-// ```prettyprint
+// ```
 // x = [[[[1], [3]], [[9], [11]]],
 //      [[[2], [4]], [[10], [12]]],
 //      [[[5], [7]], [[13], [15]]],
@@ -940,7 +975,7 @@ func BatchToSpaceND(scope *Scope, input tf.Output, block_shape tf.Output, crops
 //
 // (4) For the following input of shape `[2, 2, 4, 1]` and block_size of 2:
 //
-// ```prettyprint
+// ```
 // x = [[[[1],   [2],  [3],  [4]],
 //       [[5],   [6],  [7],  [8]]],
 //      [[[9],  [10], [11],  [12]],
@@ -949,7 +984,7 @@ func BatchToSpaceND(scope *Scope, input tf.Output, block_shape tf.Output, crops
 //
 // The output tensor has shape `[8, 1, 2, 1]` and value:
 //
-// ```prettyprint
+// ```
 // x = [[[[1], [3]]], [[[9], [11]]], [[[2], [4]]], [[[10], [12]]],
 //      [[[5], [7]]], [[[13], [15]]], [[[6], [8]]], [[[14], [16]]]]
 // ```
@@ -1142,34 +1177,34 @@ func QuantizeAndDequantizeV2(scope *Scope, input tf.Output, input_min tf.Output,
 // (1) For the following input of shape `[1, 2, 2, 1]`, `block_shape = [2, 2]`, and
 //     `paddings = [[0, 0], [0, 0]]`:
 //
-// ```prettyprint
+// ```
 // x = [[[[1], [2]], [[3], [4]]]]
 // ```
 //
 // The output tensor has shape `[4, 1, 1, 1]` and value:
 //
-// ```prettyprint
+// ```
 // [[[[1]]], [[[2]]], [[[3]]], [[[4]]]]
 // ```
 //
 // (2) For the following input of shape `[1, 2, 2, 3]`, `block_shape = [2, 2]`, and
 //     `paddings = [[0, 0], [0, 0]]`:
 //
-// ```prettyprint
+// ```
 // x = [[[[1, 2, 3], [4, 5, 6]],
 //       [[7, 8, 9], [10, 11, 12]]]]
 // ```
 //
 // The output tensor has shape `[4, 1, 1, 3]` and value:
 //
-// ```prettyprint
+// ```
 // [[[1, 2, 3]], [[4, 5, 6]], [[7, 8, 9]], [[10, 11, 12]]]
 // ```
 //
 // (3) For the following input of shape `[1, 4, 4, 1]`, `block_shape = [2, 2]`, and
 //     `paddings = [[0, 0], [0, 0]]`:
 //
-// ```prettyprint
+// ```
 // x = [[[[1],   [2],  [3],  [4]],
 //       [[5],   [6],  [7],  [8]],
 //       [[9],  [10], [11],  [12]],
@@ -1178,7 +1213,7 @@ func QuantizeAndDequantizeV2(scope *Scope, input tf.Output, input_min tf.Output,
 //
 // The output tensor has shape `[4, 2, 2, 1]` and value:
 //
-// ```prettyprint
+// ```
 // x = [[[[1], [3]], [[9], [11]]],
 //      [[[2], [4]], [[10], [12]]],
 //      [[[5], [7]], [[13], [15]]],
@@ -1188,7 +1223,7 @@ func QuantizeAndDequantizeV2(scope *Scope, input tf.Output, input_min tf.Output,
 // (4) For the following input of shape `[2, 2, 4, 1]`, block_shape = `[2, 2]`, and
 //     paddings = `[[0, 0], [2, 0]]`:
 //
-// ```prettyprint
+// ```
 // x = [[[[1],   [2],  [3],  [4]],
 //       [[5],   [6],  [7],  [8]]],
 //      [[[9],  [10], [11],  [12]],
@@ -1197,7 +1232,7 @@ func QuantizeAndDequantizeV2(scope *Scope, input tf.Output, input_min tf.Output,
 //
 // The output tensor has shape `[8, 1, 3, 1]` and value:
 //
-// ```prettyprint
+// ```
 // x = [[[[0], [1], [3]]], [[[0], [9], [11]]],
 //      [[[0], [2], [4]]], [[[0], [10], [12]]],
 //      [[[0], [5], [7]]], [[[0], [13], [15]]],
@@ -1220,65 +1255,6 @@ func SpaceToBatchND(scope *Scope, input tf.Output, block_shape tf.Output, paddin
 	return op.Output(0)
 }
 
-// ListDiffAttr is an optional argument to ListDiff.
-type ListDiffAttr func(optionalAttr)
-
-// ListDiffOutIdx sets the optional out_idx attribute to value.
-// If not specified, defaults to DT_INT32
-func ListDiffOutIdx(value tf.DataType) ListDiffAttr {
-	return func(m optionalAttr) {
-		m["out_idx"] = value
-	}
-}
-
-// Computes the difference between two lists of numbers or strings.
-//
-// Given a list `x` and a list `y`, this operation returns a list `out` that
-// represents all values that are in `x` but not in `y`. The returned list `out`
-// is sorted in the same order that the numbers appear in `x` (duplicates are
-// preserved). This operation also returns a list `idx` that represents the
-// position of each `out` element in `x`. In other words:
-//
-// `out[i] = x[idx[i]] for i in [0, 1, ..., len(out) - 1]`
-//
-// For example, given this input:
-//
-// ```prettyprint
-// x = [1, 2, 3, 4, 5, 6]
-// y = [1, 3, 5]
-// ```
-//
-// This operation would return:
-//
-// ```prettyprint
-// out ==> [2, 4, 6]
-// idx ==> [1, 3, 5]
-// ```
-//
-// Arguments:
-//	x: 1-D. Values to keep.
-//	y: 1-D. Values to remove.
-//
-// Returns 1-D. Values present in `x` but not in `y`.1-D. Positions of `x` values preserved in `out`.
-func ListDiff(scope *Scope, x tf.Output, y tf.Output, optional ...ListDiffAttr) (out tf.Output, idx tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "ListDiff",
-		Input: []tf.Input{
-			x, y,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1)
-}
-
 // SqueezeAttr is an optional argument to Squeeze.
 type SqueezeAttr func(optionalAttr)
 
@@ -1304,14 +1280,14 @@ func SqueezeSqueezeDims(value []int64) SqueezeAttr {
 //
 // For example:
 //
-// ```prettyprint
+// ```
 // # 't' is a tensor of shape [1, 2, 1, 3, 1, 1]
 // shape(squeeze(t)) ==> [2, 3]
 // ```
 //
 // Or, to remove specific size 1 dimensions:
 //
-// ```prettyprint
+// ```
 // # 't' is a tensor of shape [1, 2, 1, 3, 1, 1]
 // shape(squeeze(t, [2, 4])) ==> [1, 2, 3, 1]
 // ```
@@ -1426,7 +1402,7 @@ func Placeholder(scope *Scope, dtype tf.DataType, optional ...PlaceholderAttr) (
 //
 // For example:
 //
-// ```prettyprint
+// ```
 // # 't' is [[1, 2, 3], [4, 5, 6]].
 // # 'paddings' is [[1, 1]], [2, 2]].
 // # 'mode' is SYMMETRIC.
@@ -1510,7 +1486,7 @@ func BroadcastArgs(scope *Scope, s0 tf.Output, s1 tf.Output) (r0 tf.Output) {
 //
 // For example:
 //
-// ```prettyprint
+// ```
 // # 'input' tensor is [[True, False]
 // #                    [True, False]]
 // # 'input' has two true values, so output has two coordinates.
@@ -1727,7 +1703,7 @@ func ShapeN(scope *Scope, input []tf.Output, optional ...ShapeNAttr) (output []t
 //
 // For example:
 //
-// ```prettyprint
+// ```
 // # tensor 't' is [1, 2, 3, 4, 5, 6, 7, 8, 9]
 // # tensor 't' has shape [9]
 // reshape(t, [3, 3]) ==> [[1, 2, 3],
@@ -1911,20 +1887,17 @@ func Identity(scope *Scope, input tf.Output) (output tf.Output) {
 
 // Gather values or slices from `params` according to `indices`.
 //
-// `params` is a Tensor of rank `P` and `indices` is a Tensor of rank `Q`.
+// `indices` is an integer tensor containing indices into `params`.  The last
+// dimension of `indices` can be at most the rank of `params`:
 //
-// `indices` must be integer tensor, containing indices into `params`.
-// It must be shape `[d_0, ..., d_{Q-2}, K]` where `0 < K <= P`.
+//     indices.shape[-1] <= params.rank
 //
-// The innermost dimension of `indices` (with length `K`) corresponds to
-// indices into elements (if `K = P`) or slices (if `K < P`) along the `K`th
-// dimension of `params`.
+// The last dimension of `indices` corresponds to elements
+// (if `indices.shape[-1] = params.rank`) or slices
+// (if `indices.shape[-1] < params.rank`) along dimension `indices.shape[-1]`
+// of `params`.  The output tensor has shape
 //
-// Produces an output tensor with shape
-//
-// ```
-// [d_0, ..., d_{Q-2}, params.shape[K], ..., params.shape[P-1]].
-// ```
+//     indices.shape[:-1] + params.shape[indices.shape[-1]:]
 //
 // Some examples below.
 //
@@ -2004,11 +1977,11 @@ func Identity(scope *Scope, input tf.Output) (output tf.Output) {
 // ```
 //
 // Arguments:
-//	params: `P-D`.  The tensor from which to gather values.
-//	indices: `Q-D`.  Index tensor having shape `[d_0, ..., d_{Q-2}, K]`.
+//	params: The tensor from which to gather values.
+//	indices: Index tensor.
 //
-// Returns `(P+Q-K-1)-D`.  Values from `params` gathered from indices given by
-// `indices`.
+// Returns Values from `params` gathered from indices given by `indices`, with
+// shape `indices.shape[:-1] + params.shape[indices.shape[-1]:]`.
 func GatherNd(scope *Scope, params tf.Output, indices tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
@@ -2155,7 +2128,7 @@ func MatrixSetDiag(scope *Scope, input tf.Output, diagonal tf.Output) (output tf
 //
 // For example:
 //
-// ```prettyprint
+// ```
 // # 'input' is [[1, 0, 0, 0]
 //               [0, 2, 0, 0]
 //               [0, 0, 3, 0]
@@ -2351,21 +2324,21 @@ func Split(scope *Scope, split_dim tf.Output, value tf.Output, num_split int64)
 //
 // For example:
 //
-// ```prettyprint
+// ```
 // # 'x' is [2, 2, 7]
 // # 'y' is [2, 3, 7]
 // # 'z' is [2, 5, 7]
 // concat_offset(2, [x, y, z]) => [0, 0, 0], [0, 2, 0], [0, 5, 0]
 // ```
 //
+// This is typically used by gradient computations for a concat operation.
+//
 // Arguments:
 //	concat_dim: The dimension along which to concatenate.
 //	shape: The `N` int32 vectors representing shape of tensors being concatenated.
 //
 // Returns The `N` int32 vectors representing the starting offset
-//         of input tensors within the concatenated output.
-//
-// This is typically used by gradient computations for a concat operation.
+// of input tensors within the concatenated output.
 func ConcatOffset(scope *Scope, concat_dim tf.Output, shape []tf.Output) (offset []tf.Output) {
 	if scope.Err() != nil {
 		return
@@ -2420,7 +2393,7 @@ func Concat(scope *Scope, concat_dim tf.Output, values []tf.Output) (output tf.O
 //
 // For example:
 //
-// ```prettyprint
+// ```
 // # 'x' is [[1, 4]]
 // # 'y' is [[2, 5]]
 // # 'z' is [[3, 6]]
@@ -2555,7 +2528,7 @@ func UniqueOutIdx(value tf.DataType) UniqueAttr {
 //
 // For example:
 //
-// ```prettyprint
+// ```
 // # tensor 'x' is [1, 1, 2, 4, 4, 4, 7, 8, 8]
 // y, idx = unique(x)
 // y ==> [1, 2, 4, 7, 8]
@@ -2689,7 +2662,7 @@ func AllCandidateSamplerSeed2(value int64) AllCandidateSamplerAttr {
 //	true_classes: A batch_size * num_true matrix, in which each row contains the
 // IDs of the num_true target_classes in the corresponding original label.
 //	num_true: Number of true labels per context.
-//	num_sampled: Number of candidates to produce per batch.
+//	num_sampled: Number of candidates to produce.
 //	unique: If unique is true, we sample with rejection, so that all sampled
 // candidates in a batch are unique. This requires some approximation to
 // estimate the post-rejection sampling probabilities.
@@ -2847,7 +2820,7 @@ func FixedUnigramCandidateSamplerSeed2(value int64) FixedUnigramCandidateSampler
 //	true_classes: A batch_size * num_true matrix, in which each row contains the
 // IDs of the num_true target_classes in the corresponding original label.
 //	num_true: Number of true labels per context.
-//	num_sampled: Number of candidates to randomly sample per batch.
+//	num_sampled: Number of candidates to randomly sample.
 //	unique: If unique is true, we sample with rejection, so that all sampled
 // candidates in a batch are unique. This requires some approximation to
 // estimate the post-rejection sampling probabilities.
@@ -2920,7 +2893,7 @@ func UniformCandidateSamplerSeed2(value int64) UniformCandidateSamplerAttr {
 //	true_classes: A batch_size * num_true matrix, in which each row contains the
 // IDs of the num_true target_classes in the corresponding original label.
 //	num_true: Number of true labels per context.
-//	num_sampled: Number of candidates to randomly sample per batch.
+//	num_sampled: Number of candidates to randomly sample.
 //	unique: If unique is true, we sample with rejection, so that all sampled
 // candidates in a batch are unique. This requires some approximation to
 // estimate the post-rejection sampling probabilities.
@@ -2973,7 +2946,10 @@ func AbortExitWithoutError(value bool) AbortAttr {
 	}
 }
 
-// Raise a exception to abort the process when called. If exit_without_error is true, the process will exit normally, otherwise it will exit with a SIGABORT signal.
+// Raise a exception to abort the process when called.
+//
+// If exit_without_error is true, the process will exit normally,
+// otherwise it will exit with a SIGABORT signal.
 //
 // Returns nothing but an exception.
 //
@@ -3035,14 +3011,14 @@ func ControlTrigger(scope *Scope) (o *tf.Operation) {
 //
 // For example, given this input of shape `[1, 2, 2, 1]`, and block_size of 2:
 //
-// ```prettyprint
+// ```
 // x = [[[[1], [2]],
 //       [[3], [4]]]]
 // ```
 //
 // This operation will output a tensor of shape `[1, 1, 1, 4]`:
 //
-// ```prettyprint
+// ```
 // [[[[1, 2, 3, 4]]]]
 // ```
 //
@@ -3053,7 +3029,7 @@ func ControlTrigger(scope *Scope) (o *tf.Operation) {
 //
 // For an input tensor with larger depth, here of shape `[1, 2, 2, 3]`, e.g.
 //
-// ```prettyprint
+// ```
 // x = [[[[1, 2, 3], [4, 5, 6]],
 //       [[7, 8, 9], [10, 11, 12]]]]
 // ```
@@ -3061,13 +3037,13 @@ func ControlTrigger(scope *Scope) (o *tf.Operation) {
 // This operation, for block_size of 2, will return the following tensor of shape
 // `[1, 1, 1, 12]`
 //
-// ```prettyprint
+// ```
 // [[[[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]]]]
 // ```
 //
 // Similarly, for the following input of shape `[1 4 4 1]`, and a block size of 2:
 //
-// ```prettyprint
+// ```
 // x = [[[[1],   [2],  [5],  [6]],
 //       [[3],   [4],  [7],  [8]],
 //       [[9],  [10], [13],  [14]],
@@ -3076,7 +3052,7 @@ func ControlTrigger(scope *Scope) (o *tf.Operation) {
 //
 // the operator will return the following tensor of shape `[1 2 2 4]`:
 //
-// ```prettyprint
+// ```
 // x = [[[[1, 2, 3, 4],
 //        [5, 6, 7, 8]],
 //       [[9, 10, 11, 12],
@@ -3102,37 +3078,34 @@ func SpaceToDepth(scope *Scope, input tf.Output, block_size int64) (output tf.Ou
 	return op.Output(0)
 }
 
-// Creates a new tensor by applying sparse `updates` to individual
+// Scatter `updates` into a new (initially zero) tensor according to `indices`.
 //
-// values or slices within a zero tensor of the given `shape` tensor according to
+// Creates a new tensor by applying sparse `updates` to individual
+// values or slices within a zero tensor of the given `shape` according to
 // indices.  This operator is the inverse of the [tf.gather_nd](#gather_nd)
 // operator which extracts values or slices from a given tensor.
 //
-// TODO(simister): Add a link to Variable.__getitem__ documentation on slice
-// syntax.
+// **WARNING**: The order in which updates are applied is nondeterministic, so the
+// output will be nondeterministic if `indices` contains duplicates.
 //
-// `shape` is a `TensorShape` with rank `P` and `indices` is a `Tensor` of rank
-// `Q`.
+// `indices` is an integer tensor containing indices into a new tensor of shape
+// `shape`.  The last dimension of `indices` can be at most the rank of `shape`:
 //
-// `indices` must be integer tensor, containing indices into `shape`.
-// It must be shape `[d_0, ..., d_{Q-2}, K]` where `0 < K <= P`.
+//     indices.shape[-1] <= shape.rank
 //
-// The innermost dimension of `indices` (with length `K`) corresponds to
-// indices into elements (if `K = P`) or slices (if `K < P`) along the `K`th
-// dimension of `shape`.
+// The last dimension of `indices` corresponds to indices into elements
+// (if `indices.shape[-1] = shape.rank`) or slices
+// (if `indices.shape[-1] < shape.rank`) along dimension `indices.shape[-1]` of
+// `shape`.  `updates` is a tensor with shape
 //
-// `updates` is Tensor of rank `Q-1+P-K` with shape:
-//
-// ```
-// [d_0, ..., d_{Q-2}, shape[K], ..., shape[P-1]].
-// ```
+//     indices.shape[:-1] + shape[indices.shape[-1]:]
 //
 // The simplest form of scatter is to insert individual elements in a tensor by
 // index. For example, say we want to insert 4 scattered elements in a rank-1
 // tensor with 8 elements.
 //
 // <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
-// <img style="width:100%" src="../../images/ScatterNd1.png" alt>
+// <img style="width:100%" src="https://www.tensorflow.org/images/ScatterNd1.png" alt>
 // </div>
 //
 // In Python, this scatter operation would look like this:
@@ -3143,7 +3116,7 @@ func SpaceToDepth(scope *Scope, input tf.Output, block_size int64) (output tf.Ou
 //     shape = tf.constant([8])
 //     scatter = tf.scatter_nd(indices, updates, shape)
 //     with tf.Session() as sess:
-//       print sess.run(scatter)
+//       print(sess.run(scatter))
 // ```
 //
 // The resulting tensor would look like this:
@@ -3155,7 +3128,7 @@ func SpaceToDepth(scope *Scope, input tf.Output, block_size int64) (output tf.Ou
 // rank-3 tensor with two matrices of new values.
 //
 // <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
-// <img style="width:100%" src="../../images/ScatterNd2.png" alt>
+// <img style="width:100%" src="https://www.tensorflow.org/images/ScatterNd2.png" alt>
 // </div>
 //
 // In Python, this scatter operation would look like this:
@@ -3169,7 +3142,7 @@ func SpaceToDepth(scope *Scope, input tf.Output, block_size int64) (output tf.Ou
 //     shape = tf.constant([4, 4, 4])
 //     scatter = tf.scatter_nd(indices, updates, shape)
 //     with tf.Session() as sess:
-//       print sess.run(scatter)
+//       print(sess.run(scatter))
 // ```
 //
 // The resulting tensor would look like this:
@@ -3180,11 +3153,9 @@ func SpaceToDepth(scope *Scope, input tf.Output, block_size int64) (output tf.Ou
 //      [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]]]
 //
 // Arguments:
-//	indices: A Tensor. Must be one of the following types: int32, int64.
-// A tensor of indices into ref.
-//	updates: A Tensor. Must have the same type as tensor. A tensor of updated values
-// to store in ref.
-//	shape: A vector. The shape of the resulting tensor.
+//	indices: Index tensor.
+//	updates: Updates to scatter into output.
+//	shape: 1-D. The shape of the resulting tensor.
 //
 // Returns A new tensor with the given shape and updates applied according
 // to the indices.
@@ -3410,6 +3381,18 @@ func CTCLossCtcMergeRepeated(value bool) CTCLossAttr {
 	}
 }
 
+// CTCLossIgnoreLongerOutputsThanInputs sets the optional ignore_longer_outputs_than_inputs attribute to value.
+//
+// value: Scalar. If set to true, during CTC
+// calculation items have longer input sequences than output sequences
+// are ignored by returning zero-gradient for those items.
+// If not specified, defaults to false
+func CTCLossIgnoreLongerOutputsThanInputs(value bool) CTCLossAttr {
+	return func(m optionalAttr) {
+		m["ignore_longer_outputs_than_inputs"] = value
+	}
+}
+
 // Calculates the CTC Loss (log probability) for each batch entry.  Also calculates
 //
 // the gradient.  This class performs the softmax operation for you, so inputs
@@ -3468,10 +3451,10 @@ func StageSharedName(value string) StageAttr {
 	}
 }
 
-// Stage values similar to a lightweight Enqueue.  The basic functionality of this
+// Stage values similar to a lightweight Enqueue.
 //
-// Op is similar to a queue with many fewer capabilities and options.  This Op is
-// optimized for performance.
+// The basic functionality of this Op is similar to a queue with many
+// fewer capabilities and options.  This Op is optimized for performance.
 //
 // Arguments:
 //	values: a list of tensors
@@ -3514,11 +3497,20 @@ func FakeQuantWithMinMaxArgsMax(value float32) FakeQuantWithMinMaxArgsAttr {
 	}
 }
 
+// FakeQuantWithMinMaxArgsNumBits sets the optional num_bits attribute to value.
+// If not specified, defaults to 8
+func FakeQuantWithMinMaxArgsNumBits(value int64) FakeQuantWithMinMaxArgsAttr {
+	return func(m optionalAttr) {
+		m["num_bits"] = value
+	}
+}
+
 // Fake-quantize the 'inputs' tensor, type float to 'outputs' tensor of same type.
 //
 // Attributes [min; max] define the clamping range for the 'inputs' data.  Op
 // divides this range into 255 steps (total of 256 values), then replaces each
 // 'inputs' value with the closest of the quantized step values.
+// 'num_bits' is the bitwidth of the quantization; between 2 and 8, inclusive.
 //
 // Quantization is called fake since the output is still in floating point.
 func FakeQuantWithMinMaxArgs(scope *Scope, inputs tf.Output, optional ...FakeQuantWithMinMaxArgsAttr) (outputs tf.Output) {
@@ -3647,9 +3639,10 @@ func ResourceGather(scope *Scope, resource tf.Output, indices tf.Output, dtype t
 	return op.Output(0)
 }
 
-// Delete the TensorArray from its resource container.  This enables
+// Delete the TensorArray from its resource container.
 //
-// the user to close and release the resource in the middle of a step/run.
+// This enables the user to close and release the resource in the middle
+// of a step/run.
 //
 // Arguments:
 //	handle: The handle to a TensorArray (output of TensorArray or TensorArrayGrad).
@@ -3730,7 +3723,7 @@ func LearnedUnigramCandidateSamplerSeed2(value int64) LearnedUnigramCandidateSam
 //	true_classes: A batch_size * num_true matrix, in which each row contains the
 // IDs of the num_true target_classes in the corresponding original label.
 //	num_true: Number of true labels per context.
-//	num_sampled: Number of candidates to randomly sample per batch.
+//	num_sampled: Number of candidates to randomly sample.
 //	unique: If unique is true, we sample with rejection, so that all sampled
 // candidates in a batch are unique. This requires some approximation to
 // estimate the post-rejection sampling probabilities.
@@ -3816,7 +3809,7 @@ func TensorArraySplitV3(scope *Scope, handle tf.Output, value tf.Output, lengths
 //
 // For example:
 //
-// ```prettyprint
+// ```
 // # 'diagonal' is [1, 2, 3, 4]
 // tf.diag(diagonal) ==> [[1, 0, 0, 0]
 //                        [0, 2, 0, 0]
@@ -4051,7 +4044,7 @@ func TensorArrayConcatV2(scope *Scope, handle tf.Output, flow_in tf.Output, dtyp
 //
 // For example:
 //
-// ```prettyprint
+// ```
 // # 'input' is [[[1, 0, 0, 0]
 //                [0, 2, 0, 0]
 //                [0, 0, 3, 0]
@@ -4171,24 +4164,24 @@ func QueueDequeueUpToV2TimeoutMs(value int64) QueueDequeueUpToV2Attr {
 	}
 }
 
-// Dequeues n tuples of one or more tensors from the given queue.
+// Dequeues `n` tuples of one or more tensors from the given queue.
 //
 // This operation is not supported by all queues.  If a queue does not support
 // DequeueUpTo, then an Unimplemented error is returned.
 //
-// If the queue is closed and there are more than 0 but less than n elements
-// remaining, then instead of returning an OutOfRange error like
-// QueueDequeueMany, less than `n` elements are returned immediately.  If the queue
-// is closed and there are 0 elements left in the queue, then an OutOfRange
-// error is returned just like in QueueDequeueMany.  Otherwise the behavior
-// is identical to QueueDequeueMany:
+// If the queue is closed and there are more than 0 but less than `n`
+// elements remaining, then instead of returning an OutOfRange error like
+// QueueDequeueMany, less than `n` elements are returned immediately.  If
+// the queue is closed and there are 0 elements left in the queue, then
+// an OutOfRange error is returned just like in QueueDequeueMany.
+// Otherwise the behavior is identical to QueueDequeueMany:
 //
 // This operation concatenates queue-element component tensors along the
 // 0th dimension to make a single component tensor.  All of the components
 // in the dequeued tuple will have size n in the 0th dimension.
 //
-// This operation has k outputs, where k is the number of components in
-// the tuples stored in the given queue, and output i is the ith
+// This operation has `k` outputs, where `k` is the number of components in
+// the tuples stored in the given queue, and output `i` is the ith
 // component of the dequeued tuple.
 //
 // Arguments:
@@ -4256,20 +4249,20 @@ func QueueDequeueManyV2TimeoutMs(value int64) QueueDequeueManyV2Attr {
 	}
 }
 
-// Dequeues n tuples of one or more tensors from the given queue.
+// Dequeues `n` tuples of one or more tensors from the given queue.
 //
-// If the queue is closed and there are fewer than n elements, then an
+// If the queue is closed and there are fewer than `n` elements, then an
 // OutOfRange error is returned.
 //
 // This operation concatenates queue-element component tensors along the
 // 0th dimension to make a single component tensor.  All of the components
-// in the dequeued tuple will have size n in the 0th dimension.
+// in the dequeued tuple will have size `n` in the 0th dimension.
 //
-// This operation has k outputs, where k is the number of components in
-// the tuples stored in the given queue, and output i is the ith
+// This operation has `k` outputs, where `k` is the number of components in
+// the tuples stored in the given queue, and output `i` is the ith
 // component of the dequeued tuple.
 //
-// N.B. If the queue is empty, this operation will block until n elements
+// N.B. If the queue is empty, this operation will block until `n` elements
 // have been dequeued (or 'timeout_ms' elapses, if specified).
 //
 // Arguments:
@@ -4352,6 +4345,77 @@ func QueueEnqueueV2(scope *Scope, handle tf.Output, components []tf.Output, opti
 	return scope.AddOperation(opspec)
 }
 
+// ResourceStridedSliceAssignAttr is an optional argument to ResourceStridedSliceAssign.
+type ResourceStridedSliceAssignAttr func(optionalAttr)
+
+// ResourceStridedSliceAssignBeginMask sets the optional begin_mask attribute to value.
+// If not specified, defaults to 0
+func ResourceStridedSliceAssignBeginMask(value int64) ResourceStridedSliceAssignAttr {
+	return func(m optionalAttr) {
+		m["begin_mask"] = value
+	}
+}
+
+// ResourceStridedSliceAssignEndMask sets the optional end_mask attribute to value.
+// If not specified, defaults to 0
+func ResourceStridedSliceAssignEndMask(value int64) ResourceStridedSliceAssignAttr {
+	return func(m optionalAttr) {
+		m["end_mask"] = value
+	}
+}
+
+// ResourceStridedSliceAssignEllipsisMask sets the optional ellipsis_mask attribute to value.
+// If not specified, defaults to 0
+func ResourceStridedSliceAssignEllipsisMask(value int64) ResourceStridedSliceAssignAttr {
+	return func(m optionalAttr) {
+		m["ellipsis_mask"] = value
+	}
+}
+
+// ResourceStridedSliceAssignNewAxisMask sets the optional new_axis_mask attribute to value.
+// If not specified, defaults to 0
+func ResourceStridedSliceAssignNewAxisMask(value int64) ResourceStridedSliceAssignAttr {
+	return func(m optionalAttr) {
+		m["new_axis_mask"] = value
+	}
+}
+
+// ResourceStridedSliceAssignShrinkAxisMask sets the optional shrink_axis_mask attribute to value.
+// If not specified, defaults to 0
+func ResourceStridedSliceAssignShrinkAxisMask(value int64) ResourceStridedSliceAssignAttr {
+	return func(m optionalAttr) {
+		m["shrink_axis_mask"] = value
+	}
+}
+
+// Assign `value` to the sliced l-value reference of `ref`.
+//
+// The values of `value` are assigned to the positions in the variable
+// `ref` that are selected by the slice parameters. The slice parameters
+// `begin, `end`, `strides`, etc. work exactly as in `StridedSlice`.
+//
+// NOTE this op currently does not support broadcasting and so `value`'s
+// shape must be exactly the shape produced by the slice of `ref`.
+//
+// Returns the created operation.
+func ResourceStridedSliceAssign(scope *Scope, ref tf.Output, begin tf.Output, end tf.Output, strides tf.Output, value tf.Output, optional ...ResourceStridedSliceAssignAttr) (o *tf.Operation) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "ResourceStridedSliceAssign",
+		Input: []tf.Input{
+			ref, begin, end, strides, value,
+		},
+		Attrs: attrs,
+	}
+	return scope.AddOperation(opspec)
+}
+
 // UnstageAttr is an optional argument to Unstage.
 type UnstageAttr func(optionalAttr)
 
@@ -4371,10 +4435,10 @@ func UnstageSharedName(value string) UnstageAttr {
 	}
 }
 
-// Op is similar to a lightweight Dequeue.  The basic funtionality is similar to
+// Op is similar to a lightweight Dequeue.
 //
-// dequeue with many fewer capabilities and options.  This Op is optimized for
-// performance.
+// The basic funtionality is similar to dequeue with many fewer
+// capabilities and options.  This Op is optimized for performance.
 func Unstage(scope *Scope, dtypes []tf.DataType, optional ...UnstageAttr) (values []tf.Output) {
 	if scope.Err() != nil {
 		return
@@ -4752,7 +4816,7 @@ func StridedSliceShrinkAxisMask(value int64) StridedSliceAttr {
 // particular,
 // `foo[1, 2:4, None, ..., :-3:-1, :]` will be encoded as
 //
-// ```prettyprint
+// ```
 // begin = [1, 2, x, x, 0, x] # x denotes don't care (usually 0)
 // end = [2, 4, x, x, -3, x]
 // strides = [1, 1, x, x, -1, 1]
@@ -4903,8 +4967,26 @@ func TensorArrayGatherV2(scope *Scope, handle tf.Output, indices tf.Output, flow
 //               [51, 52], [61, 62]]
 // ```
 //
+// This method can be used to merge partitions created by `dynamic_partition`
+// as illustrated on the following example:
+//
+// ```python
+//     # Apply function (increments x_i) on elements for which a certain condition
+//     # apply (x_i != -1 in this example).
+//     x=tf.constant([0.1, -1., 5.2, 4.3, -1., 7.4])
+//     condition_mask=tf.not_equal(x,tf.constant(-1.))
+//     partitioned_data = tf.dynamic_partition(
+//         x, tf.cast(condition_mask, tf.int32) , 2)
+//     partitioned_data[1] = partitioned_data[1] + 1.0
+//     condition_indices = tf.dynamic_partition(
+//         tf.range(tf.shape(x)[0]), tf.cast(condition_mask, tf.int32) , 2)
+//     x = tf.dynamic_stitch(condition_indices, partitioned_data)
+//     # Here x=[1.1, -1., 6.2, 5.3, -1, 8.4], the -1. values remain
+//     # unchanged.
+// ```
+//
 // <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
-// <img style="width:100%" src="../../images/DynamicStitch.png" alt>
+// <img style="width:100%" src="https://www.tensorflow.org/images/DynamicStitch.png" alt>
 // </div>
 func DynamicStitch(scope *Scope, indices []tf.Output, data []tf.Output) (merged tf.Output) {
 	if scope.Err() != nil {
@@ -5118,177 +5200,139 @@ func ExtractGlimpse(scope *Scope, input tf.Output, size tf.Output, offsets tf.Ou
 	return op.Output(0)
 }
 
-// Computes softmax cross entropy cost and gradients to backpropagate.
+// Draw bounding boxes on a batch of images.
 //
-// Unlike `SoftmaxCrossEntropyWithLogits`, this operation does not accept
-// a matrix of label probabilities, but rather a single label per row
-// of features.  This label is considered to have probability 1.0 for the
-// given row.
+// Outputs a copy of `images` but draws on top of the pixels zero or more bounding
+// boxes specified by the locations in `boxes`. The coordinates of the each
+// bounding box in `boxes` are encoded as `[y_min, x_min, y_max, x_max]`. The
+// bounding box coordinates are floats in `[0.0, 1.0]` relative to the width and
+// height of the underlying image.
 //
-// Inputs are the logits, not probabilities.
+// For example, if an image is 100 x 200 pixels and the bounding box is
+// `[0.1, 0.2, 0.5, 0.9]`, the bottom-left and upper-right coordinates of the
+// bounding box will be `(10, 40)` to `(50, 180)`.
+//
+// Parts of the bounding box may fall outside the image.
 //
 // Arguments:
-//	features: batch_size x num_classes matrix
-//	labels: batch_size vector with values in [0, num_classes).
-// This is the label for the given minibatch entry.
+//	images: 4-D with shape `[batch, height, width, depth]`. A batch of images.
+//	boxes: 3-D with shape `[batch, num_bounding_boxes, 4]` containing bounding
+// boxes.
 //
-// Returns Per example loss (batch_size vector).backpropagated gradients (batch_size x num_classes matrix).
-func SparseSoftmaxCrossEntropyWithLogits(scope *Scope, features tf.Output, labels tf.Output) (loss tf.Output, backprop tf.Output) {
+// Returns 4-D with the same shape as `images`. The batch of input images with
+// bounding boxes drawn on the images.
+func DrawBoundingBoxes(scope *Scope, images tf.Output, boxes tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "SparseSoftmaxCrossEntropyWithLogits",
+		Type: "DrawBoundingBoxes",
 		Input: []tf.Input{
-			features, labels,
+			images, boxes,
 		},
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1)
-}
-
-// TensorSummaryAttr is an optional argument to TensorSummary.
-type TensorSummaryAttr func(optionalAttr)
-
-// TensorSummaryDescription sets the optional description attribute to value.
-//
-// value: A json-encoded SummaryDescription proto.
-// If not specified, defaults to ""
-func TensorSummaryDescription(value string) TensorSummaryAttr {
-	return func(m optionalAttr) {
-		m["description"] = value
-	}
+	return op.Output(0)
 }
 
-// TensorSummaryLabels sets the optional labels attribute to value.
+// Convert one or more images from HSV to RGB.
 //
-// value: An unused list of strings.
-// If not specified, defaults to <>
-func TensorSummaryLabels(value []string) TensorSummaryAttr {
-	return func(m optionalAttr) {
-		m["labels"] = value
-	}
-}
-
-// TensorSummaryDisplayName sets the optional display_name attribute to value.
+// Outputs a tensor of the same shape as the `images` tensor, containing the RGB
+// value of the pixels. The output is only well defined if the value in `images`
+// are in `[0,1]`.
 //
-// value: An unused string.
-// If not specified, defaults to ""
-func TensorSummaryDisplayName(value string) TensorSummaryAttr {
-	return func(m optionalAttr) {
-		m["display_name"] = value
-	}
-}
-
-// Outputs a `Summary` protocol buffer with a tensor.
+// See `rgb_to_hsv` for a description of the HSV encoding.
 //
 // Arguments:
-//	tensor: A tensor to serialize.
-func TensorSummary(scope *Scope, tensor tf.Output, optional ...TensorSummaryAttr) (summary tf.Output) {
+//	images: 1-D or higher rank. HSV data to convert. Last dimension must be size 3.
+//
+// Returns `images` converted to RGB.
+func HSVToRGB(scope *Scope, images tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "TensorSummary",
+		Type: "HSVToRGB",
 		Input: []tf.Input{
-			tensor,
+			images,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Computes softplus gradients for a softplus operation.
+// Decode the first frame of a GIF-encoded image to a uint8 tensor.
+//
+// GIF with frame or transparency compression are not supported
+// convert animated GIF from compressed to uncompressed by:
+//
+//     convert $src.gif -coalesce $dst.gif
+//
+// This op also supports decoding JPEGs and PNGs, though it is cleaner to use
+// `tf.image.decode_image`.
 //
 // Arguments:
-//	gradients: The backpropagated gradients to the corresponding softplus operation.
-//	features: The features passed as input to the corresponding softplus operation.
+//	contents: 0-D.  The GIF-encoded image.
 //
-// Returns The gradients: `gradients / (1 + exp(-features))`.
-func SoftplusGrad(scope *Scope, gradients tf.Output, features tf.Output) (backprops tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "SoftplusGrad",
-		Input: []tf.Input{
-			gradients, features,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Computes softplus: `log(exp(features) + 1)`.
-func Softplus(scope *Scope, features tf.Output) (activations tf.Output) {
+// Returns 4-D with shape `[num_frames, height, width, 3]`. RGB order
+func DecodeGif(scope *Scope, contents tf.Output) (image tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Softplus",
+		Type: "DecodeGif",
 		Input: []tf.Input{
-			features,
+			contents,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// BatchMatMulAttr is an optional argument to BatchMatMul.
-type BatchMatMulAttr func(optionalAttr)
+// DecodePngAttr is an optional argument to DecodePng.
+type DecodePngAttr func(optionalAttr)
 
-// BatchMatMulAdjX sets the optional adj_x attribute to value.
+// DecodePngChannels sets the optional channels attribute to value.
 //
-// value: If `True`, adjoint the slices of `x`. Defaults to `False`.
-// If not specified, defaults to false
-func BatchMatMulAdjX(value bool) BatchMatMulAttr {
+// value: Number of color channels for the decoded image.
+// If not specified, defaults to 0
+func DecodePngChannels(value int64) DecodePngAttr {
 	return func(m optionalAttr) {
-		m["adj_x"] = value
+		m["channels"] = value
 	}
 }
 
-// BatchMatMulAdjY sets the optional adj_y attribute to value.
-//
-// value: If `True`, adjoint the slices of `y`. Defaults to `False`.
-// If not specified, defaults to false
-func BatchMatMulAdjY(value bool) BatchMatMulAttr {
+// DecodePngDtype sets the optional dtype attribute to value.
+// If not specified, defaults to DT_UINT8
+func DecodePngDtype(value tf.DataType) DecodePngAttr {
 	return func(m optionalAttr) {
-		m["adj_y"] = value
+		m["dtype"] = value
 	}
 }
 
-// Multiplies slices of two tensors in batches.
-//
-// Multiplies all slices of `Tensor` `x` and `y` (each slice can be
-// viewed as an element of a batch), and arranges the individual results
-// in a single output tensor of the same batch size. Each of the
-// individual slices can optionally be adjointed (to adjoint a matrix
-// means to transpose and conjugate it) before multiplication by setting
-// the `adj_x` or `adj_y` flag to `True`, which are by default `False`.
+// Decode a PNG-encoded image to a uint8 or uint16 tensor.
 //
-// The input tensors `x` and `y` are 2-D or higher with shape `[..., r_x, c_x]`
-// and `[..., r_y, c_y]`.
+// The attr `channels` indicates the desired number of color channels for the
+// decoded image.
 //
-// The output tensor is 2-D or higher with shape `[..., r_o, c_o]`, where:
+// Accepted values are:
 //
-//     r_o = c_x if adj_x else r_x
-//     c_o = r_y if adj_y else c_y
+// *   0: Use the number of channels in the PNG-encoded image.
+// *   1: output a grayscale image.
+// *   3: output an RGB image.
+// *   4: output an RGBA image.
 //
-// It is computed as:
+// If needed, the PNG-encoded image is transformed to match the requested number
+// of color channels.
 //
-//     output[..., :, :] = matrix(x[..., :, :]) * matrix(y[..., :, :])
+// This op also supports decoding JPEGs and non-animated GIFs since the interface
+// is the same, though it is cleaner to use `tf.image.decode_image`.
 //
 // Arguments:
-//	x: 2-D or higher with shape `[..., r_x, c_x]`.
-//	y: 2-D or higher with shape `[..., r_y, c_y]`.
+//	contents: 0-D.  The PNG-encoded image.
 //
-// Returns 3-D or higher with shape `[..., r_o, c_o]`
-func BatchMatMul(scope *Scope, x tf.Output, y tf.Output, optional ...BatchMatMulAttr) (output tf.Output) {
+// Returns 3-D with shape `[height, width, channels]`.
+func DecodePng(scope *Scope, contents tf.Output, optional ...DecodePngAttr) (image tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -5297,9 +5341,9 @@ func BatchMatMul(scope *Scope, x tf.Output, y tf.Output, optional ...BatchMatMul
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "BatchMatMul",
+		Type: "DecodePng",
 		Input: []tf.Input{
-			x, y,
+			contents,
 		},
 		Attrs: attrs,
 	}
@@ -5307,172 +5351,144 @@ func BatchMatMul(scope *Scope, x tf.Output, y tf.Output, optional ...BatchMatMul
 	return op.Output(0)
 }
 
-// SparseTensorDenseMatMulAttr is an optional argument to SparseTensorDenseMatMul.
-type SparseTensorDenseMatMulAttr func(optionalAttr)
-
-// SparseTensorDenseMatMulAdjointA sets the optional adjoint_a attribute to value.
-//
-// value: Use the adjoint of A in the matrix multiply.  If A is complex, this
-// is transpose(conj(A)).  Otherwise it's transpose(A).
-// If not specified, defaults to false
-func SparseTensorDenseMatMulAdjointA(value bool) SparseTensorDenseMatMulAttr {
-	return func(m optionalAttr) {
-		m["adjoint_a"] = value
-	}
-}
-
-// SparseTensorDenseMatMulAdjointB sets the optional adjoint_b attribute to value.
+// Adjust the contrast of one or more images.
 //
-// value: Use the adjoint of B in the matrix multiply.  If B is complex, this
-// is transpose(conj(B)).  Otherwise it's transpose(B).
-// If not specified, defaults to false
-func SparseTensorDenseMatMulAdjointB(value bool) SparseTensorDenseMatMulAttr {
-	return func(m optionalAttr) {
-		m["adjoint_b"] = value
-	}
-}
-
-// Multiply SparseTensor (of rank 2) "A" by dense matrix "B".
+// `images` is a tensor of at least 3 dimensions.  The last 3 dimensions are
+// interpreted as `[height, width, channels]`.  The other dimensions only
+// represent a collection of images, such as `[batch, height, width, channels].`
 //
-// No validity checking is performed on the indices of A.  However, the following
-// input format is recommended for optimal behavior:
+// Contrast is adjusted independently for each channel of each image.
 //
-// if adjoint_a == false:
-//   A should be sorted in lexicographically increasing order.  Use SparseReorder
-//   if you're not sure.
-// if adjoint_a == true:
-//   A should be sorted in order of increasing dimension 1 (i.e., "column major"
-//   order instead of "row major" order).
+// For each channel, the Op first computes the mean of the image pixels in the
+// channel and then adjusts each component of each pixel to
+// `(x - mean) * contrast_factor + mean`.
 //
 // Arguments:
-//	a_indices: 2-D.  The `indices` of the `SparseTensor`, size `[nnz, 2]` Matrix.
-//	a_values: 1-D.  The `values` of the `SparseTensor`, size `[nnz]` Vector.
-//	a_shape: 1-D.  The `shape` of the `SparseTensor`, size `[2]` Vector.
-//	b: 2-D.  A dense Matrix.
-func SparseTensorDenseMatMul(scope *Scope, a_indices tf.Output, a_values tf.Output, a_shape tf.Output, b tf.Output, optional ...SparseTensorDenseMatMulAttr) (product tf.Output) {
+//	images: Images to adjust.  At least 3-D.
+//	contrast_factor: A float multiplier for adjusting contrast.
+//
+// Returns The contrast-adjusted image or images.
+func AdjustContrastv2(scope *Scope, images tf.Output, contrast_factor tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "SparseTensorDenseMatMul",
+		Type: "AdjustContrastv2",
 		Input: []tf.Input{
-			a_indices, a_values, a_shape, b,
+			images, contrast_factor,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Computes rectified linear 6: `min(max(features, 0), 6)`.
-func Relu6(scope *Scope, features tf.Output) (activations tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "Relu6",
-		Input: []tf.Input{
-			features,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+// DecodeJpegAttr is an optional argument to DecodeJpeg.
+type DecodeJpegAttr func(optionalAttr)
+
+// DecodeJpegChannels sets the optional channels attribute to value.
+//
+// value: Number of color channels for the decoded image.
+// If not specified, defaults to 0
+func DecodeJpegChannels(value int64) DecodeJpegAttr {
+	return func(m optionalAttr) {
+		m["channels"] = value
+	}
 }
 
-// AudioSpectrogramAttr is an optional argument to AudioSpectrogram.
-type AudioSpectrogramAttr func(optionalAttr)
+// DecodeJpegRatio sets the optional ratio attribute to value.
+//
+// value: Downscaling ratio.
+// If not specified, defaults to 1
+func DecodeJpegRatio(value int64) DecodeJpegAttr {
+	return func(m optionalAttr) {
+		m["ratio"] = value
+	}
+}
 
-// AudioSpectrogramMagnitudeSquared sets the optional magnitude_squared attribute to value.
+// DecodeJpegFancyUpscaling sets the optional fancy_upscaling attribute to value.
 //
-// value: Whether to return the squared magnitude or just the
-// magnitude. Using squared magnitude can avoid extra calculations.
+// value: If true use a slower but nicer upscaling of the
+// chroma planes (yuv420/422 only).
+// If not specified, defaults to true
+func DecodeJpegFancyUpscaling(value bool) DecodeJpegAttr {
+	return func(m optionalAttr) {
+		m["fancy_upscaling"] = value
+	}
+}
+
+// DecodeJpegTryRecoverTruncated sets the optional try_recover_truncated attribute to value.
+//
+// value: If true try to recover an image from truncated input.
 // If not specified, defaults to false
-func AudioSpectrogramMagnitudeSquared(value bool) AudioSpectrogramAttr {
+func DecodeJpegTryRecoverTruncated(value bool) DecodeJpegAttr {
 	return func(m optionalAttr) {
-		m["magnitude_squared"] = value
+		m["try_recover_truncated"] = value
 	}
 }
 
-// Produces a visualization of audio data over time.
+// DecodeJpegAcceptableFraction sets the optional acceptable_fraction attribute to value.
 //
-// Spectrograms are a standard way of representing audio information as a series of
-// slices of frequency information, one slice for each window of time. By joining
-// these together into a sequence, they form a distinctive fingerprint of the sound
-// over time.
+// value: The minimum required fraction of lines before a truncated
+// input is accepted.
+// If not specified, defaults to 1
+func DecodeJpegAcceptableFraction(value float32) DecodeJpegAttr {
+	return func(m optionalAttr) {
+		m["acceptable_fraction"] = value
+	}
+}
+
+// DecodeJpegDctMethod sets the optional dct_method attribute to value.
 //
-// This op expects to receive audio data as an input, stored as floats in the range
-// -1 to 1, together with a window width in samples, and a stride specifying how
-// far to move the window between slices. From this it generates a three
-// dimensional output. The lowest dimension has an amplitude value for each
-// frequency during that time slice. The next dimension is time, with successive
-// frequency slices. The final dimension is for the channels in the input, so a
-// stereo audio input would have two here for example.
+// value: string specifying a hint about the algorithm used for
+// decompression.  Defaults to "" which maps to a system-specific
+// default.  Currently valid values are ["INTEGER_FAST",
+// "INTEGER_ACCURATE"].  The hint may be ignored (e.g., the internal
+// jpeg library changes to a version that does not have that specific
+// option.)
+// If not specified, defaults to ""
+func DecodeJpegDctMethod(value string) DecodeJpegAttr {
+	return func(m optionalAttr) {
+		m["dct_method"] = value
+	}
+}
+
+// Decode a JPEG-encoded image to a uint8 tensor.
 //
-// This means the layout when converted and saved as an image is rotated 90 degrees
-// clockwise from a typical spectrogram. Time is descending down the Y axis, and
-// the frequency decreases from left to right.
+// The attr `channels` indicates the desired number of color channels for the
+// decoded image.
 //
-// Each value in the result represents the square root of the sum of the real and
-// imaginary parts of an FFT on the current window of samples. In this way, the
-// lowest dimension represents the power of each frequency in the current window,
-// and adjacent windows are concatenated in the next dimension.
+// Accepted values are:
 //
-// To get a more intuitive and visual look at what this operation does, you can run
-// tensorflow/examples/wav_to_spectrogram to read in an audio file and save out the
-// resulting spectrogram as a PNG image.
+// *   0: Use the number of channels in the JPEG-encoded image.
+// *   1: output a grayscale image.
+// *   3: output an RGB image.
+//
+// If needed, the JPEG-encoded image is transformed to match the requested number
+// of color channels.
+//
+// The attr `ratio` allows downscaling the image by an integer factor during
+// decoding.  Allowed values are: 1, 2, 4, and 8.  This is much faster than
+// downscaling the image later.
+//
+// This op also supports decoding PNGs and non-animated GIFs since the interface is
+// the same, though it is cleaner to use `tf.image.decode_image`.
 //
 // Arguments:
-//	input: Float representation of audio data.
-//	window_size: How wide the input window is in samples. For the highest efficiency
-// this should be a power of two, but other values are accepted.
-//	stride: How widely apart the center of adjacent sample windows should be.
+//	contents: 0-D.  The JPEG-encoded image.
 //
-// Returns 3D representation of the audio frequencies as an image.
-func AudioSpectrogram(scope *Scope, input tf.Output, window_size int64, stride int64, optional ...AudioSpectrogramAttr) (spectrogram tf.Output) {
+// Returns 3-D with shape `[height, width, channels]`..
+func DecodeJpeg(scope *Scope, contents tf.Output, optional ...DecodeJpegAttr) (image tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"window_size": window_size, "stride": stride}
+	attrs := map[string]interface{}{}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "AudioSpectrogram",
-		Input: []tf.Input{
-			input,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Computes the gradient of morphological 2-D dilation with respect to the input.
-//
-// Arguments:
-//	input: 4-D with shape `[batch, in_height, in_width, depth]`.
-//	filter: 3-D with shape `[filter_height, filter_width, depth]`.
-//	out_backprop: 4-D with shape `[batch, out_height, out_width, depth]`.
-//	strides: 1-D of length 4. The stride of the sliding window for each dimension of
-// the input tensor. Must be: `[1, stride_height, stride_width, 1]`.
-//	rates: 1-D of length 4. The input stride for atrous morphological dilation.
-// Must be: `[1, rate_height, rate_width, 1]`.
-//	padding: The type of padding algorithm to use.
-//
-// Returns 4-D with shape `[batch, in_height, in_width, depth]`.
-func Dilation2DBackpropInput(scope *Scope, input tf.Output, filter tf.Output, out_backprop tf.Output, strides []int64, rates []int64, padding string) (in_backprop tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"strides": strides, "rates": rates, "padding": padding}
-	opspec := tf.OpSpec{
-		Type: "Dilation2DBackpropInput",
+		Type: "DecodeJpeg",
 		Input: []tf.Input{
-			input, filter, out_backprop,
+			contents,
 		},
 		Attrs: attrs,
 	}
@@ -5480,58 +5496,31 @@ func Dilation2DBackpropInput(scope *Scope, input tf.Output, filter tf.Output, ou
 	return op.Output(0)
 }
 
-// FusedBatchNormGradAttr is an optional argument to FusedBatchNormGrad.
-type FusedBatchNormGradAttr func(optionalAttr)
-
-// FusedBatchNormGradEpsilon sets the optional epsilon attribute to value.
-//
-// value: A small float number added to the variance of x.
-// If not specified, defaults to 0.0001
-func FusedBatchNormGradEpsilon(value float32) FusedBatchNormGradAttr {
-	return func(m optionalAttr) {
-		m["epsilon"] = value
-	}
-}
-
-// FusedBatchNormGradDataFormat sets the optional data_format attribute to value.
-//
-// value: The data format for y_backprop, x, x_backprop.
-// Either "NHWC" (default) or "NCHW".
-// If not specified, defaults to "NHWC"
-func FusedBatchNormGradDataFormat(value string) FusedBatchNormGradAttr {
-	return func(m optionalAttr) {
-		m["data_format"] = value
-	}
-}
+// ResizeNearestNeighborGradAttr is an optional argument to ResizeNearestNeighborGrad.
+type ResizeNearestNeighborGradAttr func(optionalAttr)
 
-// FusedBatchNormGradIsTraining sets the optional is_training attribute to value.
+// ResizeNearestNeighborGradAlignCorners sets the optional align_corners attribute to value.
 //
-// value: A bool value to indicate the operation is for training (default)
-// or inference.
-// If not specified, defaults to true
-func FusedBatchNormGradIsTraining(value bool) FusedBatchNormGradAttr {
+// value: If true, rescale grads by (orig_height - 1) / (height - 1), which
+// exactly aligns the 4 corners of grads and original_image. If false, rescale by
+// orig_height / height. Treat similarly the width dimension.
+// If not specified, defaults to false
+func ResizeNearestNeighborGradAlignCorners(value bool) ResizeNearestNeighborGradAttr {
 	return func(m optionalAttr) {
-		m["is_training"] = value
+		m["align_corners"] = value
 	}
 }
 
-// Gradient for batch normalization.
-//
-// Note that the size of 4D Tensors are defined by either "NHWC" or "NCHW".
-// The size of 1D Tensors matches the dimension C of the 4D Tensors.
+// Computes the gradient of nearest neighbor interpolation.
 //
 // Arguments:
-//	y_backprop: A 4D Tensor for the gradient with respect to y.
-//	x: A 4D Tensor for input data.
-//	scale: A 1D Tensor for scaling factor, to scale the normalized x.
-//	reserve_space_1: A 1D Tensor for the computed batch mean, to be reused
-// in the gradient computation.
-//	reserve_space_2: A 1D Tensor for the computed batch variance (inverted variance
-// in the cuDNN case), to be used in the gradient computation.
+//	grads: 4-D with shape `[batch, height, width, channels]`.
+//	size: = A 1-D int32 Tensor of 2 elements: `orig_height, orig_width`. The
+// original input size.
 //
-// Returns A 4D Tensor for the gradient with respect to x.A 1D Tensor for the gradient with respect to scale.A 1D Tensor for the gradient with respect to offset.Unused placeholder to match the mean input in FusedBatchNorm.Unused placeholder to match the variance input
-// in FusedBatchNorm.
-func FusedBatchNormGrad(scope *Scope, y_backprop tf.Output, x tf.Output, scale tf.Output, reserve_space_1 tf.Output, reserve_space_2 tf.Output, optional ...FusedBatchNormGradAttr) (x_backprop tf.Output, scale_backprop tf.Output, offset_backprop tf.Output, reserve_space_3 tf.Output, reserve_space_4 tf.Output) {
+// Returns 4-D with shape `[batch, orig_height, orig_width, channels]`. Gradients
+// with respect to the input image.
+func ResizeNearestNeighborGrad(scope *Scope, grads tf.Output, size tf.Output, optional ...ResizeNearestNeighborGradAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -5540,58 +5529,52 @@ func FusedBatchNormGrad(scope *Scope, y_backprop tf.Output, x tf.Output, scale t
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "FusedBatchNormGrad",
+		Type: "ResizeNearestNeighborGrad",
 		Input: []tf.Input{
-			y_backprop, x, scale, reserve_space_1, reserve_space_2,
+			grads, size,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2), op.Output(3), op.Output(4)
+	return op.Output(0)
 }
 
-// MaxPool3DGradGradAttr is an optional argument to MaxPool3DGradGrad.
-type MaxPool3DGradGradAttr func(optionalAttr)
+// ResizeNearestNeighborAttr is an optional argument to ResizeNearestNeighbor.
+type ResizeNearestNeighborAttr func(optionalAttr)
 
-// MaxPool3DGradGradDataFormat sets the optional data_format attribute to value.
+// ResizeNearestNeighborAlignCorners sets the optional align_corners attribute to value.
 //
-// value: The data format of the input and output data. With the
-// default format "NDHWC", the data is stored in the order of:
-//     [batch, in_depth, in_height, in_width, in_channels].
-// Alternatively, the format could be "NCDHW", the data storage order is:
-//     [batch, in_channels, in_depth, in_height, in_width].
-// If not specified, defaults to "NDHWC"
-func MaxPool3DGradGradDataFormat(value string) MaxPool3DGradGradAttr {
+// value: If true, rescale input by (new_height - 1) / (height - 1), which
+// exactly aligns the 4 corners of images and resized images. If false, rescale
+// by new_height / height. Treat similarly the width dimension.
+// If not specified, defaults to false
+func ResizeNearestNeighborAlignCorners(value bool) ResizeNearestNeighborAttr {
 	return func(m optionalAttr) {
-		m["data_format"] = value
+		m["align_corners"] = value
 	}
 }
 
-// Computes second-order gradients of the maxpooling function.
+// Resize `images` to `size` using nearest neighbor interpolation.
 //
 // Arguments:
-//	orig_input: The original input tensor.
-//	orig_output: The original output tensor.
-//	grad: Output backprop of shape `[batch, depth, rows, cols, channels]`.
-//	ksize: 1-D tensor of length 5. The size of the window for each dimension of
-// the input tensor. Must have `ksize[0] = ksize[4] = 1`.
-//	strides: 1-D tensor of length 5. The stride of the sliding window for each
-// dimension of `input`. Must have `strides[0] = strides[4] = 1`.
-//	padding: The type of padding algorithm to use.
+//	images: 4-D with shape `[batch, height, width, channels]`.
+//	size: = A 1-D int32 Tensor of 2 elements: `new_height, new_width`.  The
+// new size for the images.
 //
-// Returns Gradients of gradients w.r.t. the input to `max_pool`.
-func MaxPool3DGradGrad(scope *Scope, orig_input tf.Output, orig_output tf.Output, grad tf.Output, ksize []int64, strides []int64, padding string, optional ...MaxPool3DGradGradAttr) (output tf.Output) {
+// Returns 4-D with shape
+// `[batch, new_height, new_width, channels]`.
+func ResizeNearestNeighbor(scope *Scope, images tf.Output, size tf.Output, optional ...ResizeNearestNeighborAttr) (resized_images tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"ksize": ksize, "strides": strides, "padding": padding}
+	attrs := map[string]interface{}{}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "MaxPool3DGradGrad",
+		Type: "ResizeNearestNeighbor",
 		Input: []tf.Input{
-			orig_input, orig_output, grad,
+			images, size,
 		},
 		Attrs: attrs,
 	}
@@ -5599,246 +5582,208 @@ func MaxPool3DGradGrad(scope *Scope, orig_input tf.Output, orig_output tf.Output
 	return op.Output(0)
 }
 
-// FakeQuantWithMinMaxArgsGradientAttr is an optional argument to FakeQuantWithMinMaxArgsGradient.
-type FakeQuantWithMinMaxArgsGradientAttr func(optionalAttr)
-
-// FakeQuantWithMinMaxArgsGradientMin sets the optional min attribute to value.
-// If not specified, defaults to -6
-func FakeQuantWithMinMaxArgsGradientMin(value float32) FakeQuantWithMinMaxArgsGradientAttr {
-	return func(m optionalAttr) {
-		m["min"] = value
-	}
-}
-
-// FakeQuantWithMinMaxArgsGradientMax sets the optional max attribute to value.
-// If not specified, defaults to 6
-func FakeQuantWithMinMaxArgsGradientMax(value float32) FakeQuantWithMinMaxArgsGradientAttr {
-	return func(m optionalAttr) {
-		m["max"] = value
-	}
-}
-
-// Compute gradients for a FakeQuantWithMinMaxArgs operation.
+// Returns the set of files matching one or more glob patterns.
+//
+// Note that this routine only supports wildcard characters in the
+// basename portion of the pattern, not in the directory portion.
 //
 // Arguments:
-//	gradients: Backpropagated gradients above the FakeQuantWithMinMaxArgs operation.
-//	inputs: Values passed as inputs to the FakeQuantWithMinMaxArgs operation.
+//	pattern: Shell wildcard pattern(s). Scalar or vector of type string.
 //
-// Returns Backpropagated gradients below the FakeQuantWithMinMaxArgs operation:
-// `gradients * (inputs >= min && inputs <= max)`.
-func FakeQuantWithMinMaxArgsGradient(scope *Scope, gradients tf.Output, inputs tf.Output, optional ...FakeQuantWithMinMaxArgsGradientAttr) (backprops tf.Output) {
+// Returns A vector of matching filenames.
+func MatchingFiles(scope *Scope, pattern tf.Output) (filenames tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "FakeQuantWithMinMaxArgsGradient",
+		Type: "MatchingFiles",
 		Input: []tf.Input{
-			gradients, inputs,
+			pattern,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Computes gradients of the maxpooling function.
-//
-// Arguments:
-//	input: The original input.
-//	grad: 4-D with shape `[batch, height, width, channels]`.  Gradients w.r.t. the
-// output of `max_pool`.
-//	argmax: The indices of the maximum values chosen for each output of `max_pool`.
-//	ksize: The size of the window for each dimension of the input tensor.
-//	strides: The stride of the sliding window for each dimension of the
-// input tensor.
-//	padding: The type of padding algorithm to use.
+// Shuffle dimensions of x according to a permutation.
 //
-// Returns Gradients w.r.t. the input of `max_pool`.
-func MaxPoolGradWithArgmax(scope *Scope, input tf.Output, grad tf.Output, argmax tf.Output, ksize []int64, strides []int64, padding string) (output tf.Output) {
+// The output `y` has the same rank as `x`. The shapes of `x` and `y` satisfy:
+//   `y.shape[i] == x.shape[perm[i]] for i in [0, 1, ..., rank(x) - 1]`
+func Transpose(scope *Scope, x tf.Output, perm tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"ksize": ksize, "strides": strides, "padding": padding}
 	opspec := tf.OpSpec{
-		Type: "MaxPoolGradWithArgmax",
+		Type: "Transpose",
 		Input: []tf.Input{
-			input, grad, argmax,
+			x, perm,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Computes the gradient of morphological 2-D dilation with respect to the filter.
-//
-// Arguments:
-//	input: 4-D with shape `[batch, in_height, in_width, depth]`.
-//	filter: 3-D with shape `[filter_height, filter_width, depth]`.
-//	out_backprop: 4-D with shape `[batch, out_height, out_width, depth]`.
-//	strides: 1-D of length 4. The stride of the sliding window for each dimension of
-// the input tensor. Must be: `[1, stride_height, stride_width, 1]`.
-//	rates: 1-D of length 4. The input stride for atrous morphological dilation.
-// Must be: `[1, rate_height, rate_width, 1]`.
-//	padding: The type of padding algorithm to use.
-//
-// Returns 3-D with shape `[filter_height, filter_width, depth]`.
-func Dilation2DBackpropFilter(scope *Scope, input tf.Output, filter tf.Output, out_backprop tf.Output, strides []int64, rates []int64, padding string) (filter_backprop tf.Output) {
+// Reads and outputs the entire contents of the input filename.
+func ReadFile(scope *Scope, filename tf.Output) (contents tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"strides": strides, "rates": rates, "padding": padding}
 	opspec := tf.OpSpec{
-		Type: "Dilation2DBackpropFilter",
+		Type: "ReadFile",
 		Input: []tf.Input{
-			input, filter, out_backprop,
+			filename,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Returns the truth value of (x == y) element-wise.
+// Store the input tensor in the state of the current session.
 //
-// *NOTE*: `Equal` supports broadcasting. More about broadcasting
-// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-func Equal(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+// Arguments:
+//	value: The tensor to be stored.
+//
+// Returns The handle for the tensor stored in the session state, represented
+// as a ResourceHandle object.
+func GetSessionHandleV2(scope *Scope, value tf.Output) (handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Equal",
+		Type: "GetSessionHandleV2",
 		Input: []tf.Input{
-			x, y,
+			value,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// AvgPool3DAttr is an optional argument to AvgPool3D.
-type AvgPool3DAttr func(optionalAttr)
-
-// AvgPool3DDataFormat sets the optional data_format attribute to value.
+// Adjust the hue of one or more images.
 //
-// value: The data format of the input and output data. With the
-// default format "NDHWC", the data is stored in the order of:
-//     [batch, in_depth, in_height, in_width, in_channels].
-// Alternatively, the format could be "NCDHW", the data storage order is:
-//     [batch, in_channels, in_depth, in_height, in_width].
-// If not specified, defaults to "NDHWC"
-func AvgPool3DDataFormat(value string) AvgPool3DAttr {
-	return func(m optionalAttr) {
-		m["data_format"] = value
+// `images` is a tensor of at least 3 dimensions.  The last dimension is
+// interpretted as channels, and must be three.
+//
+// The input image is considered in the RGB colorspace. Conceptually, the RGB
+// colors are first mapped into HSV. A delta is then applied all the hue values,
+// and then remapped back to RGB colorspace.
+//
+// Arguments:
+//	images: Images to adjust.  At least 3-D.
+//	delta: A float delta to add to the hue.
+//
+// Returns The hue-adjusted image or images.
+func AdjustHue(scope *Scope, images tf.Output, delta tf.Output) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "AdjustHue",
+		Input: []tf.Input{
+			images, delta,
+		},
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// Performs 3D average pooling on the input.
+// Restore a Reader to its initial clean state.
 //
 // Arguments:
-//	input: Shape `[batch, depth, rows, cols, channels]` tensor to pool over.
-//	ksize: 1-D tensor of length 5. The size of the window for each dimension of
-// the input tensor. Must have `ksize[0] = ksize[4] = 1`.
-//	strides: 1-D tensor of length 5. The stride of the sliding window for each
-// dimension of `input`. Must have `strides[0] = strides[4] = 1`.
-//	padding: The type of padding algorithm to use.
+//	reader_handle: Handle to a Reader.
 //
-// Returns The average pooled output tensor.
-func AvgPool3D(scope *Scope, input tf.Output, ksize []int64, strides []int64, padding string, optional ...AvgPool3DAttr) (output tf.Output) {
+// Returns the created operation.
+func ReaderResetV2(scope *Scope, reader_handle tf.Output) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"ksize": ksize, "strides": strides, "padding": padding}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "AvgPool3D",
+		Type: "ReaderResetV2",
 		Input: []tf.Input{
-			input,
+			reader_handle,
 		},
-		Attrs: attrs,
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return scope.AddOperation(opspec)
 }
 
-// Produces the max pool of the input tensor for quantized types.
+// Computes softmax cross entropy cost and gradients to backpropagate.
+//
+// Unlike `SoftmaxCrossEntropyWithLogits`, this operation does not accept
+// a matrix of label probabilities, but rather a single label per row
+// of features.  This label is considered to have probability 1.0 for the
+// given row.
+//
+// Inputs are the logits, not probabilities.
 //
 // Arguments:
-//	input: The 4D (batch x rows x cols x depth) Tensor to MaxReduce over.
-//	min_input: The float value that the lowest quantized input value represents.
-//	max_input: The float value that the highest quantized input value represents.
-//	ksize: The size of the window for each dimension of the input tensor.
-// The length must be 4 to match the number of dimensions of the input.
-//	strides: The stride of the sliding window for each dimension of the input
-// tensor. The length must be 4 to match the number of dimensions of the input.
-//	padding: The type of padding algorithm to use.
+//	features: batch_size x num_classes matrix
+//	labels: batch_size vector with values in [0, num_classes).
+// This is the label for the given minibatch entry.
 //
-// Returns The float value that the lowest quantized output value represents.The float value that the highest quantized output value represents.
-func QuantizedMaxPool(scope *Scope, input tf.Output, min_input tf.Output, max_input tf.Output, ksize []int64, strides []int64, padding string) (output tf.Output, min_output tf.Output, max_output tf.Output) {
+// Returns Per example loss (batch_size vector).backpropagated gradients (batch_size x num_classes matrix).
+func SparseSoftmaxCrossEntropyWithLogits(scope *Scope, features tf.Output, labels tf.Output) (loss tf.Output, backprop tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"ksize": ksize, "strides": strides, "padding": padding}
 	opspec := tf.OpSpec{
-		Type: "QuantizedMaxPool",
+		Type: "SparseSoftmaxCrossEntropyWithLogits",
 		Input: []tf.Input{
-			input, min_input, max_input,
+			features, labels,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
+	return op.Output(0), op.Output(1)
 }
 
-// Conv3DBackpropInputV2Attr is an optional argument to Conv3DBackpropInputV2.
-type Conv3DBackpropInputV2Attr func(optionalAttr)
+// TensorSummaryAttr is an optional argument to TensorSummary.
+type TensorSummaryAttr func(optionalAttr)
 
-// Conv3DBackpropInputV2DataFormat sets the optional data_format attribute to value.
+// TensorSummaryDescription sets the optional description attribute to value.
 //
-// value: The data format of the input and output data. With the
-// default format "NDHWC", the data is stored in the order of:
-//     [batch, in_depth, in_height, in_width, in_channels].
-// Alternatively, the format could be "NCDHW", the data storage order is:
-//     [batch, in_channels, in_depth, in_height, in_width].
-// If not specified, defaults to "NDHWC"
-func Conv3DBackpropInputV2DataFormat(value string) Conv3DBackpropInputV2Attr {
+// value: A json-encoded SummaryDescription proto.
+// If not specified, defaults to ""
+func TensorSummaryDescription(value string) TensorSummaryAttr {
 	return func(m optionalAttr) {
-		m["data_format"] = value
+		m["description"] = value
 	}
 }
 
-// Computes the gradients of 3-D convolution with respect to the input.
+// TensorSummaryLabels sets the optional labels attribute to value.
+//
+// value: An unused list of strings.
+// If not specified, defaults to <>
+func TensorSummaryLabels(value []string) TensorSummaryAttr {
+	return func(m optionalAttr) {
+		m["labels"] = value
+	}
+}
+
+// TensorSummaryDisplayName sets the optional display_name attribute to value.
+//
+// value: An unused string.
+// If not specified, defaults to ""
+func TensorSummaryDisplayName(value string) TensorSummaryAttr {
+	return func(m optionalAttr) {
+		m["display_name"] = value
+	}
+}
+
+// Outputs a `Summary` protocol buffer with a tensor.
 //
 // Arguments:
-//	input_sizes: An integer vector representing the tensor shape of `input`,
-// where `input` is a 5-D
-// `[batch, depth, rows, cols, in_channels]` tensor.
-//	filter: Shape `[depth, rows, cols, in_channels, out_channels]`.
-// `in_channels` must match between `input` and `filter`.
-//	out_backprop: Backprop signal of shape `[batch, out_depth, out_rows, out_cols,
-// out_channels]`.
-//	strides: 1-D tensor of length 5. The stride of the sliding window for each
-// dimension of `input`. Must have `strides[0] = strides[4] = 1`.
-//	padding: The type of padding algorithm to use.
-func Conv3DBackpropInputV2(scope *Scope, input_sizes tf.Output, filter tf.Output, out_backprop tf.Output, strides []int64, padding string, optional ...Conv3DBackpropInputV2Attr) (output tf.Output) {
+//	tensor: A tensor to serialize.
+func TensorSummary(scope *Scope, tensor tf.Output, optional ...TensorSummaryAttr) (summary tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"strides": strides, "padding": padding}
+	attrs := map[string]interface{}{}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "Conv3DBackpropInputV2",
+		Type: "TensorSummary",
 		Input: []tf.Input{
-			input_sizes, filter, out_backprop,
+			tensor,
 		},
 		Attrs: attrs,
 	}
@@ -5846,95 +5791,103 @@ func Conv3DBackpropInputV2(scope *Scope, input_sizes tf.Output, filter tf.Output
 	return op.Output(0)
 }
 
-// Returns a tensor of ones with the same shape and type as x.
+// Computes softplus gradients for a softplus operation.
 //
 // Arguments:
-//	x: a tensor of type T.
+//	gradients: The backpropagated gradients to the corresponding softplus operation.
+//	features: The features passed as input to the corresponding softplus operation.
 //
-// Returns a tensor of the same shape and type as x but filled with ones.
-func OnesLike(scope *Scope, x tf.Output) (y tf.Output) {
+// Returns The gradients: `gradients / (1 + exp(-features))`.
+func SoftplusGrad(scope *Scope, gradients tf.Output, features tf.Output) (backprops tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "OnesLike",
+		Type: "SoftplusGrad",
 		Input: []tf.Input{
-			x,
+			gradients, features,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Returns element-wise remainder of division.
-//
-// *NOTE*: `Mod` supports broadcasting. More about broadcasting
-// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-func Mod(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+// Computes softplus: `log(exp(features) + 1)`.
+func Softplus(scope *Scope, features tf.Output) (activations tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Mod",
+		Type: "Softplus",
 		Input: []tf.Input{
-			x, y,
+			features,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Computes the gradients of 3-D convolution with respect to the filter.
-//
-// DEPRECATED at GraphDef version 10: Use Conv3DBackpropFilterV2
+// BatchMatMulAttr is an optional argument to BatchMatMul.
+type BatchMatMulAttr func(optionalAttr)
+
+// BatchMatMulAdjX sets the optional adj_x attribute to value.
 //
-// Arguments:
-//	input: Shape `[batch, depth, rows, cols, in_channels]`.
-//	filter: Shape `[depth, rows, cols, in_channels, out_channels]`.
-// `in_channels` must match between `input` and `filter`.
-//	out_backprop: Backprop signal of shape `[batch, out_depth, out_rows, out_cols,
-// out_channels]`.
-//	strides: 1-D tensor of length 5. The stride of the sliding window for each
-// dimension of `input`. Must have `strides[0] = strides[4] = 1`.
-//	padding: The type of padding algorithm to use.
-func Conv3DBackpropFilter(scope *Scope, input tf.Output, filter tf.Output, out_backprop tf.Output, strides []int64, padding string) (output tf.Output) {
-	if scope.Err() != nil {
-		return
+// value: If `True`, adjoint the slices of `x`. Defaults to `False`.
+// If not specified, defaults to false
+func BatchMatMulAdjX(value bool) BatchMatMulAttr {
+	return func(m optionalAttr) {
+		m["adj_x"] = value
 	}
-	attrs := map[string]interface{}{"strides": strides, "padding": padding}
-	opspec := tf.OpSpec{
-		Type: "Conv3DBackpropFilter",
-		Input: []tf.Input{
-			input, filter, out_backprop,
-		},
-		Attrs: attrs,
+}
+
+// BatchMatMulAdjY sets the optional adj_y attribute to value.
+//
+// value: If `True`, adjoint the slices of `y`. Defaults to `False`.
+// If not specified, defaults to false
+func BatchMatMulAdjY(value bool) BatchMatMulAttr {
+	return func(m optionalAttr) {
+		m["adj_y"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// Computes the gradients of 3-D convolution with respect to the input.
+// Multiplies slices of two tensors in batches.
 //
-// DEPRECATED at GraphDef version 10: Use Conv3DBackpropInputV2
+// Multiplies all slices of `Tensor` `x` and `y` (each slice can be
+// viewed as an element of a batch), and arranges the individual results
+// in a single output tensor of the same batch size. Each of the
+// individual slices can optionally be adjointed (to adjoint a matrix
+// means to transpose and conjugate it) before multiplication by setting
+// the `adj_x` or `adj_y` flag to `True`, which are by default `False`.
+//
+// The input tensors `x` and `y` are 2-D or higher with shape `[..., r_x, c_x]`
+// and `[..., r_y, c_y]`.
+//
+// The output tensor is 2-D or higher with shape `[..., r_o, c_o]`, where:
+//
+//     r_o = c_x if adj_x else r_x
+//     c_o = r_y if adj_y else c_y
+//
+// It is computed as:
+//
+//     output[..., :, :] = matrix(x[..., :, :]) * matrix(y[..., :, :])
 //
 // Arguments:
-//	input: Shape `[batch, depth, rows, cols, in_channels]`.
-//	filter: Shape `[depth, rows, cols, in_channels, out_channels]`.
-// `in_channels` must match between `input` and `filter`.
-//	out_backprop: Backprop signal of shape `[batch, out_depth, out_rows, out_cols,
-// out_channels]`.
-//	strides: 1-D tensor of length 5. The stride of the sliding window for each
-// dimension of `input`. Must have `strides[0] = strides[4] = 1`.
-//	padding: The type of padding algorithm to use.
-func Conv3DBackpropInput(scope *Scope, input tf.Output, filter tf.Output, out_backprop tf.Output, strides []int64, padding string) (output tf.Output) {
+//	x: 2-D or higher with shape `[..., r_x, c_x]`.
+//	y: 2-D or higher with shape `[..., r_y, c_y]`.
+//
+// Returns 3-D or higher with shape `[..., r_o, c_o]`
+func BatchMatMul(scope *Scope, x tf.Output, y tf.Output, optional ...BatchMatMulAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"strides": strides, "padding": padding}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "Conv3DBackpropInput",
+		Type: "BatchMatMul",
 		Input: []tf.Input{
-			input, filter, out_backprop,
+			x, y,
 		},
 		Attrs: attrs,
 	}
@@ -5942,95 +5895,60 @@ func Conv3DBackpropInput(scope *Scope, input tf.Output, filter tf.Output, out_ba
 	return op.Output(0)
 }
 
-// ReverseSequenceAttr is an optional argument to ReverseSequence.
-type ReverseSequenceAttr func(optionalAttr)
+// SparseTensorDenseMatMulAttr is an optional argument to SparseTensorDenseMatMul.
+type SparseTensorDenseMatMulAttr func(optionalAttr)
 
-// ReverseSequenceBatchDim sets the optional batch_dim attribute to value.
+// SparseTensorDenseMatMulAdjointA sets the optional adjoint_a attribute to value.
 //
-// value: The dimension along which reversal is performed.
-// If not specified, defaults to 0
-func ReverseSequenceBatchDim(value int64) ReverseSequenceAttr {
+// value: Use the adjoint of A in the matrix multiply.  If A is complex, this
+// is transpose(conj(A)).  Otherwise it's transpose(A).
+// If not specified, defaults to false
+func SparseTensorDenseMatMulAdjointA(value bool) SparseTensorDenseMatMulAttr {
 	return func(m optionalAttr) {
-		m["batch_dim"] = value
+		m["adjoint_a"] = value
 	}
 }
 
-// Reverses variable length slices.
-//
-// This op first slices `input` along the dimension `batch_dim`, and for each
-// slice `i`, reverses the first `seq_lengths[i]` elements along
-// the dimension `seq_dim`.
-//
-// The elements of `seq_lengths` must obey `seq_lengths[i] <= input.dims[seq_dim]`,
-// and `seq_lengths` must be a vector of length `input.dims[batch_dim]`.
-//
-// The output slice `i` along dimension `batch_dim` is then given by input
-// slice `i`, with the first `seq_lengths[i]` slices along dimension
-// `seq_dim` reversed.
-//
-// For example:
-//
-// ```prettyprint
-// # Given this:
-// batch_dim = 0
-// seq_dim = 1
-// input.dims = (4, 8, ...)
-// seq_lengths = [7, 2, 3, 5]
-//
-// # then slices of input are reversed on seq_dim, but only up to seq_lengths:
-// output[0, 0:7, :, ...] = input[0, 7:0:-1, :, ...]
-// output[1, 0:2, :, ...] = input[1, 2:0:-1, :, ...]
-// output[2, 0:3, :, ...] = input[2, 3:0:-1, :, ...]
-// output[3, 0:5, :, ...] = input[3, 5:0:-1, :, ...]
-//
-// # while entries past seq_lens are copied through:
-// output[0, 7:, :, ...] = input[0, 7:, :, ...]
-// output[1, 2:, :, ...] = input[1, 2:, :, ...]
-// output[2, 3:, :, ...] = input[2, 3:, :, ...]
-// output[3, 2:, :, ...] = input[3, 2:, :, ...]
-// ```
-//
-// In contrast, if:
+// SparseTensorDenseMatMulAdjointB sets the optional adjoint_b attribute to value.
 //
-// ```prettyprint
-// # Given this:
-// batch_dim = 2
-// seq_dim = 0
-// input.dims = (8, ?, 4, ...)
-// seq_lengths = [7, 2, 3, 5]
+// value: Use the adjoint of B in the matrix multiply.  If B is complex, this
+// is transpose(conj(B)).  Otherwise it's transpose(B).
+// If not specified, defaults to false
+func SparseTensorDenseMatMulAdjointB(value bool) SparseTensorDenseMatMulAttr {
+	return func(m optionalAttr) {
+		m["adjoint_b"] = value
+	}
+}
+
+// Multiply SparseTensor (of rank 2) "A" by dense matrix "B".
 //
-// # then slices of input are reversed on seq_dim, but only up to seq_lengths:
-// output[0:7, :, 0, :, ...] = input[7:0:-1, :, 0, :, ...]
-// output[0:2, :, 1, :, ...] = input[2:0:-1, :, 1, :, ...]
-// output[0:3, :, 2, :, ...] = input[3:0:-1, :, 2, :, ...]
-// output[0:5, :, 3, :, ...] = input[5:0:-1, :, 3, :, ...]
+// No validity checking is performed on the indices of A.  However, the following
+// input format is recommended for optimal behavior:
 //
-// # while entries past seq_lens are copied through:
-// output[7:, :, 0, :, ...] = input[7:, :, 0, :, ...]
-// output[2:, :, 1, :, ...] = input[2:, :, 1, :, ...]
-// output[3:, :, 2, :, ...] = input[3:, :, 2, :, ...]
-// output[2:, :, 3, :, ...] = input[2:, :, 3, :, ...]
-// ```
+// if adjoint_a == false:
+//   A should be sorted in lexicographically increasing order.  Use SparseReorder
+//   if you're not sure.
+// if adjoint_a == true:
+//   A should be sorted in order of increasing dimension 1 (i.e., "column major"
+//   order instead of "row major" order).
 //
 // Arguments:
-//	input: The input to reverse.
-//	seq_lengths: 1-D with length `input.dims(batch_dim)` and
-// `max(seq_lengths) <= input.dims(seq_dim)`
-//	seq_dim: The dimension which is partially reversed.
-//
-// Returns The partially reversed input. It has the same shape as `input`.
-func ReverseSequence(scope *Scope, input tf.Output, seq_lengths tf.Output, seq_dim int64, optional ...ReverseSequenceAttr) (output tf.Output) {
+//	a_indices: 2-D.  The `indices` of the `SparseTensor`, size `[nnz, 2]` Matrix.
+//	a_values: 1-D.  The `values` of the `SparseTensor`, size `[nnz]` Vector.
+//	a_shape: 1-D.  The `shape` of the `SparseTensor`, size `[2]` Vector.
+//	b: 2-D.  A dense Matrix.
+func SparseTensorDenseMatMul(scope *Scope, a_indices tf.Output, a_values tf.Output, a_shape tf.Output, b tf.Output, optional ...SparseTensorDenseMatMulAttr) (product tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"seq_dim": seq_dim}
+	attrs := map[string]interface{}{}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "ReverseSequence",
+		Type: "SparseTensorDenseMatMul",
 		Input: []tf.Input{
-			input, seq_lengths,
+			a_indices, a_values, a_shape, b,
 		},
 		Attrs: attrs,
 	}
@@ -6038,96 +5956,82 @@ func ReverseSequence(scope *Scope, input tf.Output, seq_lengths tf.Output, seq_d
 	return op.Output(0)
 }
 
-// Computes the gradient for the rsqrt of `x` wrt its input.
-//
-// Specifically, `grad = dy * -0.5 * y^3`, where `y = rsqrt(x)`, and `dy`
-// is the corresponding input gradient.
-func RsqrtGrad(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+// Computes rectified linear 6: `min(max(features, 0), 6)`.
+func Relu6(scope *Scope, features tf.Output) (activations tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "RsqrtGrad",
+		Type: "Relu6",
 		Input: []tf.Input{
-			x, y,
+			features,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Execute a sub graph on a remote processor transferred by GraphTransferer.
+// AudioSpectrogramAttr is an optional argument to AudioSpectrogram.
+type AudioSpectrogramAttr func(optionalAttr)
+
+// AudioSpectrogramMagnitudeSquared sets the optional magnitude_squared attribute to value.
 //
-// The graph specifications are serialized by protobuf as graph_transfer_info.
-// The implementation / limitations may differ for each platform
-// and each available peripheral.
-func RemoteFusedGraphExecute(scope *Scope, inputs []tf.Output, Toutputs []tf.DataType, serialized_remote_fused_graph_execute_info string) (outputs []tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"Toutputs": Toutputs, "serialized_remote_fused_graph_execute_info": serialized_remote_fused_graph_execute_info}
-	opspec := tf.OpSpec{
-		Type: "RemoteFusedGraphExecute",
-		Input: []tf.Input{
-			tf.OutputList(inputs),
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	if scope.Err() != nil {
-		return
+// value: Whether to return the squared magnitude or just the
+// magnitude. Using squared magnitude can avoid extra calculations.
+// If not specified, defaults to false
+func AudioSpectrogramMagnitudeSquared(value bool) AudioSpectrogramAttr {
+	return func(m optionalAttr) {
+		m["magnitude_squared"] = value
 	}
-	var idx int
-	var err error
-	if outputs, idx, err = makeOutputList(op, idx, "outputs"); err != nil {
-		scope.UpdateErr("RemoteFusedGraphExecute", err)
-		return
-	}
-	return outputs
 }
 
-// Conv3DBackpropFilterV2Attr is an optional argument to Conv3DBackpropFilterV2.
-type Conv3DBackpropFilterV2Attr func(optionalAttr)
-
-// Conv3DBackpropFilterV2DataFormat sets the optional data_format attribute to value.
+// Produces a visualization of audio data over time.
 //
-// value: The data format of the input and output data. With the
-// default format "NDHWC", the data is stored in the order of:
-//     [batch, in_depth, in_height, in_width, in_channels].
-// Alternatively, the format could be "NCDHW", the data storage order is:
-//     [batch, in_channels, in_depth, in_height, in_width].
-// If not specified, defaults to "NDHWC"
-func Conv3DBackpropFilterV2DataFormat(value string) Conv3DBackpropFilterV2Attr {
-	return func(m optionalAttr) {
-		m["data_format"] = value
-	}
-}
-
-// Computes the gradients of 3-D convolution with respect to the filter.
+// Spectrograms are a standard way of representing audio information as a series of
+// slices of frequency information, one slice for each window of time. By joining
+// these together into a sequence, they form a distinctive fingerprint of the sound
+// over time.
+//
+// This op expects to receive audio data as an input, stored as floats in the range
+// -1 to 1, together with a window width in samples, and a stride specifying how
+// far to move the window between slices. From this it generates a three
+// dimensional output. The lowest dimension has an amplitude value for each
+// frequency during that time slice. The next dimension is time, with successive
+// frequency slices. The final dimension is for the channels in the input, so a
+// stereo audio input would have two here for example.
+//
+// This means the layout when converted and saved as an image is rotated 90 degrees
+// clockwise from a typical spectrogram. Time is descending down the Y axis, and
+// the frequency decreases from left to right.
+//
+// Each value in the result represents the square root of the sum of the real and
+// imaginary parts of an FFT on the current window of samples. In this way, the
+// lowest dimension represents the power of each frequency in the current window,
+// and adjacent windows are concatenated in the next dimension.
+//
+// To get a more intuitive and visual look at what this operation does, you can run
+// tensorflow/examples/wav_to_spectrogram to read in an audio file and save out the
+// resulting spectrogram as a PNG image.
 //
 // Arguments:
-//	input: Shape `[batch, depth, rows, cols, in_channels]`.
-//	filter_sizes: An integer vector representing the tensor shape of `filter`,
-// where `filter` is a 5-D
-// `[filter_depth, filter_height, filter_width, in_channels, out_channels]`
-// tensor.
-//	out_backprop: Backprop signal of shape `[batch, out_depth, out_rows, out_cols,
-// out_channels]`.
-//	strides: 1-D tensor of length 5. The stride of the sliding window for each
-// dimension of `input`. Must have `strides[0] = strides[4] = 1`.
-//	padding: The type of padding algorithm to use.
-func Conv3DBackpropFilterV2(scope *Scope, input tf.Output, filter_sizes tf.Output, out_backprop tf.Output, strides []int64, padding string, optional ...Conv3DBackpropFilterV2Attr) (output tf.Output) {
+//	input: Float representation of audio data.
+//	window_size: How wide the input window is in samples. For the highest efficiency
+// this should be a power of two, but other values are accepted.
+//	stride: How widely apart the center of adjacent sample windows should be.
+//
+// Returns 3D representation of the audio frequencies as an image.
+func AudioSpectrogram(scope *Scope, input tf.Output, window_size int64, stride int64, optional ...AudioSpectrogramAttr) (spectrogram tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"strides": strides, "padding": padding}
+	attrs := map[string]interface{}{"window_size": window_size, "stride": stride}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "Conv3DBackpropFilterV2",
+		Type: "AudioSpectrogram",
 		Input: []tf.Input{
-			input, filter_sizes, out_backprop,
+			input,
 		},
 		Attrs: attrs,
 	}
@@ -6135,191 +6039,201 @@ func Conv3DBackpropFilterV2(scope *Scope, input tf.Output, filter_sizes tf.Outpu
 	return op.Output(0)
 }
 
-// TensorArrayV2Attr is an optional argument to TensorArrayV2.
-type TensorArrayV2Attr func(optionalAttr)
-
-// TensorArrayV2ElementShape sets the optional element_shape attribute to value.
-// If not specified, defaults to <unknown_rank:true >
-func TensorArrayV2ElementShape(value tf.Shape) TensorArrayV2Attr {
-	return func(m optionalAttr) {
-		m["element_shape"] = value
+// Computes the gradient of morphological 2-D dilation with respect to the input.
+//
+// Arguments:
+//	input: 4-D with shape `[batch, in_height, in_width, depth]`.
+//	filter: 3-D with shape `[filter_height, filter_width, depth]`.
+//	out_backprop: 4-D with shape `[batch, out_height, out_width, depth]`.
+//	strides: 1-D of length 4. The stride of the sliding window for each dimension of
+// the input tensor. Must be: `[1, stride_height, stride_width, 1]`.
+//	rates: 1-D of length 4. The input stride for atrous morphological dilation.
+// Must be: `[1, rate_height, rate_width, 1]`.
+//	padding: The type of padding algorithm to use.
+//
+// Returns 4-D with shape `[batch, in_height, in_width, depth]`.
+func Dilation2DBackpropInput(scope *Scope, input tf.Output, filter tf.Output, out_backprop tf.Output, strides []int64, rates []int64, padding string) (in_backprop tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"strides": strides, "rates": rates, "padding": padding}
+	opspec := tf.OpSpec{
+		Type: "Dilation2DBackpropInput",
+		Input: []tf.Input{
+			input, filter, out_backprop,
+		},
+		Attrs: attrs,
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// TensorArrayV2DynamicSize sets the optional dynamic_size attribute to value.
-// If not specified, defaults to false
-func TensorArrayV2DynamicSize(value bool) TensorArrayV2Attr {
+// FusedBatchNormGradAttr is an optional argument to FusedBatchNormGrad.
+type FusedBatchNormGradAttr func(optionalAttr)
+
+// FusedBatchNormGradEpsilon sets the optional epsilon attribute to value.
+//
+// value: A small float number added to the variance of x.
+// If not specified, defaults to 0.0001
+func FusedBatchNormGradEpsilon(value float32) FusedBatchNormGradAttr {
 	return func(m optionalAttr) {
-		m["dynamic_size"] = value
+		m["epsilon"] = value
 	}
 }
 
-// TensorArrayV2ClearAfterRead sets the optional clear_after_read attribute to value.
-// If not specified, defaults to true
-func TensorArrayV2ClearAfterRead(value bool) TensorArrayV2Attr {
+// FusedBatchNormGradDataFormat sets the optional data_format attribute to value.
+//
+// value: The data format for y_backprop, x, x_backprop.
+// Either "NHWC" (default) or "NCHW".
+// If not specified, defaults to "NHWC"
+func FusedBatchNormGradDataFormat(value string) FusedBatchNormGradAttr {
 	return func(m optionalAttr) {
-		m["clear_after_read"] = value
+		m["data_format"] = value
 	}
 }
 
-// TensorArrayV2TensorArrayName sets the optional tensor_array_name attribute to value.
-// If not specified, defaults to ""
-func TensorArrayV2TensorArrayName(value string) TensorArrayV2Attr {
+// FusedBatchNormGradIsTraining sets the optional is_training attribute to value.
+//
+// value: A bool value to indicate the operation is for training (default)
+// or inference.
+// If not specified, defaults to true
+func FusedBatchNormGradIsTraining(value bool) FusedBatchNormGradAttr {
 	return func(m optionalAttr) {
-		m["tensor_array_name"] = value
+		m["is_training"] = value
 	}
 }
 
-// Deprecated. Use TensorArrayV3
-func TensorArrayV2(scope *Scope, size tf.Output, dtype tf.DataType, optional ...TensorArrayV2Attr) (handle tf.Output) {
+// Gradient for batch normalization.
+//
+// Note that the size of 4D Tensors are defined by either "NHWC" or "NCHW".
+// The size of 1D Tensors matches the dimension C of the 4D Tensors.
+//
+// Arguments:
+//	y_backprop: A 4D Tensor for the gradient with respect to y.
+//	x: A 4D Tensor for input data.
+//	scale: A 1D Tensor for scaling factor, to scale the normalized x.
+//	reserve_space_1: A 1D Tensor for the computed batch mean, to be reused
+// in the gradient computation.
+//	reserve_space_2: A 1D Tensor for the computed batch variance (inverted variance
+// in the cuDNN case), to be used in the gradient computation.
+//
+// Returns A 4D Tensor for the gradient with respect to x.A 1D Tensor for the gradient with respect to scale.A 1D Tensor for the gradient with respect to offset.Unused placeholder to match the mean input in FusedBatchNorm.Unused placeholder to match the variance input
+// in FusedBatchNorm.
+func FusedBatchNormGrad(scope *Scope, y_backprop tf.Output, x tf.Output, scale tf.Output, reserve_space_1 tf.Output, reserve_space_2 tf.Output, optional ...FusedBatchNormGradAttr) (x_backprop tf.Output, scale_backprop tf.Output, offset_backprop tf.Output, reserve_space_3 tf.Output, reserve_space_4 tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"dtype": dtype}
+	attrs := map[string]interface{}{}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "TensorArrayV2",
+		Type: "FusedBatchNormGrad",
 		Input: []tf.Input{
-			size,
+			y_backprop, x, scale, reserve_space_1, reserve_space_2,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1), op.Output(2), op.Output(3), op.Output(4)
 }
 
-// Serialize an `N`-minibatch `SparseTensor` into an `[N, 3]` string `Tensor`.
-//
-// The `SparseTensor` must have rank `R` greater than 1, and the first dimension
-// is treated as the minibatch dimension.  Elements of the `SparseTensor`
-// must be sorted in increasing order of this first dimension.  The serialized
-// `SparseTensor` objects going into each row of `serialized_sparse` will have
-// rank `R-1`.
-//
-// The minibatch size `N` is extracted from `sparse_shape[0]`.
+// MaxPool3DGradGradAttr is an optional argument to MaxPool3DGradGrad.
+type MaxPool3DGradGradAttr func(optionalAttr)
+
+// MaxPool3DGradGradDataFormat sets the optional data_format attribute to value.
 //
-// Arguments:
-//	sparse_indices: 2-D.  The `indices` of the minibatch `SparseTensor`.
-//	sparse_values: 1-D.  The `values` of the minibatch `SparseTensor`.
-//	sparse_shape: 1-D.  The `shape` of the minibatch `SparseTensor`.
-func SerializeManySparse(scope *Scope, sparse_indices tf.Output, sparse_values tf.Output, sparse_shape tf.Output) (serialized_sparse tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "SerializeManySparse",
-		Input: []tf.Input{
-			sparse_indices, sparse_values, sparse_shape,
-		},
+// value: The data format of the input and output data. With the
+// default format "NDHWC", the data is stored in the order of:
+//     [batch, in_depth, in_height, in_width, in_channels].
+// Alternatively, the format could be "NCDHW", the data storage order is:
+//     [batch, in_channels, in_depth, in_height, in_width].
+// If not specified, defaults to "NDHWC"
+func MaxPool3DGradGradDataFormat(value string) MaxPool3DGradGradAttr {
+	return func(m optionalAttr) {
+		m["data_format"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// Makes its input available to the next iteration.
+// Computes second-order gradients of the maxpooling function.
 //
 // Arguments:
-//	data: The tensor to be made available to the next iteration.
+//	orig_input: The original input tensor.
+//	orig_output: The original output tensor.
+//	grad: Output backprop of shape `[batch, depth, rows, cols, channels]`.
+//	ksize: 1-D tensor of length 5. The size of the window for each dimension of
+// the input tensor. Must have `ksize[0] = ksize[4] = 1`.
+//	strides: 1-D tensor of length 5. The stride of the sliding window for each
+// dimension of `input`. Must have `strides[0] = strides[4] = 1`.
+//	padding: The type of padding algorithm to use.
 //
-// Returns The same tensor as `data`.
-func NextIteration(scope *Scope, data tf.Output) (output tf.Output) {
+// Returns Gradients of gradients w.r.t. the input to `max_pool`.
+func MaxPool3DGradGrad(scope *Scope, orig_input tf.Output, orig_output tf.Output, grad tf.Output, ksize []int64, strides []int64, padding string, optional ...MaxPool3DGradGradAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"ksize": ksize, "strides": strides, "padding": padding}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "NextIteration",
+		Type: "MaxPool3DGradGrad",
 		Input: []tf.Input{
-			data,
+			orig_input, orig_output, grad,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Does nothing. Only useful as a placeholder for control edges.
-//
-// Returns the created operation.
-func NoOp(scope *Scope) (o *tf.Operation) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "NoOp",
+// FakeQuantWithMinMaxArgsGradientAttr is an optional argument to FakeQuantWithMinMaxArgsGradient.
+type FakeQuantWithMinMaxArgsGradientAttr func(optionalAttr)
+
+// FakeQuantWithMinMaxArgsGradientMin sets the optional min attribute to value.
+// If not specified, defaults to -6
+func FakeQuantWithMinMaxArgsGradientMin(value float32) FakeQuantWithMinMaxArgsGradientAttr {
+	return func(m optionalAttr) {
+		m["min"] = value
 	}
-	return scope.AddOperation(opspec)
 }
 
-// Computes softsign: `features / (abs(features) + 1)`.
-func Softsign(scope *Scope, features tf.Output) (activations tf.Output) {
-	if scope.Err() != nil {
-		return
+// FakeQuantWithMinMaxArgsGradientMax sets the optional max attribute to value.
+// If not specified, defaults to 6
+func FakeQuantWithMinMaxArgsGradientMax(value float32) FakeQuantWithMinMaxArgsGradientAttr {
+	return func(m optionalAttr) {
+		m["max"] = value
 	}
-	opspec := tf.OpSpec{
-		Type: "Softsign",
-		Input: []tf.Input{
-			features,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// DepthwiseConv2dNativeAttr is an optional argument to DepthwiseConv2dNative.
-type DepthwiseConv2dNativeAttr func(optionalAttr)
-
-// DepthwiseConv2dNativeDataFormat sets the optional data_format attribute to value.
-//
-// value: Specify the data format of the input and output data. With the
-// default format "NHWC", the data is stored in the order of:
-//     [batch, height, width, channels].
-// Alternatively, the format could be "NCHW", the data storage order of:
-//     [batch, channels, height, width].
-// If not specified, defaults to "NHWC"
-func DepthwiseConv2dNativeDataFormat(value string) DepthwiseConv2dNativeAttr {
+// FakeQuantWithMinMaxArgsGradientNumBits sets the optional num_bits attribute to value.
+// If not specified, defaults to 8
+func FakeQuantWithMinMaxArgsGradientNumBits(value int64) FakeQuantWithMinMaxArgsGradientAttr {
 	return func(m optionalAttr) {
-		m["data_format"] = value
+		m["num_bits"] = value
 	}
 }
 
-// Computes a 2-D depthwise convolution given 4-D `input` and `filter` tensors.
-//
-// Given an input tensor of shape `[batch, in_height, in_width, in_channels]`
-// and a filter / kernel tensor of shape
-// `[filter_height, filter_width, in_channels, channel_multiplier]`, containing
-// `in_channels` convolutional filters of depth 1, `depthwise_conv2d` applies
-// a different filter to each input channel (expanding from 1 channel to
-// `channel_multiplier` channels for each), then concatenates the results
-// together. Thus, the output has `in_channels * channel_multiplier` channels.
-//
-// for k in 0..in_channels-1
-//   for q in 0..channel_multiplier-1
-//     output[b, i, j, k * channel_multiplier + q] =
-//       sum_{di, dj} input[b, strides[1] * i + di, strides[2] * j + dj, k] *
-//                         filter[di, dj, k, q]
-//
-// Must have `strides[0] = strides[3] = 1`.  For the most common case of the same
-// horizontal and vertices strides, `strides = [1, stride, stride, 1]`.
+// Compute gradients for a FakeQuantWithMinMaxArgs operation.
 //
 // Arguments:
+//	gradients: Backpropagated gradients above the FakeQuantWithMinMaxArgs operation.
+//	inputs: Values passed as inputs to the FakeQuantWithMinMaxArgs operation.
 //
-//
-//	strides: 1-D of length 4.  The stride of the sliding window for each dimension
-// of `input`.
-//	padding: The type of padding algorithm to use.
-func DepthwiseConv2dNative(scope *Scope, input tf.Output, filter tf.Output, strides []int64, padding string, optional ...DepthwiseConv2dNativeAttr) (output tf.Output) {
+// Returns Backpropagated gradients below the FakeQuantWithMinMaxArgs operation:
+// `gradients * (inputs >= min && inputs <= max)`.
+func FakeQuantWithMinMaxArgsGradient(scope *Scope, gradients tf.Output, inputs tf.Output, optional ...FakeQuantWithMinMaxArgsGradientAttr) (backprops tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"strides": strides, "padding": padding}
+	attrs := map[string]interface{}{}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "DepthwiseConv2dNative",
+		Type: "FakeQuantWithMinMaxArgsGradient",
 		Input: []tf.Input{
-			input, filter,
+			gradients, inputs,
 		},
 		Attrs: attrs,
 	}
@@ -6327,74 +6241,28 @@ func DepthwiseConv2dNative(scope *Scope, input tf.Output, filter tf.Output, stri
 	return op.Output(0)
 }
 
-// CropAndResizeAttr is an optional argument to CropAndResize.
-type CropAndResizeAttr func(optionalAttr)
-
-// CropAndResizeMethod sets the optional method attribute to value.
-//
-// value: A string specifying the interpolation method. Only 'bilinear' is
-// supported for now.
-// If not specified, defaults to "bilinear"
-func CropAndResizeMethod(value string) CropAndResizeAttr {
-	return func(m optionalAttr) {
-		m["method"] = value
-	}
-}
-
-// CropAndResizeExtrapolationValue sets the optional extrapolation_value attribute to value.
-//
-// value: Value used for extrapolation, when applicable.
-// If not specified, defaults to 0
-func CropAndResizeExtrapolationValue(value float32) CropAndResizeAttr {
-	return func(m optionalAttr) {
-		m["extrapolation_value"] = value
-	}
-}
-
-// Extracts crops from the input image tensor and bilinearly resizes them (possibly
-//
-// with aspect ratio change) to a common output size specified by `crop_size`. This
-// is more general than the `crop_to_bounding_box` op which extracts a fixed size
-// slice from the input image and does not allow resizing or aspect ratio change.
-//
-// Returns a tensor with `crops` from the input `image` at positions defined at the
-// bounding box locations in `boxes`. The cropped boxes are all resized (with
-// bilinear interpolation) to a fixed `size = [crop_height, crop_width]`. The
-// result is a 4-D tensor `[num_boxes, crop_height, crop_width, depth]`.
+// Computes gradients of the maxpooling function.
 //
 // Arguments:
-//	image: A 4-D tensor of shape `[batch, image_height, image_width, depth]`.
-// Both `image_height` and `image_width` need to be positive.
-//	boxes: A 2-D tensor of shape `[num_boxes, 4]`. The `i`-th row of the tensor
-// specifies the coordinates of a box in the `box_ind[i]` image and is specified
-// in normalized coordinates `[y1, x1, y2, x2]`. A normalized coordinate value of
-// `y` is mapped to the image coordinate at `y * (image_height - 1)`, so as the
-// `[0, 1]` interval of normalized image height is mapped to
-// `[0, image_height - 1] in image height coordinates. We do allow y1 > y2, in
-// which case the sampled crop is an up-down flipped version of the original
-// image. The width dimension is treated similarly. Normalized coordinates
-// outside the `[0, 1]` range are allowed, in which case we use
-// `extrapolation_value` to extrapolate the input image values.
-//	box_ind: A 1-D tensor of shape `[num_boxes]` with int32 values in `[0, batch)`.
-// The value of `box_ind[i]` specifies the image that the `i`-th box refers to.
-//	crop_size: A 1-D tensor of 2 elements, `size = [crop_height, crop_width]`. All
-// cropped image patches are resized to this size. The aspect ratio of the image
-// content is not preserved. Both `crop_height` and `crop_width` need to be
-// positive.
+//	input: The original input.
+//	grad: 4-D with shape `[batch, height, width, channels]`.  Gradients w.r.t. the
+// output of `max_pool`.
+//	argmax: The indices of the maximum values chosen for each output of `max_pool`.
+//	ksize: The size of the window for each dimension of the input tensor.
+//	strides: The stride of the sliding window for each dimension of the
+// input tensor.
+//	padding: The type of padding algorithm to use.
 //
-// Returns A 4-D tensor of shape `[num_boxes, crop_height, crop_width, depth]`.
-func CropAndResize(scope *Scope, image tf.Output, boxes tf.Output, box_ind tf.Output, crop_size tf.Output, optional ...CropAndResizeAttr) (crops tf.Output) {
+// Returns Gradients w.r.t. the input of `max_pool`.
+func MaxPoolGradWithArgmax(scope *Scope, input tf.Output, grad tf.Output, argmax tf.Output, ksize []int64, strides []int64, padding string) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
+	attrs := map[string]interface{}{"ksize": ksize, "strides": strides, "padding": padding}
 	opspec := tf.OpSpec{
-		Type: "CropAndResize",
+		Type: "MaxPoolGradWithArgmax",
 		Input: []tf.Input{
-			image, boxes, box_ind, crop_size,
+			input, grad, argmax,
 		},
 		Attrs: attrs,
 	}
@@ -6402,47 +6270,28 @@ func CropAndResize(scope *Scope, image tf.Output, boxes tf.Output, box_ind tf.Ou
 	return op.Output(0)
 }
 
-// MaxPoolGradAttr is an optional argument to MaxPoolGrad.
-type MaxPoolGradAttr func(optionalAttr)
-
-// MaxPoolGradDataFormat sets the optional data_format attribute to value.
-//
-// value: Specify the data format of the input and output data. With the
-// default format "NHWC", the data is stored in the order of:
-//     [batch, in_height, in_width, in_channels].
-// Alternatively, the format could be "NCHW", the data storage order of:
-//     [batch, in_channels, in_height, in_width].
-// If not specified, defaults to "NHWC"
-func MaxPoolGradDataFormat(value string) MaxPoolGradAttr {
-	return func(m optionalAttr) {
-		m["data_format"] = value
-	}
-}
-
-// Computes gradients of the maxpooling function.
+// Computes the gradient of morphological 2-D dilation with respect to the filter.
 //
 // Arguments:
-//	orig_input: The original input tensor.
-//	orig_output: The original output tensor.
-//	grad: 4-D.  Gradients w.r.t. the output of `max_pool`.
-//	ksize: The size of the window for each dimension of the input tensor.
-//	strides: The stride of the sliding window for each dimension of the
-// input tensor.
+//	input: 4-D with shape `[batch, in_height, in_width, depth]`.
+//	filter: 3-D with shape `[filter_height, filter_width, depth]`.
+//	out_backprop: 4-D with shape `[batch, out_height, out_width, depth]`.
+//	strides: 1-D of length 4. The stride of the sliding window for each dimension of
+// the input tensor. Must be: `[1, stride_height, stride_width, 1]`.
+//	rates: 1-D of length 4. The input stride for atrous morphological dilation.
+// Must be: `[1, rate_height, rate_width, 1]`.
 //	padding: The type of padding algorithm to use.
 //
-// Returns Gradients w.r.t. the input to `max_pool`.
-func MaxPoolGrad(scope *Scope, orig_input tf.Output, orig_output tf.Output, grad tf.Output, ksize []int64, strides []int64, padding string, optional ...MaxPoolGradAttr) (output tf.Output) {
+// Returns 3-D with shape `[filter_height, filter_width, depth]`.
+func Dilation2DBackpropFilter(scope *Scope, input tf.Output, filter tf.Output, out_backprop tf.Output, strides []int64, rates []int64, padding string) (filter_backprop tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"ksize": ksize, "strides": strides, "padding": padding}
-	for _, a := range optional {
-		a(attrs)
-	}
+	attrs := map[string]interface{}{"strides": strides, "rates": rates, "padding": padding}
 	opspec := tf.OpSpec{
-		Type: "MaxPoolGrad",
+		Type: "Dilation2DBackpropFilter",
 		Input: []tf.Input{
-			orig_input, orig_output, grad,
+			input, filter, out_backprop,
 		},
 		Attrs: attrs,
 	}
@@ -6450,85 +6299,64 @@ func MaxPoolGrad(scope *Scope, orig_input tf.Output, orig_output tf.Output, grad
 	return op.Output(0)
 }
 
-// Adds `bias` to `value`.
-//
-// This is a deprecated version of BiasAdd and will be soon removed.
-//
-// This is a special case of `tf.add` where `bias` is restricted to be 1-D.
-// Broadcasting is supported, so `value` may have any number of dimensions.
-//
-// Arguments:
-//	value: Any number of dimensions.
-//	bias: 1-D with size the last dimension of `value`.
+// Returns the truth value of (x == y) element-wise.
 //
-// Returns Broadcasted sum of `value` and `bias`.
-func BiasAddV1(scope *Scope, value tf.Output, bias tf.Output) (output tf.Output) {
+// *NOTE*: `Equal` supports broadcasting. More about broadcasting
+// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+func Equal(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "BiasAddV1",
+		Type: "Equal",
 		Input: []tf.Input{
-			value, bias,
+			x, y,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Conv2DBackpropInputAttr is an optional argument to Conv2DBackpropInput.
-type Conv2DBackpropInputAttr func(optionalAttr)
-
-// Conv2DBackpropInputUseCudnnOnGpu sets the optional use_cudnn_on_gpu attribute to value.
-// If not specified, defaults to true
-func Conv2DBackpropInputUseCudnnOnGpu(value bool) Conv2DBackpropInputAttr {
-	return func(m optionalAttr) {
-		m["use_cudnn_on_gpu"] = value
-	}
-}
+// AvgPool3DAttr is an optional argument to AvgPool3D.
+type AvgPool3DAttr func(optionalAttr)
 
-// Conv2DBackpropInputDataFormat sets the optional data_format attribute to value.
+// AvgPool3DDataFormat sets the optional data_format attribute to value.
 //
-// value: Specify the data format of the input and output data. With the
-// default format "NHWC", the data is stored in the order of:
-//     [batch, in_height, in_width, in_channels].
-// Alternatively, the format could be "NCHW", the data storage order of:
-//     [batch, in_channels, in_height, in_width].
-// If not specified, defaults to "NHWC"
-func Conv2DBackpropInputDataFormat(value string) Conv2DBackpropInputAttr {
+// value: The data format of the input and output data. With the
+// default format "NDHWC", the data is stored in the order of:
+//     [batch, in_depth, in_height, in_width, in_channels].
+// Alternatively, the format could be "NCDHW", the data storage order is:
+//     [batch, in_channels, in_depth, in_height, in_width].
+// If not specified, defaults to "NDHWC"
+func AvgPool3DDataFormat(value string) AvgPool3DAttr {
 	return func(m optionalAttr) {
 		m["data_format"] = value
 	}
 }
 
-// Computes the gradients of convolution with respect to the input.
+// Performs 3D average pooling on the input.
 //
 // Arguments:
-//	input_sizes: An integer vector representing the shape of `input`,
-// where `input` is a 4-D `[batch, height, width, channels]` tensor.
-//	filter: 4-D with shape
-// `[filter_height, filter_width, in_channels, out_channels]`.
-//	out_backprop: 4-D with shape `[batch, out_height, out_width, out_channels]`.
-// Gradients w.r.t. the output of the convolution.
-//	strides: The stride of the sliding window for each dimension of the input
-// of the convolution. Must be in the same order as the dimension specified with
-// format.
+//	input: Shape `[batch, depth, rows, cols, channels]` tensor to pool over.
+//	ksize: 1-D tensor of length 5. The size of the window for each dimension of
+// the input tensor. Must have `ksize[0] = ksize[4] = 1`.
+//	strides: 1-D tensor of length 5. The stride of the sliding window for each
+// dimension of `input`. Must have `strides[0] = strides[4] = 1`.
 //	padding: The type of padding algorithm to use.
 //
-// Returns 4-D with shape `[batch, in_height, in_width, in_channels]`.  Gradient
-// w.r.t. the input of the convolution.
-func Conv2DBackpropInput(scope *Scope, input_sizes tf.Output, filter tf.Output, out_backprop tf.Output, strides []int64, padding string, optional ...Conv2DBackpropInputAttr) (output tf.Output) {
+// Returns The average pooled output tensor.
+func AvgPool3D(scope *Scope, input tf.Output, ksize []int64, strides []int64, padding string, optional ...AvgPool3DAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"strides": strides, "padding": padding}
+	attrs := map[string]interface{}{"ksize": ksize, "strides": strides, "padding": padding}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "Conv2DBackpropInput",
+		Type: "AvgPool3D",
 		Input: []tf.Input{
-			input_sizes, filter, out_backprop,
+			input,
 		},
 		Attrs: attrs,
 	}
@@ -6536,124 +6364,77 @@ func Conv2DBackpropInput(scope *Scope, input_sizes tf.Output, filter tf.Output,
 	return op.Output(0)
 }
 
-// FusedBatchNormAttr is an optional argument to FusedBatchNorm.
-type FusedBatchNormAttr func(optionalAttr)
-
-// FusedBatchNormEpsilon sets the optional epsilon attribute to value.
-//
-// value: A small float number added to the variance of x.
-// If not specified, defaults to 0.0001
-func FusedBatchNormEpsilon(value float32) FusedBatchNormAttr {
-	return func(m optionalAttr) {
-		m["epsilon"] = value
-	}
-}
-
-// FusedBatchNormDataFormat sets the optional data_format attribute to value.
-//
-// value: The data format for x and y. Either "NHWC" (default) or "NCHW".
-// If not specified, defaults to "NHWC"
-func FusedBatchNormDataFormat(value string) FusedBatchNormAttr {
-	return func(m optionalAttr) {
-		m["data_format"] = value
-	}
-}
-
-// FusedBatchNormIsTraining sets the optional is_training attribute to value.
-//
-// value: A bool value to indicate the operation is for training (default)
-// or inference.
-// If not specified, defaults to true
-func FusedBatchNormIsTraining(value bool) FusedBatchNormAttr {
-	return func(m optionalAttr) {
-		m["is_training"] = value
-	}
-}
-
-// Batch normalization.
-//
-// Note that the size of 4D Tensors are defined by either "NHWC" or "NCHW".
-// The size of 1D Tensors matches the dimension C of the 4D Tensors.
+// Produces the max pool of the input tensor for quantized types.
 //
 // Arguments:
-//	x: A 4D Tensor for input data.
-//	scale: A 1D Tensor for scaling factor, to scale the normalized x.
-//	offset: A 1D Tensor for offset, to shift to the normalized x.
-//	mean: A 1D Tensor for population mean. Used for inference only;
-// must be empty for training.
-//	variance: A 1D Tensor for population variance. Used for inference only;
-// must be empty for training.
+//	input: The 4D (batch x rows x cols x depth) Tensor to MaxReduce over.
+//	min_input: The float value that the lowest quantized input value represents.
+//	max_input: The float value that the highest quantized input value represents.
+//	ksize: The size of the window for each dimension of the input tensor.
+// The length must be 4 to match the number of dimensions of the input.
+//	strides: The stride of the sliding window for each dimension of the input
+// tensor. The length must be 4 to match the number of dimensions of the input.
+//	padding: The type of padding algorithm to use.
 //
-// Returns A 4D Tensor for output data.A 1D Tensor for the computed batch mean, to be used by TensorFlow
-// to compute the running mean.A 1D Tensor for the computed batch variance, to be used by
-// TensorFlow to compute the running variance.A 1D Tensor for the computed batch mean, to be reused
-// in the gradient computation.A 1D Tensor for the computed batch variance (inverted variance
-// in the cuDNN case), to be used in the gradient computation.
-func FusedBatchNorm(scope *Scope, x tf.Output, scale tf.Output, offset tf.Output, mean tf.Output, variance tf.Output, optional ...FusedBatchNormAttr) (y tf.Output, batch_mean tf.Output, batch_variance tf.Output, reserve_space_1 tf.Output, reserve_space_2 tf.Output) {
+// Returns The float value that the lowest quantized output value represents.The float value that the highest quantized output value represents.
+func QuantizedMaxPool(scope *Scope, input tf.Output, min_input tf.Output, max_input tf.Output, ksize []int64, strides []int64, padding string) (output tf.Output, min_output tf.Output, max_output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
+	attrs := map[string]interface{}{"ksize": ksize, "strides": strides, "padding": padding}
 	opspec := tf.OpSpec{
-		Type: "FusedBatchNorm",
+		Type: "QuantizedMaxPool",
 		Input: []tf.Input{
-			x, scale, offset, mean, variance,
+			input, min_input, max_input,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2), op.Output(3), op.Output(4)
+	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// RandomStandardNormalAttr is an optional argument to RandomStandardNormal.
-type RandomStandardNormalAttr func(optionalAttr)
-
-// RandomStandardNormalSeed sets the optional seed attribute to value.
-//
-// value: If either `seed` or `seed2` are set to be non-zero, the random number
-// generator is seeded by the given seed.  Otherwise, it is seeded by a
-// random seed.
-// If not specified, defaults to 0
-func RandomStandardNormalSeed(value int64) RandomStandardNormalAttr {
-	return func(m optionalAttr) {
-		m["seed"] = value
-	}
-}
+// Conv3DBackpropInputV2Attr is an optional argument to Conv3DBackpropInputV2.
+type Conv3DBackpropInputV2Attr func(optionalAttr)
 
-// RandomStandardNormalSeed2 sets the optional seed2 attribute to value.
+// Conv3DBackpropInputV2DataFormat sets the optional data_format attribute to value.
 //
-// value: A second seed to avoid seed collision.
-// If not specified, defaults to 0
-func RandomStandardNormalSeed2(value int64) RandomStandardNormalAttr {
+// value: The data format of the input and output data. With the
+// default format "NDHWC", the data is stored in the order of:
+//     [batch, in_depth, in_height, in_width, in_channels].
+// Alternatively, the format could be "NCDHW", the data storage order is:
+//     [batch, in_channels, in_depth, in_height, in_width].
+// If not specified, defaults to "NDHWC"
+func Conv3DBackpropInputV2DataFormat(value string) Conv3DBackpropInputV2Attr {
 	return func(m optionalAttr) {
-		m["seed2"] = value
+		m["data_format"] = value
 	}
 }
 
-// Outputs random values from a normal distribution.
-//
-// The generated values will have mean 0 and standard deviation 1.
+// Computes the gradients of 3-D convolution with respect to the input.
 //
 // Arguments:
-//	shape: The shape of the output tensor.
-//	dtype: The type of the output.
-//
-// Returns A tensor of the specified shape filled with random normal values.
-func RandomStandardNormal(scope *Scope, shape tf.Output, dtype tf.DataType, optional ...RandomStandardNormalAttr) (output tf.Output) {
+//	input_sizes: An integer vector representing the tensor shape of `input`,
+// where `input` is a 5-D
+// `[batch, depth, rows, cols, in_channels]` tensor.
+//	filter: Shape `[depth, rows, cols, in_channels, out_channels]`.
+// `in_channels` must match between `input` and `filter`.
+//	out_backprop: Backprop signal of shape `[batch, out_depth, out_rows, out_cols,
+// out_channels]`.
+//	strides: 1-D tensor of length 5. The stride of the sliding window for each
+// dimension of `input`. Must have `strides[0] = strides[4] = 1`.
+//	padding: The type of padding algorithm to use.
+func Conv3DBackpropInputV2(scope *Scope, input_sizes tf.Output, filter tf.Output, out_backprop tf.Output, strides []int64, padding string, optional ...Conv3DBackpropInputV2Attr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"dtype": dtype}
+	attrs := map[string]interface{}{"strides": strides, "padding": padding}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "RandomStandardNormal",
+		Type: "Conv3DBackpropInputV2",
 		Input: []tf.Input{
-			shape,
+			input_sizes, filter, out_backprop,
 		},
 		Attrs: attrs,
 	}
@@ -6661,15 +6442,18 @@ func RandomStandardNormal(scope *Scope, shape tf.Output, dtype tf.DataType, opti
 	return op.Output(0)
 }
 
-// Computes sigmoid of `x` element-wise.
+// Returns a tensor of ones with the same shape and type as x.
 //
-// Specifically, `y = 1 / (1 + exp(-x))`.
-func Sigmoid(scope *Scope, x tf.Output) (y tf.Output) {
+// Arguments:
+//	x: a tensor of type T.
+//
+// Returns a tensor of the same shape and type as x but filled with ones.
+func OnesLike(scope *Scope, x tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Sigmoid",
+		Type: "OnesLike",
 		Input: []tf.Input{
 			x,
 		},
@@ -6678,105 +6462,75 @@ func Sigmoid(scope *Scope, x tf.Output) (y tf.Output) {
 	return op.Output(0)
 }
 
-// ComputeAccidentalHitsAttr is an optional argument to ComputeAccidentalHits.
-type ComputeAccidentalHitsAttr func(optionalAttr)
-
-// ComputeAccidentalHitsSeed sets the optional seed attribute to value.
+// Returns element-wise remainder of division.
 //
-// value: If either seed or seed2 are set to be non-zero, the random number
-// generator is seeded by the given seed.  Otherwise, it is seeded by a
-// random seed.
-// If not specified, defaults to 0
-func ComputeAccidentalHitsSeed(value int64) ComputeAccidentalHitsAttr {
-	return func(m optionalAttr) {
-		m["seed"] = value
+// *NOTE*: `Mod` supports broadcasting. More about broadcasting
+// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+func Mod(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+	if scope.Err() != nil {
+		return
 	}
-}
-
-// ComputeAccidentalHitsSeed2 sets the optional seed2 attribute to value.
-//
-// value: An second seed to avoid seed collision.
-// If not specified, defaults to 0
-func ComputeAccidentalHitsSeed2(value int64) ComputeAccidentalHitsAttr {
-	return func(m optionalAttr) {
-		m["seed2"] = value
+	opspec := tf.OpSpec{
+		Type: "Mod",
+		Input: []tf.Input{
+			x, y,
+		},
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// Computes the ids of the positions in sampled_candidates that match true_labels.
+// Computes the gradients of 3-D convolution with respect to the filter.
 //
-// When doing log-odds NCE, the result of this op should be passed through a
-// SparseToDense op, then added to the logits of the sampled candidates. This has
-// the effect of 'removing' the sampled labels that match the true labels by
-// making the classifier sure that they are sampled labels.
+// DEPRECATED at GraphDef version 10: Use Conv3DBackpropFilterV2
 //
 // Arguments:
-//	true_classes: The true_classes output of UnpackSparseLabels.
-//	sampled_candidates: The sampled_candidates output of CandidateSampler.
-//	num_true: Number of true labels per context.
-//
-// Returns A vector of indices corresponding to rows of true_candidates.A vector of IDs of positions in sampled_candidates that match a true_label
-// for the row with the corresponding index in indices.A vector of the same length as indices and ids, in which each element
-// is -FLOAT_MAX.
-func ComputeAccidentalHits(scope *Scope, true_classes tf.Output, sampled_candidates tf.Output, num_true int64, optional ...ComputeAccidentalHitsAttr) (indices tf.Output, ids tf.Output, weights tf.Output) {
+//	input: Shape `[batch, depth, rows, cols, in_channels]`.
+//	filter: Shape `[depth, rows, cols, in_channels, out_channels]`.
+// `in_channels` must match between `input` and `filter`.
+//	out_backprop: Backprop signal of shape `[batch, out_depth, out_rows, out_cols,
+// out_channels]`.
+//	strides: 1-D tensor of length 5. The stride of the sliding window for each
+// dimension of `input`. Must have `strides[0] = strides[4] = 1`.
+//	padding: The type of padding algorithm to use.
+func Conv3DBackpropFilter(scope *Scope, input tf.Output, filter tf.Output, out_backprop tf.Output, strides []int64, padding string) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"num_true": num_true}
-	for _, a := range optional {
-		a(attrs)
-	}
+	attrs := map[string]interface{}{"strides": strides, "padding": padding}
 	opspec := tf.OpSpec{
-		Type: "ComputeAccidentalHits",
+		Type: "Conv3DBackpropFilter",
 		Input: []tf.Input{
-			true_classes, sampled_candidates,
+			input, filter, out_backprop,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
+	return op.Output(0)
 }
 
-// AvgPoolGradAttr is an optional argument to AvgPoolGrad.
-type AvgPoolGradAttr func(optionalAttr)
-
-// AvgPoolGradDataFormat sets the optional data_format attribute to value.
+// Computes the gradients of 3-D convolution with respect to the input.
 //
-// value: Specify the data format of the input and output data. With the
-// default format "NHWC", the data is stored in the order of:
-//     [batch, in_height, in_width, in_channels].
-// Alternatively, the format could be "NCHW", the data storage order of:
-//     [batch, in_channels, in_height, in_width].
-// If not specified, defaults to "NHWC"
-func AvgPoolGradDataFormat(value string) AvgPoolGradAttr {
-	return func(m optionalAttr) {
-		m["data_format"] = value
-	}
-}
-
-// Computes gradients of the average pooling function.
+// DEPRECATED at GraphDef version 10: Use Conv3DBackpropInputV2
 //
 // Arguments:
-//	orig_input_shape: 1-D.  Shape of the original input to `avg_pool`.
-//	grad: 4-D with shape `[batch, height, width, channels]`.  Gradients w.r.t.
-// the output of `avg_pool`.
-//	ksize: The size of the sliding window for each dimension of the input.
-//	strides: The stride of the sliding window for each dimension of the input.
+//	input: Shape `[batch, depth, rows, cols, in_channels]`.
+//	filter: Shape `[depth, rows, cols, in_channels, out_channels]`.
+// `in_channels` must match between `input` and `filter`.
+//	out_backprop: Backprop signal of shape `[batch, out_depth, out_rows, out_cols,
+// out_channels]`.
+//	strides: 1-D tensor of length 5. The stride of the sliding window for each
+// dimension of `input`. Must have `strides[0] = strides[4] = 1`.
 //	padding: The type of padding algorithm to use.
-//
-// Returns 4-D.  Gradients w.r.t. the input of `avg_pool`.
-func AvgPoolGrad(scope *Scope, orig_input_shape tf.Output, grad tf.Output, ksize []int64, strides []int64, padding string, optional ...AvgPoolGradAttr) (output tf.Output) {
+func Conv3DBackpropInput(scope *Scope, input tf.Output, filter tf.Output, out_backprop tf.Output, strides []int64, padding string) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"ksize": ksize, "strides": strides, "padding": padding}
-	for _, a := range optional {
-		a(attrs)
-	}
+	attrs := map[string]interface{}{"strides": strides, "padding": padding}
 	opspec := tf.OpSpec{
-		Type: "AvgPoolGrad",
+		Type: "Conv3DBackpropInput",
 		Input: []tf.Input{
-			orig_input_shape, grad,
+			input, filter, out_backprop,
 		},
 		Attrs: attrs,
 	}
@@ -6784,155 +6538,134 @@ func AvgPoolGrad(scope *Scope, orig_input_shape tf.Output, grad tf.Output, ksize
 	return op.Output(0)
 }
 
-// Computes the maximum along segments of a tensor.
-//
-// Read @{$math_ops#segmentation$the section on segmentation} for an explanation of
-// segments.
+// ReverseSequenceAttr is an optional argument to ReverseSequence.
+type ReverseSequenceAttr func(optionalAttr)
+
+// ReverseSequenceBatchDim sets the optional batch_dim attribute to value.
 //
-// Computes a tensor such that
-// \\(output_i = \max_j(data_j)\\) where `max` is over `j` such
-// that `segment_ids[j] == i`.
+// value: The dimension along which reversal is performed.
+// If not specified, defaults to 0
+func ReverseSequenceBatchDim(value int64) ReverseSequenceAttr {
+	return func(m optionalAttr) {
+		m["batch_dim"] = value
+	}
+}
+
+// Reverses variable length slices.
 //
-// If the max is empty for a given segment ID `i`, `output[i] = 0`.
+// This op first slices `input` along the dimension `batch_dim`, and for each
+// slice `i`, reverses the first `seq_lengths[i]` elements along
+// the dimension `seq_dim`.
 //
-// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
-// <img style="width:100%" src="https://www.tensorflow.org/images/SegmentMax.png" alt>
-// </div>
+// The elements of `seq_lengths` must obey `seq_lengths[i] <= input.dims[seq_dim]`,
+// and `seq_lengths` must be a vector of length `input.dims[batch_dim]`.
 //
-// Arguments:
+// The output slice `i` along dimension `batch_dim` is then given by input
+// slice `i`, with the first `seq_lengths[i]` slices along dimension
+// `seq_dim` reversed.
 //
-//	segment_ids: A 1-D tensor whose rank is equal to the rank of `data`'s
-// first dimension.  Values should be sorted and can be repeated.
+// For example:
 //
-// Returns Has same shape as data, except for dimension 0 which
-// has size `k`, the number of segments.
-func SegmentMax(scope *Scope, data tf.Output, segment_ids tf.Output) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "SegmentMax",
-		Input: []tf.Input{
-			data, segment_ids,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Saves input tensors slices to disk.
+// ```
+// # Given this:
+// batch_dim = 0
+// seq_dim = 1
+// input.dims = (4, 8, ...)
+// seq_lengths = [7, 2, 3, 5]
 //
-// This is like `Save` except that tensors can be listed in the saved file as being
-// a slice of a larger tensor.  `shapes_and_slices` specifies the shape of the
-// larger tensor and the slice that this tensor covers. `shapes_and_slices` must
-// have as many elements as `tensor_names`.
+// # then slices of input are reversed on seq_dim, but only up to seq_lengths:
+// output[0, 0:7, :, ...] = input[0, 7:0:-1, :, ...]
+// output[1, 0:2, :, ...] = input[1, 2:0:-1, :, ...]
+// output[2, 0:3, :, ...] = input[2, 3:0:-1, :, ...]
+// output[3, 0:5, :, ...] = input[3, 5:0:-1, :, ...]
 //
-// Elements of the `shapes_and_slices` input must either be:
+// # while entries past seq_lens are copied through:
+// output[0, 7:, :, ...] = input[0, 7:, :, ...]
+// output[1, 2:, :, ...] = input[1, 2:, :, ...]
+// output[2, 3:, :, ...] = input[2, 3:, :, ...]
+// output[3, 2:, :, ...] = input[3, 2:, :, ...]
+// ```
 //
-// *  The empty string, in which case the corresponding tensor is
-//    saved normally.
-// *  A string of the form `dim0 dim1 ... dimN-1 slice-spec` where the
-//    `dimI` are the dimensions of the larger tensor and `slice-spec`
-//    specifies what part is covered by the tensor to save.
+// In contrast, if:
 //
-// `slice-spec` itself is a `:`-separated list: `slice0:slice1:...:sliceN-1`
-// where each `sliceI` is either:
+// ```
+// # Given this:
+// batch_dim = 2
+// seq_dim = 0
+// input.dims = (8, ?, 4, ...)
+// seq_lengths = [7, 2, 3, 5]
 //
-// *  The string `-` meaning that the slice covers all indices of this dimension
-// *  `start,length` where `start` and `length` are integers.  In that
-//    case the slice covers `length` indices starting at `start`.
+// # then slices of input are reversed on seq_dim, but only up to seq_lengths:
+// output[0:7, :, 0, :, ...] = input[7:0:-1, :, 0, :, ...]
+// output[0:2, :, 1, :, ...] = input[2:0:-1, :, 1, :, ...]
+// output[0:3, :, 2, :, ...] = input[3:0:-1, :, 2, :, ...]
+// output[0:5, :, 3, :, ...] = input[5:0:-1, :, 3, :, ...]
 //
-// See also `Save`.
+// # while entries past seq_lens are copied through:
+// output[7:, :, 0, :, ...] = input[7:, :, 0, :, ...]
+// output[2:, :, 1, :, ...] = input[2:, :, 1, :, ...]
+// output[3:, :, 2, :, ...] = input[3:, :, 2, :, ...]
+// output[2:, :, 3, :, ...] = input[2:, :, 3, :, ...]
+// ```
 //
 // Arguments:
-//	filename: Must have a single element. The name of the file to which we write the
-// tensor.
-//	tensor_names: Shape `[N]`. The names of the tensors to be saved.
-//	shapes_and_slices: Shape `[N]`.  The shapes and slice specifications to use when
-// saving the tensors.
-//	data: `N` tensors to save.
+//	input: The input to reverse.
+//	seq_lengths: 1-D with length `input.dims(batch_dim)` and
+// `max(seq_lengths) <= input.dims(seq_dim)`
+//	seq_dim: The dimension which is partially reversed.
 //
-// Returns the created operation.
-func SaveSlices(scope *Scope, filename tf.Output, tensor_names tf.Output, shapes_and_slices tf.Output, data []tf.Output) (o *tf.Operation) {
+// Returns The partially reversed input. It has the same shape as `input`.
+func ReverseSequence(scope *Scope, input tf.Output, seq_lengths tf.Output, seq_dim int64, optional ...ReverseSequenceAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"seq_dim": seq_dim}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "SaveSlices",
+		Type: "ReverseSequence",
 		Input: []tf.Input{
-			filename, tensor_names, shapes_and_slices, tf.OutputList(data),
+			input, seq_lengths,
 		},
+		Attrs: attrs,
 	}
-	return scope.AddOperation(opspec)
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// Returns the rank of a tensor.
-//
-// This operation returns an integer representing the rank of `input`.
-//
-// For example:
-//
-// ```prettyprint
-// # 't' is [[[1, 1, 1], [2, 2, 2]], [[3, 3, 3], [4, 4, 4]]]
-// # shape of tensor 't' is [2, 2, 3]
-// rank(t) ==> 3
-// ```
+// Computes the gradient for the rsqrt of `x` wrt its input.
 //
-// **Note**: The rank of a tensor is not the same as the rank of a matrix. The rank
-// of a tensor is the number of indices required to uniquely select each element
-// of the tensor. Rank is also known as "order", "degree", or "ndims."
-func Rank(scope *Scope, input tf.Output) (output tf.Output) {
+// Specifically, `grad = dy * -0.5 * y^3`, where `y = rsqrt(x)`, and `dy`
+// is the corresponding input gradient.
+func RsqrtGrad(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Rank",
+		Type: "RsqrtGrad",
 		Input: []tf.Input{
-			input,
+			x, y,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// DecodeCSVAttr is an optional argument to DecodeCSV.
-type DecodeCSVAttr func(optionalAttr)
-
-// DecodeCSVFieldDelim sets the optional field_delim attribute to value.
-//
-// value: delimiter to separate fields in a record.
-// If not specified, defaults to ","
-func DecodeCSVFieldDelim(value string) DecodeCSVAttr {
-	return func(m optionalAttr) {
-		m["field_delim"] = value
-	}
-}
-
-// Convert CSV records to tensors. Each column maps to one tensor.
-//
-// RFC 4180 format is expected for the CSV records.
-// (https://tools.ietf.org/html/rfc4180)
-// Note that we allow leading and trailing spaces with int or float field.
-//
-// Arguments:
-//	records: Each string is a record/row in the csv and all records should have
-// the same format.
-//	record_defaults: One tensor per column of the input record, with either a
-// scalar default value for that column or empty if the column is required.
+// Execute a sub graph on a remote processor transferred by GraphTransferer.
 //
-// Returns Each tensor will have the same shape as records.
-func DecodeCSV(scope *Scope, records tf.Output, record_defaults []tf.Output, optional ...DecodeCSVAttr) (output []tf.Output) {
+// The graph specifications are serialized by protobuf as graph_transfer_info.
+// The implementation / limitations may differ for each platform
+// and each available peripheral.
+func RemoteFusedGraphExecute(scope *Scope, inputs []tf.Output, Toutputs []tf.DataType, serialized_remote_fused_graph_execute_info string) (outputs []tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
+	attrs := map[string]interface{}{"Toutputs": Toutputs, "serialized_remote_fused_graph_execute_info": serialized_remote_fused_graph_execute_info}
 	opspec := tf.OpSpec{
-		Type: "DecodeCSV",
+		Type: "RemoteFusedGraphExecute",
 		Input: []tf.Input{
-			records, tf.OutputList(record_defaults),
+			tf.OutputList(inputs),
 		},
 		Attrs: attrs,
 	}
@@ -6942,54 +6675,55 @@ func DecodeCSV(scope *Scope, records tf.Output, record_defaults []tf.Output, opt
 	}
 	var idx int
 	var err error
-	if output, idx, err = makeOutputList(op, idx, "output"); err != nil {
-		scope.UpdateErr("DecodeCSV", err)
+	if outputs, idx, err = makeOutputList(op, idx, "outputs"); err != nil {
+		scope.UpdateErr("RemoteFusedGraphExecute", err)
 		return
 	}
-	return output
+	return outputs
 }
 
-// BiasAddGradAttr is an optional argument to BiasAddGrad.
-type BiasAddGradAttr func(optionalAttr)
+// Conv3DBackpropFilterV2Attr is an optional argument to Conv3DBackpropFilterV2.
+type Conv3DBackpropFilterV2Attr func(optionalAttr)
 
-// BiasAddGradDataFormat sets the optional data_format attribute to value.
+// Conv3DBackpropFilterV2DataFormat sets the optional data_format attribute to value.
 //
-// value: Specify the data format of the input and output data. With the
-// default format "NHWC", the bias tensor will be added to the last dimension
-// of the value tensor.
-// Alternatively, the format could be "NCHW", the data storage order of:
-//     [batch, in_channels, in_height, in_width].
-// The tensor will be added to "in_channels", the third-to-the-last
-//     dimension.
-// If not specified, defaults to "NHWC"
-func BiasAddGradDataFormat(value string) BiasAddGradAttr {
+// value: The data format of the input and output data. With the
+// default format "NDHWC", the data is stored in the order of:
+//     [batch, in_depth, in_height, in_width, in_channels].
+// Alternatively, the format could be "NCDHW", the data storage order is:
+//     [batch, in_channels, in_depth, in_height, in_width].
+// If not specified, defaults to "NDHWC"
+func Conv3DBackpropFilterV2DataFormat(value string) Conv3DBackpropFilterV2Attr {
 	return func(m optionalAttr) {
 		m["data_format"] = value
 	}
 }
 
-// The backward operation for "BiasAdd" on the "bias" tensor.
-//
-// It accumulates all the values from out_backprop into the feature dimension.
-// For NHWC data format, the feature dimension is the last. For NCHW data format,
-// the feature dimension is the third-to-last.
+// Computes the gradients of 3-D convolution with respect to the filter.
 //
 // Arguments:
-//	out_backprop: Any number of dimensions.
-//
-// Returns 1-D with size the feature dimension of `out_backprop`.
-func BiasAddGrad(scope *Scope, out_backprop tf.Output, optional ...BiasAddGradAttr) (output tf.Output) {
+//	input: Shape `[batch, depth, rows, cols, in_channels]`.
+//	filter_sizes: An integer vector representing the tensor shape of `filter`,
+// where `filter` is a 5-D
+// `[filter_depth, filter_height, filter_width, in_channels, out_channels]`
+// tensor.
+//	out_backprop: Backprop signal of shape `[batch, out_depth, out_rows, out_cols,
+// out_channels]`.
+//	strides: 1-D tensor of length 5. The stride of the sliding window for each
+// dimension of `input`. Must have `strides[0] = strides[4] = 1`.
+//	padding: The type of padding algorithm to use.
+func Conv3DBackpropFilterV2(scope *Scope, input tf.Output, filter_sizes tf.Output, out_backprop tf.Output, strides []int64, padding string, optional ...Conv3DBackpropFilterV2Attr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"strides": strides, "padding": padding}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "BiasAddGrad",
+		Type: "Conv3DBackpropFilterV2",
 		Input: []tf.Input{
-			out_backprop,
+			input, filter_sizes, out_backprop,
 		},
 		Attrs: attrs,
 	}
@@ -6997,174 +6731,175 @@ func BiasAddGrad(scope *Scope, out_backprop tf.Output, optional ...BiasAddGradAt
 	return op.Output(0)
 }
 
-// Convert JSON-encoded Example records to binary protocol buffer strings.
-//
-// This op translates a tensor containing Example records, encoded using
-// the [standard JSON
-// mapping](https://developers.google.com/protocol-buffers/docs/proto3#json),
-// into a tensor containing the same records encoded as binary protocol
-// buffers. The resulting tensor can then be fed to any of the other
-// Example-parsing ops.
-//
-// Arguments:
-//	json_examples: Each string is a JSON object serialized according to the JSON
-// mapping of the Example proto.
-//
-// Returns Each string is a binary Example protocol buffer corresponding
-// to the respective element of `json_examples`.
-func DecodeJSONExample(scope *Scope, json_examples tf.Output) (binary_examples tf.Output) {
-	if scope.Err() != nil {
+// TensorArrayV2Attr is an optional argument to TensorArrayV2.
+type TensorArrayV2Attr func(optionalAttr)
+
+// TensorArrayV2ElementShape sets the optional element_shape attribute to value.
+// If not specified, defaults to <unknown_rank:true >
+func TensorArrayV2ElementShape(value tf.Shape) TensorArrayV2Attr {
+	return func(m optionalAttr) {
+		m["element_shape"] = value
+	}
+}
+
+// TensorArrayV2DynamicSize sets the optional dynamic_size attribute to value.
+// If not specified, defaults to false
+func TensorArrayV2DynamicSize(value bool) TensorArrayV2Attr {
+	return func(m optionalAttr) {
+		m["dynamic_size"] = value
+	}
+}
+
+// TensorArrayV2ClearAfterRead sets the optional clear_after_read attribute to value.
+// If not specified, defaults to true
+func TensorArrayV2ClearAfterRead(value bool) TensorArrayV2Attr {
+	return func(m optionalAttr) {
+		m["clear_after_read"] = value
+	}
+}
+
+// TensorArrayV2TensorArrayName sets the optional tensor_array_name attribute to value.
+// If not specified, defaults to ""
+func TensorArrayV2TensorArrayName(value string) TensorArrayV2Attr {
+	return func(m optionalAttr) {
+		m["tensor_array_name"] = value
+	}
+}
+
+// Deprecated. Use TensorArrayV3
+func TensorArrayV2(scope *Scope, size tf.Output, dtype tf.DataType, optional ...TensorArrayV2Attr) (handle tf.Output) {
+	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"dtype": dtype}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "DecodeJSONExample",
+		Type: "TensorArrayV2",
 		Input: []tf.Input{
-			json_examples,
+			size,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Transforms a serialized tensorflow.TensorProto proto into a Tensor.
+// Serialize an `N`-minibatch `SparseTensor` into an `[N, 3]` string `Tensor`.
 //
-// Arguments:
-//	serialized: A scalar string containing a serialized TensorProto proto.
-//	out_type: The type of the serialized tensor.  The provided type must match the
-// type of the serialized tensor and no implicit conversion will take place.
+// The `SparseTensor` must have rank `R` greater than 1, and the first dimension
+// is treated as the minibatch dimension.  Elements of the `SparseTensor`
+// must be sorted in increasing order of this first dimension.  The serialized
+// `SparseTensor` objects going into each row of `serialized_sparse` will have
+// rank `R-1`.
 //
-// Returns A Tensor of type `out_type`.
-func ParseTensor(scope *Scope, serialized tf.Output, out_type tf.DataType) (output tf.Output) {
+// The minibatch size `N` is extracted from `sparse_shape[0]`.
+//
+// Arguments:
+//	sparse_indices: 2-D.  The `indices` of the minibatch `SparseTensor`.
+//	sparse_values: 1-D.  The `values` of the minibatch `SparseTensor`.
+//	sparse_shape: 1-D.  The `shape` of the minibatch `SparseTensor`.
+func SerializeManySparse(scope *Scope, sparse_indices tf.Output, sparse_values tf.Output, sparse_shape tf.Output) (serialized_sparse tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"out_type": out_type}
 	opspec := tf.OpSpec{
-		Type: "ParseTensor",
+		Type: "SerializeManySparse",
 		Input: []tf.Input{
-			serialized,
+			sparse_indices, sparse_values, sparse_shape,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Computes acos of x element-wise.
-func Acos(scope *Scope, x tf.Output) (y tf.Output) {
+// Makes its input available to the next iteration.
+//
+// Arguments:
+//	data: The tensor to be made available to the next iteration.
+//
+// Returns The same tensor as `data`.
+func NextIteration(scope *Scope, data tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Acos",
+		Type: "NextIteration",
 		Input: []tf.Input{
-			x,
+			data,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Copy a tensor setting everything outside a central band in each innermost matrix
-//
-// to zero.
-//
-// The `band` part is computed as follows:
-// Assume `input` has `k` dimensions `[I, J, K, ..., M, N]`, then the output is a
-// tensor with the same shape where
-//
-// `band[i, j, k, ..., m, n] = in_band(m, n) * input[i, j, k, ..., m, n]`.
-//
-// The indicator function
-//
-// `in_band(m, n) = (num_lower < 0 || (m-n) <= num_lower)) &&
-//                  (num_upper < 0 || (n-m) <= num_upper)`.
-//
-// For example:
-//
-// ```prettyprint
-// # if 'input' is [[ 0,  1,  2, 3]
-//                  [-1,  0,  1, 2]
-//                  [-2, -1,  0, 1]
-//                  [-3, -2, -1, 0]],
-//
-// tf.matrix_band_part(input, 1, -1) ==> [[ 0,  1,  2, 3]
-//                                        [-1,  0,  1, 2]
-//                                        [ 0, -1,  0, 1]
-//                                        [ 0,  0, -1, 0]],
-//
-// tf.matrix_band_part(input, 2, 1) ==> [[ 0,  1,  0, 0]
-//                                       [-1,  0,  1, 0]
-//                                       [-2, -1,  0, 1]
-//                                       [ 0, -2, -1, 0]]
-// ```
-//
-// Useful special cases:
-//
-// ```prettyprint
-//  tf.matrix_band_part(input, 0, -1) ==> Upper triangular part.
-//  tf.matrix_band_part(input, -1, 0) ==> Lower triangular part.
-//  tf.matrix_band_part(input, 0, 0) ==> Diagonal.
-// ```
-//
-// Arguments:
-//	input: Rank `k` tensor.
-//	num_lower: 0-D tensor. Number of subdiagonals to keep. If negative, keep entire
-// lower triangle.
-//	num_upper: 0-D tensor. Number of superdiagonals to keep. If negative, keep
-// entire upper triangle.
+// Does nothing. Only useful as a placeholder for control edges.
 //
-// Returns Rank `k` tensor of the same shape as input. The extracted banded tensor.
-func MatrixBandPart(scope *Scope, input tf.Output, num_lower tf.Output, num_upper tf.Output) (band tf.Output) {
+// Returns the created operation.
+func NoOp(scope *Scope) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "MatrixBandPart",
+		Type: "NoOp",
+	}
+	return scope.AddOperation(opspec)
+}
+
+// Computes softsign: `features / (abs(features) + 1)`.
+func Softsign(scope *Scope, features tf.Output) (activations tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "Softsign",
 		Input: []tf.Input{
-			input, num_lower, num_upper,
+			features,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// DecodeRawAttr is an optional argument to DecodeRaw.
-type DecodeRawAttr func(optionalAttr)
+// ResizeBilinearAttr is an optional argument to ResizeBilinear.
+type ResizeBilinearAttr func(optionalAttr)
 
-// DecodeRawLittleEndian sets the optional little_endian attribute to value.
+// ResizeBilinearAlignCorners sets the optional align_corners attribute to value.
 //
-// value: Whether the input `bytes` are in little-endian order.
-// Ignored for `out_type` values that are stored in a single byte like
-// `uint8`.
-// If not specified, defaults to true
-func DecodeRawLittleEndian(value bool) DecodeRawAttr {
+// value: If true, rescale input by (new_height - 1) / (height - 1), which
+// exactly aligns the 4 corners of images and resized images. If false, rescale
+// by new_height / height. Treat similarly the width dimension.
+// If not specified, defaults to false
+func ResizeBilinearAlignCorners(value bool) ResizeBilinearAttr {
 	return func(m optionalAttr) {
-		m["little_endian"] = value
+		m["align_corners"] = value
 	}
 }
 
-// Reinterpret the bytes of a string as a vector of numbers.
+// Resize `images` to `size` using bilinear interpolation.
 //
-// Arguments:
-//	bytes: All the elements must have the same length.
+// Input images can be of different types but output images are always float.
 //
+// Arguments:
+//	images: 4-D with shape `[batch, height, width, channels]`.
+//	size: = A 1-D int32 Tensor of 2 elements: `new_height, new_width`.  The
+// new size for the images.
 //
-// Returns A Tensor with one more dimension than the input `bytes`.  The
-// added dimension will have size equal to the length of the elements
-// of `bytes` divided by the number of bytes to represent `out_type`.
-func DecodeRaw(scope *Scope, bytes tf.Output, out_type tf.DataType, optional ...DecodeRawAttr) (output tf.Output) {
+// Returns 4-D with shape
+// `[batch, new_height, new_width, channels]`.
+func ResizeBilinear(scope *Scope, images tf.Output, size tf.Output, optional ...ResizeBilinearAttr) (resized_images tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"out_type": out_type}
+	attrs := map[string]interface{}{}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "DecodeRaw",
+		Type: "ResizeBilinear",
 		Input: []tf.Input{
-			bytes,
+			images, size,
 		},
 		Attrs: attrs,
 	}
@@ -7172,268 +6907,168 @@ func DecodeRaw(scope *Scope, bytes tf.Output, out_type tf.DataType, optional ...
 	return op.Output(0)
 }
 
-// QueueDequeueV2Attr is an optional argument to QueueDequeueV2.
-type QueueDequeueV2Attr func(optionalAttr)
+// ProdAttr is an optional argument to Prod.
+type ProdAttr func(optionalAttr)
 
-// QueueDequeueV2TimeoutMs sets the optional timeout_ms attribute to value.
+// ProdKeepDims sets the optional keep_dims attribute to value.
 //
-// value: If the queue is empty, this operation will block for up to
-// timeout_ms milliseconds.
-// Note: This option is not supported yet.
-// If not specified, defaults to -1
-func QueueDequeueV2TimeoutMs(value int64) QueueDequeueV2Attr {
+// value: If true, retain reduced dimensions with length 1.
+// If not specified, defaults to false
+func ProdKeepDims(value bool) ProdAttr {
 	return func(m optionalAttr) {
-		m["timeout_ms"] = value
+		m["keep_dims"] = value
 	}
 }
 
-// Dequeues a tuple of one or more tensors from the given queue.
-//
-// This operation has k outputs, where k is the number of components
-// in the tuples stored in the given queue, and output i is the ith
-// component of the dequeued tuple.
+// Computes the product of elements across dimensions of a tensor.
 //
-// N.B. If the queue is empty, this operation will block until an element
-// has been dequeued (or 'timeout_ms' elapses, if specified).
+// Reduces `input` along the dimensions given in `reduction_indices`. Unless
+// `keep_dims` is true, the rank of the tensor is reduced by 1 for each entry in
+// `reduction_indices`. If `keep_dims` is true, the reduced dimensions are
+// retained with length 1.
 //
 // Arguments:
-//	handle: The handle to a queue.
-//	component_types: The type of each component in a tuple.
+//	input: The tensor to reduce.
+//	reduction_indices: The dimensions to reduce.
 //
-// Returns One or more tensors that were dequeued as a tuple.
-func QueueDequeueV2(scope *Scope, handle tf.Output, component_types []tf.DataType, optional ...QueueDequeueV2Attr) (components []tf.Output) {
+// Returns The reduced tensor.
+func Prod(scope *Scope, input tf.Output, reduction_indices tf.Output, optional ...ProdAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"component_types": component_types}
+	attrs := map[string]interface{}{}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "QueueDequeueV2",
+		Type: "Prod",
 		Input: []tf.Input{
-			handle,
+			input, reduction_indices,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	if scope.Err() != nil {
-		return
-	}
-	var idx int
-	var err error
-	if components, idx, err = makeOutputList(op, idx, "components"); err != nil {
-		scope.UpdateErr("QueueDequeueV2", err)
-		return
-	}
-	return components
+	return op.Output(0)
 }
 
-// ParseSingleSequenceExampleAttr is an optional argument to ParseSingleSequenceExample.
-type ParseSingleSequenceExampleAttr func(optionalAttr)
+// DepthwiseConv2dNativeAttr is an optional argument to DepthwiseConv2dNative.
+type DepthwiseConv2dNativeAttr func(optionalAttr)
 
-// ParseSingleSequenceExampleContextSparseTypes sets the optional context_sparse_types attribute to value.
+// DepthwiseConv2dNativeDataFormat sets the optional data_format attribute to value.
 //
-// value: A list of Ncontext_sparse types; the data types of data in
-// each context Feature given in context_sparse_keys.
-// Currently the ParseSingleSequenceExample supports DT_FLOAT (FloatList),
-// DT_INT64 (Int64List), and DT_STRING (BytesList).
-// If not specified, defaults to <>
-//
-// REQUIRES: len(value) >= 0
-func ParseSingleSequenceExampleContextSparseTypes(value []tf.DataType) ParseSingleSequenceExampleAttr {
-	return func(m optionalAttr) {
-		m["context_sparse_types"] = value
-	}
-}
-
-// ParseSingleSequenceExampleFeatureListDenseTypes sets the optional feature_list_dense_types attribute to value.
-// If not specified, defaults to <>
-//
-// REQUIRES: len(value) >= 0
-func ParseSingleSequenceExampleFeatureListDenseTypes(value []tf.DataType) ParseSingleSequenceExampleAttr {
+// value: Specify the data format of the input and output data. With the
+// default format "NHWC", the data is stored in the order of:
+//     [batch, height, width, channels].
+// Alternatively, the format could be "NCHW", the data storage order of:
+//     [batch, channels, height, width].
+// If not specified, defaults to "NHWC"
+func DepthwiseConv2dNativeDataFormat(value string) DepthwiseConv2dNativeAttr {
 	return func(m optionalAttr) {
-		m["feature_list_dense_types"] = value
+		m["data_format"] = value
 	}
 }
 
-// ParseSingleSequenceExampleContextDenseShapes sets the optional context_dense_shapes attribute to value.
-//
-// value: A list of Ncontext_dense shapes; the shapes of data in
-// each context Feature given in context_dense_keys.
-// The number of elements in the Feature corresponding to context_dense_key[j]
-// must always equal context_dense_shapes[j].NumEntries().
-// The shape of context_dense_values[j] will match context_dense_shapes[j].
-// If not specified, defaults to <>
+// Computes a 2-D depthwise convolution given 4-D `input` and `filter` tensors.
 //
-// REQUIRES: len(value) >= 0
-func ParseSingleSequenceExampleContextDenseShapes(value []tf.Shape) ParseSingleSequenceExampleAttr {
-	return func(m optionalAttr) {
-		m["context_dense_shapes"] = value
-	}
-}
-
-// ParseSingleSequenceExampleFeatureListSparseTypes sets the optional feature_list_sparse_types attribute to value.
+// Given an input tensor of shape `[batch, in_height, in_width, in_channels]`
+// and a filter / kernel tensor of shape
+// `[filter_height, filter_width, in_channels, channel_multiplier]`, containing
+// `in_channels` convolutional filters of depth 1, `depthwise_conv2d` applies
+// a different filter to each input channel (expanding from 1 channel to
+// `channel_multiplier` channels for each), then concatenates the results
+// together. Thus, the output has `in_channels * channel_multiplier` channels.
 //
-// value: A list of Nfeature_list_sparse types; the data types
-// of data in each FeatureList given in feature_list_sparse_keys.
-// Currently the ParseSingleSequenceExample supports DT_FLOAT (FloatList),
-// DT_INT64 (Int64List), and DT_STRING (BytesList).
-// If not specified, defaults to <>
+// for k in 0..in_channels-1
+//   for q in 0..channel_multiplier-1
+//     output[b, i, j, k * channel_multiplier + q] =
+//       sum_{di, dj} input[b, strides[1] * i + di, strides[2] * j + dj, k] *
+//                         filter[di, dj, k, q]
 //
-// REQUIRES: len(value) >= 0
-func ParseSingleSequenceExampleFeatureListSparseTypes(value []tf.DataType) ParseSingleSequenceExampleAttr {
-	return func(m optionalAttr) {
-		m["feature_list_sparse_types"] = value
-	}
-}
-
-// ParseSingleSequenceExampleFeatureListDenseShapes sets the optional feature_list_dense_shapes attribute to value.
+// Must have `strides[0] = strides[3] = 1`.  For the most common case of the same
+// horizontal and vertices strides, `strides = [1, stride, stride, 1]`.
 //
-// value: A list of Nfeature_list_dense shapes; the shapes of
-// data in each FeatureList given in feature_list_dense_keys.
-// The shape of each Feature in the FeatureList corresponding to
-// feature_list_dense_key[j] must always equal
-// feature_list_dense_shapes[j].NumEntries().
-// If not specified, defaults to <>
+// Arguments:
 //
-// REQUIRES: len(value) >= 0
-func ParseSingleSequenceExampleFeatureListDenseShapes(value []tf.Shape) ParseSingleSequenceExampleAttr {
-	return func(m optionalAttr) {
-		m["feature_list_dense_shapes"] = value
-	}
-}
-
-// Transforms a scalar brain.SequenceExample proto (as strings) into typed tensors.
 //
-// Arguments:
-//	serialized: A scalar containing a binary serialized SequenceExample proto.
-//	feature_list_dense_missing_assumed_empty: A vector listing the
-// FeatureList keys which may be missing from the SequenceExample.  If the
-// associated FeatureList is missing, it is treated as empty.  By default,
-// any FeatureList not listed in this vector must exist in the SequenceExample.
-//	context_sparse_keys: A list of Ncontext_sparse string Tensors (scalars).
-// The keys expected in the Examples' features associated with context_sparse
-// values.
-//	context_dense_keys: A list of Ncontext_dense string Tensors (scalars).
-// The keys expected in the SequenceExamples' context features associated with
-// dense values.
-//	feature_list_sparse_keys: A list of Nfeature_list_sparse string Tensors
-// (scalars).  The keys expected in the FeatureLists associated with sparse
-// values.
-//	feature_list_dense_keys: A list of Nfeature_list_dense string Tensors (scalars).
-// The keys expected in the SequenceExamples' feature_lists associated
-// with lists of dense values.
-//	context_dense_defaults: A list of Ncontext_dense Tensors (some may be empty).
-// context_dense_defaults[j] provides default values
-// when the SequenceExample's context map lacks context_dense_key[j].
-// If an empty Tensor is provided for context_dense_defaults[j],
-// then the Feature context_dense_keys[j] is required.
-// The input type is inferred from context_dense_defaults[j], even when it's
-// empty.  If context_dense_defaults[j] is not empty, its shape must match
-// context_dense_shapes[j].
-//	debug_name: A scalar containing the name of the serialized proto.
-// May contain, for example, table key (descriptive) name for the
-// corresponding serialized proto.  This is purely useful for debugging
-// purposes, and the presence of values here has no effect on the output.
-// May also be an empty scalar if no name is available.
-func ParseSingleSequenceExample(scope *Scope, serialized tf.Output, feature_list_dense_missing_assumed_empty tf.Output, context_sparse_keys []tf.Output, context_dense_keys []tf.Output, feature_list_sparse_keys []tf.Output, feature_list_dense_keys []tf.Output, context_dense_defaults []tf.Output, debug_name tf.Output, optional ...ParseSingleSequenceExampleAttr) (context_sparse_indices []tf.Output, context_sparse_values []tf.Output, context_sparse_shapes []tf.Output, context_dense_values []tf.Output, feature_list_sparse_indices []tf.Output, feature_list_sparse_values []tf.Output, feature_list_sparse_shapes []tf.Output, feature_list_dense_values []tf.Output) {
+//	strides: 1-D of length 4.  The stride of the sliding window for each dimension
+// of `input`.
+//	padding: The type of padding algorithm to use.
+func DepthwiseConv2dNative(scope *Scope, input tf.Output, filter tf.Output, strides []int64, padding string, optional ...DepthwiseConv2dNativeAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"strides": strides, "padding": padding}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "ParseSingleSequenceExample",
+		Type: "DepthwiseConv2dNative",
 		Input: []tf.Input{
-			serialized, feature_list_dense_missing_assumed_empty, tf.OutputList(context_sparse_keys), tf.OutputList(context_dense_keys), tf.OutputList(feature_list_sparse_keys), tf.OutputList(feature_list_dense_keys), tf.OutputList(context_dense_defaults), debug_name,
+			input, filter,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	if scope.Err() != nil {
-		return
-	}
-	var idx int
-	var err error
-	if context_sparse_indices, idx, err = makeOutputList(op, idx, "context_sparse_indices"); err != nil {
-		scope.UpdateErr("ParseSingleSequenceExample", err)
-		return
-	}
-	if context_sparse_values, idx, err = makeOutputList(op, idx, "context_sparse_values"); err != nil {
-		scope.UpdateErr("ParseSingleSequenceExample", err)
-		return
-	}
-	if context_sparse_shapes, idx, err = makeOutputList(op, idx, "context_sparse_shapes"); err != nil {
-		scope.UpdateErr("ParseSingleSequenceExample", err)
-		return
-	}
-	if context_dense_values, idx, err = makeOutputList(op, idx, "context_dense_values"); err != nil {
-		scope.UpdateErr("ParseSingleSequenceExample", err)
-		return
-	}
-	if feature_list_sparse_indices, idx, err = makeOutputList(op, idx, "feature_list_sparse_indices"); err != nil {
-		scope.UpdateErr("ParseSingleSequenceExample", err)
-		return
-	}
-	if feature_list_sparse_values, idx, err = makeOutputList(op, idx, "feature_list_sparse_values"); err != nil {
-		scope.UpdateErr("ParseSingleSequenceExample", err)
-		return
-	}
-	if feature_list_sparse_shapes, idx, err = makeOutputList(op, idx, "feature_list_sparse_shapes"); err != nil {
-		scope.UpdateErr("ParseSingleSequenceExample", err)
-		return
-	}
-	if feature_list_dense_values, idx, err = makeOutputList(op, idx, "feature_list_dense_values"); err != nil {
-		scope.UpdateErr("ParseSingleSequenceExample", err)
-		return
-	}
-	return context_sparse_indices, context_sparse_values, context_sparse_shapes, context_dense_values, feature_list_sparse_indices, feature_list_sparse_values, feature_list_sparse_shapes, feature_list_dense_values
+	return op.Output(0)
 }
 
-// RandomGammaAttr is an optional argument to RandomGamma.
-type RandomGammaAttr func(optionalAttr)
+// CropAndResizeAttr is an optional argument to CropAndResize.
+type CropAndResizeAttr func(optionalAttr)
 
-// RandomGammaSeed sets the optional seed attribute to value.
+// CropAndResizeMethod sets the optional method attribute to value.
 //
-// value: If either `seed` or `seed2` are set to be non-zero, the random number
-// generator is seeded by the given seed.  Otherwise, it is seeded by a
-// random seed.
-// If not specified, defaults to 0
-func RandomGammaSeed(value int64) RandomGammaAttr {
+// value: A string specifying the interpolation method. Only 'bilinear' is
+// supported for now.
+// If not specified, defaults to "bilinear"
+func CropAndResizeMethod(value string) CropAndResizeAttr {
 	return func(m optionalAttr) {
-		m["seed"] = value
+		m["method"] = value
 	}
 }
 
-// RandomGammaSeed2 sets the optional seed2 attribute to value.
+// CropAndResizeExtrapolationValue sets the optional extrapolation_value attribute to value.
 //
-// value: A second seed to avoid seed collision.
+// value: Value used for extrapolation, when applicable.
 // If not specified, defaults to 0
-func RandomGammaSeed2(value int64) RandomGammaAttr {
+func CropAndResizeExtrapolationValue(value float32) CropAndResizeAttr {
 	return func(m optionalAttr) {
-		m["seed2"] = value
+		m["extrapolation_value"] = value
 	}
 }
 
-// Outputs random values from the Gamma distribution(s) described by alpha.
+// Extracts crops from the input image tensor and bilinearly resizes them (possibly
 //
-// This op uses the algorithm by Marsaglia et al. to acquire samples via
-// transformation-rejection from pairs of uniform and normal random variables.
-// See http://dl.acm.org/citation.cfm?id=358414
+// with aspect ratio change) to a common output size specified by `crop_size`. This
+// is more general than the `crop_to_bounding_box` op which extracts a fixed size
+// slice from the input image and does not allow resizing or aspect ratio change.
+//
+// Returns a tensor with `crops` from the input `image` at positions defined at the
+// bounding box locations in `boxes`. The cropped boxes are all resized (with
+// bilinear interpolation) to a fixed `size = [crop_height, crop_width]`. The
+// result is a 4-D tensor `[num_boxes, crop_height, crop_width, depth]`.
 //
 // Arguments:
-//	shape: 1-D integer tensor. Shape of independent samples to draw from each
-// distribution described by the shape parameters given in alpha.
-//	alpha: A tensor in which each scalar is a "shape" parameter describing the
-// associated gamma distribution.
+//	image: A 4-D tensor of shape `[batch, image_height, image_width, depth]`.
+// Both `image_height` and `image_width` need to be positive.
+//	boxes: A 2-D tensor of shape `[num_boxes, 4]`. The `i`-th row of the tensor
+// specifies the coordinates of a box in the `box_ind[i]` image and is specified
+// in normalized coordinates `[y1, x1, y2, x2]`. A normalized coordinate value of
+// `y` is mapped to the image coordinate at `y * (image_height - 1)`, so as the
+// `[0, 1]` interval of normalized image height is mapped to
+// `[0, image_height - 1] in image height coordinates. We do allow y1 > y2, in
+// which case the sampled crop is an up-down flipped version of the original
+// image. The width dimension is treated similarly. Normalized coordinates
+// outside the `[0, 1]` range are allowed, in which case we use
+// `extrapolation_value` to extrapolate the input image values.
+//	box_ind: A 1-D tensor of shape `[num_boxes]` with int32 values in `[0, batch)`.
+// The value of `box_ind[i]` specifies the image that the `i`-th box refers to.
+//	crop_size: A 1-D tensor of 2 elements, `size = [crop_height, crop_width]`. All
+// cropped image patches are resized to this size. The aspect ratio of the image
+// content is not preserved. Both `crop_height` and `crop_width` need to be
+// positive.
 //
-// Returns A tensor with shape `shape + shape(alpha)`. Each slice
-// `[:, ..., :, i0, i1, ...iN]` contains the samples drawn for
-// `alpha[i0, i1, ...iN]`. The dtype of the output matches the dtype of alpha.
-func RandomGamma(scope *Scope, shape tf.Output, alpha tf.Output, optional ...RandomGammaAttr) (output tf.Output) {
+// Returns A 4-D tensor of shape `[num_boxes, crop_height, crop_width, depth]`.
+func CropAndResize(scope *Scope, image tf.Output, boxes tf.Output, box_ind tf.Output, crop_size tf.Output, optional ...CropAndResizeAttr) (crops tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -7442,9 +7077,9 @@ func RandomGamma(scope *Scope, shape tf.Output, alpha tf.Output, optional ...Ran
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "RandomGamma",
+		Type: "CropAndResize",
 		Input: []tf.Input{
-			shape, alpha,
+			image, boxes, box_ind, crop_size,
 		},
 		Attrs: attrs,
 	}
@@ -7452,60 +7087,47 @@ func RandomGamma(scope *Scope, shape tf.Output, alpha tf.Output, optional ...Ran
 	return op.Output(0)
 }
 
-// RandomShuffleAttr is an optional argument to RandomShuffle.
-type RandomShuffleAttr func(optionalAttr)
+// MaxPoolGradAttr is an optional argument to MaxPoolGrad.
+type MaxPoolGradAttr func(optionalAttr)
 
-// RandomShuffleSeed sets the optional seed attribute to value.
+// MaxPoolGradDataFormat sets the optional data_format attribute to value.
 //
-// value: If either `seed` or `seed2` are set to be non-zero, the random number
-// generator is seeded by the given seed.  Otherwise, it is seeded by a
-// random seed.
-// If not specified, defaults to 0
-func RandomShuffleSeed(value int64) RandomShuffleAttr {
+// value: Specify the data format of the input and output data. With the
+// default format "NHWC", the data is stored in the order of:
+//     [batch, in_height, in_width, in_channels].
+// Alternatively, the format could be "NCHW", the data storage order of:
+//     [batch, in_channels, in_height, in_width].
+// If not specified, defaults to "NHWC"
+func MaxPoolGradDataFormat(value string) MaxPoolGradAttr {
 	return func(m optionalAttr) {
-		m["seed"] = value
+		m["data_format"] = value
 	}
 }
 
-// RandomShuffleSeed2 sets the optional seed2 attribute to value.
-//
-// value: A second seed to avoid seed collision.
-// If not specified, defaults to 0
-func RandomShuffleSeed2(value int64) RandomShuffleAttr {
-	return func(m optionalAttr) {
-		m["seed2"] = value
-	}
-}
-
-// Randomly shuffles a tensor along its first dimension.
-//
-//   The tensor is shuffled along dimension 0, such that each `value[j]` is mapped
-//   to one and only one `output[i]`. For example, a mapping that might occur for a
-//   3x2 tensor is:
-//
-// ```prettyprint
-// [[1, 2],       [[5, 6],
-//  [3, 4],  ==>   [1, 2],
-//  [5, 6]]        [3, 4]]
-// ```
+// Computes gradients of the maxpooling function.
 //
 // Arguments:
-//	value: The tensor to be shuffled.
+//	orig_input: The original input tensor.
+//	orig_output: The original output tensor.
+//	grad: 4-D.  Gradients w.r.t. the output of `max_pool`.
+//	ksize: The size of the window for each dimension of the input tensor.
+//	strides: The stride of the sliding window for each dimension of the
+// input tensor.
+//	padding: The type of padding algorithm to use.
 //
-// Returns A tensor of same shape and type as `value`, shuffled along its first
-// dimension.
-func RandomShuffle(scope *Scope, value tf.Output, optional ...RandomShuffleAttr) (output tf.Output) {
+// Returns Gradients w.r.t. the input to `max_pool`.
+func MaxPoolGrad(scope *Scope, orig_input tf.Output, orig_output tf.Output, grad tf.Output, ksize []int64, strides []int64, padding string, optional ...MaxPoolGradAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"ksize": ksize, "strides": strides, "padding": padding}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "RandomShuffle",
+		Type: "MaxPoolGrad",
 		Input: []tf.Input{
-			value,
+			orig_input, orig_output, grad,
 		},
 		Attrs: attrs,
 	}
@@ -7513,188 +7135,150 @@ func RandomShuffle(scope *Scope, value tf.Output, optional ...RandomShuffleAttr)
 	return op.Output(0)
 }
 
-// Fake-quantize the 'inputs' tensor of type float and one of the shapes: `[d]`,
+// Adds `bias` to `value`.
 //
-// `[b, d]` `[b, h, w, d]` via per-channel floats `min` and `max` of shape `[d]`
-// to 'outputs' tensor of same shape as `inputs`.
+// This is a deprecated version of BiasAdd and will be soon removed.
 //
-// [min; max] is the clamping range for the 'inputs' data in the corresponding
-// depth channel.  Op divides this range into 255 steps (total of 256 values), then
-// replaces each 'inputs' value with the closest of the quantized step values.
+// This is a special case of `tf.add` where `bias` is restricted to be 1-D.
+// Broadcasting is supported, so `value` may have any number of dimensions.
 //
-// This operation has a gradient and thus allows for training `min` and `max` values.
-func FakeQuantWithMinMaxVarsPerChannel(scope *Scope, inputs tf.Output, min tf.Output, max tf.Output) (outputs tf.Output) {
+// Arguments:
+//	value: Any number of dimensions.
+//	bias: 1-D with size the last dimension of `value`.
+//
+// Returns Broadcasted sum of `value` and `bias`.
+func BiasAddV1(scope *Scope, value tf.Output, bias tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "FakeQuantWithMinMaxVarsPerChannel",
+		Type: "BiasAddV1",
 		Input: []tf.Input{
-			inputs, min, max,
+			value, bias,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// TruncatedNormalAttr is an optional argument to TruncatedNormal.
-type TruncatedNormalAttr func(optionalAttr)
+// EncodeJpegAttr is an optional argument to EncodeJpeg.
+type EncodeJpegAttr func(optionalAttr)
 
-// TruncatedNormalSeed sets the optional seed attribute to value.
+// EncodeJpegFormat sets the optional format attribute to value.
 //
-// value: If either `seed` or `seed2` are set to be non-zero, the random number
-// generator is seeded by the given seed.  Otherwise, it is seeded by a
-// random seed.
-// If not specified, defaults to 0
-func TruncatedNormalSeed(value int64) TruncatedNormalAttr {
+// value: Per pixel image format.
+// If not specified, defaults to ""
+func EncodeJpegFormat(value string) EncodeJpegAttr {
 	return func(m optionalAttr) {
-		m["seed"] = value
+		m["format"] = value
 	}
 }
 
-// TruncatedNormalSeed2 sets the optional seed2 attribute to value.
+// EncodeJpegQuality sets the optional quality attribute to value.
 //
-// value: A second seed to avoid seed collision.
-// If not specified, defaults to 0
-func TruncatedNormalSeed2(value int64) TruncatedNormalAttr {
+// value: Quality of the compression from 0 to 100 (higher is better and slower).
+// If not specified, defaults to 95
+func EncodeJpegQuality(value int64) EncodeJpegAttr {
 	return func(m optionalAttr) {
-		m["seed2"] = value
+		m["quality"] = value
 	}
 }
 
-// Outputs random values from a truncated normal distribution.
-//
-// The generated values follow a normal distribution with mean 0 and standard
-// deviation 1, except that values whose magnitude is more than 2 standard
-// deviations from the mean are dropped and re-picked.
-//
-// Arguments:
-//	shape: The shape of the output tensor.
-//	dtype: The type of the output.
+// EncodeJpegProgressive sets the optional progressive attribute to value.
 //
-// Returns A tensor of the specified shape filled with random truncated normal
-// values.
-func TruncatedNormal(scope *Scope, shape tf.Output, dtype tf.DataType, optional ...TruncatedNormalAttr) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"dtype": dtype}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "TruncatedNormal",
-		Input: []tf.Input{
-			shape,
-		},
-		Attrs: attrs,
+// value: If True, create a JPEG that loads progressively (coarse to fine).
+// If not specified, defaults to false
+func EncodeJpegProgressive(value bool) EncodeJpegAttr {
+	return func(m optionalAttr) {
+		m["progressive"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// SkipgramAttr is an optional argument to Skipgram.
-type SkipgramAttr func(optionalAttr)
-
-// SkipgramWindowSize sets the optional window_size attribute to value.
+// EncodeJpegOptimizeSize sets the optional optimize_size attribute to value.
 //
-// value: The number of words to predict to the left and right of the target.
-// If not specified, defaults to 5
-func SkipgramWindowSize(value int64) SkipgramAttr {
+// value: If True, spend CPU/RAM to reduce size with no quality change.
+// If not specified, defaults to false
+func EncodeJpegOptimizeSize(value bool) EncodeJpegAttr {
 	return func(m optionalAttr) {
-		m["window_size"] = value
+		m["optimize_size"] = value
 	}
 }
 
-// SkipgramMinCount sets the optional min_count attribute to value.
+// EncodeJpegChromaDownsampling sets the optional chroma_downsampling attribute to value.
 //
-// value: The minimum number of word occurrences for it to be included in the
-// vocabulary.
-// If not specified, defaults to 5
-func SkipgramMinCount(value int64) SkipgramAttr {
+// value: See http://en.wikipedia.org/wiki/Chroma_subsampling.
+// If not specified, defaults to true
+func EncodeJpegChromaDownsampling(value bool) EncodeJpegAttr {
 	return func(m optionalAttr) {
-		m["min_count"] = value
+		m["chroma_downsampling"] = value
 	}
 }
 
-// SkipgramSubsample sets the optional subsample attribute to value.
+// EncodeJpegDensityUnit sets the optional density_unit attribute to value.
 //
-// value: Threshold for word occurrence. Words that appear with higher
-// frequency will be randomly down-sampled. Set to 0 to disable.
-// If not specified, defaults to 0.001
-func SkipgramSubsample(value float32) SkipgramAttr {
+// value: Unit used to specify `x_density` and `y_density`:
+// pixels per inch (`'in'`) or centimeter (`'cm'`).
+// If not specified, defaults to "in"
+func EncodeJpegDensityUnit(value string) EncodeJpegAttr {
 	return func(m optionalAttr) {
-		m["subsample"] = value
+		m["density_unit"] = value
 	}
 }
 
-// Parses a text file and creates a batch of examples.
-//
-// DEPRECATED at GraphDef version 19: Moving word2vec into tensorflow_models/tutorials and deprecating its ops here as a result
-//
-// Arguments:
-//	filename: The corpus's text file name.
-//	batch_size: The size of produced batch.
+// EncodeJpegXDensity sets the optional x_density attribute to value.
 //
-// Returns A vector of words in the corpus.Frequencies of words. Sorted in the non-ascending order.Number of words per epoch in the data file.The current epoch number.The total number of words processed so far.A vector of word ids.A vector of word ids.
-func Skipgram(scope *Scope, filename string, batch_size int64, optional ...SkipgramAttr) (vocab_word tf.Output, vocab_freq tf.Output, words_per_epoch tf.Output, current_epoch tf.Output, total_words_processed tf.Output, examples tf.Output, labels tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"filename": filename, "batch_size": batch_size}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "Skipgram",
-
-		Attrs: attrs,
+// value: Horizontal pixels per density unit.
+// If not specified, defaults to 300
+func EncodeJpegXDensity(value int64) EncodeJpegAttr {
+	return func(m optionalAttr) {
+		m["x_density"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2), op.Output(3), op.Output(4), op.Output(5), op.Output(6)
 }
 
-// ParameterizedTruncatedNormalAttr is an optional argument to ParameterizedTruncatedNormal.
-type ParameterizedTruncatedNormalAttr func(optionalAttr)
-
-// ParameterizedTruncatedNormalSeed sets the optional seed attribute to value.
+// EncodeJpegYDensity sets the optional y_density attribute to value.
 //
-// value: If either `seed` or `seed2` are set to be non-zero, the random number
-// generator is seeded by the given seed.  Otherwise, it is seeded by a
-// random seed.
-// If not specified, defaults to 0
-func ParameterizedTruncatedNormalSeed(value int64) ParameterizedTruncatedNormalAttr {
+// value: Vertical pixels per density unit.
+// If not specified, defaults to 300
+func EncodeJpegYDensity(value int64) EncodeJpegAttr {
 	return func(m optionalAttr) {
-		m["seed"] = value
+		m["y_density"] = value
 	}
 }
 
-// ParameterizedTruncatedNormalSeed2 sets the optional seed2 attribute to value.
+// EncodeJpegXmpMetadata sets the optional xmp_metadata attribute to value.
 //
-// value: A second seed to avoid seed collision.
-// If not specified, defaults to 0
-func ParameterizedTruncatedNormalSeed2(value int64) ParameterizedTruncatedNormalAttr {
+// value: If not empty, embed this XMP metadata in the image header.
+// If not specified, defaults to ""
+func EncodeJpegXmpMetadata(value string) EncodeJpegAttr {
 	return func(m optionalAttr) {
-		m["seed2"] = value
+		m["xmp_metadata"] = value
 	}
 }
 
-// Outputs random values from a normal distribution. The parameters may each be a
+// JPEG-encode an image.
 //
-// scalar which applies to the entire output, or a vector of length shape[0] which
-// stores the parameters for each batch.
+// `image` is a 3-D uint8 Tensor of shape `[height, width, channels]`.
+//
+// The attr `format` can be used to override the color format of the encoded
+// output.  Values can be:
+//
+// *   `''`: Use a default format based on the number of channels in the image.
+// *   `grayscale`: Output a grayscale JPEG image.  The `channels` dimension
+//     of `image` must be 1.
+// *   `rgb`: Output an RGB JPEG image. The `channels` dimension
+//     of `image` must be 3.
+//
+// If `format` is not specified or is the empty string, a default format is picked
+// in function of the number of channels in `image`:
+//
+// *   1: Output a grayscale image.
+// *   3: Output an RGB image.
 //
 // Arguments:
-//	shape: The shape of the output tensor. Batches are indexed by the 0th dimension.
-//	means: The mean parameter of each batch.
-//	stdevs: The standard deviation parameter of each batch. Must be greater than 0.
-//	minvals: The minimum cutoff. May be -infinity.
-//	maxvals: The maximum cutoff. May be +infinity, and must be more than the minval
-// for each batch.
+//	image: 3-D with shape `[height, width, channels]`.
 //
-// Returns A matrix of shape num_batches x samples_per_batch, filled with random
-// truncated normal values using the parameters for each row.
-func ParameterizedTruncatedNormal(scope *Scope, shape tf.Output, means tf.Output, stdevs tf.Output, minvals tf.Output, maxvals tf.Output, optional ...ParameterizedTruncatedNormalAttr) (output tf.Output) {
+// Returns 0-D. JPEG-encoded image.
+func EncodeJpeg(scope *Scope, image tf.Output, optional ...EncodeJpegAttr) (contents tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -7703,9 +7287,9 @@ func ParameterizedTruncatedNormal(scope *Scope, shape tf.Output, means tf.Output
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "ParameterizedTruncatedNormal",
+		Type: "EncodeJpeg",
 		Input: []tf.Input{
-			shape, means, stdevs, minvals, maxvals,
+			image,
 		},
 		Attrs: attrs,
 	}
@@ -7713,85 +7297,98 @@ func ParameterizedTruncatedNormal(scope *Scope, shape tf.Output, means tf.Output
 	return op.Output(0)
 }
 
-// Convert one or more images from HSV to RGB.
+// Gradients for batch normalization.
 //
-// Outputs a tensor of the same shape as the `images` tensor, containing the RGB
-// value of the pixels. The output is only well defined if the value in `images`
-// are in `[0,1]`.
+// DEPRECATED at GraphDef version 9: Use tf.nn.batch_normalization()
 //
-// See `rgb_to_hsv` for a description of the HSV encoding.
+// This op is deprecated. See `tf.nn.batch_normalization`.
 //
 // Arguments:
-//	images: 1-D or higher rank. HSV data to convert. Last dimension must be size 3.
-//
-// Returns `images` converted to RGB.
-func HSVToRGB(scope *Scope, images tf.Output) (output tf.Output) {
+//	t: A 4D input Tensor.
+//	m: A 1D mean Tensor with size matching the last dimension of t.
+// This is the first output from tf.nn.moments,
+// or a saved moving average thereof.
+//	v: A 1D variance Tensor with size matching the last dimension of t.
+// This is the second output from tf.nn.moments,
+// or a saved moving average thereof.
+//	gamma: A 1D gamma Tensor with size matching the last dimension of t.
+// If "scale_after_normalization" is true, this Tensor will be multiplied
+// with the normalized Tensor.
+//	backprop: 4D backprop Tensor.
+//	variance_epsilon: A small float number to avoid dividing by 0.
+//	scale_after_normalization: A bool indicating whether the resulted tensor
+// needs to be multiplied with gamma.
+//
+// Returns 4D backprop tensor for input.1D backprop tensor for mean.1D backprop tensor for variance.1D backprop tensor for beta.1D backprop tensor for gamma.
+func BatchNormWithGlobalNormalizationGrad(scope *Scope, t tf.Output, m tf.Output, v tf.Output, gamma tf.Output, backprop tf.Output, variance_epsilon float32, scale_after_normalization bool) (dx tf.Output, dm tf.Output, dv tf.Output, db tf.Output, dg tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"variance_epsilon": variance_epsilon, "scale_after_normalization": scale_after_normalization}
 	opspec := tf.OpSpec{
-		Type: "HSVToRGB",
+		Type: "BatchNormWithGlobalNormalizationGrad",
 		Input: []tf.Input{
-			images,
+			t, m, v, gamma, backprop,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1), op.Output(2), op.Output(3), op.Output(4)
 }
 
-// RandomUniformIntAttr is an optional argument to RandomUniformInt.
-type RandomUniformIntAttr func(optionalAttr)
+// Conv2DBackpropInputAttr is an optional argument to Conv2DBackpropInput.
+type Conv2DBackpropInputAttr func(optionalAttr)
 
-// RandomUniformIntSeed sets the optional seed attribute to value.
-//
-// value: If either `seed` or `seed2` are set to be non-zero, the random number
-// generator is seeded by the given seed.  Otherwise, it is seeded by a
-// random seed.
-// If not specified, defaults to 0
-func RandomUniformIntSeed(value int64) RandomUniformIntAttr {
+// Conv2DBackpropInputUseCudnnOnGpu sets the optional use_cudnn_on_gpu attribute to value.
+// If not specified, defaults to true
+func Conv2DBackpropInputUseCudnnOnGpu(value bool) Conv2DBackpropInputAttr {
 	return func(m optionalAttr) {
-		m["seed"] = value
+		m["use_cudnn_on_gpu"] = value
 	}
 }
 
-// RandomUniformIntSeed2 sets the optional seed2 attribute to value.
+// Conv2DBackpropInputDataFormat sets the optional data_format attribute to value.
 //
-// value: A second seed to avoid seed collision.
-// If not specified, defaults to 0
-func RandomUniformIntSeed2(value int64) RandomUniformIntAttr {
+// value: Specify the data format of the input and output data. With the
+// default format "NHWC", the data is stored in the order of:
+//     [batch, in_height, in_width, in_channels].
+// Alternatively, the format could be "NCHW", the data storage order of:
+//     [batch, in_channels, in_height, in_width].
+// If not specified, defaults to "NHWC"
+func Conv2DBackpropInputDataFormat(value string) Conv2DBackpropInputAttr {
 	return func(m optionalAttr) {
-		m["seed2"] = value
+		m["data_format"] = value
 	}
 }
 
-// Outputs random integers from a uniform distribution.
-//
-// The generated values are uniform integers in the range `[minval, maxval)`.
-// The lower bound `minval` is included in the range, while the upper bound
-// `maxval` is excluded.
-//
-// The random integers are slightly biased unless `maxval - minval` is an exact
-// power of two.  The bias is small for values of `maxval - minval` significantly
-// smaller than the range of the output (either `2^32` or `2^64`).
+// Computes the gradients of convolution with respect to the input.
 //
 // Arguments:
-//	shape: The shape of the output tensor.
-//	minval: 0-D.  Inclusive lower bound on the generated integers.
-//	maxval: 0-D.  Exclusive upper bound on the generated integers.
+//	input_sizes: An integer vector representing the shape of `input`,
+// where `input` is a 4-D `[batch, height, width, channels]` tensor.
+//	filter: 4-D with shape
+// `[filter_height, filter_width, in_channels, out_channels]`.
+//	out_backprop: 4-D with shape `[batch, out_height, out_width, out_channels]`.
+// Gradients w.r.t. the output of the convolution.
+//	strides: The stride of the sliding window for each dimension of the input
+// of the convolution. Must be in the same order as the dimension specified with
+// format.
+//	padding: The type of padding algorithm to use.
 //
-// Returns A tensor of the specified shape filled with uniform random integers.
-func RandomUniformInt(scope *Scope, shape tf.Output, minval tf.Output, maxval tf.Output, optional ...RandomUniformIntAttr) (output tf.Output) {
+// Returns 4-D with shape `[batch, in_height, in_width, in_channels]`.  Gradient
+// w.r.t. the input of the convolution.
+func Conv2DBackpropInput(scope *Scope, input_sizes tf.Output, filter tf.Output, out_backprop tf.Output, strides []int64, padding string, optional ...Conv2DBackpropInputAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"strides": strides, "padding": padding}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "RandomUniformInt",
+		Type: "Conv2DBackpropInput",
 		Input: []tf.Input{
-			shape, minval, maxval,
+			input_sizes, filter, out_backprop,
 		},
 		Attrs: attrs,
 	}
@@ -7799,48 +7396,60 @@ func RandomUniformInt(scope *Scope, shape tf.Output, minval tf.Output, maxval tf
 	return op.Output(0)
 }
 
-// SparseMatMulAttr is an optional argument to SparseMatMul.
-type SparseMatMulAttr func(optionalAttr)
-
-// SparseMatMulTransposeA sets the optional transpose_a attribute to value.
-// If not specified, defaults to false
-func SparseMatMulTransposeA(value bool) SparseMatMulAttr {
-	return func(m optionalAttr) {
-		m["transpose_a"] = value
-	}
-}
+// FusedBatchNormAttr is an optional argument to FusedBatchNorm.
+type FusedBatchNormAttr func(optionalAttr)
 
-// SparseMatMulTransposeB sets the optional transpose_b attribute to value.
-// If not specified, defaults to false
-func SparseMatMulTransposeB(value bool) SparseMatMulAttr {
+// FusedBatchNormEpsilon sets the optional epsilon attribute to value.
+//
+// value: A small float number added to the variance of x.
+// If not specified, defaults to 0.0001
+func FusedBatchNormEpsilon(value float32) FusedBatchNormAttr {
 	return func(m optionalAttr) {
-		m["transpose_b"] = value
+		m["epsilon"] = value
 	}
 }
 
-// SparseMatMulAIsSparse sets the optional a_is_sparse attribute to value.
-// If not specified, defaults to false
-func SparseMatMulAIsSparse(value bool) SparseMatMulAttr {
+// FusedBatchNormDataFormat sets the optional data_format attribute to value.
+//
+// value: The data format for x and y. Either "NHWC" (default) or "NCHW".
+// If not specified, defaults to "NHWC"
+func FusedBatchNormDataFormat(value string) FusedBatchNormAttr {
 	return func(m optionalAttr) {
-		m["a_is_sparse"] = value
+		m["data_format"] = value
 	}
 }
 
-// SparseMatMulBIsSparse sets the optional b_is_sparse attribute to value.
-// If not specified, defaults to false
-func SparseMatMulBIsSparse(value bool) SparseMatMulAttr {
+// FusedBatchNormIsTraining sets the optional is_training attribute to value.
+//
+// value: A bool value to indicate the operation is for training (default)
+// or inference.
+// If not specified, defaults to true
+func FusedBatchNormIsTraining(value bool) FusedBatchNormAttr {
 	return func(m optionalAttr) {
-		m["b_is_sparse"] = value
+		m["is_training"] = value
 	}
 }
 
-// Multiply matrix "a" by matrix "b".
+// Batch normalization.
 //
-// The inputs must be two-dimensional matrices and the inner dimension of "a" must
-// match the outer dimension of "b". This op is optimized for the case where at
-// least one of "a" or "b" is sparse. The breakeven for using this versus a dense
-// matrix multiply on one platform was 30% zero values in the sparse matrix.
-func SparseMatMul(scope *Scope, a tf.Output, b tf.Output, optional ...SparseMatMulAttr) (product tf.Output) {
+// Note that the size of 4D Tensors are defined by either "NHWC" or "NCHW".
+// The size of 1D Tensors matches the dimension C of the 4D Tensors.
+//
+// Arguments:
+//	x: A 4D Tensor for input data.
+//	scale: A 1D Tensor for scaling factor, to scale the normalized x.
+//	offset: A 1D Tensor for offset, to shift to the normalized x.
+//	mean: A 1D Tensor for population mean. Used for inference only;
+// must be empty for training.
+//	variance: A 1D Tensor for population variance. Used for inference only;
+// must be empty for training.
+//
+// Returns A 4D Tensor for output data.A 1D Tensor for the computed batch mean, to be used by TensorFlow
+// to compute the running mean.A 1D Tensor for the computed batch variance, to be used by
+// TensorFlow to compute the running variance.A 1D Tensor for the computed batch mean, to be reused
+// in the gradient computation.A 1D Tensor for the computed batch variance (inverted variance
+// in the cuDNN case), to be used in the gradient computation.
+func FusedBatchNorm(scope *Scope, x tf.Output, scale tf.Output, offset tf.Output, mean tf.Output, variance tf.Output, optional ...FusedBatchNormAttr) (y tf.Output, batch_mean tf.Output, batch_variance tf.Output, reserve_space_1 tf.Output, reserve_space_2 tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -7849,259 +7458,185 @@ func SparseMatMul(scope *Scope, a tf.Output, b tf.Output, optional ...SparseMatM
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "SparseMatMul",
+		Type: "FusedBatchNorm",
 		Input: []tf.Input{
-			a, b,
+			x, scale, offset, mean, variance,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1), op.Output(2), op.Output(3), op.Output(4)
 }
 
-// SdcaOptimizerAttr is an optional argument to SdcaOptimizer.
-type SdcaOptimizerAttr func(optionalAttr)
+// RandomStandardNormalAttr is an optional argument to RandomStandardNormal.
+type RandomStandardNormalAttr func(optionalAttr)
 
-// SdcaOptimizerAdaptative sets the optional adaptative attribute to value.
+// RandomStandardNormalSeed sets the optional seed attribute to value.
 //
-// value: Whether to use Adapative SDCA for the inner loop.
-// If not specified, defaults to false
-func SdcaOptimizerAdaptative(value bool) SdcaOptimizerAttr {
+// value: If either `seed` or `seed2` are set to be non-zero, the random number
+// generator is seeded by the given seed.  Otherwise, it is seeded by a
+// random seed.
+// If not specified, defaults to 0
+func RandomStandardNormalSeed(value int64) RandomStandardNormalAttr {
 	return func(m optionalAttr) {
-		m["adaptative"] = value
+		m["seed"] = value
 	}
 }
 
-// Distributed version of Stochastic Dual Coordinate Ascent (SDCA) optimizer for
-//
-// linear models with L1 + L2 regularization. As global optimization objective is
-// strongly-convex, the optimizer optimizes the dual objective at each step. The
-// optimizer applies each update one example at a time. Examples are sampled
-// uniformly, and the optimizer is learning rate free and enjoys linear convergence
-// rate.
-//
-// Proximal Stochastic Dual Coordinate Ascent, Shalev-Shwartz, Shai; Zhang, Tong.
-// 2012 arXiv1211.2717S: http://arxiv.org/pdf/1211.2717v1.pdf
-//
-//   Loss objective = \sum f_{i}(wx_{i}) + (l2 / 2) * |w|^2 + l1 * |w|
+// RandomStandardNormalSeed2 sets the optional seed2 attribute to value.
 //
-// Adding vs. Averaging in Distributed Primal-Dual Optimization.
-// Chenxin Ma, Virginia Smith, Martin Jaggi, Michael I. Jordan, Peter Richtarik,
-// Martin Takac http://arxiv.org/abs/1502.03508
+// value: A second seed to avoid seed collision.
+// If not specified, defaults to 0
+func RandomStandardNormalSeed2(value int64) RandomStandardNormalAttr {
+	return func(m optionalAttr) {
+		m["seed2"] = value
+	}
+}
+
+// Outputs random values from a normal distribution.
 //
-// Stochastic Dual Coordinate Ascent with Adaptive Probabilities
-// Dominik Csiba, Zheng Qu, Peter Richtarik https://arxiv.org/abs/1502.08053
+// The generated values will have mean 0 and standard deviation 1.
 //
 // Arguments:
-//	sparse_example_indices: a list of vectors which contain example indices.
-//	sparse_feature_indices: a list of vectors which contain feature indices.
-//	sparse_feature_values: a list of vectors which contains feature value
-// associated with each feature group.
-//	dense_features: a list of matrices which contains the dense feature values.
-//	example_weights: a vector which contains the weight associated with each
-// example.
-//	example_labels: a vector which contains the label/target associated with each
-// example.
-//	sparse_indices: a list of vectors where each value is the indices which has
-// corresponding weights in sparse_weights. This field maybe ommitted for the
-// dense approach.
-//	sparse_weights: a list of vectors where each value is the weight associated with
-// a sparse feature group.
-//	dense_weights: a list of vectors where the values are the weights associated
-// with a dense feature group.
-//	example_state_data: a list of vectors containing the example state data.
-//	loss_type: Type of the primal loss. Currently SdcaSolver supports logistic,
-// squared and hinge losses.
-//	l1: Symmetric l1 regularization strength.
-//	l2: Symmetric l2 regularization strength.
-//	num_loss_partitions: Number of partitions of the global loss function.
-//	num_inner_iterations: Number of iterations per mini-batch.
+//	shape: The shape of the output tensor.
+//	dtype: The type of the output.
 //
-// Returns a list of vectors containing the updated example state
-// data.a list of vectors where each value is the delta
-// weights associated with a sparse feature group.a list of vectors where the values are the delta
-// weights associated with a dense feature group.
-func SdcaOptimizer(scope *Scope, sparse_example_indices []tf.Output, sparse_feature_indices []tf.Output, sparse_feature_values []tf.Output, dense_features []tf.Output, example_weights tf.Output, example_labels tf.Output, sparse_indices []tf.Output, sparse_weights []tf.Output, dense_weights []tf.Output, example_state_data tf.Output, loss_type string, l1 float32, l2 float32, num_loss_partitions int64, num_inner_iterations int64, optional ...SdcaOptimizerAttr) (out_example_state_data tf.Output, out_delta_sparse_weights []tf.Output, out_delta_dense_weights []tf.Output) {
+// Returns A tensor of the specified shape filled with random normal values.
+func RandomStandardNormal(scope *Scope, shape tf.Output, dtype tf.DataType, optional ...RandomStandardNormalAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"loss_type": loss_type, "l1": l1, "l2": l2, "num_loss_partitions": num_loss_partitions, "num_inner_iterations": num_inner_iterations}
+	attrs := map[string]interface{}{"dtype": dtype}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "SdcaOptimizer",
+		Type: "RandomStandardNormal",
 		Input: []tf.Input{
-			tf.OutputList(sparse_example_indices), tf.OutputList(sparse_feature_indices), tf.OutputList(sparse_feature_values), tf.OutputList(dense_features), example_weights, example_labels, tf.OutputList(sparse_indices), tf.OutputList(sparse_weights), tf.OutputList(dense_weights), example_state_data,
+			shape,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Computes sigmoid of `x` element-wise.
+//
+// Specifically, `y = 1 / (1 + exp(-x))`.
+func Sigmoid(scope *Scope, x tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	var idx int
-	var err error
-	out_example_state_data = op.Output(idx)
-	if out_delta_sparse_weights, idx, err = makeOutputList(op, idx, "out_delta_sparse_weights"); err != nil {
-		scope.UpdateErr("SdcaOptimizer", err)
-		return
-	}
-	if out_delta_dense_weights, idx, err = makeOutputList(op, idx, "out_delta_dense_weights"); err != nil {
-		scope.UpdateErr("SdcaOptimizer", err)
-		return
-	}
-	return out_example_state_data, out_delta_sparse_weights, out_delta_dense_weights
-}
-
-// Computes the minimum along segments of a tensor.
-//
-// Read @{$math_ops#segmentation$the section on segmentation} for an explanation of
-// segments.
-//
-// Computes a tensor such that
-// \\(output_i = \min_j(data_j)\\) where `min` is over `j` such
-// that `segment_ids[j] == i`.
-//
-// If the min is empty for a given segment ID `i`, `output[i] = 0`.
-//
-// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
-// <img style="width:100%" src="https://www.tensorflow.org/images/SegmentMin.png" alt>
-// </div>
-//
-// Arguments:
-//
-//	segment_ids: A 1-D tensor whose rank is equal to the rank of `data`'s
-// first dimension.  Values should be sorted and can be repeated.
-//
-// Returns Has same shape as data, except for dimension 0 which
-// has size `k`, the number of segments.
-func SegmentMin(scope *Scope, data tf.Output, segment_ids tf.Output) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "SegmentMin",
-		Input: []tf.Input{
-			data, segment_ids,
-		},
+	opspec := tf.OpSpec{
+		Type: "Sigmoid",
+		Input: []tf.Input{
+			x,
+		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// RestoreAttr is an optional argument to Restore.
-type RestoreAttr func(optionalAttr)
+// ComputeAccidentalHitsAttr is an optional argument to ComputeAccidentalHits.
+type ComputeAccidentalHitsAttr func(optionalAttr)
 
-// RestorePreferredShard sets the optional preferred_shard attribute to value.
+// ComputeAccidentalHitsSeed sets the optional seed attribute to value.
 //
-// value: Index of file to open first if multiple files match
-// `file_pattern`.
-// If not specified, defaults to -1
-func RestorePreferredShard(value int64) RestoreAttr {
+// value: If either seed or seed2 are set to be non-zero, the random number
+// generator is seeded by the given seed.  Otherwise, it is seeded by a
+// random seed.
+// If not specified, defaults to 0
+func ComputeAccidentalHitsSeed(value int64) ComputeAccidentalHitsAttr {
 	return func(m optionalAttr) {
-		m["preferred_shard"] = value
+		m["seed"] = value
 	}
 }
 
-// Restores a tensor from checkpoint files.
-//
-// Reads a tensor stored in one or several files. If there are several files (for
-// instance because a tensor was saved as slices), `file_pattern` may contain
-// wildcard symbols (`*` and `?`) in the filename portion only, not in the
-// directory portion.
+// ComputeAccidentalHitsSeed2 sets the optional seed2 attribute to value.
 //
-// If a `file_pattern` matches several files, `preferred_shard` can be used to hint
-// in which file the requested tensor is likely to be found. This op will first
-// open the file at index `preferred_shard` in the list of matching files and try
-// to restore tensors from that file.  Only if some tensors or tensor slices are
-// not found in that first file, then the Op opens all the files. Setting
-// `preferred_shard` to match the value passed as the `shard` input
-// of a matching `Save` Op may speed up Restore.  This attribute only affects
-// performance, not correctness.  The default value -1 means files are processed in
-// order.
+// value: An second seed to avoid seed collision.
+// If not specified, defaults to 0
+func ComputeAccidentalHitsSeed2(value int64) ComputeAccidentalHitsAttr {
+	return func(m optionalAttr) {
+		m["seed2"] = value
+	}
+}
+
+// Computes the ids of the positions in sampled_candidates that match true_labels.
 //
-// See also `RestoreSlice`.
+// When doing log-odds NCE, the result of this op should be passed through a
+// SparseToDense op, then added to the logits of the sampled candidates. This has
+// the effect of 'removing' the sampled labels that match the true labels by
+// making the classifier sure that they are sampled labels.
 //
 // Arguments:
-//	file_pattern: Must have a single element. The pattern of the files from
-// which we read the tensor.
-//	tensor_name: Must have a single element. The name of the tensor to be
-// restored.
-//	dt: The type of the tensor to be restored.
+//	true_classes: The true_classes output of UnpackSparseLabels.
+//	sampled_candidates: The sampled_candidates output of CandidateSampler.
+//	num_true: Number of true labels per context.
 //
-// Returns The restored tensor.
-func Restore(scope *Scope, file_pattern tf.Output, tensor_name tf.Output, dt tf.DataType, optional ...RestoreAttr) (tensor tf.Output) {
+// Returns A vector of indices corresponding to rows of true_candidates.A vector of IDs of positions in sampled_candidates that match a true_label
+// for the row with the corresponding index in indices.A vector of the same length as indices and ids, in which each element
+// is -FLOAT_MAX.
+func ComputeAccidentalHits(scope *Scope, true_classes tf.Output, sampled_candidates tf.Output, num_true int64, optional ...ComputeAccidentalHitsAttr) (indices tf.Output, ids tf.Output, weights tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"dt": dt}
+	attrs := map[string]interface{}{"num_true": num_true}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "Restore",
+		Type: "ComputeAccidentalHits",
 		Input: []tf.Input{
-			file_pattern, tensor_name,
+			true_classes, sampled_candidates,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// FusedResizeAndPadConv2DAttr is an optional argument to FusedResizeAndPadConv2D.
-type FusedResizeAndPadConv2DAttr func(optionalAttr)
+// AvgPoolGradAttr is an optional argument to AvgPoolGrad.
+type AvgPoolGradAttr func(optionalAttr)
 
-// FusedResizeAndPadConv2DResizeAlignCorners sets the optional resize_align_corners attribute to value.
+// AvgPoolGradDataFormat sets the optional data_format attribute to value.
 //
-// value: If true, rescale input by (new_height - 1) / (height - 1),
-// which exactly aligns the 4 corners of images and resized images. If false, rescale
-// by new_height / height. Treat similarly the width dimension.
-// If not specified, defaults to false
-func FusedResizeAndPadConv2DResizeAlignCorners(value bool) FusedResizeAndPadConv2DAttr {
+// value: Specify the data format of the input and output data. With the
+// default format "NHWC", the data is stored in the order of:
+//     [batch, in_height, in_width, in_channels].
+// Alternatively, the format could be "NCHW", the data storage order of:
+//     [batch, in_channels, in_height, in_width].
+// If not specified, defaults to "NHWC"
+func AvgPoolGradDataFormat(value string) AvgPoolGradAttr {
 	return func(m optionalAttr) {
-		m["resize_align_corners"] = value
+		m["data_format"] = value
 	}
 }
 
-// Performs a resize and padding as a preprocess during a convolution.
-//
-// It's often possible to do spatial transformations more efficiently as part of
-// the packing stage of a convolution, so this op allows for an optimized
-// implementation where these stages are fused together. This prevents the need to
-// write out the intermediate results as whole tensors, reducing memory pressure,
-// and we can get some latency gains by merging the transformation calculations.
-// The data_format attribute for Conv2D isn't supported by this op, and defaults to
-// 'NHWC' order.
-// Internally this op uses a single per-graph scratch buffer, which means that it
-// will block if multiple versions are being run in parallel. This is because this
-// operator is primarily an optimization to minimize memory usage.
+// Computes gradients of the average pooling function.
 //
 // Arguments:
-//	input: 4-D with shape `[batch, in_height, in_width, in_channels]`.
-//	size: A 1-D int32 Tensor of 2 elements: `new_height, new_width`.  The
-// new size for the images.
-//	paddings: A two-column matrix specifying the padding sizes. The number of
-// rows must be the same as the rank of `input`.
-//	filter: 4-D with shape
-// `[filter_height, filter_width, in_channels, out_channels]`.
-//
-//	strides: 1-D of length 4.  The stride of the sliding window for each dimension
-// of `input`. Must be in the same order as the dimension specified with format.
+//	orig_input_shape: 1-D.  Shape of the original input to `avg_pool`.
+//	grad: 4-D with shape `[batch, height, width, channels]`.  Gradients w.r.t.
+// the output of `avg_pool`.
+//	ksize: The size of the sliding window for each dimension of the input.
+//	strides: The stride of the sliding window for each dimension of the input.
 //	padding: The type of padding algorithm to use.
-func FusedResizeAndPadConv2D(scope *Scope, input tf.Output, size tf.Output, paddings tf.Output, filter tf.Output, mode string, strides []int64, padding string, optional ...FusedResizeAndPadConv2DAttr) (output tf.Output) {
+//
+// Returns 4-D.  Gradients w.r.t. the input of `avg_pool`.
+func AvgPoolGrad(scope *Scope, orig_input_shape tf.Output, grad tf.Output, ksize []int64, strides []int64, padding string, optional ...AvgPoolGradAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"mode": mode, "strides": strides, "padding": padding}
+	attrs := map[string]interface{}{"ksize": ksize, "strides": strides, "padding": padding}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "FusedResizeAndPadConv2D",
+		Type: "AvgPoolGrad",
 		Input: []tf.Input{
-			input, size, paddings, filter,
+			orig_input_shape, grad,
 		},
 		Attrs: attrs,
 	}
@@ -8109,689 +7644,1643 @@ func FusedResizeAndPadConv2D(scope *Scope, input tf.Output, size tf.Output, padd
 	return op.Output(0)
 }
 
-// DenseToSparseSetOperationAttr is an optional argument to DenseToSparseSetOperation.
-type DenseToSparseSetOperationAttr func(optionalAttr)
-
-// DenseToSparseSetOperationValidateIndices sets the optional validate_indices attribute to value.
-// If not specified, defaults to true
-func DenseToSparseSetOperationValidateIndices(value bool) DenseToSparseSetOperationAttr {
-	return func(m optionalAttr) {
-		m["validate_indices"] = value
-	}
-}
-
-// Applies set operation along last dimension of `Tensor` and `SparseTensor`.
+// Computes the maximum along segments of a tensor.
 //
-// See SetOperationOp::SetOperationFromContext for values of `set_operation`.
+// Read @{$math_ops#segmentation$the section on segmentation} for an explanation of
+// segments.
 //
-// Input `set2` is a `SparseTensor` represented by `set2_indices`, `set2_values`,
-// and `set2_shape`. For `set2` ranked `n`, 1st `n-1` dimensions must be the same
-// as `set1`. Dimension `n` contains values in a set, duplicates are allowed but
-// ignored.
+// Computes a tensor such that
+// \\(output_i = \max_j(data_j)\\) where `max` is over `j` such
+// that `segment_ids[j] == i`.
 //
-// If `validate_indices` is `True`, this op validates the order and range of `set2`
-// indices.
+// If the max is empty for a given segment ID `i`, `output[i] = 0`.
 //
-// Output `result` is a `SparseTensor` represented by `result_indices`,
-// `result_values`, and `result_shape`. For `set1` and `set2` ranked `n`, this
-// has rank `n` and the same 1st `n-1` dimensions as `set1` and `set2`. The `nth`
-// dimension contains the result of `set_operation` applied to the corresponding
-// `[0...n-1]` dimension of `set`.
+// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
+// <img style="width:100%" src="https://www.tensorflow.org/images/SegmentMax.png" alt>
+// </div>
 //
 // Arguments:
-//	set1: `Tensor` with rank `n`. 1st `n-1` dimensions must be the same as `set2`.
-// Dimension `n` contains values in a set, duplicates are allowed but ignored.
-//	set2_indices: 2D `Tensor`, indices of a `SparseTensor`. Must be in row-major
-// order.
-//	set2_values: 1D `Tensor`, values of a `SparseTensor`. Must be in row-major
-// order.
-//	set2_shape: 1D `Tensor`, shape of a `SparseTensor`. `set2_shape[0...n-1]` must
-// be the same as the 1st `n-1` dimensions of `set1`, `result_shape[n]` is the
-// max set size across `n-1` dimensions.
 //
+//	segment_ids: A 1-D tensor whose rank is equal to the rank of `data`'s
+// first dimension.  Values should be sorted and can be repeated.
 //
-// Returns 2D indices of a `SparseTensor`.1D values of a `SparseTensor`.1D `Tensor` shape of a `SparseTensor`. `result_shape[0...n-1]` is
-// the same as the 1st `n-1` dimensions of `set1` and `set2`, `result_shape[n]`
-// is the max result set size across all `0...n-1` dimensions.
-func DenseToSparseSetOperation(scope *Scope, set1 tf.Output, set2_indices tf.Output, set2_values tf.Output, set2_shape tf.Output, set_operation string, optional ...DenseToSparseSetOperationAttr) (result_indices tf.Output, result_values tf.Output, result_shape tf.Output) {
+// Returns Has same shape as data, except for dimension 0 which
+// has size `k`, the number of segments.
+func SegmentMax(scope *Scope, data tf.Output, segment_ids tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"set_operation": set_operation}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "DenseToSparseSetOperation",
+		Type: "SegmentMax",
 		Input: []tf.Input{
-			set1, set2_indices, set2_values, set2_shape,
+			data, segment_ids,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
+	return op.Output(0)
 }
 
-// Delete the tensor specified by its handle in the session.
+// Saves input tensors slices to disk.
 //
-// Arguments:
-//	handle: The handle for a tensor stored in the session state.
+// This is like `Save` except that tensors can be listed in the saved file as being
+// a slice of a larger tensor.  `shapes_and_slices` specifies the shape of the
+// larger tensor and the slice that this tensor covers. `shapes_and_slices` must
+// have as many elements as `tensor_names`.
 //
-// Returns the created operation.
-func DeleteSessionTensor(scope *Scope, handle tf.Output) (o *tf.Operation) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "DeleteSessionTensor",
-		Input: []tf.Input{
-			handle,
-		},
-	}
-	return scope.AddOperation(opspec)
-}
-
-// DenseToDenseSetOperationAttr is an optional argument to DenseToDenseSetOperation.
-type DenseToDenseSetOperationAttr func(optionalAttr)
-
-// DenseToDenseSetOperationValidateIndices sets the optional validate_indices attribute to value.
-// If not specified, defaults to true
-func DenseToDenseSetOperationValidateIndices(value bool) DenseToDenseSetOperationAttr {
-	return func(m optionalAttr) {
-		m["validate_indices"] = value
-	}
-}
-
-// Applies set operation along last dimension of 2 `Tensor` inputs.
+// Elements of the `shapes_and_slices` input must either be:
 //
-// See SetOperationOp::SetOperationFromContext for values of `set_operation`.
+// *  The empty string, in which case the corresponding tensor is
+//    saved normally.
+// *  A string of the form `dim0 dim1 ... dimN-1 slice-spec` where the
+//    `dimI` are the dimensions of the larger tensor and `slice-spec`
+//    specifies what part is covered by the tensor to save.
 //
-// Output `result` is a `SparseTensor` represented by `result_indices`,
-// `result_values`, and `result_shape`. For `set1` and `set2` ranked `n`, this
-// has rank `n` and the same 1st `n-1` dimensions as `set1` and `set2`. The `nth`
-// dimension contains the result of `set_operation` applied to the corresponding
-// `[0...n-1]` dimension of `set`.
+// `slice-spec` itself is a `:`-separated list: `slice0:slice1:...:sliceN-1`
+// where each `sliceI` is either:
 //
-// Arguments:
-//	set1: `Tensor` with rank `n`. 1st `n-1` dimensions must be the same as `set2`.
-// Dimension `n` contains values in a set, duplicates are allowed but ignored.
-//	set2: `Tensor` with rank `n`. 1st `n-1` dimensions must be the same as `set1`.
-// Dimension `n` contains values in a set, duplicates are allowed but ignored.
+// *  The string `-` meaning that the slice covers all indices of this dimension
+// *  `start,length` where `start` and `length` are integers.  In that
+//    case the slice covers `length` indices starting at `start`.
 //
+// See also `Save`.
 //
-// Returns 2D indices of a `SparseTensor`.1D values of a `SparseTensor`.1D `Tensor` shape of a `SparseTensor`. `result_shape[0...n-1]` is
-// the same as the 1st `n-1` dimensions of `set1` and `set2`, `result_shape[n]`
-// is the max result set size across all `0...n-1` dimensions.
-func DenseToDenseSetOperation(scope *Scope, set1 tf.Output, set2 tf.Output, set_operation string, optional ...DenseToDenseSetOperationAttr) (result_indices tf.Output, result_values tf.Output, result_shape tf.Output) {
+// Arguments:
+//	filename: Must have a single element. The name of the file to which we write the
+// tensor.
+//	tensor_names: Shape `[N]`. The names of the tensors to be saved.
+//	shapes_and_slices: Shape `[N]`.  The shapes and slice specifications to use when
+// saving the tensors.
+//	data: `N` tensors to save.
+//
+// Returns the created operation.
+func SaveSlices(scope *Scope, filename tf.Output, tensor_names tf.Output, shapes_and_slices tf.Output, data []tf.Output) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"set_operation": set_operation}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "DenseToDenseSetOperation",
+		Type: "SaveSlices",
 		Input: []tf.Input{
-			set1, set2,
+			filename, tensor_names, shapes_and_slices, tf.OutputList(data),
 		},
-		Attrs: attrs,
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
+	return scope.AddOperation(opspec)
 }
 
-// SetSizeAttr is an optional argument to SetSize.
-type SetSizeAttr func(optionalAttr)
-
-// SetSizeValidateIndices sets the optional validate_indices attribute to value.
-// If not specified, defaults to true
-func SetSizeValidateIndices(value bool) SetSizeAttr {
-	return func(m optionalAttr) {
-		m["validate_indices"] = value
+// Writes contents to the file at input filename. Creates file if not existing.
+//
+// Arguments:
+//	filename: scalar. The name of the file to which we write the contents.
+//	contents: scalar. The content to be written to the output file.
+//
+// Returns the created operation.
+func WriteFile(scope *Scope, filename tf.Output, contents tf.Output) (o *tf.Operation) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "WriteFile",
+		Input: []tf.Input{
+			filename, contents,
+		},
 	}
+	return scope.AddOperation(opspec)
 }
 
-// Number of unique elements along last dimension of input `set`.
-//
-// Input `set` is a `SparseTensor` represented by `set_indices`, `set_values`,
-// and `set_shape`. The last dimension contains values in a set, duplicates are
-// allowed but ignored.
+// Computes the Cholesky decomposition of one or more square matrices.
 //
-// If `validate_indices` is `True`, this op validates the order and range of `set`
-// indices.
+// The input is a tensor of shape `[..., M, M]` whose inner-most 2 dimensions
+// form square matrices, with the same constraints as the single matrix Cholesky
+// decomposition above. The output is a tensor of the same shape as the input
+// containing the Cholesky decompositions for all input submatrices `[..., :, :]`.
 //
 // Arguments:
-//	set_indices: 2D `Tensor`, indices of a `SparseTensor`.
-//	set_values: 1D `Tensor`, values of a `SparseTensor`.
-//	set_shape: 1D `Tensor`, shape of a `SparseTensor`.
+//	input: Shape is `[..., M, M]`.
 //
-// Returns For `set` ranked `n`, this is a `Tensor` with rank `n-1`, and the same 1st
-// `n-1` dimensions as `set`. Each value is the number of unique elements in
-// the corresponding `[0...n-1]` dimension of `set`.
-func SetSize(scope *Scope, set_indices tf.Output, set_values tf.Output, set_shape tf.Output, optional ...SetSizeAttr) (size tf.Output) {
+// Returns Shape is `[..., M, M]`.
+func Cholesky(scope *Scope, input tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "SetSize",
+		Type: "Cholesky",
 		Input: []tf.Input{
-			set_indices, set_values, set_shape,
+			input,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// TakeManySparseFromTensorsMapAttr is an optional argument to TakeManySparseFromTensorsMap.
-type TakeManySparseFromTensorsMapAttr func(optionalAttr)
-
-// TakeManySparseFromTensorsMapContainer sets the optional container attribute to value.
+// Returns the rank of a tensor.
 //
-// value: The container name for the `SparseTensorsMap` read by this op.
-// If not specified, defaults to ""
-func TakeManySparseFromTensorsMapContainer(value string) TakeManySparseFromTensorsMapAttr {
-	return func(m optionalAttr) {
-		m["container"] = value
+// This operation returns an integer representing the rank of `input`.
+//
+// For example:
+//
+// ```
+// # 't' is [[[1, 1, 1], [2, 2, 2]], [[3, 3, 3], [4, 4, 4]]]
+// # shape of tensor 't' is [2, 2, 3]
+// rank(t) ==> 3
+// ```
+//
+// **Note**: The rank of a tensor is not the same as the rank of a matrix. The rank
+// of a tensor is the number of indices required to uniquely select each element
+// of the tensor. Rank is also known as "order", "degree", or "ndims."
+func Rank(scope *Scope, input tf.Output) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "Rank",
+		Input: []tf.Input{
+			input,
+		},
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// TakeManySparseFromTensorsMapSharedName sets the optional shared_name attribute to value.
+// DecodeCSVAttr is an optional argument to DecodeCSV.
+type DecodeCSVAttr func(optionalAttr)
+
+// DecodeCSVFieldDelim sets the optional field_delim attribute to value.
 //
-// value: The shared name for the `SparseTensorsMap` read by this op.
-// It should not be blank; rather the `shared_name` or unique Operation name
-// of the Op that created the original `SparseTensorsMap` should be used.
-// If not specified, defaults to ""
-func TakeManySparseFromTensorsMapSharedName(value string) TakeManySparseFromTensorsMapAttr {
+// value: delimiter to separate fields in a record.
+// If not specified, defaults to ","
+func DecodeCSVFieldDelim(value string) DecodeCSVAttr {
 	return func(m optionalAttr) {
-		m["shared_name"] = value
+		m["field_delim"] = value
 	}
 }
 
-// Read `SparseTensors` from a `SparseTensorsMap` and concatenate them.
-//
-// The input `sparse_handles` must be an `int64` matrix of shape `[N, 1]` where
-// `N` is the minibatch size and the rows correspond to the output handles of
-// `AddSparseToTensorsMap` or `AddManySparseToTensorsMap`.  The ranks of the
-// original `SparseTensor` objects that went into the given input ops must all
-// match.  When the final `SparseTensor` is created, it has rank one
-// higher than the ranks of the incoming `SparseTensor` objects
-// (they have been concatenated along a new row dimension on the left).
-//
-// The output `SparseTensor` object's shape values for all dimensions but the
-// first are the max across the input `SparseTensor` objects' shape values
-// for the corresponding dimensions.  Its first shape value is `N`, the minibatch
-// size.
-//
-// The input `SparseTensor` objects' indices are assumed ordered in
-// standard lexicographic order.  If this is not the case, after this
-// step run `SparseReorder` to restore index ordering.
-//
-// For example, if the handles represent an input, which is a `[2, 3]` matrix
-// representing two original `SparseTensor` objects:
-//
-// ```
-//     index = [ 0]
-//             [10]
-//             [20]
-//     values = [1, 2, 3]
-//     shape = [50]
-// ```
-//
-// and
-//
-// ```
-//     index = [ 2]
-//             [10]
-//     values = [4, 5]
-//     shape = [30]
-// ```
-//
-// then the final `SparseTensor` will be:
+// Convert CSV records to tensors. Each column maps to one tensor.
 //
-// ```
-//     index = [0  0]
-//             [0 10]
-//             [0 20]
-//             [1  2]
-//             [1 10]
-//     values = [1, 2, 3, 4, 5]
-//     shape = [2 50]
-// ```
+// RFC 4180 format is expected for the CSV records.
+// (https://tools.ietf.org/html/rfc4180)
+// Note that we allow leading and trailing spaces with int or float field.
 //
 // Arguments:
-//	sparse_handles: 1-D, The `N` serialized `SparseTensor` objects.
-// Shape: `[N]`.
-//	dtype: The `dtype` of the `SparseTensor` objects stored in the
-// `SparseTensorsMap`.
+//	records: Each string is a record/row in the csv and all records should have
+// the same format.
+//	record_defaults: One tensor per column of the input record, with either a
+// scalar default value for that column or empty if the column is required.
 //
-// Returns 2-D.  The `indices` of the minibatch `SparseTensor`.1-D.  The `values` of the minibatch `SparseTensor`.1-D.  The `shape` of the minibatch `SparseTensor`.
-func TakeManySparseFromTensorsMap(scope *Scope, sparse_handles tf.Output, dtype tf.DataType, optional ...TakeManySparseFromTensorsMapAttr) (sparse_indices tf.Output, sparse_values tf.Output, sparse_shape tf.Output) {
+// Returns Each tensor will have the same shape as records.
+func DecodeCSV(scope *Scope, records tf.Output, record_defaults []tf.Output, optional ...DecodeCSVAttr) (output []tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"dtype": dtype}
+	attrs := map[string]interface{}{}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "TakeManySparseFromTensorsMap",
+		Type: "DecodeCSV",
 		Input: []tf.Input{
-			sparse_handles,
+			records, tf.OutputList(record_defaults),
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
-}
-
-// AddSparseToTensorsMapAttr is an optional argument to AddSparseToTensorsMap.
-type AddSparseToTensorsMapAttr func(optionalAttr)
-
-// AddSparseToTensorsMapContainer sets the optional container attribute to value.
-//
-// value: The container name for the `SparseTensorsMap` created by this op.
-// If not specified, defaults to ""
-func AddSparseToTensorsMapContainer(value string) AddSparseToTensorsMapAttr {
-	return func(m optionalAttr) {
-		m["container"] = value
+	if scope.Err() != nil {
+		return
+	}
+	var idx int
+	var err error
+	if output, idx, err = makeOutputList(op, idx, "output"); err != nil {
+		scope.UpdateErr("DecodeCSV", err)
+		return
 	}
+	return output
 }
 
-// AddSparseToTensorsMapSharedName sets the optional shared_name attribute to value.
+// Convert JSON-encoded Example records to binary protocol buffer strings.
 //
-// value: The shared name for the `SparseTensorsMap` created by this op.
-// If blank, the new Operation's unique name is used.
-// If not specified, defaults to ""
-func AddSparseToTensorsMapSharedName(value string) AddSparseToTensorsMapAttr {
-	return func(m optionalAttr) {
-		m["shared_name"] = value
-	}
-}
-
-// Add a `SparseTensor` to a `SparseTensorsMap` return its handle.
-//
-// A `SparseTensor` is represented by three tensors: `sparse_indices`,
-// `sparse_values`, and `sparse_shape`.
-//
-// This operator takes the given `SparseTensor` and adds it to a container
-// object (a `SparseTensorsMap`).  A unique key within this container is generated
-// in the form of an `int64`, and this is the value that is returned.
-//
-// The `SparseTensor` can then be read out as part of a minibatch by passing
-// the key as a vector element to `TakeManySparseFromTensorsMap`.  To ensure
-// the correct `SparseTensorsMap` is accessed, ensure that the same
-// `container` and `shared_name` are passed to that Op.  If no `shared_name`
-// is provided here, instead use the *name* of the Operation created by calling
-// `AddSparseToTensorsMap` as the `shared_name` passed to
-// `TakeManySparseFromTensorsMap`.  Ensure the Operations are colocated.
+// This op translates a tensor containing Example records, encoded using
+// the [standard JSON
+// mapping](https://developers.google.com/protocol-buffers/docs/proto3#json),
+// into a tensor containing the same records encoded as binary protocol
+// buffers. The resulting tensor can then be fed to any of the other
+// Example-parsing ops.
 //
 // Arguments:
-//	sparse_indices: 2-D.  The `indices` of the `SparseTensor`.
-//	sparse_values: 1-D.  The `values` of the `SparseTensor`.
-//	sparse_shape: 1-D.  The `shape` of the `SparseTensor`.
+//	json_examples: Each string is a JSON object serialized according to the JSON
+// mapping of the Example proto.
 //
-// Returns 0-D.  The handle of the `SparseTensor` now stored in the
-// `SparseTensorsMap`.
-func AddSparseToTensorsMap(scope *Scope, sparse_indices tf.Output, sparse_values tf.Output, sparse_shape tf.Output, optional ...AddSparseToTensorsMapAttr) (sparse_handle tf.Output) {
+// Returns Each string is a binary Example protocol buffer corresponding
+// to the respective element of `json_examples`.
+func DecodeJSONExample(scope *Scope, json_examples tf.Output) (binary_examples tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "AddSparseToTensorsMap",
+		Type: "DecodeJSONExample",
 		Input: []tf.Input{
-			sparse_indices, sparse_values, sparse_shape,
+			json_examples,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Computes softmax activations.
-//
-// For each batch `i` and class `j` we have
-//
-//     softmax[i, j] = exp(logits[i, j]) / sum_j(exp(logits[i, j]))
+// Transforms a serialized tensorflow.TensorProto proto into a Tensor.
 //
 // Arguments:
-//	logits: 2-D with shape `[batch_size, num_classes]`.
+//	serialized: A scalar string containing a serialized TensorProto proto.
+//	out_type: The type of the serialized tensor.  The provided type must match the
+// type of the serialized tensor and no implicit conversion will take place.
 //
-// Returns Same shape as `logits`.
-func Softmax(scope *Scope, logits tf.Output) (softmax tf.Output) {
+// Returns A Tensor of type `out_type`.
+func ParseTensor(scope *Scope, serialized tf.Output, out_type tf.DataType) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"out_type": out_type}
 	opspec := tf.OpSpec{
-		Type: "Softmax",
+		Type: "ParseTensor",
 		Input: []tf.Input{
-			logits,
+			serialized,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// RandomShuffleQueueV2Attr is an optional argument to RandomShuffleQueueV2.
-type RandomShuffleQueueV2Attr func(optionalAttr)
-
-// RandomShuffleQueueV2Shapes sets the optional shapes attribute to value.
-//
-// value: The shape of each component in a value. The length of this attr must
-// be either 0 or the same as the length of component_types. If the length of
-// this attr is 0, the shapes of queue elements are not constrained, and
-// only one element may be dequeued at a time.
-// If not specified, defaults to <>
-//
-// REQUIRES: len(value) >= 0
-func RandomShuffleQueueV2Shapes(value []tf.Shape) RandomShuffleQueueV2Attr {
-	return func(m optionalAttr) {
-		m["shapes"] = value
+// Computes acos of x element-wise.
+func Acos(scope *Scope, x tf.Output) (y tf.Output) {
+	if scope.Err() != nil {
+		return
 	}
-}
-
-// RandomShuffleQueueV2Capacity sets the optional capacity attribute to value.
-//
-// value: The upper bound on the number of elements in this queue.
-// Negative numbers mean no limit.
-// If not specified, defaults to -1
-func RandomShuffleQueueV2Capacity(value int64) RandomShuffleQueueV2Attr {
-	return func(m optionalAttr) {
-		m["capacity"] = value
+	opspec := tf.OpSpec{
+		Type: "Acos",
+		Input: []tf.Input{
+			x,
+		},
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// RandomShuffleQueueV2MinAfterDequeue sets the optional min_after_dequeue attribute to value.
+// Copy a tensor setting everything outside a central band in each innermost matrix
 //
-// value: Dequeue will block unless there would be this
-// many elements after the dequeue or the queue is closed. This
-// ensures a minimum level of mixing of elements.
-// If not specified, defaults to 0
-func RandomShuffleQueueV2MinAfterDequeue(value int64) RandomShuffleQueueV2Attr {
-	return func(m optionalAttr) {
-		m["min_after_dequeue"] = value
-	}
-}
-
-// RandomShuffleQueueV2Seed sets the optional seed attribute to value.
+// to zero.
 //
-// value: If either seed or seed2 is set to be non-zero, the random number
-// generator is seeded by the given seed.  Otherwise, a random seed is used.
-// If not specified, defaults to 0
-func RandomShuffleQueueV2Seed(value int64) RandomShuffleQueueV2Attr {
-	return func(m optionalAttr) {
-		m["seed"] = value
-	}
-}
-
-// RandomShuffleQueueV2Seed2 sets the optional seed2 attribute to value.
+// The `band` part is computed as follows:
+// Assume `input` has `k` dimensions `[I, J, K, ..., M, N]`, then the output is a
+// tensor with the same shape where
 //
-// value: A second seed to avoid seed collision.
-// If not specified, defaults to 0
-func RandomShuffleQueueV2Seed2(value int64) RandomShuffleQueueV2Attr {
-	return func(m optionalAttr) {
-		m["seed2"] = value
-	}
-}
-
-// RandomShuffleQueueV2Container sets the optional container attribute to value.
+// `band[i, j, k, ..., m, n] = in_band(m, n) * input[i, j, k, ..., m, n]`.
 //
-// value: If non-empty, this queue is placed in the given container.
-// Otherwise, a default container is used.
-// If not specified, defaults to ""
-func RandomShuffleQueueV2Container(value string) RandomShuffleQueueV2Attr {
-	return func(m optionalAttr) {
-		m["container"] = value
-	}
-}
-
-// RandomShuffleQueueV2SharedName sets the optional shared_name attribute to value.
+// The indicator function
 //
-// value: If non-empty, this queue will be shared under the given name
-// across multiple sessions.
-// If not specified, defaults to ""
-func RandomShuffleQueueV2SharedName(value string) RandomShuffleQueueV2Attr {
-	return func(m optionalAttr) {
-		m["shared_name"] = value
-	}
-}
-
-// A queue that randomizes the order of elements.
+// `in_band(m, n) = (num_lower < 0 || (m-n) <= num_lower)) &&
+//                  (num_upper < 0 || (n-m) <= num_upper)`.
+//
+// For example:
+//
+// ```
+// # if 'input' is [[ 0,  1,  2, 3]
+//                  [-1,  0,  1, 2]
+//                  [-2, -1,  0, 1]
+//                  [-3, -2, -1, 0]],
+//
+// tf.matrix_band_part(input, 1, -1) ==> [[ 0,  1,  2, 3]
+//                                        [-1,  0,  1, 2]
+//                                        [ 0, -1,  0, 1]
+//                                        [ 0,  0, -1, 0]],
+//
+// tf.matrix_band_part(input, 2, 1) ==> [[ 0,  1,  0, 0]
+//                                       [-1,  0,  1, 0]
+//                                       [-2, -1,  0, 1]
+//                                       [ 0, -2, -1, 0]]
+// ```
+//
+// Useful special cases:
+//
+// ```
+//  tf.matrix_band_part(input, 0, -1) ==> Upper triangular part.
+//  tf.matrix_band_part(input, -1, 0) ==> Lower triangular part.
+//  tf.matrix_band_part(input, 0, 0) ==> Diagonal.
+// ```
 //
 // Arguments:
-//	component_types: The type of each component in a value.
+//	input: Rank `k` tensor.
+//	num_lower: 0-D tensor. Number of subdiagonals to keep. If negative, keep entire
+// lower triangle.
+//	num_upper: 0-D tensor. Number of superdiagonals to keep. If negative, keep
+// entire upper triangle.
 //
-// Returns The handle to the queue.
-func RandomShuffleQueueV2(scope *Scope, component_types []tf.DataType, optional ...RandomShuffleQueueV2Attr) (handle tf.Output) {
+// Returns Rank `k` tensor of the same shape as input. The extracted banded tensor.
+func MatrixBandPart(scope *Scope, input tf.Output, num_lower tf.Output, num_upper tf.Output) (band tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"component_types": component_types}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "RandomShuffleQueueV2",
-
-		Attrs: attrs,
+		Type: "MatrixBandPart",
+		Input: []tf.Input{
+			input, num_lower, num_upper,
+		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Outputs a `Summary` protocol buffer with scalar values.
+// DecodeRawAttr is an optional argument to DecodeRaw.
+type DecodeRawAttr func(optionalAttr)
+
+// DecodeRawLittleEndian sets the optional little_endian attribute to value.
 //
-// The input `tags` and `values` must have the same shape.  The generated summary
-// has a summary value for each tag-value pair in `tags` and `values`.
+// value: Whether the input `bytes` are in little-endian order.
+// Ignored for `out_type` values that are stored in a single byte like
+// `uint8`.
+// If not specified, defaults to true
+func DecodeRawLittleEndian(value bool) DecodeRawAttr {
+	return func(m optionalAttr) {
+		m["little_endian"] = value
+	}
+}
+
+// Reinterpret the bytes of a string as a vector of numbers.
 //
 // Arguments:
-//	tags: Tags for the summary.
-//	values: Same shape as `tags.  Values for the summary.
+//	bytes: All the elements must have the same length.
 //
-// Returns Scalar.  Serialized `Summary` protocol buffer.
-func ScalarSummary(scope *Scope, tags tf.Output, values tf.Output) (summary tf.Output) {
+//
+// Returns A Tensor with one more dimension than the input `bytes`.  The
+// added dimension will have size equal to the length of the elements
+// of `bytes` divided by the number of bytes to represent `out_type`.
+func DecodeRaw(scope *Scope, bytes tf.Output, out_type tf.DataType, optional ...DecodeRawAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"out_type": out_type}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "ScalarSummary",
+		Type: "DecodeRaw",
 		Input: []tf.Input{
-			tags, values,
+			bytes,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Constructs a tensor by tiling a given tensor.
+// QueueDequeueV2Attr is an optional argument to QueueDequeueV2.
+type QueueDequeueV2Attr func(optionalAttr)
+
+// QueueDequeueV2TimeoutMs sets the optional timeout_ms attribute to value.
 //
-// This operation creates a new tensor by replicating `input` `multiples` times.
-// The output tensor's i'th dimension has `input.dims(i) * multiples[i]` elements,
-// and the values of `input` are replicated `multiples[i]` times along the 'i'th
-// dimension. For example, tiling `[a b c d]` by `[2]` produces
-// `[a b c d a b c d]`.
+// value: If the queue is empty, this operation will block for up to
+// timeout_ms milliseconds.
+// Note: This option is not supported yet.
+// If not specified, defaults to -1
+func QueueDequeueV2TimeoutMs(value int64) QueueDequeueV2Attr {
+	return func(m optionalAttr) {
+		m["timeout_ms"] = value
+	}
+}
+
+// Dequeues a tuple of one or more tensors from the given queue.
+//
+// This operation has k outputs, where k is the number of components
+// in the tuples stored in the given queue, and output i is the ith
+// component of the dequeued tuple.
+//
+// N.B. If the queue is empty, this operation will block until an element
+// has been dequeued (or 'timeout_ms' elapses, if specified).
+//
+// Arguments:
+//	handle: The handle to a queue.
+//	component_types: The type of each component in a tuple.
+//
+// Returns One or more tensors that were dequeued as a tuple.
+func QueueDequeueV2(scope *Scope, handle tf.Output, component_types []tf.DataType, optional ...QueueDequeueV2Attr) (components []tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"component_types": component_types}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "QueueDequeueV2",
+		Input: []tf.Input{
+			handle,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	if scope.Err() != nil {
+		return
+	}
+	var idx int
+	var err error
+	if components, idx, err = makeOutputList(op, idx, "components"); err != nil {
+		scope.UpdateErr("QueueDequeueV2", err)
+		return
+	}
+	return components
+}
+
+// ParseSingleSequenceExampleAttr is an optional argument to ParseSingleSequenceExample.
+type ParseSingleSequenceExampleAttr func(optionalAttr)
+
+// ParseSingleSequenceExampleContextSparseTypes sets the optional context_sparse_types attribute to value.
+//
+// value: A list of Ncontext_sparse types; the data types of data in
+// each context Feature given in context_sparse_keys.
+// Currently the ParseSingleSequenceExample supports DT_FLOAT (FloatList),
+// DT_INT64 (Int64List), and DT_STRING (BytesList).
+// If not specified, defaults to <>
+//
+// REQUIRES: len(value) >= 0
+func ParseSingleSequenceExampleContextSparseTypes(value []tf.DataType) ParseSingleSequenceExampleAttr {
+	return func(m optionalAttr) {
+		m["context_sparse_types"] = value
+	}
+}
+
+// ParseSingleSequenceExampleFeatureListDenseTypes sets the optional feature_list_dense_types attribute to value.
+// If not specified, defaults to <>
+//
+// REQUIRES: len(value) >= 0
+func ParseSingleSequenceExampleFeatureListDenseTypes(value []tf.DataType) ParseSingleSequenceExampleAttr {
+	return func(m optionalAttr) {
+		m["feature_list_dense_types"] = value
+	}
+}
+
+// ParseSingleSequenceExampleContextDenseShapes sets the optional context_dense_shapes attribute to value.
+//
+// value: A list of Ncontext_dense shapes; the shapes of data in
+// each context Feature given in context_dense_keys.
+// The number of elements in the Feature corresponding to context_dense_key[j]
+// must always equal context_dense_shapes[j].NumEntries().
+// The shape of context_dense_values[j] will match context_dense_shapes[j].
+// If not specified, defaults to <>
+//
+// REQUIRES: len(value) >= 0
+func ParseSingleSequenceExampleContextDenseShapes(value []tf.Shape) ParseSingleSequenceExampleAttr {
+	return func(m optionalAttr) {
+		m["context_dense_shapes"] = value
+	}
+}
+
+// ParseSingleSequenceExampleFeatureListSparseTypes sets the optional feature_list_sparse_types attribute to value.
+//
+// value: A list of Nfeature_list_sparse types; the data types
+// of data in each FeatureList given in feature_list_sparse_keys.
+// Currently the ParseSingleSequenceExample supports DT_FLOAT (FloatList),
+// DT_INT64 (Int64List), and DT_STRING (BytesList).
+// If not specified, defaults to <>
+//
+// REQUIRES: len(value) >= 0
+func ParseSingleSequenceExampleFeatureListSparseTypes(value []tf.DataType) ParseSingleSequenceExampleAttr {
+	return func(m optionalAttr) {
+		m["feature_list_sparse_types"] = value
+	}
+}
+
+// ParseSingleSequenceExampleFeatureListDenseShapes sets the optional feature_list_dense_shapes attribute to value.
+//
+// value: A list of Nfeature_list_dense shapes; the shapes of
+// data in each FeatureList given in feature_list_dense_keys.
+// The shape of each Feature in the FeatureList corresponding to
+// feature_list_dense_key[j] must always equal
+// feature_list_dense_shapes[j].NumEntries().
+// If not specified, defaults to <>
+//
+// REQUIRES: len(value) >= 0
+func ParseSingleSequenceExampleFeatureListDenseShapes(value []tf.Shape) ParseSingleSequenceExampleAttr {
+	return func(m optionalAttr) {
+		m["feature_list_dense_shapes"] = value
+	}
+}
+
+// Transforms a scalar brain.SequenceExample proto (as strings) into typed tensors.
+//
+// Arguments:
+//	serialized: A scalar containing a binary serialized SequenceExample proto.
+//	feature_list_dense_missing_assumed_empty: A vector listing the
+// FeatureList keys which may be missing from the SequenceExample.  If the
+// associated FeatureList is missing, it is treated as empty.  By default,
+// any FeatureList not listed in this vector must exist in the SequenceExample.
+//	context_sparse_keys: A list of Ncontext_sparse string Tensors (scalars).
+// The keys expected in the Examples' features associated with context_sparse
+// values.
+//	context_dense_keys: A list of Ncontext_dense string Tensors (scalars).
+// The keys expected in the SequenceExamples' context features associated with
+// dense values.
+//	feature_list_sparse_keys: A list of Nfeature_list_sparse string Tensors
+// (scalars).  The keys expected in the FeatureLists associated with sparse
+// values.
+//	feature_list_dense_keys: A list of Nfeature_list_dense string Tensors (scalars).
+// The keys expected in the SequenceExamples' feature_lists associated
+// with lists of dense values.
+//	context_dense_defaults: A list of Ncontext_dense Tensors (some may be empty).
+// context_dense_defaults[j] provides default values
+// when the SequenceExample's context map lacks context_dense_key[j].
+// If an empty Tensor is provided for context_dense_defaults[j],
+// then the Feature context_dense_keys[j] is required.
+// The input type is inferred from context_dense_defaults[j], even when it's
+// empty.  If context_dense_defaults[j] is not empty, its shape must match
+// context_dense_shapes[j].
+//	debug_name: A scalar containing the name of the serialized proto.
+// May contain, for example, table key (descriptive) name for the
+// corresponding serialized proto.  This is purely useful for debugging
+// purposes, and the presence of values here has no effect on the output.
+// May also be an empty scalar if no name is available.
+func ParseSingleSequenceExample(scope *Scope, serialized tf.Output, feature_list_dense_missing_assumed_empty tf.Output, context_sparse_keys []tf.Output, context_dense_keys []tf.Output, feature_list_sparse_keys []tf.Output, feature_list_dense_keys []tf.Output, context_dense_defaults []tf.Output, debug_name tf.Output, optional ...ParseSingleSequenceExampleAttr) (context_sparse_indices []tf.Output, context_sparse_values []tf.Output, context_sparse_shapes []tf.Output, context_dense_values []tf.Output, feature_list_sparse_indices []tf.Output, feature_list_sparse_values []tf.Output, feature_list_sparse_shapes []tf.Output, feature_list_dense_values []tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "ParseSingleSequenceExample",
+		Input: []tf.Input{
+			serialized, feature_list_dense_missing_assumed_empty, tf.OutputList(context_sparse_keys), tf.OutputList(context_dense_keys), tf.OutputList(feature_list_sparse_keys), tf.OutputList(feature_list_dense_keys), tf.OutputList(context_dense_defaults), debug_name,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	if scope.Err() != nil {
+		return
+	}
+	var idx int
+	var err error
+	if context_sparse_indices, idx, err = makeOutputList(op, idx, "context_sparse_indices"); err != nil {
+		scope.UpdateErr("ParseSingleSequenceExample", err)
+		return
+	}
+	if context_sparse_values, idx, err = makeOutputList(op, idx, "context_sparse_values"); err != nil {
+		scope.UpdateErr("ParseSingleSequenceExample", err)
+		return
+	}
+	if context_sparse_shapes, idx, err = makeOutputList(op, idx, "context_sparse_shapes"); err != nil {
+		scope.UpdateErr("ParseSingleSequenceExample", err)
+		return
+	}
+	if context_dense_values, idx, err = makeOutputList(op, idx, "context_dense_values"); err != nil {
+		scope.UpdateErr("ParseSingleSequenceExample", err)
+		return
+	}
+	if feature_list_sparse_indices, idx, err = makeOutputList(op, idx, "feature_list_sparse_indices"); err != nil {
+		scope.UpdateErr("ParseSingleSequenceExample", err)
+		return
+	}
+	if feature_list_sparse_values, idx, err = makeOutputList(op, idx, "feature_list_sparse_values"); err != nil {
+		scope.UpdateErr("ParseSingleSequenceExample", err)
+		return
+	}
+	if feature_list_sparse_shapes, idx, err = makeOutputList(op, idx, "feature_list_sparse_shapes"); err != nil {
+		scope.UpdateErr("ParseSingleSequenceExample", err)
+		return
+	}
+	if feature_list_dense_values, idx, err = makeOutputList(op, idx, "feature_list_dense_values"); err != nil {
+		scope.UpdateErr("ParseSingleSequenceExample", err)
+		return
+	}
+	return context_sparse_indices, context_sparse_values, context_sparse_shapes, context_dense_values, feature_list_sparse_indices, feature_list_sparse_values, feature_list_sparse_shapes, feature_list_dense_values
+}
+
+// RandomGammaAttr is an optional argument to RandomGamma.
+type RandomGammaAttr func(optionalAttr)
+
+// RandomGammaSeed sets the optional seed attribute to value.
+//
+// value: If either `seed` or `seed2` are set to be non-zero, the random number
+// generator is seeded by the given seed.  Otherwise, it is seeded by a
+// random seed.
+// If not specified, defaults to 0
+func RandomGammaSeed(value int64) RandomGammaAttr {
+	return func(m optionalAttr) {
+		m["seed"] = value
+	}
+}
+
+// RandomGammaSeed2 sets the optional seed2 attribute to value.
+//
+// value: A second seed to avoid seed collision.
+// If not specified, defaults to 0
+func RandomGammaSeed2(value int64) RandomGammaAttr {
+	return func(m optionalAttr) {
+		m["seed2"] = value
+	}
+}
+
+// Outputs random values from the Gamma distribution(s) described by alpha.
+//
+// This op uses the algorithm by Marsaglia et al. to acquire samples via
+// transformation-rejection from pairs of uniform and normal random variables.
+// See http://dl.acm.org/citation.cfm?id=358414
+//
+// Arguments:
+//	shape: 1-D integer tensor. Shape of independent samples to draw from each
+// distribution described by the shape parameters given in alpha.
+//	alpha: A tensor in which each scalar is a "shape" parameter describing the
+// associated gamma distribution.
+//
+// Returns A tensor with shape `shape + shape(alpha)`. Each slice
+// `[:, ..., :, i0, i1, ...iN]` contains the samples drawn for
+// `alpha[i0, i1, ...iN]`. The dtype of the output matches the dtype of alpha.
+func RandomGamma(scope *Scope, shape tf.Output, alpha tf.Output, optional ...RandomGammaAttr) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "RandomGamma",
+		Input: []tf.Input{
+			shape, alpha,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// RandomShuffleAttr is an optional argument to RandomShuffle.
+type RandomShuffleAttr func(optionalAttr)
+
+// RandomShuffleSeed sets the optional seed attribute to value.
+//
+// value: If either `seed` or `seed2` are set to be non-zero, the random number
+// generator is seeded by the given seed.  Otherwise, it is seeded by a
+// random seed.
+// If not specified, defaults to 0
+func RandomShuffleSeed(value int64) RandomShuffleAttr {
+	return func(m optionalAttr) {
+		m["seed"] = value
+	}
+}
+
+// RandomShuffleSeed2 sets the optional seed2 attribute to value.
+//
+// value: A second seed to avoid seed collision.
+// If not specified, defaults to 0
+func RandomShuffleSeed2(value int64) RandomShuffleAttr {
+	return func(m optionalAttr) {
+		m["seed2"] = value
+	}
+}
+
+// Randomly shuffles a tensor along its first dimension.
+//
+//   The tensor is shuffled along dimension 0, such that each `value[j]` is mapped
+//   to one and only one `output[i]`. For example, a mapping that might occur for a
+//   3x2 tensor is:
+//
+// ```prettyprint
+// [[1, 2],       [[5, 6],
+//  [3, 4],  ==>   [1, 2],
+//  [5, 6]]        [3, 4]]
+// ```
+//
+// Arguments:
+//	value: The tensor to be shuffled.
+//
+// Returns A tensor of same shape and type as `value`, shuffled along its first
+// dimension.
+func RandomShuffle(scope *Scope, value tf.Output, optional ...RandomShuffleAttr) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "RandomShuffle",
+		Input: []tf.Input{
+			value,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// FakeQuantWithMinMaxVarsPerChannelAttr is an optional argument to FakeQuantWithMinMaxVarsPerChannel.
+type FakeQuantWithMinMaxVarsPerChannelAttr func(optionalAttr)
+
+// FakeQuantWithMinMaxVarsPerChannelNumBits sets the optional num_bits attribute to value.
+// If not specified, defaults to 8
+func FakeQuantWithMinMaxVarsPerChannelNumBits(value int64) FakeQuantWithMinMaxVarsPerChannelAttr {
+	return func(m optionalAttr) {
+		m["num_bits"] = value
+	}
+}
+
+// Fake-quantize the 'inputs' tensor of type float and one of the shapes: `[d]`,
+//
+// `[b, d]` `[b, h, w, d]` via per-channel floats `min` and `max` of shape `[d]`
+// to 'outputs' tensor of same shape as `inputs`.
+//
+// [min; max] is the clamping range for the 'inputs' data in the corresponding
+// depth channel.  Op divides this range into 255 steps (total of 256 values), then
+// replaces each 'inputs' value with the closest of the quantized step values.
+// 'num_bits' is the bitwidth of the quantization; between 2 and 8, inclusive.
+//
+// This operation has a gradient and thus allows for training `min` and `max` values.
+func FakeQuantWithMinMaxVarsPerChannel(scope *Scope, inputs tf.Output, min tf.Output, max tf.Output, optional ...FakeQuantWithMinMaxVarsPerChannelAttr) (outputs tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "FakeQuantWithMinMaxVarsPerChannel",
+		Input: []tf.Input{
+			inputs, min, max,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// TruncatedNormalAttr is an optional argument to TruncatedNormal.
+type TruncatedNormalAttr func(optionalAttr)
+
+// TruncatedNormalSeed sets the optional seed attribute to value.
+//
+// value: If either `seed` or `seed2` are set to be non-zero, the random number
+// generator is seeded by the given seed.  Otherwise, it is seeded by a
+// random seed.
+// If not specified, defaults to 0
+func TruncatedNormalSeed(value int64) TruncatedNormalAttr {
+	return func(m optionalAttr) {
+		m["seed"] = value
+	}
+}
+
+// TruncatedNormalSeed2 sets the optional seed2 attribute to value.
+//
+// value: A second seed to avoid seed collision.
+// If not specified, defaults to 0
+func TruncatedNormalSeed2(value int64) TruncatedNormalAttr {
+	return func(m optionalAttr) {
+		m["seed2"] = value
+	}
+}
+
+// Outputs random values from a truncated normal distribution.
+//
+// The generated values follow a normal distribution with mean 0 and standard
+// deviation 1, except that values whose magnitude is more than 2 standard
+// deviations from the mean are dropped and re-picked.
+//
+// Arguments:
+//	shape: The shape of the output tensor.
+//	dtype: The type of the output.
+//
+// Returns A tensor of the specified shape filled with random truncated normal
+// values.
+func TruncatedNormal(scope *Scope, shape tf.Output, dtype tf.DataType, optional ...TruncatedNormalAttr) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"dtype": dtype}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "TruncatedNormal",
+		Input: []tf.Input{
+			shape,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// SkipgramAttr is an optional argument to Skipgram.
+type SkipgramAttr func(optionalAttr)
+
+// SkipgramWindowSize sets the optional window_size attribute to value.
+//
+// value: The number of words to predict to the left and right of the target.
+// If not specified, defaults to 5
+func SkipgramWindowSize(value int64) SkipgramAttr {
+	return func(m optionalAttr) {
+		m["window_size"] = value
+	}
+}
+
+// SkipgramMinCount sets the optional min_count attribute to value.
+//
+// value: The minimum number of word occurrences for it to be included in the
+// vocabulary.
+// If not specified, defaults to 5
+func SkipgramMinCount(value int64) SkipgramAttr {
+	return func(m optionalAttr) {
+		m["min_count"] = value
+	}
+}
+
+// SkipgramSubsample sets the optional subsample attribute to value.
+//
+// value: Threshold for word occurrence. Words that appear with higher
+// frequency will be randomly down-sampled. Set to 0 to disable.
+// If not specified, defaults to 0.001
+func SkipgramSubsample(value float32) SkipgramAttr {
+	return func(m optionalAttr) {
+		m["subsample"] = value
+	}
+}
+
+// Parses a text file and creates a batch of examples.
+//
+// DEPRECATED at GraphDef version 19: Moving word2vec into tensorflow_models/tutorials and deprecating its ops here as a result
+//
+// Arguments:
+//	filename: The corpus's text file name.
+//	batch_size: The size of produced batch.
+//
+// Returns A vector of words in the corpus.Frequencies of words. Sorted in the non-ascending order.Number of words per epoch in the data file.The current epoch number.The total number of words processed so far.A vector of word ids.A vector of word ids.
+func Skipgram(scope *Scope, filename string, batch_size int64, optional ...SkipgramAttr) (vocab_word tf.Output, vocab_freq tf.Output, words_per_epoch tf.Output, current_epoch tf.Output, total_words_processed tf.Output, examples tf.Output, labels tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"filename": filename, "batch_size": batch_size}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "Skipgram",
+
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1), op.Output(2), op.Output(3), op.Output(4), op.Output(5), op.Output(6)
+}
+
+// ParameterizedTruncatedNormalAttr is an optional argument to ParameterizedTruncatedNormal.
+type ParameterizedTruncatedNormalAttr func(optionalAttr)
+
+// ParameterizedTruncatedNormalSeed sets the optional seed attribute to value.
+//
+// value: If either `seed` or `seed2` are set to be non-zero, the random number
+// generator is seeded by the given seed.  Otherwise, it is seeded by a
+// random seed.
+// If not specified, defaults to 0
+func ParameterizedTruncatedNormalSeed(value int64) ParameterizedTruncatedNormalAttr {
+	return func(m optionalAttr) {
+		m["seed"] = value
+	}
+}
+
+// ParameterizedTruncatedNormalSeed2 sets the optional seed2 attribute to value.
+//
+// value: A second seed to avoid seed collision.
+// If not specified, defaults to 0
+func ParameterizedTruncatedNormalSeed2(value int64) ParameterizedTruncatedNormalAttr {
+	return func(m optionalAttr) {
+		m["seed2"] = value
+	}
+}
+
+// Outputs random values from a normal distribution. The parameters may each be a
+//
+// scalar which applies to the entire output, or a vector of length shape[0] which
+// stores the parameters for each batch.
+//
+// Arguments:
+//	shape: The shape of the output tensor. Batches are indexed by the 0th dimension.
+//	means: The mean parameter of each batch.
+//	stdevs: The standard deviation parameter of each batch. Must be greater than 0.
+//	minvals: The minimum cutoff. May be -infinity.
+//	maxvals: The maximum cutoff. May be +infinity, and must be more than the minval
+// for each batch.
+//
+// Returns A matrix of shape num_batches x samples_per_batch, filled with random
+// truncated normal values using the parameters for each row.
+func ParameterizedTruncatedNormal(scope *Scope, shape tf.Output, means tf.Output, stdevs tf.Output, minvals tf.Output, maxvals tf.Output, optional ...ParameterizedTruncatedNormalAttr) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "ParameterizedTruncatedNormal",
+		Input: []tf.Input{
+			shape, means, stdevs, minvals, maxvals,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// EncodePngAttr is an optional argument to EncodePng.
+type EncodePngAttr func(optionalAttr)
+
+// EncodePngCompression sets the optional compression attribute to value.
+//
+// value: Compression level.
+// If not specified, defaults to -1
+func EncodePngCompression(value int64) EncodePngAttr {
+	return func(m optionalAttr) {
+		m["compression"] = value
+	}
+}
+
+// PNG-encode an image.
+//
+// `image` is a 3-D uint8 or uint16 Tensor of shape `[height, width, channels]`
+// where `channels` is:
+//
+// *   1: for grayscale.
+// *   2: for grayscale + alpha.
+// *   3: for RGB.
+// *   4: for RGBA.
+//
+// The ZLIB compression level, `compression`, can be -1 for the PNG-encoder
+// default or a value from 0 to 9.  9 is the highest compression level, generating
+// the smallest output, but is slower.
+//
+// Arguments:
+//	image: 3-D with shape `[height, width, channels]`.
+//
+// Returns 0-D. PNG-encoded image.
+func EncodePng(scope *Scope, image tf.Output, optional ...EncodePngAttr) (contents tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "EncodePng",
+		Input: []tf.Input{
+			image,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// RandomUniformIntAttr is an optional argument to RandomUniformInt.
+type RandomUniformIntAttr func(optionalAttr)
+
+// RandomUniformIntSeed sets the optional seed attribute to value.
+//
+// value: If either `seed` or `seed2` are set to be non-zero, the random number
+// generator is seeded by the given seed.  Otherwise, it is seeded by a
+// random seed.
+// If not specified, defaults to 0
+func RandomUniformIntSeed(value int64) RandomUniformIntAttr {
+	return func(m optionalAttr) {
+		m["seed"] = value
+	}
+}
+
+// RandomUniformIntSeed2 sets the optional seed2 attribute to value.
+//
+// value: A second seed to avoid seed collision.
+// If not specified, defaults to 0
+func RandomUniformIntSeed2(value int64) RandomUniformIntAttr {
+	return func(m optionalAttr) {
+		m["seed2"] = value
+	}
+}
+
+// Outputs random integers from a uniform distribution.
+//
+// The generated values are uniform integers in the range `[minval, maxval)`.
+// The lower bound `minval` is included in the range, while the upper bound
+// `maxval` is excluded.
+//
+// The random integers are slightly biased unless `maxval - minval` is an exact
+// power of two.  The bias is small for values of `maxval - minval` significantly
+// smaller than the range of the output (either `2^32` or `2^64`).
+//
+// Arguments:
+//	shape: The shape of the output tensor.
+//	minval: 0-D.  Inclusive lower bound on the generated integers.
+//	maxval: 0-D.  Exclusive upper bound on the generated integers.
+//
+// Returns A tensor of the specified shape filled with uniform random integers.
+func RandomUniformInt(scope *Scope, shape tf.Output, minval tf.Output, maxval tf.Output, optional ...RandomUniformIntAttr) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "RandomUniformInt",
+		Input: []tf.Input{
+			shape, minval, maxval,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// SparseMatMulAttr is an optional argument to SparseMatMul.
+type SparseMatMulAttr func(optionalAttr)
+
+// SparseMatMulTransposeA sets the optional transpose_a attribute to value.
+// If not specified, defaults to false
+func SparseMatMulTransposeA(value bool) SparseMatMulAttr {
+	return func(m optionalAttr) {
+		m["transpose_a"] = value
+	}
+}
+
+// SparseMatMulTransposeB sets the optional transpose_b attribute to value.
+// If not specified, defaults to false
+func SparseMatMulTransposeB(value bool) SparseMatMulAttr {
+	return func(m optionalAttr) {
+		m["transpose_b"] = value
+	}
+}
+
+// SparseMatMulAIsSparse sets the optional a_is_sparse attribute to value.
+// If not specified, defaults to false
+func SparseMatMulAIsSparse(value bool) SparseMatMulAttr {
+	return func(m optionalAttr) {
+		m["a_is_sparse"] = value
+	}
+}
+
+// SparseMatMulBIsSparse sets the optional b_is_sparse attribute to value.
+// If not specified, defaults to false
+func SparseMatMulBIsSparse(value bool) SparseMatMulAttr {
+	return func(m optionalAttr) {
+		m["b_is_sparse"] = value
+	}
+}
+
+// Multiply matrix "a" by matrix "b".
+//
+// The inputs must be two-dimensional matrices and the inner dimension of "a" must
+// match the outer dimension of "b". This op is optimized for the case where at
+// least one of "a" or "b" is sparse. The breakeven for using this versus a dense
+// matrix multiply on one platform was 30% zero values in the sparse matrix.
+func SparseMatMul(scope *Scope, a tf.Output, b tf.Output, optional ...SparseMatMulAttr) (product tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "SparseMatMul",
+		Input: []tf.Input{
+			a, b,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// SdcaOptimizerAttr is an optional argument to SdcaOptimizer.
+type SdcaOptimizerAttr func(optionalAttr)
+
+// SdcaOptimizerAdaptative sets the optional adaptative attribute to value.
+//
+// value: Whether to use Adapative SDCA for the inner loop.
+// If not specified, defaults to false
+func SdcaOptimizerAdaptative(value bool) SdcaOptimizerAttr {
+	return func(m optionalAttr) {
+		m["adaptative"] = value
+	}
+}
+
+// Distributed version of Stochastic Dual Coordinate Ascent (SDCA) optimizer for
+//
+// linear models with L1 + L2 regularization. As global optimization objective is
+// strongly-convex, the optimizer optimizes the dual objective at each step. The
+// optimizer applies each update one example at a time. Examples are sampled
+// uniformly, and the optimizer is learning rate free and enjoys linear convergence
+// rate.
+//
+// [Proximal Stochastic Dual Coordinate Ascent](http://arxiv.org/pdf/1211.2717v1.pdf).<br>
+// Shai Shalev-Shwartz, Tong Zhang. 2012
+//
+// $$Loss Objective = \sum f_{i} (wx_{i}) + (l2 / 2) * |w|^2 + l1 * |w|$$
+//
+// [Adding vs. Averaging in Distributed Primal-Dual Optimization](http://arxiv.org/abs/1502.03508).<br>
+// Chenxin Ma, Virginia Smith, Martin Jaggi, Michael I. Jordan,
+// Peter Richtarik, Martin Takac. 2015
+//
+// [Stochastic Dual Coordinate Ascent with Adaptive Probabilities](https://arxiv.org/abs/1502.08053).<br>
+// Dominik Csiba, Zheng Qu, Peter Richtarik. 2015
+//
+// Arguments:
+//	sparse_example_indices: a list of vectors which contain example indices.
+//	sparse_feature_indices: a list of vectors which contain feature indices.
+//	sparse_feature_values: a list of vectors which contains feature value
+// associated with each feature group.
+//	dense_features: a list of matrices which contains the dense feature values.
+//	example_weights: a vector which contains the weight associated with each
+// example.
+//	example_labels: a vector which contains the label/target associated with each
+// example.
+//	sparse_indices: a list of vectors where each value is the indices which has
+// corresponding weights in sparse_weights. This field maybe ommitted for the
+// dense approach.
+//	sparse_weights: a list of vectors where each value is the weight associated with
+// a sparse feature group.
+//	dense_weights: a list of vectors where the values are the weights associated
+// with a dense feature group.
+//	example_state_data: a list of vectors containing the example state data.
+//	loss_type: Type of the primal loss. Currently SdcaSolver supports logistic,
+// squared and hinge losses.
+//	l1: Symmetric l1 regularization strength.
+//	l2: Symmetric l2 regularization strength.
+//	num_loss_partitions: Number of partitions of the global loss function.
+//	num_inner_iterations: Number of iterations per mini-batch.
+//
+// Returns a list of vectors containing the updated example state
+// data.a list of vectors where each value is the delta
+// weights associated with a sparse feature group.a list of vectors where the values are the delta
+// weights associated with a dense feature group.
+func SdcaOptimizer(scope *Scope, sparse_example_indices []tf.Output, sparse_feature_indices []tf.Output, sparse_feature_values []tf.Output, dense_features []tf.Output, example_weights tf.Output, example_labels tf.Output, sparse_indices []tf.Output, sparse_weights []tf.Output, dense_weights []tf.Output, example_state_data tf.Output, loss_type string, l1 float32, l2 float32, num_loss_partitions int64, num_inner_iterations int64, optional ...SdcaOptimizerAttr) (out_example_state_data tf.Output, out_delta_sparse_weights []tf.Output, out_delta_dense_weights []tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"loss_type": loss_type, "l1": l1, "l2": l2, "num_loss_partitions": num_loss_partitions, "num_inner_iterations": num_inner_iterations}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "SdcaOptimizer",
+		Input: []tf.Input{
+			tf.OutputList(sparse_example_indices), tf.OutputList(sparse_feature_indices), tf.OutputList(sparse_feature_values), tf.OutputList(dense_features), example_weights, example_labels, tf.OutputList(sparse_indices), tf.OutputList(sparse_weights), tf.OutputList(dense_weights), example_state_data,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	if scope.Err() != nil {
+		return
+	}
+	var idx int
+	var err error
+	out_example_state_data = op.Output(idx)
+	if out_delta_sparse_weights, idx, err = makeOutputList(op, idx, "out_delta_sparse_weights"); err != nil {
+		scope.UpdateErr("SdcaOptimizer", err)
+		return
+	}
+	if out_delta_dense_weights, idx, err = makeOutputList(op, idx, "out_delta_dense_weights"); err != nil {
+		scope.UpdateErr("SdcaOptimizer", err)
+		return
+	}
+	return out_example_state_data, out_delta_sparse_weights, out_delta_dense_weights
+}
+
+// Computes the minimum along segments of a tensor.
+//
+// Read @{$math_ops#segmentation$the section on segmentation} for an explanation of
+// segments.
+//
+// Computes a tensor such that
+// \\(output_i = \min_j(data_j)\\) where `min` is over `j` such
+// that `segment_ids[j] == i`.
+//
+// If the min is empty for a given segment ID `i`, `output[i] = 0`.
+//
+// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
+// <img style="width:100%" src="https://www.tensorflow.org/images/SegmentMin.png" alt>
+// </div>
+//
+// Arguments:
+//
+//	segment_ids: A 1-D tensor whose rank is equal to the rank of `data`'s
+// first dimension.  Values should be sorted and can be repeated.
+//
+// Returns Has same shape as data, except for dimension 0 which
+// has size `k`, the number of segments.
+func SegmentMin(scope *Scope, data tf.Output, segment_ids tf.Output) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "SegmentMin",
+		Input: []tf.Input{
+			data, segment_ids,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// RestoreAttr is an optional argument to Restore.
+type RestoreAttr func(optionalAttr)
+
+// RestorePreferredShard sets the optional preferred_shard attribute to value.
+//
+// value: Index of file to open first if multiple files match
+// `file_pattern`.
+// If not specified, defaults to -1
+func RestorePreferredShard(value int64) RestoreAttr {
+	return func(m optionalAttr) {
+		m["preferred_shard"] = value
+	}
+}
+
+// Restores a tensor from checkpoint files.
+//
+// Reads a tensor stored in one or several files. If there are several files (for
+// instance because a tensor was saved as slices), `file_pattern` may contain
+// wildcard symbols (`*` and `?`) in the filename portion only, not in the
+// directory portion.
+//
+// If a `file_pattern` matches several files, `preferred_shard` can be used to hint
+// in which file the requested tensor is likely to be found. This op will first
+// open the file at index `preferred_shard` in the list of matching files and try
+// to restore tensors from that file.  Only if some tensors or tensor slices are
+// not found in that first file, then the Op opens all the files. Setting
+// `preferred_shard` to match the value passed as the `shard` input
+// of a matching `Save` Op may speed up Restore.  This attribute only affects
+// performance, not correctness.  The default value -1 means files are processed in
+// order.
+//
+// See also `RestoreSlice`.
+//
+// Arguments:
+//	file_pattern: Must have a single element. The pattern of the files from
+// which we read the tensor.
+//	tensor_name: Must have a single element. The name of the tensor to be
+// restored.
+//	dt: The type of the tensor to be restored.
+//
+// Returns The restored tensor.
+func Restore(scope *Scope, file_pattern tf.Output, tensor_name tf.Output, dt tf.DataType, optional ...RestoreAttr) (tensor tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"dt": dt}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "Restore",
+		Input: []tf.Input{
+			file_pattern, tensor_name,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// FusedResizeAndPadConv2DAttr is an optional argument to FusedResizeAndPadConv2D.
+type FusedResizeAndPadConv2DAttr func(optionalAttr)
+
+// FusedResizeAndPadConv2DResizeAlignCorners sets the optional resize_align_corners attribute to value.
+//
+// value: If true, rescale input by (new_height - 1) / (height - 1),
+// which exactly aligns the 4 corners of images and resized images. If false, rescale
+// by new_height / height. Treat similarly the width dimension.
+// If not specified, defaults to false
+func FusedResizeAndPadConv2DResizeAlignCorners(value bool) FusedResizeAndPadConv2DAttr {
+	return func(m optionalAttr) {
+		m["resize_align_corners"] = value
+	}
+}
+
+// Performs a resize and padding as a preprocess during a convolution.
+//
+// It's often possible to do spatial transformations more efficiently as part of
+// the packing stage of a convolution, so this op allows for an optimized
+// implementation where these stages are fused together. This prevents the need to
+// write out the intermediate results as whole tensors, reducing memory pressure,
+// and we can get some latency gains by merging the transformation calculations.
+// The data_format attribute for Conv2D isn't supported by this op, and defaults to
+// 'NHWC' order.
+// Internally this op uses a single per-graph scratch buffer, which means that it
+// will block if multiple versions are being run in parallel. This is because this
+// operator is primarily an optimization to minimize memory usage.
+//
+// Arguments:
+//	input: 4-D with shape `[batch, in_height, in_width, in_channels]`.
+//	size: A 1-D int32 Tensor of 2 elements: `new_height, new_width`.  The
+// new size for the images.
+//	paddings: A two-column matrix specifying the padding sizes. The number of
+// rows must be the same as the rank of `input`.
+//	filter: 4-D with shape
+// `[filter_height, filter_width, in_channels, out_channels]`.
+//
+//	strides: 1-D of length 4.  The stride of the sliding window for each dimension
+// of `input`. Must be in the same order as the dimension specified with format.
+//	padding: The type of padding algorithm to use.
+func FusedResizeAndPadConv2D(scope *Scope, input tf.Output, size tf.Output, paddings tf.Output, filter tf.Output, mode string, strides []int64, padding string, optional ...FusedResizeAndPadConv2DAttr) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"mode": mode, "strides": strides, "padding": padding}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "FusedResizeAndPadConv2D",
+		Input: []tf.Input{
+			input, size, paddings, filter,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// DenseToSparseSetOperationAttr is an optional argument to DenseToSparseSetOperation.
+type DenseToSparseSetOperationAttr func(optionalAttr)
+
+// DenseToSparseSetOperationValidateIndices sets the optional validate_indices attribute to value.
+// If not specified, defaults to true
+func DenseToSparseSetOperationValidateIndices(value bool) DenseToSparseSetOperationAttr {
+	return func(m optionalAttr) {
+		m["validate_indices"] = value
+	}
+}
+
+// Applies set operation along last dimension of `Tensor` and `SparseTensor`.
+//
+// See SetOperationOp::SetOperationFromContext for values of `set_operation`.
+//
+// Input `set2` is a `SparseTensor` represented by `set2_indices`, `set2_values`,
+// and `set2_shape`. For `set2` ranked `n`, 1st `n-1` dimensions must be the same
+// as `set1`. Dimension `n` contains values in a set, duplicates are allowed but
+// ignored.
+//
+// If `validate_indices` is `True`, this op validates the order and range of `set2`
+// indices.
+//
+// Output `result` is a `SparseTensor` represented by `result_indices`,
+// `result_values`, and `result_shape`. For `set1` and `set2` ranked `n`, this
+// has rank `n` and the same 1st `n-1` dimensions as `set1` and `set2`. The `nth`
+// dimension contains the result of `set_operation` applied to the corresponding
+// `[0...n-1]` dimension of `set`.
 //
 // Arguments:
-//	input: 1-D or higher.
-//	multiples: 1-D. Length must be the same as the number of dimensions in `input`
-func Tile(scope *Scope, input tf.Output, multiples tf.Output) (output tf.Output) {
+//	set1: `Tensor` with rank `n`. 1st `n-1` dimensions must be the same as `set2`.
+// Dimension `n` contains values in a set, duplicates are allowed but ignored.
+//	set2_indices: 2D `Tensor`, indices of a `SparseTensor`. Must be in row-major
+// order.
+//	set2_values: 1D `Tensor`, values of a `SparseTensor`. Must be in row-major
+// order.
+//	set2_shape: 1D `Tensor`, shape of a `SparseTensor`. `set2_shape[0...n-1]` must
+// be the same as the 1st `n-1` dimensions of `set1`, `result_shape[n]` is the
+// max set size across `n-1` dimensions.
+//
+//
+// Returns 2D indices of a `SparseTensor`.1D values of a `SparseTensor`.1D `Tensor` shape of a `SparseTensor`. `result_shape[0...n-1]` is
+// the same as the 1st `n-1` dimensions of `set1` and `set2`, `result_shape[n]`
+// is the max result set size across all `0...n-1` dimensions.
+func DenseToSparseSetOperation(scope *Scope, set1 tf.Output, set2_indices tf.Output, set2_values tf.Output, set2_shape tf.Output, set_operation string, optional ...DenseToSparseSetOperationAttr) (result_indices tf.Output, result_values tf.Output, result_shape tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"set_operation": set_operation}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "Tile",
+		Type: "DenseToSparseSetOperation",
 		Input: []tf.Input{
-			input, multiples,
+			set1, set2_indices, set2_values, set2_shape,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// Returns the element-wise min of two SparseTensors.
-//
-// Assumes the two SparseTensors have the same shape, i.e., no broadcasting.
+// Delete the tensor specified by its handle in the session.
 //
 // Arguments:
-//	a_indices: 2-D.  `N x R` matrix with the indices of non-empty values in a
-// SparseTensor, in the canonical lexicographic ordering.
-//	a_values: 1-D.  `N` non-empty values corresponding to `a_indices`.
-//	a_shape: 1-D.  Shape of the input SparseTensor.
-//	b_indices: counterpart to `a_indices` for the other operand.
-//	b_values: counterpart to `a_values` for the other operand; must be of the same dtype.
-//	b_shape: counterpart to `a_shape` for the other operand; the two shapes must be equal.
+//	handle: The handle for a tensor stored in the session state.
 //
-// Returns 2-D.  The indices of the output SparseTensor.1-D.  The values of the output SparseTensor.
-func SparseSparseMinimum(scope *Scope, a_indices tf.Output, a_values tf.Output, a_shape tf.Output, b_indices tf.Output, b_values tf.Output, b_shape tf.Output) (output_indices tf.Output, output_values tf.Output) {
+// Returns the created operation.
+func DeleteSessionTensor(scope *Scope, handle tf.Output) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "SparseSparseMinimum",
+		Type: "DeleteSessionTensor",
 		Input: []tf.Input{
-			a_indices, a_values, a_shape, b_indices, b_values, b_shape,
+			handle,
 		},
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1)
+	return scope.AddOperation(opspec)
 }
 
-// Returns the set of files matching one or more glob patterns.
+// DenseToDenseSetOperationAttr is an optional argument to DenseToDenseSetOperation.
+type DenseToDenseSetOperationAttr func(optionalAttr)
+
+// DenseToDenseSetOperationValidateIndices sets the optional validate_indices attribute to value.
+// If not specified, defaults to true
+func DenseToDenseSetOperationValidateIndices(value bool) DenseToDenseSetOperationAttr {
+	return func(m optionalAttr) {
+		m["validate_indices"] = value
+	}
+}
+
+// Applies set operation along last dimension of 2 `Tensor` inputs.
 //
-// Note that this routine only supports wildcard characters in the
-// basename portion of the pattern, not in the directory portion.
+// See SetOperationOp::SetOperationFromContext for values of `set_operation`.
+//
+// Output `result` is a `SparseTensor` represented by `result_indices`,
+// `result_values`, and `result_shape`. For `set1` and `set2` ranked `n`, this
+// has rank `n` and the same 1st `n-1` dimensions as `set1` and `set2`. The `nth`
+// dimension contains the result of `set_operation` applied to the corresponding
+// `[0...n-1]` dimension of `set`.
 //
 // Arguments:
-//	pattern: Shell wildcard pattern(s). Scalar or vector of type string.
+//	set1: `Tensor` with rank `n`. 1st `n-1` dimensions must be the same as `set2`.
+// Dimension `n` contains values in a set, duplicates are allowed but ignored.
+//	set2: `Tensor` with rank `n`. 1st `n-1` dimensions must be the same as `set1`.
+// Dimension `n` contains values in a set, duplicates are allowed but ignored.
 //
-// Returns A vector of matching filenames.
-func MatchingFiles(scope *Scope, pattern tf.Output) (filenames tf.Output) {
+//
+// Returns 2D indices of a `SparseTensor`.1D values of a `SparseTensor`.1D `Tensor` shape of a `SparseTensor`. `result_shape[0...n-1]` is
+// the same as the 1st `n-1` dimensions of `set1` and `set2`, `result_shape[n]`
+// is the max result set size across all `0...n-1` dimensions.
+func DenseToDenseSetOperation(scope *Scope, set1 tf.Output, set2 tf.Output, set_operation string, optional ...DenseToDenseSetOperationAttr) (result_indices tf.Output, result_values tf.Output, result_shape tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"set_operation": set_operation}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "MatchingFiles",
+		Type: "DenseToDenseSetOperation",
 		Input: []tf.Input{
-			pattern,
+			set1, set2,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// Computes the gradient of the sigmoid of `x` wrt its input.
-//
-// Specifically, `grad = dy * y * (1 - y)`, where `y = sigmoid(x)`, and
-// `dy` is the corresponding input gradient.
-func SigmoidGrad(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "SigmoidGrad",
-		Input: []tf.Input{
-			x, y,
-		},
+// SetSizeAttr is an optional argument to SetSize.
+type SetSizeAttr func(optionalAttr)
+
+// SetSizeValidateIndices sets the optional validate_indices attribute to value.
+// If not specified, defaults to true
+func SetSizeValidateIndices(value bool) SetSizeAttr {
+	return func(m optionalAttr) {
+		m["validate_indices"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// Generate a sharded filename. The filename is printf formatted as
+// Number of unique elements along last dimension of input `set`.
 //
-//    %s-%05d-of-%05d, basename, shard, num_shards.
-func ShardedFilename(scope *Scope, basename tf.Output, shard tf.Output, num_shards tf.Output) (filename tf.Output) {
+// Input `set` is a `SparseTensor` represented by `set_indices`, `set_values`,
+// and `set_shape`. The last dimension contains values in a set, duplicates are
+// allowed but ignored.
+//
+// If `validate_indices` is `True`, this op validates the order and range of `set`
+// indices.
+//
+// Arguments:
+//	set_indices: 2D `Tensor`, indices of a `SparseTensor`.
+//	set_values: 1D `Tensor`, values of a `SparseTensor`.
+//	set_shape: 1D `Tensor`, shape of a `SparseTensor`.
+//
+// Returns For `set` ranked `n`, this is a `Tensor` with rank `n-1`, and the same 1st
+// `n-1` dimensions as `set`. Each value is the number of unique elements in
+// the corresponding `[0...n-1]` dimension of `set`.
+func SetSize(scope *Scope, set_indices tf.Output, set_values tf.Output, set_shape tf.Output, optional ...SetSizeAttr) (size tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "ShardedFilename",
+		Type: "SetSize",
 		Input: []tf.Input{
-			basename, shard, num_shards,
+			set_indices, set_values, set_shape,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// SparseToSparseSetOperationAttr is an optional argument to SparseToSparseSetOperation.
-type SparseToSparseSetOperationAttr func(optionalAttr)
+// TakeManySparseFromTensorsMapAttr is an optional argument to TakeManySparseFromTensorsMap.
+type TakeManySparseFromTensorsMapAttr func(optionalAttr)
 
-// SparseToSparseSetOperationValidateIndices sets the optional validate_indices attribute to value.
-// If not specified, defaults to true
-func SparseToSparseSetOperationValidateIndices(value bool) SparseToSparseSetOperationAttr {
+// TakeManySparseFromTensorsMapContainer sets the optional container attribute to value.
+//
+// value: The container name for the `SparseTensorsMap` read by this op.
+// If not specified, defaults to ""
+func TakeManySparseFromTensorsMapContainer(value string) TakeManySparseFromTensorsMapAttr {
 	return func(m optionalAttr) {
-		m["validate_indices"] = value
+		m["container"] = value
 	}
 }
 
-// Applies set operation along last dimension of 2 `SparseTensor` inputs.
+// TakeManySparseFromTensorsMapSharedName sets the optional shared_name attribute to value.
 //
-// See SetOperationOp::SetOperationFromContext for values of `set_operation`.
+// value: The shared name for the `SparseTensorsMap` read by this op.
+// It should not be blank; rather the `shared_name` or unique Operation name
+// of the Op that created the original `SparseTensorsMap` should be used.
+// If not specified, defaults to ""
+func TakeManySparseFromTensorsMapSharedName(value string) TakeManySparseFromTensorsMapAttr {
+	return func(m optionalAttr) {
+		m["shared_name"] = value
+	}
+}
+
+// Read `SparseTensors` from a `SparseTensorsMap` and concatenate them.
 //
-// If `validate_indices` is `True`, `SparseToSparseSetOperation` validates the
-// order and range of `set1` and `set2` indices.
+// The input `sparse_handles` must be an `int64` matrix of shape `[N, 1]` where
+// `N` is the minibatch size and the rows correspond to the output handles of
+// `AddSparseToTensorsMap` or `AddManySparseToTensorsMap`.  The ranks of the
+// original `SparseTensor` objects that went into the given input ops must all
+// match.  When the final `SparseTensor` is created, it has rank one
+// higher than the ranks of the incoming `SparseTensor` objects
+// (they have been concatenated along a new row dimension on the left).
 //
-// Input `set1` is a `SparseTensor` represented by `set1_indices`, `set1_values`,
-// and `set1_shape`. For `set1` ranked `n`, 1st `n-1` dimensions must be the same
-// as `set2`. Dimension `n` contains values in a set, duplicates are allowed but
-// ignored.
+// The output `SparseTensor` object's shape values for all dimensions but the
+// first are the max across the input `SparseTensor` objects' shape values
+// for the corresponding dimensions.  Its first shape value is `N`, the minibatch
+// size.
 //
-// Input `set2` is a `SparseTensor` represented by `set2_indices`, `set2_values`,
-// and `set2_shape`. For `set2` ranked `n`, 1st `n-1` dimensions must be the same
-// as `set1`. Dimension `n` contains values in a set, duplicates are allowed but
-// ignored.
+// The input `SparseTensor` objects' indices are assumed ordered in
+// standard lexicographic order.  If this is not the case, after this
+// step run `SparseReorder` to restore index ordering.
 //
-// If `validate_indices` is `True`, this op validates the order and range of `set1`
-// and `set2` indices.
+// For example, if the handles represent an input, which is a `[2, 3]` matrix
+// representing two original `SparseTensor` objects:
 //
-// Output `result` is a `SparseTensor` represented by `result_indices`,
-// `result_values`, and `result_shape`. For `set1` and `set2` ranked `n`, this
-// has rank `n` and the same 1st `n-1` dimensions as `set1` and `set2`. The `nth`
-// dimension contains the result of `set_operation` applied to the corresponding
-// `[0...n-1]` dimension of `set`.
+// ```
+//     index = [ 0]
+//             [10]
+//             [20]
+//     values = [1, 2, 3]
+//     shape = [50]
+// ```
 //
-// Arguments:
-//	set1_indices: 2D `Tensor`, indices of a `SparseTensor`. Must be in row-major
-// order.
-//	set1_values: 1D `Tensor`, values of a `SparseTensor`. Must be in row-major
-// order.
-//	set1_shape: 1D `Tensor`, shape of a `SparseTensor`. `set1_shape[0...n-1]` must
-// be the same as `set2_shape[0...n-1]`, `set1_shape[n]` is the
-// max set size across `0...n-1` dimensions.
-//	set2_indices: 2D `Tensor`, indices of a `SparseTensor`. Must be in row-major
-// order.
-//	set2_values: 1D `Tensor`, values of a `SparseTensor`. Must be in row-major
-// order.
-//	set2_shape: 1D `Tensor`, shape of a `SparseTensor`. `set2_shape[0...n-1]` must
-// be the same as `set1_shape[0...n-1]`, `set2_shape[n]` is the
-// max set size across `0...n-1` dimensions.
+// and
+//
+// ```
+//     index = [ 2]
+//             [10]
+//     values = [4, 5]
+//     shape = [30]
+// ```
+//
+// then the final `SparseTensor` will be:
+//
+// ```
+//     index = [0  0]
+//             [0 10]
+//             [0 20]
+//             [1  2]
+//             [1 10]
+//     values = [1, 2, 3, 4, 5]
+//     shape = [2 50]
+// ```
 //
+// Arguments:
+//	sparse_handles: 1-D, The `N` serialized `SparseTensor` objects.
+// Shape: `[N]`.
+//	dtype: The `dtype` of the `SparseTensor` objects stored in the
+// `SparseTensorsMap`.
 //
-// Returns 2D indices of a `SparseTensor`.1D values of a `SparseTensor`.1D `Tensor` shape of a `SparseTensor`. `result_shape[0...n-1]` is
-// the same as the 1st `n-1` dimensions of `set1` and `set2`, `result_shape[n]`
-// is the max result set size across all `0...n-1` dimensions.
-func SparseToSparseSetOperation(scope *Scope, set1_indices tf.Output, set1_values tf.Output, set1_shape tf.Output, set2_indices tf.Output, set2_values tf.Output, set2_shape tf.Output, set_operation string, optional ...SparseToSparseSetOperationAttr) (result_indices tf.Output, result_values tf.Output, result_shape tf.Output) {
+// Returns 2-D.  The `indices` of the minibatch `SparseTensor`.1-D.  The `values` of the minibatch `SparseTensor`.1-D.  The `shape` of the minibatch `SparseTensor`.
+func TakeManySparseFromTensorsMap(scope *Scope, sparse_handles tf.Output, dtype tf.DataType, optional ...TakeManySparseFromTensorsMapAttr) (sparse_indices tf.Output, sparse_values tf.Output, sparse_shape tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"set_operation": set_operation}
+	attrs := map[string]interface{}{"dtype": dtype}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "SparseToSparseSetOperation",
+		Type: "TakeManySparseFromTensorsMap",
 		Input: []tf.Input{
-			set1_indices, set1_values, set1_shape, set2_indices, set2_values, set2_shape,
+			sparse_handles,
 		},
 		Attrs: attrs,
 	}
@@ -8799,915 +9288,1052 @@ func SparseToSparseSetOperation(scope *Scope, set1_indices tf.Output, set1_value
 	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// Adds up a `SparseTensor` and a dense `Tensor`, producing a dense `Tensor`.
+// AddSparseToTensorsMapAttr is an optional argument to AddSparseToTensorsMap.
+type AddSparseToTensorsMapAttr func(optionalAttr)
+
+// AddSparseToTensorsMapContainer sets the optional container attribute to value.
 //
-// This Op does not require `a_indices` be sorted in standard lexicographic order.
+// value: The container name for the `SparseTensorsMap` created by this op.
+// If not specified, defaults to ""
+func AddSparseToTensorsMapContainer(value string) AddSparseToTensorsMapAttr {
+	return func(m optionalAttr) {
+		m["container"] = value
+	}
+}
+
+// AddSparseToTensorsMapSharedName sets the optional shared_name attribute to value.
+//
+// value: The shared name for the `SparseTensorsMap` created by this op.
+// If blank, the new Operation's unique name is used.
+// If not specified, defaults to ""
+func AddSparseToTensorsMapSharedName(value string) AddSparseToTensorsMapAttr {
+	return func(m optionalAttr) {
+		m["shared_name"] = value
+	}
+}
+
+// Add a `SparseTensor` to a `SparseTensorsMap` return its handle.
+//
+// A `SparseTensor` is represented by three tensors: `sparse_indices`,
+// `sparse_values`, and `sparse_shape`.
+//
+// This operator takes the given `SparseTensor` and adds it to a container
+// object (a `SparseTensorsMap`).  A unique key within this container is generated
+// in the form of an `int64`, and this is the value that is returned.
+//
+// The `SparseTensor` can then be read out as part of a minibatch by passing
+// the key as a vector element to `TakeManySparseFromTensorsMap`.  To ensure
+// the correct `SparseTensorsMap` is accessed, ensure that the same
+// `container` and `shared_name` are passed to that Op.  If no `shared_name`
+// is provided here, instead use the *name* of the Operation created by calling
+// `AddSparseToTensorsMap` as the `shared_name` passed to
+// `TakeManySparseFromTensorsMap`.  Ensure the Operations are colocated.
 //
 // Arguments:
-//	a_indices: 2-D.  The `indices` of the `SparseTensor`, with shape `[nnz, ndims]`.
-//	a_values: 1-D.  The `values` of the `SparseTensor`, with shape `[nnz]`.
-//	a_shape: 1-D.  The `shape` of the `SparseTensor`, with shape `[ndims]`.
-//	b: `ndims`-D Tensor.  With shape `a_shape`.
-func SparseTensorDenseAdd(scope *Scope, a_indices tf.Output, a_values tf.Output, a_shape tf.Output, b tf.Output) (output tf.Output) {
+//	sparse_indices: 2-D.  The `indices` of the `SparseTensor`.
+//	sparse_values: 1-D.  The `values` of the `SparseTensor`.
+//	sparse_shape: 1-D.  The `shape` of the `SparseTensor`.
+//
+// Returns 0-D.  The handle of the `SparseTensor` now stored in the
+// `SparseTensorsMap`.
+func AddSparseToTensorsMap(scope *Scope, sparse_indices tf.Output, sparse_values tf.Output, sparse_shape tf.Output, optional ...AddSparseToTensorsMapAttr) (sparse_handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "SparseTensorDenseAdd",
+		Type: "AddSparseToTensorsMap",
 		Input: []tf.Input{
-			a_indices, a_values, a_shape, b,
+			sparse_indices, sparse_values, sparse_shape,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Concatenates a list of `SparseTensor` along the specified dimension.
-//
-// Concatenation is with respect to the dense versions of these sparse tensors.
-// It is assumed that each input is a `SparseTensor` whose elements are ordered
-// along increasing dimension number.
-//
-// All inputs' shapes must match, except for the concat dimension.  The
-// `indices`, `values`, and `shapes` lists must have the same length.
-//
-// The output shape is identical to the inputs', except along the concat
-// dimension, where it is the sum of the inputs' sizes along that dimension.
-//
-// The output elements will be resorted to preserve the sort order along
-// increasing dimension number.
-//
-// This op runs in `O(M log M)` time, where `M` is the total number of non-empty
-// values across all inputs. This is due to the need for an internal sort in
-// order to concatenate efficiently across an arbitrary dimension.
-//
-// For example, if `concat_dim = 1` and the inputs are
-//
-//     sp_inputs[0]: shape = [2, 3]
-//     [0, 2]: "a"
-//     [1, 0]: "b"
-//     [1, 1]: "c"
-//
-//     sp_inputs[1]: shape = [2, 4]
-//     [0, 1]: "d"
-//     [0, 2]: "e"
-//
-// then the output will be
-//
-//     shape = [2, 7]
-//     [0, 2]: "a"
-//     [0, 4]: "d"
-//     [0, 5]: "e"
-//     [1, 0]: "b"
-//     [1, 1]: "c"
+// Computes softmax activations.
 //
-// Graphically this is equivalent to doing
+// For each batch `i` and class `j` we have
 //
-//     [    a] concat [  d e  ] = [    a   d e  ]
-//     [b c  ]        [       ]   [b c          ]
+//     softmax[i, j] = exp(logits[i, j]) / sum_j(exp(logits[i, j]))
 //
 // Arguments:
-//	indices: 2-D.  Indices of each input `SparseTensor`.
-//	values: 1-D.  Non-empty values of each `SparseTensor`.
-//	shapes: 1-D.  Shapes of each `SparseTensor`.
-//	concat_dim: Dimension to concatenate along. Must be in range [-rank, rank),
-// where rank is the number of dimensions in each input `SparseTensor`.
+//	logits: 2-D with shape `[batch_size, num_classes]`.
 //
-// Returns 2-D.  Indices of the concatenated `SparseTensor`.1-D.  Non-empty values of the concatenated `SparseTensor`.1-D.  Shape of the concatenated `SparseTensor`.
-func SparseConcat(scope *Scope, indices []tf.Output, values []tf.Output, shapes []tf.Output, concat_dim int64) (output_indices tf.Output, output_values tf.Output, output_shape tf.Output) {
+// Returns Same shape as `logits`.
+func Softmax(scope *Scope, logits tf.Output) (softmax tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"concat_dim": concat_dim}
 	opspec := tf.OpSpec{
-		Type: "SparseConcat",
+		Type: "Softmax",
 		Input: []tf.Input{
-			tf.OutputList(indices), tf.OutputList(values), tf.OutputList(shapes),
+			logits,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
+	return op.Output(0)
 }
 
-// IdentityReaderV2Attr is an optional argument to IdentityReaderV2.
-type IdentityReaderV2Attr func(optionalAttr)
+// RandomShuffleQueueV2Attr is an optional argument to RandomShuffleQueueV2.
+type RandomShuffleQueueV2Attr func(optionalAttr)
 
-// IdentityReaderV2Container sets the optional container attribute to value.
+// RandomShuffleQueueV2Shapes sets the optional shapes attribute to value.
 //
-// value: If non-empty, this reader is placed in the given container.
-// Otherwise, a default container is used.
-// If not specified, defaults to ""
-func IdentityReaderV2Container(value string) IdentityReaderV2Attr {
+// value: The shape of each component in a value. The length of this attr must
+// be either 0 or the same as the length of component_types. If the length of
+// this attr is 0, the shapes of queue elements are not constrained, and
+// only one element may be dequeued at a time.
+// If not specified, defaults to <>
+//
+// REQUIRES: len(value) >= 0
+func RandomShuffleQueueV2Shapes(value []tf.Shape) RandomShuffleQueueV2Attr {
 	return func(m optionalAttr) {
-		m["container"] = value
+		m["shapes"] = value
 	}
 }
 
-// IdentityReaderV2SharedName sets the optional shared_name attribute to value.
+// RandomShuffleQueueV2Capacity sets the optional capacity attribute to value.
 //
-// value: If non-empty, this reader is named in the given bucket
-// with this shared_name. Otherwise, the node name is used instead.
-// If not specified, defaults to ""
-func IdentityReaderV2SharedName(value string) IdentityReaderV2Attr {
+// value: The upper bound on the number of elements in this queue.
+// Negative numbers mean no limit.
+// If not specified, defaults to -1
+func RandomShuffleQueueV2Capacity(value int64) RandomShuffleQueueV2Attr {
 	return func(m optionalAttr) {
-		m["shared_name"] = value
+		m["capacity"] = value
 	}
 }
 
-// A Reader that outputs the queued work as both the key and value.
-//
-// To use, enqueue strings in a Queue.  ReaderRead will take the front
-// work string and output (work, work).
+// RandomShuffleQueueV2MinAfterDequeue sets the optional min_after_dequeue attribute to value.
 //
-// Returns The handle to reference the Reader.
-func IdentityReaderV2(scope *Scope, optional ...IdentityReaderV2Attr) (reader_handle tf.Output) {
-	if scope.Err() != nil {
-		return
+// value: Dequeue will block unless there would be this
+// many elements after the dequeue or the queue is closed. This
+// ensures a minimum level of mixing of elements.
+// If not specified, defaults to 0
+func RandomShuffleQueueV2MinAfterDequeue(value int64) RandomShuffleQueueV2Attr {
+	return func(m optionalAttr) {
+		m["min_after_dequeue"] = value
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
+}
+
+// RandomShuffleQueueV2Seed sets the optional seed attribute to value.
+//
+// value: If either seed or seed2 is set to be non-zero, the random number
+// generator is seeded by the given seed.  Otherwise, a random seed is used.
+// If not specified, defaults to 0
+func RandomShuffleQueueV2Seed(value int64) RandomShuffleQueueV2Attr {
+	return func(m optionalAttr) {
+		m["seed"] = value
 	}
-	opspec := tf.OpSpec{
-		Type: "IdentityReaderV2",
+}
 
-		Attrs: attrs,
+// RandomShuffleQueueV2Seed2 sets the optional seed2 attribute to value.
+//
+// value: A second seed to avoid seed collision.
+// If not specified, defaults to 0
+func RandomShuffleQueueV2Seed2(value int64) RandomShuffleQueueV2Attr {
+	return func(m optionalAttr) {
+		m["seed2"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// Performs a padding as a preprocess during a convolution.
+// RandomShuffleQueueV2Container sets the optional container attribute to value.
 //
-// Similar to FusedResizeAndPadConv2d, this op allows for an optimized
-// implementation where the spatial padding transformation stage is fused with the
-// im2col lookup, but in this case without the bilinear filtering required for
-// resizing. Fusing the padding prevents the need to write out the intermediate
-// results as whole tensors, reducing memory pressure, and we can get some latency
-// gains by merging the transformation calculations.
-// The data_format attribute for Conv2D isn't supported by this op, and 'NHWC'
-// order is used instead.
-// Internally this op uses a single per-graph scratch buffer, which means that it
-// will block if multiple versions are being run in parallel. This is because this
-// operator is primarily an optimization to minimize memory usage.
+// value: If non-empty, this queue is placed in the given container.
+// Otherwise, a default container is used.
+// If not specified, defaults to ""
+func RandomShuffleQueueV2Container(value string) RandomShuffleQueueV2Attr {
+	return func(m optionalAttr) {
+		m["container"] = value
+	}
+}
+
+// RandomShuffleQueueV2SharedName sets the optional shared_name attribute to value.
 //
-// Arguments:
-//	input: 4-D with shape `[batch, in_height, in_width, in_channels]`.
-//	paddings: A two-column matrix specifying the padding sizes. The number of
-// rows must be the same as the rank of `input`.
-//	filter: 4-D with shape
-// `[filter_height, filter_width, in_channels, out_channels]`.
+// value: If non-empty, this queue will be shared under the given name
+// across multiple sessions.
+// If not specified, defaults to ""
+func RandomShuffleQueueV2SharedName(value string) RandomShuffleQueueV2Attr {
+	return func(m optionalAttr) {
+		m["shared_name"] = value
+	}
+}
+
+// A queue that randomizes the order of elements.
 //
-//	strides: 1-D of length 4.  The stride of the sliding window for each dimension
-// of `input`. Must be in the same order as the dimension specified with format.
-//	padding: The type of padding algorithm to use.
-func FusedPadConv2D(scope *Scope, input tf.Output, paddings tf.Output, filter tf.Output, mode string, strides []int64, padding string) (output tf.Output) {
+// Arguments:
+//	component_types: The type of each component in a value.
+//
+// Returns The handle to the queue.
+func RandomShuffleQueueV2(scope *Scope, component_types []tf.DataType, optional ...RandomShuffleQueueV2Attr) (handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"mode": mode, "strides": strides, "padding": padding}
+	attrs := map[string]interface{}{"component_types": component_types}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "FusedPadConv2D",
-		Input: []tf.Input{
-			input, paddings, filter,
-		},
+		Type: "RandomShuffleQueueV2",
+
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Computes the reverse mode backpropagated gradient of the Cholesky algorithm.
+// Outputs a `Summary` protocol buffer with scalar values.
 //
-// For an explanation see "Differentiation of the Cholesky algorithm" by
-// Iain Murray http://arxiv.org/abs/1602.07527.
+// The input `tags` and `values` must have the same shape.  The generated summary
+// has a summary value for each tag-value pair in `tags` and `values`.
 //
 // Arguments:
-//	l: Output of batch Cholesky algorithm l = cholesky(A). Shape is `[..., M, M]`.
-// Algorithm depends only on lower triangular part of the innermost matrices of
-// this tensor.
-//	grad: df/dl where f is some scalar function. Shape is `[..., M, M]`.
-// Algorithm depends only on lower triangular part of the innermost matrices of
-// this tensor.
+//	tags: Tags for the summary.
+//	values: Same shape as `tags.  Values for the summary.
 //
-// Returns Symmetrized version of df/dA . Shape is `[..., M, M]`
-func CholeskyGrad(scope *Scope, l tf.Output, grad tf.Output) (output tf.Output) {
+// Returns Scalar.  Serialized `Summary` protocol buffer.
+func ScalarSummary(scope *Scope, tags tf.Output, values tf.Output) (summary tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "CholeskyGrad",
+		Type: "ScalarSummary",
 		Input: []tf.Input{
-			l, grad,
+			tags, values,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Returns immutable tensor from memory region.
+// Constructs a tensor by tiling a given tensor.
 //
-// The current implementation memmaps the tensor from a file.
+// This operation creates a new tensor by replicating `input` `multiples` times.
+// The output tensor's i'th dimension has `input.dims(i) * multiples[i]` elements,
+// and the values of `input` are replicated `multiples[i]` times along the 'i'th
+// dimension. For example, tiling `[a b c d]` by `[2]` produces
+// `[a b c d a b c d]`.
 //
 // Arguments:
-//	dtype: Type of the returned tensor.
-//	shape: Shape of the returned tensor.
-//	memory_region_name: Name of readonly memory region used by the tensor, see
-// NewReadOnlyMemoryRegionFromFile in tensorflow::Env.
-func ImmutableConst(scope *Scope, dtype tf.DataType, shape tf.Shape, memory_region_name string) (tensor tf.Output) {
+//	input: 1-D or higher.
+//	multiples: 1-D. Length must be the same as the number of dimensions in `input`
+func Tile(scope *Scope, input tf.Output, multiples tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"dtype": dtype, "shape": shape, "memory_region_name": memory_region_name}
 	opspec := tf.OpSpec{
-		Type: "ImmutableConst",
-
-		Attrs: attrs,
+		Type: "Tile",
+		Input: []tf.Input{
+			input, multiples,
+		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Deserialize and concatenate `SparseTensors` from a serialized minibatch.
-//
-// The input `serialized_sparse` must be a string matrix of shape `[N x 3]` where
-// `N` is the minibatch size and the rows correspond to packed outputs of
-// `SerializeSparse`.  The ranks of the original `SparseTensor` objects
-// must all match.  When the final `SparseTensor` is created, it has rank one
-// higher than the ranks of the incoming `SparseTensor` objects
-// (they have been concatenated along a new row dimension).
-//
-// The output `SparseTensor` object's shape values for all dimensions but the
-// first are the max across the input `SparseTensor` objects' shape values
-// for the corresponding dimensions.  Its first shape value is `N`, the minibatch
-// size.
-//
-// The input `SparseTensor` objects' indices are assumed ordered in
-// standard lexicographic order.  If this is not the case, after this
-// step run `SparseReorder` to restore index ordering.
-//
-// For example, if the serialized input is a `[2 x 3]` matrix representing two
-// original `SparseTensor` objects:
-//
-//     index = [ 0]
-//             [10]
-//             [20]
-//     values = [1, 2, 3]
-//     shape = [50]
-//
-// and
-//
-//     index = [ 2]
-//             [10]
-//     values = [4, 5]
-//     shape = [30]
-//
-// then the final deserialized `SparseTensor` will be:
+// Returns the element-wise min of two SparseTensors.
 //
-//     index = [0  0]
-//             [0 10]
-//             [0 20]
-//             [1  2]
-//             [1 10]
-//     values = [1, 2, 3, 4, 5]
-//     shape = [2 50]
+// Assumes the two SparseTensors have the same shape, i.e., no broadcasting.
 //
 // Arguments:
-//	serialized_sparse: 2-D, The `N` serialized `SparseTensor` objects.
-// Must have 3 columns.
-//	dtype: The `dtype` of the serialized `SparseTensor` objects.
-func DeserializeManySparse(scope *Scope, serialized_sparse tf.Output, dtype tf.DataType) (sparse_indices tf.Output, sparse_values tf.Output, sparse_shape tf.Output) {
+//	a_indices: 2-D.  `N x R` matrix with the indices of non-empty values in a
+// SparseTensor, in the canonical lexicographic ordering.
+//	a_values: 1-D.  `N` non-empty values corresponding to `a_indices`.
+//	a_shape: 1-D.  Shape of the input SparseTensor.
+//	b_indices: counterpart to `a_indices` for the other operand.
+//	b_values: counterpart to `a_values` for the other operand; must be of the same dtype.
+//	b_shape: counterpart to `a_shape` for the other operand; the two shapes must be equal.
+//
+// Returns 2-D.  The indices of the output SparseTensor.1-D.  The values of the output SparseTensor.
+func SparseSparseMinimum(scope *Scope, a_indices tf.Output, a_values tf.Output, a_shape tf.Output, b_indices tf.Output, b_values tf.Output, b_shape tf.Output) (output_indices tf.Output, output_values tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"dtype": dtype}
 	opspec := tf.OpSpec{
-		Type: "DeserializeManySparse",
+		Type: "SparseSparseMinimum",
 		Input: []tf.Input{
-			serialized_sparse,
+			a_indices, a_values, a_shape, b_indices, b_values, b_shape,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
+	return op.Output(0), op.Output(1)
 }
 
-// Batch normalization.
-//
-// DEPRECATED at GraphDef version 9: Use tf.nn.batch_normalization()
-//
-// This op is deprecated. Prefer `tf.nn.batch_normalization`.
+// Computes the gradient of the sigmoid of `x` wrt its input.
 //
-// Arguments:
-//	t: A 4D input Tensor.
-//	m: A 1D mean Tensor with size matching the last dimension of t.
-// This is the first output from tf.nn.moments,
-// or a saved moving average thereof.
-//	v: A 1D variance Tensor with size matching the last dimension of t.
-// This is the second output from tf.nn.moments,
-// or a saved moving average thereof.
-//	beta: A 1D beta Tensor with size matching the last dimension of t.
-// An offset to be added to the normalized tensor.
-//	gamma: A 1D gamma Tensor with size matching the last dimension of t.
-// If "scale_after_normalization" is true, this tensor will be multiplied
-// with the normalized tensor.
-//	variance_epsilon: A small float number to avoid dividing by 0.
-//	scale_after_normalization: A bool indicating whether the resulted tensor
-// needs to be multiplied with gamma.
-func BatchNormWithGlobalNormalization(scope *Scope, t tf.Output, m tf.Output, v tf.Output, beta tf.Output, gamma tf.Output, variance_epsilon float32, scale_after_normalization bool) (result tf.Output) {
+// Specifically, `grad = dy * y * (1 - y)`, where `y = sigmoid(x)`, and
+// `dy` is the corresponding input gradient.
+func SigmoidGrad(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"variance_epsilon": variance_epsilon, "scale_after_normalization": scale_after_normalization}
 	opspec := tf.OpSpec{
-		Type: "BatchNormWithGlobalNormalization",
+		Type: "SigmoidGrad",
 		Input: []tf.Input{
-			t, m, v, beta, gamma,
+			x, y,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Generate a glob pattern matching all sharded file names.
-func ShardedFilespec(scope *Scope, basename tf.Output, num_shards tf.Output) (filename tf.Output) {
+// Generate a sharded filename. The filename is printf formatted as
+//
+//    %s-%05d-of-%05d, basename, shard, num_shards.
+func ShardedFilename(scope *Scope, basename tf.Output, shard tf.Output, num_shards tf.Output) (filename tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "ShardedFilespec",
+		Type: "ShardedFilename",
 		Input: []tf.Input{
-			basename, num_shards,
+			basename, shard, num_shards,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Saves the input tensors to disk.
+// SparseToSparseSetOperationAttr is an optional argument to SparseToSparseSetOperation.
+type SparseToSparseSetOperationAttr func(optionalAttr)
+
+// SparseToSparseSetOperationValidateIndices sets the optional validate_indices attribute to value.
+// If not specified, defaults to true
+func SparseToSparseSetOperationValidateIndices(value bool) SparseToSparseSetOperationAttr {
+	return func(m optionalAttr) {
+		m["validate_indices"] = value
+	}
+}
+
+// Applies set operation along last dimension of 2 `SparseTensor` inputs.
 //
-// The size of `tensor_names` must match the number of tensors in `data`. `data[i]`
-// is written to `filename` with name `tensor_names[i]`.
+// See SetOperationOp::SetOperationFromContext for values of `set_operation`.
 //
-// See also `SaveSlices`.
+// If `validate_indices` is `True`, `SparseToSparseSetOperation` validates the
+// order and range of `set1` and `set2` indices.
+//
+// Input `set1` is a `SparseTensor` represented by `set1_indices`, `set1_values`,
+// and `set1_shape`. For `set1` ranked `n`, 1st `n-1` dimensions must be the same
+// as `set2`. Dimension `n` contains values in a set, duplicates are allowed but
+// ignored.
+//
+// Input `set2` is a `SparseTensor` represented by `set2_indices`, `set2_values`,
+// and `set2_shape`. For `set2` ranked `n`, 1st `n-1` dimensions must be the same
+// as `set1`. Dimension `n` contains values in a set, duplicates are allowed but
+// ignored.
+//
+// If `validate_indices` is `True`, this op validates the order and range of `set1`
+// and `set2` indices.
+//
+// Output `result` is a `SparseTensor` represented by `result_indices`,
+// `result_values`, and `result_shape`. For `set1` and `set2` ranked `n`, this
+// has rank `n` and the same 1st `n-1` dimensions as `set1` and `set2`. The `nth`
+// dimension contains the result of `set_operation` applied to the corresponding
+// `[0...n-1]` dimension of `set`.
 //
 // Arguments:
-//	filename: Must have a single element. The name of the file to which we write
-// the tensor.
-//	tensor_names: Shape `[N]`. The names of the tensors to be saved.
-//	data: `N` tensors to save.
+//	set1_indices: 2D `Tensor`, indices of a `SparseTensor`. Must be in row-major
+// order.
+//	set1_values: 1D `Tensor`, values of a `SparseTensor`. Must be in row-major
+// order.
+//	set1_shape: 1D `Tensor`, shape of a `SparseTensor`. `set1_shape[0...n-1]` must
+// be the same as `set2_shape[0...n-1]`, `set1_shape[n]` is the
+// max set size across `0...n-1` dimensions.
+//	set2_indices: 2D `Tensor`, indices of a `SparseTensor`. Must be in row-major
+// order.
+//	set2_values: 1D `Tensor`, values of a `SparseTensor`. Must be in row-major
+// order.
+//	set2_shape: 1D `Tensor`, shape of a `SparseTensor`. `set2_shape[0...n-1]` must
+// be the same as `set1_shape[0...n-1]`, `set2_shape[n]` is the
+// max set size across `0...n-1` dimensions.
+//
+//
+// Returns 2D indices of a `SparseTensor`.1D values of a `SparseTensor`.1D `Tensor` shape of a `SparseTensor`. `result_shape[0...n-1]` is
+// the same as the 1st `n-1` dimensions of `set1` and `set2`, `result_shape[n]`
+// is the max result set size across all `0...n-1` dimensions.
+func SparseToSparseSetOperation(scope *Scope, set1_indices tf.Output, set1_values tf.Output, set1_shape tf.Output, set2_indices tf.Output, set2_values tf.Output, set2_shape tf.Output, set_operation string, optional ...SparseToSparseSetOperationAttr) (result_indices tf.Output, result_values tf.Output, result_shape tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"set_operation": set_operation}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "SparseToSparseSetOperation",
+		Input: []tf.Input{
+			set1_indices, set1_values, set1_shape, set2_indices, set2_values, set2_shape,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1), op.Output(2)
+}
+
+// Adds up a `SparseTensor` and a dense `Tensor`, producing a dense `Tensor`.
 //
-// Returns the created operation.
-func Save(scope *Scope, filename tf.Output, tensor_names tf.Output, data []tf.Output) (o *tf.Operation) {
+// This Op does not require `a_indices` be sorted in standard lexicographic order.
+//
+// Arguments:
+//	a_indices: 2-D.  The `indices` of the `SparseTensor`, with shape `[nnz, ndims]`.
+//	a_values: 1-D.  The `values` of the `SparseTensor`, with shape `[nnz]`.
+//	a_shape: 1-D.  The `shape` of the `SparseTensor`, with shape `[ndims]`.
+//	b: `ndims`-D Tensor.  With shape `a_shape`.
+func SparseTensorDenseAdd(scope *Scope, a_indices tf.Output, a_values tf.Output, a_shape tf.Output, b tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Save",
+		Type: "SparseTensorDenseAdd",
 		Input: []tf.Input{
-			filename, tensor_names, tf.OutputList(data),
+			a_indices, a_values, a_shape, b,
 		},
 	}
-	return scope.AddOperation(opspec)
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// MaxPoolAttr is an optional argument to MaxPool.
-type MaxPoolAttr func(optionalAttr)
+// ListDiffAttr is an optional argument to ListDiff.
+type ListDiffAttr func(optionalAttr)
 
-// MaxPoolDataFormat sets the optional data_format attribute to value.
-//
-// value: Specify the data format of the input and output data. With the
-// default format "NHWC", the data is stored in the order of:
-//     [batch, in_height, in_width, in_channels].
-// Alternatively, the format could be "NCHW", the data storage order of:
-//     [batch, in_channels, in_height, in_width].
-// If not specified, defaults to "NHWC"
-func MaxPoolDataFormat(value string) MaxPoolAttr {
+// ListDiffOutIdx sets the optional out_idx attribute to value.
+// If not specified, defaults to DT_INT32
+func ListDiffOutIdx(value tf.DataType) ListDiffAttr {
 	return func(m optionalAttr) {
-		m["data_format"] = value
+		m["out_idx"] = value
 	}
 }
 
-// Performs max pooling on the input.
+// Computes the difference between two lists of numbers or strings.
+//
+// Given a list `x` and a list `y`, this operation returns a list `out` that
+// represents all values that are in `x` but not in `y`. The returned list `out`
+// is sorted in the same order that the numbers appear in `x` (duplicates are
+// preserved). This operation also returns a list `idx` that represents the
+// position of each `out` element in `x`. In other words:
+//
+// `out[i] = x[idx[i]] for i in [0, 1, ..., len(out) - 1]`
+//
+// For example, given this input:
+//
+// ```
+// x = [1, 2, 3, 4, 5, 6]
+// y = [1, 3, 5]
+// ```
+//
+// This operation would return:
+//
+// ```
+// out ==> [2, 4, 6]
+// idx ==> [1, 3, 5]
+// ```
 //
 // Arguments:
-//	input: 4-D input to pool over.
-//	ksize: The size of the window for each dimension of the input tensor.
-//	strides: The stride of the sliding window for each dimension of the
-// input tensor.
-//	padding: The type of padding algorithm to use.
+//	x: 1-D. Values to keep.
+//	y: 1-D. Values to remove.
 //
-// Returns The max pooled output tensor.
-func MaxPool(scope *Scope, input tf.Output, ksize []int64, strides []int64, padding string, optional ...MaxPoolAttr) (output tf.Output) {
+// Returns 1-D. Values present in `x` but not in `y`.1-D. Positions of `x` values preserved in `out`.
+func ListDiff(scope *Scope, x tf.Output, y tf.Output, optional ...ListDiffAttr) (out tf.Output, idx tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"ksize": ksize, "strides": strides, "padding": padding}
+	attrs := map[string]interface{}{}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "MaxPool",
+		Type: "ListDiff",
 		Input: []tf.Input{
-			input,
+			x, y,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1)
 }
 
-// Merges summaries.
+// Generates sparse cross from a list of sparse and dense tensors.
 //
-// This op creates a
-// [`Summary`](https://www.tensorflow.org/code/tensorflow/core/framework/summary.proto)
-// protocol buffer that contains the union of all the values in the input
-// summaries.
+// The op takes two lists, one of 2D `SparseTensor` and one of 2D `Tensor`, each
+// representing features of one feature column. It outputs a 2D `SparseTensor` with
+// the batchwise crosses of these features.
 //
-// When the Op is run, it reports an `InvalidArgument` error if multiple values
-// in the summaries to merge use the same tag.
+// For example, if the inputs are
+//
+//     inputs[0]: SparseTensor with shape = [2, 2]
+//     [0, 0]: "a"
+//     [1, 0]: "b"
+//     [1, 1]: "c"
+//
+//     inputs[1]: SparseTensor with shape = [2, 1]
+//     [0, 0]: "d"
+//     [1, 0]: "e"
+//
+//     inputs[2]: Tensor [["f"], ["g"]]
+//
+// then the output will be
+//
+//     shape = [2, 2]
+//     [0, 0]: "a_X_d_X_f"
+//     [1, 0]: "b_X_e_X_g"
+//     [1, 1]: "c_X_e_X_g"
+//
+// if hashed_output=true then the output will be
+//
+//     shape = [2, 2]
+//     [0, 0]: FingerprintCat64(
+//                 Fingerprint64("f"), FingerprintCat64(
+//                     Fingerprint64("d"), Fingerprint64("a")))
+//     [1, 0]: FingerprintCat64(
+//                 Fingerprint64("g"), FingerprintCat64(
+//                     Fingerprint64("e"), Fingerprint64("b")))
+//     [1, 1]: FingerprintCat64(
+//                 Fingerprint64("g"), FingerprintCat64(
+//                     Fingerprint64("e"), Fingerprint64("c")))
 //
 // Arguments:
-//	inputs: Can be of any shape.  Each must contain serialized `Summary` protocol
-// buffers.
+//	indices: 2-D.  Indices of each input `SparseTensor`.
+//	values: 1-D.   values of each `SparseTensor`.
+//	shapes: 1-D.   Shapes of each `SparseTensor`.
+//	dense_inputs: 2-D.    Columns represented by dense `Tensor`.
+//	hashed_output: If true, returns the hash of the cross instead of the string.
+// This will allow us avoiding string manipulations.
+//	num_buckets: It is used if hashed_output is true.
+// output = hashed_value%num_buckets if num_buckets > 0 else hashed_value.
+//	hash_key: Specify the hash_key that will be used by the `FingerprintCat64`
+// function to combine the crosses fingerprints.
 //
-// Returns Scalar. Serialized `Summary` protocol buffer.
-func MergeSummary(scope *Scope, inputs []tf.Output) (summary tf.Output) {
+//
+//
+// Returns 2-D.  Indices of the concatenated `SparseTensor`.1-D.  Non-empty values of the concatenated or hashed
+// `SparseTensor`.1-D.  Shape of the concatenated `SparseTensor`.
+func SparseCross(scope *Scope, indices []tf.Output, values []tf.Output, shapes []tf.Output, dense_inputs []tf.Output, hashed_output bool, num_buckets int64, hash_key int64, out_type tf.DataType, internal_type tf.DataType) (output_indices tf.Output, output_values tf.Output, output_shape tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"hashed_output": hashed_output, "num_buckets": num_buckets, "hash_key": hash_key, "out_type": out_type, "internal_type": internal_type}
 	opspec := tf.OpSpec{
-		Type: "MergeSummary",
+		Type: "SparseCross",
 		Input: []tf.Input{
-			tf.OutputList(inputs),
+			tf.OutputList(indices), tf.OutputList(values), tf.OutputList(shapes), tf.OutputList(dense_inputs),
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// Encode audio data using the WAV file format.
+// FractionalMaxPoolAttr is an optional argument to FractionalMaxPool.
+type FractionalMaxPoolAttr func(optionalAttr)
+
+// FractionalMaxPoolPseudoRandom sets the optional pseudo_random attribute to value.
 //
-// This operation will generate a string suitable to be saved out to create a .wav
-// audio file. It will be encoded in the 16-bit PCM format. It takes in float
-// values in the range -1.0f to 1.0f, and any outside that value will be clamped to
-// that range.
+// value: When set to True, generates the pooling sequence in a
+// pseudorandom fashion, otherwise, in a random fashion. Check paper [Benjamin
+// Graham, Fractional Max-Pooling](http://arxiv.org/abs/1412.6071) for
+// difference between pseudorandom and random.
+// If not specified, defaults to false
+func FractionalMaxPoolPseudoRandom(value bool) FractionalMaxPoolAttr {
+	return func(m optionalAttr) {
+		m["pseudo_random"] = value
+	}
+}
+
+// FractionalMaxPoolOverlapping sets the optional overlapping attribute to value.
 //
-// `audio` is a 2-D float Tensor of shape `[length, channels]`.
-// `sample_rate` is a scalar Tensor holding the rate to use (e.g. 44100).
+// value: When set to True, it means when pooling, the values at the boundary
+// of adjacent pooling cells are used by both cells. For example:
 //
-// Arguments:
-//	audio: 2-D with shape `[length, channels]`.
-//	sample_rate: Scalar containing the sample frequency.
+// `index  0  1  2  3  4`
 //
-// Returns 0-D. WAV-encoded file contents.
-func EncodeWav(scope *Scope, audio tf.Output, sample_rate tf.Output) (contents tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "EncodeWav",
-		Input: []tf.Input{
-			audio, sample_rate,
-		},
+// `value  20 5  16 3  7`
+//
+// If the pooling sequence is [0, 2, 4], then 16, at index 2 will be used twice.
+// The result would be [20, 16] for fractional max pooling.
+// If not specified, defaults to false
+func FractionalMaxPoolOverlapping(value bool) FractionalMaxPoolAttr {
+	return func(m optionalAttr) {
+		m["overlapping"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// The gradient operator for the SparseAdd op.
-//
-// The SparseAdd op calculates A + B, where A, B, and the sum are all represented
-// as `SparseTensor` objects.  This op takes in the upstream gradient w.r.t.
-// non-empty values of the sum, and outputs the gradients w.r.t. the non-empty
-// values of A and B.
+// FractionalMaxPoolDeterministic sets the optional deterministic attribute to value.
 //
-// Arguments:
-//	backprop_val_grad: 1-D with shape `[nnz(sum)]`.  The gradient with respect to
-// the non-empty values of the sum.
-//	a_indices: 2-D.  The `indices` of the `SparseTensor` A, size `[nnz(A), ndims]`.
-//	b_indices: 2-D.  The `indices` of the `SparseTensor` B, size `[nnz(B), ndims]`.
-//	sum_indices: 2-D.  The `indices` of the sum `SparseTensor`, size
-// `[nnz(sum), ndims]`.
+// value: When set to True, a fixed pooling region will be used when
+// iterating over a FractionalMaxPool node in the computation graph. Mainly used
+// in unit test to make FractionalMaxPool deterministic.
+// If not specified, defaults to false
+func FractionalMaxPoolDeterministic(value bool) FractionalMaxPoolAttr {
+	return func(m optionalAttr) {
+		m["deterministic"] = value
+	}
+}
+
+// FractionalMaxPoolSeed sets the optional seed attribute to value.
 //
-// Returns 1-D with shape `[nnz(A)]`. The gradient with respect to the
-// non-empty values of A.1-D with shape `[nnz(B)]`. The gradient with respect to the
-// non-empty values of B.
-func SparseAddGrad(scope *Scope, backprop_val_grad tf.Output, a_indices tf.Output, b_indices tf.Output, sum_indices tf.Output) (a_val_grad tf.Output, b_val_grad tf.Output) {
-	if scope.Err() != nil {
-		return
+// value: If either seed or seed2 are set to be non-zero, the random number
+// generator is seeded by the given seed.  Otherwise, it is seeded by a
+// random seed.
+// If not specified, defaults to 0
+func FractionalMaxPoolSeed(value int64) FractionalMaxPoolAttr {
+	return func(m optionalAttr) {
+		m["seed"] = value
 	}
-	opspec := tf.OpSpec{
-		Type: "SparseAddGrad",
-		Input: []tf.Input{
-			backprop_val_grad, a_indices, b_indices, sum_indices,
-		},
+}
+
+// FractionalMaxPoolSeed2 sets the optional seed2 attribute to value.
+//
+// value: An second seed to avoid seed collision.
+// If not specified, defaults to 0
+func FractionalMaxPoolSeed2(value int64) FractionalMaxPoolAttr {
+	return func(m optionalAttr) {
+		m["seed2"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1)
 }
 
-// Read an element from the TensorArray into output `value`.
+// Performs fractional max pooling on the input.
 //
-// Arguments:
-//	handle: The handle to a TensorArray.
+// Fractional max pooling is slightly different than regular max pooling.  In
+// regular max pooling, you downsize an input set by taking the maximum value of
+// smaller N x N subsections of the set (often 2x2), and try to reduce the set by
+// a factor of N, where N is an integer.  Fractional max pooling, as you might
+// expect from the word "fractional", means that the overall reduction ratio N
+// does not have to be an integer.
 //
-//	flow_in: A float scalar that enforces proper chaining of operations.
-//	dtype: The type of the elem that is returned.
+// The sizes of the pooling regions are generated randomly but are fairly uniform.
+// For example, let's look at the height dimension, and the constraints on the
+// list of rows that will be pool boundaries.
 //
-// Returns The tensor that is read from the TensorArray.
-func TensorArrayReadV3(scope *Scope, handle tf.Output, index tf.Output, flow_in tf.Output, dtype tf.DataType) (value tf.Output) {
+// First we define the following:
+//
+// 1.  input_row_length : the number of rows from the input set
+// 2.  output_row_length : which will be smaller than the input
+// 3.  alpha = input_row_length / output_row_length : our reduction ratio
+// 4.  K = floor(alpha)
+// 5.  row_pooling_sequence : this is the result list of pool boundary rows
+//
+// Then, row_pooling_sequence should satisfy:
+//
+// 1.  a[0] = 0 : the first value of the sequence is 0
+// 2.  a[end] = input_row_length : the last value of the sequence is the size
+// 3.  K <= (a[i+1] - a[i]) <= K+1 : all intervals are K or K+1 size
+// 4.  length(row_pooling_sequence) = output_row_length+1
+//
+// For more details on fractional max pooling, see this paper:
+// [Benjamin Graham, Fractional Max-Pooling](http://arxiv.org/abs/1412.6071)
+//
+// Arguments:
+//	value: 4-D with shape `[batch, height, width, channels]`.
+//	pooling_ratio: Pooling ratio for each dimension of `value`, currently only
+// supports row and col dimension and should be >= 1.0. For example, a valid
+// pooling ratio looks like [1.0, 1.44, 1.73, 1.0]. The first and last elements
+// must be 1.0 because we don't allow pooling on batch and channels
+// dimensions. 1.44 and 1.73 are pooling ratio on height and width dimensions
+// respectively.
+//
+// Returns output tensor after fractional max pooling.row pooling sequence, needed to calculate gradient.column pooling sequence, needed to calculate gradient.
+func FractionalMaxPool(scope *Scope, value tf.Output, pooling_ratio []float32, optional ...FractionalMaxPoolAttr) (output tf.Output, row_pooling_sequence tf.Output, col_pooling_sequence tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"dtype": dtype}
+	attrs := map[string]interface{}{"pooling_ratio": pooling_ratio}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "TensorArrayReadV3",
+		Type: "FractionalMaxPool",
 		Input: []tf.Input{
-			handle, index, flow_in,
+			value,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// Adds up a SparseTensor and a dense Tensor, using these special rules:
+// Concatenates a list of `SparseTensor` along the specified dimension.
 //
-// (1) Broadcasts the dense side to have the same shape as the sparse side, if
-//     eligible;
-// (2) Then, only the dense values pointed to by the indices of the SparseTensor
-//     participate in the cwise addition.
+// Concatenation is with respect to the dense versions of these sparse tensors.
+// It is assumed that each input is a `SparseTensor` whose elements are ordered
+// along increasing dimension number.
 //
-// By these rules, the result is a logical SparseTensor with exactly the same
-// indices and shape, but possibly with different non-zero values.  The output of
-// this Op is the resultant non-zero values.
+// All inputs' shapes must match, except for the concat dimension.  The
+// `indices`, `values`, and `shapes` lists must have the same length.
+//
+// The output shape is identical to the inputs', except along the concat
+// dimension, where it is the sum of the inputs' sizes along that dimension.
+//
+// The output elements will be resorted to preserve the sort order along
+// increasing dimension number.
+//
+// This op runs in `O(M log M)` time, where `M` is the total number of non-empty
+// values across all inputs. This is due to the need for an internal sort in
+// order to concatenate efficiently across an arbitrary dimension.
+//
+// For example, if `concat_dim = 1` and the inputs are
+//
+//     sp_inputs[0]: shape = [2, 3]
+//     [0, 2]: "a"
+//     [1, 0]: "b"
+//     [1, 1]: "c"
+//
+//     sp_inputs[1]: shape = [2, 4]
+//     [0, 1]: "d"
+//     [0, 2]: "e"
+//
+// then the output will be
+//
+//     shape = [2, 7]
+//     [0, 2]: "a"
+//     [0, 4]: "d"
+//     [0, 5]: "e"
+//     [1, 0]: "b"
+//     [1, 1]: "c"
+//
+// Graphically this is equivalent to doing
+//
+//     [    a] concat [  d e  ] = [    a   d e  ]
+//     [b c  ]        [       ]   [b c          ]
 //
 // Arguments:
-//	sp_indices: 2-D.  `N x R` matrix with the indices of non-empty values in a
-// SparseTensor, possibly not in canonical ordering.
-//	sp_values: 1-D.  `N` non-empty values corresponding to `sp_indices`.
-//	sp_shape: 1-D.  Shape of the input SparseTensor.
-//	dense: `R`-D.  The dense Tensor operand.
+//	indices: 2-D.  Indices of each input `SparseTensor`.
+//	values: 1-D.  Non-empty values of each `SparseTensor`.
+//	shapes: 1-D.  Shapes of each `SparseTensor`.
+//	concat_dim: Dimension to concatenate along. Must be in range [-rank, rank),
+// where rank is the number of dimensions in each input `SparseTensor`.
 //
-// Returns 1-D.  The `N` values that are operated on.
-func SparseDenseCwiseAdd(scope *Scope, sp_indices tf.Output, sp_values tf.Output, sp_shape tf.Output, dense tf.Output) (output tf.Output) {
+// Returns 2-D.  Indices of the concatenated `SparseTensor`.1-D.  Non-empty values of the concatenated `SparseTensor`.1-D.  Shape of the concatenated `SparseTensor`.
+func SparseConcat(scope *Scope, indices []tf.Output, values []tf.Output, shapes []tf.Output, concat_dim int64) (output_indices tf.Output, output_values tf.Output, output_shape tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"concat_dim": concat_dim}
 	opspec := tf.OpSpec{
-		Type: "SparseDenseCwiseAdd",
+		Type: "SparseConcat",
 		Input: []tf.Input{
-			sp_indices, sp_values, sp_shape, dense,
+			tf.OutputList(indices), tf.OutputList(values), tf.OutputList(shapes),
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// Conv3DAttr is an optional argument to Conv3D.
-type Conv3DAttr func(optionalAttr)
+// IdentityReaderV2Attr is an optional argument to IdentityReaderV2.
+type IdentityReaderV2Attr func(optionalAttr)
 
-// Conv3DDataFormat sets the optional data_format attribute to value.
+// IdentityReaderV2Container sets the optional container attribute to value.
 //
-// value: The data format of the input and output data. With the
-// default format "NDHWC", the data is stored in the order of:
-//     [batch, in_depth, in_height, in_width, in_channels].
-// Alternatively, the format could be "NCDHW", the data storage order is:
-//     [batch, in_channels, in_depth, in_height, in_width].
-// If not specified, defaults to "NDHWC"
-func Conv3DDataFormat(value string) Conv3DAttr {
+// value: If non-empty, this reader is placed in the given container.
+// Otherwise, a default container is used.
+// If not specified, defaults to ""
+func IdentityReaderV2Container(value string) IdentityReaderV2Attr {
 	return func(m optionalAttr) {
-		m["data_format"] = value
+		m["container"] = value
 	}
 }
 
-// Computes a 3-D convolution given 5-D `input` and `filter` tensors.
+// IdentityReaderV2SharedName sets the optional shared_name attribute to value.
 //
-// In signal processing, cross-correlation is a measure of similarity of
-// two waveforms as a function of a time-lag applied to one of them. This
-// is also known as a sliding dot product or sliding inner-product.
+// value: If non-empty, this reader is named in the given bucket
+// with this shared_name. Otherwise, the node name is used instead.
+// If not specified, defaults to ""
+func IdentityReaderV2SharedName(value string) IdentityReaderV2Attr {
+	return func(m optionalAttr) {
+		m["shared_name"] = value
+	}
+}
+
+// A Reader that outputs the queued work as both the key and value.
 //
-// Our Conv3D implements a form of cross-correlation.
+// To use, enqueue strings in a Queue.  ReaderRead will take the front
+// work string and output (work, work).
 //
-// Arguments:
-//	input: Shape `[batch, in_depth, in_height, in_width, in_channels]`.
-//	filter: Shape `[filter_depth, filter_height, filter_width, in_channels,
-// out_channels]`. `in_channels` must match between `input` and `filter`.
-//	strides: 1-D tensor of length 5. The stride of the sliding window for each
-// dimension of `input`. Must have `strides[0] = strides[4] = 1`.
-//	padding: The type of padding algorithm to use.
-func Conv3D(scope *Scope, input tf.Output, filter tf.Output, strides []int64, padding string, optional ...Conv3DAttr) (output tf.Output) {
+// Returns The handle to reference the Reader.
+func IdentityReaderV2(scope *Scope, optional ...IdentityReaderV2Attr) (reader_handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"strides": strides, "padding": padding}
+	attrs := map[string]interface{}{}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "Conv3D",
-		Input: []tf.Input{
-			input, filter,
-		},
+		Type: "IdentityReaderV2",
+
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// L2 Loss.
-//
-// Computes half the L2 norm of a tensor without the `sqrt`:
+// Performs a padding as a preprocess during a convolution.
 //
-//     output = sum(t ** 2) / 2
+// Similar to FusedResizeAndPadConv2d, this op allows for an optimized
+// implementation where the spatial padding transformation stage is fused with the
+// im2col lookup, but in this case without the bilinear filtering required for
+// resizing. Fusing the padding prevents the need to write out the intermediate
+// results as whole tensors, reducing memory pressure, and we can get some latency
+// gains by merging the transformation calculations.
+// The data_format attribute for Conv2D isn't supported by this op, and 'NHWC'
+// order is used instead.
+// Internally this op uses a single per-graph scratch buffer, which means that it
+// will block if multiple versions are being run in parallel. This is because this
+// operator is primarily an optimization to minimize memory usage.
 //
 // Arguments:
-//	t: Typically 2-D, but may have any dimensions.
-//
-// Returns 0-D.
-func L2Loss(scope *Scope, t tf.Output) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "L2Loss",
-		Input: []tf.Input{
-			t,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Computes rectified linear: `max(features, 0)`.
-func Relu(scope *Scope, features tf.Output) (activations tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "Relu",
-		Input: []tf.Input{
-			features,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Returns the truth value of (x >= y) element-wise.
+//	input: 4-D with shape `[batch, in_height, in_width, in_channels]`.
+//	paddings: A two-column matrix specifying the padding sizes. The number of
+// rows must be the same as the rank of `input`.
+//	filter: 4-D with shape
+// `[filter_height, filter_width, in_channels, out_channels]`.
 //
-// *NOTE*: `GreaterEqual` supports broadcasting. More about broadcasting
-// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-func GreaterEqual(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+//	strides: 1-D of length 4.  The stride of the sliding window for each dimension
+// of `input`. Must be in the same order as the dimension specified with format.
+//	padding: The type of padding algorithm to use.
+func FusedPadConv2D(scope *Scope, input tf.Output, paddings tf.Output, filter tf.Output, mode string, strides []int64, padding string) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"mode": mode, "strides": strides, "padding": padding}
 	opspec := tf.OpSpec{
-		Type: "GreaterEqual",
+		Type: "FusedPadConv2D",
 		Input: []tf.Input{
-			x, y,
+			input, paddings, filter,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Converts each string in the input Tensor to its hash mod by a number of buckets.
-//
-// The hash function is deterministic on the content of the string within the
-// process. The hash function is a keyed hash function, where attribute `key`
-// defines the key of the hash function. `key` is an array of 2 elements.
+// Computes the reverse mode backpropagated gradient of the Cholesky algorithm.
 //
-// A strong hash is important when inputs may be malicious, e.g. URLs with
-// additional components. Adversaries could try to make their inputs hash to the
-// same bucket for a denial-of-service attack or to skew the results. A strong
-// hash prevents this by making it difficult, if not infeasible, to compute inputs
-// that hash to the same bucket. This comes at a cost of roughly 4x higher compute
-// time than `tf.string_to_hash_bucket_fast`.
+// For an explanation see "Differentiation of the Cholesky algorithm" by
+// Iain Murray http://arxiv.org/abs/1602.07527.
 //
 // Arguments:
-//	input: The strings to assign a hash bucket.
-//	num_buckets: The number of buckets.
-//	key: The key for the keyed hash function passed as a list of two uint64
-// elements.
+//	l: Output of batch Cholesky algorithm l = cholesky(A). Shape is `[..., M, M]`.
+// Algorithm depends only on lower triangular part of the innermost matrices of
+// this tensor.
+//	grad: df/dl where f is some scalar function. Shape is `[..., M, M]`.
+// Algorithm depends only on lower triangular part of the innermost matrices of
+// this tensor.
 //
-// Returns A Tensor of the same shape as the input `string_tensor`.
-func StringToHashBucketStrong(scope *Scope, input tf.Output, num_buckets int64, key []int64) (output tf.Output) {
+// Returns Symmetrized version of df/dA . Shape is `[..., M, M]`
+func CholeskyGrad(scope *Scope, l tf.Output, grad tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"num_buckets": num_buckets, "key": key}
 	opspec := tf.OpSpec{
-		Type: "StringToHashBucketStrong",
+		Type: "CholeskyGrad",
 		Input: []tf.Input{
-			input,
+			l, grad,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Computes numerical negative value element-wise.
+// Returns immutable tensor from memory region.
 //
-// I.e., \\(y = -x\\).
-func Neg(scope *Scope, x tf.Output) (y tf.Output) {
+// The current implementation memmaps the tensor from a file.
+//
+// Arguments:
+//	dtype: Type of the returned tensor.
+//	shape: Shape of the returned tensor.
+//	memory_region_name: Name of readonly memory region used by the tensor, see
+// NewReadOnlyMemoryRegionFromFile in tensorflow::Env.
+func ImmutableConst(scope *Scope, dtype tf.DataType, shape tf.Shape, memory_region_name string) (tensor tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	opspec := tf.OpSpec{
-		Type: "Neg",
-		Input: []tf.Input{
-			x,
-		},
-	}
+	attrs := map[string]interface{}{"dtype": dtype, "shape": shape, "memory_region_name": memory_region_name}
+	opspec := tf.OpSpec{
+		Type: "ImmutableConst",
+
+		Attrs: attrs,
+	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// DestroyResourceOpAttr is an optional argument to DestroyResourceOp.
-type DestroyResourceOpAttr func(optionalAttr)
-
-// DestroyResourceOpIgnoreLookupError sets the optional ignore_lookup_error attribute to value.
+// Deserialize and concatenate `SparseTensors` from a serialized minibatch.
 //
-// value: whether to ignore the error when the resource
-// doesn't exist.
-// If not specified, defaults to true
-func DestroyResourceOpIgnoreLookupError(value bool) DestroyResourceOpAttr {
-	return func(m optionalAttr) {
-		m["ignore_lookup_error"] = value
-	}
-}
-
-// Deletes the resource specified by the handle.
+// The input `serialized_sparse` must be a string matrix of shape `[N x 3]` where
+// `N` is the minibatch size and the rows correspond to packed outputs of
+// `SerializeSparse`.  The ranks of the original `SparseTensor` objects
+// must all match.  When the final `SparseTensor` is created, it has rank one
+// higher than the ranks of the incoming `SparseTensor` objects
+// (they have been concatenated along a new row dimension).
 //
-// All subsequent operations using the resource will result in a NotFound
-// error status.
+// The output `SparseTensor` object's shape values for all dimensions but the
+// first are the max across the input `SparseTensor` objects' shape values
+// for the corresponding dimensions.  Its first shape value is `N`, the minibatch
+// size.
 //
-// Arguments:
-//	resource: handle to the resource to delete.
+// The input `SparseTensor` objects' indices are assumed ordered in
+// standard lexicographic order.  If this is not the case, after this
+// step run `SparseReorder` to restore index ordering.
 //
-// Returns the created operation.
-func DestroyResourceOp(scope *Scope, resource tf.Output, optional ...DestroyResourceOpAttr) (o *tf.Operation) {
+// For example, if the serialized input is a `[2 x 3]` matrix representing two
+// original `SparseTensor` objects:
+//
+//     index = [ 0]
+//             [10]
+//             [20]
+//     values = [1, 2, 3]
+//     shape = [50]
+//
+// and
+//
+//     index = [ 2]
+//             [10]
+//     values = [4, 5]
+//     shape = [30]
+//
+// then the final deserialized `SparseTensor` will be:
+//
+//     index = [0  0]
+//             [0 10]
+//             [0 20]
+//             [1  2]
+//             [1 10]
+//     values = [1, 2, 3, 4, 5]
+//     shape = [2 50]
+//
+// Arguments:
+//	serialized_sparse: 2-D, The `N` serialized `SparseTensor` objects.
+// Must have 3 columns.
+//	dtype: The `dtype` of the serialized `SparseTensor` objects.
+func DeserializeManySparse(scope *Scope, serialized_sparse tf.Output, dtype tf.DataType) (sparse_indices tf.Output, sparse_values tf.Output, sparse_shape tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
+	attrs := map[string]interface{}{"dtype": dtype}
 	opspec := tf.OpSpec{
-		Type: "DestroyResourceOp",
+		Type: "DeserializeManySparse",
 		Input: []tf.Input{
-			resource,
+			serialized_sparse,
 		},
 		Attrs: attrs,
 	}
-	return scope.AddOperation(opspec)
-}
-
-// ResourceApplyMomentumAttr is an optional argument to ResourceApplyMomentum.
-type ResourceApplyMomentumAttr func(optionalAttr)
-
-// ResourceApplyMomentumUseLocking sets the optional use_locking attribute to value.
-//
-// value: If `True`, updating of the var and accum tensors will be protected
-// by a lock; otherwise the behavior is undefined, but may exhibit less
-// contention.
-// If not specified, defaults to false
-func ResourceApplyMomentumUseLocking(value bool) ResourceApplyMomentumAttr {
-	return func(m optionalAttr) {
-		m["use_locking"] = value
-	}
-}
-
-// ResourceApplyMomentumUseNesterov sets the optional use_nesterov attribute to value.
-//
-// value: If `True`, the tensor passed to compute grad will be
-// var - lr * momentum * accum, so in the end, the var you get is actually
-// var - lr * momentum * accum.
-// If not specified, defaults to false
-func ResourceApplyMomentumUseNesterov(value bool) ResourceApplyMomentumAttr {
-	return func(m optionalAttr) {
-		m["use_nesterov"] = value
-	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// Update '*var' according to the momentum scheme. Set use_nesterov = True if you
+// Batch normalization.
 //
-// want to use Nesterov momentum.
+// DEPRECATED at GraphDef version 9: Use tf.nn.batch_normalization()
 //
-// accum = accum * momentum + grad
-// var -= lr * accum
+// This op is deprecated. Prefer `tf.nn.batch_normalization`.
 //
 // Arguments:
-//	var_: Should be from a Variable().
-//	accum: Should be from a Variable().
-//	lr: Scaling factor. Must be a scalar.
-//	grad: The gradient.
-//	momentum: Momentum. Must be a scalar.
-//
-// Returns the created operation.
-func ResourceApplyMomentum(scope *Scope, var_ tf.Output, accum tf.Output, lr tf.Output, grad tf.Output, momentum tf.Output, optional ...ResourceApplyMomentumAttr) (o *tf.Operation) {
+//	t: A 4D input Tensor.
+//	m: A 1D mean Tensor with size matching the last dimension of t.
+// This is the first output from tf.nn.moments,
+// or a saved moving average thereof.
+//	v: A 1D variance Tensor with size matching the last dimension of t.
+// This is the second output from tf.nn.moments,
+// or a saved moving average thereof.
+//	beta: A 1D beta Tensor with size matching the last dimension of t.
+// An offset to be added to the normalized tensor.
+//	gamma: A 1D gamma Tensor with size matching the last dimension of t.
+// If "scale_after_normalization" is true, this tensor will be multiplied
+// with the normalized tensor.
+//	variance_epsilon: A small float number to avoid dividing by 0.
+//	scale_after_normalization: A bool indicating whether the resulted tensor
+// needs to be multiplied with gamma.
+func BatchNormWithGlobalNormalization(scope *Scope, t tf.Output, m tf.Output, v tf.Output, beta tf.Output, gamma tf.Output, variance_epsilon float32, scale_after_normalization bool) (result tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
+	attrs := map[string]interface{}{"variance_epsilon": variance_epsilon, "scale_after_normalization": scale_after_normalization}
 	opspec := tf.OpSpec{
-		Type: "ResourceApplyMomentum",
+		Type: "BatchNormWithGlobalNormalization",
 		Input: []tf.Input{
-			var_, accum, lr, grad, momentum,
+			t, m, v, beta, gamma,
 		},
 		Attrs: attrs,
 	}
-	return scope.AddOperation(opspec)
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// Returns element-wise integer closest to x.
-//
-// If the result is midway between two representable values,
-// the even representable is chosen.
-// For example:
-//
-// ```
-// rint(-1.5) ==> -2.0
-// rint(0.5000001) ==> 1.0
-// rint([-1.7, -1.5, -0.2, 0.2, 1.5, 1.7, 2.0]) ==> [-2., -2., -0., 0., 2., 2., 2.]
-// ```
-func Rint(scope *Scope, x tf.Output) (y tf.Output) {
+// Generate a glob pattern matching all sharded file names.
+func ShardedFilespec(scope *Scope, basename tf.Output, num_shards tf.Output) (filename tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Rint",
+		Type: "ShardedFilespec",
 		Input: []tf.Input{
-			x,
+			basename, num_shards,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Fast Fourier transform.
+// Saves the input tensors to disk.
 //
-// Computes the 1-dimensional discrete Fourier transform over the inner-most
-// dimension of `input`.
+// The size of `tensor_names` must match the number of tensors in `data`. `data[i]`
+// is written to `filename` with name `tensor_names[i]`.
 //
-// Arguments:
-//	input: A complex64 tensor.
+// See also `SaveSlices`.
 //
-// Returns A complex64 tensor of the same shape as `input`. The inner-most
-//   dimension of `input` is replaced with its 1D Fourier transform.
+// Arguments:
+//	filename: Must have a single element. The name of the file to which we write
+// the tensor.
+//	tensor_names: Shape `[N]`. The names of the tensors to be saved.
+//	data: `N` tensors to save.
 //
-// @compatibility(numpy)
-// Equivalent to np.fft.fft
-// @end_compatibility
-func FFT(scope *Scope, input tf.Output) (output tf.Output) {
+// Returns the created operation.
+func Save(scope *Scope, filename tf.Output, tensor_names tf.Output, data []tf.Output) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "FFT",
+		Type: "Save",
 		Input: []tf.Input{
-			input,
+			filename, tensor_names, tf.OutputList(data),
 		},
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return scope.AddOperation(opspec)
 }
 
-// MaxPoolWithArgmaxAttr is an optional argument to MaxPoolWithArgmax.
-type MaxPoolWithArgmaxAttr func(optionalAttr)
+// MaxPoolAttr is an optional argument to MaxPool.
+type MaxPoolAttr func(optionalAttr)
 
-// MaxPoolWithArgmaxTargmax sets the optional Targmax attribute to value.
-// If not specified, defaults to DT_INT64
-func MaxPoolWithArgmaxTargmax(value tf.DataType) MaxPoolWithArgmaxAttr {
+// MaxPoolDataFormat sets the optional data_format attribute to value.
+//
+// value: Specify the data format of the input and output data. With the
+// default format "NHWC", the data is stored in the order of:
+//     [batch, in_height, in_width, in_channels].
+// Alternatively, the format could be "NCHW", the data storage order of:
+//     [batch, in_channels, in_height, in_width].
+// If not specified, defaults to "NHWC"
+func MaxPoolDataFormat(value string) MaxPoolAttr {
 	return func(m optionalAttr) {
-		m["Targmax"] = value
+		m["data_format"] = value
 	}
 }
 
-// Performs max pooling on the input and outputs both max values and indices.
-//
-// The indices in `argmax` are flattened, so that a maximum value at position
-// `[b, y, x, c]` becomes flattened index
-// `((b * height + y) * width + x) * channels + c`.
+// Performs max pooling on the input.
 //
 // Arguments:
-//	input: 4-D with shape `[batch, height, width, channels]`.  Input to pool over.
+//	input: 4-D input to pool over.
 //	ksize: The size of the window for each dimension of the input tensor.
 //	strides: The stride of the sliding window for each dimension of the
 // input tensor.
 //	padding: The type of padding algorithm to use.
 //
-// Returns The max pooled output tensor.4-D.  The flattened indices of the max values chosen for each output.
-func MaxPoolWithArgmax(scope *Scope, input tf.Output, ksize []int64, strides []int64, padding string, optional ...MaxPoolWithArgmaxAttr) (output tf.Output, argmax tf.Output) {
+// Returns The max pooled output tensor.
+func MaxPool(scope *Scope, input tf.Output, ksize []int64, strides []int64, padding string, optional ...MaxPoolAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -9716,188 +10342,124 @@ func MaxPoolWithArgmax(scope *Scope, input tf.Output, ksize []int64, strides []i
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "MaxPoolWithArgmax",
+		Type: "MaxPool",
 		Input: []tf.Input{
 			input,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1)
+	return op.Output(0)
 }
 
-// Computes the Eigen Decomposition of a batch of square self-adjoint matrices.
-//
-// DEPRECATED at GraphDef version 11: Use SelfAdjointEigV2 instead.
+// Merges summaries.
 //
-// The input is a tensor of shape `[..., M, M]` whose inner-most 2 dimensions
-// form square matrices, with the same constraints as the single matrix
-// SelfAdjointEig.
+// This op creates a
+// [`Summary`](https://www.tensorflow.org/code/tensorflow/core/framework/summary.proto)
+// protocol buffer that contains the union of all the values in the input
+// summaries.
 //
-// The result is a [..., M+1, M] matrix with [..., 0,:] containing the
-// eigenvalues, and subsequent [...,1:, :] containing the eigenvectors.
+// When the Op is run, it reports an `InvalidArgument` error if multiple values
+// in the summaries to merge use the same tag.
 //
 // Arguments:
-//	input: Shape is `[..., M, M]`.
+//	inputs: Can be of any shape.  Each must contain serialized `Summary` protocol
+// buffers.
 //
-// Returns Shape is `[..., M+1, M]`.
-func SelfAdjointEig(scope *Scope, input tf.Output) (output tf.Output) {
+// Returns Scalar. Serialized `Summary` protocol buffer.
+func MergeSummary(scope *Scope, inputs []tf.Output) (summary tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "SelfAdjointEig",
+		Type: "MergeSummary",
 		Input: []tf.Input{
-			input,
+			tf.OutputList(inputs),
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// FractionalAvgPoolGradAttr is an optional argument to FractionalAvgPoolGrad.
-type FractionalAvgPoolGradAttr func(optionalAttr)
-
-// FractionalAvgPoolGradOverlapping sets the optional overlapping attribute to value.
-//
-// value: When set to True, it means when pooling, the values at the boundary
-// of adjacent pooling cells are used by both cells. For example:
-//
-// `index  0  1  2  3  4`
-//
-// `value  20 5  16 3  7`
+// Encode audio data using the WAV file format.
 //
-// If the pooling sequence is [0, 2, 4], then 16, at index 2 will be used twice.
-// The result would be [41/3, 26/3] for fractional avg pooling.
-// If not specified, defaults to false
-func FractionalAvgPoolGradOverlapping(value bool) FractionalAvgPoolGradAttr {
-	return func(m optionalAttr) {
-		m["overlapping"] = value
-	}
-}
-
-// Computes gradient of the FractionalAvgPool function.
+// This operation will generate a string suitable to be saved out to create a .wav
+// audio file. It will be encoded in the 16-bit PCM format. It takes in float
+// values in the range -1.0f to 1.0f, and any outside that value will be clamped to
+// that range.
 //
-// Unlike FractionalMaxPoolGrad, we don't need to find arg_max for
-// FractionalAvgPoolGrad, we just need to evenly back-propagate each element of
-// out_backprop to those indices that form the same pooling cell. Therefore, we
-// just need to know the shape of original input tensor, instead of the whole
-// tensor.
+// `audio` is a 2-D float Tensor of shape `[length, channels]`.
+// `sample_rate` is a scalar Tensor holding the rate to use (e.g. 44100).
 //
 // Arguments:
-//	orig_input_tensor_shape: Original input tensor shape for `fractional_avg_pool`
-//	out_backprop: 4-D with shape `[batch, height, width, channels]`.  Gradients
-// w.r.t. the output of `fractional_avg_pool`.
-//	row_pooling_sequence: row pooling sequence, form pooling region with
-// col_pooling_sequence.
-//	col_pooling_sequence: column pooling sequence, form pooling region with
-// row_pooling sequence.
+//	audio: 2-D with shape `[length, channels]`.
+//	sample_rate: Scalar containing the sample frequency.
 //
-// Returns 4-D.  Gradients w.r.t. the input of `fractional_avg_pool`.
-func FractionalAvgPoolGrad(scope *Scope, orig_input_tensor_shape tf.Output, out_backprop tf.Output, row_pooling_sequence tf.Output, col_pooling_sequence tf.Output, optional ...FractionalAvgPoolGradAttr) (output tf.Output) {
+// Returns 0-D. WAV-encoded file contents.
+func EncodeWav(scope *Scope, audio tf.Output, sample_rate tf.Output) (contents tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "FractionalAvgPoolGrad",
+		Type: "EncodeWav",
 		Input: []tf.Input{
-			orig_input_tensor_shape, out_backprop, row_pooling_sequence, col_pooling_sequence,
+			audio, sample_rate,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Reorders a SparseTensor into the canonical, row-major ordering.
-//
-// Note that by convention, all sparse ops preserve the canonical ordering along
-// increasing dimension number. The only time ordering can be violated is during
-// manual manipulation of the indices and values vectors to add entries.
-//
-// Reordering does not affect the shape of the SparseTensor.
+// The gradient operator for the SparseAdd op.
 //
-// If the tensor has rank `R` and `N` non-empty values, `input_indices` has
-// shape `[N, R]`, input_values has length `N`, and input_shape has length `R`.
+// The SparseAdd op calculates A + B, where A, B, and the sum are all represented
+// as `SparseTensor` objects.  This op takes in the upstream gradient w.r.t.
+// non-empty values of the sum, and outputs the gradients w.r.t. the non-empty
+// values of A and B.
 //
 // Arguments:
-//	input_indices: 2-D.  `N x R` matrix with the indices of non-empty values in a
-// SparseTensor, possibly not in canonical ordering.
-//	input_values: 1-D.  `N` non-empty values corresponding to `input_indices`.
-//	input_shape: 1-D.  Shape of the input SparseTensor.
+//	backprop_val_grad: 1-D with shape `[nnz(sum)]`.  The gradient with respect to
+// the non-empty values of the sum.
+//	a_indices: 2-D.  The `indices` of the `SparseTensor` A, size `[nnz(A), ndims]`.
+//	b_indices: 2-D.  The `indices` of the `SparseTensor` B, size `[nnz(B), ndims]`.
+//	sum_indices: 2-D.  The `indices` of the sum `SparseTensor`, size
+// `[nnz(sum), ndims]`.
 //
-// Returns 2-D.  `N x R` matrix with the same indices as input_indices, but
-// in canonical row-major ordering.1-D.  `N` non-empty values corresponding to `output_indices`.
-func SparseReorder(scope *Scope, input_indices tf.Output, input_values tf.Output, input_shape tf.Output) (output_indices tf.Output, output_values tf.Output) {
+// Returns 1-D with shape `[nnz(A)]`. The gradient with respect to the
+// non-empty values of A.1-D with shape `[nnz(B)]`. The gradient with respect to the
+// non-empty values of B.
+func SparseAddGrad(scope *Scope, backprop_val_grad tf.Output, a_indices tf.Output, b_indices tf.Output, sum_indices tf.Output) (a_val_grad tf.Output, b_val_grad tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "SparseReorder",
+		Type: "SparseAddGrad",
 		Input: []tf.Input{
-			input_indices, input_values, input_shape,
+			backprop_val_grad, a_indices, b_indices, sum_indices,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0), op.Output(1)
 }
 
-// PackAttr is an optional argument to Pack.
-type PackAttr func(optionalAttr)
-
-// PackAxis sets the optional axis attribute to value.
-//
-// value: Dimension along which to pack.  Negative values wrap around, so the
-// valid range is `[-(R+1), R+1)`.
-// If not specified, defaults to 0
-func PackAxis(value int64) PackAttr {
-	return func(m optionalAttr) {
-		m["axis"] = value
-	}
-}
-
-// Packs a list of `N` rank-`R` tensors into one rank-`(R+1)` tensor.
-//
-// Packs the `N` tensors in `values` into a tensor with rank one higher than each
-// tensor in `values`, by packing them along the `axis` dimension.
-// Given a list of tensors of shape `(A, B, C)`;
-//
-// if `axis == 0` then the `output` tensor will have the shape `(N, A, B, C)`.
-// if `axis == 1` then the `output` tensor will have the shape `(A, N, B, C)`.
-// Etc.
-//
-// For example:
-//
-// ```prettyprint
-// # 'x' is [1, 4]
-// # 'y' is [2, 5]
-// # 'z' is [3, 6]
-// pack([x, y, z]) => [[1, 4], [2, 5], [3, 6]]  # Pack along first dim.
-// pack([x, y, z], axis=1) => [[1, 2, 3], [4, 5, 6]]
-// ```
-//
-// This is the opposite of `unpack`.
+// Read an element from the TensorArray into output `value`.
 //
 // Arguments:
-//	values: Must be of same shape and type.
+//	handle: The handle to a TensorArray.
 //
-// Returns The packed tensor.
-func Pack(scope *Scope, values []tf.Output, optional ...PackAttr) (output tf.Output) {
+//	flow_in: A float scalar that enforces proper chaining of operations.
+//	dtype: The type of the elem that is returned.
+//
+// Returns The tensor that is read from the TensorArray.
+func TensorArrayReadV3(scope *Scope, handle tf.Output, index tf.Output, flow_in tf.Output, dtype tf.DataType) (value tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
+	attrs := map[string]interface{}{"dtype": dtype}
 	opspec := tf.OpSpec{
-		Type: "Pack",
+		Type: "TensorArrayReadV3",
 		Input: []tf.Input{
-			tf.OutputList(values),
+			handle, index, flow_in,
 		},
 		Attrs: attrs,
 	}
@@ -9905,138 +10467,139 @@ func Pack(scope *Scope, values []tf.Output, optional ...PackAttr) (output tf.Out
 	return op.Output(0)
 }
 
-// Deprecated. Use TensorArraySplitV3
-func TensorArraySplitV2(scope *Scope, handle tf.Output, value tf.Output, lengths tf.Output, flow_in tf.Output) (flow_out tf.Output) {
+// Adds up a SparseTensor and a dense Tensor, using these special rules:
+//
+// (1) Broadcasts the dense side to have the same shape as the sparse side, if
+//     eligible;
+// (2) Then, only the dense values pointed to by the indices of the SparseTensor
+//     participate in the cwise addition.
+//
+// By these rules, the result is a logical SparseTensor with exactly the same
+// indices and shape, but possibly with different non-zero values.  The output of
+// this Op is the resultant non-zero values.
+//
+// Arguments:
+//	sp_indices: 2-D.  `N x R` matrix with the indices of non-empty values in a
+// SparseTensor, possibly not in canonical ordering.
+//	sp_values: 1-D.  `N` non-empty values corresponding to `sp_indices`.
+//	sp_shape: 1-D.  Shape of the input SparseTensor.
+//	dense: `R`-D.  The dense Tensor operand.
+//
+// Returns 1-D.  The `N` values that are operated on.
+func SparseDenseCwiseAdd(scope *Scope, sp_indices tf.Output, sp_values tf.Output, sp_shape tf.Output, dense tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "TensorArraySplitV2",
+		Type: "SparseDenseCwiseAdd",
 		Input: []tf.Input{
-			handle, value, lengths, flow_in,
+			sp_indices, sp_values, sp_shape, dense,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// QuantizedReluAttr is an optional argument to QuantizedRelu.
-type QuantizedReluAttr func(optionalAttr)
+// Conv3DAttr is an optional argument to Conv3D.
+type Conv3DAttr func(optionalAttr)
 
-// QuantizedReluOutType sets the optional out_type attribute to value.
-// If not specified, defaults to DT_QUINT8
-func QuantizedReluOutType(value tf.DataType) QuantizedReluAttr {
+// Conv3DDataFormat sets the optional data_format attribute to value.
+//
+// value: The data format of the input and output data. With the
+// default format "NDHWC", the data is stored in the order of:
+//     [batch, in_depth, in_height, in_width, in_channels].
+// Alternatively, the format could be "NCDHW", the data storage order is:
+//     [batch, in_channels, in_depth, in_height, in_width].
+// If not specified, defaults to "NDHWC"
+func Conv3DDataFormat(value string) Conv3DAttr {
 	return func(m optionalAttr) {
-		m["out_type"] = value
+		m["data_format"] = value
 	}
 }
 
-// Computes Quantized Rectified Linear: `max(features, 0)`
+// Computes a 3-D convolution given 5-D `input` and `filter` tensors.
 //
-// Arguments:
+// In signal processing, cross-correlation is a measure of similarity of
+// two waveforms as a function of a time-lag applied to one of them. This
+// is also known as a sliding dot product or sliding inner-product.
 //
-//	min_features: The float value that the lowest quantized value represents.
-//	max_features: The float value that the highest quantized value represents.
+// Our Conv3D implements a form of cross-correlation.
 //
-// Returns Has the same output shape as "features".The float value that the lowest quantized value represents.The float value that the highest quantized value represents.
-func QuantizedRelu(scope *Scope, features tf.Output, min_features tf.Output, max_features tf.Output, optional ...QuantizedReluAttr) (activations tf.Output, min_activations tf.Output, max_activations tf.Output) {
+// Arguments:
+//	input: Shape `[batch, in_depth, in_height, in_width, in_channels]`.
+//	filter: Shape `[filter_depth, filter_height, filter_width, in_channels,
+// out_channels]`. `in_channels` must match between `input` and `filter`.
+//	strides: 1-D tensor of length 5. The stride of the sliding window for each
+// dimension of `input`. Must have `strides[0] = strides[4] = 1`.
+//	padding: The type of padding algorithm to use.
+func Conv3D(scope *Scope, input tf.Output, filter tf.Output, strides []int64, padding string, optional ...Conv3DAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"strides": strides, "padding": padding}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "QuantizedRelu",
+		Type: "Conv3D",
 		Input: []tf.Input{
-			features, min_features, max_features,
+			input, filter,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
+	return op.Output(0)
 }
 
-// ResourceSparseApplyProximalGradientDescentAttr is an optional argument to ResourceSparseApplyProximalGradientDescent.
-type ResourceSparseApplyProximalGradientDescentAttr func(optionalAttr)
-
-// ResourceSparseApplyProximalGradientDescentUseLocking sets the optional use_locking attribute to value.
+// L2 Loss.
 //
-// value: If True, the subtraction will be protected by a lock;
-// otherwise the behavior is undefined, but may exhibit less contention.
-// If not specified, defaults to false
-func ResourceSparseApplyProximalGradientDescentUseLocking(value bool) ResourceSparseApplyProximalGradientDescentAttr {
-	return func(m optionalAttr) {
-		m["use_locking"] = value
-	}
-}
-
-// Sparse update '*var' as FOBOS algorithm with fixed learning rate.
+// Computes half the L2 norm of a tensor without the `sqrt`:
 //
-// That is for rows we have grad for, we update var as follows:
-// prox_v = var - alpha * grad
-// var = sign(prox_v)/(1+alpha*l2) * max{|prox_v|-alpha*l1,0}
+//     output = sum(t ** 2) / 2
 //
 // Arguments:
-//	var_: Should be from a Variable().
-//	alpha: Scaling factor. Must be a scalar.
-//	l1: L1 regularization. Must be a scalar.
-//	l2: L2 regularization. Must be a scalar.
-//	grad: The gradient.
-//	indices: A vector of indices into the first dimension of var and accum.
+//	t: Typically 2-D, but may have any dimensions.
 //
-// Returns the created operation.
-func ResourceSparseApplyProximalGradientDescent(scope *Scope, var_ tf.Output, alpha tf.Output, l1 tf.Output, l2 tf.Output, grad tf.Output, indices tf.Output, optional ...ResourceSparseApplyProximalGradientDescentAttr) (o *tf.Operation) {
+// Returns 0-D.
+func L2Loss(scope *Scope, t tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "ResourceSparseApplyProximalGradientDescent",
+		Type: "L2Loss",
 		Input: []tf.Input{
-			var_, alpha, l1, l2, grad, indices,
+			t,
 		},
-		Attrs: attrs,
 	}
-	return scope.AddOperation(opspec)
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// Computes rectified linear gradients for a Relu operation.
-//
-// Arguments:
-//	gradients: The backpropagated gradients to the corresponding Relu operation.
-//	features: The features passed as input to the corresponding Relu operation, OR
-// the outputs of that operation (both work equivalently).
-//
-// Returns `gradients * (features > 0)`.
-func ReluGrad(scope *Scope, gradients tf.Output, features tf.Output) (backprops tf.Output) {
+// Computes rectified linear: `max(features, 0)`.
+func Relu(scope *Scope, features tf.Output) (activations tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "ReluGrad",
+		Type: "Relu",
 		Input: []tf.Input{
-			gradients, features,
+			features,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Computes the gradient for the inverse of `x` wrt its input.
+// Returns the truth value of (x >= y) element-wise.
 //
-// Specifically, `grad = -dy * y*y`, where `y = 1/x`, and `dy`
-// is the corresponding input gradient.
-func ReciprocalGrad(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+// *NOTE*: `GreaterEqual` supports broadcasting. More about broadcasting
+// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+func GreaterEqual(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "ReciprocalGrad",
+		Type: "GreaterEqual",
 		Input: []tf.Input{
 			x, y,
 		},
@@ -10045,168 +10608,143 @@ func ReciprocalGrad(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
 	return op.Output(0)
 }
 
-// Computes the Cholesky decomposition of one or more square matrices.
+// Converts each string in the input Tensor to its hash mod by a number of buckets.
 //
-// The input is a tensor of shape `[..., M, M]` whose inner-most 2 dimensions
-// form square matrices, with the same constraints as the single matrix Cholesky
-// decomposition above. The output is a tensor of the same shape as the input
-// containing the Cholesky decompositions for all input submatrices `[..., :, :]`.
+// The hash function is deterministic on the content of the string within the
+// process. The hash function is a keyed hash function, where attribute `key`
+// defines the key of the hash function. `key` is an array of 2 elements.
+//
+// A strong hash is important when inputs may be malicious, e.g. URLs with
+// additional components. Adversaries could try to make their inputs hash to the
+// same bucket for a denial-of-service attack or to skew the results. A strong
+// hash prevents this by making it difficult, if not infeasible, to compute inputs
+// that hash to the same bucket. This comes at a cost of roughly 4x higher compute
+// time than `tf.string_to_hash_bucket_fast`.
 //
 // Arguments:
-//	input: Shape is `[..., M, M]`.
+//	input: The strings to assign a hash bucket.
+//	num_buckets: The number of buckets.
+//	key: The key for the keyed hash function passed as a list of two uint64
+// elements.
 //
-// Returns Shape is `[..., M, M]`.
-func Cholesky(scope *Scope, input tf.Output) (output tf.Output) {
+// Returns A Tensor of the same shape as the input `string_tensor`.
+func StringToHashBucketStrong(scope *Scope, input tf.Output, num_buckets int64, key []int64) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"num_buckets": num_buckets, "key": key}
 	opspec := tf.OpSpec{
-		Type: "Cholesky",
+		Type: "StringToHashBucketStrong",
 		Input: []tf.Input{
 			input,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Writes contents to the file at input filename. Creates file if not existing.
-//
-// Arguments:
-//	filename: scalar. The name of the file to which we write the contents.
-//	contents: scalar. The content to be written to the output file.
+// Computes numerical negative value element-wise.
 //
-// Returns the created operation.
-func WriteFile(scope *Scope, filename tf.Output, contents tf.Output) (o *tf.Operation) {
+// I.e., \\(y = -x\\).
+func Neg(scope *Scope, x tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "WriteFile",
+		Type: "Neg",
 		Input: []tf.Input{
-			filename, contents,
+			x,
 		},
 	}
-	return scope.AddOperation(opspec)
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// Reverses specific dimensions of a tensor.
-//
-// NOTE `tf.reverse` has now changed behavior in preparation for 1.0.
-// `tf.reverse_v2` is currently an alias that will be deprecated before TF 1.0.
-//
-// Given a `tensor`, and a `int32` tensor `axis` representing the set of
-// dimensions of `tensor` to reverse. This operation reverses each dimension
-// `i` for which there exists `j` s.t. `axis[j] == i`.
-//
-// `tensor` can have up to 8 dimensions. The number of dimensions specified
-// in `axis` may be 0 or more entries. If an index is specified more than
-// once, a InvalidArgument error is raised.
-//
-// For example:
-//
-// ```prettyprint
-// # tensor 't' is [[[[ 0,  1,  2,  3],
-// #                  [ 4,  5,  6,  7],
-// #                  [ 8,  9, 10, 11]],
-// #                 [[12, 13, 14, 15],
-// #                  [16, 17, 18, 19],
-// #                  [20, 21, 22, 23]]]]
-// # tensor 't' shape is [1, 2, 3, 4]
-//
-// # 'dims' is [3] or 'dims' is -1
-// reverse(t, dims) ==> [[[[ 3,  2,  1,  0],
-//                         [ 7,  6,  5,  4],
-//                         [ 11, 10, 9, 8]],
-//                        [[15, 14, 13, 12],
-//                         [19, 18, 17, 16],
-//                         [23, 22, 21, 20]]]]
+// DestroyResourceOpAttr is an optional argument to DestroyResourceOp.
+type DestroyResourceOpAttr func(optionalAttr)
+
+// DestroyResourceOpIgnoreLookupError sets the optional ignore_lookup_error attribute to value.
 //
-// # 'dims' is '[1]' (or 'dims' is '[-3]')
-// reverse(t, dims) ==> [[[[12, 13, 14, 15],
-//                         [16, 17, 18, 19],
-//                         [20, 21, 22, 23]
-//                        [[ 0,  1,  2,  3],
-//                         [ 4,  5,  6,  7],
-//                         [ 8,  9, 10, 11]]]]
+// value: whether to ignore the error when the resource
+// doesn't exist.
+// If not specified, defaults to true
+func DestroyResourceOpIgnoreLookupError(value bool) DestroyResourceOpAttr {
+	return func(m optionalAttr) {
+		m["ignore_lookup_error"] = value
+	}
+}
+
+// Deletes the resource specified by the handle.
 //
-// # 'dims' is '[2]' (or 'dims' is '[-2]')
-// reverse(t, dims) ==> [[[[8, 9, 10, 11],
-//                         [4, 5, 6, 7],
-//                         [0, 1, 2, 3]]
-//                        [[20, 21, 22, 23],
-//                         [16, 17, 18, 19],
-//                         [12, 13, 14, 15]]]]
-// ```
+// All subsequent operations using the resource will result in a NotFound
+// error status.
 //
 // Arguments:
-//	tensor: Up to 8-D.
-//	axis: 1-D. The indices of the dimensions to reverse.
+//	resource: handle to the resource to delete.
 //
-// Returns The same shape as `tensor`.
-func ReverseV2(scope *Scope, tensor tf.Output, axis tf.Output) (output tf.Output) {
+// Returns the created operation.
+func DestroyResourceOp(scope *Scope, resource tf.Output, optional ...DestroyResourceOpAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "ReverseV2",
+		Type: "DestroyResourceOp",
 		Input: []tf.Input{
-			tensor, axis,
+			resource,
 		},
+		Attrs: attrs,
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return scope.AddOperation(opspec)
 }
 
-// ResourceApplyCenteredRMSPropAttr is an optional argument to ResourceApplyCenteredRMSProp.
-type ResourceApplyCenteredRMSPropAttr func(optionalAttr)
+// ResourceApplyMomentumAttr is an optional argument to ResourceApplyMomentum.
+type ResourceApplyMomentumAttr func(optionalAttr)
 
-// ResourceApplyCenteredRMSPropUseLocking sets the optional use_locking attribute to value.
+// ResourceApplyMomentumUseLocking sets the optional use_locking attribute to value.
 //
-// value: If `True`, updating of the var, mg, ms, and mom tensors is
-// protected by a lock; otherwise the behavior is undefined, but may exhibit less
+// value: If `True`, updating of the var and accum tensors will be protected
+// by a lock; otherwise the behavior is undefined, but may exhibit less
 // contention.
 // If not specified, defaults to false
-func ResourceApplyCenteredRMSPropUseLocking(value bool) ResourceApplyCenteredRMSPropAttr {
+func ResourceApplyMomentumUseLocking(value bool) ResourceApplyMomentumAttr {
 	return func(m optionalAttr) {
 		m["use_locking"] = value
 	}
 }
 
-// Update '*var' according to the centered RMSProp algorithm.
-//
-// The centered RMSProp algorithm uses an estimate of the centered second moment
-// (i.e., the variance) for normalization, as opposed to regular RMSProp, which
-// uses the (uncentered) second moment. This often helps with training, but is
-// slightly more expensive in terms of computation and memory.
-//
-// Note that in dense implementation of this algorithm, mg, ms, and mom will
-// update even if the grad is zero, but in this sparse implementation, mg, ms,
-// and mom will not update in iterations during which the grad is zero.
+// ResourceApplyMomentumUseNesterov sets the optional use_nesterov attribute to value.
 //
-// mean_square = decay * mean_square + (1-decay) * gradient ** 2
-// mean_grad = decay * mean_grad + (1-decay) * gradient
+// value: If `True`, the tensor passed to compute grad will be
+// var - lr * momentum * accum, so in the end, the var you get is actually
+// var - lr * momentum * accum.
+// If not specified, defaults to false
+func ResourceApplyMomentumUseNesterov(value bool) ResourceApplyMomentumAttr {
+	return func(m optionalAttr) {
+		m["use_nesterov"] = value
+	}
+}
+
+// Update '*var' according to the momentum scheme. Set use_nesterov = True if you
 //
-// Delta = learning_rate * gradient / sqrt(mean_square + epsilon - mean_grad ** 2)
+// want to use Nesterov momentum.
 //
-// mg <- rho * mg_{t-1} + (1-rho) * grad
-// ms <- rho * ms_{t-1} + (1-rho) * grad * grad
-// mom <- momentum * mom_{t-1} + lr * grad / sqrt(ms - mg * mg + epsilon)
-// var <- var - mom
+// accum = accum * momentum + grad
+// var -= lr * accum
 //
 // Arguments:
 //	var_: Should be from a Variable().
-//	mg: Should be from a Variable().
-//	ms: Should be from a Variable().
-//	mom: Should be from a Variable().
+//	accum: Should be from a Variable().
 //	lr: Scaling factor. Must be a scalar.
-//	rho: Decay rate. Must be a scalar.
-//
-//	epsilon: Ridge term. Must be a scalar.
 //	grad: The gradient.
+//	momentum: Momentum. Must be a scalar.
 //
 // Returns the created operation.
-func ResourceApplyCenteredRMSProp(scope *Scope, var_ tf.Output, mg tf.Output, ms tf.Output, mom tf.Output, lr tf.Output, rho tf.Output, momentum tf.Output, epsilon tf.Output, grad tf.Output, optional ...ResourceApplyCenteredRMSPropAttr) (o *tf.Operation) {
+func ResourceApplyMomentum(scope *Scope, var_ tf.Output, accum tf.Output, lr tf.Output, grad tf.Output, momentum tf.Output, optional ...ResourceApplyMomentumAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
@@ -10215,35 +10753,60 @@ func ResourceApplyCenteredRMSProp(scope *Scope, var_ tf.Output, mg tf.Output, ms
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "ResourceApplyCenteredRMSProp",
+		Type: "ResourceApplyMomentum",
 		Input: []tf.Input{
-			var_, mg, ms, mom, lr, rho, momentum, epsilon, grad,
+			var_, accum, lr, grad, momentum,
 		},
 		Attrs: attrs,
 	}
 	return scope.AddOperation(opspec)
 }
 
-// Inverse 3D fast Fourier transform.
+// Returns element-wise integer closest to x.
 //
-// Computes the inverse 3-dimensional discrete Fourier transform over the
-// inner-most 3 dimensions of `input`.
+// If the result is midway between two representable values,
+// the even representable is chosen.
+// For example:
+//
+// ```
+// rint(-1.5) ==> -2.0
+// rint(0.5000001) ==> 1.0
+// rint([-1.7, -1.5, -0.2, 0.2, 1.5, 1.7, 2.0]) ==> [-2., -2., -0., 0., 2., 2., 2.]
+// ```
+func Rint(scope *Scope, x tf.Output) (y tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "Rint",
+		Input: []tf.Input{
+			x,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Fast Fourier transform.
+//
+// Computes the 1-dimensional discrete Fourier transform over the inner-most
+// dimension of `input`.
 //
 // Arguments:
 //	input: A complex64 tensor.
 //
-// Returns A complex64 tensor of the same shape as `input`. The inner-most 3
-//   dimensions of `input` are replaced with their inverse 3D Fourier transform.
+// Returns A complex64 tensor of the same shape as `input`. The inner-most
+//   dimension of `input` is replaced with its 1D Fourier transform.
 //
 // @compatibility(numpy)
-// Equivalent to np.fft.ifftn with 3 dimensions.
+// Equivalent to np.fft.fft
 // @end_compatibility
-func IFFT3D(scope *Scope, input tf.Output) (output tf.Output) {
+func FFT(scope *Scope, input tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "IFFT3D",
+		Type: "FFT",
 		Input: []tf.Input{
 			input,
 		},
@@ -10252,110 +10815,130 @@ func IFFT3D(scope *Scope, input tf.Output) (output tf.Output) {
 	return op.Output(0)
 }
 
-// Given a quantized tensor described by (input, input_min, input_max), outputs a
+// MaxPoolWithArgmaxAttr is an optional argument to MaxPoolWithArgmax.
+type MaxPoolWithArgmaxAttr func(optionalAttr)
+
+// MaxPoolWithArgmaxTargmax sets the optional Targmax attribute to value.
+// If not specified, defaults to DT_INT64
+func MaxPoolWithArgmaxTargmax(value tf.DataType) MaxPoolWithArgmaxAttr {
+	return func(m optionalAttr) {
+		m["Targmax"] = value
+	}
+}
+
+// Performs max pooling on the input and outputs both max values and indices.
 //
-// range that covers the actual values present in that tensor.  This op is
-// typically used to produce the requested_output_min and requested_output_max for
-// Requantize.
+// The indices in `argmax` are flattened, so that a maximum value at position
+// `[b, y, x, c]` becomes flattened index
+// `((b * height + y) * width + x) * channels + c`.
 //
 // Arguments:
+//	input: 4-D with shape `[batch, height, width, channels]`.  Input to pool over.
+//	ksize: The size of the window for each dimension of the input tensor.
+//	strides: The stride of the sliding window for each dimension of the
+// input tensor.
+//	padding: The type of padding algorithm to use.
 //
-//	input_min: The float value that the minimum quantized input value represents.
-//	input_max: The float value that the maximum quantized input value represents.
-//
-// Returns The computed min output.the computed max output.
-func RequantizationRange(scope *Scope, input tf.Output, input_min tf.Output, input_max tf.Output) (output_min tf.Output, output_max tf.Output) {
+// Returns The max pooled output tensor.4-D.  The flattened indices of the max values chosen for each output.
+func MaxPoolWithArgmax(scope *Scope, input tf.Output, ksize []int64, strides []int64, padding string, optional ...MaxPoolWithArgmaxAttr) (output tf.Output, argmax tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"ksize": ksize, "strides": strides, "padding": padding}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "RequantizationRange",
+		Type: "MaxPoolWithArgmax",
 		Input: []tf.Input{
-			input, input_min, input_max,
+			input,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0), op.Output(1)
 }
 
-// Computes second-order gradients of the maxpooling function.
+// Computes the Eigen Decomposition of a batch of square self-adjoint matrices.
+//
+// DEPRECATED at GraphDef version 11: Use SelfAdjointEigV2 instead.
+//
+// The input is a tensor of shape `[..., M, M]` whose inner-most 2 dimensions
+// form square matrices, with the same constraints as the single matrix
+// SelfAdjointEig.
+//
+// The result is a [..., M+1, M] matrix with [..., 0,:] containing the
+// eigenvalues, and subsequent [...,1:, :] containing the eigenvectors.
 //
 // Arguments:
-//	input: The original input.
-//	grad: 4-D with shape `[batch, height, width, channels]`.  Gradients w.r.t. the
-// input of `max_pool`.
-//	argmax: The indices of the maximum values chosen for each output of `max_pool`.
-//	ksize: The size of the window for each dimension of the input tensor.
-//	strides: The stride of the sliding window for each dimension of the
-// input tensor.
-//	padding: The type of padding algorithm to use.
+//	input: Shape is `[..., M, M]`.
 //
-// Returns Gradients of gradients w.r.t. the input of `max_pool`.
-func MaxPoolGradGradWithArgmax(scope *Scope, input tf.Output, grad tf.Output, argmax tf.Output, ksize []int64, strides []int64, padding string) (output tf.Output) {
+// Returns Shape is `[..., M+1, M]`.
+func SelfAdjointEig(scope *Scope, input tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"ksize": ksize, "strides": strides, "padding": padding}
 	opspec := tf.OpSpec{
-		Type: "MaxPoolGradGradWithArgmax",
+		Type: "SelfAdjointEig",
 		Input: []tf.Input{
-			input, grad, argmax,
+			input,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// DepthwiseConv2dNativeBackpropInputAttr is an optional argument to DepthwiseConv2dNativeBackpropInput.
-type DepthwiseConv2dNativeBackpropInputAttr func(optionalAttr)
+// FractionalAvgPoolGradAttr is an optional argument to FractionalAvgPoolGrad.
+type FractionalAvgPoolGradAttr func(optionalAttr)
 
-// DepthwiseConv2dNativeBackpropInputDataFormat sets the optional data_format attribute to value.
+// FractionalAvgPoolGradOverlapping sets the optional overlapping attribute to value.
 //
-// value: Specify the data format of the input and output data. With the
-// default format "NHWC", the data is stored in the order of:
-//     [batch, height, width, channels].
-// Alternatively, the format could be "NCHW", the data storage order of:
-//     [batch, channels, height, width].
-// If not specified, defaults to "NHWC"
-func DepthwiseConv2dNativeBackpropInputDataFormat(value string) DepthwiseConv2dNativeBackpropInputAttr {
+// value: When set to True, it means when pooling, the values at the boundary
+// of adjacent pooling cells are used by both cells. For example:
+//
+// `index  0  1  2  3  4`
+//
+// `value  20 5  16 3  7`
+//
+// If the pooling sequence is [0, 2, 4], then 16, at index 2 will be used twice.
+// The result would be [41/3, 26/3] for fractional avg pooling.
+// If not specified, defaults to false
+func FractionalAvgPoolGradOverlapping(value bool) FractionalAvgPoolGradAttr {
 	return func(m optionalAttr) {
-		m["data_format"] = value
+		m["overlapping"] = value
 	}
 }
 
-// Computes the gradients of depthwise convolution with respect to the input.
+// Computes gradient of the FractionalAvgPool function.
+//
+// Unlike FractionalMaxPoolGrad, we don't need to find arg_max for
+// FractionalAvgPoolGrad, we just need to evenly back-propagate each element of
+// out_backprop to those indices that form the same pooling cell. Therefore, we
+// just need to know the shape of original input tensor, instead of the whole
+// tensor.
 //
 // Arguments:
-//	input_sizes: An integer vector representing the shape of `input`, based
-// on `data_format`.  For example, if `data_format` is 'NHWC' then
-//  `input` is a 4-D `[batch, height, width, channels]` tensor.
-//	filter: 4-D with shape
-// `[filter_height, filter_width, in_channels, depthwise_multiplier]`.
-//	out_backprop: 4-D with shape  based on `data_format`.
-// For example, if `data_format` is 'NHWC' then
-// out_backprop shape is `[batch, out_height, out_width, out_channels]`.
-// Gradients w.r.t. the output of the convolution.
-//	strides: The stride of the sliding window for each dimension of the input
-// of the convolution.
-//	padding: The type of padding algorithm to use.
+//	orig_input_tensor_shape: Original input tensor shape for `fractional_avg_pool`
+//	out_backprop: 4-D with shape `[batch, height, width, channels]`.  Gradients
+// w.r.t. the output of `fractional_avg_pool`.
+//	row_pooling_sequence: row pooling sequence, form pooling region with
+// col_pooling_sequence.
+//	col_pooling_sequence: column pooling sequence, form pooling region with
+// row_pooling sequence.
 //
-// Returns 4-D with shape according to `data_format`.  For example, if
-// `data_format` is 'NHWC', output shape is `[batch, in_height,
-// in_width, in_channels]`.  Gradient w.r.t. the input of the
-// convolution.
-func DepthwiseConv2dNativeBackpropInput(scope *Scope, input_sizes tf.Output, filter tf.Output, out_backprop tf.Output, strides []int64, padding string, optional ...DepthwiseConv2dNativeBackpropInputAttr) (output tf.Output) {
+// Returns 4-D.  Gradients w.r.t. the input of `fractional_avg_pool`.
+func FractionalAvgPoolGrad(scope *Scope, orig_input_tensor_shape tf.Output, out_backprop tf.Output, row_pooling_sequence tf.Output, col_pooling_sequence tf.Output, optional ...FractionalAvgPoolGradAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"strides": strides, "padding": padding}
+	attrs := map[string]interface{}{}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "DepthwiseConv2dNativeBackpropInput",
+		Type: "FractionalAvgPoolGrad",
 		Input: []tf.Input{
-			input_sizes, filter, out_backprop,
+			orig_input_tensor_shape, out_backprop, row_pooling_sequence, col_pooling_sequence,
 		},
 		Attrs: attrs,
 	}
@@ -10363,103 +10946,80 @@ func DepthwiseConv2dNativeBackpropInput(scope *Scope, input_sizes tf.Output, fil
 	return op.Output(0)
 }
 
-// Computes gradients for the exponential linear (Elu) operation.
-//
-// Arguments:
-//	gradients: The backpropagated gradients to the corresponding Elu operation.
-//	outputs: The outputs of the corresponding Elu operation.
+// Reorders a SparseTensor into the canonical, row-major ordering.
 //
-// Returns The gradients: `gradients * (outputs + 1)` if outputs < 0,
-// `gradients` otherwise.
-func EluGrad(scope *Scope, gradients tf.Output, outputs tf.Output) (backprops tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "EluGrad",
-		Input: []tf.Input{
-			gradients, outputs,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Converts each string in the input Tensor to its hash mod by a number of buckets.
+// Note that by convention, all sparse ops preserve the canonical ordering along
+// increasing dimension number. The only time ordering can be violated is during
+// manual manipulation of the indices and values vectors to add entries.
 //
-// The hash function is deterministic on the content of the string within the
-// process.
+// Reordering does not affect the shape of the SparseTensor.
 //
-// Note that the hash function may change from time to time.
-// This functionality will be deprecated and it's recommended to use
-// `tf.string_to_hash_bucket_fast()` or `tf.string_to_hash_bucket_strong()`.
+// If the tensor has rank `R` and `N` non-empty values, `input_indices` has
+// shape `[N, R]`, input_values has length `N`, and input_shape has length `R`.
 //
 // Arguments:
+//	input_indices: 2-D.  `N x R` matrix with the indices of non-empty values in a
+// SparseTensor, possibly not in canonical ordering.
+//	input_values: 1-D.  `N` non-empty values corresponding to `input_indices`.
+//	input_shape: 1-D.  Shape of the input SparseTensor.
 //
-//	num_buckets: The number of buckets.
-//
-// Returns A Tensor of the same shape as the input `string_tensor`.
-func StringToHashBucket(scope *Scope, string_tensor tf.Output, num_buckets int64) (output tf.Output) {
+// Returns 2-D.  `N x R` matrix with the same indices as input_indices, but
+// in canonical row-major ordering.1-D.  `N` non-empty values corresponding to `output_indices`.
+func SparseReorder(scope *Scope, input_indices tf.Output, input_values tf.Output, input_shape tf.Output) (output_indices tf.Output, output_values tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"num_buckets": num_buckets}
 	opspec := tf.OpSpec{
-		Type: "StringToHashBucket",
+		Type: "SparseReorder",
 		Input: []tf.Input{
-			string_tensor,
+			input_indices, input_values, input_shape,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1)
 }
 
-// CumsumAttr is an optional argument to Cumsum.
-type CumsumAttr func(optionalAttr)
-
-// CumsumExclusive sets the optional exclusive attribute to value.
-// If not specified, defaults to false
-func CumsumExclusive(value bool) CumsumAttr {
-	return func(m optionalAttr) {
-		m["exclusive"] = value
-	}
-}
+// PackAttr is an optional argument to Pack.
+type PackAttr func(optionalAttr)
 
-// CumsumReverse sets the optional reverse attribute to value.
-// If not specified, defaults to false
-func CumsumReverse(value bool) CumsumAttr {
+// PackAxis sets the optional axis attribute to value.
+//
+// value: Dimension along which to pack.  Negative values wrap around, so the
+// valid range is `[-(R+1), R+1)`.
+// If not specified, defaults to 0
+func PackAxis(value int64) PackAttr {
 	return func(m optionalAttr) {
-		m["reverse"] = value
+		m["axis"] = value
 	}
 }
 
-// Compute the cumulative sum of the tensor `x` along `axis`.
+// Packs a list of `N` rank-`R` tensors into one rank-`(R+1)` tensor.
 //
-// By default, this op performs an inclusive cumsum, which means that the first
-// element of the input is identical to the first element of the output:
-// ```prettyprint
-// tf.cumsum([a, b, c]) ==> [a, a + b, a + b + c]
-// ```
+// Packs the `N` tensors in `values` into a tensor with rank one higher than each
+// tensor in `values`, by packing them along the `axis` dimension.
+// Given a list of tensors of shape `(A, B, C)`;
 //
-// By setting the `exclusive` kwarg to `True`, an exclusive cumsum is
-// performed instead:
-// ```prettyprint
-// tf.cumsum([a, b, c], exclusive=True) ==> [0, a, a + b]
-// ```
+// if `axis == 0` then the `output` tensor will have the shape `(N, A, B, C)`.
+// if `axis == 1` then the `output` tensor will have the shape `(A, N, B, C)`.
+// Etc.
 //
-// By setting the `reverse` kwarg to `True`, the cumsum is performed in the
-// opposite direction:
-// ```prettyprint
-// tf.cumsum([a, b, c], reverse=True) ==> [a + b + c, b + c, c]
-// ```
-// This is more efficient than using separate `tf.reverse` ops.
+// For example:
 //
-// The `reverse` and `exclusive` kwargs can also be combined:
-// ```prettyprint
-// tf.cumsum([a, b, c], exclusive=True, reverse=True) ==> [b + c, c, 0]
 // ```
-func Cumsum(scope *Scope, x tf.Output, axis tf.Output, optional ...CumsumAttr) (out tf.Output) {
+// # 'x' is [1, 4]
+// # 'y' is [2, 5]
+// # 'z' is [3, 6]
+// pack([x, y, z]) => [[1, 4], [2, 5], [3, 6]]  # Pack along first dim.
+// pack([x, y, z], axis=1) => [[1, 2, 3], [4, 5, 6]]
+// ```
+//
+// This is the opposite of `unpack`.
+//
+// Arguments:
+//	values: Must be of same shape and type.
+//
+// Returns The packed tensor.
+func Pack(scope *Scope, values []tf.Output, optional ...PackAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -10468,9 +11028,9 @@ func Cumsum(scope *Scope, x tf.Output, axis tf.Output, optional ...CumsumAttr) (
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "Cumsum",
+		Type: "Pack",
 		Input: []tf.Input{
-			x, axis,
+			tf.OutputList(values),
 		},
 		Attrs: attrs,
 	}
@@ -10478,42 +11038,41 @@ func Cumsum(scope *Scope, x tf.Output, axis tf.Output, optional ...CumsumAttr) (
 	return op.Output(0)
 }
 
-// ResourceApplyAdamAttr is an optional argument to ResourceApplyAdam.
-type ResourceApplyAdamAttr func(optionalAttr)
+// Deprecated. Use TensorArraySplitV3
+func TensorArraySplitV2(scope *Scope, handle tf.Output, value tf.Output, lengths tf.Output, flow_in tf.Output) (flow_out tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "TensorArraySplitV2",
+		Input: []tf.Input{
+			handle, value, lengths, flow_in,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
 
-// ResourceApplyAdamUseLocking sets the optional use_locking attribute to value.
-//
-// value: If `True`, updating of the var, m, and v tensors will be protected
-// by a lock; otherwise the behavior is undefined, but may exhibit less
-// contention.
-// If not specified, defaults to false
-func ResourceApplyAdamUseLocking(value bool) ResourceApplyAdamAttr {
+// QuantizedReluAttr is an optional argument to QuantizedRelu.
+type QuantizedReluAttr func(optionalAttr)
+
+// QuantizedReluOutType sets the optional out_type attribute to value.
+// If not specified, defaults to DT_QUINT8
+func QuantizedReluOutType(value tf.DataType) QuantizedReluAttr {
 	return func(m optionalAttr) {
-		m["use_locking"] = value
+		m["out_type"] = value
 	}
 }
 
-// Update '*var' according to the Adam algorithm.
-//
-// lr_t <- learning_rate * sqrt(1 - beta2^t) / (1 - beta1^t)
-// m_t <- beta1 * m_{t-1} + (1 - beta1) * g_t
-// v_t <- beta2 * v_{t-1} + (1 - beta2) * g_t * g_t
-// variable <- variable - lr_t * m_t / (sqrt(v_t) + epsilon)
+// Computes Quantized Rectified Linear: `max(features, 0)`
 //
 // Arguments:
-//	var_: Should be from a Variable().
-//	m: Should be from a Variable().
-//	v: Should be from a Variable().
-//	beta1_power: Must be a scalar.
-//	beta2_power: Must be a scalar.
-//	lr: Scaling factor. Must be a scalar.
-//	beta1: Momentum factor. Must be a scalar.
-//	beta2: Momentum factor. Must be a scalar.
-//	epsilon: Ridge term. Must be a scalar.
-//	grad: The gradient.
 //
-// Returns the created operation.
-func ResourceApplyAdam(scope *Scope, var_ tf.Output, m tf.Output, v tf.Output, beta1_power tf.Output, beta2_power tf.Output, lr tf.Output, beta1 tf.Output, beta2 tf.Output, epsilon tf.Output, grad tf.Output, optional ...ResourceApplyAdamAttr) (o *tf.Operation) {
+//	min_features: The float value that the lowest quantized value represents.
+//	max_features: The float value that the highest quantized value represents.
+//
+// Returns Has the same output shape as "features".The float value that the lowest quantized value represents.The float value that the highest quantized value represents.
+func QuantizedRelu(scope *Scope, features tf.Output, min_features tf.Output, max_features tf.Output, optional ...QuantizedReluAttr) (activations tf.Output, min_activations tf.Output, max_activations tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -10522,192 +11081,173 @@ func ResourceApplyAdam(scope *Scope, var_ tf.Output, m tf.Output, v tf.Output, b
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "ResourceApplyAdam",
+		Type: "QuantizedRelu",
 		Input: []tf.Input{
-			var_, m, v, beta1_power, beta2_power, lr, beta1, beta2, epsilon, grad,
+			features, min_features, max_features,
 		},
 		Attrs: attrs,
 	}
-	return scope.AddOperation(opspec)
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// 3D fast Fourier transform.
-//
-// Computes the 3-dimensional discrete Fourier transform over the inner-most 3
-// dimensions of `input`.
+// Computes rectified linear gradients for a Relu operation.
 //
 // Arguments:
-//	input: A complex64 tensor.
-//
-// Returns A complex64 tensor of the same shape as `input`. The inner-most 3
-//   dimensions of `input` are replaced with their 3D Fourier transform.
+//	gradients: The backpropagated gradients to the corresponding Relu operation.
+//	features: The features passed as input to the corresponding Relu operation, OR
+// the outputs of that operation (both work equivalently).
 //
-// @compatibility(numpy)
-// Equivalent to np.fft.fftn with 3 dimensions.
-// @end_compatibility
-func FFT3D(scope *Scope, input tf.Output) (output tf.Output) {
+// Returns `gradients * (features > 0)`.
+func ReluGrad(scope *Scope, gradients tf.Output, features tf.Output) (backprops tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "FFT3D",
+		Type: "ReluGrad",
 		Input: []tf.Input{
-			input,
+			gradients, features,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// SizeAttr is an optional argument to Size.
-type SizeAttr func(optionalAttr)
-
-// SizeOutType sets the optional out_type attribute to value.
-// If not specified, defaults to DT_INT32
-func SizeOutType(value tf.DataType) SizeAttr {
-	return func(m optionalAttr) {
-		m["out_type"] = value
-	}
-}
-
-// Returns the size of a tensor.
-//
-// This operation returns an integer representing the number of elements in
-// `input`.
-//
-// For example:
+// Computes the gradient for the inverse of `x` wrt its input.
 //
-// ```prettyprint
-// # 't' is [[[1, 1,, 1], [2, 2, 2]], [[3, 3, 3], [4, 4, 4]]]]
-// size(t) ==> 12
-// ```
-func Size(scope *Scope, input tf.Output, optional ...SizeAttr) (output tf.Output) {
+// Specifically, `grad = -dy * y*y`, where `y = 1/x`, and `dy`
+// is the corresponding input gradient.
+func ReciprocalGrad(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "Size",
+		Type: "ReciprocalGrad",
 		Input: []tf.Input{
-			input,
+			x, y,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// ResourceApplyRMSPropAttr is an optional argument to ResourceApplyRMSProp.
-type ResourceApplyRMSPropAttr func(optionalAttr)
-
-// ResourceApplyRMSPropUseLocking sets the optional use_locking attribute to value.
+// Reverses specific dimensions of a tensor.
 //
-// value: If `True`, updating of the var, ms, and mom tensors is protected
-// by a lock; otherwise the behavior is undefined, but may exhibit less
-// contention.
-// If not specified, defaults to false
-func ResourceApplyRMSPropUseLocking(value bool) ResourceApplyRMSPropAttr {
-	return func(m optionalAttr) {
-		m["use_locking"] = value
-	}
-}
-
-// Update '*var' according to the RMSProp algorithm.
+// NOTE `tf.reverse` has now changed behavior in preparation for 1.0.
+// `tf.reverse_v2` is currently an alias that will be deprecated before TF 1.0.
 //
-// Note that in dense implementation of this algorithm, ms and mom will
-// update even if the grad is zero, but in this sparse implementation, ms
-// and mom will not update in iterations during which the grad is zero.
+// Given a `tensor`, and a `int32` tensor `axis` representing the set of
+// dimensions of `tensor` to reverse. This operation reverses each dimension
+// `i` for which there exists `j` s.t. `axis[j] == i`.
 //
-// mean_square = decay * mean_square + (1-decay) * gradient ** 2
-// Delta = learning_rate * gradient / sqrt(mean_square + epsilon)
+// `tensor` can have up to 8 dimensions. The number of dimensions specified
+// in `axis` may be 0 or more entries. If an index is specified more than
+// once, a InvalidArgument error is raised.
 //
-// ms <- rho * ms_{t-1} + (1-rho) * grad * grad
-// mom <- momentum * mom_{t-1} + lr * grad / sqrt(ms + epsilon)
-// var <- var - mom
+// For example:
 //
-// Arguments:
-//	var_: Should be from a Variable().
-//	ms: Should be from a Variable().
-//	mom: Should be from a Variable().
-//	lr: Scaling factor. Must be a scalar.
-//	rho: Decay rate. Must be a scalar.
+// ```
+// # tensor 't' is [[[[ 0,  1,  2,  3],
+// #                  [ 4,  5,  6,  7],
+// #                  [ 8,  9, 10, 11]],
+// #                 [[12, 13, 14, 15],
+// #                  [16, 17, 18, 19],
+// #                  [20, 21, 22, 23]]]]
+// # tensor 't' shape is [1, 2, 3, 4]
 //
-//	epsilon: Ridge term. Must be a scalar.
-//	grad: The gradient.
+// # 'dims' is [3] or 'dims' is -1
+// reverse(t, dims) ==> [[[[ 3,  2,  1,  0],
+//                         [ 7,  6,  5,  4],
+//                         [ 11, 10, 9, 8]],
+//                        [[15, 14, 13, 12],
+//                         [19, 18, 17, 16],
+//                         [23, 22, 21, 20]]]]
 //
-// Returns the created operation.
-func ResourceApplyRMSProp(scope *Scope, var_ tf.Output, ms tf.Output, mom tf.Output, lr tf.Output, rho tf.Output, momentum tf.Output, epsilon tf.Output, grad tf.Output, optional ...ResourceApplyRMSPropAttr) (o *tf.Operation) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "ResourceApplyRMSProp",
-		Input: []tf.Input{
-			var_, ms, mom, lr, rho, momentum, epsilon, grad,
-		},
-		Attrs: attrs,
-	}
-	return scope.AddOperation(opspec)
-}
-
-// Get the value of the tensor specified by its handle.
+// # 'dims' is '[1]' (or 'dims' is '[-3]')
+// reverse(t, dims) ==> [[[[12, 13, 14, 15],
+//                         [16, 17, 18, 19],
+//                         [20, 21, 22, 23]
+//                        [[ 0,  1,  2,  3],
+//                         [ 4,  5,  6,  7],
+//                         [ 8,  9, 10, 11]]]]
+//
+// # 'dims' is '[2]' (or 'dims' is '[-2]')
+// reverse(t, dims) ==> [[[[8, 9, 10, 11],
+//                         [4, 5, 6, 7],
+//                         [0, 1, 2, 3]]
+//                        [[20, 21, 22, 23],
+//                         [16, 17, 18, 19],
+//                         [12, 13, 14, 15]]]]
+// ```
 //
 // Arguments:
-//	handle: The handle for a tensor stored in the session state.
-//	dtype: The type of the output value.
+//	tensor: Up to 8-D.
+//	axis: 1-D. The indices of the dimensions to reverse.
 //
-// Returns The tensor for the given handle.
-func GetSessionTensor(scope *Scope, handle tf.Output, dtype tf.DataType) (value tf.Output) {
+// Returns The same shape as `tensor`.
+func ReverseV2(scope *Scope, tensor tf.Output, axis tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"dtype": dtype}
 	opspec := tf.OpSpec{
-		Type: "GetSessionTensor",
+		Type: "ReverseV2",
 		Input: []tf.Input{
-			handle,
+			tensor, axis,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// ResourceApplyProximalGradientDescentAttr is an optional argument to ResourceApplyProximalGradientDescent.
-type ResourceApplyProximalGradientDescentAttr func(optionalAttr)
+// ResourceApplyCenteredRMSPropAttr is an optional argument to ResourceApplyCenteredRMSProp.
+type ResourceApplyCenteredRMSPropAttr func(optionalAttr)
 
-// ResourceApplyProximalGradientDescentUseLocking sets the optional use_locking attribute to value.
+// ResourceApplyCenteredRMSPropUseLocking sets the optional use_locking attribute to value.
 //
-// value: If True, the subtraction will be protected by a lock;
-// otherwise the behavior is undefined, but may exhibit less contention.
+// value: If `True`, updating of the var, mg, ms, and mom tensors is
+// protected by a lock; otherwise the behavior is undefined, but may exhibit less
+// contention.
 // If not specified, defaults to false
-func ResourceApplyProximalGradientDescentUseLocking(value bool) ResourceApplyProximalGradientDescentAttr {
+func ResourceApplyCenteredRMSPropUseLocking(value bool) ResourceApplyCenteredRMSPropAttr {
 	return func(m optionalAttr) {
 		m["use_locking"] = value
 	}
 }
 
-// Update '*var' as FOBOS algorithm with fixed learning rate.
+// Update '*var' according to the centered RMSProp algorithm.
+//
+// The centered RMSProp algorithm uses an estimate of the centered second moment
+// (i.e., the variance) for normalization, as opposed to regular RMSProp, which
+// uses the (uncentered) second moment. This often helps with training, but is
+// slightly more expensive in terms of computation and memory.
+//
+// Note that in dense implementation of this algorithm, mg, ms, and mom will
+// update even if the grad is zero, but in this sparse implementation, mg, ms,
+// and mom will not update in iterations during which the grad is zero.
+//
+// mean_square = decay * mean_square + (1-decay) * gradient ** 2
+// mean_grad = decay * mean_grad + (1-decay) * gradient
 //
-// prox_v = var - alpha * delta
-// var = sign(prox_v)/(1+alpha*l2) * max{|prox_v|-alpha*l1,0}
+// Delta = learning_rate * gradient / sqrt(mean_square + epsilon - mean_grad ** 2)
+//
+// mg <- rho * mg_{t-1} + (1-rho) * grad
+// ms <- rho * ms_{t-1} + (1-rho) * grad * grad
+// mom <- momentum * mom_{t-1} + lr * grad / sqrt(ms - mg * mg + epsilon)
+// var <- var - mom
 //
 // Arguments:
 //	var_: Should be from a Variable().
-//	alpha: Scaling factor. Must be a scalar.
-//	l1: L1 regularization. Must be a scalar.
-//	l2: L2 regularization. Must be a scalar.
-//	delta: The change.
+//	mg: Should be from a Variable().
+//	ms: Should be from a Variable().
+//	mom: Should be from a Variable().
+//	lr: Scaling factor. Must be a scalar.
+//	rho: Decay rate. Must be a scalar.
+//
+//	epsilon: Ridge term. Must be a scalar.
+//	grad: The gradient.
 //
 // Returns the created operation.
-func ResourceApplyProximalGradientDescent(scope *Scope, var_ tf.Output, alpha tf.Output, l1 tf.Output, l2 tf.Output, delta tf.Output, optional ...ResourceApplyProximalGradientDescentAttr) (o *tf.Operation) {
+func ResourceApplyCenteredRMSProp(scope *Scope, var_ tf.Output, mg tf.Output, ms tf.Output, mom tf.Output, lr tf.Output, rho tf.Output, momentum tf.Output, epsilon tf.Output, grad tf.Output, optional ...ResourceApplyCenteredRMSPropAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
@@ -10716,35 +11256,35 @@ func ResourceApplyProximalGradientDescent(scope *Scope, var_ tf.Output, alpha tf
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "ResourceApplyProximalGradientDescent",
+		Type: "ResourceApplyCenteredRMSProp",
 		Input: []tf.Input{
-			var_, alpha, l1, l2, delta,
+			var_, mg, ms, mom, lr, rho, momentum, epsilon, grad,
 		},
 		Attrs: attrs,
 	}
 	return scope.AddOperation(opspec)
 }
 
-// 2D fast Fourier transform.
+// Inverse 3D fast Fourier transform.
 //
-// Computes the 2-dimensional discrete Fourier transform over the inner-most
-// 2 dimensions of `input`.
+// Computes the inverse 3-dimensional discrete Fourier transform over the
+// inner-most 3 dimensions of `input`.
 //
 // Arguments:
 //	input: A complex64 tensor.
 //
-// Returns A complex64 tensor of the same shape as `input`. The inner-most 2
-//   dimensions of `input` are replaced with their 2D Fourier transform.
+// Returns A complex64 tensor of the same shape as `input`. The inner-most 3
+//   dimensions of `input` are replaced with their inverse 3D Fourier transform.
 //
 // @compatibility(numpy)
-// Equivalent to np.fft.fft2
+// Equivalent to np.fft.ifftn with 3 dimensions.
 // @end_compatibility
-func FFT2D(scope *Scope, input tf.Output) (output tf.Output) {
+func IFFT3D(scope *Scope, input tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "FFT2D",
+		Type: "IFFT3D",
 		Input: []tf.Input{
 			input,
 		},
@@ -10753,215 +11293,297 @@ func FFT2D(scope *Scope, input tf.Output) (output tf.Output) {
 	return op.Output(0)
 }
 
-// Creates a tensor filled with a scalar value.
-//
-// This operation creates a tensor of shape `dims` and fills it with `value`.
+// Looks up keys in a table, outputs the corresponding values.
 //
-// For example:
+// The tensor `keys` must of the same type as the keys of the table.
+// The output `values` is of the type of the table values.
 //
-// ```prettyprint
-// # Output tensor has shape [2, 3].
-// fill([2, 3], 9) ==> [[9, 9, 9]
-//                      [9, 9, 9]]
-// ```
+// The scalar `default_value` is the value output for keys not present in the
+// table. It must also be of the same type as the table values.
 //
 // Arguments:
-//	dims: 1-D. Represents the shape of the output tensor.
-//	value: 0-D (scalar). Value to fill the returned tensor.
+//	table_handle: Handle to the table.
+//	keys: Any shape.  Keys to look up.
 //
-// @compatibility(numpy)
-// Equivalent to np.full
-// @end_compatibility
-func Fill(scope *Scope, dims tf.Output, value tf.Output) (output tf.Output) {
+//
+// Returns Same shape as `keys`.  Values found in the table, or `default_values`
+// for missing keys.
+func LookupTableFindV2(scope *Scope, table_handle tf.Output, keys tf.Output, default_value tf.Output) (values tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Fill",
+		Type: "LookupTableFindV2",
 		Input: []tf.Input{
-			dims, value,
+			table_handle, keys, default_value,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Inverse 2D fast Fourier transform.
+// Given a quantized tensor described by (input, input_min, input_max), outputs a
 //
-// Computes the inverse 2-dimensional discrete Fourier transform over the
-// inner-most 2 dimensions of `input`.
+// range that covers the actual values present in that tensor.  This op is
+// typically used to produce the requested_output_min and requested_output_max for
+// Requantize.
 //
 // Arguments:
-//	input: A complex64 tensor.
 //
-// Returns A complex64 tensor of the same shape as `input`. The inner-most 2
-//   dimensions of `input` are replaced with their inverse 2D Fourier transform.
+//	input_min: The float value that the minimum quantized input value represents.
+//	input_max: The float value that the maximum quantized input value represents.
 //
-// @compatibility(numpy)
-// Equivalent to np.fft.ifft2
-// @end_compatibility
-func IFFT2D(scope *Scope, input tf.Output) (output tf.Output) {
+// Returns The computed min output.the computed max output.
+func RequantizationRange(scope *Scope, input tf.Output, input_min tf.Output, input_max tf.Output) (output_min tf.Output, output_max tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "IFFT2D",
+		Type: "RequantizationRange",
 		Input: []tf.Input{
-			input,
+			input, input_min, input_max,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1)
+}
+
+// Computes second-order gradients of the maxpooling function.
+//
+// Arguments:
+//	input: The original input.
+//	grad: 4-D with shape `[batch, height, width, channels]`.  Gradients w.r.t. the
+// input of `max_pool`.
+//	argmax: The indices of the maximum values chosen for each output of `max_pool`.
+//	ksize: The size of the window for each dimension of the input tensor.
+//	strides: The stride of the sliding window for each dimension of the
+// input tensor.
+//	padding: The type of padding algorithm to use.
+//
+// Returns Gradients of gradients w.r.t. the input of `max_pool`.
+func MaxPoolGradGradWithArgmax(scope *Scope, input tf.Output, grad tf.Output, argmax tf.Output, ksize []int64, strides []int64, padding string) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"ksize": ksize, "strides": strides, "padding": padding}
+	opspec := tf.OpSpec{
+		Type: "MaxPoolGradGradWithArgmax",
+		Input: []tf.Input{
+			input, grad, argmax,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// ResourceApplyProximalAdagradAttr is an optional argument to ResourceApplyProximalAdagrad.
-type ResourceApplyProximalAdagradAttr func(optionalAttr)
+// DepthwiseConv2dNativeBackpropInputAttr is an optional argument to DepthwiseConv2dNativeBackpropInput.
+type DepthwiseConv2dNativeBackpropInputAttr func(optionalAttr)
 
-// ResourceApplyProximalAdagradUseLocking sets the optional use_locking attribute to value.
+// DepthwiseConv2dNativeBackpropInputDataFormat sets the optional data_format attribute to value.
 //
-// value: If True, updating of the var and accum tensors will be protected by
-// a lock; otherwise the behavior is undefined, but may exhibit less contention.
-// If not specified, defaults to false
-func ResourceApplyProximalAdagradUseLocking(value bool) ResourceApplyProximalAdagradAttr {
+// value: Specify the data format of the input and output data. With the
+// default format "NHWC", the data is stored in the order of:
+//     [batch, height, width, channels].
+// Alternatively, the format could be "NCHW", the data storage order of:
+//     [batch, channels, height, width].
+// If not specified, defaults to "NHWC"
+func DepthwiseConv2dNativeBackpropInputDataFormat(value string) DepthwiseConv2dNativeBackpropInputAttr {
 	return func(m optionalAttr) {
-		m["use_locking"] = value
+		m["data_format"] = value
 	}
 }
 
-// Update '*var' and '*accum' according to FOBOS with Adagrad learning rate.
-//
-// accum += grad * grad
-// prox_v = var - lr * grad * (1 / sqrt(accum))
-// var = sign(prox_v)/(1+lr*l2) * max{|prox_v|-lr*l1,0}
+// Computes the gradients of depthwise convolution with respect to the input.
 //
 // Arguments:
-//	var_: Should be from a Variable().
-//	accum: Should be from a Variable().
-//	lr: Scaling factor. Must be a scalar.
-//	l1: L1 regularization. Must be a scalar.
-//	l2: L2 regularization. Must be a scalar.
-//	grad: The gradient.
+//	input_sizes: An integer vector representing the shape of `input`, based
+// on `data_format`.  For example, if `data_format` is 'NHWC' then
+//  `input` is a 4-D `[batch, height, width, channels]` tensor.
+//	filter: 4-D with shape
+// `[filter_height, filter_width, in_channels, depthwise_multiplier]`.
+//	out_backprop: 4-D with shape  based on `data_format`.
+// For example, if `data_format` is 'NHWC' then
+// out_backprop shape is `[batch, out_height, out_width, out_channels]`.
+// Gradients w.r.t. the output of the convolution.
+//	strides: The stride of the sliding window for each dimension of the input
+// of the convolution.
+//	padding: The type of padding algorithm to use.
 //
-// Returns the created operation.
-func ResourceApplyProximalAdagrad(scope *Scope, var_ tf.Output, accum tf.Output, lr tf.Output, l1 tf.Output, l2 tf.Output, grad tf.Output, optional ...ResourceApplyProximalAdagradAttr) (o *tf.Operation) {
+// Returns 4-D with shape according to `data_format`.  For example, if
+// `data_format` is 'NHWC', output shape is `[batch, in_height,
+// in_width, in_channels]`.  Gradient w.r.t. the input of the
+// convolution.
+func DepthwiseConv2dNativeBackpropInput(scope *Scope, input_sizes tf.Output, filter tf.Output, out_backprop tf.Output, strides []int64, padding string, optional ...DepthwiseConv2dNativeBackpropInputAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"strides": strides, "padding": padding}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "ResourceApplyProximalAdagrad",
+		Type: "DepthwiseConv2dNativeBackpropInput",
 		Input: []tf.Input{
-			var_, accum, lr, l1, l2, grad,
+			input_sizes, filter, out_backprop,
 		},
 		Attrs: attrs,
 	}
-	return scope.AddOperation(opspec)
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// TensorArrayV3Attr is an optional argument to TensorArrayV3.
-type TensorArrayV3Attr func(optionalAttr)
-
-// TensorArrayV3ElementShape sets the optional element_shape attribute to value.
+// Computes gradients for the exponential linear (Elu) operation.
 //
-// value: The expected shape of an element, if known. Used to
-// validate the shapes of TensorArray elements. If this shape is not
-// fully specified, gathering zero-size TensorArrays is an error.
-// If not specified, defaults to <unknown_rank:true >
-func TensorArrayV3ElementShape(value tf.Shape) TensorArrayV3Attr {
-	return func(m optionalAttr) {
-		m["element_shape"] = value
+// Arguments:
+//	gradients: The backpropagated gradients to the corresponding Elu operation.
+//	outputs: The outputs of the corresponding Elu operation.
+//
+// Returns The gradients: `gradients * (outputs + 1)` if outputs < 0,
+// `gradients` otherwise.
+func EluGrad(scope *Scope, gradients tf.Output, outputs tf.Output) (backprops tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "EluGrad",
+		Input: []tf.Input{
+			gradients, outputs,
+		},
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// TensorArrayV3DynamicSize sets the optional dynamic_size attribute to value.
+// Converts each string in the input Tensor to its hash mod by a number of buckets.
 //
-// value: A boolean that determines whether writes to the TensorArray
-// are allowed to grow the size.  By default, this is not allowed.
-// If not specified, defaults to false
-func TensorArrayV3DynamicSize(value bool) TensorArrayV3Attr {
-	return func(m optionalAttr) {
-		m["dynamic_size"] = value
+// The hash function is deterministic on the content of the string within the
+// process.
+//
+// Note that the hash function may change from time to time.
+// This functionality will be deprecated and it's recommended to use
+// `tf.string_to_hash_bucket_fast()` or `tf.string_to_hash_bucket_strong()`.
+//
+// Arguments:
+//
+//	num_buckets: The number of buckets.
+//
+// Returns A Tensor of the same shape as the input `string_tensor`.
+func StringToHashBucket(scope *Scope, string_tensor tf.Output, num_buckets int64) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"num_buckets": num_buckets}
+	opspec := tf.OpSpec{
+		Type: "StringToHashBucket",
+		Input: []tf.Input{
+			string_tensor,
+		},
+		Attrs: attrs,
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// TensorArrayV3ClearAfterRead sets the optional clear_after_read attribute to value.
-//
-// value: If true (default), Tensors in the TensorArray are cleared
-// after being read.  This disables multiple read semantics but allows early
-// release of memory.
-// If not specified, defaults to true
-func TensorArrayV3ClearAfterRead(value bool) TensorArrayV3Attr {
+// CumsumAttr is an optional argument to Cumsum.
+type CumsumAttr func(optionalAttr)
+
+// CumsumExclusive sets the optional exclusive attribute to value.
+// If not specified, defaults to false
+func CumsumExclusive(value bool) CumsumAttr {
 	return func(m optionalAttr) {
-		m["clear_after_read"] = value
+		m["exclusive"] = value
 	}
-}
-
-// TensorArrayV3TensorArrayName sets the optional tensor_array_name attribute to value.
-//
-// value: Overrides the name used for the temporary tensor_array
-// resource. Default value is the name of the 'TensorArray' op (which
-// is guaranteed unique).
-// If not specified, defaults to ""
-func TensorArrayV3TensorArrayName(value string) TensorArrayV3Attr {
+}
+
+// CumsumReverse sets the optional reverse attribute to value.
+// If not specified, defaults to false
+func CumsumReverse(value bool) CumsumAttr {
 	return func(m optionalAttr) {
-		m["tensor_array_name"] = value
+		m["reverse"] = value
 	}
 }
 
-// An array of Tensors of given size, with data written via Write and read
+// Compute the cumulative sum of the tensor `x` along `axis`.
 //
-// via Read or Pack.
+// By default, this op performs an inclusive cumsum, which means that the first
+// element of the input is identical to the first element of the output:
+// ```prettyprint
+// tf.cumsum([a, b, c]) ==> [a, a + b, a + b + c]
+// ```
 //
-// Arguments:
-//	size: The size of the array.
-//	dtype: The type of the elements on the tensor_array.
+// By setting the `exclusive` kwarg to `True`, an exclusive cumsum is
+// performed instead:
+// ```prettyprint
+// tf.cumsum([a, b, c], exclusive=True) ==> [0, a, a + b]
+// ```
 //
-// Returns The handle to the TensorArray.A scalar used to control gradient flow.
-func TensorArrayV3(scope *Scope, size tf.Output, dtype tf.DataType, optional ...TensorArrayV3Attr) (handle tf.Output, flow tf.Output) {
+// By setting the `reverse` kwarg to `True`, the cumsum is performed in the
+// opposite direction:
+// ```prettyprint
+// tf.cumsum([a, b, c], reverse=True) ==> [a + b + c, b + c, c]
+// ```
+// This is more efficient than using separate `tf.reverse` ops.
+//
+// The `reverse` and `exclusive` kwargs can also be combined:
+// ```prettyprint
+// tf.cumsum([a, b, c], exclusive=True, reverse=True) ==> [b + c, c, 0]
+// ```
+func Cumsum(scope *Scope, x tf.Output, axis tf.Output, optional ...CumsumAttr) (out tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"dtype": dtype}
+	attrs := map[string]interface{}{}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "TensorArrayV3",
+		Type: "Cumsum",
 		Input: []tf.Input{
-			size,
+			x, axis,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1)
+	return op.Output(0)
 }
 
-// ResourceApplyGradientDescentAttr is an optional argument to ResourceApplyGradientDescent.
-type ResourceApplyGradientDescentAttr func(optionalAttr)
+// ResourceApplyAdamAttr is an optional argument to ResourceApplyAdam.
+type ResourceApplyAdamAttr func(optionalAttr)
 
-// ResourceApplyGradientDescentUseLocking sets the optional use_locking attribute to value.
+// ResourceApplyAdamUseLocking sets the optional use_locking attribute to value.
 //
-// value: If `True`, the subtraction will be protected by a lock;
-// otherwise the behavior is undefined, but may exhibit less contention.
+// value: If `True`, updating of the var, m, and v tensors will be protected
+// by a lock; otherwise the behavior is undefined, but may exhibit less
+// contention.
 // If not specified, defaults to false
-func ResourceApplyGradientDescentUseLocking(value bool) ResourceApplyGradientDescentAttr {
+func ResourceApplyAdamUseLocking(value bool) ResourceApplyAdamAttr {
 	return func(m optionalAttr) {
 		m["use_locking"] = value
 	}
 }
 
-// Update '*var' by subtracting 'alpha' * 'delta' from it.
+// Update '*var' according to the Adam algorithm.
+//
+// lr_t <- learning_rate * sqrt(1 - beta2^t) / (1 - beta1^t)
+// m_t <- beta1 * m_{t-1} + (1 - beta1) * g_t
+// v_t <- beta2 * v_{t-1} + (1 - beta2) * g_t * g_t
+// variable <- variable - lr_t * m_t / (sqrt(v_t) + epsilon)
 //
 // Arguments:
 //	var_: Should be from a Variable().
-//	alpha: Scaling factor. Must be a scalar.
-//	delta: The change.
+//	m: Should be from a Variable().
+//	v: Should be from a Variable().
+//	beta1_power: Must be a scalar.
+//	beta2_power: Must be a scalar.
+//	lr: Scaling factor. Must be a scalar.
+//	beta1: Momentum factor. Must be a scalar.
+//	beta2: Momentum factor. Must be a scalar.
+//	epsilon: Ridge term. Must be a scalar.
+//	grad: The gradient.
 //
 // Returns the created operation.
-func ResourceApplyGradientDescent(scope *Scope, var_ tf.Output, alpha tf.Output, delta tf.Output, optional ...ResourceApplyGradientDescentAttr) (o *tf.Operation) {
+func ResourceApplyAdam(scope *Scope, var_ tf.Output, m tf.Output, v tf.Output, beta1_power tf.Output, beta2_power tf.Output, lr tf.Output, beta1 tf.Output, beta2 tf.Output, epsilon tf.Output, grad tf.Output, optional ...ResourceApplyAdamAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
@@ -10970,49 +11592,66 @@ func ResourceApplyGradientDescent(scope *Scope, var_ tf.Output, alpha tf.Output,
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "ResourceApplyGradientDescent",
+		Type: "ResourceApplyAdam",
 		Input: []tf.Input{
-			var_, alpha, delta,
+			var_, m, v, beta1_power, beta2_power, lr, beta1, beta2, epsilon, grad,
 		},
 		Attrs: attrs,
 	}
 	return scope.AddOperation(opspec)
 }
 
-// MultinomialAttr is an optional argument to Multinomial.
-type MultinomialAttr func(optionalAttr)
-
-// MultinomialSeed sets the optional seed attribute to value.
+// 3D fast Fourier transform.
 //
-// value: If either seed or seed2 is set to be non-zero, the internal random number
-// generator is seeded by the given seed.  Otherwise, a random seed is used.
-// If not specified, defaults to 0
-func MultinomialSeed(value int64) MultinomialAttr {
-	return func(m optionalAttr) {
-		m["seed"] = value
+// Computes the 3-dimensional discrete Fourier transform over the inner-most 3
+// dimensions of `input`.
+//
+// Arguments:
+//	input: A complex64 tensor.
+//
+// Returns A complex64 tensor of the same shape as `input`. The inner-most 3
+//   dimensions of `input` are replaced with their 3D Fourier transform.
+//
+// @compatibility(numpy)
+// Equivalent to np.fft.fftn with 3 dimensions.
+// @end_compatibility
+func FFT3D(scope *Scope, input tf.Output) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "FFT3D",
+		Input: []tf.Input{
+			input,
+		},
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// MultinomialSeed2 sets the optional seed2 attribute to value.
-//
-// value: A second seed to avoid seed collision.
-// If not specified, defaults to 0
-func MultinomialSeed2(value int64) MultinomialAttr {
+// SizeAttr is an optional argument to Size.
+type SizeAttr func(optionalAttr)
+
+// SizeOutType sets the optional out_type attribute to value.
+// If not specified, defaults to DT_INT32
+func SizeOutType(value tf.DataType) SizeAttr {
 	return func(m optionalAttr) {
-		m["seed2"] = value
+		m["out_type"] = value
 	}
 }
 
-// Draws samples from a multinomial distribution.
+// Returns the size of a tensor.
 //
-// Arguments:
-//	logits: 2-D Tensor with shape `[batch_size, num_classes]`.  Each slice `[i, :]`
-// represents the unnormalized log probabilities for all classes.
-//	num_samples: 0-D.  Number of independent samples to draw for each row slice.
+// This operation returns an integer representing the number of elements in
+// `input`.
 //
-// Returns 2-D Tensor with shape `[batch_size, num_samples]`.  Each slice `[i, :]`
-// contains the drawn class labels with range `[0, num_classes)`.
-func Multinomial(scope *Scope, logits tf.Output, num_samples tf.Output, optional ...MultinomialAttr) (output tf.Output) {
+// For example:
+//
+// ```
+// # 't' is [[[1, 1,, 1], [2, 2, 2]], [[3, 3, 3], [4, 4, 4]]]]
+// size(t) ==> 12
+// ```
+func Size(scope *Scope, input tf.Output, optional ...SizeAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -11021,9 +11660,9 @@ func Multinomial(scope *Scope, logits tf.Output, num_samples tf.Output, optional
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "Multinomial",
+		Type: "Size",
 		Input: []tf.Input{
-			logits, num_samples,
+			input,
 		},
 		Attrs: attrs,
 	}
@@ -11031,35 +11670,46 @@ func Multinomial(scope *Scope, logits tf.Output, num_samples tf.Output, optional
 	return op.Output(0)
 }
 
-// ResourceSparseApplyAdagradDAAttr is an optional argument to ResourceSparseApplyAdagradDA.
-type ResourceSparseApplyAdagradDAAttr func(optionalAttr)
+// ResourceApplyRMSPropAttr is an optional argument to ResourceApplyRMSProp.
+type ResourceApplyRMSPropAttr func(optionalAttr)
 
-// ResourceSparseApplyAdagradDAUseLocking sets the optional use_locking attribute to value.
+// ResourceApplyRMSPropUseLocking sets the optional use_locking attribute to value.
 //
-// value: If True, updating of the var and accum tensors will be protected by
-// a lock; otherwise the behavior is undefined, but may exhibit less contention.
+// value: If `True`, updating of the var, ms, and mom tensors is protected
+// by a lock; otherwise the behavior is undefined, but may exhibit less
+// contention.
 // If not specified, defaults to false
-func ResourceSparseApplyAdagradDAUseLocking(value bool) ResourceSparseApplyAdagradDAAttr {
+func ResourceApplyRMSPropUseLocking(value bool) ResourceApplyRMSPropAttr {
 	return func(m optionalAttr) {
 		m["use_locking"] = value
 	}
 }
 
-// Update entries in '*var' and '*accum' according to the proximal adagrad scheme.
+// Update '*var' according to the RMSProp algorithm.
+//
+// Note that in dense implementation of this algorithm, ms and mom will
+// update even if the grad is zero, but in this sparse implementation, ms
+// and mom will not update in iterations during which the grad is zero.
+//
+// mean_square = decay * mean_square + (1-decay) * gradient ** 2
+// Delta = learning_rate * gradient / sqrt(mean_square + epsilon)
+//
+// ms <- rho * ms_{t-1} + (1-rho) * grad * grad
+// mom <- momentum * mom_{t-1} + lr * grad / sqrt(ms + epsilon)
+// var <- var - mom
 //
 // Arguments:
 //	var_: Should be from a Variable().
-//	gradient_accumulator: Should be from a Variable().
-//	gradient_squared_accumulator: Should be from a Variable().
+//	ms: Should be from a Variable().
+//	mom: Should be from a Variable().
+//	lr: Scaling factor. Must be a scalar.
+//	rho: Decay rate. Must be a scalar.
+//
+//	epsilon: Ridge term. Must be a scalar.
 //	grad: The gradient.
-//	indices: A vector of indices into the first dimension of var and accum.
-//	lr: Learning rate. Must be a scalar.
-//	l1: L1 regularization. Must be a scalar.
-//	l2: L2 regularization. Must be a scalar.
-//	global_step: Training step number. Must be a scalar.
 //
 // Returns the created operation.
-func ResourceSparseApplyAdagradDA(scope *Scope, var_ tf.Output, gradient_accumulator tf.Output, gradient_squared_accumulator tf.Output, grad tf.Output, indices tf.Output, lr tf.Output, l1 tf.Output, l2 tf.Output, global_step tf.Output, optional ...ResourceSparseApplyAdagradDAAttr) (o *tf.Operation) {
+func ResourceApplyRMSProp(scope *Scope, var_ tf.Output, ms tf.Output, mom tf.Output, lr tf.Output, rho tf.Output, momentum tf.Output, epsilon tf.Output, grad tf.Output, optional ...ResourceApplyRMSPropAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
@@ -11068,95 +11718,66 @@ func ResourceSparseApplyAdagradDA(scope *Scope, var_ tf.Output, gradient_accumul
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "ResourceSparseApplyAdagradDA",
+		Type: "ResourceApplyRMSProp",
 		Input: []tf.Input{
-			var_, gradient_accumulator, gradient_squared_accumulator, grad, indices, lr, l1, l2, global_step,
+			var_, ms, mom, lr, rho, momentum, epsilon, grad,
 		},
 		Attrs: attrs,
 	}
 	return scope.AddOperation(opspec)
 }
 
-// ResourceApplyAdagradDAAttr is an optional argument to ResourceApplyAdagradDA.
-type ResourceApplyAdagradDAAttr func(optionalAttr)
-
-// ResourceApplyAdagradDAUseLocking sets the optional use_locking attribute to value.
-//
-// value: If True, updating of the var and accum tensors will be protected by
-// a lock; otherwise the behavior is undefined, but may exhibit less contention.
-// If not specified, defaults to false
-func ResourceApplyAdagradDAUseLocking(value bool) ResourceApplyAdagradDAAttr {
-	return func(m optionalAttr) {
-		m["use_locking"] = value
-	}
-}
-
-// Update '*var' according to the proximal adagrad scheme.
+// Get the value of the tensor specified by its handle.
 //
 // Arguments:
-//	var_: Should be from a Variable().
-//	gradient_accumulator: Should be from a Variable().
-//	gradient_squared_accumulator: Should be from a Variable().
-//	grad: The gradient.
-//	lr: Scaling factor. Must be a scalar.
-//	l1: L1 regularization. Must be a scalar.
-//	l2: L2 regularization. Must be a scalar.
-//	global_step: Training step number. Must be a scalar.
+//	handle: The handle for a tensor stored in the session state.
+//	dtype: The type of the output value.
 //
-// Returns the created operation.
-func ResourceApplyAdagradDA(scope *Scope, var_ tf.Output, gradient_accumulator tf.Output, gradient_squared_accumulator tf.Output, grad tf.Output, lr tf.Output, l1 tf.Output, l2 tf.Output, global_step tf.Output, optional ...ResourceApplyAdagradDAAttr) (o *tf.Operation) {
+// Returns The tensor for the given handle.
+func GetSessionTensor(scope *Scope, handle tf.Output, dtype tf.DataType) (value tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
+	attrs := map[string]interface{}{"dtype": dtype}
 	opspec := tf.OpSpec{
-		Type: "ResourceApplyAdagradDA",
+		Type: "GetSessionTensor",
 		Input: []tf.Input{
-			var_, gradient_accumulator, gradient_squared_accumulator, grad, lr, l1, l2, global_step,
+			handle,
 		},
 		Attrs: attrs,
 	}
-	return scope.AddOperation(opspec)
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// FractionalMaxPoolGradAttr is an optional argument to FractionalMaxPoolGrad.
-type FractionalMaxPoolGradAttr func(optionalAttr)
+// ResourceApplyProximalGradientDescentAttr is an optional argument to ResourceApplyProximalGradientDescent.
+type ResourceApplyProximalGradientDescentAttr func(optionalAttr)
 
-// FractionalMaxPoolGradOverlapping sets the optional overlapping attribute to value.
-//
-// value: When set to True, it means when pooling, the values at the boundary
-// of adjacent pooling cells are used by both cells. For example:
-//
-// `index  0  1  2  3  4`
-//
-// `value  20 5  16 3  7`
+// ResourceApplyProximalGradientDescentUseLocking sets the optional use_locking attribute to value.
 //
-// If the pooling sequence is [0, 2, 4], then 16, at index 2 will be used twice.
-// The result would be [20, 16] for fractional max pooling.
+// value: If True, the subtraction will be protected by a lock;
+// otherwise the behavior is undefined, but may exhibit less contention.
 // If not specified, defaults to false
-func FractionalMaxPoolGradOverlapping(value bool) FractionalMaxPoolGradAttr {
+func ResourceApplyProximalGradientDescentUseLocking(value bool) ResourceApplyProximalGradientDescentAttr {
 	return func(m optionalAttr) {
-		m["overlapping"] = value
+		m["use_locking"] = value
 	}
 }
 
-// Computes gradient of the FractionalMaxPool function.
+// Update '*var' as FOBOS algorithm with fixed learning rate.
+//
+// prox_v = var - alpha * delta
+// var = sign(prox_v)/(1+alpha*l2) * max{|prox_v|-alpha*l1,0}
 //
 // Arguments:
-//	orig_input: Original input for `fractional_max_pool`
-//	orig_output: Original output for `fractional_max_pool`
-//	out_backprop: 4-D with shape `[batch, height, width, channels]`.  Gradients
-// w.r.t. the output of `fractional_max_pool`.
-//	row_pooling_sequence: row pooling sequence, form pooling region with
-// col_pooling_sequence.
-//	col_pooling_sequence: column pooling sequence, form pooling region with
-// row_pooling sequence.
+//	var_: Should be from a Variable().
+//	alpha: Scaling factor. Must be a scalar.
+//	l1: L1 regularization. Must be a scalar.
+//	l2: L2 regularization. Must be a scalar.
+//	delta: The change.
 //
-// Returns 4-D.  Gradients w.r.t. the input of `fractional_max_pool`.
-func FractionalMaxPoolGrad(scope *Scope, orig_input tf.Output, orig_output tf.Output, out_backprop tf.Output, row_pooling_sequence tf.Output, col_pooling_sequence tf.Output, optional ...FractionalMaxPoolGradAttr) (output tf.Output) {
+// Returns the created operation.
+func ResourceApplyProximalGradientDescent(scope *Scope, var_ tf.Output, alpha tf.Output, l1 tf.Output, l2 tf.Output, delta tf.Output, optional ...ResourceApplyProximalGradientDescentAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
@@ -11165,229 +11786,205 @@ func FractionalMaxPoolGrad(scope *Scope, orig_input tf.Output, orig_output tf.Ou
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "FractionalMaxPoolGrad",
+		Type: "ResourceApplyProximalGradientDescent",
 		Input: []tf.Input{
-			orig_input, orig_output, out_backprop, row_pooling_sequence, col_pooling_sequence,
+			var_, alpha, l1, l2, delta,
 		},
 		Attrs: attrs,
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return scope.AddOperation(opspec)
 }
 
-// AvgPool3DGradAttr is an optional argument to AvgPool3DGrad.
-type AvgPool3DGradAttr func(optionalAttr)
-
-// AvgPool3DGradDataFormat sets the optional data_format attribute to value.
+// 2D fast Fourier transform.
 //
-// value: The data format of the input and output data. With the
-// default format "NDHWC", the data is stored in the order of:
-//     [batch, in_depth, in_height, in_width, in_channels].
-// Alternatively, the format could be "NCDHW", the data storage order is:
-//     [batch, in_channels, in_depth, in_height, in_width].
-// If not specified, defaults to "NDHWC"
-func AvgPool3DGradDataFormat(value string) AvgPool3DGradAttr {
-	return func(m optionalAttr) {
-		m["data_format"] = value
-	}
-}
-
-// Computes gradients of average pooling function.
+// Computes the 2-dimensional discrete Fourier transform over the inner-most
+// 2 dimensions of `input`.
 //
 // Arguments:
-//	orig_input_shape: The original input dimensions.
-//	grad: Output backprop of shape `[batch, depth, rows, cols, channels]`.
-//	ksize: 1-D tensor of length 5. The size of the window for each dimension of
-// the input tensor. Must have `ksize[0] = ksize[4] = 1`.
-//	strides: 1-D tensor of length 5. The stride of the sliding window for each
-// dimension of `input`. Must have `strides[0] = strides[4] = 1`.
-//	padding: The type of padding algorithm to use.
+//	input: A complex64 tensor.
 //
-// Returns The backprop for input.
-func AvgPool3DGrad(scope *Scope, orig_input_shape tf.Output, grad tf.Output, ksize []int64, strides []int64, padding string, optional ...AvgPool3DGradAttr) (output tf.Output) {
+// Returns A complex64 tensor of the same shape as `input`. The inner-most 2
+//   dimensions of `input` are replaced with their 2D Fourier transform.
+//
+// @compatibility(numpy)
+// Equivalent to np.fft.fft2
+// @end_compatibility
+func FFT2D(scope *Scope, input tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"ksize": ksize, "strides": strides, "padding": padding}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "AvgPool3DGrad",
+		Type: "FFT2D",
 		Input: []tf.Input{
-			orig_input_shape, grad,
+			input,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// QuantizedRelu6Attr is an optional argument to QuantizedRelu6.
-type QuantizedRelu6Attr func(optionalAttr)
-
-// QuantizedRelu6OutType sets the optional out_type attribute to value.
-// If not specified, defaults to DT_QUINT8
-func QuantizedRelu6OutType(value tf.DataType) QuantizedRelu6Attr {
-	return func(m optionalAttr) {
-		m["out_type"] = value
+// Creates a tensor filled with a scalar value.
+//
+// This operation creates a tensor of shape `dims` and fills it with `value`.
+//
+// For example:
+//
+// ```
+// # Output tensor has shape [2, 3].
+// fill([2, 3], 9) ==> [[9, 9, 9]
+//                      [9, 9, 9]]
+// ```
+//
+// Arguments:
+//	dims: 1-D. Represents the shape of the output tensor.
+//	value: 0-D (scalar). Value to fill the returned tensor.
+//
+// @compatibility(numpy)
+// Equivalent to np.full
+// @end_compatibility
+func Fill(scope *Scope, dims tf.Output, value tf.Output) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "Fill",
+		Input: []tf.Input{
+			dims, value,
+		},
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// Computes Quantized Rectified Linear 6: `min(max(features, 0), 6)`
+// Inverse 2D fast Fourier transform.
+//
+// Computes the inverse 2-dimensional discrete Fourier transform over the
+// inner-most 2 dimensions of `input`.
 //
 // Arguments:
+//	input: A complex64 tensor.
 //
-//	min_features: The float value that the lowest quantized value represents.
-//	max_features: The float value that the highest quantized value represents.
+// Returns A complex64 tensor of the same shape as `input`. The inner-most 2
+//   dimensions of `input` are replaced with their inverse 2D Fourier transform.
 //
-// Returns Has the same output shape as "features".The float value that the lowest quantized value represents.The float value that the highest quantized value represents.
-func QuantizedRelu6(scope *Scope, features tf.Output, min_features tf.Output, max_features tf.Output, optional ...QuantizedRelu6Attr) (activations tf.Output, min_activations tf.Output, max_activations tf.Output) {
+// @compatibility(numpy)
+// Equivalent to np.fft.ifft2
+// @end_compatibility
+func IFFT2D(scope *Scope, input tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "QuantizedRelu6",
+		Type: "IFFT2D",
 		Input: []tf.Input{
-			features, min_features, max_features,
+			input,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
+	return op.Output(0)
 }
 
-// FixedLengthRecordReaderV2Attr is an optional argument to FixedLengthRecordReaderV2.
-type FixedLengthRecordReaderV2Attr func(optionalAttr)
+// TensorArrayV3Attr is an optional argument to TensorArrayV3.
+type TensorArrayV3Attr func(optionalAttr)
 
-// FixedLengthRecordReaderV2HeaderBytes sets the optional header_bytes attribute to value.
+// TensorArrayV3ElementShape sets the optional element_shape attribute to value.
 //
-// value: Number of bytes in the header, defaults to 0.
-// If not specified, defaults to 0
-func FixedLengthRecordReaderV2HeaderBytes(value int64) FixedLengthRecordReaderV2Attr {
+// value: The expected shape of an element, if known. Used to
+// validate the shapes of TensorArray elements. If this shape is not
+// fully specified, gathering zero-size TensorArrays is an error.
+// If not specified, defaults to <unknown_rank:true >
+func TensorArrayV3ElementShape(value tf.Shape) TensorArrayV3Attr {
 	return func(m optionalAttr) {
-		m["header_bytes"] = value
+		m["element_shape"] = value
 	}
 }
 
-// FixedLengthRecordReaderV2FooterBytes sets the optional footer_bytes attribute to value.
+// TensorArrayV3DynamicSize sets the optional dynamic_size attribute to value.
 //
-// value: Number of bytes in the footer, defaults to 0.
-// If not specified, defaults to 0
-func FixedLengthRecordReaderV2FooterBytes(value int64) FixedLengthRecordReaderV2Attr {
+// value: A boolean that determines whether writes to the TensorArray
+// are allowed to grow the size.  By default, this is not allowed.
+// If not specified, defaults to false
+func TensorArrayV3DynamicSize(value bool) TensorArrayV3Attr {
 	return func(m optionalAttr) {
-		m["footer_bytes"] = value
+		m["dynamic_size"] = value
 	}
 }
 
-// FixedLengthRecordReaderV2HopBytes sets the optional hop_bytes attribute to value.
+// TensorArrayV3ClearAfterRead sets the optional clear_after_read attribute to value.
 //
-// value: Number of bytes to hop before each read. Default of 0 means using
-// record_bytes.
-// If not specified, defaults to 0
-func FixedLengthRecordReaderV2HopBytes(value int64) FixedLengthRecordReaderV2Attr {
+// value: If true (default), Tensors in the TensorArray are cleared
+// after being read.  This disables multiple read semantics but allows early
+// release of memory.
+// If not specified, defaults to true
+func TensorArrayV3ClearAfterRead(value bool) TensorArrayV3Attr {
 	return func(m optionalAttr) {
-		m["hop_bytes"] = value
+		m["clear_after_read"] = value
 	}
 }
 
-// FixedLengthRecordReaderV2Container sets the optional container attribute to value.
+// TensorArrayV3TensorArrayName sets the optional tensor_array_name attribute to value.
 //
-// value: If non-empty, this reader is placed in the given container.
-// Otherwise, a default container is used.
+// value: Overrides the name used for the temporary tensor_array
+// resource. Default value is the name of the 'TensorArray' op (which
+// is guaranteed unique).
 // If not specified, defaults to ""
-func FixedLengthRecordReaderV2Container(value string) FixedLengthRecordReaderV2Attr {
+func TensorArrayV3TensorArrayName(value string) TensorArrayV3Attr {
 	return func(m optionalAttr) {
-		m["container"] = value
+		m["tensor_array_name"] = value
 	}
 }
 
-// FixedLengthRecordReaderV2SharedName sets the optional shared_name attribute to value.
+// An array of Tensors of given size.
 //
-// value: If non-empty, this reader is named in the given bucket
-// with this shared_name. Otherwise, the node name is used instead.
-// If not specified, defaults to ""
-func FixedLengthRecordReaderV2SharedName(value string) FixedLengthRecordReaderV2Attr {
-	return func(m optionalAttr) {
-		m["shared_name"] = value
-	}
-}
-
-// A Reader that outputs fixed-length records from a file.
+// Write data via Write and read via Read or Pack.
 //
 // Arguments:
-//	record_bytes: Number of bytes in the record.
+//	size: The size of the array.
+//	dtype: The type of the elements on the tensor_array.
 //
-// Returns The handle to reference the Reader.
-func FixedLengthRecordReaderV2(scope *Scope, record_bytes int64, optional ...FixedLengthRecordReaderV2Attr) (reader_handle tf.Output) {
+// Returns The handle to the TensorArray.A scalar used to control gradient flow.
+func TensorArrayV3(scope *Scope, size tf.Output, dtype tf.DataType, optional ...TensorArrayV3Attr) (handle tf.Output, flow tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"record_bytes": record_bytes}
+	attrs := map[string]interface{}{"dtype": dtype}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "FixedLengthRecordReaderV2",
-
+		Type: "TensorArrayV3",
+		Input: []tf.Input{
+			size,
+		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1)
 }
 
-// ResourceSparseApplyCenteredRMSPropAttr is an optional argument to ResourceSparseApplyCenteredRMSProp.
-type ResourceSparseApplyCenteredRMSPropAttr func(optionalAttr)
+// ResourceApplyGradientDescentAttr is an optional argument to ResourceApplyGradientDescent.
+type ResourceApplyGradientDescentAttr func(optionalAttr)
 
-// ResourceSparseApplyCenteredRMSPropUseLocking sets the optional use_locking attribute to value.
+// ResourceApplyGradientDescentUseLocking sets the optional use_locking attribute to value.
 //
-// value: If `True`, updating of the var, mg, ms, and mom tensors is
-// protected by a lock; otherwise the behavior is undefined, but may exhibit less
-// contention.
+// value: If `True`, the subtraction will be protected by a lock;
+// otherwise the behavior is undefined, but may exhibit less contention.
 // If not specified, defaults to false
-func ResourceSparseApplyCenteredRMSPropUseLocking(value bool) ResourceSparseApplyCenteredRMSPropAttr {
-	return func(m optionalAttr) {
-		m["use_locking"] = value
-	}
-}
-
-// Update '*var' according to the centered RMSProp algorithm.
-//
-// The centered RMSProp algorithm uses an estimate of the centered second moment
-// (i.e., the variance) for normalization, as opposed to regular RMSProp, which
-// uses the (uncentered) second moment. This often helps with training, but is
-// slightly more expensive in terms of computation and memory.
-//
-// Note that in dense implementation of this algorithm, mg, ms, and mom will
-// update even if the grad is zero, but in this sparse implementation, mg, ms,
-// and mom will not update in iterations during which the grad is zero.
-//
-// mean_square = decay * mean_square + (1-decay) * gradient ** 2
-// mean_grad = decay * mean_grad + (1-decay) * gradient
-// Delta = learning_rate * gradient / sqrt(mean_square + epsilon - mean_grad ** 2)
-//
-// ms <- rho * ms_{t-1} + (1-rho) * grad * grad
-// mom <- momentum * mom_{t-1} + lr * grad / sqrt(ms + epsilon)
-// var <- var - mom
+func ResourceApplyGradientDescentUseLocking(value bool) ResourceApplyGradientDescentAttr {
+	return func(m optionalAttr) {
+		m["use_locking"] = value
+	}
+}
+
+// Update '*var' by subtracting 'alpha' * 'delta' from it.
 //
 // Arguments:
 //	var_: Should be from a Variable().
-//	mg: Should be from a Variable().
-//	ms: Should be from a Variable().
-//	mom: Should be from a Variable().
-//	lr: Scaling factor. Must be a scalar.
-//	rho: Decay rate. Must be a scalar.
-//
-//	epsilon: Ridge term. Must be a scalar.
-//	grad: The gradient.
-//	indices: A vector of indices into the first dimension of var, ms and mom.
+//	alpha: Scaling factor. Must be a scalar.
+//	delta: The change.
 //
 // Returns the created operation.
-func ResourceSparseApplyCenteredRMSProp(scope *Scope, var_ tf.Output, mg tf.Output, ms tf.Output, mom tf.Output, lr tf.Output, rho tf.Output, momentum tf.Output, epsilon tf.Output, grad tf.Output, indices tf.Output, optional ...ResourceSparseApplyCenteredRMSPropAttr) (o *tf.Operation) {
+func ResourceApplyGradientDescent(scope *Scope, var_ tf.Output, alpha tf.Output, delta tf.Output, optional ...ResourceApplyGradientDescentAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
@@ -11396,97 +11993,96 @@ func ResourceSparseApplyCenteredRMSProp(scope *Scope, var_ tf.Output, mg tf.Outp
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "ResourceSparseApplyCenteredRMSProp",
+		Type: "ResourceApplyGradientDescent",
 		Input: []tf.Input{
-			var_, mg, ms, mom, lr, rho, momentum, epsilon, grad, indices,
+			var_, alpha, delta,
 		},
 		Attrs: attrs,
 	}
 	return scope.AddOperation(opspec)
 }
 
-// Computes the mean along segments of a tensor.
-//
-// Read @{$math_ops#segmentation$the section on segmentation} for an explanation of
-// segments.
-//
-// Computes a tensor such that
-// \\(output_i = \frac{\sum_j data_j}{N}\\) where `mean` is
-// over `j` such that `segment_ids[j] == i` and `N` is the total number of
-// values summed.
+// MultinomialAttr is an optional argument to Multinomial.
+type MultinomialAttr func(optionalAttr)
+
+// MultinomialSeed sets the optional seed attribute to value.
 //
-// If the mean is empty for a given segment ID `i`, `output[i] = 0`.
+// value: If either seed or seed2 is set to be non-zero, the internal random number
+// generator is seeded by the given seed.  Otherwise, a random seed is used.
+// If not specified, defaults to 0
+func MultinomialSeed(value int64) MultinomialAttr {
+	return func(m optionalAttr) {
+		m["seed"] = value
+	}
+}
+
+// MultinomialSeed2 sets the optional seed2 attribute to value.
 //
-// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
-// <img style="width:100%" src="https://www.tensorflow.org/images/SegmentMean.png" alt>
-// </div>
+// value: A second seed to avoid seed collision.
+// If not specified, defaults to 0
+func MultinomialSeed2(value int64) MultinomialAttr {
+	return func(m optionalAttr) {
+		m["seed2"] = value
+	}
+}
+
+// Draws samples from a multinomial distribution.
 //
 // Arguments:
+//	logits: 2-D Tensor with shape `[batch_size, num_classes]`.  Each slice `[i, :]`
+// represents the unnormalized log probabilities for all classes.
+//	num_samples: 0-D.  Number of independent samples to draw for each row slice.
 //
-//	segment_ids: A 1-D tensor whose rank is equal to the rank of `data`'s
-// first dimension.  Values should be sorted and can be repeated.
-//
-// Returns Has same shape as data, except for dimension 0 which
-// has size `k`, the number of segments.
-func SegmentMean(scope *Scope, data tf.Output, segment_ids tf.Output) (output tf.Output) {
+// Returns 2-D Tensor with shape `[batch_size, num_samples]`.  Each slice `[i, :]`
+// contains the drawn class labels with range `[0, num_classes)`.
+func Multinomial(scope *Scope, logits tf.Output, num_samples tf.Output, optional ...MultinomialAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "SegmentMean",
+		Type: "Multinomial",
 		Input: []tf.Input{
-			data, segment_ids,
+			logits, num_samples,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// CumprodAttr is an optional argument to Cumprod.
-type CumprodAttr func(optionalAttr)
-
-// CumprodExclusive sets the optional exclusive attribute to value.
-// If not specified, defaults to false
-func CumprodExclusive(value bool) CumprodAttr {
-	return func(m optionalAttr) {
-		m["exclusive"] = value
-	}
-}
+// ResourceSparseApplyAdagradDAAttr is an optional argument to ResourceSparseApplyAdagradDA.
+type ResourceSparseApplyAdagradDAAttr func(optionalAttr)
 
-// CumprodReverse sets the optional reverse attribute to value.
+// ResourceSparseApplyAdagradDAUseLocking sets the optional use_locking attribute to value.
+//
+// value: If True, updating of the var and accum tensors will be protected by
+// a lock; otherwise the behavior is undefined, but may exhibit less contention.
 // If not specified, defaults to false
-func CumprodReverse(value bool) CumprodAttr {
+func ResourceSparseApplyAdagradDAUseLocking(value bool) ResourceSparseApplyAdagradDAAttr {
 	return func(m optionalAttr) {
-		m["reverse"] = value
+		m["use_locking"] = value
 	}
 }
 
-// Compute the cumulative product of the tensor `x` along `axis`.
-//
-// By default, this op performs an inclusive cumprod, which means that the first
-// element of the input is identical to the first element of the output:
-// ```prettyprint
-// tf.cumprod([a, b, c]) ==> [a, a * b, a * b * c]
-// ```
-//
-// By setting the `exclusive` kwarg to `True`, an exclusive cumprod is
-// performed instead:
-// ```prettyprint
-// tf.cumprod([a, b, c], exclusive=True) ==> [1, a, a * b]
-// ```
+// Update entries in '*var' and '*accum' according to the proximal adagrad scheme.
 //
-// By setting the `reverse` kwarg to `True`, the cumprod is performed in the
-// opposite direction:
-// ```prettyprint
-// tf.cumprod([a, b, c], reverse=True) ==> [a * b * c, b * c, c]
-// ```
-// This is more efficient than using separate `tf.reverse` ops.
+// Arguments:
+//	var_: Should be from a Variable().
+//	gradient_accumulator: Should be from a Variable().
+//	gradient_squared_accumulator: Should be from a Variable().
+//	grad: The gradient.
+//	indices: A vector of indices into the first dimension of var and accum.
+//	lr: Learning rate. Must be a scalar.
+//	l1: L1 regularization. Must be a scalar.
+//	l2: L2 regularization. Must be a scalar.
+//	global_step: Training step number. Must be a scalar.
 //
-// The `reverse` and `exclusive` kwargs can also be combined:
-// ```prettyprint
-// tf.cumprod([a, b, c], exclusive=True, reverse=True) ==> [b * c, c, 1]
-// ```
-func Cumprod(scope *Scope, x tf.Output, axis tf.Output, optional ...CumprodAttr) (out tf.Output) {
+// Returns the created operation.
+func ResourceSparseApplyAdagradDA(scope *Scope, var_ tf.Output, gradient_accumulator tf.Output, gradient_squared_accumulator tf.Output, grad tf.Output, indices tf.Output, lr tf.Output, l1 tf.Output, l2 tf.Output, global_step tf.Output, optional ...ResourceSparseApplyAdagradDAAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
@@ -11495,303 +12091,278 @@ func Cumprod(scope *Scope, x tf.Output, axis tf.Output, optional ...CumprodAttr)
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "Cumprod",
+		Type: "ResourceSparseApplyAdagradDA",
 		Input: []tf.Input{
-			x, axis,
+			var_, gradient_accumulator, gradient_squared_accumulator, grad, indices, lr, l1, l2, global_step,
 		},
 		Attrs: attrs,
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return scope.AddOperation(opspec)
 }
 
-// Adds two `SparseTensor` objects to produce another `SparseTensor`.
-//
-// The input `SparseTensor` objects' indices are assumed ordered in standard
-// lexicographic order.  If this is not the case, before this step run
-// `SparseReorder` to restore index ordering.
-//
-// By default, if two values sum to zero at some index, the output `SparseTensor`
-// would still include that particular location in its index, storing a zero in the
-// corresponding value slot.  To override this, callers can specify `thresh`,
-// indicating that if the sum has a magnitude strictly smaller than `thresh`, its
-// corresponding value and index would then not be included.  In particular,
-// `thresh == 0` (default) means everything is kept and actual thresholding happens
-// only for a positive value.
-//
-// In the following shapes, `nnz` is the count after taking `thresh` into account.
+// ResourceApplyAdagradDAAttr is an optional argument to ResourceApplyAdagradDA.
+type ResourceApplyAdagradDAAttr func(optionalAttr)
+
+// ResourceApplyAdagradDAUseLocking sets the optional use_locking attribute to value.
 //
-// Arguments:
-//	a_indices: 2-D.  The `indices` of the first `SparseTensor`, size `[nnz, ndims]` Matrix.
-//	a_values: 1-D.  The `values` of the first `SparseTensor`, size `[nnz]` Vector.
-//	a_shape: 1-D.  The `shape` of the first `SparseTensor`, size `[ndims]` Vector.
-//	b_indices: 2-D.  The `indices` of the second `SparseTensor`, size `[nnz, ndims]` Matrix.
-//	b_values: 1-D.  The `values` of the second `SparseTensor`, size `[nnz]` Vector.
-//	b_shape: 1-D.  The `shape` of the second `SparseTensor`, size `[ndims]` Vector.
-//	thresh: 0-D.  The magnitude threshold that determines if an output value/index
-// pair takes space.
-func SparseAdd(scope *Scope, a_indices tf.Output, a_values tf.Output, a_shape tf.Output, b_indices tf.Output, b_values tf.Output, b_shape tf.Output, thresh tf.Output) (sum_indices tf.Output, sum_values tf.Output, sum_shape tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "SparseAdd",
-		Input: []tf.Input{
-			a_indices, a_values, a_shape, b_indices, b_values, b_shape, thresh,
-		},
+// value: If True, updating of the var and accum tensors will be protected by
+// a lock; otherwise the behavior is undefined, but may exhibit less contention.
+// If not specified, defaults to false
+func ResourceApplyAdagradDAUseLocking(value bool) ResourceApplyAdagradDAAttr {
+	return func(m optionalAttr) {
+		m["use_locking"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// Gradient op for `MirrorPad` op. This op folds a mirror-padded tensor.
-//
-// This operation folds the padded areas of `input` by `MirrorPad` according to the
-// `paddings` you specify. `paddings` must be the same as `paddings` argument
-// given to the corresponding `MirrorPad` op.
-//
-// The folded size of each dimension D of the output is:
-//
-// `input.dim_size(D) - paddings(D, 0) - paddings(D, 1)`
-//
-// For example:
-//
-// ```prettyprint
-// # 't' is [[1, 2, 3], [4, 5, 6], [7, 8, 9]].
-// # 'paddings' is [[0, 1]], [0, 1]].
-// # 'mode' is SYMMETRIC.
-// # rank of 't' is 2.
-// pad(t, paddings) ==> [[ 1,  5]
-//                       [11, 28]]
-// ```
+// Update '*var' according to the proximal adagrad scheme.
 //
 // Arguments:
-//	input: The input tensor to be folded.
-//	paddings: A two-column matrix specifying the padding sizes. The number of
-// rows must be the same as the rank of `input`.
-//	mode: The mode used in the `MirrorPad` op.
+//	var_: Should be from a Variable().
+//	gradient_accumulator: Should be from a Variable().
+//	gradient_squared_accumulator: Should be from a Variable().
+//	grad: The gradient.
+//	lr: Scaling factor. Must be a scalar.
+//	l1: L1 regularization. Must be a scalar.
+//	l2: L2 regularization. Must be a scalar.
+//	global_step: Training step number. Must be a scalar.
 //
-// Returns The folded tensor.
-func MirrorPadGrad(scope *Scope, input tf.Output, paddings tf.Output, mode string) (output tf.Output) {
+// Returns the created operation.
+func ResourceApplyAdagradDA(scope *Scope, var_ tf.Output, gradient_accumulator tf.Output, gradient_squared_accumulator tf.Output, grad tf.Output, lr tf.Output, l1 tf.Output, l2 tf.Output, global_step tf.Output, optional ...ResourceApplyAdagradDAAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"mode": mode}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "MirrorPadGrad",
+		Type: "ResourceApplyAdagradDA",
 		Input: []tf.Input{
-			input, paddings,
+			var_, gradient_accumulator, gradient_squared_accumulator, grad, lr, l1, l2, global_step,
 		},
 		Attrs: attrs,
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return scope.AddOperation(opspec)
 }
 
-// Computes the inverse permutation of a tensor.
-//
-// This operation computes the inverse of an index permutation. It takes a 1-D
-// integer tensor `x`, which represents the indices of a zero-based array, and
-// swaps each value with its index position. In other words, for an output tensor
-// `y` and an input tensor `x`, this operation computes the following:
+// FractionalMaxPoolGradAttr is an optional argument to FractionalMaxPoolGrad.
+type FractionalMaxPoolGradAttr func(optionalAttr)
+
+// FractionalMaxPoolGradOverlapping sets the optional overlapping attribute to value.
 //
-// `y[x[i]] = i for i in [0, 1, ..., len(x) - 1]`
+// value: When set to True, it means when pooling, the values at the boundary
+// of adjacent pooling cells are used by both cells. For example:
 //
-// The values must include 0. There can be no duplicate values or negative values.
+// `index  0  1  2  3  4`
 //
-// For example:
+// `value  20 5  16 3  7`
 //
-// ```prettyprint
-// # tensor `x` is [3, 4, 0, 2, 1]
-// invert_permutation(x) ==> [2, 4, 3, 0, 1]
-// ```
+// If the pooling sequence is [0, 2, 4], then 16, at index 2 will be used twice.
+// The result would be [20, 16] for fractional max pooling.
+// If not specified, defaults to false
+func FractionalMaxPoolGradOverlapping(value bool) FractionalMaxPoolGradAttr {
+	return func(m optionalAttr) {
+		m["overlapping"] = value
+	}
+}
+
+// Computes gradient of the FractionalMaxPool function.
 //
 // Arguments:
-//	x: 1-D.
+//	orig_input: Original input for `fractional_max_pool`
+//	orig_output: Original output for `fractional_max_pool`
+//	out_backprop: 4-D with shape `[batch, height, width, channels]`.  Gradients
+// w.r.t. the output of `fractional_max_pool`.
+//	row_pooling_sequence: row pooling sequence, form pooling region with
+// col_pooling_sequence.
+//	col_pooling_sequence: column pooling sequence, form pooling region with
+// row_pooling sequence.
 //
-// Returns 1-D.
-func InvertPermutation(scope *Scope, x tf.Output) (y tf.Output) {
+// Returns 4-D.  Gradients w.r.t. the input of `fractional_max_pool`.
+func FractionalMaxPoolGrad(scope *Scope, orig_input tf.Output, orig_output tf.Output, out_backprop tf.Output, row_pooling_sequence tf.Output, col_pooling_sequence tf.Output, optional ...FractionalMaxPoolGradAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "InvertPermutation",
+		Type: "FractionalMaxPoolGrad",
 		Input: []tf.Input{
-			x,
+			orig_input, orig_output, out_backprop, row_pooling_sequence, col_pooling_sequence,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Reverses specific dimensions of a tensor.
-//
-// Given a `tensor`, and a `bool` tensor `dims` representing the dimensions
-// of `tensor`, this operation reverses each dimension i of `tensor` where
-// `dims[i]` is `True`.
-//
-// `tensor` can have up to 8 dimensions. The number of dimensions
-// of `tensor` must equal the number of elements in `dims`. In other words:
-//
-// `rank(tensor) = size(dims)`
-//
-// For example:
-//
-// ```prettyprint
-// # tensor 't' is [[[[ 0,  1,  2,  3],
-// #                  [ 4,  5,  6,  7],
-// #                  [ 8,  9, 10, 11]],
-// #                 [[12, 13, 14, 15],
-// #                  [16, 17, 18, 19],
-// #                  [20, 21, 22, 23]]]]
-// # tensor 't' shape is [1, 2, 3, 4]
-//
-// # 'dims' is [False, False, False, True]
-// reverse(t, dims) ==> [[[[ 3,  2,  1,  0],
-//                         [ 7,  6,  5,  4],
-//                         [ 11, 10, 9, 8]],
-//                        [[15, 14, 13, 12],
-//                         [19, 18, 17, 16],
-//                         [23, 22, 21, 20]]]]
-//
-// # 'dims' is [False, True, False, False]
-// reverse(t, dims) ==> [[[[12, 13, 14, 15],
-//                         [16, 17, 18, 19],
-//                         [20, 21, 22, 23]
-//                        [[ 0,  1,  2,  3],
-//                         [ 4,  5,  6,  7],
-//                         [ 8,  9, 10, 11]]]]
-//
-// # 'dims' is [False, False, True, False]
-// reverse(t, dims) ==> [[[[8, 9, 10, 11],
-//                         [4, 5, 6, 7],
-//                         [0, 1, 2, 3]]
-//                        [[20, 21, 22, 23],
-//                         [16, 17, 18, 19],
-//                         [12, 13, 14, 15]]]]
-// ```
+// QuantizedRelu6Attr is an optional argument to QuantizedRelu6.
+type QuantizedRelu6Attr func(optionalAttr)
+
+// QuantizedRelu6OutType sets the optional out_type attribute to value.
+// If not specified, defaults to DT_QUINT8
+func QuantizedRelu6OutType(value tf.DataType) QuantizedRelu6Attr {
+	return func(m optionalAttr) {
+		m["out_type"] = value
+	}
+}
+
+// Computes Quantized Rectified Linear 6: `min(max(features, 0), 6)`
 //
 // Arguments:
-//	tensor: Up to 8-D.
-//	dims: 1-D. The dimensions to reverse.
 //
-// Returns The same shape as `tensor`.
-func Reverse(scope *Scope, tensor tf.Output, dims tf.Output) (output tf.Output) {
+//	min_features: The float value that the lowest quantized value represents.
+//	max_features: The float value that the highest quantized value represents.
+//
+// Returns Has the same output shape as "features".The float value that the lowest quantized value represents.The float value that the highest quantized value represents.
+func QuantizedRelu6(scope *Scope, features tf.Output, min_features tf.Output, max_features tf.Output, optional ...QuantizedRelu6Attr) (activations tf.Output, min_activations tf.Output, max_activations tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "Reverse",
+		Type: "QuantizedRelu6",
 		Input: []tf.Input{
-			tensor, dims,
+			features, min_features, max_features,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// Conv2DAttr is an optional argument to Conv2D.
-type Conv2DAttr func(optionalAttr)
+// FixedLengthRecordReaderV2Attr is an optional argument to FixedLengthRecordReaderV2.
+type FixedLengthRecordReaderV2Attr func(optionalAttr)
 
-// Conv2DUseCudnnOnGpu sets the optional use_cudnn_on_gpu attribute to value.
-// If not specified, defaults to true
-func Conv2DUseCudnnOnGpu(value bool) Conv2DAttr {
+// FixedLengthRecordReaderV2HeaderBytes sets the optional header_bytes attribute to value.
+//
+// value: Number of bytes in the header, defaults to 0.
+// If not specified, defaults to 0
+func FixedLengthRecordReaderV2HeaderBytes(value int64) FixedLengthRecordReaderV2Attr {
 	return func(m optionalAttr) {
-		m["use_cudnn_on_gpu"] = value
+		m["header_bytes"] = value
 	}
 }
 
-// Conv2DDataFormat sets the optional data_format attribute to value.
+// FixedLengthRecordReaderV2FooterBytes sets the optional footer_bytes attribute to value.
 //
-// value: Specify the data format of the input and output data. With the
-// default format "NHWC", the data is stored in the order of:
-//     [batch, height, width, channels].
-// Alternatively, the format could be "NCHW", the data storage order of:
-//     [batch, channels, height, width].
-// If not specified, defaults to "NHWC"
-func Conv2DDataFormat(value string) Conv2DAttr {
+// value: Number of bytes in the footer, defaults to 0.
+// If not specified, defaults to 0
+func FixedLengthRecordReaderV2FooterBytes(value int64) FixedLengthRecordReaderV2Attr {
 	return func(m optionalAttr) {
-		m["data_format"] = value
+		m["footer_bytes"] = value
 	}
 }
 
-// Computes a 2-D convolution given 4-D `input` and `filter` tensors.
-//
-// Given an input tensor of shape `[batch, in_height, in_width, in_channels]`
-// and a filter / kernel tensor of shape
-// `[filter_height, filter_width, in_channels, out_channels]`, this op
-// performs the following:
-//
-// 1. Flattens the filter to a 2-D matrix with shape
-//    `[filter_height * filter_width * in_channels, output_channels]`.
-// 2. Extracts image patches from the input tensor to form a *virtual*
-//    tensor of shape `[batch, out_height, out_width,
-//    filter_height * filter_width * in_channels]`.
-// 3. For each patch, right-multiplies the filter matrix and the image patch
-//    vector.
+// FixedLengthRecordReaderV2HopBytes sets the optional hop_bytes attribute to value.
 //
-// In detail, with the default NHWC format,
+// value: Number of bytes to hop before each read. Default of 0 means using
+// record_bytes.
+// If not specified, defaults to 0
+func FixedLengthRecordReaderV2HopBytes(value int64) FixedLengthRecordReaderV2Attr {
+	return func(m optionalAttr) {
+		m["hop_bytes"] = value
+	}
+}
+
+// FixedLengthRecordReaderV2Container sets the optional container attribute to value.
 //
-//     output[b, i, j, k] =
-//         sum_{di, dj, q} input[b, strides[1] * i + di, strides[2] * j + dj, q] *
-//                         filter[di, dj, q, k]
+// value: If non-empty, this reader is placed in the given container.
+// Otherwise, a default container is used.
+// If not specified, defaults to ""
+func FixedLengthRecordReaderV2Container(value string) FixedLengthRecordReaderV2Attr {
+	return func(m optionalAttr) {
+		m["container"] = value
+	}
+}
+
+// FixedLengthRecordReaderV2SharedName sets the optional shared_name attribute to value.
 //
-// Must have `strides[0] = strides[3] = 1`.  For the most common case of the same
-// horizontal and vertices strides, `strides = [1, stride, stride, 1]`.
+// value: If non-empty, this reader is named in the given bucket
+// with this shared_name. Otherwise, the node name is used instead.
+// If not specified, defaults to ""
+func FixedLengthRecordReaderV2SharedName(value string) FixedLengthRecordReaderV2Attr {
+	return func(m optionalAttr) {
+		m["shared_name"] = value
+	}
+}
+
+// A Reader that outputs fixed-length records from a file.
 //
 // Arguments:
-//	input: A 4-D tensor. The dimension order is interpreted according to the value
-// of `data_format`, see below for details.
-//	filter: A 4-D tensor of shape
-// `[filter_height, filter_width, in_channels, out_channels]`
-//	strides: 1-D tensor of length 4.  The stride of the sliding window for each
-// dimension of `input`. The dimension order is determined by the value of
-//   `data_format`, see below for details.
-//	padding: The type of padding algorithm to use.
+//	record_bytes: Number of bytes in the record.
 //
-// Returns A 4-D tensor. The dimension order is determined by the value of
-// `data_format`, see below for details.
-func Conv2D(scope *Scope, input tf.Output, filter tf.Output, strides []int64, padding string, optional ...Conv2DAttr) (output tf.Output) {
+// Returns The handle to reference the Reader.
+func FixedLengthRecordReaderV2(scope *Scope, record_bytes int64, optional ...FixedLengthRecordReaderV2Attr) (reader_handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"strides": strides, "padding": padding}
+	attrs := map[string]interface{}{"record_bytes": record_bytes}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "Conv2D",
-		Input: []tf.Input{
-			input, filter,
-		},
+		Type: "FixedLengthRecordReaderV2",
+
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// StringJoinAttr is an optional argument to StringJoin.
-type StringJoinAttr func(optionalAttr)
+// ResourceSparseApplyCenteredRMSPropAttr is an optional argument to ResourceSparseApplyCenteredRMSProp.
+type ResourceSparseApplyCenteredRMSPropAttr func(optionalAttr)
 
-// StringJoinSeparator sets the optional separator attribute to value.
+// ResourceSparseApplyCenteredRMSPropUseLocking sets the optional use_locking attribute to value.
 //
-// value: string, an optional join separator.
-// If not specified, defaults to ""
-func StringJoinSeparator(value string) StringJoinAttr {
+// value: If `True`, updating of the var, mg, ms, and mom tensors is
+// protected by a lock; otherwise the behavior is undefined, but may exhibit less
+// contention.
+// If not specified, defaults to false
+func ResourceSparseApplyCenteredRMSPropUseLocking(value bool) ResourceSparseApplyCenteredRMSPropAttr {
 	return func(m optionalAttr) {
-		m["separator"] = value
+		m["use_locking"] = value
 	}
 }
 
-// Joins the strings in the given list of string tensors into one tensor;
+// Update '*var' according to the centered RMSProp algorithm.
+//
+// The centered RMSProp algorithm uses an estimate of the centered second moment
+// (i.e., the variance) for normalization, as opposed to regular RMSProp, which
+// uses the (uncentered) second moment. This often helps with training, but is
+// slightly more expensive in terms of computation and memory.
+//
+// Note that in dense implementation of this algorithm, mg, ms, and mom will
+// update even if the grad is zero, but in this sparse implementation, mg, ms,
+// and mom will not update in iterations during which the grad is zero.
+//
+// mean_square = decay * mean_square + (1-decay) * gradient ** 2
+// mean_grad = decay * mean_grad + (1-decay) * gradient
+// Delta = learning_rate * gradient / sqrt(mean_square + epsilon - mean_grad ** 2)
+//
+// ms <- rho * ms_{t-1} + (1-rho) * grad * grad
+// mom <- momentum * mom_{t-1} + lr * grad / sqrt(ms + epsilon)
+// var <- var - mom
+//
+// Arguments:
+//	var_: Should be from a Variable().
+//	mg: Should be from a Variable().
+//	ms: Should be from a Variable().
+//	mom: Should be from a Variable().
+//	lr: Scaling factor. Must be a scalar.
+//	rho: Decay rate. Must be a scalar.
 //
-// with the given separator (default is an empty separator).
+//	epsilon: Ridge term. Must be a scalar.
+//	grad: The gradient.
+//	indices: A vector of indices into the first dimension of var, ms and mom.
 //
-// Arguments:
-//	inputs: A list of string tensors.  The tensors must all have the same shape,
-// or be scalars.  Scalars may be mixed in; these will be broadcast to the shape
-// of non-scalar inputs.
-func StringJoin(scope *Scope, inputs []tf.Output, optional ...StringJoinAttr) (output tf.Output) {
+// Returns the created operation.
+func ResourceSparseApplyCenteredRMSProp(scope *Scope, var_ tf.Output, mg tf.Output, ms tf.Output, mom tf.Output, lr tf.Output, rho tf.Output, momentum tf.Output, epsilon tf.Output, grad tf.Output, indices tf.Output, optional ...ResourceSparseApplyCenteredRMSPropAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
@@ -11800,431 +12371,402 @@ func StringJoin(scope *Scope, inputs []tf.Output, optional ...StringJoinAttr) (o
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "StringJoin",
+		Type: "ResourceSparseApplyCenteredRMSProp",
 		Input: []tf.Input{
-			tf.OutputList(inputs),
+			var_, mg, ms, mom, lr, rho, momentum, epsilon, grad, indices,
 		},
 		Attrs: attrs,
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return scope.AddOperation(opspec)
 }
 
-// Transforms a vector of brain.Example protos (as strings) into typed tensors.
+// Computes the mean along segments of a tensor.
 //
-// Arguments:
-//	serialized: A vector containing a batch of binary serialized Example protos.
-//	names: A vector containing the names of the serialized protos.
-// May contain, for example, table key (descriptive) names for the
-// corresponding serialized protos.  These are purely useful for debugging
-// purposes, and the presence of values here has no effect on the output.
-// May also be an empty vector if no names are available.
-// If non-empty, this vector must be the same length as "serialized".
-//	sparse_keys: A list of Nsparse string Tensors (scalars).
-// The keys expected in the Examples' features associated with sparse values.
-//	dense_keys: A list of Ndense string Tensors (scalars).
-// The keys expected in the Examples' features associated with dense values.
-//	dense_defaults: A list of Ndense Tensors (some may be empty).
-// dense_defaults[j] provides default values
-// when the example's feature_map lacks dense_key[j].  If an empty Tensor is
-// provided for dense_defaults[j], then the Feature dense_keys[j] is required.
-// The input type is inferred from dense_defaults[j], even when it's empty.
-// If dense_defaults[j] is not empty, and dense_shapes[j] is fully defined,
-// then the shape of dense_defaults[j] must match that of dense_shapes[j].
-// If dense_shapes[j] has an undefined major dimension (variable strides dense
-// feature), dense_defaults[j] must contain a single element:
-// the padding element.
-//	sparse_types: A list of Nsparse types; the data types of data in each Feature
-// given in sparse_keys.
-// Currently the ParseExample supports DT_FLOAT (FloatList),
-// DT_INT64 (Int64List), and DT_STRING (BytesList).
-//	dense_shapes: A list of Ndense shapes; the shapes of data in each Feature
-// given in dense_keys.
-// The number of elements in the Feature corresponding to dense_key[j]
-// must always equal dense_shapes[j].NumEntries().
-// If dense_shapes[j] == (D0, D1, ..., DN) then the shape of output
-// Tensor dense_values[j] will be (|serialized|, D0, D1, ..., DN):
-// The dense outputs are just the inputs row-stacked by batch.
-// This works for dense_shapes[j] = (-1, D1, ..., DN).  In this case
-// the shape of the output Tensor dense_values[j] will be
-// (|serialized|, M, D1, .., DN), where M is the maximum number of blocks
-// of elements of length D1 * .... * DN, across all minibatch entries
-// in the input.  Any minibatch entry with less than M blocks of elements of
-// length D1 * ... * DN will be padded with the corresponding default_value
-// scalar element along the second dimension.
-func ParseExample(scope *Scope, serialized tf.Output, names tf.Output, sparse_keys []tf.Output, dense_keys []tf.Output, dense_defaults []tf.Output, sparse_types []tf.DataType, dense_shapes []tf.Shape) (sparse_indices []tf.Output, sparse_values []tf.Output, sparse_shapes []tf.Output, dense_values []tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"sparse_types": sparse_types, "dense_shapes": dense_shapes}
-	opspec := tf.OpSpec{
-		Type: "ParseExample",
-		Input: []tf.Input{
-			serialized, names, tf.OutputList(sparse_keys), tf.OutputList(dense_keys), tf.OutputList(dense_defaults),
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	if scope.Err() != nil {
-		return
-	}
-	var idx int
-	var err error
-	if sparse_indices, idx, err = makeOutputList(op, idx, "sparse_indices"); err != nil {
-		scope.UpdateErr("ParseExample", err)
-		return
-	}
-	if sparse_values, idx, err = makeOutputList(op, idx, "sparse_values"); err != nil {
-		scope.UpdateErr("ParseExample", err)
-		return
-	}
-	if sparse_shapes, idx, err = makeOutputList(op, idx, "sparse_shapes"); err != nil {
-		scope.UpdateErr("ParseExample", err)
-		return
-	}
-	if dense_values, idx, err = makeOutputList(op, idx, "dense_values"); err != nil {
-		scope.UpdateErr("ParseExample", err)
-		return
-	}
-	return sparse_indices, sparse_values, sparse_shapes, dense_values
-}
-
-// Compute the pairwise cross product.
+// Read @{$math_ops#segmentation$the section on segmentation} for an explanation of
+// segments.
 //
-// `a` and `b` must be the same shape; they can either be simple 3-element vectors,
-// or any shape where the innermost dimension is 3. In the latter case, each pair
-// of corresponding 3-element vectors is cross-multiplied independently.
+// Computes a tensor such that
+// \\(output_i = \frac{\sum_j data_j}{N}\\) where `mean` is
+// over `j` such that `segment_ids[j] == i` and `N` is the total number of
+// values summed.
+//
+// If the mean is empty for a given segment ID `i`, `output[i] = 0`.
+//
+// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
+// <img style="width:100%" src="https://www.tensorflow.org/images/SegmentMean.png" alt>
+// </div>
 //
 // Arguments:
-//	a: A tensor containing 3-element vectors.
-//	b: Another tensor, of same type and shape as `a`.
 //
-// Returns Pairwise cross product of the vectors in `a` and `b`.
-func Cross(scope *Scope, a tf.Output, b tf.Output) (product tf.Output) {
+//	segment_ids: A 1-D tensor whose rank is equal to the rank of `data`'s
+// first dimension.  Values should be sorted and can be repeated.
+//
+// Returns Has same shape as data, except for dimension 0 which
+// has size `k`, the number of segments.
+func SegmentMean(scope *Scope, data tf.Output, segment_ids tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Cross",
+		Type: "SegmentMean",
 		Input: []tf.Input{
-			a, b,
+			data, segment_ids,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Inverse 2D real-valued fast Fourier transform.
-//
-// Computes the inverse 2-dimensional discrete Fourier transform of a real-valued
-// signal over the inner-most 2 dimensions of `input`.
+// CumprodAttr is an optional argument to Cumprod.
+type CumprodAttr func(optionalAttr)
+
+// CumprodExclusive sets the optional exclusive attribute to value.
+// If not specified, defaults to false
+func CumprodExclusive(value bool) CumprodAttr {
+	return func(m optionalAttr) {
+		m["exclusive"] = value
+	}
+}
+
+// CumprodReverse sets the optional reverse attribute to value.
+// If not specified, defaults to false
+func CumprodReverse(value bool) CumprodAttr {
+	return func(m optionalAttr) {
+		m["reverse"] = value
+	}
+}
+
+// Compute the cumulative product of the tensor `x` along `axis`.
 //
-// The inner-most 2 dimensions of `input` are assumed to be the result of `RFFT2D`:
-// The inner-most dimension contains the `fft_length / 2 + 1` unique components of
-// the DFT of a real-valued signal. If `fft_length` is not provided, it is computed
-// from the size of the inner-most 2 dimensions of `input`. If the FFT length used
-// to compute `input` is odd, it should be provided since it cannot be inferred
-// properly.
+// By default, this op performs an inclusive cumprod, which means that the first
+// element of the input is identical to the first element of the output:
+// ```prettyprint
+// tf.cumprod([a, b, c]) ==> [a, a * b, a * b * c]
+// ```
 //
-// Arguments:
-//	input: A complex64 tensor.
-//	fft_length: An int32 tensor of shape [2]. The FFT length for each dimension.
+// By setting the `exclusive` kwarg to `True`, an exclusive cumprod is
+// performed instead:
+// ```prettyprint
+// tf.cumprod([a, b, c], exclusive=True) ==> [1, a, a * b]
+// ```
 //
-// Returns A float32 tensor of the same rank as `input`. The inner-most 2
-//   dimensions of `input` are replaced with the `fft_length` samples of their
-//   inverse 2D Fourier transform.
+// By setting the `reverse` kwarg to `True`, the cumprod is performed in the
+// opposite direction:
+// ```prettyprint
+// tf.cumprod([a, b, c], reverse=True) ==> [a * b * c, b * c, c]
+// ```
+// This is more efficient than using separate `tf.reverse` ops.
 //
-// @compatibility(numpy)
-// Equivalent to np.fft.irfft2
-// @end_compatibility
-func IRFFT2D(scope *Scope, input tf.Output, fft_length tf.Output) (output tf.Output) {
+// The `reverse` and `exclusive` kwargs can also be combined:
+// ```prettyprint
+// tf.cumprod([a, b, c], exclusive=True, reverse=True) ==> [b * c, c, 1]
+// ```
+func Cumprod(scope *Scope, x tf.Output, axis tf.Output, optional ...CumprodAttr) (out tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "IRFFT2D",
+		Type: "Cumprod",
 		Input: []tf.Input{
-			input, fft_length,
+			x, axis,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Returns element-wise remainder of division. This emulates C semantics where
+// Adds two `SparseTensor` objects to produce another `SparseTensor`.
 //
-// true, this follows C semantics in that the result here is consistent
-// with a flooring divide. E.g. `floor(x / y) * y + mod(x, y) = x`.
+// The input `SparseTensor` objects' indices are assumed ordered in standard
+// lexicographic order.  If this is not the case, before this step run
+// `SparseReorder` to restore index ordering.
 //
-// *NOTE*: `Mod` supports broadcasting. More about broadcasting
-// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-func TruncateMod(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+// By default, if two values sum to zero at some index, the output `SparseTensor`
+// would still include that particular location in its index, storing a zero in the
+// corresponding value slot.  To override this, callers can specify `thresh`,
+// indicating that if the sum has a magnitude strictly smaller than `thresh`, its
+// corresponding value and index would then not be included.  In particular,
+// `thresh == 0` (default) means everything is kept and actual thresholding happens
+// only for a positive value.
+//
+// In the following shapes, `nnz` is the count after taking `thresh` into account.
+//
+// Arguments:
+//	a_indices: 2-D.  The `indices` of the first `SparseTensor`, size `[nnz, ndims]` Matrix.
+//	a_values: 1-D.  The `values` of the first `SparseTensor`, size `[nnz]` Vector.
+//	a_shape: 1-D.  The `shape` of the first `SparseTensor`, size `[ndims]` Vector.
+//	b_indices: 2-D.  The `indices` of the second `SparseTensor`, size `[nnz, ndims]` Matrix.
+//	b_values: 1-D.  The `values` of the second `SparseTensor`, size `[nnz]` Vector.
+//	b_shape: 1-D.  The `shape` of the second `SparseTensor`, size `[ndims]` Vector.
+//	thresh: 0-D.  The magnitude threshold that determines if an output value/index
+// pair takes space.
+func SparseAdd(scope *Scope, a_indices tf.Output, a_values tf.Output, a_shape tf.Output, b_indices tf.Output, b_values tf.Output, b_shape tf.Output, thresh tf.Output) (sum_indices tf.Output, sum_values tf.Output, sum_shape tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "TruncateMod",
+		Type: "SparseAdd",
 		Input: []tf.Input{
-			x, y,
+			a_indices, a_values, a_shape, b_indices, b_values, b_shape, thresh,
 		},
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// ResourceApplyAdagradAttr is an optional argument to ResourceApplyAdagrad.
-type ResourceApplyAdagradAttr func(optionalAttr)
-
-// ResourceApplyAdagradUseLocking sets the optional use_locking attribute to value.
+// Gradient op for `MirrorPad` op. This op folds a mirror-padded tensor.
 //
-// value: If `True`, updating of the var and accum tensors will be protected
-// by a lock; otherwise the behavior is undefined, but may exhibit less
-// contention.
-// If not specified, defaults to false
-func ResourceApplyAdagradUseLocking(value bool) ResourceApplyAdagradAttr {
-	return func(m optionalAttr) {
-		m["use_locking"] = value
-	}
-}
-
-// Update '*var' according to the adagrad scheme.
+// This operation folds the padded areas of `input` by `MirrorPad` according to the
+// `paddings` you specify. `paddings` must be the same as `paddings` argument
+// given to the corresponding `MirrorPad` op.
+//
+// The folded size of each dimension D of the output is:
 //
-// accum += grad * grad
-// var -= lr * grad * (1 / sqrt(accum))
+// `input.dim_size(D) - paddings(D, 0) - paddings(D, 1)`
+//
+// For example:
+//
+// ```
+// # 't' is [[1, 2, 3], [4, 5, 6], [7, 8, 9]].
+// # 'paddings' is [[0, 1]], [0, 1]].
+// # 'mode' is SYMMETRIC.
+// # rank of 't' is 2.
+// pad(t, paddings) ==> [[ 1,  5]
+//                       [11, 28]]
+// ```
 //
 // Arguments:
-//	var_: Should be from a Variable().
-//	accum: Should be from a Variable().
-//	lr: Scaling factor. Must be a scalar.
-//	grad: The gradient.
+//	input: The input tensor to be folded.
+//	paddings: A two-column matrix specifying the padding sizes. The number of
+// rows must be the same as the rank of `input`.
+//	mode: The mode used in the `MirrorPad` op.
 //
-// Returns the created operation.
-func ResourceApplyAdagrad(scope *Scope, var_ tf.Output, accum tf.Output, lr tf.Output, grad tf.Output, optional ...ResourceApplyAdagradAttr) (o *tf.Operation) {
+// Returns The folded tensor.
+func MirrorPadGrad(scope *Scope, input tf.Output, paddings tf.Output, mode string) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
+	attrs := map[string]interface{}{"mode": mode}
 	opspec := tf.OpSpec{
-		Type: "ResourceApplyAdagrad",
+		Type: "MirrorPadGrad",
 		Input: []tf.Input{
-			var_, accum, lr, grad,
+			input, paddings,
 		},
 		Attrs: attrs,
 	}
-	return scope.AddOperation(opspec)
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// SparseReduceSumAttr is an optional argument to SparseReduceSum.
-type SparseReduceSumAttr func(optionalAttr)
-
-// SparseReduceSumKeepDims sets the optional keep_dims attribute to value.
+// Computes the inverse permutation of a tensor.
 //
-// value: If true, retain reduced dimensions with length 1.
-// If not specified, defaults to false
-func SparseReduceSumKeepDims(value bool) SparseReduceSumAttr {
-	return func(m optionalAttr) {
-		m["keep_dims"] = value
-	}
-}
-
-// Computes the sum of elements across dimensions of a SparseTensor.
+// This operation computes the inverse of an index permutation. It takes a 1-D
+// integer tensor `x`, which represents the indices of a zero-based array, and
+// swaps each value with its index position. In other words, for an output tensor
+// `y` and an input tensor `x`, this operation computes the following:
 //
-// This Op takes a SparseTensor and is the sparse counterpart to
-// `tf.reduce_sum()`.  In particular, this Op also returns a dense `Tensor`
-// instead of a sparse one.
+// `y[x[i]] = i for i in [0, 1, ..., len(x) - 1]`
 //
-// Reduces `sp_input` along the dimensions given in `reduction_axes`.  Unless
-// `keep_dims` is true, the rank of the tensor is reduced by 1 for each entry in
-// `reduction_axes`. If `keep_dims` is true, the reduced dimensions are retained
-// with length 1.
+// The values must include 0. There can be no duplicate values or negative values.
 //
-// If `reduction_axes` has no entries, all dimensions are reduced, and a tensor
-// with a single element is returned.  Additionally, the axes can be negative,
-// which are interpreted according to the indexing rules in Python.
+// For example:
+//
+// ```
+// # tensor `x` is [3, 4, 0, 2, 1]
+// invert_permutation(x) ==> [2, 4, 3, 0, 1]
+// ```
 //
 // Arguments:
-//	input_indices: 2-D.  `N x R` matrix with the indices of non-empty values in a
-// SparseTensor, possibly not in canonical ordering.
-//	input_values: 1-D.  `N` non-empty values corresponding to `input_indices`.
-//	input_shape: 1-D.  Shape of the input SparseTensor.
-//	reduction_axes: 1-D.  Length-`K` vector containing the reduction axes.
+//	x: 1-D.
 //
-// Returns `R-K`-D.  The reduced Tensor.
-func SparseReduceSum(scope *Scope, input_indices tf.Output, input_values tf.Output, input_shape tf.Output, reduction_axes tf.Output, optional ...SparseReduceSumAttr) (output tf.Output) {
+// Returns 1-D.
+func InvertPermutation(scope *Scope, x tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "SparseReduceSum",
+		Type: "InvertPermutation",
 		Input: []tf.Input{
-			input_indices, input_values, input_shape, reduction_axes,
+			x,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// MaxPool3DGradAttr is an optional argument to MaxPool3DGrad.
-type MaxPool3DGradAttr func(optionalAttr)
-
-// MaxPool3DGradDataFormat sets the optional data_format attribute to value.
+// Reverses specific dimensions of a tensor.
 //
-// value: The data format of the input and output data. With the
-// default format "NDHWC", the data is stored in the order of:
-//     [batch, in_depth, in_height, in_width, in_channels].
-// Alternatively, the format could be "NCDHW", the data storage order is:
-//     [batch, in_channels, in_depth, in_height, in_width].
-// If not specified, defaults to "NDHWC"
-func MaxPool3DGradDataFormat(value string) MaxPool3DGradAttr {
-	return func(m optionalAttr) {
-		m["data_format"] = value
-	}
-}
-
-// Computes gradients of max pooling function.
+// Given a `tensor`, and a `bool` tensor `dims` representing the dimensions
+// of `tensor`, this operation reverses each dimension i of `tensor` where
+// `dims[i]` is `True`.
+//
+// `tensor` can have up to 8 dimensions. The number of dimensions
+// of `tensor` must equal the number of elements in `dims`. In other words:
+//
+// `rank(tensor) = size(dims)`
+//
+// For example:
+//
+// ```
+// # tensor 't' is [[[[ 0,  1,  2,  3],
+// #                  [ 4,  5,  6,  7],
+// #                  [ 8,  9, 10, 11]],
+// #                 [[12, 13, 14, 15],
+// #                  [16, 17, 18, 19],
+// #                  [20, 21, 22, 23]]]]
+// # tensor 't' shape is [1, 2, 3, 4]
+//
+// # 'dims' is [False, False, False, True]
+// reverse(t, dims) ==> [[[[ 3,  2,  1,  0],
+//                         [ 7,  6,  5,  4],
+//                         [ 11, 10, 9, 8]],
+//                        [[15, 14, 13, 12],
+//                         [19, 18, 17, 16],
+//                         [23, 22, 21, 20]]]]
+//
+// # 'dims' is [False, True, False, False]
+// reverse(t, dims) ==> [[[[12, 13, 14, 15],
+//                         [16, 17, 18, 19],
+//                         [20, 21, 22, 23]
+//                        [[ 0,  1,  2,  3],
+//                         [ 4,  5,  6,  7],
+//                         [ 8,  9, 10, 11]]]]
+//
+// # 'dims' is [False, False, True, False]
+// reverse(t, dims) ==> [[[[8, 9, 10, 11],
+//                         [4, 5, 6, 7],
+//                         [0, 1, 2, 3]]
+//                        [[20, 21, 22, 23],
+//                         [16, 17, 18, 19],
+//                         [12, 13, 14, 15]]]]
+// ```
 //
 // Arguments:
-//	orig_input: The original input tensor.
-//	orig_output: The original output tensor.
-//	grad: Output backprop of shape `[batch, depth, rows, cols, channels]`.
-//	ksize: 1-D tensor of length 5. The size of the window for each dimension of
-// the input tensor. Must have `ksize[0] = ksize[4] = 1`.
-//	strides: 1-D tensor of length 5. The stride of the sliding window for each
-// dimension of `input`. Must have `strides[0] = strides[4] = 1`.
-//	padding: The type of padding algorithm to use.
-func MaxPool3DGrad(scope *Scope, orig_input tf.Output, orig_output tf.Output, grad tf.Output, ksize []int64, strides []int64, padding string, optional ...MaxPool3DGradAttr) (output tf.Output) {
+//	tensor: Up to 8-D.
+//	dims: 1-D. The dimensions to reverse.
+//
+// Returns The same shape as `tensor`.
+func Reverse(scope *Scope, tensor tf.Output, dims tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"ksize": ksize, "strides": strides, "padding": padding}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "MaxPool3DGrad",
+		Type: "Reverse",
 		Input: []tf.Input{
-			orig_input, orig_output, grad,
+			tensor, dims,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// ThreadUnsafeUnigramCandidateSamplerAttr is an optional argument to ThreadUnsafeUnigramCandidateSampler.
-type ThreadUnsafeUnigramCandidateSamplerAttr func(optionalAttr)
+// Conv2DAttr is an optional argument to Conv2D.
+type Conv2DAttr func(optionalAttr)
 
-// ThreadUnsafeUnigramCandidateSamplerSeed sets the optional seed attribute to value.
-//
-// value: If either seed or seed2 are set to be non-zero, the random number
-// generator is seeded by the given seed.  Otherwise, it is seeded by a
-// random seed.
-// If not specified, defaults to 0
-func ThreadUnsafeUnigramCandidateSamplerSeed(value int64) ThreadUnsafeUnigramCandidateSamplerAttr {
+// Conv2DUseCudnnOnGpu sets the optional use_cudnn_on_gpu attribute to value.
+// If not specified, defaults to true
+func Conv2DUseCudnnOnGpu(value bool) Conv2DAttr {
 	return func(m optionalAttr) {
-		m["seed"] = value
+		m["use_cudnn_on_gpu"] = value
 	}
 }
 
-// ThreadUnsafeUnigramCandidateSamplerSeed2 sets the optional seed2 attribute to value.
+// Conv2DDataFormat sets the optional data_format attribute to value.
 //
-// value: An second seed to avoid seed collision.
-// If not specified, defaults to 0
-func ThreadUnsafeUnigramCandidateSamplerSeed2(value int64) ThreadUnsafeUnigramCandidateSamplerAttr {
+// value: Specify the data format of the input and output data. With the
+// default format "NHWC", the data is stored in the order of:
+//     [batch, height, width, channels].
+// Alternatively, the format could be "NCHW", the data storage order of:
+//     [batch, channels, height, width].
+// If not specified, defaults to "NHWC"
+func Conv2DDataFormat(value string) Conv2DAttr {
 	return func(m optionalAttr) {
-		m["seed2"] = value
+		m["data_format"] = value
 	}
 }
 
-// Generates labels for candidate sampling with a learned unigram distribution.
+// Computes a 2-D convolution given 4-D `input` and `filter` tensors.
 //
-// See explanations of candidate sampling and the data formats at
-// go/candidate-sampling.
+// Given an input tensor of shape `[batch, in_height, in_width, in_channels]`
+// and a filter / kernel tensor of shape
+// `[filter_height, filter_width, in_channels, out_channels]`, this op
+// performs the following:
 //
-// For each batch, this op picks a single set of sampled candidate labels.
+// 1. Flattens the filter to a 2-D matrix with shape
+//    `[filter_height * filter_width * in_channels, output_channels]`.
+// 2. Extracts image patches from the input tensor to form a *virtual*
+//    tensor of shape `[batch, out_height, out_width,
+//    filter_height * filter_width * in_channels]`.
+// 3. For each patch, right-multiplies the filter matrix and the image patch
+//    vector.
 //
-// The advantages of sampling candidates per-batch are simplicity and the
-// possibility of efficient dense matrix multiplication. The disadvantage is that
-// the sampled candidates must be chosen independently of the context and of the
-// true labels.
+// In detail, with the default NHWC format,
 //
-// Arguments:
-//	true_classes: A batch_size * num_true matrix, in which each row contains the
-// IDs of the num_true target_classes in the corresponding original label.
-//	num_true: Number of true labels per context.
-//	num_sampled: Number of candidates to randomly sample per batch.
-//	unique: If unique is true, we sample with rejection, so that all sampled
-// candidates in a batch are unique. This requires some approximation to
-// estimate the post-rejection sampling probabilities.
-//	range_max: The sampler will sample integers from the interval [0, range_max).
+//     output[b, i, j, k] =
+//         sum_{di, dj, q} input[b, strides[1] * i + di, strides[2] * j + dj, q] *
+//                         filter[di, dj, q, k]
 //
-// Returns A vector of length num_sampled, in which each element is
-// the ID of a sampled candidate.A batch_size * num_true matrix, representing
-// the number of times each candidate is expected to occur in a batch
-// of sampled candidates. If unique=true, then this is a probability.A vector of length num_sampled, for each sampled
-// candidate representing the number of times the candidate is expected
-// to occur in a batch of sampled candidates.  If unique=true, then this is a
-// probability.
-func ThreadUnsafeUnigramCandidateSampler(scope *Scope, true_classes tf.Output, num_true int64, num_sampled int64, unique bool, range_max int64, optional ...ThreadUnsafeUnigramCandidateSamplerAttr) (sampled_candidates tf.Output, true_expected_count tf.Output, sampled_expected_count tf.Output) {
+// Must have `strides[0] = strides[3] = 1`.  For the most common case of the same
+// horizontal and vertices strides, `strides = [1, stride, stride, 1]`.
+//
+// Arguments:
+//	input: A 4-D tensor. The dimension order is interpreted according to the value
+// of `data_format`, see below for details.
+//	filter: A 4-D tensor of shape
+// `[filter_height, filter_width, in_channels, out_channels]`
+//	strides: 1-D tensor of length 4.  The stride of the sliding window for each
+// dimension of `input`. The dimension order is determined by the value of
+//   `data_format`, see below for details.
+//	padding: The type of padding algorithm to use.
+//
+// Returns A 4-D tensor. The dimension order is determined by the value of
+// `data_format`, see below for details.
+func Conv2D(scope *Scope, input tf.Output, filter tf.Output, strides []int64, padding string, optional ...Conv2DAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"num_true": num_true, "num_sampled": num_sampled, "unique": unique, "range_max": range_max}
+	attrs := map[string]interface{}{"strides": strides, "padding": padding}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "ThreadUnsafeUnigramCandidateSampler",
+		Type: "Conv2D",
 		Input: []tf.Input{
-			true_classes,
+			input, filter,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
+	return op.Output(0)
 }
 
-// ResourceSparseApplyProximalAdagradAttr is an optional argument to ResourceSparseApplyProximalAdagrad.
-type ResourceSparseApplyProximalAdagradAttr func(optionalAttr)
+// StringJoinAttr is an optional argument to StringJoin.
+type StringJoinAttr func(optionalAttr)
 
-// ResourceSparseApplyProximalAdagradUseLocking sets the optional use_locking attribute to value.
+// StringJoinSeparator sets the optional separator attribute to value.
 //
-// value: If True, updating of the var and accum tensors will be protected by
-// a lock; otherwise the behavior is undefined, but may exhibit less contention.
-// If not specified, defaults to false
-func ResourceSparseApplyProximalAdagradUseLocking(value bool) ResourceSparseApplyProximalAdagradAttr {
+// value: string, an optional join separator.
+// If not specified, defaults to ""
+func StringJoinSeparator(value string) StringJoinAttr {
 	return func(m optionalAttr) {
-		m["use_locking"] = value
+		m["separator"] = value
 	}
 }
 
-// Sparse update entries in '*var' and '*accum' according to FOBOS algorithm.
+// Joins the strings in the given list of string tensors into one tensor;
 //
-// That is for rows we have grad for, we update var and accum as follows:
-// accum += grad * grad
-// prox_v = var
-// prox_v -= lr * grad * (1 / sqrt(accum))
-// var = sign(prox_v)/(1+lr*l2) * max{|prox_v|-lr*l1,0}
+// with the given separator (default is an empty separator).
 //
 // Arguments:
-//	var_: Should be from a Variable().
-//	accum: Should be from a Variable().
-//	lr: Learning rate. Must be a scalar.
-//	l1: L1 regularization. Must be a scalar.
-//	l2: L2 regularization. Must be a scalar.
-//	grad: The gradient.
-//	indices: A vector of indices into the first dimension of var and accum.
-//
-// Returns the created operation.
-func ResourceSparseApplyProximalAdagrad(scope *Scope, var_ tf.Output, accum tf.Output, lr tf.Output, l1 tf.Output, l2 tf.Output, grad tf.Output, indices tf.Output, optional ...ResourceSparseApplyProximalAdagradAttr) (o *tf.Operation) {
+//	inputs: A list of string tensors.  The tensors must all have the same shape,
+// or be scalars.  Scalars may be mixed in; these will be broadcast to the shape
+// of non-scalar inputs.
+func StringJoin(scope *Scope, inputs []tf.Output, optional ...StringJoinAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -12233,292 +12775,170 @@ func ResourceSparseApplyProximalAdagrad(scope *Scope, var_ tf.Output, accum tf.O
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "ResourceSparseApplyProximalAdagrad",
+		Type: "StringJoin",
 		Input: []tf.Input{
-			var_, accum, lr, l1, l2, grad, indices,
+			tf.OutputList(inputs),
 		},
 		Attrs: attrs,
 	}
-	return scope.AddOperation(opspec)
-}
-
-// Store the input tensor in the state of the current session.
-//
-// Arguments:
-//	value: The tensor to be stored.
-//
-// Returns The handle for the tensor stored in the session state, represented
-// as a string.
-func GetSessionHandle(scope *Scope, value tf.Output) (handle tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "GetSessionHandle",
-		Input: []tf.Input{
-			value,
-		},
-	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Decode web-safe base64-encoded strings.
-//
-// Input may or may not have padding at the end. See EncodeBase64 for padding.
-// Web-safe means that input must use - and _ instead of + and /.
+// Transforms a vector of brain.Example protos (as strings) into typed tensors.
 //
 // Arguments:
-//	input: Base64 strings to decode.
-//
-// Returns Decoded strings.
-func DecodeBase64(scope *Scope, input tf.Output) (output tf.Output) {
+//	serialized: A vector containing a batch of binary serialized Example protos.
+//	names: A vector containing the names of the serialized protos.
+// May contain, for example, table key (descriptive) names for the
+// corresponding serialized protos.  These are purely useful for debugging
+// purposes, and the presence of values here has no effect on the output.
+// May also be an empty vector if no names are available.
+// If non-empty, this vector must be the same length as "serialized".
+//	sparse_keys: A list of Nsparse string Tensors (scalars).
+// The keys expected in the Examples' features associated with sparse values.
+//	dense_keys: A list of Ndense string Tensors (scalars).
+// The keys expected in the Examples' features associated with dense values.
+//	dense_defaults: A list of Ndense Tensors (some may be empty).
+// dense_defaults[j] provides default values
+// when the example's feature_map lacks dense_key[j].  If an empty Tensor is
+// provided for dense_defaults[j], then the Feature dense_keys[j] is required.
+// The input type is inferred from dense_defaults[j], even when it's empty.
+// If dense_defaults[j] is not empty, and dense_shapes[j] is fully defined,
+// then the shape of dense_defaults[j] must match that of dense_shapes[j].
+// If dense_shapes[j] has an undefined major dimension (variable strides dense
+// feature), dense_defaults[j] must contain a single element:
+// the padding element.
+//	sparse_types: A list of Nsparse types; the data types of data in each Feature
+// given in sparse_keys.
+// Currently the ParseExample supports DT_FLOAT (FloatList),
+// DT_INT64 (Int64List), and DT_STRING (BytesList).
+//	dense_shapes: A list of Ndense shapes; the shapes of data in each Feature
+// given in dense_keys.
+// The number of elements in the Feature corresponding to dense_key[j]
+// must always equal dense_shapes[j].NumEntries().
+// If dense_shapes[j] == (D0, D1, ..., DN) then the shape of output
+// Tensor dense_values[j] will be (|serialized|, D0, D1, ..., DN):
+// The dense outputs are just the inputs row-stacked by batch.
+// This works for dense_shapes[j] = (-1, D1, ..., DN).  In this case
+// the shape of the output Tensor dense_values[j] will be
+// (|serialized|, M, D1, .., DN), where M is the maximum number of blocks
+// of elements of length D1 * .... * DN, across all minibatch entries
+// in the input.  Any minibatch entry with less than M blocks of elements of
+// length D1 * ... * DN will be padded with the corresponding default_value
+// scalar element along the second dimension.
+func ParseExample(scope *Scope, serialized tf.Output, names tf.Output, sparse_keys []tf.Output, dense_keys []tf.Output, dense_defaults []tf.Output, sparse_types []tf.DataType, dense_shapes []tf.Shape) (sparse_indices []tf.Output, sparse_values []tf.Output, sparse_shapes []tf.Output, dense_values []tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"sparse_types": sparse_types, "dense_shapes": dense_shapes}
 	opspec := tf.OpSpec{
-		Type: "DecodeBase64",
+		Type: "ParseExample",
 		Input: []tf.Input{
-			input,
+			serialized, names, tf.OutputList(sparse_keys), tf.OutputList(dense_keys), tf.OutputList(dense_defaults),
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Computes hyperbolic tangent of `x` element-wise.
-func Tanh(scope *Scope, x tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	opspec := tf.OpSpec{
-		Type: "Tanh",
-		Input: []tf.Input{
-			x,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// TextLineReaderV2Attr is an optional argument to TextLineReaderV2.
-type TextLineReaderV2Attr func(optionalAttr)
-
-// TextLineReaderV2SkipHeaderLines sets the optional skip_header_lines attribute to value.
-//
-// value: Number of lines to skip from the beginning of every file.
-// If not specified, defaults to 0
-func TextLineReaderV2SkipHeaderLines(value int64) TextLineReaderV2Attr {
-	return func(m optionalAttr) {
-		m["skip_header_lines"] = value
-	}
-}
-
-// TextLineReaderV2Container sets the optional container attribute to value.
-//
-// value: If non-empty, this reader is placed in the given container.
-// Otherwise, a default container is used.
-// If not specified, defaults to ""
-func TextLineReaderV2Container(value string) TextLineReaderV2Attr {
-	return func(m optionalAttr) {
-		m["container"] = value
-	}
-}
-
-// TextLineReaderV2SharedName sets the optional shared_name attribute to value.
-//
-// value: If non-empty, this reader is named in the given bucket
-// with this shared_name. Otherwise, the node name is used instead.
-// If not specified, defaults to ""
-func TextLineReaderV2SharedName(value string) TextLineReaderV2Attr {
-	return func(m optionalAttr) {
-		m["shared_name"] = value
-	}
-}
-
-// A Reader that outputs the lines of a file delimited by '\n'.
-//
-// Returns The handle to reference the Reader.
-func TextLineReaderV2(scope *Scope, optional ...TextLineReaderV2Attr) (reader_handle tf.Output) {
-	if scope.Err() != nil {
+	var idx int
+	var err error
+	if sparse_indices, idx, err = makeOutputList(op, idx, "sparse_indices"); err != nil {
+		scope.UpdateErr("ParseExample", err)
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "TextLineReaderV2",
-
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// SampleDistortedBoundingBoxAttr is an optional argument to SampleDistortedBoundingBox.
-type SampleDistortedBoundingBoxAttr func(optionalAttr)
-
-// SampleDistortedBoundingBoxSeed sets the optional seed attribute to value.
-//
-// value: If either `seed` or `seed2` are set to non-zero, the random number
-// generator is seeded by the given `seed`.  Otherwise, it is seeded by a random
-// seed.
-// If not specified, defaults to 0
-func SampleDistortedBoundingBoxSeed(value int64) SampleDistortedBoundingBoxAttr {
-	return func(m optionalAttr) {
-		m["seed"] = value
-	}
-}
-
-// SampleDistortedBoundingBoxSeed2 sets the optional seed2 attribute to value.
-//
-// value: A second seed to avoid seed collision.
-// If not specified, defaults to 0
-func SampleDistortedBoundingBoxSeed2(value int64) SampleDistortedBoundingBoxAttr {
-	return func(m optionalAttr) {
-		m["seed2"] = value
+	if sparse_values, idx, err = makeOutputList(op, idx, "sparse_values"); err != nil {
+		scope.UpdateErr("ParseExample", err)
+		return
 	}
-}
-
-// SampleDistortedBoundingBoxMinObjectCovered sets the optional min_object_covered attribute to value.
-//
-// value: The cropped area of the image must contain at least this
-// fraction of any bounding box supplied. The value of this parameter should be
-// non-negative. In the case of 0, the cropped area does not need to overlap
-// any of the bounding boxes supplied.
-// If not specified, defaults to 0.1
-func SampleDistortedBoundingBoxMinObjectCovered(value float32) SampleDistortedBoundingBoxAttr {
-	return func(m optionalAttr) {
-		m["min_object_covered"] = value
+	if sparse_shapes, idx, err = makeOutputList(op, idx, "sparse_shapes"); err != nil {
+		scope.UpdateErr("ParseExample", err)
+		return
 	}
-}
-
-// SampleDistortedBoundingBoxAspectRatioRange sets the optional aspect_ratio_range attribute to value.
-//
-// value: The cropped area of the image must have an aspect ratio =
-// width / height within this range.
-// If not specified, defaults to <f:0.75 f:1.33 >
-func SampleDistortedBoundingBoxAspectRatioRange(value []float32) SampleDistortedBoundingBoxAttr {
-	return func(m optionalAttr) {
-		m["aspect_ratio_range"] = value
+	if dense_values, idx, err = makeOutputList(op, idx, "dense_values"); err != nil {
+		scope.UpdateErr("ParseExample", err)
+		return
 	}
+	return sparse_indices, sparse_values, sparse_shapes, dense_values
 }
 
-// SampleDistortedBoundingBoxAreaRange sets the optional area_range attribute to value.
+// Compute the pairwise cross product.
 //
-// value: The cropped area of the image must contain a fraction of the
-// supplied image within in this range.
-// If not specified, defaults to <f:0.05 f:1 >
-func SampleDistortedBoundingBoxAreaRange(value []float32) SampleDistortedBoundingBoxAttr {
-	return func(m optionalAttr) {
-		m["area_range"] = value
-	}
-}
-
-// SampleDistortedBoundingBoxMaxAttempts sets the optional max_attempts attribute to value.
+// `a` and `b` must be the same shape; they can either be simple 3-element vectors,
+// or any shape where the innermost dimension is 3. In the latter case, each pair
+// of corresponding 3-element vectors is cross-multiplied independently.
 //
-// value: Number of attempts at generating a cropped region of the image
-// of the specified constraints. After `max_attempts` failures, return the entire
-// image.
-// If not specified, defaults to 100
-func SampleDistortedBoundingBoxMaxAttempts(value int64) SampleDistortedBoundingBoxAttr {
-	return func(m optionalAttr) {
-		m["max_attempts"] = value
-	}
-}
-
-// SampleDistortedBoundingBoxUseImageIfNoBoundingBoxes sets the optional use_image_if_no_bounding_boxes attribute to value.
+// Arguments:
+//	a: A tensor containing 3-element vectors.
+//	b: Another tensor, of same type and shape as `a`.
 //
-// value: Controls behavior if no bounding boxes supplied.
-// If true, assume an implicit bounding box covering the whole input. If false,
-// raise an error.
-// If not specified, defaults to false
-func SampleDistortedBoundingBoxUseImageIfNoBoundingBoxes(value bool) SampleDistortedBoundingBoxAttr {
-	return func(m optionalAttr) {
-		m["use_image_if_no_bounding_boxes"] = value
+// Returns Pairwise cross product of the vectors in `a` and `b`.
+func Cross(scope *Scope, a tf.Output, b tf.Output) (product tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "Cross",
+		Input: []tf.Input{
+			a, b,
+		},
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// Generate a single randomly distorted bounding box for an image.
-//
-// Bounding box annotations are often supplied in addition to ground-truth labels
-// in image recognition or object localization tasks. A common technique for
-// training such a system is to randomly distort an image while preserving
-// its content, i.e. *data augmentation*. This Op outputs a randomly distorted
-// localization of an object, i.e. bounding box, given an `image_size`,
-// `bounding_boxes` and a series of constraints.
-//
-// The output of this Op is a single bounding box that may be used to crop the
-// original image. The output is returned as 3 tensors: `begin`, `size` and
-// `bboxes`. The first 2 tensors can be fed directly into `tf.slice` to crop the
-// image. The latter may be supplied to `tf.image.draw_bounding_boxes` to visualize
-// what the bounding box looks like.
-//
-// Bounding boxes are supplied and returned as `[y_min, x_min, y_max, x_max]`. The
-// bounding box coordinates are floats in `[0.0, 1.0]` relative to the width and
-// height of the underlying image.
-//
-// For example,
-//
-// ```python
-//     # Generate a single distorted bounding box.
-//     begin, size, bbox_for_draw = tf.image.sample_distorted_bounding_box(
-//         tf.shape(image),
-//         bounding_boxes=bounding_boxes)
-//
-//     # Draw the bounding box in an image summary.
-//     image_with_box = tf.image.draw_bounding_boxes(tf.expand_dims(image, 0),
-//                                                   bbox_for_draw)
-//     tf.image_summary('images_with_box', image_with_box)
+// Inverse 2D real-valued fast Fourier transform.
 //
-//     # Employ the bounding box to distort the image.
-//     distorted_image = tf.slice(image, begin, size)
-// ```
+// Computes the inverse 2-dimensional discrete Fourier transform of a real-valued
+// signal over the inner-most 2 dimensions of `input`.
 //
-// Note that if no bounding box information is available, setting
-// `use_image_if_no_bounding_boxes = true` will assume there is a single implicit
-// bounding box covering the whole image. If `use_image_if_no_bounding_boxes` is
-// false and no bounding boxes are supplied, an error is raised.
+// The inner-most 2 dimensions of `input` are assumed to be the result of `RFFT2D`:
+// The inner-most dimension contains the `fft_length / 2 + 1` unique components of
+// the DFT of a real-valued signal. If `fft_length` is not provided, it is computed
+// from the size of the inner-most 2 dimensions of `input`. If the FFT length used
+// to compute `input` is odd, it should be provided since it cannot be inferred
+// properly.
 //
 // Arguments:
-//	image_size: 1-D, containing `[height, width, channels]`.
-//	bounding_boxes: 3-D with shape `[batch, N, 4]` describing the N bounding boxes
-// associated with the image.
+//	input: A complex64 tensor.
+//	fft_length: An int32 tensor of shape [2]. The FFT length for each dimension.
 //
-// Returns 1-D, containing `[offset_height, offset_width, 0]`. Provide as input to
-// `tf.slice`.1-D, containing `[target_height, target_width, -1]`. Provide as input to
-// `tf.slice`.3-D with shape `[1, 1, 4]` containing the distorted bounding box.
-// Provide as input to `tf.image.draw_bounding_boxes`.
-func SampleDistortedBoundingBox(scope *Scope, image_size tf.Output, bounding_boxes tf.Output, optional ...SampleDistortedBoundingBoxAttr) (begin tf.Output, size tf.Output, bboxes tf.Output) {
+// Returns A float32 tensor of the same rank as `input`. The inner-most 2
+//   dimensions of `input` are replaced with the `fft_length` samples of their
+//   inverse 2D Fourier transform.
+//
+// @compatibility(numpy)
+// Equivalent to np.fft.irfft2
+// @end_compatibility
+func IRFFT2D(scope *Scope, input tf.Output, fft_length tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "SampleDistortedBoundingBox",
+		Type: "IRFFT2D",
 		Input: []tf.Input{
-			image_size, bounding_boxes,
+			input, fft_length,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
+	return op.Output(0)
 }
 
-// Returns the truth value of (x > y) element-wise.
+// Returns element-wise remainder of division. This emulates C semantics where
 //
-// *NOTE*: `Greater` supports broadcasting. More about broadcasting
+// true, this follows C semantics in that the result here is consistent
+// with a flooring divide. E.g. `floor(x / y) * y + mod(x, y) = x`.
+//
+// *NOTE*: `Mod` supports broadcasting. More about broadcasting
 // [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-func Greater(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+func TruncateMod(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Greater",
+		Type: "TruncateMod",
 		Input: []tf.Input{
 			x, y,
 		},
@@ -12527,47 +12947,34 @@ func Greater(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
 	return op.Output(0)
 }
 
-// ResourceSparseApplyRMSPropAttr is an optional argument to ResourceSparseApplyRMSProp.
-type ResourceSparseApplyRMSPropAttr func(optionalAttr)
+// ResourceApplyAdagradAttr is an optional argument to ResourceApplyAdagrad.
+type ResourceApplyAdagradAttr func(optionalAttr)
 
-// ResourceSparseApplyRMSPropUseLocking sets the optional use_locking attribute to value.
+// ResourceApplyAdagradUseLocking sets the optional use_locking attribute to value.
 //
-// value: If `True`, updating of the var, ms, and mom tensors is protected
+// value: If `True`, updating of the var and accum tensors will be protected
 // by a lock; otherwise the behavior is undefined, but may exhibit less
 // contention.
 // If not specified, defaults to false
-func ResourceSparseApplyRMSPropUseLocking(value bool) ResourceSparseApplyRMSPropAttr {
+func ResourceApplyAdagradUseLocking(value bool) ResourceApplyAdagradAttr {
 	return func(m optionalAttr) {
 		m["use_locking"] = value
 	}
 }
 
-// Update '*var' according to the RMSProp algorithm.
-//
-// Note that in dense implementation of this algorithm, ms and mom will
-// update even if the grad is zero, but in this sparse implementation, ms
-// and mom will not update in iterations during which the grad is zero.
-//
-// mean_square = decay * mean_square + (1-decay) * gradient ** 2
-// Delta = learning_rate * gradient / sqrt(mean_square + epsilon)
+// Update '*var' according to the adagrad scheme.
 //
-// ms <- rho * ms_{t-1} + (1-rho) * grad * grad
-// mom <- momentum * mom_{t-1} + lr * grad / sqrt(ms + epsilon)
-// var <- var - mom
+// accum += grad * grad
+// var -= lr * grad * (1 / sqrt(accum))
 //
 // Arguments:
 //	var_: Should be from a Variable().
-//	ms: Should be from a Variable().
-//	mom: Should be from a Variable().
+//	accum: Should be from a Variable().
 //	lr: Scaling factor. Must be a scalar.
-//	rho: Decay rate. Must be a scalar.
-//
-//	epsilon: Ridge term. Must be a scalar.
 //	grad: The gradient.
-//	indices: A vector of indices into the first dimension of var, ms and mom.
 //
 // Returns the created operation.
-func ResourceSparseApplyRMSProp(scope *Scope, var_ tf.Output, ms tf.Output, mom tf.Output, lr tf.Output, rho tf.Output, momentum tf.Output, epsilon tf.Output, grad tf.Output, indices tf.Output, optional ...ResourceSparseApplyRMSPropAttr) (o *tf.Operation) {
+func ResourceApplyAdagrad(scope *Scope, var_ tf.Output, accum tf.Output, lr tf.Output, grad tf.Output, optional ...ResourceApplyAdagradAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
@@ -12576,153 +12983,110 @@ func ResourceSparseApplyRMSProp(scope *Scope, var_ tf.Output, ms tf.Output, mom
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "ResourceSparseApplyRMSProp",
+		Type: "ResourceApplyAdagrad",
 		Input: []tf.Input{
-			var_, ms, mom, lr, rho, momentum, epsilon, grad, indices,
+			var_, accum, lr, grad,
 		},
 		Attrs: attrs,
 	}
 	return scope.AddOperation(opspec)
 }
 
-// QuantizeV2Attr is an optional argument to QuantizeV2.
-type QuantizeV2Attr func(optionalAttr)
+// SparseReduceSumAttr is an optional argument to SparseReduceSum.
+type SparseReduceSumAttr func(optionalAttr)
 
-// QuantizeV2Mode sets the optional mode attribute to value.
-// If not specified, defaults to "MIN_COMBINED"
-func QuantizeV2Mode(value string) QuantizeV2Attr {
+// SparseReduceSumKeepDims sets the optional keep_dims attribute to value.
+//
+// value: If true, retain reduced dimensions with length 1.
+// If not specified, defaults to false
+func SparseReduceSumKeepDims(value bool) SparseReduceSumAttr {
 	return func(m optionalAttr) {
-		m["mode"] = value
+		m["keep_dims"] = value
 	}
 }
 
-// Quantize the 'input' tensor of type float to 'output' tensor of type 'T'.
-//
-// [min_range, max_range] are scalar floats that specify the range for
-// the 'input' data. The 'mode' attribute controls exactly which calculations are
-// used to convert the float values to their quantized equivalents.
-//
-// In 'MIN_COMBINED' mode, each value of the tensor will undergo the following:
-//
-// ```
-// out[i] = (in[i] - min_range) * range(T) / (max_range - min_range)
-// if T == qint8, out[i] -= (range(T) + 1) / 2.0
-// ```
-// here `range(T) = numeric_limits<T>::max() - numeric_limits<T>::min()`
-//
-// *MIN_COMBINED Mode Example*
-//
-// Assume the input is type float and has a possible range of [0.0, 6.0] and the
-// output type is quint8 ([0, 255]). The min_range and max_range values should be
-// specified as 0.0 and 6.0. Quantizing from float to quint8 will multiply each
-// value of the input by 255/6 and cast to quint8.
-//
-// If the output type was qint8 ([-128, 127]), the operation will additionally
-// subtract each value by 128 prior to casting, so that the range of values aligns
-// with the range of qint8.
-//
-// If the mode is 'MIN_FIRST', then this approach is used:
+// Computes the sum of elements across dimensions of a SparseTensor.
 //
-// ```
-// number_of_steps = 1 << (# of bits in T)
-// range_adjust = number_of_steps / (number_of_steps - 1)
-// range = (range_max - range_min) * range_adjust
-// range_scale = number_of_steps / range
-// quantized = round(input * range_scale) - round(range_min * range_scale) +
-//   numeric_limits<T>::min()
-// quantized = max(quantized, numeric_limits<T>::min())
-// quantized = min(quantized, numeric_limits<T>::max())
-// ```
+// This Op takes a SparseTensor and is the sparse counterpart to
+// `tf.reduce_sum()`.  In particular, this Op also returns a dense `Tensor`
+// instead of a sparse one.
 //
-// The biggest difference between this and MIN_COMBINED is that the minimum range
-// is rounded first, before it's subtracted from the rounded value. With
-// MIN_COMBINED, a small bias is introduced where repeated iterations of quantizing
-// and dequantizing will introduce a larger and larger error.
+// Reduces `sp_input` along the dimensions given in `reduction_axes`.  Unless
+// `keep_dims` is true, the rank of the tensor is reduced by 1 for each entry in
+// `reduction_axes`. If `keep_dims` is true, the reduced dimensions are retained
+// with length 1.
 //
-// One thing to watch out for is that the operator may choose to adjust the
-// requested minimum and maximum values slightly during the quantization process,
-// so you should always use the output ports as the range for further calculations.
-// For example, if the requested minimum and maximum values are close to equal,
-// they will be separated by a small epsilon value to prevent ill-formed quantized
-// buffers from being created. Otherwise, you can end up with buffers where all the
-// quantized values map to the same float value, which causes problems for
-// operations that have to perform further calculations on them.
+// If `reduction_axes` has no entries, all dimensions are reduced, and a tensor
+// with a single element is returned.  Additionally, the axes can be negative,
+// which are interpreted according to the indexing rules in Python.
 //
 // Arguments:
+//	input_indices: 2-D.  `N x R` matrix with the indices of non-empty values in a
+// SparseTensor, possibly not in canonical ordering.
+//	input_values: 1-D.  `N` non-empty values corresponding to `input_indices`.
+//	input_shape: 1-D.  Shape of the input SparseTensor.
+//	reduction_axes: 1-D.  Length-`K` vector containing the reduction axes.
 //
-//	min_range: The minimum scalar value possibly produced for the input.
-//	max_range: The maximum scalar value possibly produced for the input.
-//
-//
-// Returns The quantized data produced from the float input.The actual minimum scalar value used for the output.The actual maximum scalar value used for the output.
-func QuantizeV2(scope *Scope, input tf.Output, min_range tf.Output, max_range tf.Output, T tf.DataType, optional ...QuantizeV2Attr) (output tf.Output, output_min tf.Output, output_max tf.Output) {
+// Returns `R-K`-D.  The reduced Tensor.
+func SparseReduceSum(scope *Scope, input_indices tf.Output, input_values tf.Output, input_shape tf.Output, reduction_axes tf.Output, optional ...SparseReduceSumAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"T": T}
+	attrs := map[string]interface{}{}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "QuantizeV2",
+		Type: "SparseReduceSum",
 		Input: []tf.Input{
-			input, min_range, max_range,
+			input_indices, input_values, input_shape, reduction_axes,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
+	return op.Output(0)
 }
 
-// DepthwiseConv2dNativeBackpropFilterAttr is an optional argument to DepthwiseConv2dNativeBackpropFilter.
-type DepthwiseConv2dNativeBackpropFilterAttr func(optionalAttr)
+// MaxPool3DGradAttr is an optional argument to MaxPool3DGrad.
+type MaxPool3DGradAttr func(optionalAttr)
 
-// DepthwiseConv2dNativeBackpropFilterDataFormat sets the optional data_format attribute to value.
+// MaxPool3DGradDataFormat sets the optional data_format attribute to value.
 //
-// value: Specify the data format of the input and output data. With the
-// default format "NHWC", the data is stored in the order of:
-//     [batch, height, width, channels].
-// Alternatively, the format could be "NCHW", the data storage order of:
-//     [batch, channels, height, width].
-// If not specified, defaults to "NHWC"
-func DepthwiseConv2dNativeBackpropFilterDataFormat(value string) DepthwiseConv2dNativeBackpropFilterAttr {
+// value: The data format of the input and output data. With the
+// default format "NDHWC", the data is stored in the order of:
+//     [batch, in_depth, in_height, in_width, in_channels].
+// Alternatively, the format could be "NCDHW", the data storage order is:
+//     [batch, in_channels, in_depth, in_height, in_width].
+// If not specified, defaults to "NDHWC"
+func MaxPool3DGradDataFormat(value string) MaxPool3DGradAttr {
 	return func(m optionalAttr) {
 		m["data_format"] = value
 	}
 }
 
-// Computes the gradients of depthwise convolution with respect to the filter.
+// Computes gradients of max pooling function.
 //
 // Arguments:
-//	input: 4-D with shape based on `data_format`.  For example, if
-// `data_format` is 'NHWC' then `input` is a 4-D `[batch, in_height,
-// in_width, in_channels]` tensor.
-//	filter_sizes: An integer vector representing the tensor shape of `filter`,
-// where `filter` is a 4-D
-// `[filter_height, filter_width, in_channels, depthwise_multiplier]` tensor.
-//	out_backprop: 4-D with shape  based on `data_format`.
-// For example, if `data_format` is 'NHWC' then
-// out_backprop shape is `[batch, out_height, out_width, out_channels]`.
-// Gradients w.r.t. the output of the convolution.
-//	strides: The stride of the sliding window for each dimension of the input
-// of the convolution.
+//	orig_input: The original input tensor.
+//	orig_output: The original output tensor.
+//	grad: Output backprop of shape `[batch, depth, rows, cols, channels]`.
+//	ksize: 1-D tensor of length 5. The size of the window for each dimension of
+// the input tensor. Must have `ksize[0] = ksize[4] = 1`.
+//	strides: 1-D tensor of length 5. The stride of the sliding window for each
+// dimension of `input`. Must have `strides[0] = strides[4] = 1`.
 //	padding: The type of padding algorithm to use.
-//
-// Returns 4-D with shape
-// `[filter_height, filter_width, in_channels, out_channels]`.  Gradient w.r.t.
-// the `filter` input of the convolution.
-func DepthwiseConv2dNativeBackpropFilter(scope *Scope, input tf.Output, filter_sizes tf.Output, out_backprop tf.Output, strides []int64, padding string, optional ...DepthwiseConv2dNativeBackpropFilterAttr) (output tf.Output) {
+func MaxPool3DGrad(scope *Scope, orig_input tf.Output, orig_output tf.Output, grad tf.Output, ksize []int64, strides []int64, padding string, optional ...MaxPool3DGradAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"strides": strides, "padding": padding}
+	attrs := map[string]interface{}{"ksize": ksize, "strides": strides, "padding": padding}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "DepthwiseConv2dNativeBackpropFilter",
+		Type: "MaxPool3DGrad",
 		Input: []tf.Input{
-			input, filter_sizes, out_backprop,
+			orig_input, orig_output, grad,
 		},
 		Attrs: attrs,
 	}
@@ -12730,62 +13094,112 @@ func DepthwiseConv2dNativeBackpropFilter(scope *Scope, input tf.Output, filter_s
 	return op.Output(0)
 }
 
-// Returns which elements of x are Inf.
+// ThreadUnsafeUnigramCandidateSamplerAttr is an optional argument to ThreadUnsafeUnigramCandidateSampler.
+type ThreadUnsafeUnigramCandidateSamplerAttr func(optionalAttr)
+
+// ThreadUnsafeUnigramCandidateSamplerSeed sets the optional seed attribute to value.
 //
-// @compatibility(numpy)
-// Equivalent to np.isinf
-// @end_compatibility
-func IsInf(scope *Scope, x tf.Output) (y tf.Output) {
+// value: If either seed or seed2 are set to be non-zero, the random number
+// generator is seeded by the given seed.  Otherwise, it is seeded by a
+// random seed.
+// If not specified, defaults to 0
+func ThreadUnsafeUnigramCandidateSamplerSeed(value int64) ThreadUnsafeUnigramCandidateSamplerAttr {
+	return func(m optionalAttr) {
+		m["seed"] = value
+	}
+}
+
+// ThreadUnsafeUnigramCandidateSamplerSeed2 sets the optional seed2 attribute to value.
+//
+// value: An second seed to avoid seed collision.
+// If not specified, defaults to 0
+func ThreadUnsafeUnigramCandidateSamplerSeed2(value int64) ThreadUnsafeUnigramCandidateSamplerAttr {
+	return func(m optionalAttr) {
+		m["seed2"] = value
+	}
+}
+
+// Generates labels for candidate sampling with a learned unigram distribution.
+//
+// See explanations of candidate sampling and the data formats at
+// go/candidate-sampling.
+//
+// For each batch, this op picks a single set of sampled candidate labels.
+//
+// The advantages of sampling candidates per-batch are simplicity and the
+// possibility of efficient dense matrix multiplication. The disadvantage is that
+// the sampled candidates must be chosen independently of the context and of the
+// true labels.
+//
+// Arguments:
+//	true_classes: A batch_size * num_true matrix, in which each row contains the
+// IDs of the num_true target_classes in the corresponding original label.
+//	num_true: Number of true labels per context.
+//	num_sampled: Number of candidates to randomly sample.
+//	unique: If unique is true, we sample with rejection, so that all sampled
+// candidates in a batch are unique. This requires some approximation to
+// estimate the post-rejection sampling probabilities.
+//	range_max: The sampler will sample integers from the interval [0, range_max).
+//
+// Returns A vector of length num_sampled, in which each element is
+// the ID of a sampled candidate.A batch_size * num_true matrix, representing
+// the number of times each candidate is expected to occur in a batch
+// of sampled candidates. If unique=true, then this is a probability.A vector of length num_sampled, for each sampled
+// candidate representing the number of times the candidate is expected
+// to occur in a batch of sampled candidates.  If unique=true, then this is a
+// probability.
+func ThreadUnsafeUnigramCandidateSampler(scope *Scope, true_classes tf.Output, num_true int64, num_sampled int64, unique bool, range_max int64, optional ...ThreadUnsafeUnigramCandidateSamplerAttr) (sampled_candidates tf.Output, true_expected_count tf.Output, sampled_expected_count tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"num_true": num_true, "num_sampled": num_sampled, "unique": unique, "range_max": range_max}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "IsInf",
+		Type: "ThreadUnsafeUnigramCandidateSampler",
 		Input: []tf.Input{
-			x,
+			true_classes,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// ResourceSparseApplyFtrlAttr is an optional argument to ResourceSparseApplyFtrl.
-type ResourceSparseApplyFtrlAttr func(optionalAttr)
+// ResourceSparseApplyProximalAdagradAttr is an optional argument to ResourceSparseApplyProximalAdagrad.
+type ResourceSparseApplyProximalAdagradAttr func(optionalAttr)
 
-// ResourceSparseApplyFtrlUseLocking sets the optional use_locking attribute to value.
+// ResourceSparseApplyProximalAdagradUseLocking sets the optional use_locking attribute to value.
 //
-// value: If `True`, updating of the var and accum tensors will be protected
-// by a lock; otherwise the behavior is undefined, but may exhibit less
-// contention.
+// value: If True, updating of the var and accum tensors will be protected by
+// a lock; otherwise the behavior is undefined, but may exhibit less contention.
 // If not specified, defaults to false
-func ResourceSparseApplyFtrlUseLocking(value bool) ResourceSparseApplyFtrlAttr {
+func ResourceSparseApplyProximalAdagradUseLocking(value bool) ResourceSparseApplyProximalAdagradAttr {
 	return func(m optionalAttr) {
 		m["use_locking"] = value
 	}
 }
 
-// Update relevant entries in '*var' according to the Ftrl-proximal scheme.
+// Sparse update entries in '*var' and '*accum' according to FOBOS algorithm.
 //
-// That is for rows we have grad for, we update var, accum and linear as follows:
-// accum_new = accum + grad * grad
-// linear += grad + (accum_new^(-lr_power) - accum^(-lr_power)) / lr * var
-// quadratic = 1.0 / (accum_new^(lr_power) * lr) + 2 * l2
-// var = (sign(linear) * l1 - linear) / quadratic if |linear| > l1 else 0.0
-// accum = accum_new
+// That is for rows we have grad for, we update var and accum as follows:
+// accum += grad * grad
+// prox_v = var
+// prox_v -= lr * grad * (1 / sqrt(accum))
+// var = sign(prox_v)/(1+lr*l2) * max{|prox_v|-lr*l1,0}
 //
 // Arguments:
 //	var_: Should be from a Variable().
 //	accum: Should be from a Variable().
-//	linear: Should be from a Variable().
-//	grad: The gradient.
-//	indices: A vector of indices into the first dimension of var and accum.
-//	lr: Scaling factor. Must be a scalar.
+//	lr: Learning rate. Must be a scalar.
 //	l1: L1 regularization. Must be a scalar.
 //	l2: L2 regularization. Must be a scalar.
-//	lr_power: Scaling factor. Must be a scalar.
+//	grad: The gradient.
+//	indices: A vector of indices into the first dimension of var and accum.
 //
 // Returns the created operation.
-func ResourceSparseApplyFtrl(scope *Scope, var_ tf.Output, accum tf.Output, linear tf.Output, grad tf.Output, indices tf.Output, lr tf.Output, l1 tf.Output, l2 tf.Output, lr_power tf.Output, optional ...ResourceSparseApplyFtrlAttr) (o *tf.Operation) {
+func ResourceSparseApplyProximalAdagrad(scope *Scope, var_ tf.Output, accum tf.Output, lr tf.Output, l1 tf.Output, l2 tf.Output, grad tf.Output, indices tf.Output, optional ...ResourceSparseApplyProximalAdagradAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
@@ -12794,102 +13208,115 @@ func ResourceSparseApplyFtrl(scope *Scope, var_ tf.Output, accum tf.Output, line
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "ResourceSparseApplyFtrl",
+		Type: "ResourceSparseApplyProximalAdagrad",
 		Input: []tf.Input{
-			var_, accum, linear, grad, indices, lr, l1, l2, lr_power,
+			var_, accum, lr, l1, l2, grad, indices,
 		},
 		Attrs: attrs,
 	}
 	return scope.AddOperation(opspec)
 }
 
-// Component-wise multiplies a SparseTensor by a dense Tensor.
+// Store the input tensor in the state of the current session.
 //
-// The output locations corresponding to the implicitly zero elements in the sparse
-// tensor will be zero (i.e., will not take up storage space), regardless of the
-// contents of the dense tensor (even if it's +/-INF and that INF*0 == NaN).
+// Arguments:
+//	value: The tensor to be stored.
 //
-// *Limitation*: this Op only broadcasts the dense side to the sparse side, but not
-// the other direction.
+// Returns The handle for the tensor stored in the session state, represented
+// as a string.
+func GetSessionHandle(scope *Scope, value tf.Output) (handle tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "GetSessionHandle",
+		Input: []tf.Input{
+			value,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Decode web-safe base64-encoded strings.
+//
+// Input may or may not have padding at the end. See EncodeBase64 for padding.
+// Web-safe means that input must use - and _ instead of + and /.
 //
 // Arguments:
-//	sp_indices: 2-D.  `N x R` matrix with the indices of non-empty values in a
-// SparseTensor, possibly not in canonical ordering.
-//	sp_values: 1-D.  `N` non-empty values corresponding to `sp_indices`.
-//	sp_shape: 1-D.  Shape of the input SparseTensor.
-//	dense: `R`-D.  The dense Tensor operand.
+//	input: Base64 strings to decode.
 //
-// Returns 1-D.  The `N` values that are operated on.
-func SparseDenseCwiseMul(scope *Scope, sp_indices tf.Output, sp_values tf.Output, sp_shape tf.Output, dense tf.Output) (output tf.Output) {
+// Returns Decoded strings.
+func DecodeBase64(scope *Scope, input tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "SparseDenseCwiseMul",
+		Type: "DecodeBase64",
 		Input: []tf.Input{
-			sp_indices, sp_values, sp_shape, dense,
+			input,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// NonMaxSuppressionAttr is an optional argument to NonMaxSuppression.
-type NonMaxSuppressionAttr func(optionalAttr)
-
-// NonMaxSuppressionIouThreshold sets the optional iou_threshold attribute to value.
-//
-// value: A float representing the threshold for deciding whether boxes
-// overlap too much with respect to IOU.
-// If not specified, defaults to 0.5
-func NonMaxSuppressionIouThreshold(value float32) NonMaxSuppressionAttr {
-	return func(m optionalAttr) {
-		m["iou_threshold"] = value
+// Computes hyperbolic tangent of `x` element-wise.
+func Tanh(scope *Scope, x tf.Output) (y tf.Output) {
+	if scope.Err() != nil {
+		return
 	}
+	opspec := tf.OpSpec{
+		Type: "Tanh",
+		Input: []tf.Input{
+			x,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// Greedily selects a subset of bounding boxes in descending order of score,
-//
-// pruning away boxes that have high intersection-over-union (IOU) overlap
-// with previously selected boxes.  Bounding boxes are supplied as
-// [y1, x1, y2, x2], where (y1, x1) and (y2, x2) are the coordinates of any
-// diagonal pair of box corners and the coordinates can be provided as normalized
-// (i.e., lying in the interval [0, 1]) or absolute.  Note that this algorithm
-// is agnostic to where the origin is in the coordinate system.  Note that this
-// algorithm is invariant to orthogonal transformations and translations
-// of the coordinate system; thus translating or reflections of the coordinate
-// system result in the same boxes being selected by the algorithm.
-//
-// The output of this operation is a set of integers indexing into the input
-// collection of bounding boxes representing the selected boxes.  The bounding
-// box coordinates corresponding to the selected indices can then be obtained
-// using the `tf.gather operation`.  For example:
+// AvgPool3DGradAttr is an optional argument to AvgPool3DGrad.
+type AvgPool3DGradAttr func(optionalAttr)
+
+// AvgPool3DGradDataFormat sets the optional data_format attribute to value.
 //
-//   selected_indices = tf.image.non_max_suppression(
-//       boxes, scores, max_output_size, iou_threshold)
-//   selected_boxes = tf.gather(boxes, selected_indices)
+// value: The data format of the input and output data. With the
+// default format "NDHWC", the data is stored in the order of:
+//     [batch, in_depth, in_height, in_width, in_channels].
+// Alternatively, the format could be "NCDHW", the data storage order is:
+//     [batch, in_channels, in_depth, in_height, in_width].
+// If not specified, defaults to "NDHWC"
+func AvgPool3DGradDataFormat(value string) AvgPool3DGradAttr {
+	return func(m optionalAttr) {
+		m["data_format"] = value
+	}
+}
+
+// Computes gradients of average pooling function.
 //
 // Arguments:
-//	boxes: A 2-D float tensor of shape `[num_boxes, 4]`.
-//	scores: A 1-D float tensor of shape `[num_boxes]` representing a single
-// score corresponding to each box (each row of boxes).
-//	max_output_size: A scalar integer tensor representing the maximum number of
-// boxes to be selected by non max suppression.
+//	orig_input_shape: The original input dimensions.
+//	grad: Output backprop of shape `[batch, depth, rows, cols, channels]`.
+//	ksize: 1-D tensor of length 5. The size of the window for each dimension of
+// the input tensor. Must have `ksize[0] = ksize[4] = 1`.
+//	strides: 1-D tensor of length 5. The stride of the sliding window for each
+// dimension of `input`. Must have `strides[0] = strides[4] = 1`.
+//	padding: The type of padding algorithm to use.
 //
-// Returns A 1-D integer tensor of shape `[M]` representing the selected
-// indices from the boxes tensor, where `M <= max_output_size`.
-func NonMaxSuppression(scope *Scope, boxes tf.Output, scores tf.Output, max_output_size tf.Output, optional ...NonMaxSuppressionAttr) (selected_indices tf.Output) {
+// Returns The backprop for input.
+func AvgPool3DGrad(scope *Scope, orig_input_shape tf.Output, grad tf.Output, ksize []int64, strides []int64, padding string, optional ...AvgPool3DGradAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"ksize": ksize, "strides": strides, "padding": padding}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "NonMaxSuppression",
+		Type: "AvgPool3DGrad",
 		Input: []tf.Input{
-			boxes, scores, max_output_size,
+			orig_input_shape, grad,
 		},
 		Attrs: attrs,
 	}
@@ -12897,38 +13324,45 @@ func NonMaxSuppression(scope *Scope, boxes tf.Output, scores tf.Output, max_outp
 	return op.Output(0)
 }
 
-// ResourceApplyAdadeltaAttr is an optional argument to ResourceApplyAdadelta.
-type ResourceApplyAdadeltaAttr func(optionalAttr)
+// TextLineReaderV2Attr is an optional argument to TextLineReaderV2.
+type TextLineReaderV2Attr func(optionalAttr)
 
-// ResourceApplyAdadeltaUseLocking sets the optional use_locking attribute to value.
+// TextLineReaderV2SkipHeaderLines sets the optional skip_header_lines attribute to value.
 //
-// value: If True, updating of the var, accum and update_accum tensors will be protected by
-// a lock; otherwise the behavior is undefined, but may exhibit less contention.
-// If not specified, defaults to false
-func ResourceApplyAdadeltaUseLocking(value bool) ResourceApplyAdadeltaAttr {
+// value: Number of lines to skip from the beginning of every file.
+// If not specified, defaults to 0
+func TextLineReaderV2SkipHeaderLines(value int64) TextLineReaderV2Attr {
 	return func(m optionalAttr) {
-		m["use_locking"] = value
+		m["skip_header_lines"] = value
 	}
 }
 
-// Update '*var' according to the adadelta scheme.
+// TextLineReaderV2Container sets the optional container attribute to value.
 //
-// accum = rho() * accum + (1 - rho()) * grad.square();
-// update = (update_accum + epsilon).sqrt() * (accum + epsilon()).rsqrt() * grad;
-// update_accum = rho() * update_accum + (1 - rho()) * update.square();
-// var -= update;
+// value: If non-empty, this reader is placed in the given container.
+// Otherwise, a default container is used.
+// If not specified, defaults to ""
+func TextLineReaderV2Container(value string) TextLineReaderV2Attr {
+	return func(m optionalAttr) {
+		m["container"] = value
+	}
+}
+
+// TextLineReaderV2SharedName sets the optional shared_name attribute to value.
 //
-// Arguments:
-//	var_: Should be from a Variable().
-//	accum: Should be from a Variable().
-//	accum_update: Should be from a Variable().
-//	lr: Scaling factor. Must be a scalar.
-//	rho: Decay factor. Must be a scalar.
-//	epsilon: Constant factor. Must be a scalar.
-//	grad: The gradient.
+// value: If non-empty, this reader is named in the given bucket
+// with this shared_name. Otherwise, the node name is used instead.
+// If not specified, defaults to ""
+func TextLineReaderV2SharedName(value string) TextLineReaderV2Attr {
+	return func(m optionalAttr) {
+		m["shared_name"] = value
+	}
+}
+
+// A Reader that outputs the lines of a file delimited by '\n'.
 //
-// Returns the created operation.
-func ResourceApplyAdadelta(scope *Scope, var_ tf.Output, accum tf.Output, accum_update tf.Output, lr tf.Output, rho tf.Output, epsilon tf.Output, grad tf.Output, optional ...ResourceApplyAdadeltaAttr) (o *tf.Operation) {
+// Returns The handle to reference the Reader.
+func TextLineReaderV2(scope *Scope, optional ...TextLineReaderV2Attr) (reader_handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -12937,86 +13371,149 @@ func ResourceApplyAdadelta(scope *Scope, var_ tf.Output, accum tf.Output, accum_
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "ResourceApplyAdadelta",
-		Input: []tf.Input{
-			var_, accum, accum_update, lr, rho, epsilon, grad,
-		},
+		Type: "TextLineReaderV2",
+
 		Attrs: attrs,
 	}
-	return scope.AddOperation(opspec)
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// Shuffle dimensions of x according to a permutation.
+// SampleDistortedBoundingBoxAttr is an optional argument to SampleDistortedBoundingBox.
+type SampleDistortedBoundingBoxAttr func(optionalAttr)
+
+// SampleDistortedBoundingBoxSeed sets the optional seed attribute to value.
 //
-// The output `y` has the same rank as `x`. The shapes of `x` and `y` satisfy:
-//   `y.shape[i] == x.shape[perm[i]] for i in [0, 1, ..., rank(x) - 1]`
-func Transpose(scope *Scope, x tf.Output, perm tf.Output) (y tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "Transpose",
-		Input: []tf.Input{
-			x, perm,
-		},
+// value: If either `seed` or `seed2` are set to non-zero, the random number
+// generator is seeded by the given `seed`.  Otherwise, it is seeded by a random
+// seed.
+// If not specified, defaults to 0
+func SampleDistortedBoundingBoxSeed(value int64) SampleDistortedBoundingBoxAttr {
+	return func(m optionalAttr) {
+		m["seed"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// Reads and outputs the entire contents of the input filename.
-func ReadFile(scope *Scope, filename tf.Output) (contents tf.Output) {
-	if scope.Err() != nil {
-		return
+// SampleDistortedBoundingBoxSeed2 sets the optional seed2 attribute to value.
+//
+// value: A second seed to avoid seed collision.
+// If not specified, defaults to 0
+func SampleDistortedBoundingBoxSeed2(value int64) SampleDistortedBoundingBoxAttr {
+	return func(m optionalAttr) {
+		m["seed2"] = value
 	}
-	opspec := tf.OpSpec{
-		Type: "ReadFile",
-		Input: []tf.Input{
-			filename,
-		},
+}
+
+// SampleDistortedBoundingBoxMinObjectCovered sets the optional min_object_covered attribute to value.
+//
+// value: The cropped area of the image must contain at least this
+// fraction of any bounding box supplied. The value of this parameter should be
+// non-negative. In the case of 0, the cropped area does not need to overlap
+// any of the bounding boxes supplied.
+// If not specified, defaults to 0.1
+func SampleDistortedBoundingBoxMinObjectCovered(value float32) SampleDistortedBoundingBoxAttr {
+	return func(m optionalAttr) {
+		m["min_object_covered"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// Output a fact about factorials.
-func Fact(scope *Scope) (fact tf.Output) {
-	if scope.Err() != nil {
-		return
+// SampleDistortedBoundingBoxAspectRatioRange sets the optional aspect_ratio_range attribute to value.
+//
+// value: The cropped area of the image must have an aspect ratio =
+// width / height within this range.
+// If not specified, defaults to <f:0.75 f:1.33 >
+func SampleDistortedBoundingBoxAspectRatioRange(value []float32) SampleDistortedBoundingBoxAttr {
+	return func(m optionalAttr) {
+		m["aspect_ratio_range"] = value
 	}
-	opspec := tf.OpSpec{
-		Type: "Fact",
+}
+
+// SampleDistortedBoundingBoxAreaRange sets the optional area_range attribute to value.
+//
+// value: The cropped area of the image must contain a fraction of the
+// supplied image within in this range.
+// If not specified, defaults to <f:0.05 f:1 >
+func SampleDistortedBoundingBoxAreaRange(value []float32) SampleDistortedBoundingBoxAttr {
+	return func(m optionalAttr) {
+		m["area_range"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// StatelessRandomUniformAttr is an optional argument to StatelessRandomUniform.
-type StatelessRandomUniformAttr func(optionalAttr)
+// SampleDistortedBoundingBoxMaxAttempts sets the optional max_attempts attribute to value.
+//
+// value: Number of attempts at generating a cropped region of the image
+// of the specified constraints. After `max_attempts` failures, return the entire
+// image.
+// If not specified, defaults to 100
+func SampleDistortedBoundingBoxMaxAttempts(value int64) SampleDistortedBoundingBoxAttr {
+	return func(m optionalAttr) {
+		m["max_attempts"] = value
+	}
+}
 
-// StatelessRandomUniformDtype sets the optional dtype attribute to value.
+// SampleDistortedBoundingBoxUseImageIfNoBoundingBoxes sets the optional use_image_if_no_bounding_boxes attribute to value.
 //
-// value: The type of the output.
-// If not specified, defaults to DT_FLOAT
-func StatelessRandomUniformDtype(value tf.DataType) StatelessRandomUniformAttr {
+// value: Controls behavior if no bounding boxes supplied.
+// If true, assume an implicit bounding box covering the whole input. If false,
+// raise an error.
+// If not specified, defaults to false
+func SampleDistortedBoundingBoxUseImageIfNoBoundingBoxes(value bool) SampleDistortedBoundingBoxAttr {
 	return func(m optionalAttr) {
-		m["dtype"] = value
+		m["use_image_if_no_bounding_boxes"] = value
 	}
 }
 
-// Outputs deterministic pseudorandom random values from a uniform distribution.
+// Generate a single randomly distorted bounding box for an image.
 //
-// The generated values follow a uniform distribution in the range `[0, 1)`. The
-// lower bound 0 is included in the range, while the upper bound 1 is excluded.
+// Bounding box annotations are often supplied in addition to ground-truth labels
+// in image recognition or object localization tasks. A common technique for
+// training such a system is to randomly distort an image while preserving
+// its content, i.e. *data augmentation*. This Op outputs a randomly distorted
+// localization of an object, i.e. bounding box, given an `image_size`,
+// `bounding_boxes` and a series of constraints.
 //
-// The outputs are a deterministic function of `shape` and `seed`.
+// The output of this Op is a single bounding box that may be used to crop the
+// original image. The output is returned as 3 tensors: `begin`, `size` and
+// `bboxes`. The first 2 tensors can be fed directly into `tf.slice` to crop the
+// image. The latter may be supplied to `tf.image.draw_bounding_boxes` to visualize
+// what the bounding box looks like.
 //
-// Arguments:
-//	shape: The shape of the output tensor.
-//	seed: 2 seeds (shape [2]).
+// Bounding boxes are supplied and returned as `[y_min, x_min, y_max, x_max]`. The
+// bounding box coordinates are floats in `[0.0, 1.0]` relative to the width and
+// height of the underlying image.
+//
+// For example,
+//
+// ```python
+//     # Generate a single distorted bounding box.
+//     begin, size, bbox_for_draw = tf.image.sample_distorted_bounding_box(
+//         tf.shape(image),
+//         bounding_boxes=bounding_boxes)
+//
+//     # Draw the bounding box in an image summary.
+//     image_with_box = tf.image.draw_bounding_boxes(tf.expand_dims(image, 0),
+//                                                   bbox_for_draw)
+//     tf.image_summary('images_with_box', image_with_box)
+//
+//     # Employ the bounding box to distort the image.
+//     distorted_image = tf.slice(image, begin, size)
+// ```
+//
+// Note that if no bounding box information is available, setting
+// `use_image_if_no_bounding_boxes = true` will assume there is a single implicit
+// bounding box covering the whole image. If `use_image_if_no_bounding_boxes` is
+// false and no bounding boxes are supplied, an error is raised.
+//
+// Arguments:
+//	image_size: 1-D, containing `[height, width, channels]`.
+//	bounding_boxes: 3-D with shape `[batch, N, 4]` describing the N bounding boxes
+// associated with the image.
 //
-// Returns Random values with specified shape.
-func StatelessRandomUniform(scope *Scope, shape tf.Output, seed tf.Output, optional ...StatelessRandomUniformAttr) (output tf.Output) {
+// Returns 1-D, containing `[offset_height, offset_width, 0]`. Provide as input to
+// `tf.slice`.1-D, containing `[target_height, target_width, -1]`. Provide as input to
+// `tf.slice`.3-D with shape `[1, 1, 4]` containing the distorted bounding box.
+// Provide as input to `tf.image.draw_bounding_boxes`.
+func SampleDistortedBoundingBox(scope *Scope, image_size tf.Output, bounding_boxes tf.Output, optional ...SampleDistortedBoundingBoxAttr) (begin tf.Output, size tf.Output, bboxes tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -13025,104 +13522,75 @@ func StatelessRandomUniform(scope *Scope, shape tf.Output, seed tf.Output, optio
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "StatelessRandomUniform",
+		Type: "SampleDistortedBoundingBox",
 		Input: []tf.Input{
-			shape, seed,
+			image_size, bounding_boxes,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// PrintAttr is an optional argument to Print.
-type PrintAttr func(optionalAttr)
-
-// PrintMessage sets the optional message attribute to value.
-//
-// value: A string, prefix of the error message.
-// If not specified, defaults to ""
-func PrintMessage(value string) PrintAttr {
-	return func(m optionalAttr) {
-		m["message"] = value
-	}
-}
-
-// PrintFirstN sets the optional first_n attribute to value.
-//
-// value: Only log `first_n` number of times. -1 disables logging.
-// If not specified, defaults to -1
-func PrintFirstN(value int64) PrintAttr {
-	return func(m optionalAttr) {
-		m["first_n"] = value
-	}
-}
-
-// PrintSummarize sets the optional summarize attribute to value.
-//
-// value: Only print this many entries of each tensor.
-// If not specified, defaults to 3
-func PrintSummarize(value int64) PrintAttr {
-	return func(m optionalAttr) {
-		m["summarize"] = value
-	}
+	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// Prints a list of tensors.
-//
-// Passes `input` through to `output` and prints `data` when evaluating.
-//
-// Arguments:
-//	input: The tensor passed to `output`
-//	data: A list of tensors to print out when op is evaluated.
+// Returns the truth value of (x > y) element-wise.
 //
-// Returns = The unmodified `input` tensor
-func Print(scope *Scope, input tf.Output, data []tf.Output, optional ...PrintAttr) (output tf.Output) {
+// *NOTE*: `Greater` supports broadcasting. More about broadcasting
+// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+func Greater(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "Print",
+		Type: "Greater",
 		Input: []tf.Input{
-			input, tf.OutputList(data),
+			x, y,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// ResizeAreaAttr is an optional argument to ResizeArea.
-type ResizeAreaAttr func(optionalAttr)
+// ResourceSparseApplyRMSPropAttr is an optional argument to ResourceSparseApplyRMSProp.
+type ResourceSparseApplyRMSPropAttr func(optionalAttr)
 
-// ResizeAreaAlignCorners sets the optional align_corners attribute to value.
+// ResourceSparseApplyRMSPropUseLocking sets the optional use_locking attribute to value.
 //
-// value: If true, rescale input by (new_height - 1) / (height - 1), which
-// exactly aligns the 4 corners of images and resized images. If false, rescale
-// by new_height / height. Treat similarly the width dimension.
+// value: If `True`, updating of the var, ms, and mom tensors is protected
+// by a lock; otherwise the behavior is undefined, but may exhibit less
+// contention.
 // If not specified, defaults to false
-func ResizeAreaAlignCorners(value bool) ResizeAreaAttr {
+func ResourceSparseApplyRMSPropUseLocking(value bool) ResourceSparseApplyRMSPropAttr {
 	return func(m optionalAttr) {
-		m["align_corners"] = value
+		m["use_locking"] = value
 	}
 }
 
-// Resize `images` to `size` using area interpolation.
+// Update '*var' according to the RMSProp algorithm.
 //
-// Input images can be of different types but output images are always float.
+// Note that in dense implementation of this algorithm, ms and mom will
+// update even if the grad is zero, but in this sparse implementation, ms
+// and mom will not update in iterations during which the grad is zero.
+//
+// mean_square = decay * mean_square + (1-decay) * gradient ** 2
+// Delta = learning_rate * gradient / sqrt(mean_square + epsilon)
+//
+// ms <- rho * ms_{t-1} + (1-rho) * grad * grad
+// mom <- momentum * mom_{t-1} + lr * grad / sqrt(ms + epsilon)
+// var <- var - mom
 //
 // Arguments:
-//	images: 4-D with shape `[batch, height, width, channels]`.
-//	size: = A 1-D int32 Tensor of 2 elements: `new_height, new_width`.  The
-// new size for the images.
+//	var_: Should be from a Variable().
+//	ms: Should be from a Variable().
+//	mom: Should be from a Variable().
+//	lr: Scaling factor. Must be a scalar.
+//	rho: Decay rate. Must be a scalar.
 //
-// Returns 4-D with shape
-// `[batch, new_height, new_width, channels]`.
-func ResizeArea(scope *Scope, images tf.Output, size tf.Output, optional ...ResizeAreaAttr) (resized_images tf.Output) {
+//	epsilon: Ridge term. Must be a scalar.
+//	grad: The gradient.
+//	indices: A vector of indices into the first dimension of var, ms and mom.
+//
+// Returns the created operation.
+func ResourceSparseApplyRMSProp(scope *Scope, var_ tf.Output, ms tf.Output, mom tf.Output, lr tf.Output, rho tf.Output, momentum tf.Output, epsilon tf.Output, grad tf.Output, indices tf.Output, optional ...ResourceSparseApplyRMSPropAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
@@ -13131,155 +13599,153 @@ func ResizeArea(scope *Scope, images tf.Output, size tf.Output, optional ...Resi
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "ResizeArea",
+		Type: "ResourceSparseApplyRMSProp",
 		Input: []tf.Input{
-			images, size,
+			var_, ms, mom, lr, rho, momentum, epsilon, grad, indices,
 		},
 		Attrs: attrs,
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return scope.AddOperation(opspec)
 }
 
-// RealAttr is an optional argument to Real.
-type RealAttr func(optionalAttr)
+// QuantizeV2Attr is an optional argument to QuantizeV2.
+type QuantizeV2Attr func(optionalAttr)
 
-// RealTout sets the optional Tout attribute to value.
-// If not specified, defaults to DT_FLOAT
-func RealTout(value tf.DataType) RealAttr {
+// QuantizeV2Mode sets the optional mode attribute to value.
+// If not specified, defaults to "MIN_COMBINED"
+func QuantizeV2Mode(value string) QuantizeV2Attr {
 	return func(m optionalAttr) {
-		m["Tout"] = value
+		m["mode"] = value
 	}
 }
 
-// Returns the real part of a complex number.
+// Quantize the 'input' tensor of type float to 'output' tensor of type 'T'.
 //
-// Given a tensor `input` of complex numbers, this operation returns a tensor of
-// type `float` that is the real part of each element in `input`. All elements in
-// `input` must be complex numbers of the form \\(a + bj\\), where *a* is the real
-//  part returned by this operation and *b* is the imaginary part.
+// [min_range, max_range] are scalar floats that specify the range for
+// the 'input' data. The 'mode' attribute controls exactly which calculations are
+// used to convert the float values to their quantized equivalents.
 //
-// For example:
+// In 'MIN_COMBINED' mode, each value of the tensor will undergo the following:
 //
 // ```
-// # tensor 'input' is [-2.25 + 4.75j, 3.25 + 5.75j]
-// tf.real(input) ==> [-2.25, 3.25]
+// out[i] = (in[i] - min_range) * range(T) / (max_range - min_range)
+// if T == qint8, out[i] -= (range(T) + 1) / 2.0
 // ```
-func Real(scope *Scope, input tf.Output, optional ...RealAttr) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "Real",
-		Input: []tf.Input{
-			input,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Returns the next record (key, value pair) produced by a Reader.
+// here `range(T) = numeric_limits<T>::max() - numeric_limits<T>::min()`
 //
-// Will dequeue from the input queue if necessary (e.g. when the
-// Reader needs to start reading from a new file since it has finished
-// with the previous file).
+// *MIN_COMBINED Mode Example*
+//
+// Assume the input is type float and has a possible range of [0.0, 6.0] and the
+// output type is quint8 ([0, 255]). The min_range and max_range values should be
+// specified as 0.0 and 6.0. Quantizing from float to quint8 will multiply each
+// value of the input by 255/6 and cast to quint8.
+//
+// If the output type was qint8 ([-128, 127]), the operation will additionally
+// subtract each value by 128 prior to casting, so that the range of values aligns
+// with the range of qint8.
+//
+// If the mode is 'MIN_FIRST', then this approach is used:
+//
+// ```
+// number_of_steps = 1 << (# of bits in T)
+// range_adjust = number_of_steps / (number_of_steps - 1)
+// range = (range_max - range_min) * range_adjust
+// range_scale = number_of_steps / range
+// quantized = round(input * range_scale) - round(range_min * range_scale) +
+//   numeric_limits<T>::min()
+// quantized = max(quantized, numeric_limits<T>::min())
+// quantized = min(quantized, numeric_limits<T>::max())
+// ```
+//
+// The biggest difference between this and MIN_COMBINED is that the minimum range
+// is rounded first, before it's subtracted from the rounded value. With
+// MIN_COMBINED, a small bias is introduced where repeated iterations of quantizing
+// and dequantizing will introduce a larger and larger error.
+//
+// One thing to watch out for is that the operator may choose to adjust the
+// requested minimum and maximum values slightly during the quantization process,
+// so you should always use the output ports as the range for further calculations.
+// For example, if the requested minimum and maximum values are close to equal,
+// they will be separated by a small epsilon value to prevent ill-formed quantized
+// buffers from being created. Otherwise, you can end up with buffers where all the
+// quantized values map to the same float value, which causes problems for
+// operations that have to perform further calculations on them.
 //
 // Arguments:
-//	reader_handle: Handle to a Reader.
-//	queue_handle: Handle to a Queue, with string work items.
 //
-// Returns A scalar.A scalar.
-func ReaderReadV2(scope *Scope, reader_handle tf.Output, queue_handle tf.Output) (key tf.Output, value tf.Output) {
+//	min_range: The minimum scalar value possibly produced for the input.
+//	max_range: The maximum scalar value possibly produced for the input.
+//
+//
+// Returns The quantized data produced from the float input.The actual minimum scalar value used for the output.The actual maximum scalar value used for the output.
+func QuantizeV2(scope *Scope, input tf.Output, min_range tf.Output, max_range tf.Output, T tf.DataType, optional ...QuantizeV2Attr) (output tf.Output, output_min tf.Output, output_max tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"T": T}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "ReaderReadV2",
+		Type: "QuantizeV2",
 		Input: []tf.Input{
-			reader_handle, queue_handle,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1)
-}
-
-// LRNAttr is an optional argument to LRN.
-type LRNAttr func(optionalAttr)
-
-// LRNDepthRadius sets the optional depth_radius attribute to value.
-//
-// value: 0-D.  Half-width of the 1-D normalization window.
-// If not specified, defaults to 5
-func LRNDepthRadius(value int64) LRNAttr {
-	return func(m optionalAttr) {
-		m["depth_radius"] = value
-	}
-}
-
-// LRNBias sets the optional bias attribute to value.
-//
-// value: An offset (usually positive to avoid dividing by 0).
-// If not specified, defaults to 1
-func LRNBias(value float32) LRNAttr {
-	return func(m optionalAttr) {
-		m["bias"] = value
+			input, min_range, max_range,
+		},
+		Attrs: attrs,
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// LRNAlpha sets the optional alpha attribute to value.
-//
-// value: A scale factor, usually positive.
-// If not specified, defaults to 1
-func LRNAlpha(value float32) LRNAttr {
-	return func(m optionalAttr) {
-		m["alpha"] = value
-	}
-}
+// DepthwiseConv2dNativeBackpropFilterAttr is an optional argument to DepthwiseConv2dNativeBackpropFilter.
+type DepthwiseConv2dNativeBackpropFilterAttr func(optionalAttr)
 
-// LRNBeta sets the optional beta attribute to value.
+// DepthwiseConv2dNativeBackpropFilterDataFormat sets the optional data_format attribute to value.
 //
-// value: An exponent.
-// If not specified, defaults to 0.5
-func LRNBeta(value float32) LRNAttr {
+// value: Specify the data format of the input and output data. With the
+// default format "NHWC", the data is stored in the order of:
+//     [batch, height, width, channels].
+// Alternatively, the format could be "NCHW", the data storage order of:
+//     [batch, channels, height, width].
+// If not specified, defaults to "NHWC"
+func DepthwiseConv2dNativeBackpropFilterDataFormat(value string) DepthwiseConv2dNativeBackpropFilterAttr {
 	return func(m optionalAttr) {
-		m["beta"] = value
+		m["data_format"] = value
 	}
 }
 
-// Local Response Normalization.
-//
-// The 4-D `input` tensor is treated as a 3-D array of 1-D vectors (along the last
-// dimension), and each vector is normalized independently.  Within a given vector,
-// each component is divided by the weighted, squared sum of inputs within
-// `depth_radius`.  In detail,
-//
-//     sqr_sum[a, b, c, d] =
-//         sum(input[a, b, c, d - depth_radius : d + depth_radius + 1] ** 2)
-//     output = input / (bias + alpha * sqr_sum) ** beta
-//
-// For details, see [Krizhevsky et al., ImageNet classification with deep
-// convolutional neural networks (NIPS 2012)](http://papers.nips.cc/paper/4824-imagenet-classification-with-deep-convolutional-neural-networks).
+// Computes the gradients of depthwise convolution with respect to the filter.
 //
 // Arguments:
-//	input: 4-D.
-func LRN(scope *Scope, input tf.Output, optional ...LRNAttr) (output tf.Output) {
+//	input: 4-D with shape based on `data_format`.  For example, if
+// `data_format` is 'NHWC' then `input` is a 4-D `[batch, in_height,
+// in_width, in_channels]` tensor.
+//	filter_sizes: An integer vector representing the tensor shape of `filter`,
+// where `filter` is a 4-D
+// `[filter_height, filter_width, in_channels, depthwise_multiplier]` tensor.
+//	out_backprop: 4-D with shape  based on `data_format`.
+// For example, if `data_format` is 'NHWC' then
+// out_backprop shape is `[batch, out_height, out_width, out_channels]`.
+// Gradients w.r.t. the output of the convolution.
+//	strides: The stride of the sliding window for each dimension of the input
+// of the convolution.
+//	padding: The type of padding algorithm to use.
+//
+// Returns 4-D with shape
+// `[filter_height, filter_width, in_channels, out_channels]`.  Gradient w.r.t.
+// the `filter` input of the convolution.
+func DepthwiseConv2dNativeBackpropFilter(scope *Scope, input tf.Output, filter_sizes tf.Output, out_backprop tf.Output, strides []int64, padding string, optional ...DepthwiseConv2dNativeBackpropFilterAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"strides": strides, "padding": padding}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "LRN",
+		Type: "DepthwiseConv2dNativeBackpropFilter",
 		Input: []tf.Input{
-			input,
+			input, filter_sizes, out_backprop,
 		},
 		Attrs: attrs,
 	}
@@ -13287,36 +13753,62 @@ func LRN(scope *Scope, input tf.Output, optional ...LRNAttr) (output tf.Output)
 	return op.Output(0)
 }
 
-// ResourceSparseApplyAdagradAttr is an optional argument to ResourceSparseApplyAdagrad.
-type ResourceSparseApplyAdagradAttr func(optionalAttr)
+// Returns which elements of x are Inf.
+//
+// @compatibility(numpy)
+// Equivalent to np.isinf
+// @end_compatibility
+func IsInf(scope *Scope, x tf.Output) (y tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "IsInf",
+		Input: []tf.Input{
+			x,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
 
-// ResourceSparseApplyAdagradUseLocking sets the optional use_locking attribute to value.
+// ResourceSparseApplyFtrlAttr is an optional argument to ResourceSparseApplyFtrl.
+type ResourceSparseApplyFtrlAttr func(optionalAttr)
+
+// ResourceSparseApplyFtrlUseLocking sets the optional use_locking attribute to value.
 //
 // value: If `True`, updating of the var and accum tensors will be protected
 // by a lock; otherwise the behavior is undefined, but may exhibit less
 // contention.
 // If not specified, defaults to false
-func ResourceSparseApplyAdagradUseLocking(value bool) ResourceSparseApplyAdagradAttr {
+func ResourceSparseApplyFtrlUseLocking(value bool) ResourceSparseApplyFtrlAttr {
 	return func(m optionalAttr) {
 		m["use_locking"] = value
 	}
 }
 
-// Update relevant entries in '*var' and '*accum' according to the adagrad scheme.
+// Update relevant entries in '*var' according to the Ftrl-proximal scheme.
 //
-// That is for rows we have grad for, we update var and accum as follows:
-// accum += grad * grad
-// var -= lr * grad * (1 / sqrt(accum))
+// That is for rows we have grad for, we update var, accum and linear as follows:
+// accum_new = accum + grad * grad
+// linear += grad + (accum_new^(-lr_power) - accum^(-lr_power)) / lr * var
+// quadratic = 1.0 / (accum_new^(lr_power) * lr) + 2 * l2
+// var = (sign(linear) * l1 - linear) / quadratic if |linear| > l1 else 0.0
+// accum = accum_new
 //
 // Arguments:
 //	var_: Should be from a Variable().
 //	accum: Should be from a Variable().
-//	lr: Learning rate. Must be a scalar.
+//	linear: Should be from a Variable().
 //	grad: The gradient.
 //	indices: A vector of indices into the first dimension of var and accum.
+//	lr: Scaling factor. Must be a scalar.
+//	l1: L1 regularization. Must be a scalar.
+//	l2: L2 regularization. Must be a scalar.
+//	lr_power: Scaling factor. Must be a scalar.
 //
 // Returns the created operation.
-func ResourceSparseApplyAdagrad(scope *Scope, var_ tf.Output, accum tf.Output, lr tf.Output, grad tf.Output, indices tf.Output, optional ...ResourceSparseApplyAdagradAttr) (o *tf.Operation) {
+func ResourceSparseApplyFtrl(scope *Scope, var_ tf.Output, accum tf.Output, linear tf.Output, grad tf.Output, indices tf.Output, lr tf.Output, l1 tf.Output, l2 tf.Output, lr_power tf.Output, optional ...ResourceSparseApplyFtrlAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
@@ -13325,67 +13817,91 @@ func ResourceSparseApplyAdagrad(scope *Scope, var_ tf.Output, accum tf.Output, l
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "ResourceSparseApplyAdagrad",
+		Type: "ResourceSparseApplyFtrl",
 		Input: []tf.Input{
-			var_, accum, lr, grad, indices,
+			var_, accum, linear, grad, indices, lr, l1, l2, lr_power,
 		},
 		Attrs: attrs,
 	}
 	return scope.AddOperation(opspec)
 }
 
-// LRNGradAttr is an optional argument to LRNGrad.
-type LRNGradAttr func(optionalAttr)
-
-// LRNGradDepthRadius sets the optional depth_radius attribute to value.
+// Component-wise multiplies a SparseTensor by a dense Tensor.
 //
-// value: A depth radius.
-// If not specified, defaults to 5
-func LRNGradDepthRadius(value int64) LRNGradAttr {
-	return func(m optionalAttr) {
-		m["depth_radius"] = value
-	}
-}
-
-// LRNGradBias sets the optional bias attribute to value.
+// The output locations corresponding to the implicitly zero elements in the sparse
+// tensor will be zero (i.e., will not take up storage space), regardless of the
+// contents of the dense tensor (even if it's +/-INF and that INF*0 == NaN).
 //
-// value: An offset (usually > 0 to avoid dividing by 0).
-// If not specified, defaults to 1
-func LRNGradBias(value float32) LRNGradAttr {
-	return func(m optionalAttr) {
-		m["bias"] = value
-	}
-}
-
-// LRNGradAlpha sets the optional alpha attribute to value.
+// *Limitation*: this Op only broadcasts the dense side to the sparse side, but not
+// the other direction.
 //
-// value: A scale factor, usually positive.
-// If not specified, defaults to 1
-func LRNGradAlpha(value float32) LRNGradAttr {
-	return func(m optionalAttr) {
-		m["alpha"] = value
+// Arguments:
+//	sp_indices: 2-D.  `N x R` matrix with the indices of non-empty values in a
+// SparseTensor, possibly not in canonical ordering.
+//	sp_values: 1-D.  `N` non-empty values corresponding to `sp_indices`.
+//	sp_shape: 1-D.  Shape of the input SparseTensor.
+//	dense: `R`-D.  The dense Tensor operand.
+//
+// Returns 1-D.  The `N` values that are operated on.
+func SparseDenseCwiseMul(scope *Scope, sp_indices tf.Output, sp_values tf.Output, sp_shape tf.Output, dense tf.Output) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "SparseDenseCwiseMul",
+		Input: []tf.Input{
+			sp_indices, sp_values, sp_shape, dense,
+		},
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// LRNGradBeta sets the optional beta attribute to value.
+// NonMaxSuppressionAttr is an optional argument to NonMaxSuppression.
+type NonMaxSuppressionAttr func(optionalAttr)
+
+// NonMaxSuppressionIouThreshold sets the optional iou_threshold attribute to value.
 //
-// value: An exponent.
+// value: A float representing the threshold for deciding whether boxes
+// overlap too much with respect to IOU.
 // If not specified, defaults to 0.5
-func LRNGradBeta(value float32) LRNGradAttr {
+func NonMaxSuppressionIouThreshold(value float32) NonMaxSuppressionAttr {
 	return func(m optionalAttr) {
-		m["beta"] = value
+		m["iou_threshold"] = value
 	}
 }
 
-// Gradients for Local Response Normalization.
+// Greedily selects a subset of bounding boxes in descending order of score,
+//
+// pruning away boxes that have high intersection-over-union (IOU) overlap
+// with previously selected boxes.  Bounding boxes are supplied as
+// [y1, x1, y2, x2], where (y1, x1) and (y2, x2) are the coordinates of any
+// diagonal pair of box corners and the coordinates can be provided as normalized
+// (i.e., lying in the interval [0, 1]) or absolute.  Note that this algorithm
+// is agnostic to where the origin is in the coordinate system.  Note that this
+// algorithm is invariant to orthogonal transformations and translations
+// of the coordinate system; thus translating or reflections of the coordinate
+// system result in the same boxes being selected by the algorithm.
+//
+// The output of this operation is a set of integers indexing into the input
+// collection of bounding boxes representing the selected boxes.  The bounding
+// box coordinates corresponding to the selected indices can then be obtained
+// using the `tf.gather operation`.  For example:
+//
+//   selected_indices = tf.image.non_max_suppression(
+//       boxes, scores, max_output_size, iou_threshold)
+//   selected_boxes = tf.gather(boxes, selected_indices)
 //
 // Arguments:
-//	input_grads: 4-D with shape `[batch, height, width, channels]`.
-//	input_image: 4-D with shape `[batch, height, width, channels]`.
-//	output_image: 4-D with shape `[batch, height, width, channels]`.
+//	boxes: A 2-D float tensor of shape `[num_boxes, 4]`.
+//	scores: A 1-D float tensor of shape `[num_boxes]` representing a single
+// score corresponding to each box (each row of boxes).
+//	max_output_size: A scalar integer tensor representing the maximum number of
+// boxes to be selected by non max suppression.
 //
-// Returns The gradients for LRN.
-func LRNGrad(scope *Scope, input_grads tf.Output, input_image tf.Output, output_image tf.Output, optional ...LRNGradAttr) (output tf.Output) {
+// Returns A 1-D integer tensor of shape `[M]` representing the selected
+// indices from the boxes tensor, where `M <= max_output_size`.
+func NonMaxSuppression(scope *Scope, boxes tf.Output, scores tf.Output, max_output_size tf.Output, optional ...NonMaxSuppressionAttr) (selected_indices tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -13394,9 +13910,9 @@ func LRNGrad(scope *Scope, input_grads tf.Output, input_image tf.Output, output_
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "LRNGrad",
+		Type: "NonMaxSuppression",
 		Input: []tf.Input{
-			input_grads, input_image, output_image,
+			boxes, scores, max_output_size,
 		},
 		Attrs: attrs,
 	}
@@ -13404,26 +13920,38 @@ func LRNGrad(scope *Scope, input_grads tf.Output, input_image tf.Output, output_
 	return op.Output(0)
 }
 
-// StringToNumberAttr is an optional argument to StringToNumber.
-type StringToNumberAttr func(optionalAttr)
+// ResourceApplyAdadeltaAttr is an optional argument to ResourceApplyAdadelta.
+type ResourceApplyAdadeltaAttr func(optionalAttr)
 
-// StringToNumberOutType sets the optional out_type attribute to value.
+// ResourceApplyAdadeltaUseLocking sets the optional use_locking attribute to value.
 //
-// value: The numeric type to interpret each string in `string_tensor` as.
-// If not specified, defaults to DT_FLOAT
-func StringToNumberOutType(value tf.DataType) StringToNumberAttr {
+// value: If True, updating of the var, accum and update_accum tensors will be protected by
+// a lock; otherwise the behavior is undefined, but may exhibit less contention.
+// If not specified, defaults to false
+func ResourceApplyAdadeltaUseLocking(value bool) ResourceApplyAdadeltaAttr {
 	return func(m optionalAttr) {
-		m["out_type"] = value
+		m["use_locking"] = value
 	}
 }
 
-// Converts each string in the input Tensor to the specified numeric type.
+// Update '*var' according to the adadelta scheme.
 //
-// (Note that int32 overflow results in an error while float overflow
-// results in a rounded value.)
+// accum = rho() * accum + (1 - rho()) * grad.square();
+// update = (update_accum + epsilon).sqrt() * (accum + epsilon()).rsqrt() * grad;
+// update_accum = rho() * update_accum + (1 - rho()) * update.square();
+// var -= update;
 //
-// Returns A Tensor of the same shape as the input `string_tensor`.
-func StringToNumber(scope *Scope, string_tensor tf.Output, optional ...StringToNumberAttr) (output tf.Output) {
+// Arguments:
+//	var_: Should be from a Variable().
+//	accum: Should be from a Variable().
+//	accum_update: Should be from a Variable().
+//	lr: Scaling factor. Must be a scalar.
+//	rho: Decay factor. Must be a scalar.
+//	epsilon: Constant factor. Must be a scalar.
+//	grad: The gradient.
+//
+// Returns the created operation.
+func ResourceApplyAdadelta(scope *Scope, var_ tf.Output, accum tf.Output, accum_update tf.Output, lr tf.Output, rho tf.Output, epsilon tf.Output, grad tf.Output, optional ...ResourceApplyAdadeltaAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
@@ -13432,178 +13960,202 @@ func StringToNumber(scope *Scope, string_tensor tf.Output, optional ...StringToN
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "StringToNumber",
+		Type: "ResourceApplyAdadelta",
 		Input: []tf.Input{
-			string_tensor,
+			var_, accum, accum_update, lr, rho, epsilon, grad,
 		},
 		Attrs: attrs,
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return scope.AddOperation(opspec)
 }
 
-// Returns the truth value of NOT x element-wise.
-func LogicalNot(scope *Scope, x tf.Output) (y tf.Output) {
+// Output a fact about factorials.
+func Fact(scope *Scope) (fact tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "LogicalNot",
-		Input: []tf.Input{
-			x,
-		},
+		Type: "Fact",
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Pads a tensor with zeros.
+// StatelessRandomUniformAttr is an optional argument to StatelessRandomUniform.
+type StatelessRandomUniformAttr func(optionalAttr)
+
+// StatelessRandomUniformDtype sets the optional dtype attribute to value.
 //
-// This operation pads a `input` with zeros according to the `paddings` you
-// specify. `paddings` is an integer tensor with shape `[Dn, 2]`, where n is the
-// rank of `input`. For each dimension D of `input`, `paddings[D, 0]` indicates
-// how many zeros to add before the contents of `input` in that dimension, and
-// `paddings[D, 1]` indicates how many zeros to add after the contents of `input`
-// in that dimension.
+// value: The type of the output.
+// If not specified, defaults to DT_FLOAT
+func StatelessRandomUniformDtype(value tf.DataType) StatelessRandomUniformAttr {
+	return func(m optionalAttr) {
+		m["dtype"] = value
+	}
+}
+
+// Outputs deterministic pseudorandom random values from a uniform distribution.
 //
-// The padded size of each dimension D of the output is:
+// The generated values follow a uniform distribution in the range `[0, 1)`. The
+// lower bound 0 is included in the range, while the upper bound 1 is excluded.
 //
-// `paddings(D, 0) + input.dim_size(D) + paddings(D, 1)`
+// The outputs are a deterministic function of `shape` and `seed`.
 //
-// For example:
+// Arguments:
+//	shape: The shape of the output tensor.
+//	seed: 2 seeds (shape [2]).
 //
-// ```prettyprint
-// # 't' is [[1, 1], [2, 2]]
-// # 'paddings' is [[1, 1], [2, 2]]
-// # rank of 't' is 2
-// pad(t, paddings) ==> [[0, 0, 0, 0, 0, 0]
-//                       [0, 0, 1, 1, 0, 0]
-//                       [0, 0, 2, 2, 0, 0]
-//                       [0, 0, 0, 0, 0, 0]]
-// ```
-func Pad(scope *Scope, input tf.Output, paddings tf.Output) (output tf.Output) {
+// Returns Random values with specified shape.
+func StatelessRandomUniform(scope *Scope, shape tf.Output, seed tf.Output, optional ...StatelessRandomUniformAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "Pad",
+		Type: "StatelessRandomUniform",
 		Input: []tf.Input{
-			input, paddings,
+			shape, seed,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Computes the number of elements in the given queue.
-//
-// Arguments:
-//	handle: The handle to a queue.
+// PrintAttr is an optional argument to Print.
+type PrintAttr func(optionalAttr)
+
+// PrintMessage sets the optional message attribute to value.
 //
-// Returns The number of elements in the given queue.
-func QueueSizeV2(scope *Scope, handle tf.Output) (size tf.Output) {
-	if scope.Err() != nil {
-		return
+// value: A string, prefix of the error message.
+// If not specified, defaults to ""
+func PrintMessage(value string) PrintAttr {
+	return func(m optionalAttr) {
+		m["message"] = value
 	}
-	opspec := tf.OpSpec{
-		Type: "QueueSizeV2",
-		Input: []tf.Input{
-			handle,
-		},
+}
+
+// PrintFirstN sets the optional first_n attribute to value.
+//
+// value: Only log `first_n` number of times. -1 disables logging.
+// If not specified, defaults to -1
+func PrintFirstN(value int64) PrintAttr {
+	return func(m optionalAttr) {
+		m["first_n"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// Outputs a `Summary` protocol buffer with a histogram.
+// PrintSummarize sets the optional summarize attribute to value.
 //
-// The generated
-// [`Summary`](https://www.tensorflow.org/code/tensorflow/core/framework/summary.proto)
-// has one summary value containing a histogram for `values`.
+// value: Only print this many entries of each tensor.
+// If not specified, defaults to 3
+func PrintSummarize(value int64) PrintAttr {
+	return func(m optionalAttr) {
+		m["summarize"] = value
+	}
+}
+
+// Prints a list of tensors.
 //
-// This op reports an `InvalidArgument` error if any value is not finite.
+// Passes `input` through to `output` and prints `data` when evaluating.
 //
 // Arguments:
-//	tag: Scalar.  Tag to use for the `Summary.Value`.
-//	values: Any shape. Values to use to build the histogram.
+//	input: The tensor passed to `output`
+//	data: A list of tensors to print out when op is evaluated.
 //
-// Returns Scalar. Serialized `Summary` protocol buffer.
-func HistogramSummary(scope *Scope, tag tf.Output, values tf.Output) (summary tf.Output) {
+// Returns = The unmodified `input` tensor
+func Print(scope *Scope, input tf.Output, data []tf.Output, optional ...PrintAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "HistogramSummary",
+		Type: "Print",
 		Input: []tf.Input{
-			tag, values,
+			input, tf.OutputList(data),
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// AsStringAttr is an optional argument to AsString.
-type AsStringAttr func(optionalAttr)
-
-// AsStringPrecision sets the optional precision attribute to value.
-//
-// value: The post-decimal precision to use for floating point numbers.
-// Only used if precision > -1.
-// If not specified, defaults to -1
-func AsStringPrecision(value int64) AsStringAttr {
-	return func(m optionalAttr) {
-		m["precision"] = value
-	}
-}
+// ResizeAreaAttr is an optional argument to ResizeArea.
+type ResizeAreaAttr func(optionalAttr)
 
-// AsStringScientific sets the optional scientific attribute to value.
+// ResizeAreaAlignCorners sets the optional align_corners attribute to value.
 //
-// value: Use scientific notation for floating point numbers.
+// value: If true, rescale input by (new_height - 1) / (height - 1), which
+// exactly aligns the 4 corners of images and resized images. If false, rescale
+// by new_height / height. Treat similarly the width dimension.
 // If not specified, defaults to false
-func AsStringScientific(value bool) AsStringAttr {
+func ResizeAreaAlignCorners(value bool) ResizeAreaAttr {
 	return func(m optionalAttr) {
-		m["scientific"] = value
+		m["align_corners"] = value
 	}
 }
 
-// AsStringShortest sets the optional shortest attribute to value.
+// Resize `images` to `size` using area interpolation.
 //
-// value: Use shortest representation (either scientific or standard) for
-// floating point numbers.
-// If not specified, defaults to false
-func AsStringShortest(value bool) AsStringAttr {
-	return func(m optionalAttr) {
-		m["shortest"] = value
-	}
-}
-
-// AsStringWidth sets the optional width attribute to value.
+// Input images can be of different types but output images are always float.
 //
-// value: Pad pre-decimal numbers to this width.
-// Applies to both floating point and integer numbers.
-// Only used if width > -1.
-// If not specified, defaults to -1
-func AsStringWidth(value int64) AsStringAttr {
-	return func(m optionalAttr) {
-		m["width"] = value
+// Arguments:
+//	images: 4-D with shape `[batch, height, width, channels]`.
+//	size: = A 1-D int32 Tensor of 2 elements: `new_height, new_width`.  The
+// new size for the images.
+//
+// Returns 4-D with shape
+// `[batch, new_height, new_width, channels]`.
+func ResizeArea(scope *Scope, images tf.Output, size tf.Output, optional ...ResizeAreaAttr) (resized_images tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "ResizeArea",
+		Input: []tf.Input{
+			images, size,
+		},
+		Attrs: attrs,
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// AsStringFill sets the optional fill attribute to value.
-//
-// value: The value to pad if width > -1.  If empty, pads with spaces.
-// Another typical value is '0'.  String cannot be longer than 1 character.
-// If not specified, defaults to ""
-func AsStringFill(value string) AsStringAttr {
+// RealAttr is an optional argument to Real.
+type RealAttr func(optionalAttr)
+
+// RealTout sets the optional Tout attribute to value.
+// If not specified, defaults to DT_FLOAT
+func RealTout(value tf.DataType) RealAttr {
 	return func(m optionalAttr) {
-		m["fill"] = value
+		m["Tout"] = value
 	}
 }
 
-// Converts each entry in the given tensor to strings.  Supports many numeric
+// Returns the real part of a complex number.
 //
-// types and boolean.
-func AsString(scope *Scope, input tf.Output, optional ...AsStringAttr) (output tf.Output) {
+// Given a tensor `input` of complex numbers, this operation returns a tensor of
+// type `float` that is the real part of each element in `input`. All elements in
+// `input` must be complex numbers of the form \\(a + bj\\), where *a* is the real
+//  part returned by this operation and *b* is the imaginary part.
+//
+// For example:
+//
+// ```
+// # tensor 'input' is [-2.25 + 4.75j, 3.25 + 5.75j]
+// tf.real(input) ==> [-2.25, 3.25]
+// ```
+func Real(scope *Scope, input tf.Output, optional ...RealAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -13612,7 +14164,7 @@ func AsString(scope *Scope, input tf.Output, optional ...AsStringAttr) (output t
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "AsString",
+		Type: "Real",
 		Input: []tf.Input{
 			input,
 		},
@@ -13622,95 +14174,123 @@ func AsString(scope *Scope, input tf.Output, optional ...AsStringAttr) (output t
 	return op.Output(0)
 }
 
-// Says whether the targets are in the top `K` predictions.
-//
-// This outputs a `batch_size` bool array, an entry `out[i]` is `true` if the
-// prediction for the target class is among the top `k` predictions among
-// all predictions for example `i`. Note that the behavior of `InTopK` differs
-// from the `TopK` op in its handling of ties; if multiple classes have the
-// same prediction value and straddle the top-`k` boundary, all of those
-// classes are considered to be in the top `k`.
-//
-// More formally, let
-//
-//   \\(predictions_i\\) be the predictions for all classes for example `i`,
-//   \\(targets_i\\) be the target class for example `i`,
-//   \\(out_i\\) be the output for example `i`,
+// Returns the next record (key, value pair) produced by a Reader.
 //
-// $$out_i = predictions_{i, targets_i} \in TopKIncludingTies(predictions_i)$$
+// Will dequeue from the input queue if necessary (e.g. when the
+// Reader needs to start reading from a new file since it has finished
+// with the previous file).
 //
 // Arguments:
-//	predictions: A `batch_size` x `classes` tensor.
-//	targets: A `batch_size` vector of class ids.
-//	k: Number of top elements to look at for computing precision.
+//	reader_handle: Handle to a Reader.
+//	queue_handle: Handle to a Queue, with string work items.
 //
-// Returns Computed Precision at `k` as a `bool Tensor`.
-func InTopK(scope *Scope, predictions tf.Output, targets tf.Output, k int64) (precision tf.Output) {
+// Returns A scalar.A scalar.
+func ReaderReadV2(scope *Scope, reader_handle tf.Output, queue_handle tf.Output) (key tf.Output, value tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"k": k}
 	opspec := tf.OpSpec{
-		Type: "InTopK",
+		Type: "ReaderReadV2",
 		Input: []tf.Input{
-			predictions, targets,
+			reader_handle, queue_handle,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1)
 }
 
-// GatherAttr is an optional argument to Gather.
-type GatherAttr func(optionalAttr)
+// MutableDenseHashTableV2Attr is an optional argument to MutableDenseHashTableV2.
+type MutableDenseHashTableV2Attr func(optionalAttr)
 
-// GatherValidateIndices sets the optional validate_indices attribute to value.
-// If not specified, defaults to true
-func GatherValidateIndices(value bool) GatherAttr {
+// MutableDenseHashTableV2Container sets the optional container attribute to value.
+//
+// value: If non-empty, this table is placed in the given container.
+// Otherwise, a default container is used.
+// If not specified, defaults to ""
+func MutableDenseHashTableV2Container(value string) MutableDenseHashTableV2Attr {
 	return func(m optionalAttr) {
-		m["validate_indices"] = value
+		m["container"] = value
 	}
 }
 
-// Gather slices from `params` according to `indices`.
+// MutableDenseHashTableV2SharedName sets the optional shared_name attribute to value.
 //
-// `indices` must be an integer tensor of any dimension (usually 0-D or 1-D).
-// Produces an output tensor with shape `indices.shape + params.shape[1:]` where:
+// value: If non-empty, this table is shared under the given name across
+// multiple sessions.
+// If not specified, defaults to ""
+func MutableDenseHashTableV2SharedName(value string) MutableDenseHashTableV2Attr {
+	return func(m optionalAttr) {
+		m["shared_name"] = value
+	}
+}
+
+// MutableDenseHashTableV2UseNodeNameSharing sets the optional use_node_name_sharing attribute to value.
+// If not specified, defaults to false
+func MutableDenseHashTableV2UseNodeNameSharing(value bool) MutableDenseHashTableV2Attr {
+	return func(m optionalAttr) {
+		m["use_node_name_sharing"] = value
+	}
+}
+
+// MutableDenseHashTableV2ValueShape sets the optional value_shape attribute to value.
 //
-// ```python
-//     # Scalar indices
-//     output[:, ..., :] = params[indices, :, ... :]
+// value: The shape of each value.
+// If not specified, defaults to <>
+func MutableDenseHashTableV2ValueShape(value tf.Shape) MutableDenseHashTableV2Attr {
+	return func(m optionalAttr) {
+		m["value_shape"] = value
+	}
+}
+
+// MutableDenseHashTableV2InitialNumBuckets sets the optional initial_num_buckets attribute to value.
 //
-//     # Vector indices
-//     output[i, :, ..., :] = params[indices[i], :, ... :]
+// value: The initial number of hash table buckets. Must be a power
+// to 2.
+// If not specified, defaults to 131072
+func MutableDenseHashTableV2InitialNumBuckets(value int64) MutableDenseHashTableV2Attr {
+	return func(m optionalAttr) {
+		m["initial_num_buckets"] = value
+	}
+}
+
+// MutableDenseHashTableV2MaxLoadFactor sets the optional max_load_factor attribute to value.
 //
-//     # Higher rank indices
-//     output[i, ..., j, :, ... :] = params[indices[i, ..., j], :, ..., :]
-// ```
+// value: The maximum ratio between number of entries and number of
+// buckets before growing the table. Must be between 0 and 1.
+// If not specified, defaults to 0.8
+func MutableDenseHashTableV2MaxLoadFactor(value float32) MutableDenseHashTableV2Attr {
+	return func(m optionalAttr) {
+		m["max_load_factor"] = value
+	}
+}
+
+// Creates an empty hash table that uses tensors as the backing store.
 //
-// If `indices` is a permutation and `len(indices) == params.shape[0]` then
-// this operation will permute `params` accordingly.
+// It uses "open addressing" with quadratic reprobing to resolve
+// collisions.
 //
-// `validate_indices`: DEPRECATED. If this operation is assigned to CPU, values in
-// `indices` are always validated to be within range. If assigned to GPU,
-// out-of-bound indices result in safe but unspecified behavior, which may include
-// raising an error.
+// This op creates a mutable hash table, specifying the type of its keys and
+// values. Each value must be a scalar. Data can be inserted into the table using
+// the insert operations. It does not support the initialization operation.
 //
-// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
-// <img style="width:100%" src="../../../images/Gather.png" alt>
-// </div>
-func Gather(scope *Scope, params tf.Output, indices tf.Output, optional ...GatherAttr) (output tf.Output) {
+// Arguments:
+//	empty_key: The key used to represent empty key buckets internally. Must not
+// be used in insert or lookup operations.
+//	value_dtype: Type of the table values.
+//
+// Returns Handle to a table.
+func MutableDenseHashTableV2(scope *Scope, empty_key tf.Output, value_dtype tf.DataType, optional ...MutableDenseHashTableV2Attr) (table_handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"value_dtype": value_dtype}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "Gather",
+		Type: "MutableDenseHashTableV2",
 		Input: []tf.Input{
-			params, indices,
+			empty_key,
 		},
 		Attrs: attrs,
 	}
@@ -13718,184 +14298,221 @@ func Gather(scope *Scope, params tf.Output, indices tf.Output, optional ...Gathe
 	return op.Output(0)
 }
 
-// Adjust the contrast of one or more images.
-//
-// `images` is a tensor of at least 3 dimensions.  The last 3 dimensions are
-// interpreted as `[height, width, channels]`.  The other dimensions only
-// represent a collection of images, such as `[batch, height, width, channels].`
-//
-// Contrast is adjusted independently for each channel of each image.
-//
-// For each channel, the Op first computes the mean of the image pixels in the
-// channel and then adjusts each component of each pixel to
-// `(x - mean) * contrast_factor + mean`.
-//
-// Arguments:
-//	images: Images to adjust.  At least 3-D.
-//	contrast_factor: A float multiplier for adjusting contrast.
+// LRNAttr is an optional argument to LRN.
+type LRNAttr func(optionalAttr)
+
+// LRNDepthRadius sets the optional depth_radius attribute to value.
 //
-// Returns The contrast-adjusted image or images.
-func AdjustContrastv2(scope *Scope, images tf.Output, contrast_factor tf.Output) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "AdjustContrastv2",
-		Input: []tf.Input{
-			images, contrast_factor,
-		},
+// value: 0-D.  Half-width of the 1-D normalization window.
+// If not specified, defaults to 5
+func LRNDepthRadius(value int64) LRNAttr {
+	return func(m optionalAttr) {
+		m["depth_radius"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// Computes softsign gradients for a softsign operation.
+// LRNBias sets the optional bias attribute to value.
 //
-// Arguments:
-//	gradients: The backpropagated gradients to the corresponding softsign operation.
-//	features: The features passed as input to the corresponding softsign operation.
+// value: An offset (usually positive to avoid dividing by 0).
+// If not specified, defaults to 1
+func LRNBias(value float32) LRNAttr {
+	return func(m optionalAttr) {
+		m["bias"] = value
+	}
+}
+
+// LRNAlpha sets the optional alpha attribute to value.
 //
-// Returns The gradients: `gradients / (1 + abs(-features)) ** 2`.
-func SoftsignGrad(scope *Scope, gradients tf.Output, features tf.Output) (backprops tf.Output) {
-	if scope.Err() != nil {
-		return
+// value: A scale factor, usually positive.
+// If not specified, defaults to 1
+func LRNAlpha(value float32) LRNAttr {
+	return func(m optionalAttr) {
+		m["alpha"] = value
 	}
-	opspec := tf.OpSpec{
-		Type: "SoftsignGrad",
-		Input: []tf.Input{
-			gradients, features,
-		},
+}
+
+// LRNBeta sets the optional beta attribute to value.
+//
+// value: An exponent.
+// If not specified, defaults to 0.5
+func LRNBeta(value float32) LRNAttr {
+	return func(m optionalAttr) {
+		m["beta"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// Compute the polygamma function \\(\psi^{(n)}(x)\\).
+// Local Response Normalization.
 //
-// The polygamma function is defined as:
+// The 4-D `input` tensor is treated as a 3-D array of 1-D vectors (along the last
+// dimension), and each vector is normalized independently.  Within a given vector,
+// each component is divided by the weighted, squared sum of inputs within
+// `depth_radius`.  In detail,
 //
-// ```
-// \psi^{(n)}(x) = \frac{d^n}{dx^n} \psi(x)
-// ```
-// where \\(\psi(x)\\) is the digamma function.
-func Polygamma(scope *Scope, a tf.Output, x tf.Output) (z tf.Output) {
+//     sqr_sum[a, b, c, d] =
+//         sum(input[a, b, c, d - depth_radius : d + depth_radius + 1] ** 2)
+//     output = input / (bias + alpha * sqr_sum) ** beta
+//
+// For details, see [Krizhevsky et al., ImageNet classification with deep
+// convolutional neural networks (NIPS 2012)](http://papers.nips.cc/paper/4824-imagenet-classification-with-deep-convolutional-neural-networks).
+//
+// Arguments:
+//	input: 4-D.
+func LRN(scope *Scope, input tf.Output, optional ...LRNAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "Polygamma",
+		Type: "LRN",
 		Input: []tf.Input{
-			a, x,
+			input,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Computes the grayscale dilation of 4-D `input` and 3-D `filter` tensors.
-//
-// The `input` tensor has shape `[batch, in_height, in_width, depth]` and the
-// `filter` tensor has shape `[filter_height, filter_width, depth]`, i.e., each
-// input channel is processed independently of the others with its own structuring
-// function. The `output` tensor has shape
-// `[batch, out_height, out_width, depth]`. The spatial dimensions of the output
-// tensor depend on the `padding` algorithm. We currently only support the default
-// "NHWC" `data_format`.
-//
-// In detail, the grayscale morphological 2-D dilation is the max-sum correlation
-// (for consistency with `conv2d`, we use unmirrored filters):
-//
-//     output[b, y, x, c] =
-//        max_{dy, dx} input[b,
-//                           strides[1] * y + rates[1] * dy,
-//                           strides[2] * x + rates[2] * dx,
-//                           c] +
-//                     filter[dy, dx, c]
+// ResourceSparseApplyAdagradAttr is an optional argument to ResourceSparseApplyAdagrad.
+type ResourceSparseApplyAdagradAttr func(optionalAttr)
+
+// ResourceSparseApplyAdagradUseLocking sets the optional use_locking attribute to value.
 //
-// Max-pooling is a special case when the filter has size equal to the pooling
-// kernel size and contains all zeros.
+// value: If `True`, updating of the var and accum tensors will be protected
+// by a lock; otherwise the behavior is undefined, but may exhibit less
+// contention.
+// If not specified, defaults to false
+func ResourceSparseApplyAdagradUseLocking(value bool) ResourceSparseApplyAdagradAttr {
+	return func(m optionalAttr) {
+		m["use_locking"] = value
+	}
+}
+
+// Update relevant entries in '*var' and '*accum' according to the adagrad scheme.
 //
-// Note on duality: The dilation of `input` by the `filter` is equal to the
-// negation of the erosion of `-input` by the reflected `filter`.
+// That is for rows we have grad for, we update var and accum as follows:
+// accum += grad * grad
+// var -= lr * grad * (1 / sqrt(accum))
 //
 // Arguments:
-//	input: 4-D with shape `[batch, in_height, in_width, depth]`.
-//	filter: 3-D with shape `[filter_height, filter_width, depth]`.
-//	strides: The stride of the sliding window for each dimension of the input
-// tensor. Must be: `[1, stride_height, stride_width, 1]`.
-//	rates: The input stride for atrous morphological dilation. Must be:
-// `[1, rate_height, rate_width, 1]`.
-//	padding: The type of padding algorithm to use.
+//	var_: Should be from a Variable().
+//	accum: Should be from a Variable().
+//	lr: Learning rate. Must be a scalar.
+//	grad: The gradient.
+//	indices: A vector of indices into the first dimension of var and accum.
 //
-// Returns 4-D with shape `[batch, out_height, out_width, depth]`.
-func Dilation2D(scope *Scope, input tf.Output, filter tf.Output, strides []int64, rates []int64, padding string) (output tf.Output) {
+// Returns the created operation.
+func ResourceSparseApplyAdagrad(scope *Scope, var_ tf.Output, accum tf.Output, lr tf.Output, grad tf.Output, indices tf.Output, optional ...ResourceSparseApplyAdagradAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"strides": strides, "rates": rates, "padding": padding}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "Dilation2D",
+		Type: "ResourceSparseApplyAdagrad",
 		Input: []tf.Input{
-			input, filter,
+			var_, accum, lr, grad, indices,
 		},
 		Attrs: attrs,
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return scope.AddOperation(opspec)
+}
+
+// LRNGradAttr is an optional argument to LRNGrad.
+type LRNGradAttr func(optionalAttr)
+
+// LRNGradDepthRadius sets the optional depth_radius attribute to value.
+//
+// value: A depth radius.
+// If not specified, defaults to 5
+func LRNGradDepthRadius(value int64) LRNGradAttr {
+	return func(m optionalAttr) {
+		m["depth_radius"] = value
+	}
 }
 
-// Decode the first frame of a GIF-encoded image to a uint8 tensor.
+// LRNGradBias sets the optional bias attribute to value.
 //
-// GIF with frame or transparency compression are not supported
-// convert animated GIF from compressed to uncompressed by:
+// value: An offset (usually > 0 to avoid dividing by 0).
+// If not specified, defaults to 1
+func LRNGradBias(value float32) LRNGradAttr {
+	return func(m optionalAttr) {
+		m["bias"] = value
+	}
+}
+
+// LRNGradAlpha sets the optional alpha attribute to value.
+//
+// value: A scale factor, usually positive.
+// If not specified, defaults to 1
+func LRNGradAlpha(value float32) LRNGradAttr {
+	return func(m optionalAttr) {
+		m["alpha"] = value
+	}
+}
+
+// LRNGradBeta sets the optional beta attribute to value.
 //
-// convert $src.gif -coalesce $dst.gif
+// value: An exponent.
+// If not specified, defaults to 0.5
+func LRNGradBeta(value float32) LRNGradAttr {
+	return func(m optionalAttr) {
+		m["beta"] = value
+	}
+}
+
+// Gradients for Local Response Normalization.
 //
 // Arguments:
-//	contents: 0-D.  The GIF-encoded image.
+//	input_grads: 4-D with shape `[batch, height, width, channels]`.
+//	input_image: 4-D with shape `[batch, height, width, channels]`.
+//	output_image: 4-D with shape `[batch, height, width, channels]`.
 //
-// Returns 4-D with shape `[num_frames, height, width, 3]`. RGB order
-func DecodeGif(scope *Scope, contents tf.Output) (image tf.Output) {
+// Returns The gradients for LRN.
+func LRNGrad(scope *Scope, input_grads tf.Output, input_image tf.Output, output_image tf.Output, optional ...LRNGradAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "DecodeGif",
+		Type: "LRNGrad",
 		Input: []tf.Input{
-			contents,
+			input_grads, input_image, output_image,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// EncodeBase64Attr is an optional argument to EncodeBase64.
-type EncodeBase64Attr func(optionalAttr)
+// StringToNumberAttr is an optional argument to StringToNumber.
+type StringToNumberAttr func(optionalAttr)
 
-// EncodeBase64Pad sets the optional pad attribute to value.
+// StringToNumberOutType sets the optional out_type attribute to value.
 //
-// value: Bool whether padding is applied at the ends.
-// If not specified, defaults to false
-func EncodeBase64Pad(value bool) EncodeBase64Attr {
+// value: The numeric type to interpret each string in `string_tensor` as.
+// If not specified, defaults to DT_FLOAT
+func StringToNumberOutType(value tf.DataType) StringToNumberAttr {
 	return func(m optionalAttr) {
-		m["pad"] = value
+		m["out_type"] = value
 	}
 }
 
-// Encode strings into web-safe base64 format.
-//
-// Refer to the following article for more information on base64 format:
-// en.wikipedia.org/wiki/Base64. Base64 strings may have padding with '=' at the
-// end so that the encoded has length multiple of 4. See Padding section of the
-// link above.
-//
-// Web-safe means that the encoder uses - and _ instead of + and /.
+// Converts each string in the input Tensor to the specified numeric type.
 //
-// Arguments:
-//	input: Strings to be encoded.
+// (Note that int32 overflow results in an error while float overflow
+// results in a rounded value.)
 //
-// Returns Input strings encoded in base64.
-func EncodeBase64(scope *Scope, input tf.Output, optional ...EncodeBase64Attr) (output tf.Output) {
+// Returns A Tensor of the same shape as the input `string_tensor`.
+func StringToNumber(scope *Scope, string_tensor tf.Output, optional ...StringToNumberAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -13904,9 +14521,9 @@ func EncodeBase64(scope *Scope, input tf.Output, optional ...EncodeBase64Attr) (
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "EncodeBase64",
+		Type: "StringToNumber",
 		Input: []tf.Input{
-			input,
+			string_tensor,
 		},
 		Attrs: attrs,
 	}
@@ -13914,148 +14531,168 @@ func EncodeBase64(scope *Scope, input tf.Output, optional ...EncodeBase64Attr) (
 	return op.Output(0)
 }
 
-// Produce a string tensor that encodes the state of a Reader.
-//
-// Not all Readers support being serialized, so this can produce an
-// Unimplemented error.
-//
-// Arguments:
-//	reader_handle: Handle to a Reader.
-func ReaderSerializeStateV2(scope *Scope, reader_handle tf.Output) (state tf.Output) {
+// Returns the truth value of NOT x element-wise.
+func LogicalNot(scope *Scope, x tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "ReaderSerializeStateV2",
+		Type: "LogicalNot",
 		Input: []tf.Input{
-			reader_handle,
+			x,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Return substrings from `Tensor` of strings.
-//
-// For each string in the input `Tensor`, creates a substring starting at index
-// `pos` with a total length of `len`.
-//
-// If `len` defines a substring that would extend beyond the length of the input
-// string, then as many characters as possible are used.
-//
-// If `pos` is negative or specifies a character index larger than any of the input
-// strings, then an `InvalidArgumentError` is thrown.
-//
-// `pos` and `len` must have the same shape, otherwise a `ValueError` is thrown on
-// Op creation.
-//
-// *NOTE*: `Substr` supports broadcasting up to two dimensions. More about
-// broadcasting
-// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-//
-// ---
-//
-// Examples
-//
-// Using scalar `pos` and `len`:
+// Pads a tensor with zeros.
 //
-// ```python
-// input = [b'Hello', b'World']
-// position = 1
-// length = 3
+// This operation pads a `input` with zeros according to the `paddings` you
+// specify. `paddings` is an integer tensor with shape `[Dn, 2]`, where n is the
+// rank of `input`. For each dimension D of `input`, `paddings[D, 0]` indicates
+// how many zeros to add before the contents of `input` in that dimension, and
+// `paddings[D, 1]` indicates how many zeros to add after the contents of `input`
+// in that dimension.
 //
-// output = [b'ell', b'orl']
-// ```
+// The padded size of each dimension D of the output is:
 //
-// Using `pos` and `len` with same shape as `input`:
+// `paddings(D, 0) + input.dim_size(D) + paddings(D, 1)`
 //
-// ```python
-// input = [[b'ten', b'eleven', b'twelve'],
-//          [b'thirteen', b'fourteen', b'fifteen'],
-//          [b'sixteen', b'seventeen', b'eighteen']]
-// position = [[1, 2, 3],
-//             [1, 2, 3],
-//             [1, 2, 3]]
-// length =   [[2, 3, 4],
-//             [4, 3, 2],
-//             [5, 5, 5]]
+// For example:
 //
-// output = [[b'en', b'eve', b'lve'],
-//           [b'hirt', b'urt', b'te'],
-//           [b'ixtee', b'vente', b'hteen']]
 // ```
-//
-// Broadcasting `pos` and `len` onto `input`:
-//
+// # 't' is [[1, 1], [2, 2]]
+// # 'paddings' is [[1, 1], [2, 2]]
+// # rank of 't' is 2
+// pad(t, paddings) ==> [[0, 0, 0, 0, 0, 0]
+//                       [0, 0, 1, 1, 0, 0]
+//                       [0, 0, 2, 2, 0, 0]
+//                       [0, 0, 0, 0, 0, 0]]
 // ```
-// input = [[b'ten', b'eleven', b'twelve'],
-//          [b'thirteen', b'fourteen', b'fifteen'],
-//          [b'sixteen', b'seventeen', b'eighteen'],
-//          [b'nineteen', b'twenty', b'twentyone']]
-// position = [1, 2, 3]
-// length =   [1, 2, 3]
+func Pad(scope *Scope, input tf.Output, paddings tf.Output) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "Pad",
+		Input: []tf.Input{
+			input, paddings,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Computes the number of elements in the given queue.
 //
-// output = [[b'e', b'ev', b'lve'],
-//           [b'h', b'ur', b'tee'],
-//           [b'i', b've', b'hte'],
-//           [b'i', b'en', b'nty']]
-// ```
+// Arguments:
+//	handle: The handle to a queue.
 //
-// Broadcasting `input` onto `pos` and `len`:
+// Returns The number of elements in the given queue.
+func QueueSizeV2(scope *Scope, handle tf.Output) (size tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "QueueSizeV2",
+		Input: []tf.Input{
+			handle,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Outputs a `Summary` protocol buffer with a histogram.
 //
-// ```
-// input = b'thirteen'
-// position = [1, 5, 7]
-// length =   [3, 2, 1]
+// The generated
+// [`Summary`](https://www.tensorflow.org/code/tensorflow/core/framework/summary.proto)
+// has one summary value containing a histogram for `values`.
 //
-// output = [b'hir', b'ee', b'n"]
-// ```
+// This op reports an `InvalidArgument` error if any value is not finite.
 //
 // Arguments:
-//	input: Tensor of strings
-//	pos: Scalar defining the position of first character in each substring
-//	len: Scalar defining the number of characters to include in each substring
+//	tag: Scalar.  Tag to use for the `Summary.Value`.
+//	values: Any shape. Values to use to build the histogram.
 //
-// Returns Tensor of substrings
-func Substr(scope *Scope, input tf.Output, pos tf.Output, len tf.Output) (output tf.Output) {
+// Returns Scalar. Serialized `Summary` protocol buffer.
+func HistogramSummary(scope *Scope, tag tf.Output, values tf.Output) (summary tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Substr",
+		Type: "HistogramSummary",
 		Input: []tf.Input{
-			input, pos, len,
+			tag, values,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// StatelessRandomNormalAttr is an optional argument to StatelessRandomNormal.
-type StatelessRandomNormalAttr func(optionalAttr)
-
-// StatelessRandomNormalDtype sets the optional dtype attribute to value.
+// AsStringAttr is an optional argument to AsString.
+type AsStringAttr func(optionalAttr)
+
+// AsStringPrecision sets the optional precision attribute to value.
+//
+// value: The post-decimal precision to use for floating point numbers.
+// Only used if precision > -1.
+// If not specified, defaults to -1
+func AsStringPrecision(value int64) AsStringAttr {
+	return func(m optionalAttr) {
+		m["precision"] = value
+	}
+}
+
+// AsStringScientific sets the optional scientific attribute to value.
+//
+// value: Use scientific notation for floating point numbers.
+// If not specified, defaults to false
+func AsStringScientific(value bool) AsStringAttr {
+	return func(m optionalAttr) {
+		m["scientific"] = value
+	}
+}
+
+// AsStringShortest sets the optional shortest attribute to value.
 //
-// value: The type of the output.
-// If not specified, defaults to DT_FLOAT
-func StatelessRandomNormalDtype(value tf.DataType) StatelessRandomNormalAttr {
+// value: Use shortest representation (either scientific or standard) for
+// floating point numbers.
+// If not specified, defaults to false
+func AsStringShortest(value bool) AsStringAttr {
 	return func(m optionalAttr) {
-		m["dtype"] = value
+		m["shortest"] = value
 	}
 }
 
-// Outputs deterministic pseudorandom values from a normal distribution.
-//
-// The generated values will have mean 0 and standard deviation 1.
+// AsStringWidth sets the optional width attribute to value.
 //
-// The outputs are a deterministic function of `shape` and `seed`.
+// value: Pad pre-decimal numbers to this width.
+// Applies to both floating point and integer numbers.
+// Only used if width > -1.
+// If not specified, defaults to -1
+func AsStringWidth(value int64) AsStringAttr {
+	return func(m optionalAttr) {
+		m["width"] = value
+	}
+}
+
+// AsStringFill sets the optional fill attribute to value.
 //
-// Arguments:
-//	shape: The shape of the output tensor.
-//	seed: 2 seeds (shape [2]).
+// value: The value to pad if width > -1.  If empty, pads with spaces.
+// Another typical value is '0'.  String cannot be longer than 1 character.
+// If not specified, defaults to ""
+func AsStringFill(value string) AsStringAttr {
+	return func(m optionalAttr) {
+		m["fill"] = value
+	}
+}
+
+// Converts each entry in the given tensor to strings.  Supports many numeric
 //
-// Returns Random values with specified shape.
-func StatelessRandomNormal(scope *Scope, shape tf.Output, seed tf.Output, optional ...StatelessRandomNormalAttr) (output tf.Output) {
+// types and boolean.
+func AsString(scope *Scope, input tf.Output, optional ...AsStringAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -14064,9 +14701,9 @@ func StatelessRandomNormal(scope *Scope, shape tf.Output, seed tf.Output, option
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "StatelessRandomNormal",
+		Type: "AsString",
 		Input: []tf.Input{
-			shape, seed,
+			input,
 		},
 		Attrs: attrs,
 	}
@@ -14074,154 +14711,151 @@ func StatelessRandomNormal(scope *Scope, shape tf.Output, seed tf.Output, option
 	return op.Output(0)
 }
 
-// Inverse fast Fourier transform.
-//
-// Computes the inverse 1-dimensional discrete Fourier transform over the
-// inner-most dimension of `input`.
-//
-// Arguments:
-//	input: A complex64 tensor.
-//
-// Returns A complex64 tensor of the same shape as `input`. The inner-most
-//   dimension of `input` is replaced with its inverse 1D Fourier transform.
-//
-// @compatibility(numpy)
-// Equivalent to np.fft.ifft
-// @end_compatibility
-func IFFT(scope *Scope, input tf.Output) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "IFFT",
-		Input: []tf.Input{
-			input,
-		},
+// GatherAttr is an optional argument to Gather.
+type GatherAttr func(optionalAttr)
+
+// GatherValidateIndices sets the optional validate_indices attribute to value.
+// If not specified, defaults to true
+func GatherValidateIndices(value bool) GatherAttr {
+	return func(m optionalAttr) {
+		m["validate_indices"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// Computes the sum along sparse segments of a tensor divided by the sqrt of N.
+// Gather slices from `params` according to `indices`.
 //
-// N is the size of the segment being reduced.
+// `indices` must be an integer tensor of any dimension (usually 0-D or 1-D).
+// Produces an output tensor with shape `indices.shape + params.shape[1:]` where:
 //
-// Read @{$math_ops#segmentation$the section on segmentation} for an explanation of
-// segments.
+// ```python
+//     # Scalar indices
+//     output[:, ..., :] = params[indices, :, ... :]
 //
-// Arguments:
+//     # Vector indices
+//     output[i, :, ..., :] = params[indices[i], :, ... :]
 //
-//	indices: A 1-D tensor. Has same rank as `segment_ids`.
-//	segment_ids: A 1-D tensor. Values should be sorted and can be repeated.
+//     # Higher rank indices
+//     output[i, ..., j, :, ... :] = params[indices[i, ..., j], :, ..., :]
+// ```
 //
-// Returns Has same shape as data, except for dimension 0 which
-// has size `k`, the number of segments.
-func SparseSegmentSqrtN(scope *Scope, data tf.Output, indices tf.Output, segment_ids tf.Output) (output tf.Output) {
+// If `indices` is a permutation and `len(indices) == params.shape[0]` then
+// this operation will permute `params` accordingly.
+//
+// `validate_indices`: DEPRECATED. If this operation is assigned to CPU, values in
+// `indices` are always validated to be within range. If assigned to GPU,
+// out-of-bound indices result in safe but unspecified behavior, which may include
+// raising an error.
+//
+// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
+// <img style="width:100%" src="https://www.tensorflow.org/images/Gather.png" alt>
+// </div>
+func Gather(scope *Scope, params tf.Output, indices tf.Output, optional ...GatherAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "SparseSegmentSqrtN",
+		Type: "Gather",
 		Input: []tf.Input{
-			data, indices, segment_ids,
+			params, indices,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Component-wise divides a SparseTensor by a dense Tensor.
-//
-// *Limitation*: this Op only broadcasts the dense side to the sparse side, but not
-// the other direction.
+// Computes softsign gradients for a softsign operation.
 //
 // Arguments:
-//	sp_indices: 2-D.  `N x R` matrix with the indices of non-empty values in a
-// SparseTensor, possibly not in canonical ordering.
-//	sp_values: 1-D.  `N` non-empty values corresponding to `sp_indices`.
-//	sp_shape: 1-D.  Shape of the input SparseTensor.
-//	dense: `R`-D.  The dense Tensor operand.
+//	gradients: The backpropagated gradients to the corresponding softsign operation.
+//	features: The features passed as input to the corresponding softsign operation.
 //
-// Returns 1-D.  The `N` values that are operated on.
-func SparseDenseCwiseDiv(scope *Scope, sp_indices tf.Output, sp_values tf.Output, sp_shape tf.Output, dense tf.Output) (output tf.Output) {
+// Returns The gradients: `gradients / (1 + abs(-features)) ** 2`.
+func SoftsignGrad(scope *Scope, gradients tf.Output, features tf.Output) (backprops tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "SparseDenseCwiseDiv",
+		Type: "SoftsignGrad",
 		Input: []tf.Input{
-			sp_indices, sp_values, sp_shape, dense,
+			gradients, features,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Reads the value of a variable.
+// Compute the polygamma function \\(\psi^{(n)}(x)\\).
 //
-// The tensor returned by this operation is immutable.
+// The polygamma function is defined as:
 //
-// The value returned by this operation is guaranteed to be influenced by all the
-// writes on which this operation depends directly or indirectly, and to not be
-// influenced by any of the writes which depend directly or indirectly on this
-// operation.
 //
-// Arguments:
-//	resource: handle to the resource in which to store the variable.
-//	dtype: the dtype of the value.
-func ReadVariableOp(scope *Scope, resource tf.Output, dtype tf.DataType) (value tf.Output) {
+// \\(\psi^{(n)}(x) = \frac{d^n}{dx^n} \psi(x)\\)
+//
+// where \\(\psi(x)\\) is the digamma function.
+func Polygamma(scope *Scope, a tf.Output, x tf.Output) (z tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"dtype": dtype}
 	opspec := tf.OpSpec{
-		Type: "ReadVariableOp",
+		Type: "Polygamma",
 		Input: []tf.Input{
-			resource,
+			a, x,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// ProdAttr is an optional argument to Prod.
-type ProdAttr func(optionalAttr)
-
-// ProdKeepDims sets the optional keep_dims attribute to value.
+// Computes the grayscale dilation of 4-D `input` and 3-D `filter` tensors.
 //
-// value: If true, retain reduced dimensions with length 1.
-// If not specified, defaults to false
-func ProdKeepDims(value bool) ProdAttr {
-	return func(m optionalAttr) {
-		m["keep_dims"] = value
-	}
-}
-
-// Computes the product of elements across dimensions of a tensor.
+// The `input` tensor has shape `[batch, in_height, in_width, depth]` and the
+// `filter` tensor has shape `[filter_height, filter_width, depth]`, i.e., each
+// input channel is processed independently of the others with its own structuring
+// function. The `output` tensor has shape
+// `[batch, out_height, out_width, depth]`. The spatial dimensions of the output
+// tensor depend on the `padding` algorithm. We currently only support the default
+// "NHWC" `data_format`.
 //
-// Reduces `input` along the dimensions given in `reduction_indices`. Unless
-// `keep_dims` is true, the rank of the tensor is reduced by 1 for each entry in
-// `reduction_indices`. If `keep_dims` is true, the reduced dimensions are
-// retained with length 1.
+// In detail, the grayscale morphological 2-D dilation is the max-sum correlation
+// (for consistency with `conv2d`, we use unmirrored filters):
+//
+//     output[b, y, x, c] =
+//        max_{dy, dx} input[b,
+//                           strides[1] * y + rates[1] * dy,
+//                           strides[2] * x + rates[2] * dx,
+//                           c] +
+//                     filter[dy, dx, c]
+//
+// Max-pooling is a special case when the filter has size equal to the pooling
+// kernel size and contains all zeros.
+//
+// Note on duality: The dilation of `input` by the `filter` is equal to the
+// negation of the erosion of `-input` by the reflected `filter`.
 //
 // Arguments:
-//	input: The tensor to reduce.
-//	reduction_indices: The dimensions to reduce.
+//	input: 4-D with shape `[batch, in_height, in_width, depth]`.
+//	filter: 3-D with shape `[filter_height, filter_width, depth]`.
+//	strides: The stride of the sliding window for each dimension of the input
+// tensor. Must be: `[1, stride_height, stride_width, 1]`.
+//	rates: The input stride for atrous morphological dilation. Must be:
+// `[1, rate_height, rate_width, 1]`.
+//	padding: The type of padding algorithm to use.
 //
-// Returns The reduced tensor.
-func Prod(scope *Scope, input tf.Output, reduction_indices tf.Output, optional ...ProdAttr) (output tf.Output) {
+// Returns 4-D with shape `[batch, out_height, out_width, depth]`.
+func Dilation2D(scope *Scope, input tf.Output, filter tf.Output, strides []int64, rates []int64, padding string) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
+	attrs := map[string]interface{}{"strides": strides, "rates": rates, "padding": padding}
 	opspec := tf.OpSpec{
-		Type: "Prod",
+		Type: "Dilation2D",
 		Input: []tf.Input{
-			input, reduction_indices,
+			input, filter,
 		},
 		Attrs: attrs,
 	}
@@ -14229,33 +14863,33 @@ func Prod(scope *Scope, input tf.Output, reduction_indices tf.Output, optional .
 	return op.Output(0)
 }
 
-// ResizeBilinearAttr is an optional argument to ResizeBilinear.
-type ResizeBilinearAttr func(optionalAttr)
+// EncodeBase64Attr is an optional argument to EncodeBase64.
+type EncodeBase64Attr func(optionalAttr)
 
-// ResizeBilinearAlignCorners sets the optional align_corners attribute to value.
+// EncodeBase64Pad sets the optional pad attribute to value.
 //
-// value: If true, rescale input by (new_height - 1) / (height - 1), which
-// exactly aligns the 4 corners of images and resized images. If false, rescale
-// by new_height / height. Treat similarly the width dimension.
+// value: Bool whether padding is applied at the ends.
 // If not specified, defaults to false
-func ResizeBilinearAlignCorners(value bool) ResizeBilinearAttr {
+func EncodeBase64Pad(value bool) EncodeBase64Attr {
 	return func(m optionalAttr) {
-		m["align_corners"] = value
+		m["pad"] = value
 	}
 }
 
-// Resize `images` to `size` using bilinear interpolation.
+// Encode strings into web-safe base64 format.
 //
-// Input images can be of different types but output images are always float.
+// Refer to the following article for more information on base64 format:
+// en.wikipedia.org/wiki/Base64. Base64 strings may have padding with '=' at the
+// end so that the encoded has length multiple of 4. See Padding section of the
+// link above.
 //
-// Arguments:
-//	images: 4-D with shape `[batch, height, width, channels]`.
-//	size: = A 1-D int32 Tensor of 2 elements: `new_height, new_width`.  The
-// new size for the images.
+// Web-safe means that the encoder uses - and _ instead of + and /.
+//
+// Arguments:
+//	input: Strings to be encoded.
 //
-// Returns 4-D with shape
-// `[batch, new_height, new_width, channels]`.
-func ResizeBilinear(scope *Scope, images tf.Output, size tf.Output, optional ...ResizeBilinearAttr) (resized_images tf.Output) {
+// Returns Input strings encoded in base64.
+func EncodeBase64(scope *Scope, input tf.Output, optional ...EncodeBase64Attr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -14264,9 +14898,9 @@ func ResizeBilinear(scope *Scope, images tf.Output, size tf.Output, optional ...
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "ResizeBilinear",
+		Type: "EncodeBase64",
 		Input: []tf.Input{
-			images, size,
+			input,
 		},
 		Attrs: attrs,
 	}
@@ -14274,97 +14908,148 @@ func ResizeBilinear(scope *Scope, images tf.Output, size tf.Output, optional ...
 	return op.Output(0)
 }
 
-// Computes the absolute value of a tensor.
+// Produce a string tensor that encodes the state of a Reader.
 //
-// Given a tensor `x`, this operation returns a tensor containing the absolute
-// value of each element in `x`. For example, if x is an input element and y is
-// an output element, this operation computes \\(y = |x|\\).
-func Abs(scope *Scope, x tf.Output) (y tf.Output) {
+// Not all Readers support being serialized, so this can produce an
+// Unimplemented error.
+//
+// Arguments:
+//	reader_handle: Handle to a Reader.
+func ReaderSerializeStateV2(scope *Scope, reader_handle tf.Output) (state tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Abs",
+		Type: "ReaderSerializeStateV2",
 		Input: []tf.Input{
-			x,
+			reader_handle,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Restore a reader to a previously saved state.
+// Return substrings from `Tensor` of strings.
 //
-// Not all Readers support being restored, so this can produce an
-// Unimplemented error.
+// For each string in the input `Tensor`, creates a substring starting at index
+// `pos` with a total length of `len`.
+//
+// If `len` defines a substring that would extend beyond the length of the input
+// string, then as many characters as possible are used.
+//
+// If `pos` is negative or specifies a character index larger than any of the input
+// strings, then an `InvalidArgumentError` is thrown.
+//
+// `pos` and `len` must have the same shape, otherwise a `ValueError` is thrown on
+// Op creation.
+//
+// *NOTE*: `Substr` supports broadcasting up to two dimensions. More about
+// broadcasting
+// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+//
+// ---
+//
+// Examples
+//
+// Using scalar `pos` and `len`:
+//
+// ```python
+// input = [b'Hello', b'World']
+// position = 1
+// length = 3
+//
+// output = [b'ell', b'orl']
+// ```
+//
+// Using `pos` and `len` with same shape as `input`:
+//
+// ```python
+// input = [[b'ten', b'eleven', b'twelve'],
+//          [b'thirteen', b'fourteen', b'fifteen'],
+//          [b'sixteen', b'seventeen', b'eighteen']]
+// position = [[1, 2, 3],
+//             [1, 2, 3],
+//             [1, 2, 3]]
+// length =   [[2, 3, 4],
+//             [4, 3, 2],
+//             [5, 5, 5]]
+//
+// output = [[b'en', b'eve', b'lve'],
+//           [b'hirt', b'urt', b'te'],
+//           [b'ixtee', b'vente', b'hteen']]
+// ```
+//
+// Broadcasting `pos` and `len` onto `input`:
+//
+// ```
+// input = [[b'ten', b'eleven', b'twelve'],
+//          [b'thirteen', b'fourteen', b'fifteen'],
+//          [b'sixteen', b'seventeen', b'eighteen'],
+//          [b'nineteen', b'twenty', b'twentyone']]
+// position = [1, 2, 3]
+// length =   [1, 2, 3]
+//
+// output = [[b'e', b'ev', b'lve'],
+//           [b'h', b'ur', b'tee'],
+//           [b'i', b've', b'hte'],
+//           [b'i', b'en', b'nty']]
+// ```
+//
+// Broadcasting `input` onto `pos` and `len`:
+//
+// ```
+// input = b'thirteen'
+// position = [1, 5, 7]
+// length =   [3, 2, 1]
+//
+// output = [b'hir', b'ee', b'n"]
+// ```
 //
 // Arguments:
-//	reader_handle: Handle to a Reader.
-//	state: Result of a ReaderSerializeState of a Reader with type
-// matching reader_handle.
+//	input: Tensor of strings
+//	pos: Scalar defining the position of first character in each substring
+//	len: Scalar defining the number of characters to include in each substring
 //
-// Returns the created operation.
-func ReaderRestoreStateV2(scope *Scope, reader_handle tf.Output, state tf.Output) (o *tf.Operation) {
+// Returns Tensor of substrings
+func Substr(scope *Scope, input tf.Output, pos tf.Output, len tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "ReaderRestoreStateV2",
+		Type: "Substr",
 		Input: []tf.Input{
-			reader_handle, state,
+			input, pos, len,
 		},
 	}
-	return scope.AddOperation(opspec)
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// RandomPoissonAttr is an optional argument to RandomPoisson.
-type RandomPoissonAttr func(optionalAttr)
-
-// RandomPoissonSeed sets the optional seed attribute to value.
-//
-// value: If either `seed` or `seed2` are set to be non-zero, the random number
-// generator is seeded by the given seed.  Otherwise, it is seeded by a
-// random seed.
-// If not specified, defaults to 0
-func RandomPoissonSeed(value int64) RandomPoissonAttr {
-	return func(m optionalAttr) {
-		m["seed"] = value
-	}
-}
+// StatelessRandomNormalAttr is an optional argument to StatelessRandomNormal.
+type StatelessRandomNormalAttr func(optionalAttr)
 
-// RandomPoissonSeed2 sets the optional seed2 attribute to value.
+// StatelessRandomNormalDtype sets the optional dtype attribute to value.
 //
-// value: A second seed to avoid seed collision.
-// If not specified, defaults to 0
-func RandomPoissonSeed2(value int64) RandomPoissonAttr {
+// value: The type of the output.
+// If not specified, defaults to DT_FLOAT
+func StatelessRandomNormalDtype(value tf.DataType) StatelessRandomNormalAttr {
 	return func(m optionalAttr) {
-		m["seed2"] = value
+		m["dtype"] = value
 	}
 }
 
-// Outputs random values from the Poisson distribution(s) described by rate.
+// Outputs deterministic pseudorandom values from a normal distribution.
 //
-// This op uses two algorithms, depending on rate. If rate >= 10, then
-// the algorithm by Hormann is used to acquire samples via
-// transformation-rejection.
-// See http://www.sciencedirect.com/science/article/pii/0167668793909974.
+// The generated values will have mean 0 and standard deviation 1.
 //
-// Otherwise, Knuth's algorithm is used to acquire samples via multiplying uniform
-// random variables.
-// See Donald E. Knuth (1969). Seminumerical Algorithms. The Art of Computer
-// Programming, Volume 2. Addison Wesley
+// The outputs are a deterministic function of `shape` and `seed`.
 //
 // Arguments:
-//	shape: 1-D integer tensor. Shape of independent samples to draw from each
-// distribution described by the shape parameters given in rate.
-//	rate: A tensor in which each scalar is a "rate" parameter describing the
-// associated poisson distribution.
+//	shape: The shape of the output tensor.
+//	seed: 2 seeds (shape [2]).
 //
-// Returns A tensor with shape `shape + shape(rate)`. Each slice
-// `[:, ..., :, i0, i1, ...iN]` contains the samples drawn for
-// `rate[i0, i1, ...iN]`. The dtype of the output matches the dtype of
-// rate.
-func RandomPoisson(scope *Scope, shape tf.Output, rate tf.Output, optional ...RandomPoissonAttr) (output tf.Output) {
+// Returns Random values with specified shape.
+func StatelessRandomNormal(scope *Scope, shape tf.Output, seed tf.Output, optional ...StatelessRandomNormalAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -14373,9 +15058,9 @@ func RandomPoisson(scope *Scope, shape tf.Output, rate tf.Output, optional ...Ra
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "RandomPoisson",
+		Type: "StatelessRandomNormal",
 		Input: []tf.Input{
-			shape, rate,
+			shape, seed,
 		},
 		Attrs: attrs,
 	}
@@ -14383,139 +15068,88 @@ func RandomPoisson(scope *Scope, shape tf.Output, rate tf.Output, optional ...Ra
 	return op.Output(0)
 }
 
-// Applies softmax to a batched N-D `SparseTensor`.
-//
-// The inputs represent an N-D SparseTensor  with logical shape `[..., B, C]`
-// (where `N >= 2`), and with indices sorted in the canonical lexicographic order.
-//
-// This op is equivalent to applying the normal `tf.nn.softmax()` to each innermost
-// logical submatrix with shape `[B, C]`, but with the catch that *the implicitly
-// zero elements do not participate*.  Specifically, the algorithm is equivalent
-// to the following:
-//
-//   (1) Applies `tf.nn.softmax()` to a densified view of each innermost submatrix
-//       with shape `[B, C]`, along the size-C dimension;
-//   (2) Masks out the original implicitly-zero locations;
-//   (3) Renormalizes the remaining elements.
+// Inverse fast Fourier transform.
 //
-// Hence, the `SparseTensor` result has exactly the same non-zero indices and
-// shape.
+// Computes the inverse 1-dimensional discrete Fourier transform over the
+// inner-most dimension of `input`.
 //
 // Arguments:
-//	sp_indices: 2-D.  `NNZ x R` matrix with the indices of non-empty values in a
-// SparseTensor, in canonical ordering.
-//	sp_values: 1-D.  `NNZ` non-empty values corresponding to `sp_indices`.
-//	sp_shape: 1-D.  Shape of the input SparseTensor.
+//	input: A complex64 tensor.
 //
-// Returns 1-D.  The `NNZ` values for the result `SparseTensor`.
-func SparseSoftmax(scope *Scope, sp_indices tf.Output, sp_values tf.Output, sp_shape tf.Output) (output tf.Output) {
+// Returns A complex64 tensor of the same shape as `input`. The inner-most
+//   dimension of `input` is replaced with its inverse 1D Fourier transform.
+//
+// @compatibility(numpy)
+// Equivalent to np.fft.ifft
+// @end_compatibility
+func IFFT(scope *Scope, input tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "SparseSoftmax",
+		Type: "IFFT",
 		Input: []tf.Input{
-			sp_indices, sp_values, sp_shape,
+			input,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Computes gradients for SparseSegmentMean.
+// Computes the sum along sparse segments of a tensor divided by the sqrt of N.
 //
-// Returns tensor "output" with same shape as grad, except for dimension 0 whose
-// value is output_dim0.
+// N is the size of the segment being reduced.
 //
-// Arguments:
-//	grad: gradient propagated to the SparseSegmentMean op.
-//	indices: indices passed to the corresponding SparseSegmentMean op.
-//	segment_ids: segment_ids passed to the corresponding SparseSegmentMean op.
-//	output_dim0: dimension 0 of "data" passed to SparseSegmentMean op.
-func SparseSegmentMeanGrad(scope *Scope, grad tf.Output, indices tf.Output, segment_ids tf.Output, output_dim0 tf.Output) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "SparseSegmentMeanGrad",
-		Input: []tf.Input{
-			grad, indices, segment_ids, output_dim0,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// QuantizedReluXAttr is an optional argument to QuantizedReluX.
-type QuantizedReluXAttr func(optionalAttr)
-
-// QuantizedReluXOutType sets the optional out_type attribute to value.
-// If not specified, defaults to DT_QUINT8
-func QuantizedReluXOutType(value tf.DataType) QuantizedReluXAttr {
-	return func(m optionalAttr) {
-		m["out_type"] = value
-	}
-}
-
-// Computes Quantized Rectified Linear X: `min(max(features, 0), max_value)`
+// Read @{$math_ops#segmentation$the section on segmentation} for an explanation of
+// segments.
 //
 // Arguments:
 //
+//	indices: A 1-D tensor. Has same rank as `segment_ids`.
+//	segment_ids: A 1-D tensor. Values should be sorted and can be repeated.
 //
-//	min_features: The float value that the lowest quantized value represents.
-//	max_features: The float value that the highest quantized value represents.
-//
-// Returns Has the same output shape as "features".The float value that the lowest quantized value represents.The float value that the highest quantized value represents.
-func QuantizedReluX(scope *Scope, features tf.Output, max_value tf.Output, min_features tf.Output, max_features tf.Output, optional ...QuantizedReluXAttr) (activations tf.Output, min_activations tf.Output, max_activations tf.Output) {
+// Returns Has same shape as data, except for dimension 0 which
+// has size `k`, the number of segments.
+func SparseSegmentSqrtN(scope *Scope, data tf.Output, indices tf.Output, segment_ids tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "QuantizedReluX",
+		Type: "SparseSegmentSqrtN",
 		Input: []tf.Input{
-			features, max_value, min_features, max_features,
+			data, indices, segment_ids,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
+	return op.Output(0)
 }
 
-// MergeV2CheckpointsAttr is an optional argument to MergeV2Checkpoints.
-type MergeV2CheckpointsAttr func(optionalAttr)
+// ResizeBilinearGradAttr is an optional argument to ResizeBilinearGrad.
+type ResizeBilinearGradAttr func(optionalAttr)
 
-// MergeV2CheckpointsDeleteOldDirs sets the optional delete_old_dirs attribute to value.
+// ResizeBilinearGradAlignCorners sets the optional align_corners attribute to value.
 //
-// value: see above.
-// If not specified, defaults to true
-func MergeV2CheckpointsDeleteOldDirs(value bool) MergeV2CheckpointsAttr {
+// value: If true, rescale grads by (orig_height - 1) / (height - 1), which
+// exactly aligns the 4 corners of grads and original_image. If false, rescale by
+// orig_height / height. Treat similarly the width dimension.
+// If not specified, defaults to false
+func ResizeBilinearGradAlignCorners(value bool) ResizeBilinearGradAttr {
 	return func(m optionalAttr) {
-		m["delete_old_dirs"] = value
+		m["align_corners"] = value
 	}
 }
 
-// V2 format specific: merges the metadata files of sharded checkpoints.  The
-//
-// result is one logical checkpoint, with one physical metadata file and renamed
-// data files.
-//
-// Intended for "grouping" multiple checkpoints in a sharded checkpoint setup.
-//
-// If delete_old_dirs is true, attempts to delete recursively the dirname of each
-// path in the input checkpoint_prefixes.  This is useful when those paths are non
-// user-facing temporary locations.
+// Computes the gradient of bilinear interpolation.
 //
 // Arguments:
-//	checkpoint_prefixes: prefixes of V2 checkpoints to merge.
-//	destination_prefix: scalar.  The desired final prefix.  Allowed to be the same
-// as one of the checkpoint_prefixes.
+//	grads: 4-D with shape `[batch, height, width, channels]`.
+//	original_image: 4-D with shape `[batch, orig_height, orig_width, channels]`,
+// The image tensor that was resized.
 //
-// Returns the created operation.
-func MergeV2Checkpoints(scope *Scope, checkpoint_prefixes tf.Output, destination_prefix tf.Output, optional ...MergeV2CheckpointsAttr) (o *tf.Operation) {
+// Returns 4-D with shape `[batch, orig_height, orig_width, channels]`.
+// Gradients with respect to the input image. Input image must have been
+// float or double.
+func ResizeBilinearGrad(scope *Scope, grads tf.Output, original_image tf.Output, optional ...ResizeBilinearGradAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -14524,267 +15158,182 @@ func MergeV2Checkpoints(scope *Scope, checkpoint_prefixes tf.Output, destination
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "MergeV2Checkpoints",
+		Type: "ResizeBilinearGrad",
 		Input: []tf.Input{
-			checkpoint_prefixes, destination_prefix,
+			grads, original_image,
 		},
 		Attrs: attrs,
 	}
-	return scope.AddOperation(opspec)
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// UnpackAttr is an optional argument to Unpack.
-type UnpackAttr func(optionalAttr)
-
-// UnpackAxis sets the optional axis attribute to value.
+// Computes the number of elements in the given table.
 //
-// value: Dimension along which to unpack.  Negative values wrap around, so the
-// valid range is `[-R, R)`.
-// If not specified, defaults to 0
-func UnpackAxis(value int64) UnpackAttr {
-	return func(m optionalAttr) {
-		m["axis"] = value
+// Arguments:
+//	table_handle: Handle to the table.
+//
+// Returns Scalar that contains number of elements in the table.
+func LookupTableSizeV2(scope *Scope, table_handle tf.Output) (size tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "LookupTableSizeV2",
+		Input: []tf.Input{
+			table_handle,
+		},
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// Unpacks a given dimension of a rank-`R` tensor into `num` rank-`(R-1)` tensors.
-//
-// Unpacks `num` tensors from `value` by chipping it along the `axis` dimension.
-// For example, given a tensor of shape `(A, B, C, D)`;
-//
-// If `axis == 0` then the i'th tensor in `output` is the slice `value[i, :, :, :]`
-//   and each tensor in `output` will have shape `(B, C, D)`. (Note that the
-//   dimension unpacked along is gone, unlike `split`).
-//
-// If `axis == 1` then the i'th tensor in `output` is the slice `value[:, i, :, :]`
-//   and each tensor in `output` will have shape `(A, C, D)`.
-// Etc.
+// Component-wise divides a SparseTensor by a dense Tensor.
 //
-// This is the opposite of `pack`.
+// *Limitation*: this Op only broadcasts the dense side to the sparse side, but not
+// the other direction.
 //
 // Arguments:
-//	value: 1-D or higher, with `axis` dimension size equal to `num`.
-//
+//	sp_indices: 2-D.  `N x R` matrix with the indices of non-empty values in a
+// SparseTensor, possibly not in canonical ordering.
+//	sp_values: 1-D.  `N` non-empty values corresponding to `sp_indices`.
+//	sp_shape: 1-D.  Shape of the input SparseTensor.
+//	dense: `R`-D.  The dense Tensor operand.
 //
-// Returns The list of tensors unpacked from `value`.
-func Unpack(scope *Scope, value tf.Output, num int64, optional ...UnpackAttr) (output []tf.Output) {
+// Returns 1-D.  The `N` values that are operated on.
+func SparseDenseCwiseDiv(scope *Scope, sp_indices tf.Output, sp_values tf.Output, sp_shape tf.Output, dense tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"num": num}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "Unpack",
+		Type: "SparseDenseCwiseDiv",
 		Input: []tf.Input{
-			value,
+			sp_indices, sp_values, sp_shape, dense,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	if scope.Err() != nil {
-		return
-	}
-	var idx int
-	var err error
-	if output, idx, err = makeOutputList(op, idx, "output"); err != nil {
-		scope.UpdateErr("Unpack", err)
-		return
-	}
-	return output
+	return op.Output(0)
 }
 
-// Split a `SparseTensor` into `num_split` tensors along one dimension.
-//
-// If the `shape[split_dim]` is not an integer multiple of `num_split`. Slices
-// `[0 : shape[split_dim] % num_split]` gets one extra dimension.
-// For example, if `split_dim = 1` and `num_split = 2` and the input is
-//
-//     input_tensor = shape = [2, 7]
-//     [    a   d e  ]
-//     [b c          ]
-//
-// Graphically the output tensors are:
+// Reads the value of a variable.
 //
-//     output_tensor[0] = shape = [2, 4]
-//     [    a  ]
-//     [b c    ]
+// The tensor returned by this operation is immutable.
 //
-//     output_tensor[1] = shape = [2, 3]
-//     [ d e  ]
-//     [      ]
+// The value returned by this operation is guaranteed to be influenced by all the
+// writes on which this operation depends directly or indirectly, and to not be
+// influenced by any of the writes which depend directly or indirectly on this
+// operation.
 //
 // Arguments:
-//	split_dim: 0-D.  The dimension along which to split.  Must be in the range
-// `[0, rank(shape))`.
-//	indices: 2-D tensor represents the indices of the sparse tensor.
-//	values: 1-D tensor represents the values of the sparse tensor.
-//	shape: 1-D. tensor represents the shape of the sparse tensor.
-// output indices: A list of 1-D tensors represents the indices of the output
-// sparse tensors.
-//	num_split: The number of ways to split.
-//
-// Returns A list of 1-D tensors represents the values of the output sparse
-// tensors.A list of 1-D tensors represents the shape of the output sparse
-// tensors.
-func SparseSplit(scope *Scope, split_dim tf.Output, indices tf.Output, values tf.Output, shape tf.Output, num_split int64) (output_indices []tf.Output, output_values []tf.Output, output_shape []tf.Output) {
+//	resource: handle to the resource in which to store the variable.
+//	dtype: the dtype of the value.
+func ReadVariableOp(scope *Scope, resource tf.Output, dtype tf.DataType) (value tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"num_split": num_split}
+	attrs := map[string]interface{}{"dtype": dtype}
 	opspec := tf.OpSpec{
-		Type: "SparseSplit",
+		Type: "ReadVariableOp",
 		Input: []tf.Input{
-			split_dim, indices, values, shape,
+			resource,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	if scope.Err() != nil {
-		return
-	}
-	var idx int
-	var err error
-	if output_indices, idx, err = makeOutputList(op, idx, "output_indices"); err != nil {
-		scope.UpdateErr("SparseSplit", err)
-		return
-	}
-	if output_values, idx, err = makeOutputList(op, idx, "output_values"); err != nil {
-		scope.UpdateErr("SparseSplit", err)
-		return
-	}
-	if output_shape, idx, err = makeOutputList(op, idx, "output_shape"); err != nil {
-		scope.UpdateErr("SparseSplit", err)
-		return
-	}
-	return output_indices, output_values, output_shape
+	return op.Output(0)
 }
 
-// ReduceJoinAttr is an optional argument to ReduceJoin.
-type ReduceJoinAttr func(optionalAttr)
-
-// ReduceJoinKeepDims sets the optional keep_dims attribute to value.
+// Computes the absolute value of a tensor.
 //
-// value: If `True`, retain reduced dimensions with length `1`.
-// If not specified, defaults to false
-func ReduceJoinKeepDims(value bool) ReduceJoinAttr {
-	return func(m optionalAttr) {
-		m["keep_dims"] = value
+// Given a tensor `x`, this operation returns a tensor containing the absolute
+// value of each element in `x`. For example, if x is an input element and y is
+// an output element, this operation computes \\(y = |x|\\).
+func Abs(scope *Scope, x tf.Output) (y tf.Output) {
+	if scope.Err() != nil {
+		return
 	}
-}
-
-// ReduceJoinSeparator sets the optional separator attribute to value.
-//
-// value: The separator to use when joining.
-// If not specified, defaults to ""
-func ReduceJoinSeparator(value string) ReduceJoinAttr {
-	return func(m optionalAttr) {
-		m["separator"] = value
+	opspec := tf.OpSpec{
+		Type: "Abs",
+		Input: []tf.Input{
+			x,
+		},
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// Joins a string Tensor across the given dimensions.
-//
-// Computes the string join across dimensions in the given string Tensor of shape
-// `[d_0, d_1, ..., d_n-1]`.  Returns a new Tensor created by joining the input
-// strings with the given separator (default: empty string).  Negative indices are
-// counted backwards from the end, with `-1` being equivalent to `n - 1`.
-//
-// For example:
+// Restore a reader to a previously saved state.
 //
-// ```python
-// # tensor `a` is [["a", "b"], ["c", "d"]]
-// tf.reduce_join(a, 0) ==> ["ac", "bd"]
-// tf.reduce_join(a, 1) ==> ["ab", "cd"]
-// tf.reduce_join(a, -2) = tf.reduce_join(a, 0) ==> ["ac", "bd"]
-// tf.reduce_join(a, -1) = tf.reduce_join(a, 1) ==> ["ab", "cd"]
-// tf.reduce_join(a, 0, keep_dims=True) ==> [["ac", "bd"]]
-// tf.reduce_join(a, 1, keep_dims=True) ==> [["ab"], ["cd"]]
-// tf.reduce_join(a, 0, separator=".") ==> ["a.c", "b.d"]
-// tf.reduce_join(a, [0, 1]) ==> ["acbd"]
-// tf.reduce_join(a, [1, 0]) ==> ["abcd"]
-// tf.reduce_join(a, []) ==> ["abcd"]
-// ```
+// Not all Readers support being restored, so this can produce an
+// Unimplemented error.
 //
 // Arguments:
-//	inputs: The input to be joined.  All reduced indices must have non-zero size.
-//	reduction_indices: The dimensions to reduce over.  Dimensions are reduced in the
-// order specified.  Omitting `reduction_indices` is equivalent to passing
-// `[n-1, n-2, ..., 0]`.  Negative indices from `-n` to `-1` are supported.
+//	reader_handle: Handle to a Reader.
+//	state: Result of a ReaderSerializeState of a Reader with type
+// matching reader_handle.
 //
-// Returns Has shape equal to that of the input with reduced dimensions removed or
-// set to `1` depending on `keep_dims`.
-func ReduceJoin(scope *Scope, inputs tf.Output, reduction_indices tf.Output, optional ...ReduceJoinAttr) (output tf.Output) {
+// Returns the created operation.
+func ReaderRestoreStateV2(scope *Scope, reader_handle tf.Output, state tf.Output) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "ReduceJoin",
+		Type: "ReaderRestoreStateV2",
 		Input: []tf.Input{
-			inputs, reduction_indices,
+			reader_handle, state,
 		},
-		Attrs: attrs,
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return scope.AddOperation(opspec)
 }
 
-// SvdAttr is an optional argument to Svd.
-type SvdAttr func(optionalAttr)
+// RandomPoissonAttr is an optional argument to RandomPoisson.
+type RandomPoissonAttr func(optionalAttr)
 
-// SvdComputeUv sets the optional compute_uv attribute to value.
+// RandomPoissonSeed sets the optional seed attribute to value.
 //
-// value: If true, left and right singular vectors will be
-// computed and returned in `u` and `v`, respectively.
-// If false, `u` and `v` are not set and should never referenced.
-// If not specified, defaults to true
-func SvdComputeUv(value bool) SvdAttr {
+// value: If either `seed` or `seed2` are set to be non-zero, the random number
+// generator is seeded by the given seed.  Otherwise, it is seeded by a
+// random seed.
+// If not specified, defaults to 0
+func RandomPoissonSeed(value int64) RandomPoissonAttr {
 	return func(m optionalAttr) {
-		m["compute_uv"] = value
+		m["seed"] = value
 	}
 }
 
-// SvdFullMatrices sets the optional full_matrices attribute to value.
+// RandomPoissonSeed2 sets the optional seed2 attribute to value.
 //
-// value: If true, compute full-sized `u` and `v`. If false
-// (the default), compute only the leading `P` singular vectors.
-// Ignored if `compute_uv` is `False`.
-// If not specified, defaults to false
-func SvdFullMatrices(value bool) SvdAttr {
+// value: A second seed to avoid seed collision.
+// If not specified, defaults to 0
+func RandomPoissonSeed2(value int64) RandomPoissonAttr {
 	return func(m optionalAttr) {
-		m["full_matrices"] = value
+		m["seed2"] = value
 	}
 }
 
-// Computes the singular value decompositions of one or more matrices.
+// Outputs random values from the Poisson distribution(s) described by rate.
 //
-// Computes the SVD of each inner matrix in `input` such that
-// `input[..., :, :] = u[..., :, :] * diag(s[..., :, :]) * transpose(v[..., :, :])`
+// This op uses two algorithms, depending on rate. If rate >= 10, then
+// the algorithm by Hormann is used to acquire samples via
+// transformation-rejection.
+// See http://www.sciencedirect.com/science/article/pii/0167668793909974.
 //
-// ```prettyprint
-// # a is a tensor containing a batch of matrices.
-// # s is a tensor of singular values for each matrix.
-// # u is the tensor containing of left singular vectors for each matrix.
-// # v is the tensor containing of right singular vectors for each matrix.
-// s, u, v = svd(a)
-// s, _, _ = svd(a, compute_uv=False)
-// ```
+// Otherwise, Knuth's algorithm is used to acquire samples via multiplying uniform
+// random variables.
+// See Donald E. Knuth (1969). Seminumerical Algorithms. The Art of Computer
+// Programming, Volume 2. Addison Wesley
 //
 // Arguments:
-//	input: A tensor of shape `[..., M, N]` whose inner-most 2 dimensions
-// form matrices of size `[M, N]`. Let `P` be the minimum of `M` and `N`.
+//	shape: 1-D integer tensor. Shape of independent samples to draw from each
+// distribution described by the shape parameters given in rate.
+//	rate: A tensor in which each scalar is a "rate" parameter describing the
+// associated poisson distribution.
 //
-// Returns Singular values. Shape is `[..., P]`.Left singular vectors. If `full_matrices` is `False` then shape is
-// `[..., M, P]`; if `full_matrices` is `True` then shape is
-// `[..., M, M]`. Undefined if `compute_uv` is `False`.Left singular vectors. If `full_matrices` is `False` then shape is
-// `[..., N, P]`. If `full_matrices` is `True` then shape is `[..., N, N]`.
-// Undefined if `compute_uv` is false.
-func Svd(scope *Scope, input tf.Output, optional ...SvdAttr) (s tf.Output, u tf.Output, v tf.Output) {
+// Returns A tensor with shape `shape + shape(rate)`. Each slice
+// `[:, ..., :, i0, i1, ...iN]` contains the samples drawn for
+// `rate[i0, i1, ...iN]`. The dtype of the output matches the dtype of
+// rate.
+func RandomPoisson(scope *Scope, shape tf.Output, rate tf.Output, optional ...RandomPoissonAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -14793,104 +15342,174 @@ func Svd(scope *Scope, input tf.Output, optional ...SvdAttr) (s tf.Output, u tf.
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "Svd",
+		Type: "RandomPoisson",
 		Input: []tf.Input{
-			input,
+			shape, rate,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
+	return op.Output(0)
 }
 
-// AssertAttr is an optional argument to Assert.
-type AssertAttr func(optionalAttr)
-
-// AssertSummarize sets the optional summarize attribute to value.
+// Applies softmax to a batched N-D `SparseTensor`.
 //
-// value: Print this many entries of each tensor.
-// If not specified, defaults to 3
-func AssertSummarize(value int64) AssertAttr {
-	return func(m optionalAttr) {
-		m["summarize"] = value
+// The inputs represent an N-D SparseTensor  with logical shape `[..., B, C]`
+// (where `N >= 2`), and with indices sorted in the canonical lexicographic order.
+//
+// This op is equivalent to applying the normal `tf.nn.softmax()` to each innermost
+// logical submatrix with shape `[B, C]`, but with the catch that *the implicitly
+// zero elements do not participate*.  Specifically, the algorithm is equivalent
+// to the following:
+//
+//   (1) Applies `tf.nn.softmax()` to a densified view of each innermost submatrix
+//       with shape `[B, C]`, along the size-C dimension;
+//   (2) Masks out the original implicitly-zero locations;
+//   (3) Renormalizes the remaining elements.
+//
+// Hence, the `SparseTensor` result has exactly the same non-zero indices and
+// shape.
+//
+// Arguments:
+//	sp_indices: 2-D.  `NNZ x R` matrix with the indices of non-empty values in a
+// SparseTensor, in canonical ordering.
+//	sp_values: 1-D.  `NNZ` non-empty values corresponding to `sp_indices`.
+//	sp_shape: 1-D.  Shape of the input SparseTensor.
+//
+// Returns 1-D.  The `NNZ` values for the result `SparseTensor`.
+func SparseSoftmax(scope *Scope, sp_indices tf.Output, sp_values tf.Output, sp_shape tf.Output) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "SparseSoftmax",
+		Input: []tf.Input{
+			sp_indices, sp_values, sp_shape,
+		},
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// Asserts that the given condition is true.
+// Computes gradients for SparseSegmentMean.
 //
-// If `condition` evaluates to false, print the list of tensors in `data`.
-// `summarize` determines how many entries of the tensors to print.
+// Returns tensor "output" with same shape as grad, except for dimension 0 whose
+// value is output_dim0.
 //
 // Arguments:
-//	condition: The condition to evaluate.
-//	data: The tensors to print out when condition is false.
-//
-// Returns the created operation.
-func Assert(scope *Scope, condition tf.Output, data []tf.Output, optional ...AssertAttr) (o *tf.Operation) {
+//	grad: gradient propagated to the SparseSegmentMean op.
+//	indices: indices passed to the corresponding SparseSegmentMean op.
+//	segment_ids: segment_ids passed to the corresponding SparseSegmentMean op.
+//	output_dim0: dimension 0 of "data" passed to SparseSegmentMean op.
+func SparseSegmentMeanGrad(scope *Scope, grad tf.Output, indices tf.Output, segment_ids tf.Output, output_dim0 tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "Assert",
+		Type: "SparseSegmentMeanGrad",
 		Input: []tf.Input{
-			condition, tf.OutputList(data),
+			grad, indices, segment_ids, output_dim0,
 		},
-		Attrs: attrs,
 	}
-	return scope.AddOperation(opspec)
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// RandomUniformAttr is an optional argument to RandomUniform.
-type RandomUniformAttr func(optionalAttr)
-
-// RandomUniformSeed sets the optional seed attribute to value.
+// Converts one or more images from RGB to HSV.
 //
-// value: If either `seed` or `seed2` are set to be non-zero, the random number
-// generator is seeded by the given seed.  Otherwise, it is seeded by a
-// random seed.
-// If not specified, defaults to 0
-func RandomUniformSeed(value int64) RandomUniformAttr {
-	return func(m optionalAttr) {
-		m["seed"] = value
+// Outputs a tensor of the same shape as the `images` tensor, containing the HSV
+// value of the pixels. The output is only well defined if the value in `images`
+// are in `[0,1]`.
+//
+// `output[..., 0]` contains hue, `output[..., 1]` contains saturation, and
+// `output[..., 2]` contains value. All HSV values are in `[0,1]`. A hue of 0
+// corresponds to pure red, hue 1/3 is pure green, and 2/3 is pure blue.
+//
+// Arguments:
+//	images: 1-D or higher rank. RGB data to convert. Last dimension must be size 3.
+//
+// Returns `images` converted to HSV.
+func RGBToHSV(scope *Scope, images tf.Output) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "RGBToHSV",
+		Input: []tf.Input{
+			images,
+		},
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// RandomUniformSeed2 sets the optional seed2 attribute to value.
-//
-// value: A second seed to avoid seed collision.
-// If not specified, defaults to 0
-func RandomUniformSeed2(value int64) RandomUniformAttr {
+// MatrixSolveLsAttr is an optional argument to MatrixSolveLs.
+type MatrixSolveLsAttr func(optionalAttr)
+
+// MatrixSolveLsFast sets the optional fast attribute to value.
+// If not specified, defaults to true
+func MatrixSolveLsFast(value bool) MatrixSolveLsAttr {
 	return func(m optionalAttr) {
-		m["seed2"] = value
+		m["fast"] = value
 	}
 }
 
-// Outputs random values from a uniform distribution.
+// Solves one or more linear least-squares problems.
 //
-// The generated values follow a uniform distribution in the range `[0, 1)`. The
-// lower bound 0 is included in the range, while the upper bound 1 is excluded.
+// `matrix` is a tensor of shape `[..., M, N]` whose inner-most 2 dimensions
+// form matrices of size `[M, N]`. Rhs is a tensor of shape `[..., M, K]`.
+// The output is a tensor shape `[..., N, K]` where each output matrix solves
+// each of the equations matrix[..., :, :] * output[..., :, :] = rhs[..., :, :]
+// in the least squares sense.
+//
+// matrix and right-hand sides in the batch:
+//
+// `matrix`=\\(A \in \Re^{m \times n}\\),
+// `rhs`=\\(B  \in \Re^{m \times k}\\),
+// `output`=\\(X  \in \Re^{n \times k}\\),
+// `l2_regularizer`=\\(\lambda\\).
+//
+// If `fast` is `True`, then the solution is computed by solving the normal
+// equations using Cholesky decomposition. Specifically, if \\(m \ge n\\) then
+// \\(X = (A^T A + \lambda I)^{-1} A^T B\\), which solves the least-squares
+// problem \\(X = \mathrm{argmin}_{Z \in \Re^{n \times k} } ||A Z - B||_F^2 +
+// \lambda ||Z||_F^2\\). If \\(m \lt n\\) then `output` is computed as
+// \\(X = A^T (A A^T + \lambda I)^{-1} B\\), which (for \\(\lambda = 0\\)) is the
+// minimum-norm solution to the under-determined linear system, i.e.
+// \\(X = \mathrm{argmin}_{Z \in \Re^{n \times k} } ||Z||_F^2 \\), subject to
+// \\(A Z = B\\). Notice that the fast path is only numerically stable when
+// \\(A\\) is numerically full rank and has a condition number
+// \\(\mathrm{cond}(A) \lt \frac{1}{\sqrt{\epsilon_{mach} } }\\) or\\(\lambda\\) is
+// sufficiently large.
+//
+// If `fast` is `False` an algorithm based on the numerically robust complete
+// orthogonal decomposition is used. This computes the minimum-norm
+// least-squares solution, even when \\(A\\) is rank deficient. This path is
+// typically 6-7 times slower than the fast path. If `fast` is `False` then
+// `l2_regularizer` is ignored.
 //
 // Arguments:
-//	shape: The shape of the output tensor.
-//	dtype: The type of the output.
+//	matrix: Shape is `[..., M, N]`.
+//	rhs: Shape is `[..., M, K]`.
+//	l2_regularizer: Scalar tensor.
 //
-// Returns A tensor of the specified shape filled with uniform random values.
-func RandomUniform(scope *Scope, shape tf.Output, dtype tf.DataType, optional ...RandomUniformAttr) (output tf.Output) {
+// @compatibility(numpy)
+// Equivalent to np.linalg.lstsq
+// @end_compatibility
+//
+// Returns Shape is `[..., N, K]`.
+func MatrixSolveLs(scope *Scope, matrix tf.Output, rhs tf.Output, l2_regularizer tf.Output, optional ...MatrixSolveLsAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"dtype": dtype}
+	attrs := map[string]interface{}{}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "RandomUniform",
+		Type: "MatrixSolveLs",
 		Input: []tf.Input{
-			shape,
+			matrix, rhs, l2_regularizer,
 		},
 		Attrs: attrs,
 	}
@@ -14898,41 +15517,27 @@ func RandomUniform(scope *Scope, shape tf.Output, dtype tf.DataType, optional ..
 	return op.Output(0)
 }
 
-// ResourceApplyFtrlAttr is an optional argument to ResourceApplyFtrl.
-type ResourceApplyFtrlAttr func(optionalAttr)
+// QuantizedReluXAttr is an optional argument to QuantizedReluX.
+type QuantizedReluXAttr func(optionalAttr)
 
-// ResourceApplyFtrlUseLocking sets the optional use_locking attribute to value.
-//
-// value: If `True`, updating of the var and accum tensors will be protected
-// by a lock; otherwise the behavior is undefined, but may exhibit less
-// contention.
-// If not specified, defaults to false
-func ResourceApplyFtrlUseLocking(value bool) ResourceApplyFtrlAttr {
+// QuantizedReluXOutType sets the optional out_type attribute to value.
+// If not specified, defaults to DT_QUINT8
+func QuantizedReluXOutType(value tf.DataType) QuantizedReluXAttr {
 	return func(m optionalAttr) {
-		m["use_locking"] = value
+		m["out_type"] = value
 	}
 }
 
-// Update '*var' according to the Ftrl-proximal scheme.
-//
-// accum_new = accum + grad * grad
-// linear += grad + (accum_new^(-lr_power) - accum^(-lr_power)) / lr * var
-// quadratic = 1.0 / (accum_new^(lr_power) * lr) + 2 * l2
-// var = (sign(linear) * l1 - linear) / quadratic if |linear| > l1 else 0.0
-// accum = accum_new
+// Computes Quantized Rectified Linear X: `min(max(features, 0), max_value)`
 //
 // Arguments:
-//	var_: Should be from a Variable().
-//	accum: Should be from a Variable().
-//	linear: Should be from a Variable().
-//	grad: The gradient.
-//	lr: Scaling factor. Must be a scalar.
-//	l1: L1 regulariation. Must be a scalar.
-//	l2: L2 regulariation. Must be a scalar.
-//	lr_power: Scaling factor. Must be a scalar.
 //
-// Returns the created operation.
-func ResourceApplyFtrl(scope *Scope, var_ tf.Output, accum tf.Output, linear tf.Output, grad tf.Output, lr tf.Output, l1 tf.Output, l2 tf.Output, lr_power tf.Output, optional ...ResourceApplyFtrlAttr) (o *tf.Operation) {
+//
+//	min_features: The float value that the lowest quantized value represents.
+//	max_features: The float value that the highest quantized value represents.
+//
+// Returns Has the same output shape as "features".The float value that the lowest quantized value represents.The float value that the highest quantized value represents.
+func QuantizedReluX(scope *Scope, features tf.Output, max_value tf.Output, min_features tf.Output, max_features tf.Output, optional ...QuantizedReluXAttr) (activations tf.Output, min_activations tf.Output, max_activations tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -14941,41 +15546,47 @@ func ResourceApplyFtrl(scope *Scope, var_ tf.Output, accum tf.Output, linear tf.
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "ResourceApplyFtrl",
+		Type: "QuantizedReluX",
 		Input: []tf.Input{
-			var_, accum, linear, grad, lr, l1, l2, lr_power,
+			features, max_value, min_features, max_features,
 		},
 		Attrs: attrs,
 	}
-	return scope.AddOperation(opspec)
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// AnyAttr is an optional argument to Any.
-type AnyAttr func(optionalAttr)
+// MergeV2CheckpointsAttr is an optional argument to MergeV2Checkpoints.
+type MergeV2CheckpointsAttr func(optionalAttr)
 
-// AnyKeepDims sets the optional keep_dims attribute to value.
+// MergeV2CheckpointsDeleteOldDirs sets the optional delete_old_dirs attribute to value.
 //
-// value: If true, retain reduced dimensions with length 1.
-// If not specified, defaults to false
-func AnyKeepDims(value bool) AnyAttr {
+// value: see above.
+// If not specified, defaults to true
+func MergeV2CheckpointsDeleteOldDirs(value bool) MergeV2CheckpointsAttr {
 	return func(m optionalAttr) {
-		m["keep_dims"] = value
+		m["delete_old_dirs"] = value
 	}
 }
 
-// Computes the "logical or" of elements across dimensions of a tensor.
+// V2 format specific: merges the metadata files of sharded checkpoints.  The
 //
-// Reduces `input` along the dimensions given in `reduction_indices`. Unless
-// `keep_dims` is true, the rank of the tensor is reduced by 1 for each entry in
-// `reduction_indices`. If `keep_dims` is true, the reduced dimensions are
-// retained with length 1.
+// result is one logical checkpoint, with one physical metadata file and renamed
+// data files.
+//
+// Intended for "grouping" multiple checkpoints in a sharded checkpoint setup.
+//
+// If delete_old_dirs is true, attempts to delete recursively the dirname of each
+// path in the input checkpoint_prefixes.  This is useful when those paths are non
+// user-facing temporary locations.
 //
 // Arguments:
-//	input: The tensor to reduce.
-//	reduction_indices: The dimensions to reduce.
+//	checkpoint_prefixes: prefixes of V2 checkpoints to merge.
+//	destination_prefix: scalar.  The desired final prefix.  Allowed to be the same
+// as one of the checkpoint_prefixes.
 //
-// Returns The reduced tensor.
-func Any(scope *Scope, input tf.Output, reduction_indices tf.Output, optional ...AnyAttr) (output tf.Output) {
+// Returns the created operation.
+func MergeV2Checkpoints(scope *Scope, checkpoint_prefixes tf.Output, destination_prefix tf.Output, optional ...MergeV2CheckpointsAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
@@ -14984,310 +15595,267 @@ func Any(scope *Scope, input tf.Output, reduction_indices tf.Output, optional ..
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "Any",
+		Type: "MergeV2Checkpoints",
 		Input: []tf.Input{
-			input, reduction_indices,
+			checkpoint_prefixes, destination_prefix,
 		},
 		Attrs: attrs,
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return scope.AddOperation(opspec)
 }
 
-// Compute the Hurwitz zeta function \\(\zeta(x, q)\\).
-//
-// The Hurwitz zeta function is defined as:
+// UnpackAttr is an optional argument to Unpack.
+type UnpackAttr func(optionalAttr)
+
+// UnpackAxis sets the optional axis attribute to value.
 //
-// ```
-// \zeta(x, q) = \sum_{n=0}^{\infty} (q + n)^{-x}
-// ```
-func Zeta(scope *Scope, x tf.Output, q tf.Output) (z tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "Zeta",
-		Input: []tf.Input{
-			x, q,
-		},
+// value: Dimension along which to unpack.  Negative values wrap around, so the
+// valid range is `[-R, R)`.
+// If not specified, defaults to 0
+func UnpackAxis(value int64) UnpackAttr {
+	return func(m optionalAttr) {
+		m["axis"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// Inverse real-valued fast Fourier transform.
+// Unpacks a given dimension of a rank-`R` tensor into `num` rank-`(R-1)` tensors.
 //
-// Computes the inverse 1-dimensional discrete Fourier transform of a real-valued
-// signal over the inner-most dimension of `input`.
+// Unpacks `num` tensors from `value` by chipping it along the `axis` dimension.
+// For example, given a tensor of shape `(A, B, C, D)`;
 //
-// The inner-most dimension of `input` is assumed to be the result of `RFFT`: the
-// `fft_length / 2 + 1` unique components of the DFT of a real-valued signal. If
-// `fft_length` is not provided, it is computed from the size of the inner-most
-// dimension of `input` (`fft_length = 2 * (inner - 1)`). If the FFT length used to
-// compute `input` is odd, it should be provided since it cannot be inferred
-// properly.
+// If `axis == 0` then the i'th tensor in `output` is the slice `value[i, :, :, :]`
+//   and each tensor in `output` will have shape `(B, C, D)`. (Note that the
+//   dimension unpacked along is gone, unlike `split`).
+//
+// If `axis == 1` then the i'th tensor in `output` is the slice `value[:, i, :, :]`
+//   and each tensor in `output` will have shape `(A, C, D)`.
+// Etc.
+//
+// This is the opposite of `pack`.
 //
 // Arguments:
-//	input: A complex64 tensor.
-//	fft_length: An int32 tensor of shape [1]. The FFT length.
+//	value: 1-D or higher, with `axis` dimension size equal to `num`.
 //
-// Returns A float32 tensor of the same rank as `input`. The inner-most
-//   dimension of `input` is replaced with the `fft_length` samples of its inverse
-//   1D Fourier transform.
 //
-// @compatibility(numpy)
-// Equivalent to np.fft.irfft
-// @end_compatibility
-func IRFFT(scope *Scope, input tf.Output, fft_length tf.Output) (output tf.Output) {
+// Returns The list of tensors unpacked from `value`.
+func Unpack(scope *Scope, value tf.Output, num int64, optional ...UnpackAttr) (output []tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"num": num}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "IRFFT",
+		Type: "Unpack",
 		Input: []tf.Input{
-			input, fft_length,
+			value,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Saves tensors in V2 checkpoint format.
-//
-// By default, saves the named tensors in full.  If the caller wishes to save
-// specific slices of full tensors, "shape_and_slices" should be non-empty strings
-// and correspondingly well-formed.
-//
-// Arguments:
-//	prefix: Must have a single element. The prefix of the V2 checkpoint to which we
-// write the tensors.
-//	tensor_names: shape {N}. The names of the tensors to be saved.
-//	shape_and_slices: shape {N}.  The slice specs of the tensors to be saved.
-// Empty strings indicate that they are non-partitioned tensors.
-//	tensors: `N` tensors to save.
-//
-// Returns the created operation.
-func SaveV2(scope *Scope, prefix tf.Output, tensor_names tf.Output, shape_and_slices tf.Output, tensors []tf.Output) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
-	opspec := tf.OpSpec{
-		Type: "SaveV2",
-		Input: []tf.Input{
-			prefix, tensor_names, shape_and_slices, tf.OutputList(tensors),
-		},
-	}
-	return scope.AddOperation(opspec)
-}
-
-// MatrixTriangularSolveAttr is an optional argument to MatrixTriangularSolve.
-type MatrixTriangularSolveAttr func(optionalAttr)
-
-// MatrixTriangularSolveLower sets the optional lower attribute to value.
-//
-// value: Boolean indicating whether the innermost matrices in `matrix` are
-// lower or upper triangular.
-// If not specified, defaults to true
-func MatrixTriangularSolveLower(value bool) MatrixTriangularSolveAttr {
-	return func(m optionalAttr) {
-		m["lower"] = value
+	var idx int
+	var err error
+	if output, idx, err = makeOutputList(op, idx, "output"); err != nil {
+		scope.UpdateErr("Unpack", err)
+		return
 	}
+	return output
 }
 
-// MatrixTriangularSolveAdjoint sets the optional adjoint attribute to value.
+// Split a `SparseTensor` into `num_split` tensors along one dimension.
 //
-// value: Boolean indicating whether to solve with `matrix` or its (block-wise)
-//          adjoint.
+// If the `shape[split_dim]` is not an integer multiple of `num_split`. Slices
+// `[0 : shape[split_dim] % num_split]` gets one extra dimension.
+// For example, if `split_dim = 1` and `num_split = 2` and the input is
 //
-// @compatibility(numpy)
-// Equivalent to np.linalg.triangular_solve
-// @end_compatibility
-// If not specified, defaults to false
-func MatrixTriangularSolveAdjoint(value bool) MatrixTriangularSolveAttr {
-	return func(m optionalAttr) {
-		m["adjoint"] = value
-	}
-}
-
-// Solves systems of linear equations with upper or lower triangular matrices by
+//     input_tensor = shape = [2, 7]
+//     [    a   d e  ]
+//     [b c          ]
 //
-// backsubstitution.
+// Graphically the output tensors are:
 //
-// `matrix` is a tensor of shape `[..., M, M]` whose inner-most 2 dimensions form
-// square matrices. If `lower` is `True` then the strictly upper triangular part
-// of each inner-most matrix is assumed to be zero and not accessed.
-// If `lower` is False then the strictly lower triangular part of each inner-most
-// matrix is assumed to be zero and not accessed.
-// `rhs` is a tensor of shape `[..., M, K]`.
+//     output_tensor[0] = shape = [2, 4]
+//     [    a  ]
+//     [b c    ]
 //
-// The output is a tensor of shape `[..., M, K]`. If `adjoint` is
-// `True` then the innermost matrices in output` satisfy matrix equations
-// `matrix[..., :, :] * output[..., :, :] = rhs[..., :, :]`.
-// If `adjoint` is `False` then the strictly then the  innermost matrices in
-// `output` satisfy matrix equations
-// `adjoint(matrix[..., i, k]) * output[..., k, j] = rhs[..., i, j]`.
+//     output_tensor[1] = shape = [2, 3]
+//     [ d e  ]
+//     [      ]
 //
-// Arguments:
-//	matrix: Shape is `[..., M, M]`.
-//	rhs: Shape is `[..., M, K]`.
+// Arguments:
+//	split_dim: 0-D.  The dimension along which to split.  Must be in the range
+// `[0, rank(shape))`.
+//	indices: 2-D tensor represents the indices of the sparse tensor.
+//	values: 1-D tensor represents the values of the sparse tensor.
+//	shape: 1-D. tensor represents the shape of the sparse tensor.
+// output indices: A list of 1-D tensors represents the indices of the output
+// sparse tensors.
+//	num_split: The number of ways to split.
 //
-// Returns Shape is `[..., M, K]`.
-func MatrixTriangularSolve(scope *Scope, matrix tf.Output, rhs tf.Output, optional ...MatrixTriangularSolveAttr) (output tf.Output) {
+// Returns A list of 1-D tensors represents the values of the output sparse
+// tensors.A list of 1-D tensors represents the shape of the output sparse
+// tensors.
+func SparseSplit(scope *Scope, split_dim tf.Output, indices tf.Output, values tf.Output, shape tf.Output, num_split int64) (output_indices []tf.Output, output_values []tf.Output, output_shape []tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
+	attrs := map[string]interface{}{"num_split": num_split}
 	opspec := tf.OpSpec{
-		Type: "MatrixTriangularSolve",
+		Type: "SparseSplit",
 		Input: []tf.Input{
-			matrix, rhs,
+			split_dim, indices, values, shape,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	if scope.Err() != nil {
+		return
+	}
+	var idx int
+	var err error
+	if output_indices, idx, err = makeOutputList(op, idx, "output_indices"); err != nil {
+		scope.UpdateErr("SparseSplit", err)
+		return
+	}
+	if output_values, idx, err = makeOutputList(op, idx, "output_values"); err != nil {
+		scope.UpdateErr("SparseSplit", err)
+		return
+	}
+	if output_shape, idx, err = makeOutputList(op, idx, "output_shape"); err != nil {
+		scope.UpdateErr("SparseSplit", err)
+		return
+	}
+	return output_indices, output_values, output_shape
 }
 
-// Adds a value to the current value of a variable.
-//
-// Any ReadVariableOp which depends directly or indirectly on this assign is
-// guaranteed to see the incremented value or a subsequent newer one.
-//
-// Outputs the incremented value, which can be used to totally order the
-// increments to this variable.
-//
-// Arguments:
-//	resource: handle to the resource in which to store the variable.
-//	value: the value by which the variable will be incremented.
+// ReduceJoinAttr is an optional argument to ReduceJoin.
+type ReduceJoinAttr func(optionalAttr)
+
+// ReduceJoinKeepDims sets the optional keep_dims attribute to value.
 //
-// Returns the created operation.
-func AssignAddVariableOp(scope *Scope, resource tf.Output, value tf.Output) (o *tf.Operation) {
-	if scope.Err() != nil {
-		return
+// value: If `True`, retain reduced dimensions with length `1`.
+// If not specified, defaults to false
+func ReduceJoinKeepDims(value bool) ReduceJoinAttr {
+	return func(m optionalAttr) {
+		m["keep_dims"] = value
 	}
-	opspec := tf.OpSpec{
-		Type: "AssignAddVariableOp",
-		Input: []tf.Input{
-			resource, value,
-		},
+}
+
+// ReduceJoinSeparator sets the optional separator attribute to value.
+//
+// value: The separator to use when joining.
+// If not specified, defaults to ""
+func ReduceJoinSeparator(value string) ReduceJoinAttr {
+	return func(m optionalAttr) {
+		m["separator"] = value
 	}
-	return scope.AddOperation(opspec)
 }
 
-// Real-valued fast Fourier transform.
+// Joins a string Tensor across the given dimensions.
 //
-// Computes the 1-dimensional discrete Fourier transform of a real-valued signal
-// over the inner-most dimension of `input`.
+// Computes the string join across dimensions in the given string Tensor of shape
+// `[d_0, d_1, ..., d_n-1]`.  Returns a new Tensor created by joining the input
+// strings with the given separator (default: empty string).  Negative indices are
+// counted backwards from the end, with `-1` being equivalent to `n - 1`.
 //
-// Since the DFT of a real signal is Hermitian-symmetric, `RFFT` only returns the
-// `fft_length / 2 + 1` unique components of the FFT: the zero-frequency term,
-// followed by the `fft_length / 2` positive-frequency terms.
+// For example:
 //
-// Arguments:
-//	input: A float32 tensor.
-//	fft_length: An int32 tensor of shape [1]. The FFT length.
+// ```python
+// # tensor `a` is [["a", "b"], ["c", "d"]]
+// tf.reduce_join(a, 0) ==> ["ac", "bd"]
+// tf.reduce_join(a, 1) ==> ["ab", "cd"]
+// tf.reduce_join(a, -2) = tf.reduce_join(a, 0) ==> ["ac", "bd"]
+// tf.reduce_join(a, -1) = tf.reduce_join(a, 1) ==> ["ab", "cd"]
+// tf.reduce_join(a, 0, keep_dims=True) ==> [["ac", "bd"]]
+// tf.reduce_join(a, 1, keep_dims=True) ==> [["ab"], ["cd"]]
+// tf.reduce_join(a, 0, separator=".") ==> ["a.c", "b.d"]
+// tf.reduce_join(a, [0, 1]) ==> ["acbd"]
+// tf.reduce_join(a, [1, 0]) ==> ["abcd"]
+// tf.reduce_join(a, []) ==> ["abcd"]
+// ```
 //
-// Returns A complex64 tensor of the same rank as `input`. The inner-most
-//   dimension of `input` is replaced with the `fft_length / 2 + 1` unique
-//   frequency components of its 1D Fourier transform.
+// Arguments:
+//	inputs: The input to be joined.  All reduced indices must have non-zero size.
+//	reduction_indices: The dimensions to reduce over.  Dimensions are reduced in the
+// order specified.  Omitting `reduction_indices` is equivalent to passing
+// `[n-1, n-2, ..., 0]`.  Negative indices from `-n` to `-1` are supported.
 //
-// @compatibility(numpy)
-// Equivalent to np.fft.rfft
-// @end_compatibility
-func RFFT(scope *Scope, input tf.Output, fft_length tf.Output) (output tf.Output) {
+// Returns Has shape equal to that of the input with reduced dimensions removed or
+// set to `1` depending on `keep_dims`.
+func ReduceJoin(scope *Scope, inputs tf.Output, reduction_indices tf.Output, optional ...ReduceJoinAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	opspec := tf.OpSpec{
-		Type: "RFFT",
-		Input: []tf.Input{
-			input, fft_length,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Computes the gradient for the tanh of `x` wrt its input.
-//
-// Specifically, `grad = dy * (1 - y*y)`, where `y = tanh(x)`, and `dy`
-// is the corresponding input gradient.
-func TanhGrad(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
-	if scope.Err() != nil {
-		return
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "TanhGrad",
+		Type: "ReduceJoin",
 		Input: []tf.Input{
-			x, y,
+			inputs, reduction_indices,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// AddManySparseToTensorsMapAttr is an optional argument to AddManySparseToTensorsMap.
-type AddManySparseToTensorsMapAttr func(optionalAttr)
+// SvdAttr is an optional argument to Svd.
+type SvdAttr func(optionalAttr)
 
-// AddManySparseToTensorsMapContainer sets the optional container attribute to value.
+// SvdComputeUv sets the optional compute_uv attribute to value.
 //
-// value: The container name for the `SparseTensorsMap` created by this op.
-// If not specified, defaults to ""
-func AddManySparseToTensorsMapContainer(value string) AddManySparseToTensorsMapAttr {
+// value: If true, left and right singular vectors will be
+// computed and returned in `u` and `v`, respectively.
+// If false, `u` and `v` are not set and should never referenced.
+// If not specified, defaults to true
+func SvdComputeUv(value bool) SvdAttr {
 	return func(m optionalAttr) {
-		m["container"] = value
+		m["compute_uv"] = value
 	}
 }
 
-// AddManySparseToTensorsMapSharedName sets the optional shared_name attribute to value.
+// SvdFullMatrices sets the optional full_matrices attribute to value.
 //
-// value: The shared name for the `SparseTensorsMap` created by this op.
-// If blank, the new Operation's unique name is used.
-// If not specified, defaults to ""
-func AddManySparseToTensorsMapSharedName(value string) AddManySparseToTensorsMapAttr {
+// value: If true, compute full-sized `u` and `v`. If false
+// (the default), compute only the leading `P` singular vectors.
+// Ignored if `compute_uv` is `False`.
+// If not specified, defaults to false
+func SvdFullMatrices(value bool) SvdAttr {
 	return func(m optionalAttr) {
-		m["shared_name"] = value
+		m["full_matrices"] = value
 	}
 }
 
-// Add an `N`-minibatch `SparseTensor` to a `SparseTensorsMap`, return `N` handles.
-//
-// A `SparseTensor` of rank `R` is represented by three tensors: `sparse_indices`,
-// `sparse_values`, and `sparse_shape`, where
-//
-// ```sparse_indices.shape[1] == sparse_shape.shape[0] == R```
-//
-// An `N`-minibatch of `SparseTensor` objects is represented as a `SparseTensor`
-// having a first `sparse_indices` column taking values between `[0, N)`, where
-// the minibatch size `N == sparse_shape[0]`.
+// Computes the singular value decompositions of one or more matrices.
 //
-// The input `SparseTensor` must have rank `R` greater than 1, and the first
-// dimension is treated as the minibatch dimension.  Elements of the `SparseTensor`
-// must be sorted in increasing order of this first dimension.  The stored
-// `SparseTensor` objects pointed to by each row of the output `sparse_handles`
-// will have rank `R-1`.
+// Computes the SVD of each inner matrix in `input` such that
+// `input[..., :, :] = u[..., :, :] * diag(s[..., :, :]) * transpose(v[..., :, :])`
 //
-// The `SparseTensor` values can then be read out as part of a minibatch by passing
-// the given keys as vector elements to `TakeManySparseFromTensorsMap`.  To ensure
-// the correct `SparseTensorsMap` is accessed, ensure that the same
-// `container` and `shared_name` are passed to that Op.  If no `shared_name`
-// is provided here, instead use the *name* of the Operation created by calling
-// `AddManySparseToTensorsMap` as the `shared_name` passed to
-// `TakeManySparseFromTensorsMap`.  Ensure the Operations are colocated.
+// ```prettyprint
+// # a is a tensor containing a batch of matrices.
+// # s is a tensor of singular values for each matrix.
+// # u is the tensor containing of left singular vectors for each matrix.
+// # v is the tensor containing of right singular vectors for each matrix.
+// s, u, v = svd(a)
+// s, _, _ = svd(a, compute_uv=False)
+// ```
 //
 // Arguments:
-//	sparse_indices: 2-D.  The `indices` of the minibatch `SparseTensor`.
-// `sparse_indices[:, 0]` must be ordered values in `[0, N)`.
-//	sparse_values: 1-D.  The `values` of the minibatch `SparseTensor`.
-//	sparse_shape: 1-D.  The `shape` of the minibatch `SparseTensor`.
-// The minibatch size `N == sparse_shape[0]`.
+//	input: A tensor of shape `[..., M, N]` whose inner-most 2 dimensions
+// form matrices of size `[M, N]`. Let `P` be the minimum of `M` and `N`.
 //
-// Returns 1-D.  The handles of the `SparseTensor` now stored in the
-// `SparseTensorsMap`.  Shape: `[N]`.
-func AddManySparseToTensorsMap(scope *Scope, sparse_indices tf.Output, sparse_values tf.Output, sparse_shape tf.Output, optional ...AddManySparseToTensorsMapAttr) (sparse_handles tf.Output) {
+// Returns Singular values. Shape is `[..., P]`.Left singular vectors. If `full_matrices` is `False` then shape is
+// `[..., M, P]`; if `full_matrices` is `True` then shape is
+// `[..., M, M]`. Undefined if `compute_uv` is `False`.Left singular vectors. If `full_matrices` is `False` then shape is
+// `[..., N, P]`. If `full_matrices` is `True` then shape is `[..., N, N]`.
+// Undefined if `compute_uv` is false.
+func Svd(scope *Scope, input tf.Output, optional ...SvdAttr) (s tf.Output, u tf.Output, v tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -15296,128 +15864,146 @@ func AddManySparseToTensorsMap(scope *Scope, sparse_indices tf.Output, sparse_va
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "AddManySparseToTensorsMap",
+		Type: "Svd",
 		Input: []tf.Input{
-			sparse_indices, sparse_values, sparse_shape,
+			input,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// Converts each string in the input Tensor to its hash mod by a number of buckets.
+// AssertAttr is an optional argument to Assert.
+type AssertAttr func(optionalAttr)
+
+// AssertSummarize sets the optional summarize attribute to value.
 //
-// The hash function is deterministic on the content of the string within the
-// process and will never change. However, it is not suitable for cryptography.
-// This function may be used when CPU time is scarce and inputs are trusted or
-// unimportant. There is a risk of adversaries constructing inputs that all hash
-// to the same bucket. To prevent this problem, use a strong hash function with
-// `tf.string_to_hash_bucket_strong`.
+// value: Print this many entries of each tensor.
+// If not specified, defaults to 3
+func AssertSummarize(value int64) AssertAttr {
+	return func(m optionalAttr) {
+		m["summarize"] = value
+	}
+}
+
+// Asserts that the given condition is true.
+//
+// If `condition` evaluates to false, print the list of tensors in `data`.
+// `summarize` determines how many entries of the tensors to print.
 //
 // Arguments:
-//	input: The strings to assign a hash bucket.
-//	num_buckets: The number of buckets.
+//	condition: The condition to evaluate.
+//	data: The tensors to print out when condition is false.
 //
-// Returns A Tensor of the same shape as the input `string_tensor`.
-func StringToHashBucketFast(scope *Scope, input tf.Output, num_buckets int64) (output tf.Output) {
+// Returns the created operation.
+func Assert(scope *Scope, condition tf.Output, data []tf.Output, optional ...AssertAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"num_buckets": num_buckets}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "StringToHashBucketFast",
+		Type: "Assert",
 		Input: []tf.Input{
-			input,
+			condition, tf.OutputList(data),
 		},
 		Attrs: attrs,
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return scope.AddOperation(opspec)
 }
 
-// UniqueWithCountsAttr is an optional argument to UniqueWithCounts.
-type UniqueWithCountsAttr func(optionalAttr)
+// RandomUniformAttr is an optional argument to RandomUniform.
+type RandomUniformAttr func(optionalAttr)
 
-// UniqueWithCountsOutIdx sets the optional out_idx attribute to value.
-// If not specified, defaults to DT_INT32
-func UniqueWithCountsOutIdx(value tf.DataType) UniqueWithCountsAttr {
+// RandomUniformSeed sets the optional seed attribute to value.
+//
+// value: If either `seed` or `seed2` are set to be non-zero, the random number
+// generator is seeded by the given seed.  Otherwise, it is seeded by a
+// random seed.
+// If not specified, defaults to 0
+func RandomUniformSeed(value int64) RandomUniformAttr {
 	return func(m optionalAttr) {
-		m["out_idx"] = value
+		m["seed"] = value
 	}
 }
 
-// Finds unique elements in a 1-D tensor.
-//
-// This operation returns a tensor `y` containing all of the unique elements of `x`
-// sorted in the same order that they occur in `x`. This operation also returns a
-// tensor `idx` the same size as `x` that contains the index of each value of `x`
-// in the unique output `y`. Finally, it returns a third tensor `count` that
-// contains the count of each element of `y` in `x`. In other words:
-//
-// `y[idx[i]] = x[i] for i in [0, 1,...,rank(x) - 1]`
+// RandomUniformSeed2 sets the optional seed2 attribute to value.
 //
-// For example:
+// value: A second seed to avoid seed collision.
+// If not specified, defaults to 0
+func RandomUniformSeed2(value int64) RandomUniformAttr {
+	return func(m optionalAttr) {
+		m["seed2"] = value
+	}
+}
+
+// Outputs random values from a uniform distribution.
 //
-// ```prettyprint
-// # tensor 'x' is [1, 1, 2, 4, 4, 4, 7, 8, 8]
-// y, idx, count = unique_with_counts(x)
-// y ==> [1, 2, 4, 7, 8]
-// idx ==> [0, 0, 1, 2, 2, 2, 3, 4, 4]
-// count ==> [2, 1, 3, 1, 2]
-// ```
+// The generated values follow a uniform distribution in the range `[0, 1)`. The
+// lower bound 0 is included in the range, while the upper bound 1 is excluded.
 //
 // Arguments:
-//	x: 1-D.
+//	shape: The shape of the output tensor.
+//	dtype: The type of the output.
 //
-// Returns 1-D.1-D.1-D.
-func UniqueWithCounts(scope *Scope, x tf.Output, optional ...UniqueWithCountsAttr) (y tf.Output, idx tf.Output, count tf.Output) {
+// Returns A tensor of the specified shape filled with uniform random values.
+func RandomUniform(scope *Scope, shape tf.Output, dtype tf.DataType, optional ...RandomUniformAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"dtype": dtype}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "UniqueWithCounts",
+		Type: "RandomUniform",
 		Input: []tf.Input{
-			x,
+			shape,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
+	return op.Output(0)
 }
 
-// ComplexAttr is an optional argument to Complex.
-type ComplexAttr func(optionalAttr)
+// ResourceApplyFtrlAttr is an optional argument to ResourceApplyFtrl.
+type ResourceApplyFtrlAttr func(optionalAttr)
 
-// ComplexTout sets the optional Tout attribute to value.
-// If not specified, defaults to DT_COMPLEX64
-func ComplexTout(value tf.DataType) ComplexAttr {
+// ResourceApplyFtrlUseLocking sets the optional use_locking attribute to value.
+//
+// value: If `True`, updating of the var and accum tensors will be protected
+// by a lock; otherwise the behavior is undefined, but may exhibit less
+// contention.
+// If not specified, defaults to false
+func ResourceApplyFtrlUseLocking(value bool) ResourceApplyFtrlAttr {
 	return func(m optionalAttr) {
-		m["Tout"] = value
+		m["use_locking"] = value
 	}
 }
 
-// Converts two real numbers to a complex number.
-//
-// Given a tensor `real` representing the real part of a complex number, and a
-// tensor `imag` representing the imaginary part of a complex number, this
-// operation returns complex numbers elementwise of the form \\(a + bj\\), where
-// *a* represents the `real` part and *b* represents the `imag` part.
+// Update '*var' according to the Ftrl-proximal scheme.
 //
-// The input tensors `real` and `imag` must have the same shape.
+// accum_new = accum + grad * grad
+// linear += grad + (accum_new^(-lr_power) - accum^(-lr_power)) / lr * var
+// quadratic = 1.0 / (accum_new^(lr_power) * lr) + 2 * l2
+// var = (sign(linear) * l1 - linear) / quadratic if |linear| > l1 else 0.0
+// accum = accum_new
 //
-// For example:
+// Arguments:
+//	var_: Should be from a Variable().
+//	accum: Should be from a Variable().
+//	linear: Should be from a Variable().
+//	grad: The gradient.
+//	lr: Scaling factor. Must be a scalar.
+//	l1: L1 regulariation. Must be a scalar.
+//	l2: L2 regulariation. Must be a scalar.
+//	lr_power: Scaling factor. Must be a scalar.
 //
-// ```
-// # tensor 'real' is [2.25, 3.25]
-// # tensor `imag` is [4.75, 5.75]
-// tf.complex(real, imag) ==> [[2.25 + 4.75j], [3.25 + 5.75j]]
-// ```
-func Complex(scope *Scope, real tf.Output, imag tf.Output, optional ...ComplexAttr) (out tf.Output) {
+// Returns the created operation.
+func ResourceApplyFtrl(scope *Scope, var_ tf.Output, accum tf.Output, linear tf.Output, grad tf.Output, lr tf.Output, l1 tf.Output, l2 tf.Output, lr_power tf.Output, optional ...ResourceApplyFtrlAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
@@ -15426,41 +16012,41 @@ func Complex(scope *Scope, real tf.Output, imag tf.Output, optional ...ComplexAt
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "Complex",
+		Type: "ResourceApplyFtrl",
 		Input: []tf.Input{
-			real, imag,
+			var_, accum, linear, grad, lr, l1, l2, lr_power,
 		},
 		Attrs: attrs,
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return scope.AddOperation(opspec)
 }
 
-// ImagAttr is an optional argument to Imag.
-type ImagAttr func(optionalAttr)
+// AnyAttr is an optional argument to Any.
+type AnyAttr func(optionalAttr)
 
-// ImagTout sets the optional Tout attribute to value.
-// If not specified, defaults to DT_FLOAT
-func ImagTout(value tf.DataType) ImagAttr {
+// AnyKeepDims sets the optional keep_dims attribute to value.
+//
+// value: If true, retain reduced dimensions with length 1.
+// If not specified, defaults to false
+func AnyKeepDims(value bool) AnyAttr {
 	return func(m optionalAttr) {
-		m["Tout"] = value
+		m["keep_dims"] = value
 	}
 }
 
-// Returns the imaginary part of a complex number.
+// Computes the "logical or" of elements across dimensions of a tensor.
 //
-// Given a tensor `input` of complex numbers, this operation returns a tensor of
-// type `float` that is the imaginary part of each element in `input`. All
-// elements in `input` must be complex numbers of the form \\(a + bj\\), where *a*
-// is the real part and *b* is the imaginary part returned by this operation.
+// Reduces `input` along the dimensions given in `reduction_indices`. Unless
+// `keep_dims` is true, the rank of the tensor is reduced by 1 for each entry in
+// `reduction_indices`. If `keep_dims` is true, the reduced dimensions are
+// retained with length 1.
 //
-// For example:
+// Arguments:
+//	input: The tensor to reduce.
+//	reduction_indices: The dimensions to reduce.
 //
-// ```
-// # tensor 'input' is [-2.25 + 4.75j, 3.25 + 5.75j]
-// tf.imag(input) ==> [4.75, 5.75]
-// ```
-func Imag(scope *Scope, input tf.Output, optional ...ImagAttr) (output tf.Output) {
+// Returns The reduced tensor.
+func Any(scope *Scope, input tf.Output, reduction_indices tf.Output, optional ...AnyAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -15469,9 +16055,9 @@ func Imag(scope *Scope, input tf.Output, optional ...ImagAttr) (output tf.Output
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "Imag",
+		Type: "Any",
 		Input: []tf.Input{
-			input,
+			input, reduction_indices,
 		},
 		Attrs: attrs,
 	}
@@ -15479,156 +16065,214 @@ func Imag(scope *Scope, input tf.Output, optional ...ImagAttr) (output tf.Output
 	return op.Output(0)
 }
 
-// Computes fingerprints of the input strings.
+// Compute the Hurwitz zeta function \\(\zeta(x, q)\\).
+//
+// The Hurwitz zeta function is defined as:
+//
+//
+// \\(\zeta(x, q) = \sum_{n=0}^{\infty} (q + n)^{-x}\\)
+func Zeta(scope *Scope, x tf.Output, q tf.Output) (z tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "Zeta",
+		Input: []tf.Input{
+			x, q,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Inverse real-valued fast Fourier transform.
+//
+// Computes the inverse 1-dimensional discrete Fourier transform of a real-valued
+// signal over the inner-most dimension of `input`.
+//
+// The inner-most dimension of `input` is assumed to be the result of `RFFT`: the
+// `fft_length / 2 + 1` unique components of the DFT of a real-valued signal. If
+// `fft_length` is not provided, it is computed from the size of the inner-most
+// dimension of `input` (`fft_length = 2 * (inner - 1)`). If the FFT length used to
+// compute `input` is odd, it should be provided since it cannot be inferred
+// properly.
 //
 // Arguments:
-//	input: vector of strings to compute fingerprints on.
+//	input: A complex64 tensor.
+//	fft_length: An int32 tensor of shape [1]. The FFT length.
 //
-// Returns a (N,2) shaped matrix where N is the number of elements in the input
-// vector. Each row contains the low and high parts of the fingerprint.
-func SdcaFprint(scope *Scope, input tf.Output) (output tf.Output) {
+// Returns A float32 tensor of the same rank as `input`. The inner-most
+//   dimension of `input` is replaced with the `fft_length` samples of its inverse
+//   1D Fourier transform.
+//
+// @compatibility(numpy)
+// Equivalent to np.fft.irfft
+// @end_compatibility
+func IRFFT(scope *Scope, input tf.Output, fft_length tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "SdcaFprint",
+		Type: "IRFFT",
 		Input: []tf.Input{
-			input,
+			input, fft_length,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Returns the number of records this Reader has produced.
+// Saves tensors in V2 checkpoint format.
 //
-// This is the same as the number of ReaderRead executions that have
-// succeeded.
+// By default, saves the named tensors in full.  If the caller wishes to save
+// specific slices of full tensors, "shape_and_slices" should be non-empty strings
+// and correspondingly well-formed.
 //
 // Arguments:
-//	reader_handle: Handle to a Reader.
-func ReaderNumRecordsProducedV2(scope *Scope, reader_handle tf.Output) (records_produced tf.Output) {
+//	prefix: Must have a single element. The prefix of the V2 checkpoint to which we
+// write the tensors.
+//	tensor_names: shape {N}. The names of the tensors to be saved.
+//	shape_and_slices: shape {N}.  The slice specs of the tensors to be saved.
+// Empty strings indicate that they are non-partitioned tensors.
+//	tensors: `N` tensors to save.
+//
+// Returns the created operation.
+func SaveV2(scope *Scope, prefix tf.Output, tensor_names tf.Output, shape_and_slices tf.Output, tensors []tf.Output) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "ReaderNumRecordsProducedV2",
+		Type: "SaveV2",
 		Input: []tf.Input{
-			reader_handle,
+			prefix, tensor_names, shape_and_slices, tf.OutputList(tensors),
 		},
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return scope.AddOperation(opspec)
+}
+
+// MatrixTriangularSolveAttr is an optional argument to MatrixTriangularSolve.
+type MatrixTriangularSolveAttr func(optionalAttr)
+
+// MatrixTriangularSolveLower sets the optional lower attribute to value.
+//
+// value: Boolean indicating whether the innermost matrices in `matrix` are
+// lower or upper triangular.
+// If not specified, defaults to true
+func MatrixTriangularSolveLower(value bool) MatrixTriangularSolveAttr {
+	return func(m optionalAttr) {
+		m["lower"] = value
+	}
 }
 
-// Computes exponential of x - 1 element-wise.
+// MatrixTriangularSolveAdjoint sets the optional adjoint attribute to value.
 //
-// I.e., \\(y = (\exp x) - 1\\).
-func Expm1(scope *Scope, x tf.Output) (y tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "Expm1",
-		Input: []tf.Input{
-			x,
-		},
+// value: Boolean indicating whether to solve with `matrix` or its (block-wise)
+//          adjoint.
+//
+// @compatibility(numpy)
+// Equivalent to np.linalg.triangular_solve
+// @end_compatibility
+// If not specified, defaults to false
+func MatrixTriangularSolveAdjoint(value bool) MatrixTriangularSolveAttr {
+	return func(m optionalAttr) {
+		m["adjoint"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// Returns x - y element-wise.
+// Solves systems of linear equations with upper or lower triangular matrices by
 //
-// *NOTE*: `Sub` supports broadcasting. More about broadcasting
-// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-func Sub(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+// backsubstitution.
+//
+// `matrix` is a tensor of shape `[..., M, M]` whose inner-most 2 dimensions form
+// square matrices. If `lower` is `True` then the strictly upper triangular part
+// of each inner-most matrix is assumed to be zero and not accessed.
+// If `lower` is False then the strictly lower triangular part of each inner-most
+// matrix is assumed to be zero and not accessed.
+// `rhs` is a tensor of shape `[..., M, K]`.
+//
+// The output is a tensor of shape `[..., M, K]`. If `adjoint` is
+// `True` then the innermost matrices in output` satisfy matrix equations
+// `matrix[..., :, :] * output[..., :, :] = rhs[..., :, :]`.
+// If `adjoint` is `False` then the strictly then the  innermost matrices in
+// `output` satisfy matrix equations
+// `adjoint(matrix[..., i, k]) * output[..., k, j] = rhs[..., i, j]`.
+//
+// Arguments:
+//	matrix: Shape is `[..., M, M]`.
+//	rhs: Shape is `[..., M, K]`.
+//
+// Returns Shape is `[..., M, K]`.
+func MatrixTriangularSolve(scope *Scope, matrix tf.Output, rhs tf.Output, optional ...MatrixTriangularSolveAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "Sub",
+		Type: "MatrixTriangularSolve",
 		Input: []tf.Input{
-			x, y,
+			matrix, rhs,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Split elements of `input` based on `delimiter` into a `SparseTensor`.
-//
-// Let N be the size of source (typically N will be the batch size). Split each
-// element of `input` based on `delimiter` and return a `SparseTensor`
-// containing the splitted tokens. Empty tokens are ignored.
-//
-// `delimiter` can be empty, or a string of split characters. If `delimiter` is an
-//  empty string, each element of `input` is split into individual single-byte
-//  character strings, including splitting of UTF-8 multibyte sequences. Otherwise
-//  every character of `delimiter` is a potential split point.
+// Adds a value to the current value of a variable.
 //
-// For example:
-//   N = 2, input[0] is 'hello world' and input[1] is 'a b c', then the output
-//   will be
+// Any ReadVariableOp which depends directly or indirectly on this assign is
+// guaranteed to see the incremented value or a subsequent newer one.
 //
-//   indices = [0, 0;
-//              0, 1;
-//              1, 0;
-//              1, 1;
-//              1, 2]
-//   shape = [2, 3]
-//   values = ['hello', 'world', 'a', 'b', 'c']
+// Outputs the incremented value, which can be used to totally order the
+// increments to this variable.
 //
 // Arguments:
-//	input: 1-D. Strings to split.
-//	delimiter: 0-D. Delimiter characters (bytes), or empty string.
+//	resource: handle to the resource in which to store the variable.
+//	value: the value by which the variable will be incremented.
 //
-// Returns A dense matrix of int64 representing the indices of the sparse tensor.A vector of strings corresponding to the splited values.a length-2 vector of int64 representing the shape of the sparse
-// tensor, where the first value is N and the second value is the maximum number
-// of tokens in a single input entry.
-func StringSplit(scope *Scope, input tf.Output, delimiter tf.Output) (indices tf.Output, values tf.Output, shape tf.Output) {
+// Returns the created operation.
+func AssignAddVariableOp(scope *Scope, resource tf.Output, value tf.Output) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "StringSplit",
+		Type: "AssignAddVariableOp",
 		Input: []tf.Input{
-			input, delimiter,
+			resource, value,
 		},
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
+	return scope.AddOperation(opspec)
 }
 
-// Inverse 3D real-valued fast Fourier transform.
+// Real-valued fast Fourier transform.
 //
-// Computes the inverse 3-dimensional discrete Fourier transform of a real-valued
-// signal over the inner-most 3 dimensions of `input`.
+// Computes the 1-dimensional discrete Fourier transform of a real-valued signal
+// over the inner-most dimension of `input`.
 //
-// The inner-most 3 dimensions of `input` are assumed to be the result of `RFFT3D`:
-// The inner-most dimension contains the `fft_length / 2 + 1` unique components of
-// the DFT of a real-valued signal. If `fft_length` is not provided, it is computed
-// from the size of the inner-most 3 dimensions of `input`. If the FFT length used
-// to compute `input` is odd, it should be provided since it cannot be inferred
-// properly.
+// Since the DFT of a real signal is Hermitian-symmetric, `RFFT` only returns the
+// `fft_length / 2 + 1` unique components of the FFT: the zero-frequency term,
+// followed by the `fft_length / 2` positive-frequency terms.
 //
 // Arguments:
-//	input: A complex64 tensor.
-//	fft_length: An int32 tensor of shape [3]. The FFT length for each dimension.
+//	input: A float32 tensor.
+//	fft_length: An int32 tensor of shape [1]. The FFT length.
 //
-// Returns A float32 tensor of the same rank as `input`. The inner-most 3
-//   dimensions of `input` are replaced with the `fft_length` samples of their
-//   inverse 3D real Fourier transform.
+// Returns A complex64 tensor of the same rank as `input`. The inner-most
+//   dimension of `input` is replaced with the `fft_length / 2 + 1` unique
+//   frequency components of its 1D Fourier transform.
 //
 // @compatibility(numpy)
-// Equivalent to np.irfftn with 3 dimensions.
+// Equivalent to np.fft.rfft
 // @end_compatibility
-func IRFFT3D(scope *Scope, input tf.Output, fft_length tf.Output) (output tf.Output) {
+func RFFT(scope *Scope, input tf.Output, fft_length tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "IRFFT3D",
+		Type: "RFFT",
 		Input: []tf.Input{
 			input, fft_length,
 		},
@@ -15637,16 +16281,16 @@ func IRFFT3D(scope *Scope, input tf.Output, fft_length tf.Output) (output tf.Out
 	return op.Output(0)
 }
 
-// Returns the truth value of (x != y) element-wise.
+// Computes the gradient for the tanh of `x` wrt its input.
 //
-// *NOTE*: `NotEqual` supports broadcasting. More about broadcasting
-// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-func NotEqual(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+// Specifically, `grad = dy * (1 - y*y)`, where `y = tanh(x)`, and `dy`
+// is the corresponding input gradient.
+func TanhGrad(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "NotEqual",
+		Type: "TanhGrad",
 		Input: []tf.Input{
 			x, y,
 		},
@@ -15655,93 +16299,100 @@ func NotEqual(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
 	return op.Output(0)
 }
 
-// Returns a batched diagonal tensor with a given batched diagonal values.
-//
-// Given a `diagonal`, this operation returns a tensor with the `diagonal` and
-// everything else padded with zeros. The diagonal is computed as follows:
-//
-// Assume `diagonal` has `k` dimensions `[I, J, K, ..., N]`, then the output is a
-// tensor of rank `k+1` with dimensions [I, J, K, ..., N, N]` where:
-//
-// `output[i, j, k, ..., m, n] = 1{m=n} * diagonal[i, j, k, ..., n]`.
-//
-// For example:
+// Outputs all keys and values in the table.
 //
-// ```prettyprint
-// # 'diagonal' is [[1, 2, 3, 4], [5, 6, 7, 8]]
-//
-// and diagonal.shape = (2, 4)
-//
-// tf.matrix_diag(diagonal) ==> [[[1, 0, 0, 0]
-//                                      [0, 2, 0, 0]
-//                                      [0, 0, 3, 0]
-//                                      [0, 0, 0, 4]],
-//                                     [[5, 0, 0, 0]
-//                                      [0, 6, 0, 0]
-//                                      [0, 0, 7, 0]
-//                                      [0, 0, 0, 8]]]
+// Arguments:
+//	table_handle: Handle to the table.
 //
-// which has shape (2, 4, 4)
-// ```
 //
-// Arguments:
-//	diagonal: Rank `k`, where `k >= 1`.
 //
-// Returns Rank `k+1`, with `output.shape = diagonal.shape + [diagonal.shape[-1]]`.
-func MatrixDiag(scope *Scope, diagonal tf.Output) (output tf.Output) {
+// Returns Vector of all keys present in the table.Tensor of all values in the table. Indexed in parallel with `keys`.
+func LookupTableExportV2(scope *Scope, table_handle tf.Output, Tkeys tf.DataType, Tvalues tf.DataType) (keys tf.Output, values tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"Tkeys": Tkeys, "Tvalues": Tvalues}
 	opspec := tf.OpSpec{
-		Type: "MatrixDiag",
+		Type: "LookupTableExportV2",
 		Input: []tf.Input{
-			diagonal,
+			table_handle,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1)
 }
 
-// MaxPool3DAttr is an optional argument to MaxPool3D.
-type MaxPool3DAttr func(optionalAttr)
+// AddManySparseToTensorsMapAttr is an optional argument to AddManySparseToTensorsMap.
+type AddManySparseToTensorsMapAttr func(optionalAttr)
 
-// MaxPool3DDataFormat sets the optional data_format attribute to value.
+// AddManySparseToTensorsMapContainer sets the optional container attribute to value.
 //
-// value: The data format of the input and output data. With the
-// default format "NDHWC", the data is stored in the order of:
-//     [batch, in_depth, in_height, in_width, in_channels].
-// Alternatively, the format could be "NCDHW", the data storage order is:
-//     [batch, in_channels, in_depth, in_height, in_width].
-// If not specified, defaults to "NDHWC"
-func MaxPool3DDataFormat(value string) MaxPool3DAttr {
+// value: The container name for the `SparseTensorsMap` created by this op.
+// If not specified, defaults to ""
+func AddManySparseToTensorsMapContainer(value string) AddManySparseToTensorsMapAttr {
 	return func(m optionalAttr) {
-		m["data_format"] = value
+		m["container"] = value
 	}
 }
 
-// Performs 3D max pooling on the input.
+// AddManySparseToTensorsMapSharedName sets the optional shared_name attribute to value.
+//
+// value: The shared name for the `SparseTensorsMap` created by this op.
+// If blank, the new Operation's unique name is used.
+// If not specified, defaults to ""
+func AddManySparseToTensorsMapSharedName(value string) AddManySparseToTensorsMapAttr {
+	return func(m optionalAttr) {
+		m["shared_name"] = value
+	}
+}
+
+// Add an `N`-minibatch `SparseTensor` to a `SparseTensorsMap`, return `N` handles.
+//
+// A `SparseTensor` of rank `R` is represented by three tensors: `sparse_indices`,
+// `sparse_values`, and `sparse_shape`, where
+//
+// ```sparse_indices.shape[1] == sparse_shape.shape[0] == R```
+//
+// An `N`-minibatch of `SparseTensor` objects is represented as a `SparseTensor`
+// having a first `sparse_indices` column taking values between `[0, N)`, where
+// the minibatch size `N == sparse_shape[0]`.
+//
+// The input `SparseTensor` must have rank `R` greater than 1, and the first
+// dimension is treated as the minibatch dimension.  Elements of the `SparseTensor`
+// must be sorted in increasing order of this first dimension.  The stored
+// `SparseTensor` objects pointed to by each row of the output `sparse_handles`
+// will have rank `R-1`.
+//
+// The `SparseTensor` values can then be read out as part of a minibatch by passing
+// the given keys as vector elements to `TakeManySparseFromTensorsMap`.  To ensure
+// the correct `SparseTensorsMap` is accessed, ensure that the same
+// `container` and `shared_name` are passed to that Op.  If no `shared_name`
+// is provided here, instead use the *name* of the Operation created by calling
+// `AddManySparseToTensorsMap` as the `shared_name` passed to
+// `TakeManySparseFromTensorsMap`.  Ensure the Operations are colocated.
 //
 // Arguments:
-//	input: Shape `[batch, depth, rows, cols, channels]` tensor to pool over.
-//	ksize: 1-D tensor of length 5. The size of the window for each dimension of
-// the input tensor. Must have `ksize[0] = ksize[4] = 1`.
-//	strides: 1-D tensor of length 5. The stride of the sliding window for each
-// dimension of `input`. Must have `strides[0] = strides[4] = 1`.
-//	padding: The type of padding algorithm to use.
+//	sparse_indices: 2-D.  The `indices` of the minibatch `SparseTensor`.
+// `sparse_indices[:, 0]` must be ordered values in `[0, N)`.
+//	sparse_values: 1-D.  The `values` of the minibatch `SparseTensor`.
+//	sparse_shape: 1-D.  The `shape` of the minibatch `SparseTensor`.
+// The minibatch size `N == sparse_shape[0]`.
 //
-// Returns The max pooled output tensor.
-func MaxPool3D(scope *Scope, input tf.Output, ksize []int64, strides []int64, padding string, optional ...MaxPool3DAttr) (output tf.Output) {
+// Returns 1-D.  The handles of the `SparseTensor` now stored in the
+// `SparseTensorsMap`.  Shape: `[N]`.
+func AddManySparseToTensorsMap(scope *Scope, sparse_indices tf.Output, sparse_values tf.Output, sparse_shape tf.Output, optional ...AddManySparseToTensorsMapAttr) (sparse_handles tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"ksize": ksize, "strides": strides, "padding": padding}
+	attrs := map[string]interface{}{}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "MaxPool3D",
+		Type: "AddManySparseToTensorsMap",
 		Input: []tf.Input{
-			input,
+			sparse_indices, sparse_values, sparse_shape,
 		},
 		Attrs: attrs,
 	}
@@ -15749,253 +16400,230 @@ func MaxPool3D(scope *Scope, input tf.Output, ksize []int64, strides []int64, pa
 	return op.Output(0)
 }
 
-// Returns x // y element-wise.
+// Converts each string in the input Tensor to its hash mod by a number of buckets.
 //
-// *NOTE*: `FloorDiv` supports broadcasting. More about broadcasting
-// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-func FloorDiv(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+// The hash function is deterministic on the content of the string within the
+// process and will never change. However, it is not suitable for cryptography.
+// This function may be used when CPU time is scarce and inputs are trusted or
+// unimportant. There is a risk of adversaries constructing inputs that all hash
+// to the same bucket. To prevent this problem, use a strong hash function with
+// `tf.string_to_hash_bucket_strong`.
+//
+// Arguments:
+//	input: The strings to assign a hash bucket.
+//	num_buckets: The number of buckets.
+//
+// Returns A Tensor of the same shape as the input `string_tensor`.
+func StringToHashBucketFast(scope *Scope, input tf.Output, num_buckets int64) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"num_buckets": num_buckets}
 	opspec := tf.OpSpec{
-		Type: "FloorDiv",
+		Type: "StringToHashBucketFast",
 		Input: []tf.Input{
-			x, y,
+			input,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// TopKAttr is an optional argument to TopK.
-type TopKAttr func(optionalAttr)
+// TensorArrayGatherV3Attr is an optional argument to TensorArrayGatherV3.
+type TensorArrayGatherV3Attr func(optionalAttr)
 
-// TopKSorted sets the optional sorted attribute to value.
+// TensorArrayGatherV3ElementShape sets the optional element_shape attribute to value.
 //
-// value: If true the resulting `k` elements will be sorted by the values in
-// descending order.
-// If not specified, defaults to true
-func TopKSorted(value bool) TopKAttr {
+// value: The expected shape of an element, if known. Used to
+// validate the shapes of TensorArray elements. If this shape is not
+// fully specified, gathering zero-size TensorArrays is an error.
+// If not specified, defaults to <unknown_rank:true >
+func TensorArrayGatherV3ElementShape(value tf.Shape) TensorArrayGatherV3Attr {
 	return func(m optionalAttr) {
-		m["sorted"] = value
+		m["element_shape"] = value
 	}
 }
 
-// Finds values and indices of the `k` largest elements for the last dimension.
-//
-// DEPRECATED at GraphDef version 7: Use TopKV2 instead
-//
-// If the input is a vector (rank-1), finds the `k` largest entries in the vector
-// and outputs their values and indices as vectors.  Thus `values[j]` is the
-// `j`-th largest entry in `input`, and its index is `indices[j]`.
-//
-// For matrices (resp. higher rank input), computes the top `k` entries in each
-// row (resp. vector along the last dimension).  Thus,
-//
-//     values.shape = indices.shape = input.shape[:-1] + [k]
-//
-// If two elements are equal, the lower-index element appears first.
+// Gather specific elements from the TensorArray into output `value`.
 //
-// If `k` varies dynamically, use `TopKV2` below.
+// All elements selected by `indices` must have the same shape.
 //
 // Arguments:
-//	input: 1-D or higher with last dimension at least `k`.
-//	k: Number of top elements to look for along the last dimension (along each
-// row for matrices).
+//	handle: The handle to a TensorArray.
+//	indices: The locations in the TensorArray from which to read tensor elements.
+//	flow_in: A float scalar that enforces proper chaining of operations.
+//	dtype: The type of the elem that is returned.
 //
-// Returns The `k` largest elements along each last dimensional slice.The indices of `values` within the last dimension of `input`.
-func TopK(scope *Scope, input tf.Output, k int64, optional ...TopKAttr) (values tf.Output, indices tf.Output) {
+// Returns All of the elements in the TensorArray, concatenated along a new
+// axis (the new dimension 0).
+func TensorArrayGatherV3(scope *Scope, handle tf.Output, indices tf.Output, flow_in tf.Output, dtype tf.DataType, optional ...TensorArrayGatherV3Attr) (value tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"k": k}
+	attrs := map[string]interface{}{"dtype": dtype}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "TopK",
+		Type: "TensorArrayGatherV3",
 		Input: []tf.Input{
-			input,
+			handle, indices, flow_in,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1)
+	return op.Output(0)
 }
 
-// TopKV2Attr is an optional argument to TopKV2.
-type TopKV2Attr func(optionalAttr)
+// Deprecated. Disallowed in GraphDef version >= 2.
+//
+// DEPRECATED at GraphDef version 2: Use AdjustContrastv2 instead
+func AdjustContrast(scope *Scope, images tf.Output, contrast_factor tf.Output, min_value tf.Output, max_value tf.Output) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "AdjustContrast",
+		Input: []tf.Input{
+			images, contrast_factor, min_value, max_value,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
 
-// TopKV2Sorted sets the optional sorted attribute to value.
+// MaxPoolGradGradAttr is an optional argument to MaxPoolGradGrad.
+type MaxPoolGradGradAttr func(optionalAttr)
+
+// MaxPoolGradGradDataFormat sets the optional data_format attribute to value.
 //
-// value: If true the resulting `k` elements will be sorted by the values in
-// descending order.
-// If not specified, defaults to true
-func TopKV2Sorted(value bool) TopKV2Attr {
+// value: Specify the data format of the input and output data. With the
+// default format "NHWC", the data is stored in the order of:
+//     [batch, in_height, in_width, in_channels].
+// Alternatively, the format could be "NCHW", the data storage order of:
+//     [batch, in_channels, in_height, in_width].
+// If not specified, defaults to "NHWC"
+func MaxPoolGradGradDataFormat(value string) MaxPoolGradGradAttr {
 	return func(m optionalAttr) {
-		m["sorted"] = value
+		m["data_format"] = value
 	}
 }
 
-// Finds values and indices of the `k` largest elements for the last dimension.
-//
-// If the input is a vector (rank-1), finds the `k` largest entries in the vector
-// and outputs their values and indices as vectors.  Thus `values[j]` is the
-// `j`-th largest entry in `input`, and its index is `indices[j]`.
-//
-// For matrices (resp. higher rank input), computes the top `k` entries in each
-// row (resp. vector along the last dimension).  Thus,
-//
-//     values.shape = indices.shape = input.shape[:-1] + [k]
-//
-// If two elements are equal, the lower-index element appears first.
+// Computes second-order gradients of the maxpooling function.
 //
 // Arguments:
-//	input: 1-D or higher with last dimension at least `k`.
-//	k: 0-D.  Number of top elements to look for along the last dimension (along each
-// row for matrices).
+//	orig_input: The original input tensor.
+//	orig_output: The original output tensor.
+//	grad: 4-D.  Gradients of gradients w.r.t. the input of `max_pool`.
+//	ksize: The size of the window for each dimension of the input tensor.
+//	strides: The stride of the sliding window for each dimension of the
+// input tensor.
+//	padding: The type of padding algorithm to use.
 //
-// Returns The `k` largest elements along each last dimensional slice.The indices of `values` within the last dimension of `input`.
-func TopKV2(scope *Scope, input tf.Output, k tf.Output, optional ...TopKV2Attr) (values tf.Output, indices tf.Output) {
+// Returns Gradients of gradients w.r.t. the input to `max_pool`.
+func MaxPoolGradGrad(scope *Scope, orig_input tf.Output, orig_output tf.Output, grad tf.Output, ksize []int64, strides []int64, padding string, optional ...MaxPoolGradGradAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"ksize": ksize, "strides": strides, "padding": padding}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "TopKV2",
+		Type: "MaxPoolGradGrad",
 		Input: []tf.Input{
-			input, k,
+			orig_input, orig_output, grad,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1)
+	return op.Output(0)
 }
 
-// FractionalMaxPoolAttr is an optional argument to FractionalMaxPool.
-type FractionalMaxPoolAttr func(optionalAttr)
-
-// FractionalMaxPoolPseudoRandom sets the optional pseudo_random attribute to value.
+// 3D real-valued fast Fourier transform.
 //
-// value: When set to True, generates the pooling sequence in a
-// pseudorandom fashion, otherwise, in a random fashion. Check paper [Benjamin
-// Graham, Fractional Max-Pooling](http://arxiv.org/abs/1412.6071) for
-// difference between pseudorandom and random.
-// If not specified, defaults to false
-func FractionalMaxPoolPseudoRandom(value bool) FractionalMaxPoolAttr {
-	return func(m optionalAttr) {
-		m["pseudo_random"] = value
-	}
-}
-
-// FractionalMaxPoolOverlapping sets the optional overlapping attribute to value.
+// Computes the 3-dimensional discrete Fourier transform of a real-valued signal
+// over the inner-most 3 dimensions of `input`.
 //
-// value: When set to True, it means when pooling, the values at the boundary
-// of adjacent pooling cells are used by both cells. For example:
+// Since the DFT of a real signal is Hermitian-symmetric, `RFFT3D` only returns the
+// `fft_length / 2 + 1` unique components of the FFT for the inner-most dimension
+// of `output`: the zero-frequency term, followed by the `fft_length / 2`
+// positive-frequency terms.
 //
-// `index  0  1  2  3  4`
+// Arguments:
+//	input: A float32 tensor.
+//	fft_length: An int32 tensor of shape [3]. The FFT length for each dimension.
 //
-// `value  20 5  16 3  7`
+// Returns A complex64 tensor of the same rank as `input`. The inner-most 3
+//   dimensions of `input` are replaced with the their 3D Fourier transform. The
+//   inner-most dimension contains `fft_length / 2 + 1` unique frequency
+//   components.
 //
-// If the pooling sequence is [0, 2, 4], then 16, at index 2 will be used twice.
-// The result would be [20, 16] for fractional max pooling.
-// If not specified, defaults to false
-func FractionalMaxPoolOverlapping(value bool) FractionalMaxPoolAttr {
-	return func(m optionalAttr) {
-		m["overlapping"] = value
+// @compatibility(numpy)
+// Equivalent to np.fft.rfftn with 3 dimensions.
+// @end_compatibility
+func RFFT3D(scope *Scope, input tf.Output, fft_length tf.Output) (output tf.Output) {
+	if scope.Err() != nil {
+		return
 	}
-}
-
-// FractionalMaxPoolDeterministic sets the optional deterministic attribute to value.
-//
-// value: When set to True, a fixed pooling region will be used when
-// iterating over a FractionalMaxPool node in the computation graph. Mainly used
-// in unit test to make FractionalMaxPool deterministic.
-// If not specified, defaults to false
-func FractionalMaxPoolDeterministic(value bool) FractionalMaxPoolAttr {
-	return func(m optionalAttr) {
-		m["deterministic"] = value
+	opspec := tf.OpSpec{
+		Type: "RFFT3D",
+		Input: []tf.Input{
+			input, fft_length,
+		},
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// FractionalMaxPoolSeed sets the optional seed attribute to value.
-//
-// value: If either seed or seed2 are set to be non-zero, the random number
-// generator is seeded by the given seed.  Otherwise, it is seeded by a
-// random seed.
-// If not specified, defaults to 0
-func FractionalMaxPoolSeed(value int64) FractionalMaxPoolAttr {
-	return func(m optionalAttr) {
-		m["seed"] = value
-	}
-}
+// UniqueWithCountsAttr is an optional argument to UniqueWithCounts.
+type UniqueWithCountsAttr func(optionalAttr)
 
-// FractionalMaxPoolSeed2 sets the optional seed2 attribute to value.
-//
-// value: An second seed to avoid seed collision.
-// If not specified, defaults to 0
-func FractionalMaxPoolSeed2(value int64) FractionalMaxPoolAttr {
+// UniqueWithCountsOutIdx sets the optional out_idx attribute to value.
+// If not specified, defaults to DT_INT32
+func UniqueWithCountsOutIdx(value tf.DataType) UniqueWithCountsAttr {
 	return func(m optionalAttr) {
-		m["seed2"] = value
+		m["out_idx"] = value
 	}
 }
-
-// Performs fractional max pooling on the input.
-//
-// Fractional max pooling is slightly different than regular max pooling.  In
-// regular max pooling, you downsize an input set by taking the maximum value of
-// smaller N x N subsections of the set (often 2x2), and try to reduce the set by
-// a factor of N, where N is an integer.  Fractional max pooling, as you might
-// expect from the word "fractional", means that the overall reduction ratio N
-// does not have to be an integer.
-//
-// The sizes of the pooling regions are generated randomly but are fairly uniform.
-// For example, let's look at the height dimension, and the constraints on the
-// list of rows that will be pool boundaries.
-//
-// First we define the following:
+
+// Finds unique elements in a 1-D tensor.
 //
-// 1.  input_row_length : the number of rows from the input set
-// 2.  output_row_length : which will be smaller than the input
-// 3.  alpha = input_row_length / output_row_length : our reduction ratio
-// 4.  K = floor(alpha)
-// 5.  row_pooling_sequence : this is the result list of pool boundary rows
+// This operation returns a tensor `y` containing all of the unique elements of `x`
+// sorted in the same order that they occur in `x`. This operation also returns a
+// tensor `idx` the same size as `x` that contains the index of each value of `x`
+// in the unique output `y`. Finally, it returns a third tensor `count` that
+// contains the count of each element of `y` in `x`. In other words:
 //
-// Then, row_pooling_sequence should satisfy:
+// `y[idx[i]] = x[i] for i in [0, 1,...,rank(x) - 1]`
 //
-// 1.  a[0] = 0 : the first value of the sequence is 0
-// 2.  a[end] = input_row_length : the last value of the sequence is the size
-// 3.  K <= (a[i+1] - a[i]) <= K+1 : all intervals are K or K+1 size
-// 4.  length(row_pooling_sequence) = output_row_length+1
+// For example:
 //
-// For more details on fractional max pooling, see this paper:
-// [Benjamin Graham, Fractional Max-Pooling](http://arxiv.org/abs/1412.6071)
+// ```
+// # tensor 'x' is [1, 1, 2, 4, 4, 4, 7, 8, 8]
+// y, idx, count = unique_with_counts(x)
+// y ==> [1, 2, 4, 7, 8]
+// idx ==> [0, 0, 1, 2, 2, 2, 3, 4, 4]
+// count ==> [2, 1, 3, 1, 2]
+// ```
 //
 // Arguments:
-//	value: 4-D with shape `[batch, height, width, channels]`.
-//	pooling_ratio: Pooling ratio for each dimension of `value`, currently only
-// supports row and col dimension and should be >= 1.0. For example, a valid
-// pooling ratio looks like [1.0, 1.44, 1.73, 1.0]. The first and last elements
-// must be 1.0 because we don't allow pooling on batch and channels
-// dimensions. 1.44 and 1.73 are pooling ratio on height and width dimensions
-// respectively.
+//	x: 1-D.
 //
-// Returns output tensor after fractional max pooling.row pooling sequence, needed to calculate gradient.column pooling sequence, needed to calculate gradient.
-func FractionalMaxPool(scope *Scope, value tf.Output, pooling_ratio []float32, optional ...FractionalMaxPoolAttr) (output tf.Output, row_pooling_sequence tf.Output, col_pooling_sequence tf.Output) {
+// Returns 1-D.1-D.1-D.
+func UniqueWithCounts(scope *Scope, x tf.Output, optional ...UniqueWithCountsAttr) (y tf.Output, idx tf.Output, count tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"pooling_ratio": pooling_ratio}
+	attrs := map[string]interface{}{}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "FractionalMaxPool",
+		Type: "UniqueWithCounts",
 		Input: []tf.Input{
-			value,
+			x,
 		},
 		Attrs: attrs,
 	}
@@ -16003,48 +16631,34 @@ func FractionalMaxPool(scope *Scope, value tf.Output, pooling_ratio []float32, o
 	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// RandomCropAttr is an optional argument to RandomCrop.
-type RandomCropAttr func(optionalAttr)
-
-// RandomCropSeed sets the optional seed attribute to value.
-//
-// value: If either seed or seed2 are set to be non-zero, the random number
-// generator is seeded by the given seed.  Otherwise, it is seeded by a
-// random seed.
-// If not specified, defaults to 0
-func RandomCropSeed(value int64) RandomCropAttr {
-	return func(m optionalAttr) {
-		m["seed"] = value
-	}
-}
+// ComplexAttr is an optional argument to Complex.
+type ComplexAttr func(optionalAttr)
 
-// RandomCropSeed2 sets the optional seed2 attribute to value.
-//
-// value: An second seed to avoid seed collision.
-// If not specified, defaults to 0
-func RandomCropSeed2(value int64) RandomCropAttr {
+// ComplexTout sets the optional Tout attribute to value.
+// If not specified, defaults to DT_COMPLEX64
+func ComplexTout(value tf.DataType) ComplexAttr {
 	return func(m optionalAttr) {
-		m["seed2"] = value
+		m["Tout"] = value
 	}
 }
 
-// Randomly crop `image`.
-//
-// DEPRECATED at GraphDef version 8: Random crop is now pure Python
+// Converts two real numbers to a complex number.
 //
-// `size` is a 1-D int64 tensor with 2 elements representing the crop height and
-// width.  The values must be non negative.
+// Given a tensor `real` representing the real part of a complex number, and a
+// tensor `imag` representing the imaginary part of a complex number, this
+// operation returns complex numbers elementwise of the form \\(a + bj\\), where
+// *a* represents the `real` part and *b* represents the `imag` part.
 //
-// This Op picks a random location in `image` and crops a `height` by `width`
-// rectangle from that location.  The random location is picked so the cropped
-// area will fit inside the original image.
+// The input tensors `real` and `imag` must have the same shape.
 //
-// Arguments:
-//	image: 3-D of shape `[height, width, channels]`.
-//	size: 1-D of length 2 containing: `crop_height`, `crop_width`..
+// For example:
 //
-// Returns 3-D of shape `[crop_height, crop_width, channels].`
-func RandomCrop(scope *Scope, image tf.Output, size tf.Output, optional ...RandomCropAttr) (output tf.Output) {
+// ```
+// # tensor 'real' is [2.25, 3.25]
+// # tensor `imag` is [4.75, 5.75]
+// tf.complex(real, imag) ==> [[2.25 + 4.75j], [3.25 + 5.75j]]
+// ```
+func Complex(scope *Scope, real tf.Output, imag tf.Output, optional ...ComplexAttr) (out tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -16053,9 +16667,9 @@ func RandomCrop(scope *Scope, image tf.Output, size tf.Output, optional ...Rando
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "RandomCrop",
+		Type: "Complex",
 		Input: []tf.Input{
-			image, size,
+			real, imag,
 		},
 		Attrs: attrs,
 	}
@@ -16063,338 +16677,351 @@ func RandomCrop(scope *Scope, image tf.Output, size tf.Output, optional ...Rando
 	return op.Output(0)
 }
 
-// FractionalAvgPoolAttr is an optional argument to FractionalAvgPool.
-type FractionalAvgPoolAttr func(optionalAttr)
+// ImagAttr is an optional argument to Imag.
+type ImagAttr func(optionalAttr)
 
-// FractionalAvgPoolPseudoRandom sets the optional pseudo_random attribute to value.
-//
-// value: When set to True, generates the pooling sequence in a
-// pseudorandom fashion, otherwise, in a random fashion. Check paper [Benjamin
-// Graham, Fractional Max-Pooling](http://arxiv.org/abs/1412.6071) for
-// difference between pseudorandom and random.
-// If not specified, defaults to false
-func FractionalAvgPoolPseudoRandom(value bool) FractionalAvgPoolAttr {
+// ImagTout sets the optional Tout attribute to value.
+// If not specified, defaults to DT_FLOAT
+func ImagTout(value tf.DataType) ImagAttr {
 	return func(m optionalAttr) {
-		m["pseudo_random"] = value
+		m["Tout"] = value
 	}
 }
 
-// FractionalAvgPoolOverlapping sets the optional overlapping attribute to value.
-//
-// value: When set to True, it means when pooling, the values at the boundary
-// of adjacent pooling cells are used by both cells. For example:
+// Returns the imaginary part of a complex number.
 //
-// `index  0  1  2  3  4`
+// Given a tensor `input` of complex numbers, this operation returns a tensor of
+// type `float` that is the imaginary part of each element in `input`. All
+// elements in `input` must be complex numbers of the form \\(a + bj\\), where *a*
+// is the real part and *b* is the imaginary part returned by this operation.
 //
-// `value  20 5  16 3  7`
+// For example:
 //
-// If the pooling sequence is [0, 2, 4], then 16, at index 2 will be used twice.
-// The result would be [41/3, 26/3] for fractional avg pooling.
-// If not specified, defaults to false
-func FractionalAvgPoolOverlapping(value bool) FractionalAvgPoolAttr {
-	return func(m optionalAttr) {
-		m["overlapping"] = value
+// ```
+// # tensor 'input' is [-2.25 + 4.75j, 3.25 + 5.75j]
+// tf.imag(input) ==> [4.75, 5.75]
+// ```
+func Imag(scope *Scope, input tf.Output, optional ...ImagAttr) (output tf.Output) {
+	if scope.Err() != nil {
+		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "Imag",
+		Input: []tf.Input{
+			input,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// FractionalAvgPoolDeterministic sets the optional deterministic attribute to value.
+// Computes fingerprints of the input strings.
 //
-// value: When set to True, a fixed pooling region will be used when
-// iterating over a FractionalAvgPool node in the computation graph. Mainly used
-// in unit test to make FractionalAvgPool deterministic.
-// If not specified, defaults to false
-func FractionalAvgPoolDeterministic(value bool) FractionalAvgPoolAttr {
-	return func(m optionalAttr) {
-		m["deterministic"] = value
+// Arguments:
+//	input: vector of strings to compute fingerprints on.
+//
+// Returns a (N,2) shaped matrix where N is the number of elements in the input
+// vector. Each row contains the low and high parts of the fingerprint.
+func SdcaFprint(scope *Scope, input tf.Output) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "SdcaFprint",
+		Input: []tf.Input{
+			input,
+		},
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// FractionalAvgPoolSeed sets the optional seed attribute to value.
+// Returns the number of records this Reader has produced.
 //
-// value: If either seed or seed2 are set to be non-zero, the random number
-// generator is seeded by the given seed.  Otherwise, it is seeded by a
-// random seed.
-// If not specified, defaults to 0
-func FractionalAvgPoolSeed(value int64) FractionalAvgPoolAttr {
-	return func(m optionalAttr) {
-		m["seed"] = value
+// This is the same as the number of ReaderRead executions that have
+// succeeded.
+//
+// Arguments:
+//	reader_handle: Handle to a Reader.
+func ReaderNumRecordsProducedV2(scope *Scope, reader_handle tf.Output) (records_produced tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "ReaderNumRecordsProducedV2",
+		Input: []tf.Input{
+			reader_handle,
+		},
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// FractionalAvgPoolSeed2 sets the optional seed2 attribute to value.
+// Computes exponential of x - 1 element-wise.
 //
-// value: An second seed to avoid seed collision.
-// If not specified, defaults to 0
-func FractionalAvgPoolSeed2(value int64) FractionalAvgPoolAttr {
-	return func(m optionalAttr) {
-		m["seed2"] = value
+// I.e., \\(y = (\exp x) - 1\\).
+func Expm1(scope *Scope, x tf.Output) (y tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "Expm1",
+		Input: []tf.Input{
+			x,
+		},
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// Performs fractional average pooling on the input.
-//
-// Fractional average pooling is similar to Fractional max pooling in the pooling
-// region generation step. The only difference is that after pooling regions are
-// generated, a mean operation is performed instead of a max operation in each
-// pooling region.
-//
-// Arguments:
-//	value: 4-D with shape `[batch, height, width, channels]`.
-//	pooling_ratio: Pooling ratio for each dimension of `value`, currently only
-// supports row and col dimension and should be >= 1.0. For example, a valid
-// pooling ratio looks like [1.0, 1.44, 1.73, 1.0]. The first and last elements
-// must be 1.0 because we don't allow pooling on batch and channels
-// dimensions. 1.44 and 1.73 are pooling ratio on height and width dimensions
-// respectively.
+// Returns x - y element-wise.
 //
-// Returns output tensor after fractional avg pooling.row pooling sequence, needed to calculate gradient.column pooling sequence, needed to calculate gradient.
-func FractionalAvgPool(scope *Scope, value tf.Output, pooling_ratio []float32, optional ...FractionalAvgPoolAttr) (output tf.Output, row_pooling_sequence tf.Output, col_pooling_sequence tf.Output) {
+// *NOTE*: `Sub` supports broadcasting. More about broadcasting
+// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+func Sub(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"pooling_ratio": pooling_ratio}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "FractionalAvgPool",
+		Type: "Sub",
 		Input: []tf.Input{
-			value,
+			x, y,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
+	return op.Output(0)
 }
 
-// Produces the average pool of the input tensor for quantized types.
+// Split elements of `input` based on `delimiter` into a `SparseTensor`.
+//
+// Let N be the size of source (typically N will be the batch size). Split each
+// element of `input` based on `delimiter` and return a `SparseTensor`
+// containing the splitted tokens. Empty tokens are ignored.
+//
+// `delimiter` can be empty, or a string of split characters. If `delimiter` is an
+//  empty string, each element of `input` is split into individual single-byte
+//  character strings, including splitting of UTF-8 multibyte sequences. Otherwise
+//  every character of `delimiter` is a potential split point.
+//
+// For example:
+//   N = 2, input[0] is 'hello world' and input[1] is 'a b c', then the output
+//   will be
+//
+//   indices = [0, 0;
+//              0, 1;
+//              1, 0;
+//              1, 1;
+//              1, 2]
+//   shape = [2, 3]
+//   values = ['hello', 'world', 'a', 'b', 'c']
 //
 // Arguments:
-//	input: 4-D with shape `[batch, height, width, channels]`.
-//	min_input: The float value that the lowest quantized input value represents.
-//	max_input: The float value that the highest quantized input value represents.
-//	ksize: The size of the window for each dimension of the input tensor.
-// The length must be 4 to match the number of dimensions of the input.
-//	strides: The stride of the sliding window for each dimension of the input
-// tensor.  The length must be 4 to match the number of dimensions of the input.
-//	padding: The type of padding algorithm to use.
+//	input: 1-D. Strings to split.
+//	delimiter: 0-D. Delimiter characters (bytes), or empty string.
 //
-// Returns The float value that the lowest quantized output value represents.The float value that the highest quantized output value represents.
-func QuantizedAvgPool(scope *Scope, input tf.Output, min_input tf.Output, max_input tf.Output, ksize []int64, strides []int64, padding string) (output tf.Output, min_output tf.Output, max_output tf.Output) {
+// Returns A dense matrix of int64 representing the indices of the sparse tensor.A vector of strings corresponding to the splited values.a length-2 vector of int64 representing the shape of the sparse
+// tensor, where the first value is N and the second value is the maximum number
+// of tokens in a single input entry.
+func StringSplit(scope *Scope, input tf.Output, delimiter tf.Output) (indices tf.Output, values tf.Output, shape tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"ksize": ksize, "strides": strides, "padding": padding}
 	opspec := tf.OpSpec{
-		Type: "QuantizedAvgPool",
+		Type: "StringSplit",
 		Input: []tf.Input{
-			input, min_input, max_input,
+			input, delimiter,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// Adds Tensor 'bias' to Tensor 'input' for Quantized types.
+// Inverse 3D real-valued fast Fourier transform.
 //
-// Broadcasts the values of bias on dimensions 0..N-2 of 'input'.
+// Computes the inverse 3-dimensional discrete Fourier transform of a real-valued
+// signal over the inner-most 3 dimensions of `input`.
 //
-// Arguments:
+// The inner-most 3 dimensions of `input` are assumed to be the result of `RFFT3D`:
+// The inner-most dimension contains the `fft_length / 2 + 1` unique components of
+// the DFT of a real-valued signal. If `fft_length` is not provided, it is computed
+// from the size of the inner-most 3 dimensions of `input`. If the FFT length used
+// to compute `input` is odd, it should be provided since it cannot be inferred
+// properly.
 //
-//	bias: A 1D bias Tensor with size matching the last dimension of 'input'.
-//	min_input: The float value that the lowest quantized input value represents.
-//	max_input: The float value that the highest quantized input value represents.
-//	min_bias: The float value that the lowest quantized bias value represents.
-//	max_bias: The float value that the highest quantized bias value represents.
+// Arguments:
+//	input: A complex64 tensor.
+//	fft_length: An int32 tensor of shape [3]. The FFT length for each dimension.
 //
+// Returns A float32 tensor of the same rank as `input`. The inner-most 3
+//   dimensions of `input` are replaced with the `fft_length` samples of their
+//   inverse 3D real Fourier transform.
 //
-// Returns The float value that the lowest quantized output value represents.The float value that the highest quantized output value represents.
-func QuantizedBiasAdd(scope *Scope, input tf.Output, bias tf.Output, min_input tf.Output, max_input tf.Output, min_bias tf.Output, max_bias tf.Output, out_type tf.DataType) (output tf.Output, min_out tf.Output, max_out tf.Output) {
+// @compatibility(numpy)
+// Equivalent to np.irfftn with 3 dimensions.
+// @end_compatibility
+func IRFFT3D(scope *Scope, input tf.Output, fft_length tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"out_type": out_type}
 	opspec := tf.OpSpec{
-		Type: "QuantizedBiasAdd",
+		Type: "IRFFT3D",
 		Input: []tf.Input{
-			input, bias, min_input, max_input, min_bias, max_bias,
+			input, fft_length,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
-}
-
-// QuantizedConv2DAttr is an optional argument to QuantizedConv2D.
-type QuantizedConv2DAttr func(optionalAttr)
-
-// QuantizedConv2DOutType sets the optional out_type attribute to value.
-// If not specified, defaults to DT_QINT32
-func QuantizedConv2DOutType(value tf.DataType) QuantizedConv2DAttr {
-	return func(m optionalAttr) {
-		m["out_type"] = value
-	}
+	return op.Output(0)
 }
 
-// Computes a 2D convolution given quantized 4D input and filter tensors.
-//
-// The inputs are quantized tensors where the lowest value represents the real
-// number of the associated minimum, and the highest represents the maximum.
-// This means that you can only interpret the quantized output in the same way, by
-// taking the returned minimum and maximum values into account.
-//
-// Arguments:
-//
-//	filter: filter's input_depth dimension must match input's depth dimensions.
-//	min_input: The float value that the lowest quantized input value represents.
-//	max_input: The float value that the highest quantized input value represents.
-//	min_filter: The float value that the lowest quantized filter value represents.
-//	max_filter: The float value that the highest quantized filter value represents.
-//	strides: The stride of the sliding window for each dimension of the input
-// tensor.
-//	padding: The type of padding algorithm to use.
+// Returns the truth value of (x != y) element-wise.
 //
-// Returns The float value that the lowest quantized output value represents.The float value that the highest quantized output value represents.
-func QuantizedConv2D(scope *Scope, input tf.Output, filter tf.Output, min_input tf.Output, max_input tf.Output, min_filter tf.Output, max_filter tf.Output, strides []int64, padding string, optional ...QuantizedConv2DAttr) (output tf.Output, min_output tf.Output, max_output tf.Output) {
+// *NOTE*: `NotEqual` supports broadcasting. More about broadcasting
+// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+func NotEqual(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"strides": strides, "padding": padding}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "QuantizedConv2D",
+		Type: "NotEqual",
 		Input: []tf.Input{
-			input, filter, min_input, max_input, min_filter, max_filter,
+			x, y,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
+	return op.Output(0)
 }
 
-// Quantized Batch normalization.
+// Says whether the targets are in the top `K` predictions.
 //
-// This op is deprecated and will be removed in the future. Prefer
-// `tf.nn.batch_normalization`.
+// This outputs a `batch_size` bool array, an entry `out[i]` is `true` if the
+// prediction for the target class is among the top `k` predictions among
+// all predictions for example `i`. Note that the behavior of `InTopK` differs
+// from the `TopK` op in its handling of ties; if multiple classes have the
+// same prediction value and straddle the top-`k` boundary, all of those
+// classes are considered to be in the top `k`.
+//
+// More formally, let
+//
+//   \\(predictions_i\\) be the predictions for all classes for example `i`,
+//   \\(targets_i\\) be the target class for example `i`,
+//   \\(out_i\\) be the output for example `i`,
+//
+// $$out_i = predictions_{i, targets_i} \in TopKIncludingTies(predictions_i)$$
 //
 // Arguments:
-//	t: A 4D input Tensor.
-//	t_min: The value represented by the lowest quantized input.
-//	t_max: The value represented by the highest quantized input.
-//	m: A 1D mean Tensor with size matching the last dimension of t.
-// This is the first output from tf.nn.moments,
-// or a saved moving average thereof.
-//	m_min: The value represented by the lowest quantized mean.
-//	m_max: The value represented by the highest quantized mean.
-//	v: A 1D variance Tensor with size matching the last dimension of t.
-// This is the second output from tf.nn.moments,
-// or a saved moving average thereof.
-//	v_min: The value represented by the lowest quantized variance.
-//	v_max: The value represented by the highest quantized variance.
-//	beta: A 1D beta Tensor with size matching the last dimension of t.
-// An offset to be added to the normalized tensor.
-//	beta_min: The value represented by the lowest quantized offset.
-//	beta_max: The value represented by the highest quantized offset.
-//	gamma: A 1D gamma Tensor with size matching the last dimension of t.
-// If "scale_after_normalization" is true, this tensor will be multiplied
-// with the normalized tensor.
-//	gamma_min: The value represented by the lowest quantized gamma.
-//	gamma_max: The value represented by the highest quantized gamma.
+//	predictions: A `batch_size` x `classes` tensor.
+//	targets: A `batch_size` vector of class ids.
+//	k: Number of top elements to look at for computing precision.
 //
-//	variance_epsilon: A small float number to avoid dividing by 0.
-//	scale_after_normalization: A bool indicating whether the resulted tensor
-// needs to be multiplied with gamma.
-func QuantizedBatchNormWithGlobalNormalization(scope *Scope, t tf.Output, t_min tf.Output, t_max tf.Output, m tf.Output, m_min tf.Output, m_max tf.Output, v tf.Output, v_min tf.Output, v_max tf.Output, beta tf.Output, beta_min tf.Output, beta_max tf.Output, gamma tf.Output, gamma_min tf.Output, gamma_max tf.Output, out_type tf.DataType, variance_epsilon float32, scale_after_normalization bool) (result tf.Output, result_min tf.Output, result_max tf.Output) {
+// Returns Computed Precision at `k` as a `bool Tensor`.
+func InTopK(scope *Scope, predictions tf.Output, targets tf.Output, k int64) (precision tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"out_type": out_type, "variance_epsilon": variance_epsilon, "scale_after_normalization": scale_after_normalization}
+	attrs := map[string]interface{}{"k": k}
 	opspec := tf.OpSpec{
-		Type: "QuantizedBatchNormWithGlobalNormalization",
+		Type: "InTopK",
 		Input: []tf.Input{
-			t, t_min, t_max, m, m_min, m_max, v, v_min, v_max, beta, beta_min, beta_max, gamma, gamma_min, gamma_max,
+			predictions, targets,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
+	return op.Output(0)
 }
 
-// Add all input tensors element wise.
+// Returns a batched diagonal tensor with a given batched diagonal values.
+//
+// Given a `diagonal`, this operation returns a tensor with the `diagonal` and
+// everything else padded with zeros. The diagonal is computed as follows:
+//
+// Assume `diagonal` has `k` dimensions `[I, J, K, ..., N]`, then the output is a
+// tensor of rank `k+1` with dimensions [I, J, K, ..., N, N]` where:
+//
+// `output[i, j, k, ..., m, n] = 1{m=n} * diagonal[i, j, k, ..., n]`.
+//
+// For example:
+//
+// ```
+// # 'diagonal' is [[1, 2, 3, 4], [5, 6, 7, 8]]
+//
+// and diagonal.shape = (2, 4)
+//
+// tf.matrix_diag(diagonal) ==> [[[1, 0, 0, 0]
+//                                      [0, 2, 0, 0]
+//                                      [0, 0, 3, 0]
+//                                      [0, 0, 0, 4]],
+//                                     [[5, 0, 0, 0]
+//                                      [0, 6, 0, 0]
+//                                      [0, 0, 7, 0]
+//                                      [0, 0, 0, 8]]]
+//
+// which has shape (2, 4, 4)
+// ```
 //
 // Arguments:
-//	inputs: Must all be the same size and shape.
-func AddN(scope *Scope, inputs []tf.Output) (sum tf.Output) {
+//	diagonal: Rank `k`, where `k >= 1`.
+//
+// Returns Rank `k+1`, with `output.shape = diagonal.shape + [diagonal.shape[-1]]`.
+func MatrixDiag(scope *Scope, diagonal tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "AddN",
+		Type: "MatrixDiag",
 		Input: []tf.Input{
-			tf.OutputList(inputs),
+			diagonal,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// MaxAttr is an optional argument to Max.
-type MaxAttr func(optionalAttr)
+// MaxPool3DAttr is an optional argument to MaxPool3D.
+type MaxPool3DAttr func(optionalAttr)
 
-// MaxKeepDims sets the optional keep_dims attribute to value.
+// MaxPool3DDataFormat sets the optional data_format attribute to value.
 //
-// value: If true, retain reduced dimensions with length 1.
-// If not specified, defaults to false
-func MaxKeepDims(value bool) MaxAttr {
+// value: The data format of the input and output data. With the
+// default format "NDHWC", the data is stored in the order of:
+//     [batch, in_depth, in_height, in_width, in_channels].
+// Alternatively, the format could be "NCDHW", the data storage order is:
+//     [batch, in_channels, in_depth, in_height, in_width].
+// If not specified, defaults to "NDHWC"
+func MaxPool3DDataFormat(value string) MaxPool3DAttr {
 	return func(m optionalAttr) {
-		m["keep_dims"] = value
+		m["data_format"] = value
 	}
 }
 
-// Computes the maximum of elements across dimensions of a tensor.
-//
-// Reduces `input` along the dimensions given in `reduction_indices`. Unless
-// `keep_dims` is true, the rank of the tensor is reduced by 1 for each entry in
-// `reduction_indices`. If `keep_dims` is true, the reduced dimensions are
-// retained with length 1.
+// Performs 3D max pooling on the input.
 //
 // Arguments:
-//	input: The tensor to reduce.
-//	reduction_indices: The dimensions to reduce.
+//	input: Shape `[batch, depth, rows, cols, channels]` tensor to pool over.
+//	ksize: 1-D tensor of length 5. The size of the window for each dimension of
+// the input tensor. Must have `ksize[0] = ksize[4] = 1`.
+//	strides: 1-D tensor of length 5. The stride of the sliding window for each
+// dimension of `input`. Must have `strides[0] = strides[4] = 1`.
+//	padding: The type of padding algorithm to use.
 //
-// Returns The reduced tensor.
-func Max(scope *Scope, input tf.Output, reduction_indices tf.Output, optional ...MaxAttr) (output tf.Output) {
+// Returns The max pooled output tensor.
+func MaxPool3D(scope *Scope, input tf.Output, ksize []int64, strides []int64, padding string, optional ...MaxPool3DAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"ksize": ksize, "strides": strides, "padding": padding}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "Max",
-		Input: []tf.Input{
-			input, reduction_indices,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Cast x of type SrcT to y of DstT.
-func Cast(scope *Scope, x tf.Output, DstT tf.DataType) (y tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"DstT": DstT}
-	opspec := tf.OpSpec{
-		Type: "Cast",
+		Type: "MaxPool3D",
 		Input: []tf.Input{
-			x,
+			input,
 		},
 		Attrs: attrs,
 	}
@@ -16402,16 +17029,16 @@ func Cast(scope *Scope, x tf.Output, DstT tf.DataType) (y tf.Output) {
 	return op.Output(0)
 }
 
-// Returns the truth value of x AND y element-wise.
+// Returns x // y element-wise.
 //
-// *NOTE*: `LogicalAnd` supports broadcasting. More about broadcasting
+// *NOTE*: `FloorDiv` supports broadcasting. More about broadcasting
 // [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-func LogicalAnd(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+func FloorDiv(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "LogicalAnd",
+		Type: "FloorDiv",
 		Input: []tf.Input{
 			x, y,
 		},
@@ -16420,622 +17047,648 @@ func LogicalAnd(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
 	return op.Output(0)
 }
 
-// ComplexAbsAttr is an optional argument to ComplexAbs.
-type ComplexAbsAttr func(optionalAttr)
+// TopKAttr is an optional argument to TopK.
+type TopKAttr func(optionalAttr)
 
-// ComplexAbsTout sets the optional Tout attribute to value.
-// If not specified, defaults to DT_FLOAT
-func ComplexAbsTout(value tf.DataType) ComplexAbsAttr {
+// TopKSorted sets the optional sorted attribute to value.
+//
+// value: If true the resulting `k` elements will be sorted by the values in
+// descending order.
+// If not specified, defaults to true
+func TopKSorted(value bool) TopKAttr {
 	return func(m optionalAttr) {
-		m["Tout"] = value
+		m["sorted"] = value
 	}
 }
 
-// Computes the complex absolute value of a tensor.
+// Finds values and indices of the `k` largest elements for the last dimension.
 //
-// Given a tensor `x` of complex numbers, this operation returns a tensor of type
-// `float` or `double` that is the absolute value of each element in `x`. All
-// elements in `x` must be complex numbers of the form \\(a + bj\\). The absolute
-// value is computed as \\( \sqrt{a^2 + b^2}\\).
-func ComplexAbs(scope *Scope, x tf.Output, optional ...ComplexAbsAttr) (y tf.Output) {
+// DEPRECATED at GraphDef version 7: Use TopKV2 instead
+//
+// If the input is a vector (rank-1), finds the `k` largest entries in the vector
+// and outputs their values and indices as vectors.  Thus `values[j]` is the
+// `j`-th largest entry in `input`, and its index is `indices[j]`.
+//
+// For matrices (resp. higher rank input), computes the top `k` entries in each
+// row (resp. vector along the last dimension).  Thus,
+//
+//     values.shape = indices.shape = input.shape[:-1] + [k]
+//
+// If two elements are equal, the lower-index element appears first.
+//
+// If `k` varies dynamically, use `TopKV2` below.
+//
+// Arguments:
+//	input: 1-D or higher with last dimension at least `k`.
+//	k: Number of top elements to look for along the last dimension (along each
+// row for matrices).
+//
+// Returns The `k` largest elements along each last dimensional slice.The indices of `values` within the last dimension of `input`.
+func TopK(scope *Scope, input tf.Output, k int64, optional ...TopKAttr) (values tf.Output, indices tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"k": k}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "ComplexAbs",
+		Type: "TopK",
 		Input: []tf.Input{
-			x,
+			input,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1)
 }
 
-// Draw bounding boxes on a batch of images.
+// TopKV2Attr is an optional argument to TopKV2.
+type TopKV2Attr func(optionalAttr)
+
+// TopKV2Sorted sets the optional sorted attribute to value.
 //
-// Outputs a copy of `images` but draws on top of the pixels zero or more bounding
-// boxes specified by the locations in `boxes`. The coordinates of the each
-// bounding box in `boxes` are encoded as `[y_min, x_min, y_max, x_max]`. The
-// bounding box coordinates are floats in `[0.0, 1.0]` relative to the width and
-// height of the underlying image.
+// value: If true the resulting `k` elements will be sorted by the values in
+// descending order.
+// If not specified, defaults to true
+func TopKV2Sorted(value bool) TopKV2Attr {
+	return func(m optionalAttr) {
+		m["sorted"] = value
+	}
+}
+
+// Finds values and indices of the `k` largest elements for the last dimension.
 //
-// For example, if an image is 100 x 200 pixels and the bounding box is
-// `[0.1, 0.2, 0.5, 0.9]`, the bottom-left and upper-right coordinates of the
-// bounding box will be `(10, 40)` to `(50, 180)`.
+// If the input is a vector (rank-1), finds the `k` largest entries in the vector
+// and outputs their values and indices as vectors.  Thus `values[j]` is the
+// `j`-th largest entry in `input`, and its index is `indices[j]`.
 //
-// Parts of the bounding box may fall outside the image.
+// For matrices (resp. higher rank input), computes the top `k` entries in each
+// row (resp. vector along the last dimension).  Thus,
+//
+//     values.shape = indices.shape = input.shape[:-1] + [k]
+//
+// If two elements are equal, the lower-index element appears first.
 //
 // Arguments:
-//	images: 4-D with shape `[batch, height, width, depth]`. A batch of images.
-//	boxes: 3-D with shape `[batch, num_bounding_boxes, 4]` containing bounding
-// boxes.
+//	input: 1-D or higher with last dimension at least `k`.
+//	k: 0-D.  Number of top elements to look for along the last dimension (along each
+// row for matrices).
 //
-// Returns 4-D with the same shape as `images`. The batch of input images with
-// bounding boxes drawn on the images.
-func DrawBoundingBoxes(scope *Scope, images tf.Output, boxes tf.Output) (output tf.Output) {
+// Returns The `k` largest elements along each last dimensional slice.The indices of `values` within the last dimension of `input`.
+func TopKV2(scope *Scope, input tf.Output, k tf.Output, optional ...TopKV2Attr) (values tf.Output, indices tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "DrawBoundingBoxes",
+		Type: "TopKV2",
 		Input: []tf.Input{
-			images, boxes,
+			input, k,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1)
 }
 
-// Returns the element-wise max of two SparseTensors.
+// RandomCropAttr is an optional argument to RandomCrop.
+type RandomCropAttr func(optionalAttr)
+
+// RandomCropSeed sets the optional seed attribute to value.
 //
-// Assumes the two SparseTensors have the same shape, i.e., no broadcasting.
+// value: If either seed or seed2 are set to be non-zero, the random number
+// generator is seeded by the given seed.  Otherwise, it is seeded by a
+// random seed.
+// If not specified, defaults to 0
+func RandomCropSeed(value int64) RandomCropAttr {
+	return func(m optionalAttr) {
+		m["seed"] = value
+	}
+}
+
+// RandomCropSeed2 sets the optional seed2 attribute to value.
+//
+// value: An second seed to avoid seed collision.
+// If not specified, defaults to 0
+func RandomCropSeed2(value int64) RandomCropAttr {
+	return func(m optionalAttr) {
+		m["seed2"] = value
+	}
+}
+
+// Randomly crop `image`.
+//
+// DEPRECATED at GraphDef version 8: Random crop is now pure Python
+//
+// `size` is a 1-D int64 tensor with 2 elements representing the crop height and
+// width.  The values must be non negative.
+//
+// This Op picks a random location in `image` and crops a `height` by `width`
+// rectangle from that location.  The random location is picked so the cropped
+// area will fit inside the original image.
 //
 // Arguments:
-//	a_indices: 2-D.  `N x R` matrix with the indices of non-empty values in a
-// SparseTensor, in the canonical lexicographic ordering.
-//	a_values: 1-D.  `N` non-empty values corresponding to `a_indices`.
-//	a_shape: 1-D.  Shape of the input SparseTensor.
-//	b_indices: counterpart to `a_indices` for the other operand.
-//	b_values: counterpart to `a_values` for the other operand; must be of the same dtype.
-//	b_shape: counterpart to `a_shape` for the other operand; the two shapes must be equal.
+//	image: 3-D of shape `[height, width, channels]`.
+//	size: 1-D of length 2 containing: `crop_height`, `crop_width`..
 //
-// Returns 2-D.  The indices of the output SparseTensor.1-D.  The values of the output SparseTensor.
-func SparseSparseMaximum(scope *Scope, a_indices tf.Output, a_values tf.Output, a_shape tf.Output, b_indices tf.Output, b_values tf.Output, b_shape tf.Output) (output_indices tf.Output, output_values tf.Output) {
+// Returns 3-D of shape `[crop_height, crop_width, channels].`
+func RandomCrop(scope *Scope, image tf.Output, size tf.Output, optional ...RandomCropAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "SparseSparseMaximum",
+		Type: "RandomCrop",
 		Input: []tf.Input{
-			a_indices, a_values, a_shape, b_indices, b_values, b_shape,
+			image, size,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1)
+	return op.Output(0)
 }
 
-// Computes the gradient for the inverse of `x` wrt its input.
+// FractionalAvgPoolAttr is an optional argument to FractionalAvgPool.
+type FractionalAvgPoolAttr func(optionalAttr)
+
+// FractionalAvgPoolPseudoRandom sets the optional pseudo_random attribute to value.
 //
-// DEPRECATED at GraphDef version 17: Use ReciprocalGrad
+// value: When set to True, generates the pooling sequence in a
+// pseudorandom fashion, otherwise, in a random fashion. Check paper [Benjamin
+// Graham, Fractional Max-Pooling](http://arxiv.org/abs/1412.6071) for
+// difference between pseudorandom and random.
+// If not specified, defaults to false
+func FractionalAvgPoolPseudoRandom(value bool) FractionalAvgPoolAttr {
+	return func(m optionalAttr) {
+		m["pseudo_random"] = value
+	}
+}
+
+// FractionalAvgPoolOverlapping sets the optional overlapping attribute to value.
 //
-// Specifically, `grad = -dy * y*y`, where `y = 1/x`, and `dy`
-// is the corresponding input gradient.
-func InvGrad(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
-	if scope.Err() != nil {
-		return
+// value: When set to True, it means when pooling, the values at the boundary
+// of adjacent pooling cells are used by both cells. For example:
+//
+// `index  0  1  2  3  4`
+//
+// `value  20 5  16 3  7`
+//
+// If the pooling sequence is [0, 2, 4], then 16, at index 2 will be used twice.
+// The result would be [41/3, 26/3] for fractional avg pooling.
+// If not specified, defaults to false
+func FractionalAvgPoolOverlapping(value bool) FractionalAvgPoolAttr {
+	return func(m optionalAttr) {
+		m["overlapping"] = value
 	}
-	opspec := tf.OpSpec{
-		Type: "InvGrad",
-		Input: []tf.Input{
-			x, y,
-		},
+}
+
+// FractionalAvgPoolDeterministic sets the optional deterministic attribute to value.
+//
+// value: When set to True, a fixed pooling region will be used when
+// iterating over a FractionalAvgPool node in the computation graph. Mainly used
+// in unit test to make FractionalAvgPool deterministic.
+// If not specified, defaults to false
+func FractionalAvgPoolDeterministic(value bool) FractionalAvgPoolAttr {
+	return func(m optionalAttr) {
+		m["deterministic"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// Computes the reciprocal of x element-wise.
+// FractionalAvgPoolSeed sets the optional seed attribute to value.
 //
-// I.e., \\(y = 1 / x\\).
-func Reciprocal(scope *Scope, x tf.Output) (y tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "Reciprocal",
-		Input: []tf.Input{
-			x,
-		},
+// value: If either seed or seed2 are set to be non-zero, the random number
+// generator is seeded by the given seed.  Otherwise, it is seeded by a
+// random seed.
+// If not specified, defaults to 0
+func FractionalAvgPoolSeed(value int64) FractionalAvgPoolAttr {
+	return func(m optionalAttr) {
+		m["seed"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// Computes exponential linear: `exp(features) - 1` if < 0, `features` otherwise.
+// FractionalAvgPoolSeed2 sets the optional seed2 attribute to value.
 //
-// See [Fast and Accurate Deep Network Learning by Exponential Linear Units (ELUs)
-// ](http://arxiv.org/abs/1511.07289)
-func Elu(scope *Scope, features tf.Output) (activations tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "Elu",
-		Input: []tf.Input{
-			features,
-		},
+// value: An second seed to avoid seed collision.
+// If not specified, defaults to 0
+func FractionalAvgPoolSeed2(value int64) FractionalAvgPoolAttr {
+	return func(m optionalAttr) {
+		m["seed2"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// Computes square of x element-wise.
+// Performs fractional average pooling on the input.
 //
-// I.e., \\(y = x * x = x^2\\).
-func Square(scope *Scope, x tf.Output) (y tf.Output) {
+// Fractional average pooling is similar to Fractional max pooling in the pooling
+// region generation step. The only difference is that after pooling regions are
+// generated, a mean operation is performed instead of a max operation in each
+// pooling region.
+//
+// Arguments:
+//	value: 4-D with shape `[batch, height, width, channels]`.
+//	pooling_ratio: Pooling ratio for each dimension of `value`, currently only
+// supports row and col dimension and should be >= 1.0. For example, a valid
+// pooling ratio looks like [1.0, 1.44, 1.73, 1.0]. The first and last elements
+// must be 1.0 because we don't allow pooling on batch and channels
+// dimensions. 1.44 and 1.73 are pooling ratio on height and width dimensions
+// respectively.
+//
+// Returns output tensor after fractional avg pooling.row pooling sequence, needed to calculate gradient.column pooling sequence, needed to calculate gradient.
+func FractionalAvgPool(scope *Scope, value tf.Output, pooling_ratio []float32, optional ...FractionalAvgPoolAttr) (output tf.Output, row_pooling_sequence tf.Output, col_pooling_sequence tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"pooling_ratio": pooling_ratio}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "Square",
+		Type: "FractionalAvgPool",
 		Input: []tf.Input{
-			x,
+			value,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// Returns element-wise remainder of division. When `x < 0` xor `y < 0` is
+// Updates the table to associates keys with values.
 //
-// true, this follows Python semantics in that the result here is consistent
-// with a flooring divide. E.g. `floor(x / y) * y + mod(x, y) = x`.
+// The tensor `keys` must be of the same type as the keys of the table.
+// The tensor `values` must be of the type of the table values.
 //
-// *NOTE*: `FloorMod` supports broadcasting. More about broadcasting
-// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-func FloorMod(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+// Arguments:
+//	table_handle: Handle to the table.
+//	keys: Any shape.  Keys to look up.
+//	values: Values to associate with keys.
+//
+// Returns the created operation.
+func LookupTableInsertV2(scope *Scope, table_handle tf.Output, keys tf.Output, values tf.Output) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "FloorMod",
+		Type: "LookupTableInsertV2",
 		Input: []tf.Input{
-			x, y,
+			table_handle, keys, values,
 		},
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return scope.AddOperation(opspec)
 }
 
-// Computes square root of x element-wise.
+// Produces the average pool of the input tensor for quantized types.
 //
-// I.e., \\(y = \sqrt{x} = x^{1/2}\\).
-func Sqrt(scope *Scope, x tf.Output) (y tf.Output) {
+// Arguments:
+//	input: 4-D with shape `[batch, height, width, channels]`.
+//	min_input: The float value that the lowest quantized input value represents.
+//	max_input: The float value that the highest quantized input value represents.
+//	ksize: The size of the window for each dimension of the input tensor.
+// The length must be 4 to match the number of dimensions of the input.
+//	strides: The stride of the sliding window for each dimension of the input
+// tensor.  The length must be 4 to match the number of dimensions of the input.
+//	padding: The type of padding algorithm to use.
+//
+// Returns The float value that the lowest quantized output value represents.The float value that the highest quantized output value represents.
+func QuantizedAvgPool(scope *Scope, input tf.Output, min_input tf.Output, max_input tf.Output, ksize []int64, strides []int64, padding string) (output tf.Output, min_output tf.Output, max_output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"ksize": ksize, "strides": strides, "padding": padding}
 	opspec := tf.OpSpec{
-		Type: "Sqrt",
+		Type: "QuantizedAvgPool",
 		Input: []tf.Input{
-			x,
+			input, min_input, max_input,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// Inserts a dimension of 1 into a tensor's shape.
-//
-// Given a tensor `input`, this operation inserts a dimension of 1 at the
-// dimension index `dim` of `input`'s shape. The dimension index `dim` starts at
-// zero; if you specify a negative number for `dim` it is counted backward from
-// the end.
-//
-// This operation is useful if you want to add a batch dimension to a single
-// element. For example, if you have a single image of shape `[height, width,
-// channels]`, you can make it a batch of 1 image with `expand_dims(image, 0)`,
-// which will make the shape `[1, height, width, channels]`.
-//
-// Other examples:
-//
-// ```prettyprint
-// # 't' is a tensor of shape [2]
-// shape(expand_dims(t, 0)) ==> [1, 2]
-// shape(expand_dims(t, 1)) ==> [2, 1]
-// shape(expand_dims(t, -1)) ==> [2, 1]
-//
-// # 't2' is a tensor of shape [2, 3, 5]
-// shape(expand_dims(t2, 0)) ==> [1, 2, 3, 5]
-// shape(expand_dims(t2, 2)) ==> [2, 3, 1, 5]
-// shape(expand_dims(t2, 3)) ==> [2, 3, 5, 1]
-// ```
-//
-// This operation requires that:
-//
-// `-1-input.dims() <= dim <= input.dims()`
+// Adds Tensor 'bias' to Tensor 'input' for Quantized types.
 //
-// This operation is related to `squeeze()`, which removes dimensions of
-// size 1.
+// Broadcasts the values of bias on dimensions 0..N-2 of 'input'.
 //
 // Arguments:
 //
-//	dim: 0-D (scalar). Specifies the dimension index at which to
-// expand the shape of `input`.
+//	bias: A 1D bias Tensor with size matching the last dimension of 'input'.
+//	min_input: The float value that the lowest quantized input value represents.
+//	max_input: The float value that the highest quantized input value represents.
+//	min_bias: The float value that the lowest quantized bias value represents.
+//	max_bias: The float value that the highest quantized bias value represents.
 //
-// Returns Contains the same data as `input`, but its shape has an additional
-// dimension of size 1 added.
-func ExpandDims(scope *Scope, input tf.Output, dim tf.Output) (output tf.Output) {
+//
+// Returns The float value that the lowest quantized output value represents.The float value that the highest quantized output value represents.
+func QuantizedBiasAdd(scope *Scope, input tf.Output, bias tf.Output, min_input tf.Output, max_input tf.Output, min_bias tf.Output, max_bias tf.Output, out_type tf.DataType) (output tf.Output, min_out tf.Output, max_out tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"out_type": out_type}
 	opspec := tf.OpSpec{
-		Type: "ExpandDims",
+		Type: "QuantizedBiasAdd",
 		Input: []tf.Input{
-			input, dim,
+			input, bias, min_input, max_input, min_bias, max_bias,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// AllAttr is an optional argument to All.
-type AllAttr func(optionalAttr)
+// QuantizedConv2DAttr is an optional argument to QuantizedConv2D.
+type QuantizedConv2DAttr func(optionalAttr)
 
-// AllKeepDims sets the optional keep_dims attribute to value.
-//
-// value: If true, retain reduced dimensions with length 1.
-// If not specified, defaults to false
-func AllKeepDims(value bool) AllAttr {
+// QuantizedConv2DOutType sets the optional out_type attribute to value.
+// If not specified, defaults to DT_QINT32
+func QuantizedConv2DOutType(value tf.DataType) QuantizedConv2DAttr {
 	return func(m optionalAttr) {
-		m["keep_dims"] = value
+		m["out_type"] = value
 	}
 }
 
-// Computes the "logical and" of elements across dimensions of a tensor.
+// Computes a 2D convolution given quantized 4D input and filter tensors.
 //
-// Reduces `input` along the dimensions given in `reduction_indices`. Unless
-// `keep_dims` is true, the rank of the tensor is reduced by 1 for each entry in
-// `reduction_indices`. If `keep_dims` is true, the reduced dimensions are
-// retained with length 1.
+// The inputs are quantized tensors where the lowest value represents the real
+// number of the associated minimum, and the highest represents the maximum.
+// This means that you can only interpret the quantized output in the same way, by
+// taking the returned minimum and maximum values into account.
 //
 // Arguments:
-//	input: The tensor to reduce.
-//	reduction_indices: The dimensions to reduce.
 //
-// Returns The reduced tensor.
-func All(scope *Scope, input tf.Output, reduction_indices tf.Output, optional ...AllAttr) (output tf.Output) {
+//	filter: filter's input_depth dimension must match input's depth dimensions.
+//	min_input: The float value that the lowest quantized input value represents.
+//	max_input: The float value that the highest quantized input value represents.
+//	min_filter: The float value that the lowest quantized filter value represents.
+//	max_filter: The float value that the highest quantized filter value represents.
+//	strides: The stride of the sliding window for each dimension of the input
+// tensor.
+//	padding: The type of padding algorithm to use.
+//
+// Returns The float value that the lowest quantized output value represents.The float value that the highest quantized output value represents.
+func QuantizedConv2D(scope *Scope, input tf.Output, filter tf.Output, min_input tf.Output, max_input tf.Output, min_filter tf.Output, max_filter tf.Output, strides []int64, padding string, optional ...QuantizedConv2DAttr) (output tf.Output, min_output tf.Output, max_output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"strides": strides, "padding": padding}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "All",
+		Type: "QuantizedConv2D",
 		Input: []tf.Input{
-			input, reduction_indices,
+			input, filter, min_input, max_input, min_filter, max_filter,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// CTCBeamSearchDecoderAttr is an optional argument to CTCBeamSearchDecoder.
-type CTCBeamSearchDecoderAttr func(optionalAttr)
-
-// CTCBeamSearchDecoderMergeRepeated sets the optional merge_repeated attribute to value.
-//
-// value: If true, merge repeated classes in output.
-// If not specified, defaults to true
-func CTCBeamSearchDecoderMergeRepeated(value bool) CTCBeamSearchDecoderAttr {
-	return func(m optionalAttr) {
-		m["merge_repeated"] = value
-	}
+	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// Performs beam search decoding on the logits given in input.
+// Quantized Batch normalization.
 //
-// A note about the attribute merge_repeated: For the beam search decoder,
-// this means that if consecutive entries in a beam are the same, only
-// the first of these is emitted.  That is, when the top path is "A B B B B",
-// "A B" is returned if merge_repeated = True but "A B B B B" is
-// returned if merge_repeated = False.
+// This op is deprecated and will be removed in the future. Prefer
+// `tf.nn.batch_normalization`.
 //
 // Arguments:
-//	inputs: 3-D, shape: `(max_time x batch_size x num_classes)`, the logits.
-//	sequence_length: A vector containing sequence lengths, size `(batch)`.
-//	beam_width: A scalar >= 0 (beam search beam width).
-//	top_paths: A scalar >= 0, <= beam_width (controls output size).
+//	t: A 4D input Tensor.
+//	t_min: The value represented by the lowest quantized input.
+//	t_max: The value represented by the highest quantized input.
+//	m: A 1D mean Tensor with size matching the last dimension of t.
+// This is the first output from tf.nn.moments,
+// or a saved moving average thereof.
+//	m_min: The value represented by the lowest quantized mean.
+//	m_max: The value represented by the highest quantized mean.
+//	v: A 1D variance Tensor with size matching the last dimension of t.
+// This is the second output from tf.nn.moments,
+// or a saved moving average thereof.
+//	v_min: The value represented by the lowest quantized variance.
+//	v_max: The value represented by the highest quantized variance.
+//	beta: A 1D beta Tensor with size matching the last dimension of t.
+// An offset to be added to the normalized tensor.
+//	beta_min: The value represented by the lowest quantized offset.
+//	beta_max: The value represented by the highest quantized offset.
+//	gamma: A 1D gamma Tensor with size matching the last dimension of t.
+// If "scale_after_normalization" is true, this tensor will be multiplied
+// with the normalized tensor.
+//	gamma_min: The value represented by the lowest quantized gamma.
+//	gamma_max: The value represented by the highest quantized gamma.
 //
-// Returns A list (length: top_paths) of indices matrices.  Matrix j,
-// size `(total_decoded_outputs[j] x 2)`, has indices of a
-// `SparseTensor<int64, 2>`.  The rows store: [batch, time].A list (length: top_paths) of values vectors.  Vector j,
-// size `(length total_decoded_outputs[j])`, has the values of a
-// `SparseTensor<int64, 2>`.  The vector stores the decoded classes for beam j.A list (length: top_paths) of shape vector.  Vector j,
-// size `(2)`, stores the shape of the decoded `SparseTensor[j]`.
-// Its values are: `[batch_size, max_decoded_length[j]]`.A matrix, shaped: `(batch_size x top_paths)`.  The
-// sequence log-probabilities.
-func CTCBeamSearchDecoder(scope *Scope, inputs tf.Output, sequence_length tf.Output, beam_width int64, top_paths int64, optional ...CTCBeamSearchDecoderAttr) (decoded_indices []tf.Output, decoded_values []tf.Output, decoded_shape []tf.Output, log_probability tf.Output) {
+//	variance_epsilon: A small float number to avoid dividing by 0.
+//	scale_after_normalization: A bool indicating whether the resulted tensor
+// needs to be multiplied with gamma.
+func QuantizedBatchNormWithGlobalNormalization(scope *Scope, t tf.Output, t_min tf.Output, t_max tf.Output, m tf.Output, m_min tf.Output, m_max tf.Output, v tf.Output, v_min tf.Output, v_max tf.Output, beta tf.Output, beta_min tf.Output, beta_max tf.Output, gamma tf.Output, gamma_min tf.Output, gamma_max tf.Output, out_type tf.DataType, variance_epsilon float32, scale_after_normalization bool) (result tf.Output, result_min tf.Output, result_max tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"beam_width": beam_width, "top_paths": top_paths}
-	for _, a := range optional {
-		a(attrs)
-	}
+	attrs := map[string]interface{}{"out_type": out_type, "variance_epsilon": variance_epsilon, "scale_after_normalization": scale_after_normalization}
 	opspec := tf.OpSpec{
-		Type: "CTCBeamSearchDecoder",
+		Type: "QuantizedBatchNormWithGlobalNormalization",
 		Input: []tf.Input{
-			inputs, sequence_length,
+			t, t_min, t_max, m, m_min, m_max, v, v_min, v_max, beta, beta_min, beta_max, gamma, gamma_min, gamma_max,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	if scope.Err() != nil {
-		return
-	}
-	var idx int
-	var err error
-	if decoded_indices, idx, err = makeOutputList(op, idx, "decoded_indices"); err != nil {
-		scope.UpdateErr("CTCBeamSearchDecoder", err)
-		return
-	}
-	if decoded_values, idx, err = makeOutputList(op, idx, "decoded_values"); err != nil {
-		scope.UpdateErr("CTCBeamSearchDecoder", err)
-		return
-	}
-	if decoded_shape, idx, err = makeOutputList(op, idx, "decoded_shape"); err != nil {
-		scope.UpdateErr("CTCBeamSearchDecoder", err)
-		return
-	}
-	log_probability = op.Output(idx)
-	return decoded_indices, decoded_values, decoded_shape, log_probability
+	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// Computes reciprocal of square root of x element-wise.
+// Add all input tensors element wise.
 //
-// I.e., \\(y = 1 / \sqrt{x}\\).
-func Rsqrt(scope *Scope, x tf.Output) (y tf.Output) {
+// Arguments:
+//	inputs: Must all be the same size and shape.
+func AddN(scope *Scope, inputs []tf.Output) (sum tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Rsqrt",
+		Type: "AddN",
 		Input: []tf.Input{
-			x,
+			tf.OutputList(inputs),
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// RecordInputAttr is an optional argument to RecordInput.
-type RecordInputAttr func(optionalAttr)
-
-// RecordInputFileRandomSeed sets the optional file_random_seed attribute to value.
-//
-// value: Random seeds used to produce randomized records.
-// If not specified, defaults to 301
-func RecordInputFileRandomSeed(value int64) RecordInputAttr {
-	return func(m optionalAttr) {
-		m["file_random_seed"] = value
-	}
-}
-
-// RecordInputFileShuffleShiftRatio sets the optional file_shuffle_shift_ratio attribute to value.
-//
-// value: Shifts the list of files after the list is randomly
-// shuffled.
-// If not specified, defaults to 0
-func RecordInputFileShuffleShiftRatio(value float32) RecordInputAttr {
-	return func(m optionalAttr) {
-		m["file_shuffle_shift_ratio"] = value
-	}
-}
-
-// RecordInputFileBufferSize sets the optional file_buffer_size attribute to value.
-//
-// value: The randomization shuffling buffer.
-// If not specified, defaults to 10000
-func RecordInputFileBufferSize(value int64) RecordInputAttr {
-	return func(m optionalAttr) {
-		m["file_buffer_size"] = value
-	}
-}
+// MaxAttr is an optional argument to Max.
+type MaxAttr func(optionalAttr)
 
-// RecordInputFileParallelism sets the optional file_parallelism attribute to value.
+// MaxKeepDims sets the optional keep_dims attribute to value.
 //
-// value: How many sstables are opened and concurrently iterated over.
-// If not specified, defaults to 16
-func RecordInputFileParallelism(value int64) RecordInputAttr {
+// value: If true, retain reduced dimensions with length 1.
+// If not specified, defaults to false
+func MaxKeepDims(value bool) MaxAttr {
 	return func(m optionalAttr) {
-		m["file_parallelism"] = value
+		m["keep_dims"] = value
 	}
 }
 
-// RecordInputBatchSize sets the optional batch_size attribute to value.
+// Computes the maximum of elements across dimensions of a tensor.
 //
-// value: The batch size.
-// If not specified, defaults to 32
-func RecordInputBatchSize(value int64) RecordInputAttr {
-	return func(m optionalAttr) {
-		m["batch_size"] = value
-	}
-}
-
-// Emits randomized records.
+// Reduces `input` along the dimensions given in `reduction_indices`. Unless
+// `keep_dims` is true, the rank of the tensor is reduced by 1 for each entry in
+// `reduction_indices`. If `keep_dims` is true, the reduced dimensions are
+// retained with length 1.
 //
 // Arguments:
-//	file_pattern: Glob pattern for the data files.
+//	input: The tensor to reduce.
+//	reduction_indices: The dimensions to reduce.
 //
-// Returns A tensor of shape [batch_size].
-func RecordInput(scope *Scope, file_pattern string, optional ...RecordInputAttr) (records tf.Output) {
+// Returns The reduced tensor.
+func Max(scope *Scope, input tf.Output, reduction_indices tf.Output, optional ...MaxAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"file_pattern": file_pattern}
+	attrs := map[string]interface{}{}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "RecordInput",
-
+		Type: "Max",
+		Input: []tf.Input{
+			input, reduction_indices,
+		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Rounds the values of a tensor to the nearest integer, element-wise.
-//
-// Rounds half to even.  Also known as bankers rounding. If you want to round
-// according to the current system rounding mode use std::cint.
-func Round(scope *Scope, x tf.Output) (y tf.Output) {
+// Cast x of type SrcT to y of DstT.
+func Cast(scope *Scope, x tf.Output, DstT tf.DataType) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"DstT": DstT}
 	opspec := tf.OpSpec{
-		Type: "Round",
+		Type: "Cast",
 		Input: []tf.Input{
 			x,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Generates values in an interval.
-//
-// A sequence of `num` evenly-spaced values are generated beginning at `start`.
-// If `num > 1`, the values in the sequence increase by `stop - start / num - 1`,
-// so that the last one is exactly `stop`.
-//
-// For example:
-//
-// ```
-// tf.linspace(10.0, 12.0, 3, name="linspace") => [ 10.0  11.0  12.0]
-// ```
-//
-// Arguments:
-//	start: First entry in the range.
-//	stop: Last entry in the range.
-//	num: Number of values to generate.
+// Returns the truth value of x AND y element-wise.
 //
-// Returns 1-D. The generated values.
-func LinSpace(scope *Scope, start tf.Output, stop tf.Output, num tf.Output) (output tf.Output) {
+// *NOTE*: `LogicalAnd` supports broadcasting. More about broadcasting
+// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+func LogicalAnd(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "LinSpace",
+		Type: "LogicalAnd",
 		Input: []tf.Input{
-			start, stop, num,
+			x, y,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Computes natural logarithm of x element-wise.
+// ComplexAbsAttr is an optional argument to ComplexAbs.
+type ComplexAbsAttr func(optionalAttr)
+
+// ComplexAbsTout sets the optional Tout attribute to value.
+// If not specified, defaults to DT_FLOAT
+func ComplexAbsTout(value tf.DataType) ComplexAbsAttr {
+	return func(m optionalAttr) {
+		m["Tout"] = value
+	}
+}
+
+// Computes the complex absolute value of a tensor.
 //
-// I.e., \\(y = \log_e x\\).
-func Log(scope *Scope, x tf.Output) (y tf.Output) {
+// Given a tensor `x` of complex numbers, this operation returns a tensor of type
+// `float` or `double` that is the absolute value of each element in `x`. All
+// elements in `x` must be complex numbers of the form \\(a + bj\\). The absolute
+// value is computed as \\( \sqrt{a^2 + b^2}\\).
+func ComplexAbs(scope *Scope, x tf.Output, optional ...ComplexAbsAttr) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "Log",
+		Type: "ComplexAbs",
 		Input: []tf.Input{
 			x,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Computes rectified linear 6 gradients for a Relu6 operation.
+// Returns the element-wise max of two SparseTensors.
+//
+// Assumes the two SparseTensors have the same shape, i.e., no broadcasting.
 //
 // Arguments:
-//	gradients: The backpropagated gradients to the corresponding Relu6 operation.
-//	features: The features passed as input to the corresponding Relu6 operation.
+//	a_indices: 2-D.  `N x R` matrix with the indices of non-empty values in a
+// SparseTensor, in the canonical lexicographic ordering.
+//	a_values: 1-D.  `N` non-empty values corresponding to `a_indices`.
+//	a_shape: 1-D.  Shape of the input SparseTensor.
+//	b_indices: counterpart to `a_indices` for the other operand.
+//	b_values: counterpart to `a_values` for the other operand; must be of the same dtype.
+//	b_shape: counterpart to `a_shape` for the other operand; the two shapes must be equal.
 //
-// Returns The gradients:
-// `gradients * (features > 0) * (features < 6)`.
-func Relu6Grad(scope *Scope, gradients tf.Output, features tf.Output) (backprops tf.Output) {
+// Returns 2-D.  The indices of the output SparseTensor.1-D.  The values of the output SparseTensor.
+func SparseSparseMaximum(scope *Scope, a_indices tf.Output, a_values tf.Output, a_shape tf.Output, b_indices tf.Output, b_values tf.Output, b_shape tf.Output) (output_indices tf.Output, output_values tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Relu6Grad",
+		Type: "SparseSparseMaximum",
 		Input: []tf.Input{
-			gradients, features,
+			a_indices, a_values, a_shape, b_indices, b_values, b_shape,
 		},
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// ResizeBicubicAttr is an optional argument to ResizeBicubic.
-type ResizeBicubicAttr func(optionalAttr)
-
-// ResizeBicubicAlignCorners sets the optional align_corners attribute to value.
-//
-// value: If true, rescale input by (new_height - 1) / (height - 1), which
-// exactly aligns the 4 corners of images and resized images. If false, rescale
-// by new_height / height. Treat similarly the width dimension.
-// If not specified, defaults to false
-func ResizeBicubicAlignCorners(value bool) ResizeBicubicAttr {
-	return func(m optionalAttr) {
-		m["align_corners"] = value
-	}
+	return op.Output(0), op.Output(1)
 }
 
-// Resize `images` to `size` using bicubic interpolation.
-//
-// Input images can be of different types but output images are always float.
+// Computes the gradient for the inverse of `x` wrt its input.
 //
-// Arguments:
-//	images: 4-D with shape `[batch, height, width, channels]`.
-//	size: = A 1-D int32 Tensor of 2 elements: `new_height, new_width`.  The
-// new size for the images.
+// DEPRECATED at GraphDef version 17: Use ReciprocalGrad
 //
-// Returns 4-D with shape
-// `[batch, new_height, new_width, channels]`.
-func ResizeBicubic(scope *Scope, images tf.Output, size tf.Output, optional ...ResizeBicubicAttr) (resized_images tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
+// Specifically, `grad = -dy * y*y`, where `y = 1/x`, and `dy`
+// is the corresponding input gradient.
+func InvGrad(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+	if scope.Err() != nil {
+		return
 	}
 	opspec := tf.OpSpec{
-		Type: "ResizeBicubic",
+		Type: "InvGrad",
 		Input: []tf.Input{
-			images, size,
+			x, y,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Computes natural logarithm of (1 + x) element-wise.
+// Computes the reciprocal of x element-wise.
 //
-// I.e., \\(y = \log_e (1 + x)\\).
-func Log1p(scope *Scope, x tf.Output) (y tf.Output) {
+// I.e., \\(y = 1 / x\\).
+func Reciprocal(scope *Scope, x tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Log1p",
+		Type: "Reciprocal",
 		Input: []tf.Input{
 			x,
 		},
@@ -17044,28 +17697,33 @@ func Log1p(scope *Scope, x tf.Output) (y tf.Output) {
 	return op.Output(0)
 }
 
-// Computes the log of the absolute value of `Gamma(x)` element-wise.
-func Lgamma(scope *Scope, x tf.Output) (y tf.Output) {
+// Computes exponential linear: `exp(features) - 1` if < 0, `features` otherwise.
+//
+// See [Fast and Accurate Deep Network Learning by Exponential Linear Units (ELUs)
+// ](http://arxiv.org/abs/1511.07289)
+func Elu(scope *Scope, features tf.Output) (activations tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Lgamma",
+		Type: "Elu",
 		Input: []tf.Input{
-			x,
+			features,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Returns element-wise largest integer not greater than x.
-func Floor(scope *Scope, x tf.Output) (y tf.Output) {
+// Computes square of x element-wise.
+//
+// I.e., \\(y = x * x = x^2\\).
+func Square(scope *Scope, x tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Floor",
+		Type: "Square",
 		Input: []tf.Input{
 			x,
 		},
@@ -17074,28 +17732,36 @@ func Floor(scope *Scope, x tf.Output) (y tf.Output) {
 	return op.Output(0)
 }
 
-// Computes the Gauss error function of `x` element-wise.
-func Erf(scope *Scope, x tf.Output) (y tf.Output) {
+// Returns element-wise remainder of division. When `x < 0` xor `y < 0` is
+//
+// true, this follows Python semantics in that the result here is consistent
+// with a flooring divide. E.g. `floor(x / y) * y + mod(x, y) = x`.
+//
+// *NOTE*: `FloorMod` supports broadcasting. More about broadcasting
+// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+func FloorMod(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Erf",
+		Type: "FloorMod",
 		Input: []tf.Input{
-			x,
+			x, y,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Computes the complementary error function of `x` element-wise.
-func Erfc(scope *Scope, x tf.Output) (y tf.Output) {
+// Computes square root of x element-wise.
+//
+// I.e., \\(y = \sqrt{x} = x^{1/2}\\).
+func Sqrt(scope *Scope, x tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Erfc",
+		Type: "Sqrt",
 		Input: []tf.Input{
 			x,
 		},
@@ -17104,224 +17770,280 @@ func Erfc(scope *Scope, x tf.Output) (y tf.Output) {
 	return op.Output(0)
 }
 
-// 2D real-valued fast Fourier transform.
+// Inserts a dimension of 1 into a tensor's shape.
 //
-// Computes the 2-dimensional discrete Fourier transform of a real-valued signal
-// over the inner-most 2 dimensions of `input`.
+// Given a tensor `input`, this operation inserts a dimension of 1 at the
+// dimension index `dim` of `input`'s shape. The dimension index `dim` starts at
+// zero; if you specify a negative number for `dim` it is counted backward from
+// the end.
 //
-// Since the DFT of a real signal is Hermitian-symmetric, `RFFT2D` only returns the
-// `fft_length / 2 + 1` unique components of the FFT for the inner-most dimension
-// of `output`: the zero-frequency term, followed by the `fft_length / 2`
-// positive-frequency terms.
+// This operation is useful if you want to add a batch dimension to a single
+// element. For example, if you have a single image of shape `[height, width,
+// channels]`, you can make it a batch of 1 image with `expand_dims(image, 0)`,
+// which will make the shape `[1, height, width, channels]`.
+//
+// Other examples:
+//
+// ```
+// # 't' is a tensor of shape [2]
+// shape(expand_dims(t, 0)) ==> [1, 2]
+// shape(expand_dims(t, 1)) ==> [2, 1]
+// shape(expand_dims(t, -1)) ==> [2, 1]
+//
+// # 't2' is a tensor of shape [2, 3, 5]
+// shape(expand_dims(t2, 0)) ==> [1, 2, 3, 5]
+// shape(expand_dims(t2, 2)) ==> [2, 3, 1, 5]
+// shape(expand_dims(t2, 3)) ==> [2, 3, 5, 1]
+// ```
+//
+// This operation requires that:
+//
+// `-1-input.dims() <= dim <= input.dims()`
+//
+// This operation is related to `squeeze()`, which removes dimensions of
+// size 1.
 //
 // Arguments:
-//	input: A float32 tensor.
-//	fft_length: An int32 tensor of shape [2]. The FFT length for each dimension.
 //
-// Returns A complex64 tensor of the same rank as `input`. The inner-most 2
-//   dimensions of `input` are replaced with their 2D Fourier transform. The
-//   inner-most dimension contains `fft_length / 2 + 1` unique frequency
-//   components.
+//	dim: 0-D (scalar). Specifies the dimension index at which to
+// expand the shape of `input`.
 //
-// @compatibility(numpy)
-// Equivalent to np.fft.rfft2
-// @end_compatibility
-func RFFT2D(scope *Scope, input tf.Output, fft_length tf.Output) (output tf.Output) {
+// Returns Contains the same data as `input`, but its shape has an additional
+// dimension of size 1 added.
+func ExpandDims(scope *Scope, input tf.Output, dim tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "RFFT2D",
+		Type: "ExpandDims",
 		Input: []tf.Input{
-			input, fft_length,
+			input, dim,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Gradients for batch normalization.
+// AllAttr is an optional argument to All.
+type AllAttr func(optionalAttr)
+
+// AllKeepDims sets the optional keep_dims attribute to value.
 //
-// DEPRECATED at GraphDef version 9: Use tf.nn.batch_normalization()
+// value: If true, retain reduced dimensions with length 1.
+// If not specified, defaults to false
+func AllKeepDims(value bool) AllAttr {
+	return func(m optionalAttr) {
+		m["keep_dims"] = value
+	}
+}
+
+// Computes the "logical and" of elements across dimensions of a tensor.
 //
-// This op is deprecated. See `tf.nn.batch_normalization`.
+// Reduces `input` along the dimensions given in `reduction_indices`. Unless
+// `keep_dims` is true, the rank of the tensor is reduced by 1 for each entry in
+// `reduction_indices`. If `keep_dims` is true, the reduced dimensions are
+// retained with length 1.
 //
 // Arguments:
-//	t: A 4D input Tensor.
-//	m: A 1D mean Tensor with size matching the last dimension of t.
-// This is the first output from tf.nn.moments,
-// or a saved moving average thereof.
-//	v: A 1D variance Tensor with size matching the last dimension of t.
-// This is the second output from tf.nn.moments,
-// or a saved moving average thereof.
-//	gamma: A 1D gamma Tensor with size matching the last dimension of t.
-// If "scale_after_normalization" is true, this Tensor will be multiplied
-// with the normalized Tensor.
-//	backprop: 4D backprop Tensor.
-//	variance_epsilon: A small float number to avoid dividing by 0.
-//	scale_after_normalization: A bool indicating whether the resulted tensor
-// needs to be multiplied with gamma.
+//	input: The tensor to reduce.
+//	reduction_indices: The dimensions to reduce.
 //
-// Returns 4D backprop tensor for input.1D backprop tensor for mean.1D backprop tensor for variance.1D backprop tensor for beta.1D backprop tensor for gamma.
-func BatchNormWithGlobalNormalizationGrad(scope *Scope, t tf.Output, m tf.Output, v tf.Output, gamma tf.Output, backprop tf.Output, variance_epsilon float32, scale_after_normalization bool) (dx tf.Output, dm tf.Output, dv tf.Output, db tf.Output, dg tf.Output) {
+// Returns The reduced tensor.
+func All(scope *Scope, input tf.Output, reduction_indices tf.Output, optional ...AllAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"variance_epsilon": variance_epsilon, "scale_after_normalization": scale_after_normalization}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "BatchNormWithGlobalNormalizationGrad",
+		Type: "All",
 		Input: []tf.Input{
-			t, m, v, gamma, backprop,
+			input, reduction_indices,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2), op.Output(3), op.Output(4)
+	return op.Output(0)
 }
 
-// EncodeJpegAttr is an optional argument to EncodeJpeg.
-type EncodeJpegAttr func(optionalAttr)
+// CTCBeamSearchDecoderAttr is an optional argument to CTCBeamSearchDecoder.
+type CTCBeamSearchDecoderAttr func(optionalAttr)
 
-// EncodeJpegFormat sets the optional format attribute to value.
+// CTCBeamSearchDecoderMergeRepeated sets the optional merge_repeated attribute to value.
 //
-// value: Per pixel image format.
-// If not specified, defaults to ""
-func EncodeJpegFormat(value string) EncodeJpegAttr {
+// value: If true, merge repeated classes in output.
+// If not specified, defaults to true
+func CTCBeamSearchDecoderMergeRepeated(value bool) CTCBeamSearchDecoderAttr {
 	return func(m optionalAttr) {
-		m["format"] = value
+		m["merge_repeated"] = value
 	}
 }
 
-// EncodeJpegQuality sets the optional quality attribute to value.
+// Performs beam search decoding on the logits given in input.
 //
-// value: Quality of the compression from 0 to 100 (higher is better and slower).
-// If not specified, defaults to 95
-func EncodeJpegQuality(value int64) EncodeJpegAttr {
-	return func(m optionalAttr) {
-		m["quality"] = value
+// A note about the attribute merge_repeated: For the beam search decoder,
+// this means that if consecutive entries in a beam are the same, only
+// the first of these is emitted.  That is, when the top path is "A B B B B",
+// "A B" is returned if merge_repeated = True but "A B B B B" is
+// returned if merge_repeated = False.
+//
+// Arguments:
+//	inputs: 3-D, shape: `(max_time x batch_size x num_classes)`, the logits.
+//	sequence_length: A vector containing sequence lengths, size `(batch)`.
+//	beam_width: A scalar >= 0 (beam search beam width).
+//	top_paths: A scalar >= 0, <= beam_width (controls output size).
+//
+// Returns A list (length: top_paths) of indices matrices.  Matrix j,
+// size `(total_decoded_outputs[j] x 2)`, has indices of a
+// `SparseTensor<int64, 2>`.  The rows store: [batch, time].A list (length: top_paths) of values vectors.  Vector j,
+// size `(length total_decoded_outputs[j])`, has the values of a
+// `SparseTensor<int64, 2>`.  The vector stores the decoded classes for beam j.A list (length: top_paths) of shape vector.  Vector j,
+// size `(2)`, stores the shape of the decoded `SparseTensor[j]`.
+// Its values are: `[batch_size, max_decoded_length[j]]`.A matrix, shaped: `(batch_size x top_paths)`.  The
+// sequence log-probabilities.
+func CTCBeamSearchDecoder(scope *Scope, inputs tf.Output, sequence_length tf.Output, beam_width int64, top_paths int64, optional ...CTCBeamSearchDecoderAttr) (decoded_indices []tf.Output, decoded_values []tf.Output, decoded_shape []tf.Output, log_probability tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"beam_width": beam_width, "top_paths": top_paths}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "CTCBeamSearchDecoder",
+		Input: []tf.Input{
+			inputs, sequence_length,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	if scope.Err() != nil {
+		return
+	}
+	var idx int
+	var err error
+	if decoded_indices, idx, err = makeOutputList(op, idx, "decoded_indices"); err != nil {
+		scope.UpdateErr("CTCBeamSearchDecoder", err)
+		return
+	}
+	if decoded_values, idx, err = makeOutputList(op, idx, "decoded_values"); err != nil {
+		scope.UpdateErr("CTCBeamSearchDecoder", err)
+		return
+	}
+	if decoded_shape, idx, err = makeOutputList(op, idx, "decoded_shape"); err != nil {
+		scope.UpdateErr("CTCBeamSearchDecoder", err)
+		return
 	}
+	log_probability = op.Output(idx)
+	return decoded_indices, decoded_values, decoded_shape, log_probability
 }
 
-// EncodeJpegProgressive sets the optional progressive attribute to value.
+// Computes reciprocal of square root of x element-wise.
 //
-// value: If True, create a JPEG that loads progressively (coarse to fine).
-// If not specified, defaults to false
-func EncodeJpegProgressive(value bool) EncodeJpegAttr {
-	return func(m optionalAttr) {
-		m["progressive"] = value
+// I.e., \\(y = 1 / \sqrt{x}\\).
+func Rsqrt(scope *Scope, x tf.Output) (y tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "Rsqrt",
+		Input: []tf.Input{
+			x,
+		},
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// EncodeJpegOptimizeSize sets the optional optimize_size attribute to value.
-//
-// value: If True, spend CPU/RAM to reduce size with no quality change.
-// If not specified, defaults to false
-func EncodeJpegOptimizeSize(value bool) EncodeJpegAttr {
-	return func(m optionalAttr) {
-		m["optimize_size"] = value
-	}
-}
+// RecordInputAttr is an optional argument to RecordInput.
+type RecordInputAttr func(optionalAttr)
 
-// EncodeJpegChromaDownsampling sets the optional chroma_downsampling attribute to value.
+// RecordInputFileRandomSeed sets the optional file_random_seed attribute to value.
 //
-// value: See http://en.wikipedia.org/wiki/Chroma_subsampling.
-// If not specified, defaults to true
-func EncodeJpegChromaDownsampling(value bool) EncodeJpegAttr {
+// value: Random seeds used to produce randomized records.
+// If not specified, defaults to 301
+func RecordInputFileRandomSeed(value int64) RecordInputAttr {
 	return func(m optionalAttr) {
-		m["chroma_downsampling"] = value
+		m["file_random_seed"] = value
 	}
 }
 
-// EncodeJpegDensityUnit sets the optional density_unit attribute to value.
+// RecordInputFileShuffleShiftRatio sets the optional file_shuffle_shift_ratio attribute to value.
 //
-// value: Unit used to specify `x_density` and `y_density`:
-// pixels per inch (`'in'`) or centimeter (`'cm'`).
-// If not specified, defaults to "in"
-func EncodeJpegDensityUnit(value string) EncodeJpegAttr {
+// value: Shifts the list of files after the list is randomly
+// shuffled.
+// If not specified, defaults to 0
+func RecordInputFileShuffleShiftRatio(value float32) RecordInputAttr {
 	return func(m optionalAttr) {
-		m["density_unit"] = value
+		m["file_shuffle_shift_ratio"] = value
 	}
 }
 
-// EncodeJpegXDensity sets the optional x_density attribute to value.
+// RecordInputFileBufferSize sets the optional file_buffer_size attribute to value.
 //
-// value: Horizontal pixels per density unit.
-// If not specified, defaults to 300
-func EncodeJpegXDensity(value int64) EncodeJpegAttr {
+// value: The randomization shuffling buffer.
+// If not specified, defaults to 10000
+func RecordInputFileBufferSize(value int64) RecordInputAttr {
 	return func(m optionalAttr) {
-		m["x_density"] = value
+		m["file_buffer_size"] = value
 	}
 }
 
-// EncodeJpegYDensity sets the optional y_density attribute to value.
+// RecordInputFileParallelism sets the optional file_parallelism attribute to value.
 //
-// value: Vertical pixels per density unit.
-// If not specified, defaults to 300
-func EncodeJpegYDensity(value int64) EncodeJpegAttr {
+// value: How many sstables are opened and concurrently iterated over.
+// If not specified, defaults to 16
+func RecordInputFileParallelism(value int64) RecordInputAttr {
 	return func(m optionalAttr) {
-		m["y_density"] = value
+		m["file_parallelism"] = value
 	}
 }
 
-// EncodeJpegXmpMetadata sets the optional xmp_metadata attribute to value.
+// RecordInputBatchSize sets the optional batch_size attribute to value.
 //
-// value: If not empty, embed this XMP metadata in the image header.
-// If not specified, defaults to ""
-func EncodeJpegXmpMetadata(value string) EncodeJpegAttr {
+// value: The batch size.
+// If not specified, defaults to 32
+func RecordInputBatchSize(value int64) RecordInputAttr {
 	return func(m optionalAttr) {
-		m["xmp_metadata"] = value
+		m["batch_size"] = value
 	}
 }
 
-// JPEG-encode an image.
-//
-// `image` is a 3-D uint8 Tensor of shape `[height, width, channels]`.
-//
-// The attr `format` can be used to override the color format of the encoded
-// output.  Values can be:
-//
-// *   `''`: Use a default format based on the number of channels in the image.
-// *   `grayscale`: Output a grayscale JPEG image.  The `channels` dimension
-//     of `image` must be 1.
-// *   `rgb`: Output an RGB JPEG image. The `channels` dimension
-//     of `image` must be 3.
-//
-// If `format` is not specified or is the empty string, a default format is picked
-// in function of the number of channels in `image`:
-//
-// *   1: Output a grayscale image.
-// *   3: Output an RGB image.
+// Emits randomized records.
 //
 // Arguments:
-//	image: 3-D with shape `[height, width, channels]`.
+//	file_pattern: Glob pattern for the data files.
 //
-// Returns 0-D. JPEG-encoded image.
-func EncodeJpeg(scope *Scope, image tf.Output, optional ...EncodeJpegAttr) (contents tf.Output) {
+// Returns A tensor of shape [batch_size].
+func RecordInput(scope *Scope, file_pattern string, optional ...RecordInputAttr) (records tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"file_pattern": file_pattern}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "EncodeJpeg",
-		Input: []tf.Input{
-			image,
-		},
+		Type: "RecordInput",
+
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Computes sin of x element-wise.
-func Sin(scope *Scope, x tf.Output) (y tf.Output) {
+// Rounds the values of a tensor to the nearest integer, element-wise.
+//
+// Rounds half to even.  Also known as bankers rounding. If you want to round
+// according to the current system rounding mode use std::cint.
+func Round(scope *Scope, x tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Sin",
+		Type: "Round",
 		Input: []tf.Input{
 			x,
 		},
@@ -17330,212 +18052,82 @@ func Sin(scope *Scope, x tf.Output) (y tf.Output) {
 	return op.Output(0)
 }
 
-// Computes the determinant of one ore more square matrices.
+// Generates values in an interval.
 //
-// The input is a tensor of shape `[..., M, M]` whose inner-most 2 dimensions
-// form square matrices. The output is a tensor containing the determinants
-// for all input submatrices `[..., :, :]`.
+// A sequence of `num` evenly-spaced values are generated beginning at `start`.
+// If `num > 1`, the values in the sequence increase by `stop - start / num - 1`,
+// so that the last one is exactly `stop`.
+//
+// For example:
+//
+// ```
+// tf.linspace(10.0, 12.0, 3, name="linspace") => [ 10.0  11.0  12.0]
+// ```
 //
 // Arguments:
-//	input: Shape is `[..., M, M]`.
+//	start: First entry in the range.
+//	stop: Last entry in the range.
+//	num: Number of values to generate.
 //
-// Returns Shape is `[...]`.
-func MatrixDeterminant(scope *Scope, input tf.Output) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "MatrixDeterminant",
-		Input: []tf.Input{
-			input,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Computes cos of x element-wise.
-func Cos(scope *Scope, x tf.Output) (y tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "Cos",
-		Input: []tf.Input{
-			x,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Computes tan of x element-wise.
-func Tan(scope *Scope, x tf.Output) (y tf.Output) {
+// Returns 1-D. The generated values.
+func LinSpace(scope *Scope, start tf.Output, stop tf.Output, num tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Tan",
+		Type: "LinSpace",
 		Input: []tf.Input{
-			x,
+			start, stop, num,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// BatchToSpace for 4-D tensors of type T.
-//
-// This is a legacy version of the more general BatchToSpaceND.
-//
-// Rearranges (permutes) data from batch into blocks of spatial data, followed by
-// cropping. This is the reverse transformation of SpaceToBatch. More specifically,
-// this op outputs a copy of the input tensor where values from the `batch`
-// dimension are moved in spatial blocks to the `height` and `width` dimensions,
-// followed by cropping along the `height` and `width` dimensions.
-//
-// Arguments:
-//	input: 4-D tensor with shape
-// `[batch*block_size*block_size, height_pad/block_size, width_pad/block_size,
-//   depth]`. Note that the batch size of the input tensor must be divisible by
-// `block_size * block_size`.
-//	crops: 2-D tensor of non-negative integers with shape `[2, 2]`. It specifies
-// how many elements to crop from the intermediate result across the spatial
-// dimensions as follows:
-//
-//     crops = [[crop_top, crop_bottom], [crop_left, crop_right]]
-//
-//
-// Returns 4-D with shape `[batch, height, width, depth]`, where:
-//
-//       height = height_pad - crop_top - crop_bottom
-//       width = width_pad - crop_left - crop_right
-//
-// The attr `block_size` must be greater than one. It indicates the block size.
-//
-// Some examples:
-//
-// (1) For the following input of shape `[4, 1, 1, 1]` and block_size of 2:
-//
-// ```prettyprint
-// [[[[1]]], [[[2]]], [[[3]]], [[[4]]]]
-// ```
-//
-// The output tensor has shape `[1, 2, 2, 1]` and value:
-//
-// ```prettyprint
-// x = [[[[1], [2]], [[3], [4]]]]
-// ```
-//
-// (2) For the following input of shape `[4, 1, 1, 3]` and block_size of 2:
-//
-// ```prettyprint
-// [[[1, 2, 3]], [[4, 5, 6]], [[7, 8, 9]], [[10, 11, 12]]]
-// ```
-//
-// The output tensor has shape `[1, 2, 2, 3]` and value:
-//
-// ```prettyprint
-// x = [[[[1, 2, 3], [4, 5, 6]],
-//       [[7, 8, 9], [10, 11, 12]]]]
-// ```
-//
-// (3) For the following input of shape `[4, 2, 2, 1]` and block_size of 2:
-//
-// ```prettyprint
-// x = [[[[1], [3]], [[9], [11]]],
-//      [[[2], [4]], [[10], [12]]],
-//      [[[5], [7]], [[13], [15]]],
-//      [[[6], [8]], [[14], [16]]]]
-// ```
-//
-// The output tensor has shape `[1, 4, 4, 1]` and value:
-//
-// ```prettyprint
-// x = [[[1],   [2],  [3],  [4]],
-//      [[5],   [6],  [7],  [8]],
-//      [[9],  [10], [11],  [12]],
-//      [[13], [14], [15],  [16]]]
-// ```
-//
-// (4) For the following input of shape `[8, 1, 2, 1]` and block_size of 2:
-//
-// ```prettyprint
-// x = [[[[1], [3]]], [[[9], [11]]], [[[2], [4]]], [[[10], [12]]],
-//      [[[5], [7]]], [[[13], [15]]], [[[6], [8]]], [[[14], [16]]]]
-// ```
-//
-// The output tensor has shape `[2, 2, 4, 1]` and value:
+// Computes natural logarithm of x element-wise.
 //
-// ```prettyprint
-// x = [[[[1], [3]], [[5], [7]]],
-//      [[[2], [4]], [[10], [12]]],
-//      [[[5], [7]], [[13], [15]]],
-//      [[[6], [8]], [[14], [16]]]]
-// ```
-func BatchToSpace(scope *Scope, input tf.Output, crops tf.Output, block_size int64) (output tf.Output) {
+// I.e., \\(y = \log_e x\\).
+func Log(scope *Scope, x tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"block_size": block_size}
 	opspec := tf.OpSpec{
-		Type: "BatchToSpace",
+		Type: "Log",
 		Input: []tf.Input{
-			input, crops,
+			x,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// SparseToDenseAttr is an optional argument to SparseToDense.
-type SparseToDenseAttr func(optionalAttr)
+// ResizeBicubicAttr is an optional argument to ResizeBicubic.
+type ResizeBicubicAttr func(optionalAttr)
 
-// SparseToDenseValidateIndices sets the optional validate_indices attribute to value.
+// ResizeBicubicAlignCorners sets the optional align_corners attribute to value.
 //
-// value: If true, indices are checked to make sure they are sorted in
-// lexicographic order and that there are no repeats.
-// If not specified, defaults to true
-func SparseToDenseValidateIndices(value bool) SparseToDenseAttr {
+// value: If true, rescale input by (new_height - 1) / (height - 1), which
+// exactly aligns the 4 corners of images and resized images. If false, rescale
+// by new_height / height. Treat similarly the width dimension.
+// If not specified, defaults to false
+func ResizeBicubicAlignCorners(value bool) ResizeBicubicAttr {
 	return func(m optionalAttr) {
-		m["validate_indices"] = value
+		m["align_corners"] = value
 	}
 }
 
-// Converts a sparse representation into a dense tensor.
-//
-// Builds an array `dense` with shape `output_shape` such that
-//
-// ```prettyprint
-// # If sparse_indices is scalar
-// dense[i] = (i == sparse_indices ? sparse_values : default_value)
-//
-// # If sparse_indices is a vector, then for each i
-// dense[sparse_indices[i]] = sparse_values[i]
-//
-// # If sparse_indices is an n by d matrix, then for each i in [0, n)
-// dense[sparse_indices[i][0], ..., sparse_indices[i][d-1]] = sparse_values[i]
-// ```
-//
-// All other values in `dense` are set to `default_value`.  If `sparse_values` is a
-// scalar, all sparse indices are set to this single value.
+// Resize `images` to `size` using bicubic interpolation.
 //
-// Indices should be sorted in lexicographic order, and indices must not
-// contain any repeats. If `validate_indices` is true, these properties
-// are checked during execution.
+// Input images can be of different types but output images are always float.
 //
 // Arguments:
-//	sparse_indices: 0-D, 1-D, or 2-D.  `sparse_indices[i]` contains the complete
-// index where `sparse_values[i]` will be placed.
-//	output_shape: 1-D.  Shape of the dense output tensor.
-//	sparse_values: 1-D.  Values corresponding to each row of `sparse_indices`,
-// or a scalar value to be used for all sparse indices.
-//	default_value: Scalar value to set for indices not specified in
-// `sparse_indices`.
+//	images: 4-D with shape `[batch, height, width, channels]`.
+//	size: = A 1-D int32 Tensor of 2 elements: `new_height, new_width`.  The
+// new size for the images.
 //
-// Returns Dense output tensor of shape `output_shape`.
-func SparseToDense(scope *Scope, sparse_indices tf.Output, output_shape tf.Output, sparse_values tf.Output, default_value tf.Output, optional ...SparseToDenseAttr) (dense tf.Output) {
+// Returns 4-D with shape
+// `[batch, new_height, new_width, channels]`.
+func ResizeBicubic(scope *Scope, images tf.Output, size tf.Output, optional ...ResizeBicubicAttr) (resized_images tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -17544,9 +18136,9 @@ func SparseToDense(scope *Scope, sparse_indices tf.Output, output_shape tf.Outpu
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "SparseToDense",
+		Type: "ResizeBicubic",
 		Input: []tf.Input{
-			sparse_indices, output_shape, sparse_values, default_value,
+			images, size,
 		},
 		Attrs: attrs,
 	}
@@ -17554,146 +18146,102 @@ func SparseToDense(scope *Scope, sparse_indices tf.Output, output_shape tf.Outpu
 	return op.Output(0)
 }
 
-// Computes asin of x element-wise.
-func Asin(scope *Scope, x tf.Output) (y tf.Output) {
+// Computes rectified linear 6 gradients for a Relu6 operation.
+//
+// Arguments:
+//	gradients: The backpropagated gradients to the corresponding Relu6 operation.
+//	features: The features passed as input to the corresponding Relu6 operation.
+//
+// Returns The gradients:
+// `gradients * (features > 0) * (features < 6)`.
+func Relu6Grad(scope *Scope, gradients tf.Output, features tf.Output) (backprops tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Asin",
+		Type: "Relu6Grad",
 		Input: []tf.Input{
-			x,
+			gradients, features,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Convert the quantized 'input' tensor into a lower-precision 'output', using the
-//
-// output range specified with 'requested_output_min' and 'requested_output_max'.
-//
-// [input_min, input_max] are scalar floats that specify the range for the float
-// interpretation of the 'input' data. For example, if input_min is -1.0f and
-// input_max is 1.0f, and we are dealing with quint16 quantized data, then a 0
-// value in the 16-bit data should be interpreted as -1.0f, and a 65535 means 1.0f.
-//
-// Arguments:
-//
-//	input_min: The float value that the minimum quantized input value represents.
-//	input_max: The float value that the maximum quantized input value represents.
-//	requested_output_min: The float value that the minimum quantized output value represents.
-//	requested_output_max: The float value that the maximum quantized output value represents.
-//	out_type: The type of the output. Should be a lower bit depth than Tinput.
+// Computes natural logarithm of (1 + x) element-wise.
 //
-// Returns The requested_output_min value is copied into this output.The requested_output_max value is copied into this output.
-func Requantize(scope *Scope, input tf.Output, input_min tf.Output, input_max tf.Output, requested_output_min tf.Output, requested_output_max tf.Output, out_type tf.DataType) (output tf.Output, output_min tf.Output, output_max tf.Output) {
+// I.e., \\(y = \log_e (1 + x)\\).
+func Log1p(scope *Scope, x tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"out_type": out_type}
 	opspec := tf.OpSpec{
-		Type: "Requantize",
+		Type: "Log1p",
 		Input: []tf.Input{
-			input, input_min, input_max, requested_output_min, requested_output_max,
+			x,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
+	return op.Output(0)
 }
 
-// Returns the index with the smallest value across dimensions of a tensor.
-//
-// Arguments:
-//
-//	dimension: int32, 0 <= dimension < rank(input).  Describes which dimension
-// of the input Tensor to reduce across. For vectors, use dimension = 0.
-func ArgMin(scope *Scope, input tf.Output, dimension tf.Output) (output tf.Output) {
+// Computes the log of the absolute value of `Gamma(x)` element-wise.
+func Lgamma(scope *Scope, x tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "ArgMin",
+		Type: "Lgamma",
 		Input: []tf.Input{
-			input, dimension,
+			x,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Computes atan of x element-wise.
-func Atan(scope *Scope, x tf.Output) (y tf.Output) {
+// Returns x / y element-wise for real types.
+//
+// If `x` and `y` are reals, this will return the floating-point division.
+//
+// *NOTE*: `Div` supports broadcasting. More about broadcasting
+// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+func RealDiv(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Atan",
+		Type: "RealDiv",
 		Input: []tf.Input{
-			x,
+			x, y,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// ResourceSparseApplyAdadeltaAttr is an optional argument to ResourceSparseApplyAdadelta.
-type ResourceSparseApplyAdadeltaAttr func(optionalAttr)
-
-// ResourceSparseApplyAdadeltaUseLocking sets the optional use_locking attribute to value.
-//
-// value: If True, updating of the var and accum tensors will be protected by
-// a lock; otherwise the behavior is undefined, but may exhibit less contention.
-// If not specified, defaults to false
-func ResourceSparseApplyAdadeltaUseLocking(value bool) ResourceSparseApplyAdadeltaAttr {
-	return func(m optionalAttr) {
-		m["use_locking"] = value
-	}
-}
-
-// var: Should be from a Variable().
-//
-// Arguments:
-//
-//	accum: Should be from a Variable().
-//	accum_update: : Should be from a Variable().
-//	lr: Learning rate. Must be a scalar.
-//	rho: Decay factor. Must be a scalar.
-//	epsilon: Constant factor. Must be a scalar.
-//	grad: The gradient.
-//	indices: A vector of indices into the first dimension of var and accum.
-//
-// Returns the created operation.
-func ResourceSparseApplyAdadelta(scope *Scope, var_ tf.Output, accum tf.Output, accum_update tf.Output, lr tf.Output, rho tf.Output, epsilon tf.Output, grad tf.Output, indices tf.Output, optional ...ResourceSparseApplyAdadeltaAttr) (o *tf.Operation) {
+// Returns element-wise largest integer not greater than x.
+func Floor(scope *Scope, x tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "ResourceSparseApplyAdadelta",
+		Type: "Floor",
 		Input: []tf.Input{
-			var_, accum, accum_update, lr, rho, epsilon, grad, indices,
+			x,
 		},
-		Attrs: attrs,
 	}
-	return scope.AddOperation(opspec)
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// Returns which elements of x are NaN.
-//
-// @compatibility(numpy)
-// Equivalent to np.isnan
-// @end_compatibility
-func IsNan(scope *Scope, x tf.Output) (y tf.Output) {
+// Computes the Gauss error function of `x` element-wise.
+func Erf(scope *Scope, x tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "IsNan",
+		Type: "Erf",
 		Input: []tf.Input{
 			x,
 		},
@@ -17702,17 +18250,13 @@ func IsNan(scope *Scope, x tf.Output) (y tf.Output) {
 	return op.Output(0)
 }
 
-// Returns an element-wise indication of the sign of a number.
-//
-// `y = sign(x) = -1` if `x < 0`; 0 if `x == 0`; 1 if `x > 0`.
-//
-// For complex numbers, `y = sign(x) = x / |x|` if `x != 0`, otherwise `y = 0`.
-func Sign(scope *Scope, x tf.Output) (y tf.Output) {
+// Computes the complementary error function of `x` element-wise.
+func Erfc(scope *Scope, x tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Sign",
+		Type: "Erfc",
 		Input: []tf.Input{
 			x,
 		},
@@ -17721,28 +18265,49 @@ func Sign(scope *Scope, x tf.Output) (y tf.Output) {
 	return op.Output(0)
 }
 
-// Returns element-wise smallest integer in not less than x.
-func Ceil(scope *Scope, x tf.Output) (y tf.Output) {
+// 2D real-valued fast Fourier transform.
+//
+// Computes the 2-dimensional discrete Fourier transform of a real-valued signal
+// over the inner-most 2 dimensions of `input`.
+//
+// Since the DFT of a real signal is Hermitian-symmetric, `RFFT2D` only returns the
+// `fft_length / 2 + 1` unique components of the FFT for the inner-most dimension
+// of `output`: the zero-frequency term, followed by the `fft_length / 2`
+// positive-frequency terms.
+//
+// Arguments:
+//	input: A float32 tensor.
+//	fft_length: An int32 tensor of shape [2]. The FFT length for each dimension.
+//
+// Returns A complex64 tensor of the same rank as `input`. The inner-most 2
+//   dimensions of `input` are replaced with their 2D Fourier transform. The
+//   inner-most dimension contains `fft_length / 2 + 1` unique frequency
+//   components.
+//
+// @compatibility(numpy)
+// Equivalent to np.fft.rfft2
+// @end_compatibility
+func RFFT2D(scope *Scope, input tf.Output, fft_length tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Ceil",
+		Type: "RFFT2D",
 		Input: []tf.Input{
-			x,
+			input, fft_length,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Computes exponential of x element-wise.  \\(y = e^x\\).
-func Exp(scope *Scope, x tf.Output) (y tf.Output) {
+// Computes sin of x element-wise.
+func Sin(scope *Scope, x tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Exp",
+		Type: "Sin",
 		Input: []tf.Input{
 			x,
 		},
@@ -17751,106 +18316,158 @@ func Exp(scope *Scope, x tf.Output) (y tf.Output) {
 	return op.Output(0)
 }
 
-// Computes the Max along segments of a tensor.
-//
-// Read @{$math_ops#segmentation$the section on segmentation} for an explanation of
-// segments.
-//
-// This operator is similar to the [unsorted segment sum operator](../../../api_docs/python/math_ops.md#UnsortedSegmentSum).
-// Instead of computing the sum over segments, it computes the maximum
-// such that:
-//
-// \\(output_i = \max_j data_j\\) where max is over `j` such
-// that `segment_ids[j] == i`.
-//
-// If the maximum is empty for a given segment ID `i`, it outputs the smallest possible value for specific numeric type,
-//  `output[i] = numeric_limits<T>::min()`.
-//
-// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
-// <img style="width:100%" src="https://www.tensorflow.org/images/UnsortedSegmentSum.png" alt>
-// </div>
-//
-// Arguments:
+// Computes the determinant of one ore more square matrices.
 //
-//	segment_ids: A 1-D tensor whose rank is equal to the rank of `data`'s
-// first dimension.
+// The input is a tensor of shape `[..., M, M]` whose inner-most 2 dimensions
+// form square matrices. The output is a tensor containing the determinants
+// for all input submatrices `[..., :, :]`.
 //
+// Arguments:
+//	input: Shape is `[..., M, M]`.
 //
-// Returns Has same shape as data, except for dimension 0 which
-// has size `num_segments`.
-func UnsortedSegmentMax(scope *Scope, data tf.Output, segment_ids tf.Output, num_segments tf.Output) (output tf.Output) {
+// Returns Shape is `[...]`.
+func MatrixDeterminant(scope *Scope, input tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "UnsortedSegmentMax",
+		Type: "MatrixDeterminant",
 		Input: []tf.Input{
-			data, segment_ids, num_segments,
+			input,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Returns x + y element-wise.
-//
-// *NOTE*: `Add` supports broadcasting. `AddN` does not. More about broadcasting
-// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-func Add(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+// Computes cos of x element-wise.
+func Cos(scope *Scope, x tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Add",
+		Type: "Cos",
 		Input: []tf.Input{
-			x, y,
+			x,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// BiasAddAttr is an optional argument to BiasAdd.
-type BiasAddAttr func(optionalAttr)
-
-// BiasAddDataFormat sets the optional data_format attribute to value.
-//
-// value: Specify the data format of the input and output data. With the
-// default format "NHWC", the bias tensor will be added to the last dimension
-// of the value tensor.
-// Alternatively, the format could be "NCHW", the data storage order of:
-//     [batch, in_channels, in_height, in_width].
-// The tensor will be added to "in_channels", the third-to-the-last
-//     dimension.
-// If not specified, defaults to "NHWC"
-func BiasAddDataFormat(value string) BiasAddAttr {
-	return func(m optionalAttr) {
-		m["data_format"] = value
+// Computes tan of x element-wise.
+func Tan(scope *Scope, x tf.Output) (y tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "Tan",
+		Input: []tf.Input{
+			x,
+		},
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// Adds `bias` to `value`.
+// BatchToSpace for 4-D tensors of type T.
 //
-// This is a special case of `tf.add` where `bias` is restricted to be 1-D.
-// Broadcasting is supported, so `value` may have any number of dimensions.
+// This is a legacy version of the more general BatchToSpaceND.
+//
+// Rearranges (permutes) data from batch into blocks of spatial data, followed by
+// cropping. This is the reverse transformation of SpaceToBatch. More specifically,
+// this op outputs a copy of the input tensor where values from the `batch`
+// dimension are moved in spatial blocks to the `height` and `width` dimensions,
+// followed by cropping along the `height` and `width` dimensions.
 //
 // Arguments:
-//	value: Any number of dimensions.
-//	bias: 1-D with size the last dimension of `value`.
+//	input: 4-D tensor with shape
+// `[batch*block_size*block_size, height_pad/block_size, width_pad/block_size,
+//   depth]`. Note that the batch size of the input tensor must be divisible by
+// `block_size * block_size`.
+//	crops: 2-D tensor of non-negative integers with shape `[2, 2]`. It specifies
+// how many elements to crop from the intermediate result across the spatial
+// dimensions as follows:
 //
-// Returns Broadcasted sum of `value` and `bias`.
-func BiasAdd(scope *Scope, value tf.Output, bias tf.Output, optional ...BiasAddAttr) (output tf.Output) {
+//     crops = [[crop_top, crop_bottom], [crop_left, crop_right]]
+//
+//
+// Returns 4-D with shape `[batch, height, width, depth]`, where:
+//
+//       height = height_pad - crop_top - crop_bottom
+//       width = width_pad - crop_left - crop_right
+//
+// The attr `block_size` must be greater than one. It indicates the block size.
+//
+// Some examples:
+//
+// (1) For the following input of shape `[4, 1, 1, 1]` and block_size of 2:
+//
+// ```
+// [[[[1]]], [[[2]]], [[[3]]], [[[4]]]]
+// ```
+//
+// The output tensor has shape `[1, 2, 2, 1]` and value:
+//
+// ```
+// x = [[[[1], [2]], [[3], [4]]]]
+// ```
+//
+// (2) For the following input of shape `[4, 1, 1, 3]` and block_size of 2:
+//
+// ```
+// [[[1, 2, 3]], [[4, 5, 6]], [[7, 8, 9]], [[10, 11, 12]]]
+// ```
+//
+// The output tensor has shape `[1, 2, 2, 3]` and value:
+//
+// ```
+// x = [[[[1, 2, 3], [4, 5, 6]],
+//       [[7, 8, 9], [10, 11, 12]]]]
+// ```
+//
+// (3) For the following input of shape `[4, 2, 2, 1]` and block_size of 2:
+//
+// ```
+// x = [[[[1], [3]], [[9], [11]]],
+//      [[[2], [4]], [[10], [12]]],
+//      [[[5], [7]], [[13], [15]]],
+//      [[[6], [8]], [[14], [16]]]]
+// ```
+//
+// The output tensor has shape `[1, 4, 4, 1]` and value:
+//
+// ```
+// x = [[[1],   [2],  [3],  [4]],
+//      [[5],   [6],  [7],  [8]],
+//      [[9],  [10], [11],  [12]],
+//      [[13], [14], [15],  [16]]]
+// ```
+//
+// (4) For the following input of shape `[8, 1, 2, 1]` and block_size of 2:
+//
+// ```
+// x = [[[[1], [3]]], [[[9], [11]]], [[[2], [4]]], [[[10], [12]]],
+//      [[[5], [7]]], [[[13], [15]]], [[[6], [8]]], [[[14], [16]]]]
+// ```
+//
+// The output tensor has shape `[2, 2, 4, 1]` and value:
+//
+// ```
+// x = [[[[1], [3]], [[5], [7]]],
+//      [[[2], [4]], [[10], [12]]],
+//      [[[5], [7]], [[13], [15]]],
+//      [[[6], [8]], [[14], [16]]]]
+// ```
+func BatchToSpace(scope *Scope, input tf.Output, crops tf.Output, block_size int64) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
+	attrs := map[string]interface{}{"block_size": block_size}
 	opspec := tf.OpSpec{
-		Type: "BiasAdd",
+		Type: "BatchToSpace",
 		Input: []tf.Input{
-			value, bias,
+			input, crops,
 		},
 		Attrs: attrs,
 	}
@@ -17858,41 +18475,53 @@ func BiasAdd(scope *Scope, value tf.Output, bias tf.Output, optional ...BiasAddA
 	return op.Output(0)
 }
 
-// SparseReduceSumSparseAttr is an optional argument to SparseReduceSumSparse.
-type SparseReduceSumSparseAttr func(optionalAttr)
+// SparseToDenseAttr is an optional argument to SparseToDense.
+type SparseToDenseAttr func(optionalAttr)
 
-// SparseReduceSumSparseKeepDims sets the optional keep_dims attribute to value.
+// SparseToDenseValidateIndices sets the optional validate_indices attribute to value.
 //
-// value: If true, retain reduced dimensions with length 1.
-// If not specified, defaults to false
-func SparseReduceSumSparseKeepDims(value bool) SparseReduceSumSparseAttr {
+// value: If true, indices are checked to make sure they are sorted in
+// lexicographic order and that there are no repeats.
+// If not specified, defaults to true
+func SparseToDenseValidateIndices(value bool) SparseToDenseAttr {
 	return func(m optionalAttr) {
-		m["keep_dims"] = value
+		m["validate_indices"] = value
 	}
 }
 
-// Computes the sum of elements across dimensions of a SparseTensor.
+// Converts a sparse representation into a dense tensor.
 //
-// This Op takes a SparseTensor and is the sparse counterpart to
-// `tf.reduce_sum()`.  In contrast to SparseReduceSum, this Op returns a
-// SparseTensor.
+// Builds an array `dense` with shape `output_shape` such that
 //
-// Reduces `sp_input` along the dimensions given in `reduction_axes`.  Unless
-// `keep_dims` is true, the rank of the tensor is reduced by 1 for each entry in
-// `reduction_axes`. If `keep_dims` is true, the reduced dimensions are retained
-// with length 1.
+// ```prettyprint
+// # If sparse_indices is scalar
+// dense[i] = (i == sparse_indices ? sparse_values : default_value)
 //
-// If `reduction_axes` has no entries, all dimensions are reduced, and a tensor
-// with a single element is returned.  Additionally, the axes can be negative,
-// which are interpreted according to the indexing rules in Python.
+// # If sparse_indices is a vector, then for each i
+// dense[sparse_indices[i]] = sparse_values[i]
+//
+// # If sparse_indices is an n by d matrix, then for each i in [0, n)
+// dense[sparse_indices[i][0], ..., sparse_indices[i][d-1]] = sparse_values[i]
+// ```
+//
+// All other values in `dense` are set to `default_value`.  If `sparse_values` is a
+// scalar, all sparse indices are set to this single value.
+//
+// Indices should be sorted in lexicographic order, and indices must not
+// contain any repeats. If `validate_indices` is true, these properties
+// are checked during execution.
 //
 // Arguments:
-//	input_indices: 2-D.  `N x R` matrix with the indices of non-empty values in a
-// SparseTensor, possibly not in canonical ordering.
-//	input_values: 1-D.  `N` non-empty values corresponding to `input_indices`.
-//	input_shape: 1-D.  Shape of the input SparseTensor.
-//	reduction_axes: 1-D.  Length-`K` vector containing the reduction axes.
-func SparseReduceSumSparse(scope *Scope, input_indices tf.Output, input_values tf.Output, input_shape tf.Output, reduction_axes tf.Output, optional ...SparseReduceSumSparseAttr) (output_indices tf.Output, output_values tf.Output, output_shape tf.Output) {
+//	sparse_indices: 0-D, 1-D, or 2-D.  `sparse_indices[i]` contains the complete
+// index where `sparse_values[i]` will be placed.
+//	output_shape: 1-D.  Shape of the dense output tensor.
+//	sparse_values: 1-D.  Values corresponding to each row of `sparse_indices`,
+// or a scalar value to be used for all sparse indices.
+//	default_value: Scalar value to set for indices not specified in
+// `sparse_indices`.
+//
+// Returns Dense output tensor of shape `output_shape`.
+func SparseToDense(scope *Scope, sparse_indices tf.Output, output_shape tf.Output, sparse_values tf.Output, default_value tf.Output, optional ...SparseToDenseAttr) (dense tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -17901,65 +18530,117 @@ func SparseReduceSumSparse(scope *Scope, input_indices tf.Output, input_values t
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "SparseReduceSumSparse",
+		Type: "SparseToDense",
 		Input: []tf.Input{
-			input_indices, input_values, input_shape, reduction_axes,
+			sparse_indices, output_shape, sparse_values, default_value,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
+	return op.Output(0)
 }
 
-// Returns x * y element-wise.
+// Computes asin of x element-wise.
+func Asin(scope *Scope, x tf.Output) (y tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "Asin",
+		Input: []tf.Input{
+			x,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Convert the quantized 'input' tensor into a lower-precision 'output', using the
 //
-// *NOTE*: `Mul` supports broadcasting. More about broadcasting
-// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-func Mul(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+// output range specified with 'requested_output_min' and 'requested_output_max'.
+//
+// [input_min, input_max] are scalar floats that specify the range for the float
+// interpretation of the 'input' data. For example, if input_min is -1.0f and
+// input_max is 1.0f, and we are dealing with quint16 quantized data, then a 0
+// value in the 16-bit data should be interpreted as -1.0f, and a 65535 means 1.0f.
+//
+// Arguments:
+//
+//	input_min: The float value that the minimum quantized input value represents.
+//	input_max: The float value that the maximum quantized input value represents.
+//	requested_output_min: The float value that the minimum quantized output value represents.
+//	requested_output_max: The float value that the maximum quantized output value represents.
+//	out_type: The type of the output. Should be a lower bit depth than Tinput.
+//
+// Returns The requested_output_min value is copied into this output.The requested_output_max value is copied into this output.
+func Requantize(scope *Scope, input tf.Output, input_min tf.Output, input_max tf.Output, requested_output_min tf.Output, requested_output_max tf.Output, out_type tf.DataType) (output tf.Output, output_min tf.Output, output_max tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"out_type": out_type}
 	opspec := tf.OpSpec{
-		Type: "Mul",
+		Type: "Requantize",
 		Input: []tf.Input{
-			x, y,
+			input, input_min, input_max, requested_output_min, requested_output_max,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// Returns x / y element-wise.
+// Returns the index with the smallest value across dimensions of a tensor.
 //
-// *NOTE*: `Div` supports broadcasting. More about broadcasting
-// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-func Div(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+// Note that in case of ties the identity of the return value is not guaranteed.
+//
+// Arguments:
+//
+//	dimension: int32, 0 <= dimension < rank(input).  Describes which dimension
+// of the input Tensor to reduce across. For vectors, use dimension = 0.
+func ArgMin(scope *Scope, input tf.Output, dimension tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Div",
+		Type: "ArgMin",
 		Input: []tf.Input{
-			x, y,
+			input, dimension,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// ApproximateEqualAttr is an optional argument to ApproximateEqual.
-type ApproximateEqualAttr func(optionalAttr)
+// ResourceSparseApplyProximalGradientDescentAttr is an optional argument to ResourceSparseApplyProximalGradientDescent.
+type ResourceSparseApplyProximalGradientDescentAttr func(optionalAttr)
 
-// ApproximateEqualTolerance sets the optional tolerance attribute to value.
-// If not specified, defaults to 1e-05
-func ApproximateEqualTolerance(value float32) ApproximateEqualAttr {
+// ResourceSparseApplyProximalGradientDescentUseLocking sets the optional use_locking attribute to value.
+//
+// value: If True, the subtraction will be protected by a lock;
+// otherwise the behavior is undefined, but may exhibit less contention.
+// If not specified, defaults to false
+func ResourceSparseApplyProximalGradientDescentUseLocking(value bool) ResourceSparseApplyProximalGradientDescentAttr {
 	return func(m optionalAttr) {
-		m["tolerance"] = value
+		m["use_locking"] = value
 	}
 }
 
-// Returns the truth value of abs(x-y) < tolerance element-wise.
-func ApproximateEqual(scope *Scope, x tf.Output, y tf.Output, optional ...ApproximateEqualAttr) (z tf.Output) {
+// Sparse update '*var' as FOBOS algorithm with fixed learning rate.
+//
+// That is for rows we have grad for, we update var as follows:
+// prox_v = var - alpha * grad
+// var = sign(prox_v)/(1+alpha*l2) * max{|prox_v|-alpha*l1,0}
+//
+// Arguments:
+//	var_: Should be from a Variable().
+//	alpha: Scaling factor. Must be a scalar.
+//	l1: L1 regularization. Must be a scalar.
+//	l2: L2 regularization. Must be a scalar.
+//	grad: The gradient.
+//	indices: A vector of indices into the first dimension of var and accum.
+//
+// Returns the created operation.
+func ResourceSparseApplyProximalGradientDescent(scope *Scope, var_ tf.Output, alpha tf.Output, l1 tf.Output, l2 tf.Output, grad tf.Output, indices tf.Output, optional ...ResourceSparseApplyProximalGradientDescentAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
@@ -17968,315 +18649,258 @@ func ApproximateEqual(scope *Scope, x tf.Output, y tf.Output, optional ...Approx
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "ApproximateEqual",
+		Type: "ResourceSparseApplyProximalGradientDescent",
 		Input: []tf.Input{
-			x, y,
+			var_, alpha, l1, l2, grad, indices,
 		},
 		Attrs: attrs,
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Returns the max of x and y (i.e. x > y ? x : y) element-wise.
-//
-// *NOTE*: `Maximum` supports broadcasting. More about broadcasting
-// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-func Maximum(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "Maximum",
-		Input: []tf.Input{
-			x, y,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return scope.AddOperation(opspec)
 }
 
-// LogUniformCandidateSamplerAttr is an optional argument to LogUniformCandidateSampler.
-type LogUniformCandidateSamplerAttr func(optionalAttr)
+// InitializeTableFromTextFileV2Attr is an optional argument to InitializeTableFromTextFileV2.
+type InitializeTableFromTextFileV2Attr func(optionalAttr)
 
-// LogUniformCandidateSamplerSeed sets the optional seed attribute to value.
+// InitializeTableFromTextFileV2VocabSize sets the optional vocab_size attribute to value.
 //
-// value: If either seed or seed2 are set to be non-zero, the random number
-// generator is seeded by the given seed.  Otherwise, it is seeded by a
-// random seed.
-// If not specified, defaults to 0
-func LogUniformCandidateSamplerSeed(value int64) LogUniformCandidateSamplerAttr {
+// value: Number of elements of the file, use -1 if unknown.
+// If not specified, defaults to -1
+//
+// REQUIRES: value >= -1
+func InitializeTableFromTextFileV2VocabSize(value int64) InitializeTableFromTextFileV2Attr {
 	return func(m optionalAttr) {
-		m["seed"] = value
+		m["vocab_size"] = value
 	}
 }
 
-// LogUniformCandidateSamplerSeed2 sets the optional seed2 attribute to value.
+// InitializeTableFromTextFileV2Delimiter sets the optional delimiter attribute to value.
 //
-// value: An second seed to avoid seed collision.
-// If not specified, defaults to 0
-func LogUniformCandidateSamplerSeed2(value int64) LogUniformCandidateSamplerAttr {
+// value: Delimiter to separate fields in a line.
+// If not specified, defaults to "\t"
+func InitializeTableFromTextFileV2Delimiter(value string) InitializeTableFromTextFileV2Attr {
 	return func(m optionalAttr) {
-		m["seed2"] = value
+		m["delimiter"] = value
 	}
 }
 
-// Generates labels for candidate sampling with a log-uniform distribution.
-//
-// See explanations of candidate sampling and the data formats at
-// go/candidate-sampling.
+// Initializes a table from a text file.
 //
-// For each batch, this op picks a single set of sampled candidate labels.
+// It inserts one key-value pair into the table for each line of the file.
+// The key and value is extracted from the whole line content, elements from the
+// split line based on `delimiter` or the line number (starting from zero).
+// Where to extract the key and value from a line is specified by `key_index` and
+// `value_index`.
 //
-// The advantages of sampling candidates per-batch are simplicity and the
-// possibility of efficient dense matrix multiplication. The disadvantage is that
-// the sampled candidates must be chosen independently of the context and of the
-// true labels.
+// - A value of -1 means use the line number(starting from zero), expects `int64`.
+// - A value of -2 means use the whole line content, expects `string`.
+// - A value >= 0 means use the index (starting at zero) of the split line based
+//   on `delimiter`.
 //
 // Arguments:
-//	true_classes: A batch_size * num_true matrix, in which each row contains the
-// IDs of the num_true target_classes in the corresponding original label.
-//	num_true: Number of true labels per context.
-//	num_sampled: Number of candidates to randomly sample per batch.
-//	unique: If unique is true, we sample with rejection, so that all sampled
-// candidates in a batch are unique. This requires some approximation to
-// estimate the post-rejection sampling probabilities.
-//	range_max: The sampler will sample integers from the interval [0, range_max).
+//	table_handle: Handle to a table which will be initialized.
+//	filename: Filename of a vocabulary text file.
+//	key_index: Column index in a line to get the table `key` values from.
+//	value_index: Column index that represents information of a line to get the table
+// `value` values from.
 //
-// Returns A vector of length num_sampled, in which each element is
-// the ID of a sampled candidate.A batch_size * num_true matrix, representing
-// the number of times each candidate is expected to occur in a batch
-// of sampled candidates. If unique=true, then this is a probability.A vector of length num_sampled, for each sampled
-// candidate representing the number of times the candidate is expected
-// to occur in a batch of sampled candidates.  If unique=true, then this is a
-// probability.
-func LogUniformCandidateSampler(scope *Scope, true_classes tf.Output, num_true int64, num_sampled int64, unique bool, range_max int64, optional ...LogUniformCandidateSamplerAttr) (sampled_candidates tf.Output, true_expected_count tf.Output, sampled_expected_count tf.Output) {
+// Returns the created operation.
+func InitializeTableFromTextFileV2(scope *Scope, table_handle tf.Output, filename tf.Output, key_index int64, value_index int64, optional ...InitializeTableFromTextFileV2Attr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"num_true": num_true, "num_sampled": num_sampled, "unique": unique, "range_max": range_max}
+	attrs := map[string]interface{}{"key_index": key_index, "value_index": value_index}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "LogUniformCandidateSampler",
+		Type: "InitializeTableFromTextFileV2",
 		Input: []tf.Input{
-			true_classes,
+			table_handle, filename,
 		},
 		Attrs: attrs,
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
+	return scope.AddOperation(opspec)
 }
 
-// Returns the truth value of (x < y) element-wise.
-//
-// *NOTE*: `Less` supports broadcasting. More about broadcasting
-// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-func Less(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+// Computes atan of x element-wise.
+func Atan(scope *Scope, x tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Less",
+		Type: "Atan",
 		Input: []tf.Input{
-			x, y,
+			x,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Compute gradients for a FakeQuantWithMinMaxVars operation.
-//
-// Arguments:
-//	gradients: Backpropagated gradients above the FakeQuantWithMinMaxVars operation.
-//	inputs: Values passed as inputs to the FakeQuantWithMinMaxVars operation.
-// min, max: Quantization interval, scalar floats.
-//
-//
+// ResourceSparseApplyAdadeltaAttr is an optional argument to ResourceSparseApplyAdadelta.
+type ResourceSparseApplyAdadeltaAttr func(optionalAttr)
+
+// ResourceSparseApplyAdadeltaUseLocking sets the optional use_locking attribute to value.
 //
-// Returns Backpropagated gradients w.r.t. inputs:
-// `gradients * (inputs >= min && inputs <= max)`.Backpropagated gradients w.r.t. min parameter:
-// `sum(gradients * (inputs < min))`.Backpropagated gradients w.r.t. max parameter:
-// `sum(gradients * (inputs > max))`.
-func FakeQuantWithMinMaxVarsGradient(scope *Scope, gradients tf.Output, inputs tf.Output, min tf.Output, max tf.Output) (backprops_wrt_input tf.Output, backprop_wrt_min tf.Output, backprop_wrt_max tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "FakeQuantWithMinMaxVarsGradient",
-		Input: []tf.Input{
-			gradients, inputs, min, max,
-		},
+// value: If True, updating of the var and accum tensors will be protected by
+// a lock; otherwise the behavior is undefined, but may exhibit less contention.
+// If not specified, defaults to false
+func ResourceSparseApplyAdadeltaUseLocking(value bool) ResourceSparseApplyAdadeltaAttr {
+	return func(m optionalAttr) {
+		m["use_locking"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// Returns the min of x and y (i.e. x < y ? x : y) element-wise.
+// var: Should be from a Variable().
 //
-// *NOTE*: `Minimum` supports broadcasting. More about broadcasting
-// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-func Minimum(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+// Arguments:
+//
+//	accum: Should be from a Variable().
+//	accum_update: : Should be from a Variable().
+//	lr: Learning rate. Must be a scalar.
+//	rho: Decay factor. Must be a scalar.
+//	epsilon: Constant factor. Must be a scalar.
+//	grad: The gradient.
+//	indices: A vector of indices into the first dimension of var and accum.
+//
+// Returns the created operation.
+func ResourceSparseApplyAdadelta(scope *Scope, var_ tf.Output, accum tf.Output, accum_update tf.Output, lr tf.Output, rho tf.Output, epsilon tf.Output, grad tf.Output, indices tf.Output, optional ...ResourceSparseApplyAdadeltaAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "Minimum",
+		Type: "ResourceSparseApplyAdadelta",
 		Input: []tf.Input{
-			x, y,
+			var_, accum, accum_update, lr, rho, epsilon, grad, indices,
 		},
+		Attrs: attrs,
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return scope.AddOperation(opspec)
 }
 
-// Computes the power of one value to another.
-//
-// Given a tensor `x` and a tensor `y`, this operation computes \\(x^y\\) for
-// corresponding elements in `x` and `y`. For example:
+// Returns which elements of x are NaN.
 //
-// ```
-// # tensor 'x' is [[2, 2]], [3, 3]]
-// # tensor 'y' is [[8, 16], [2, 3]]
-// tf.pow(x, y) ==> [[256, 65536], [9, 27]]
-// ```
-func Pow(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+// @compatibility(numpy)
+// Equivalent to np.isnan
+// @end_compatibility
+func IsNan(scope *Scope, x tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Pow",
+		Type: "IsNan",
 		Input: []tf.Input{
-			x, y,
+			x,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Compute the upper regularized incomplete Gamma function `Q(a, x)`.
-//
-// The upper regularized incomplete Gamma function is defined as:
+// Returns an element-wise indication of the sign of a number.
 //
-// ```
-// Q(a, x) = Gamma(a, x) / Gamma(a) = 1 - P(a, x)
-// ```
-// where
-// ```
-// Gamma(a, x) = int_{x}^{\infty} t^{a-1} exp(-t) dt
-// ```
-// is the upper incomplete Gama function.
+// `y = sign(x) = -1` if `x < 0`; 0 if `x == 0`; 1 if `x > 0`.
 //
-// Note, above `P(a, x)` (`Igamma`) is the lower regularized complete
-// Gamma function.
-func Igammac(scope *Scope, a tf.Output, x tf.Output) (z tf.Output) {
+// For complex numbers, `y = sign(x) = x / |x|` if `x != 0`, otherwise `y = 0`.
+func Sign(scope *Scope, x tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Igammac",
+		Type: "Sign",
 		Input: []tf.Input{
-			a, x,
+			x,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Compute the lower regularized incomplete Gamma function `Q(a, x)`.
-//
-// The lower regularized incomplete Gamma function is defined as:
-//
-// ```
-// P(a, x) = gamma(a, x) / Gamma(a) = 1 - Q(a, x)
-// ```
-// where
-// ```
-// gamma(a, x) = int_{0}^{x} t^{a-1} exp(-t) dt
-// ```
-// is the lower incomplete Gamma function.
-//
-// Note, above `Q(a, x)` (`Igammac`) is the upper regularized complete
-// Gamma function.
-func Igamma(scope *Scope, a tf.Output, x tf.Output) (z tf.Output) {
+// Returns element-wise smallest integer in not less than x.
+func Ceil(scope *Scope, x tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Igamma",
+		Type: "Ceil",
 		Input: []tf.Input{
-			a, x,
+			x,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Compute the regularized incomplete beta integral \\(I_x(a, b)\\).
-//
-// The regularized incomplete beta integral is defined as:
-//
-// ```
-// I_x(a, b) = \frac{B(x; a, b)}{B(a, b)}
-// ```
-// where
-//
-// ```
-// B(x; a, b) = \int_0^x t^{a-1} (1 - t)^{b-1} dt
-// ```
-//
-// is the incomplete beta function and \\(B(a, b)\\) is the *complete*
-// beta function.
-func Betainc(scope *Scope, a tf.Output, b tf.Output, x tf.Output) (z tf.Output) {
+// Computes exponential of x element-wise.  \\(y = e^x\\).
+func Exp(scope *Scope, x tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Betainc",
+		Type: "Exp",
 		Input: []tf.Input{
-			a, b, x,
+			x,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Computes log softmax activations.
+// Computes the Max along segments of a tensor.
 //
-// For each batch `i` and class `j` we have
+// Read @{$math_ops#segmentation$the section on segmentation} for an explanation of
+// segments.
 //
-//     logsoftmax[i, j] = logits[i, j] - log(sum(exp(logits[i])))
+// This operator is similar to the [unsorted segment sum operator](../../../api_docs/python/math_ops.md#UnsortedSegmentSum).
+// Instead of computing the sum over segments, it computes the maximum
+// such that:
+//
+// \\(output_i = \max_j data_j\\) where max is over `j` such
+// that `segment_ids[j] == i`.
+//
+// If the maximum is empty for a given segment ID `i`, it outputs the smallest possible value for specific numeric type,
+//  `output[i] = numeric_limits<T>::min()`.
+//
+// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
+// <img style="width:100%" src="https://www.tensorflow.org/images/UnsortedSegmentSum.png" alt>
+// </div>
 //
 // Arguments:
-//	logits: 2-D with shape `[batch_size, num_classes]`.
 //
-// Returns Same shape as `logits`.
-func LogSoftmax(scope *Scope, logits tf.Output) (logsoftmax tf.Output) {
+//	segment_ids: A 1-D tensor whose rank is equal to the rank of `data`'s
+// first dimension.
+//
+//
+// Returns Has same shape as data, except for dimension 0 which
+// has size `num_segments`.
+func UnsortedSegmentMax(scope *Scope, data tf.Output, segment_ids tf.Output, num_segments tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "LogSoftmax",
+		Type: "UnsortedSegmentMax",
 		Input: []tf.Input{
-			logits,
+			data, segment_ids, num_segments,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Returns the truth value of (x <= y) element-wise.
+// Returns x + y element-wise.
 //
-// *NOTE*: `LessEqual` supports broadcasting. More about broadcasting
+// *NOTE*: `Add` supports broadcasting. `AddN` does not. More about broadcasting
 // [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-func LessEqual(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+func Add(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "LessEqual",
+		Type: "Add",
 		Input: []tf.Input{
 			x, y,
 		},
@@ -18285,121 +18909,89 @@ func LessEqual(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
 	return op.Output(0)
 }
 
-// Returns the truth value of x OR y element-wise.
+// BiasAddAttr is an optional argument to BiasAdd.
+type BiasAddAttr func(optionalAttr)
+
+// BiasAddDataFormat sets the optional data_format attribute to value.
 //
-// *NOTE*: `LogicalOr` supports broadcasting. More about broadcasting
-// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-func LogicalOr(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "LogicalOr",
-		Input: []tf.Input{
-			x, y,
-		},
+// value: Specify the data format of the input and output data. With the
+// default format "NHWC", the bias tensor will be added to the last dimension
+// of the value tensor.
+// Alternatively, the format could be "NCHW", the data storage order of:
+//     [batch, in_channels, in_height, in_width].
+// The tensor will be added to "in_channels", the third-to-the-last
+//     dimension.
+// If not specified, defaults to "NHWC"
+func BiasAddDataFormat(value string) BiasAddAttr {
+	return func(m optionalAttr) {
+		m["data_format"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// Selects elements from `t` or `e`, depending on `condition`.
-//
-// The `t`, and `e` tensors must all have the same shape, and the
-// output will also have that shape.
-//
-// The `condition` tensor must be a scalar if `t` and `e` are scalars.
-// If `t` and `e` are vectors or higher rank, then `condition` must be either a
-// scalar, a vector with size matching the first dimension of `t`, or must have
-// the same shape as `t`.
-//
-// The `condition` tensor acts as a mask that chooses, based on the value at each
-// element, whether the corresponding element / row in the output should be
-// taken from `t` (if true) or `e` (if false).
-//
-// If `condition` is a vector and `t` and `e` are higher rank matrices, then
-// it chooses which row (outer dimension) to copy from `t` and `e`.
-// If `condition` has the same shape as `t` and `e`, then it chooses which
-// element to copy from `t` and `e`.
-//
-// For example:
-//
-// ```prettyprint
-// # 'condition' tensor is [[True,  False]
-// #                        [False, True]]
-// # 't' is [[1, 2],
-// #         [3, 4]]
-// # 'e' is [[5, 6],
-// #         [7, 8]]
-// select(condition, t, e) ==> [[1, 6],
-//                              [7, 4]]
-//
-//
-// # 'condition' tensor is [True, False]
-// # 't' is [[1, 2],
-// #         [3, 4]]
-// # 'e' is [[5, 6],
-// #         [7, 8]]
-// select(condition, t, e) ==> [[1, 2],
-//                              [7, 8]]
+// Adds `bias` to `value`.
 //
-// ```
+// This is a special case of `tf.add` where `bias` is restricted to be 1-D.
+// Broadcasting is supported, so `value` may have any number of dimensions.
 //
 // Arguments:
+//	value: Any number of dimensions.
+//	bias: 1-D with size the last dimension of `value`.
 //
-//	t: = A `Tensor` which may have the same shape as `condition`.
-// If `condition` is rank 1, `t` may have higher rank,
-// but its first dimension must match the size of `condition`.
-//	e: = A `Tensor` with the same type and shape as `t`.
-//
-// Returns = A `Tensor` with the same type and shape as `t` and `e`.
-func Select(scope *Scope, condition tf.Output, t tf.Output, e tf.Output) (output tf.Output) {
+// Returns Broadcasted sum of `value` and `bias`.
+func BiasAdd(scope *Scope, value tf.Output, bias tf.Output, optional ...BiasAddAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "Select",
+		Type: "BiasAdd",
 		Input: []tf.Input{
-			condition, t, e,
+			value, bias,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// MatMulAttr is an optional argument to MatMul.
-type MatMulAttr func(optionalAttr)
+// SparseReduceSumSparseAttr is an optional argument to SparseReduceSumSparse.
+type SparseReduceSumSparseAttr func(optionalAttr)
 
-// MatMulTransposeA sets the optional transpose_a attribute to value.
+// SparseReduceSumSparseKeepDims sets the optional keep_dims attribute to value.
 //
-// value: If true, "a" is transposed before multiplication.
+// value: If true, retain reduced dimensions with length 1.
 // If not specified, defaults to false
-func MatMulTransposeA(value bool) MatMulAttr {
+func SparseReduceSumSparseKeepDims(value bool) SparseReduceSumSparseAttr {
 	return func(m optionalAttr) {
-		m["transpose_a"] = value
+		m["keep_dims"] = value
 	}
 }
 
-// MatMulTransposeB sets the optional transpose_b attribute to value.
+// Computes the sum of elements across dimensions of a SparseTensor.
 //
-// value: If true, "b" is transposed before multiplication.
-// If not specified, defaults to false
-func MatMulTransposeB(value bool) MatMulAttr {
-	return func(m optionalAttr) {
-		m["transpose_b"] = value
-	}
-}
-
-// Multiply the matrix "a" by the matrix "b".
+// This Op takes a SparseTensor and is the sparse counterpart to
+// `tf.reduce_sum()`.  In contrast to SparseReduceSum, this Op returns a
+// SparseTensor.
 //
-// The inputs must be two-dimensional matrices and the inner dimension of
-// "a" (after being transposed if transpose_a is true) must match the
-// outer dimension of "b" (after being transposed if transposed_b is
-// true).
+// Reduces `sp_input` along the dimensions given in `reduction_axes`.  Unless
+// `keep_dims` is true, the rank of the tensor is reduced by 1 for each entry in
+// `reduction_axes`. If `keep_dims` is true, the reduced dimensions are retained
+// with length 1.
 //
-// *Note*: The default kernel implementation for MatMul on GPUs uses
-// cublas.
-func MatMul(scope *Scope, a tf.Output, b tf.Output, optional ...MatMulAttr) (product tf.Output) {
+// If `reduction_axes` has no entries, all dimensions are reduced, and a tensor
+// with a single element is returned.  Additionally, the axes can be negative,
+// which are interpreted according to the indexing rules in Python.
+//
+// Arguments:
+//	input_indices: 2-D.  `N x R` matrix with the indices of non-empty values in a
+// SparseTensor, possibly not in canonical ordering.
+//	input_values: 1-D.  `N` non-empty values corresponding to `input_indices`.
+//	input_shape: 1-D.  Shape of the input SparseTensor.
+//	reduction_axes: 1-D.  Length-`K` vector containing the reduction axes.
+func SparseReduceSumSparse(scope *Scope, input_indices tf.Output, input_values tf.Output, input_shape tf.Output, reduction_axes tf.Output, optional ...SparseReduceSumSparseAttr) (output_indices tf.Output, output_values tf.Output, output_shape tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -18408,42 +19000,65 @@ func MatMul(scope *Scope, a tf.Output, b tf.Output, optional ...MatMulAttr) (pro
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "MatMul",
+		Type: "SparseReduceSumSparse",
 		Input: []tf.Input{
-			a, b,
+			input_indices, input_values, input_shape, reduction_axes,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// MeanAttr is an optional argument to Mean.
-type MeanAttr func(optionalAttr)
+// Returns x * y element-wise.
+//
+// *NOTE*: `Mul` supports broadcasting. More about broadcasting
+// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+func Mul(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "Mul",
+		Input: []tf.Input{
+			x, y,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
 
-// MeanKeepDims sets the optional keep_dims attribute to value.
+// Returns x / y element-wise.
 //
-// value: If true, retain reduced dimensions with length 1.
-// If not specified, defaults to false
-func MeanKeepDims(value bool) MeanAttr {
+// *NOTE*: `Div` supports broadcasting. More about broadcasting
+// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+func Div(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "Div",
+		Input: []tf.Input{
+			x, y,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// ApproximateEqualAttr is an optional argument to ApproximateEqual.
+type ApproximateEqualAttr func(optionalAttr)
+
+// ApproximateEqualTolerance sets the optional tolerance attribute to value.
+// If not specified, defaults to 1e-05
+func ApproximateEqualTolerance(value float32) ApproximateEqualAttr {
 	return func(m optionalAttr) {
-		m["keep_dims"] = value
+		m["tolerance"] = value
 	}
 }
 
-// Computes the mean of elements across dimensions of a tensor.
-//
-// Reduces `input` along the dimensions given in `reduction_indices`. Unless
-// `keep_dims` is true, the rank of the tensor is reduced by 1 for each entry in
-// `reduction_indices`. If `keep_dims` is true, the reduced dimensions are
-// retained with length 1.
-//
-// Arguments:
-//	input: The tensor to reduce.
-//	reduction_indices: The dimensions to reduce.
-//
-// Returns The reduced tensor.
-func Mean(scope *Scope, input tf.Output, reduction_indices tf.Output, optional ...MeanAttr) (output tf.Output) {
+// Returns the truth value of abs(x-y) < tolerance element-wise.
+func ApproximateEqual(scope *Scope, x tf.Output, y tf.Output, optional ...ApproximateEqualAttr) (z tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -18452,9 +19067,9 @@ func Mean(scope *Scope, input tf.Output, reduction_indices tf.Output, optional .
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "Mean",
+		Type: "ApproximateEqual",
 		Input: []tf.Input{
-			input, reduction_indices,
+			x, y,
 		},
 		Attrs: attrs,
 	}
@@ -18462,149 +19077,145 @@ func Mean(scope *Scope, input tf.Output, reduction_indices tf.Output, optional .
 	return op.Output(0)
 }
 
-// Returns which elements of x are finite.
+// Returns the max of x and y (i.e. x > y ? x : y) element-wise.
 //
-// @compatibility(numpy)
-// Equivalent to np.isfinite
-// @end_compatibility
-func IsFinite(scope *Scope, x tf.Output) (y tf.Output) {
+// *NOTE*: `Maximum` supports broadcasting. More about broadcasting
+// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+func Maximum(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "IsFinite",
+		Type: "Maximum",
 		Input: []tf.Input{
-			x,
+			x, y,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Returns the index with the largest value across dimensions of a tensor.
+// LogUniformCandidateSamplerAttr is an optional argument to LogUniformCandidateSampler.
+type LogUniformCandidateSamplerAttr func(optionalAttr)
+
+// LogUniformCandidateSamplerSeed sets the optional seed attribute to value.
+//
+// value: If either seed or seed2 are set to be non-zero, the random number
+// generator is seeded by the given seed.  Otherwise, it is seeded by a
+// random seed.
+// If not specified, defaults to 0
+func LogUniformCandidateSamplerSeed(value int64) LogUniformCandidateSamplerAttr {
+	return func(m optionalAttr) {
+		m["seed"] = value
+	}
+}
+
+// LogUniformCandidateSamplerSeed2 sets the optional seed2 attribute to value.
+//
+// value: An second seed to avoid seed collision.
+// If not specified, defaults to 0
+func LogUniformCandidateSamplerSeed2(value int64) LogUniformCandidateSamplerAttr {
+	return func(m optionalAttr) {
+		m["seed2"] = value
+	}
+}
+
+// Generates labels for candidate sampling with a log-uniform distribution.
+//
+// See explanations of candidate sampling and the data formats at
+// go/candidate-sampling.
+//
+// For each batch, this op picks a single set of sampled candidate labels.
+//
+// The advantages of sampling candidates per-batch are simplicity and the
+// possibility of efficient dense matrix multiplication. The disadvantage is that
+// the sampled candidates must be chosen independently of the context and of the
+// true labels.
 //
 // Arguments:
+//	true_classes: A batch_size * num_true matrix, in which each row contains the
+// IDs of the num_true target_classes in the corresponding original label.
+//	num_true: Number of true labels per context.
+//	num_sampled: Number of candidates to randomly sample.
+//	unique: If unique is true, we sample with rejection, so that all sampled
+// candidates in a batch are unique. This requires some approximation to
+// estimate the post-rejection sampling probabilities.
+//	range_max: The sampler will sample integers from the interval [0, range_max).
 //
-//	dimension: int32, 0 <= dimension < rank(input).  Describes which dimension
-// of the input Tensor to reduce across. For vectors, use dimension = 0.
-func ArgMax(scope *Scope, input tf.Output, dimension tf.Output) (output tf.Output) {
+// Returns A vector of length num_sampled, in which each element is
+// the ID of a sampled candidate.A batch_size * num_true matrix, representing
+// the number of times each candidate is expected to occur in a batch
+// of sampled candidates. If unique=true, then this is a probability.A vector of length num_sampled, for each sampled
+// candidate representing the number of times the candidate is expected
+// to occur in a batch of sampled candidates.  If unique=true, then this is a
+// probability.
+func LogUniformCandidateSampler(scope *Scope, true_classes tf.Output, num_true int64, num_sampled int64, unique bool, range_max int64, optional ...LogUniformCandidateSamplerAttr) (sampled_candidates tf.Output, true_expected_count tf.Output, sampled_expected_count tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"num_true": num_true, "num_sampled": num_sampled, "unique": unique, "range_max": range_max}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "ArgMax",
+		Type: "LogUniformCandidateSampler",
 		Input: []tf.Input{
-			input, dimension,
+			true_classes,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// Computes the sum along segments of a tensor.
-//
-// Read @{$math_ops#segmentation$the section on segmentation} for an explanation of
-// segments.
-//
-// Computes a tensor such that
-// \\(output_i = \sum_j data_j\\) where sum is over `j` such
-// that `segment_ids[j] == i`.
-//
-// If the sum is empty for a given segment ID `i`, `output[i] = 0`.
-//
-// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
-// <img style="width:100%" src="https://www.tensorflow.org/images/SegmentSum.png" alt>
-// </div>
-//
-// Arguments:
-//
-//	segment_ids: A 1-D tensor whose rank is equal to the rank of `data`'s
-// first dimension.  Values should be sorted and can be repeated.
+// Returns the truth value of (x < y) element-wise.
 //
-// Returns Has same shape as data, except for dimension 0 which
-// has size `k`, the number of segments.
-func SegmentSum(scope *Scope, data tf.Output, segment_ids tf.Output) (output tf.Output) {
+// *NOTE*: `Less` supports broadcasting. More about broadcasting
+// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+func Less(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "SegmentSum",
+		Type: "Less",
 		Input: []tf.Input{
-			data, segment_ids,
+			x, y,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// ImageSummaryAttr is an optional argument to ImageSummary.
-type ImageSummaryAttr func(optionalAttr)
-
-// ImageSummaryMaxImages sets the optional max_images attribute to value.
-//
-// value: Max number of batch elements to generate images for.
-// If not specified, defaults to 3
-//
-// REQUIRES: value >= 1
-func ImageSummaryMaxImages(value int64) ImageSummaryAttr {
-	return func(m optionalAttr) {
-		m["max_images"] = value
-	}
-}
+// BiasAddGradAttr is an optional argument to BiasAddGrad.
+type BiasAddGradAttr func(optionalAttr)
 
-// ImageSummaryBadColor sets the optional bad_color attribute to value.
+// BiasAddGradDataFormat sets the optional data_format attribute to value.
 //
-// value: Color to use for pixels with non-finite values.
-// If not specified, defaults to <dtype:DT_UINT8 tensor_shape:<dim:<size:4 > > int_val:255 int_val:0 int_val:0 int_val:255 >
-func ImageSummaryBadColor(value tf.Tensor) ImageSummaryAttr {
+// value: Specify the data format of the input and output data. With the
+// default format "NHWC", the bias tensor will be added to the last dimension
+// of the value tensor.
+// Alternatively, the format could be "NCHW", the data storage order of:
+//     [batch, in_channels, in_height, in_width].
+// The tensor will be added to "in_channels", the third-to-the-last
+//     dimension.
+// If not specified, defaults to "NHWC"
+func BiasAddGradDataFormat(value string) BiasAddGradAttr {
 	return func(m optionalAttr) {
-		m["bad_color"] = value
+		m["data_format"] = value
 	}
 }
 
-// Outputs a `Summary` protocol buffer with images.
-//
-// The summary has up to `max_images` summary values containing images. The
-// images are built from `tensor` which must be 4-D with shape `[batch_size,
-// height, width, channels]` and where `channels` can be:
-//
-// *  1: `tensor` is interpreted as Grayscale.
-// *  3: `tensor` is interpreted as RGB.
-// *  4: `tensor` is interpreted as RGBA.
-//
-// The images have the same number of channels as the input tensor. For float
-// input, the values are normalized one image at a time to fit in the range
-// `[0, 255]`.  `uint8` values are unchanged.  The op uses two different
-// normalization algorithms:
-//
-// *  If the input values are all positive, they are rescaled so the largest one
-//    is 255.
-//
-// *  If any input value is negative, the values are shifted so input value 0.0
-//    is at 127.  They are then rescaled so that either the smallest value is 0,
-//    or the largest one is 255.
-//
-// The `tag` argument is a scalar `Tensor` of type `string`.  It is used to
-// build the `tag` of the summary values:
-//
-// *  If `max_images` is 1, the summary value tag is '*tag*/image'.
-// *  If `max_images` is greater than 1, the summary value tags are
-//    generated sequentially as '*tag*/image/0', '*tag*/image/1', etc.
+// The backward operation for "BiasAdd" on the "bias" tensor.
 //
-// The `bad_color` argument is the color to use in the generated images for
-// non-finite input values.  It is a `unit8` 1-D tensor of length `channels`.
-// Each element must be in the range `[0, 255]` (It represents the value of a
-// pixel in the output image).  Non-finite values in the input tensor are
-// replaced by this tensor in the output image.  The default value is the color
-// red.
+// It accumulates all the values from out_backprop into the feature dimension.
+// For NHWC data format, the feature dimension is the last. For NCHW data format,
+// the feature dimension is the third-to-last.
 //
 // Arguments:
-//	tag: Scalar. Used to build the `tag` attribute of the summary values.
-//	tensor: 4-D of shape `[batch_size, height, width, channels]` where
-// `channels` is 1, 3, or 4.
+//	out_backprop: Any number of dimensions.
 //
-// Returns Scalar. Serialized `Summary` protocol buffer.
-func ImageSummary(scope *Scope, tag tf.Output, tensor tf.Output, optional ...ImageSummaryAttr) (summary tf.Output) {
+// Returns 1-D with size the feature dimension of `out_backprop`.
+func BiasAddGrad(scope *Scope, out_backprop tf.Output, optional ...BiasAddGradAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -18613,9 +19224,9 @@ func ImageSummary(scope *Scope, tag tf.Output, tensor tf.Output, optional ...Ima
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "ImageSummary",
+		Type: "BiasAddGrad",
 		Input: []tf.Input{
-			tag, tensor,
+			out_backprop,
 		},
 		Attrs: attrs,
 	}
@@ -18623,339 +19234,338 @@ func ImageSummary(scope *Scope, tag tf.Output, tensor tf.Output, optional ...Ima
 	return op.Output(0)
 }
 
-// Reshapes a SparseTensor to represent values in a new dense shape.
+// Computes the power of one value to another.
 //
-// This operation has the same semantics as reshape on the represented dense
-// tensor.  The `input_indices` are recomputed based on the requested `new_shape`.
+// Given a tensor `x` and a tensor `y`, this operation computes \\(x^y\\) for
+// corresponding elements in `x` and `y`. For example:
 //
-// If one component of `new_shape` is the special value -1, the size of that
-// dimension is computed so that the total dense size remains constant.  At
-// most one component of `new_shape` can be -1.  The number of dense elements
-// implied by `new_shape` must be the same as the number of dense elements
-// originally implied by `input_shape`.
+// ```
+// # tensor 'x' is [[2, 2]], [3, 3]]
+// # tensor 'y' is [[8, 16], [2, 3]]
+// tf.pow(x, y) ==> [[256, 65536], [9, 27]]
+// ```
+func Pow(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "Pow",
+		Input: []tf.Input{
+			x, y,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Compute the upper regularized incomplete Gamma function `Q(a, x)`.
 //
-// Reshaping does not affect the order of values in the SparseTensor.
+// The upper regularized incomplete Gamma function is defined as:
 //
-// If the input tensor has rank `R_in` and `N` non-empty values, and `new_shape`
-// has length `R_out`, then `input_indices` has shape `[N, R_in]`,
-// `input_shape` has length `R_in`, `output_indices` has shape `[N, R_out]`, and
-// `output_shape` has length `R_out`.
+// \\(Q(a, x) = Gamma(a, x) / Gamma(a) = 1 - P(a, x)\\)
 //
-// Arguments:
-//	input_indices: 2-D.  `N x R_in` matrix with the indices of non-empty values in a
-// SparseTensor.
-//	input_shape: 1-D.  `R_in` vector with the input SparseTensor's dense shape.
-//	new_shape: 1-D.  `R_out` vector with the requested new dense shape.
+// where
+//
+// \\(Gamma(a, x) = int_{x}^{\infty} t^{a-1} exp(-t) dt\\)
+//
+// is the upper incomplete Gama function.
 //
-// Returns 2-D.  `N x R_out` matrix with the updated indices of non-empty
-// values in the output SparseTensor.1-D.  `R_out` vector with the full dense shape of the output
-// SparseTensor.  This is the same as `new_shape` but with any -1 dimensions
-// filled in.
-func SparseReshape(scope *Scope, input_indices tf.Output, input_shape tf.Output, new_shape tf.Output) (output_indices tf.Output, output_shape tf.Output) {
+// Note, above `P(a, x)` (`Igamma`) is the lower regularized complete
+// Gamma function.
+func Igammac(scope *Scope, a tf.Output, x tf.Output) (z tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "SparseReshape",
+		Type: "Igammac",
 		Input: []tf.Input{
-			input_indices, input_shape, new_shape,
+			a, x,
 		},
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1)
+	return op.Output(0)
 }
 
-// Computes the product along segments of a tensor.
+// Compute the lower regularized incomplete Gamma function `Q(a, x)`.
 //
-// Read @{$math_ops#segmentation$the section on segmentation} for an explanation of
-// segments.
+// The lower regularized incomplete Gamma function is defined as:
 //
-// Computes a tensor such that
-// \\(output_i = \prod_j data_j\\) where the product is over `j` such
-// that `segment_ids[j] == i`.
 //
-// If the product is empty for a given segment ID `i`, `output[i] = 1`.
+// \\(P(a, x) = gamma(a, x) / Gamma(a) = 1 - Q(a, x)\\)
 //
-// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
-// <img style="width:100%" src="https://www.tensorflow.org/images/SegmentProd.png" alt>
-// </div>
+// where
 //
-// Arguments:
+// \\(gamma(a, x) = int_{0}^{x} t^{a-1} exp(-t) dt\\)
 //
-//	segment_ids: A 1-D tensor whose rank is equal to the rank of `data`'s
-// first dimension.  Values should be sorted and can be repeated.
+// is the lower incomplete Gamma function.
 //
-// Returns Has same shape as data, except for dimension 0 which
-// has size `k`, the number of segments.
-func SegmentProd(scope *Scope, data tf.Output, segment_ids tf.Output) (output tf.Output) {
+// Note, above `Q(a, x)` (`Igammac`) is the upper regularized complete
+// Gamma function.
+func Igamma(scope *Scope, a tf.Output, x tf.Output) (z tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "SegmentProd",
+		Type: "Igamma",
 		Input: []tf.Input{
-			data, segment_ids,
+			a, x,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Computes the sum along segments of a tensor.
-//
-// Read @{$math_ops#segmentation$the section on segmentation} for an explanation of
-// segments.
-//
-// Computes a tensor such that
-// `(output[i] = sum_{j...} data[j...]` where the sum is over tuples `j...` such
-// that `segment_ids[j...] == i`.  Unlike `SegmentSum`, `segment_ids`
-// need not be sorted and need not cover all values in the full
-// range of valid values.
-//
-// If the sum is empty for a given segment ID `i`, `output[i] = 0`.
-//
-// `num_segments` should equal the number of distinct segment IDs.
-//
-// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
-// <img style="width:100%" src="https://www.tensorflow.org/images/UnsortedSegmentSum.png" alt>
-// </div>
-//
-// Arguments:
-//
-//	segment_ids: A tensor whose shape is a prefix of `data.shape`.
+// Computes arctangent of `y/x` element-wise, respecting signs of the arguments.
 //
-//
-// Returns Has same shape as data, except for the first `segment_ids.rank`
-// dimensions, which are replaced with a single dimension which has size
-// `num_segments`.
-func UnsortedSegmentSum(scope *Scope, data tf.Output, segment_ids tf.Output, num_segments tf.Output) (output tf.Output) {
+// This is the angle \( \theta \in [-\pi, \pi] \) such that
+// \[ x = r \cos(\theta) \]
+// and
+// \[ y = r \sin(\theta) \]
+// where \(r = \sqrt(x^2 + y^2) \).
+func Atan2(scope *Scope, y tf.Output, x tf.Output) (z tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "UnsortedSegmentSum",
+		Type: "Atan2",
 		Input: []tf.Input{
-			data, segment_ids, num_segments,
+			y, x,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Computes the sum along sparse segments of a tensor.
-//
-// Read @{$math_ops#segmentation$the section on segmentation} for an explanation of
-// segments.
-//
-// Like `SegmentSum`, but `segment_ids` can have rank less than `data`'s first
-// dimension, selecting a subset of dimension 0, specified by `indices`.
-//
-// For example:
+// Compute the regularized incomplete beta integral \\(I_x(a, b)\\).
 //
-// ```prettyprint
-// c = tf.constant([[1,2,3,4], [-1,-2,-3,-4], [5,6,7,8]])
+// The regularized incomplete beta integral is defined as:
 //
-// # Select two rows, one segment.
-// tf.sparse_segment_sum(c, tf.constant([0, 1]), tf.constant([0, 0]))
-//   ==> [[0 0 0 0]]
 //
-// # Select two rows, two segment.
-// tf.sparse_segment_sum(c, tf.constant([0, 1]), tf.constant([0, 1]))
-//   ==> [[ 1  2  3  4]
-//        [-1 -2 -3 -4]]
+// \\(I_x(a, b) = \frac{B(x; a, b)}{B(a, b)}\\)
 //
-// # Select all rows, two segments.
-// tf.sparse_segment_sum(c, tf.constant([0, 1, 2]), tf.constant([0, 0, 1]))
-//   ==> [[0 0 0 0]
-//        [5 6 7 8]]
+// where
 //
-// # Which is equivalent to:
-// tf.segment_sum(c, tf.constant([0, 0, 1]))
-// ```
 //
-// Arguments:
+// \\(B(x; a, b) = \int_0^x t^{a-1} (1 - t)^{b-1} dt\\)
 //
-//	indices: A 1-D tensor. Has same rank as `segment_ids`.
-//	segment_ids: A 1-D tensor. Values should be sorted and can be repeated.
 //
-// Returns Has same shape as data, except for dimension 0 which
-// has size `k`, the number of segments.
-func SparseSegmentSum(scope *Scope, data tf.Output, indices tf.Output, segment_ids tf.Output) (output tf.Output) {
+// is the incomplete beta function and \\(B(a, b)\\) is the *complete*
+// beta function.
+func Betainc(scope *Scope, a tf.Output, b tf.Output, x tf.Output) (z tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "SparseSegmentSum",
+		Type: "Betainc",
 		Input: []tf.Input{
-			data, indices, segment_ids,
+			a, b, x,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Counts the number of occurrences of each value in an integer array.
+// Computes log softmax activations.
 //
-// Outputs a vector with length `size` and the same dtype as `weights`. If
-// `weights` are empty, then index `i` stores the number of times the value `i` is
-// counted in `arr`. If `weights` are non-empty, then index `i` stores the sum of
-// the value in `weights` at each index where the corresponding value in `arr` is
-// `i`.
+// For each batch `i` and class `j` we have
 //
-// Values in `arr` outside of the range [0, size) are ignored.
+//     logsoftmax[i, j] = logits[i, j] - log(sum(exp(logits[i])))
 //
 // Arguments:
-//	arr: int32 `Tensor`.
-//	size: non-negative int32 scalar `Tensor`.
-//	weights: is an int32, int64, float32, or float64 `Tensor` with the same
-// shape as `arr`, or a length-0 `Tensor`, in which case it acts as all weights
-// equal to 1.
+//	logits: 2-D with shape `[batch_size, num_classes]`.
 //
-// Returns 1D `Tensor` with length equal to `size`. The counts or summed weights for
-// each value in the range [0, size).
-func Bincount(scope *Scope, arr tf.Output, size tf.Output, weights tf.Output) (bins tf.Output) {
+// Returns Same shape as `logits`.
+func LogSoftmax(scope *Scope, logits tf.Output) (logsoftmax tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Bincount",
+		Type: "LogSoftmax",
 		Input: []tf.Input{
-			arr, size, weights,
+			logits,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Reshapes a quantized tensor as per the Reshape op.
-//
-// ```
-//
-// Arguments:
-//
-//	shape: Defines the shape of the output tensor.
-//	input_min: The minimum value of the input.
-//	input_max: The maximum value of the input.
+// Returns the truth value of (x <= y) element-wise.
 //
-// Returns This value is copied from input_min.This value is copied from input_max.
-func QuantizedReshape(scope *Scope, tensor tf.Output, shape tf.Output, input_min tf.Output, input_max tf.Output) (output tf.Output, output_min tf.Output, output_max tf.Output) {
+// *NOTE*: `LessEqual` supports broadcasting. More about broadcasting
+// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+func LessEqual(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "QuantizedReshape",
+		Type: "LessEqual",
 		Input: []tf.Input{
-			tensor, shape, input_min, input_max,
+			x, y,
 		},
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
+	return op.Output(0)
 }
 
-// Computes gradients for SparseSegmentSqrtN.
-//
-// Returns tensor "output" with same shape as grad, except for dimension 0 whose
-// value is output_dim0.
+// Returns the truth value of x OR y element-wise.
 //
-// Arguments:
-//	grad: gradient propagated to the SparseSegmentSqrtN op.
-//	indices: indices passed to the corresponding SparseSegmentSqrtN op.
-//	segment_ids: segment_ids passed to the corresponding SparseSegmentSqrtN op.
-//	output_dim0: dimension 0 of "data" passed to SparseSegmentSqrtN op.
-func SparseSegmentSqrtNGrad(scope *Scope, grad tf.Output, indices tf.Output, segment_ids tf.Output, output_dim0 tf.Output) (output tf.Output) {
+// *NOTE*: `LogicalOr` supports broadcasting. More about broadcasting
+// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+func LogicalOr(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "SparseSegmentSqrtNGrad",
+		Type: "LogicalOr",
 		Input: []tf.Input{
-			grad, indices, segment_ids, output_dim0,
+			x, y,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Creates a sequence of numbers.
+// Selects elements from `t` or `e`, depending on `condition`.
 //
-// This operation creates a sequence of numbers that begins at `start` and
-// extends by increments of `delta` up to but not including `limit`.
+// The `t`, and `e` tensors must all have the same shape, and the
+// output will also have that shape.
+//
+// The `condition` tensor must be a scalar if `t` and `e` are scalars.
+// If `t` and `e` are vectors or higher rank, then `condition` must be either a
+// scalar, a vector with size matching the first dimension of `t`, or must have
+// the same shape as `t`.
+//
+// The `condition` tensor acts as a mask that chooses, based on the value at each
+// element, whether the corresponding element / row in the output should be
+// taken from `t` (if true) or `e` (if false).
+//
+// If `condition` is a vector and `t` and `e` are higher rank matrices, then
+// it chooses which row (outer dimension) to copy from `t` and `e`.
+// If `condition` has the same shape as `t` and `e`, then it chooses which
+// element to copy from `t` and `e`.
 //
 // For example:
 //
+// ```prettyprint
+// # 'condition' tensor is [[True,  False]
+// #                        [False, True]]
+// # 't' is [[1, 2],
+// #         [3, 4]]
+// # 'e' is [[5, 6],
+// #         [7, 8]]
+// select(condition, t, e) ==> [[1, 6],
+//                              [7, 4]]
+//
+//
+// # 'condition' tensor is [True, False]
+// # 't' is [[1, 2],
+// #         [3, 4]]
+// # 'e' is [[5, 6],
+// #         [7, 8]]
+// select(condition, t, e) ==> [[1, 2],
+//                              [7, 8]]
+//
 // ```
-// # 'start' is 3
-// # 'limit' is 18
-// # 'delta' is 3
-// tf.range(start, limit, delta) ==> [3, 6, 9, 12, 15]
-// ```
 //
-// Arguments:
-//	start: 0-D (scalar). First entry in the sequence.
-//	limit: 0-D (scalar). Upper limit of sequence, exclusive.
-//	delta: 0-D (scalar). Optional. Default is 1. Number that increments `start`.
+// Arguments:
+//
+//	t: = A `Tensor` which may have the same shape as `condition`.
+// If `condition` is rank 1, `t` may have higher rank,
+// but its first dimension must match the size of `condition`.
+//	e: = A `Tensor` with the same type and shape as `t`.
+//
+// Returns = A `Tensor` with the same type and shape as `t` and `e`.
+func Select(scope *Scope, condition tf.Output, t tf.Output, e tf.Output) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "Select",
+		Input: []tf.Input{
+			condition, t, e,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// MatMulAttr is an optional argument to MatMul.
+type MatMulAttr func(optionalAttr)
+
+// MatMulTransposeA sets the optional transpose_a attribute to value.
+//
+// value: If true, "a" is transposed before multiplication.
+// If not specified, defaults to false
+func MatMulTransposeA(value bool) MatMulAttr {
+	return func(m optionalAttr) {
+		m["transpose_a"] = value
+	}
+}
+
+// MatMulTransposeB sets the optional transpose_b attribute to value.
+//
+// value: If true, "b" is transposed before multiplication.
+// If not specified, defaults to false
+func MatMulTransposeB(value bool) MatMulAttr {
+	return func(m optionalAttr) {
+		m["transpose_b"] = value
+	}
+}
+
+// Multiply the matrix "a" by the matrix "b".
 //
-// Returns 1-D.
-func Range(scope *Scope, start tf.Output, limit tf.Output, delta tf.Output) (output tf.Output) {
+// The inputs must be two-dimensional matrices and the inner dimension of
+// "a" (after being transposed if transpose_a is true) must match the
+// outer dimension of "b" (after being transposed if transposed_b is
+// true).
+//
+// *Note*: The default kernel implementation for MatMul on GPUs uses
+// cublas.
+func MatMul(scope *Scope, a tf.Output, b tf.Output, optional ...MatMulAttr) (product tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "Range",
+		Type: "MatMul",
 		Input: []tf.Input{
-			start, limit, delta,
+			a, b,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// ResourceSparseApplyMomentumAttr is an optional argument to ResourceSparseApplyMomentum.
-type ResourceSparseApplyMomentumAttr func(optionalAttr)
-
-// ResourceSparseApplyMomentumUseLocking sets the optional use_locking attribute to value.
-//
-// value: If `True`, updating of the var and accum tensors will be protected
-// by a lock; otherwise the behavior is undefined, but may exhibit less
-// contention.
-// If not specified, defaults to false
-func ResourceSparseApplyMomentumUseLocking(value bool) ResourceSparseApplyMomentumAttr {
-	return func(m optionalAttr) {
-		m["use_locking"] = value
-	}
-}
+// MeanAttr is an optional argument to Mean.
+type MeanAttr func(optionalAttr)
 
-// ResourceSparseApplyMomentumUseNesterov sets the optional use_nesterov attribute to value.
+// MeanKeepDims sets the optional keep_dims attribute to value.
 //
-// value: If `True`, the tensor passed to compute grad will be
-// var - lr * momentum * accum, so in the end, the var you get is actually
-// var - lr * momentum * accum.
+// value: If true, retain reduced dimensions with length 1.
 // If not specified, defaults to false
-func ResourceSparseApplyMomentumUseNesterov(value bool) ResourceSparseApplyMomentumAttr {
+func MeanKeepDims(value bool) MeanAttr {
 	return func(m optionalAttr) {
-		m["use_nesterov"] = value
+		m["keep_dims"] = value
 	}
 }
 
-// Update relevant entries in '*var' and '*accum' according to the momentum scheme.
-//
-// Set use_nesterov = True if you want to use Nesterov momentum.
-//
-// That is for rows we have grad for, we update var and accum as follows:
+// Computes the mean of elements across dimensions of a tensor.
 //
-// accum = accum * momentum + grad
-// var -= lr * accum
+// Reduces `input` along the dimensions given in `reduction_indices`. Unless
+// `keep_dims` is true, the rank of the tensor is reduced by 1 for each entry in
+// `reduction_indices`. If `keep_dims` is true, the reduced dimensions are
+// retained with length 1.
 //
 // Arguments:
-//	var_: Should be from a Variable().
-//	accum: Should be from a Variable().
-//	lr: Learning rate. Must be a scalar.
-//	grad: The gradient.
-//	indices: A vector of indices into the first dimension of var and accum.
-//	momentum: Momentum. Must be a scalar.
+//	input: The tensor to reduce.
+//	reduction_indices: The dimensions to reduce.
 //
-// Returns the created operation.
-func ResourceSparseApplyMomentum(scope *Scope, var_ tf.Output, accum tf.Output, lr tf.Output, grad tf.Output, indices tf.Output, momentum tf.Output, optional ...ResourceSparseApplyMomentumAttr) (o *tf.Operation) {
+// Returns The reduced tensor.
+func Mean(scope *Scope, input tf.Output, reduction_indices tf.Output, optional ...MeanAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -18964,134 +19574,172 @@ func ResourceSparseApplyMomentum(scope *Scope, var_ tf.Output, accum tf.Output,
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "ResourceSparseApplyMomentum",
+		Type: "Mean",
 		Input: []tf.Input{
-			var_, accum, lr, grad, indices, momentum,
+			input, reduction_indices,
 		},
 		Attrs: attrs,
 	}
-	return scope.AddOperation(opspec)
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// Returns the complex conjugate of a complex number.
-//
-// Given a tensor `input` of complex numbers, this operation returns a tensor of
-// complex numbers that are the complex conjugate of each element in `input`. The
-// complex numbers in `input` must be of the form \\(a + bj\\), where *a* is the
-// real part and *b* is the imaginary part.
-//
-// The complex conjugate returned by this operation is of the form \\(a - bj\\).
-//
-// For example:
+// Returns which elements of x are finite.
 //
-// ```
-// # tensor 'input' is [-2.25 + 4.75j, 3.25 + 5.75j]
-// tf.conj(input) ==> [-2.25 - 4.75j, 3.25 - 5.75j]
-// ```
-func Conj(scope *Scope, input tf.Output) (output tf.Output) {
+// @compatibility(numpy)
+// Equivalent to np.isfinite
+// @end_compatibility
+func IsFinite(scope *Scope, x tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Conj",
+		Type: "IsFinite",
 		Input: []tf.Input{
-			input,
+			x,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// StatelessTruncatedNormalAttr is an optional argument to StatelessTruncatedNormal.
-type StatelessTruncatedNormalAttr func(optionalAttr)
-
-// StatelessTruncatedNormalDtype sets the optional dtype attribute to value.
+// Returns the index with the largest value across dimensions of a tensor.
 //
-// value: The type of the output.
-// If not specified, defaults to DT_FLOAT
-func StatelessTruncatedNormalDtype(value tf.DataType) StatelessTruncatedNormalAttr {
-	return func(m optionalAttr) {
-		m["dtype"] = value
+// Note that in case of ties the identity of the return value is not guaranteed.
+//
+// Arguments:
+//
+//	dimension: int32, 0 <= dimension < rank(input).  Describes which dimension
+// of the input Tensor to reduce across. For vectors, use dimension = 0.
+func ArgMax(scope *Scope, input tf.Output, dimension tf.Output) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "ArgMax",
+		Input: []tf.Input{
+			input, dimension,
+		},
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// Outputs deterministic pseudorandom values from a truncated normal distribution.
+// Computes the sum along segments of a tensor.
 //
-// The generated values follow a normal distribution with mean 0 and standard
-// deviation 1, except that values whose magnitude is more than 2 standard
-// deviations from the mean are dropped and re-picked.
+// Read @{$math_ops#segmentation$the section on segmentation} for an explanation of
+// segments.
 //
-// The outputs are a deterministic function of `shape` and `seed`.
+// Computes a tensor such that
+// \\(output_i = \sum_j data_j\\) where sum is over `j` such
+// that `segment_ids[j] == i`.
+//
+// If the sum is empty for a given segment ID `i`, `output[i] = 0`.
+//
+// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
+// <img style="width:100%" src="https://www.tensorflow.org/images/SegmentSum.png" alt>
+// </div>
 //
 // Arguments:
-//	shape: The shape of the output tensor.
-//	seed: 2 seeds (shape [2]).
 //
-// Returns Random values with specified shape.
-func StatelessTruncatedNormal(scope *Scope, shape tf.Output, seed tf.Output, optional ...StatelessTruncatedNormalAttr) (output tf.Output) {
+//	segment_ids: A 1-D tensor whose rank is equal to the rank of `data`'s
+// first dimension.  Values should be sorted and can be repeated.
+//
+// Returns Has same shape as data, except for dimension 0 which
+// has size `k`, the number of segments.
+func SegmentSum(scope *Scope, data tf.Output, segment_ids tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "StatelessTruncatedNormal",
+		Type: "SegmentSum",
 		Input: []tf.Input{
-			shape, seed,
+			data, segment_ids,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// RestoreSliceAttr is an optional argument to RestoreSlice.
-type RestoreSliceAttr func(optionalAttr)
+// ImageSummaryAttr is an optional argument to ImageSummary.
+type ImageSummaryAttr func(optionalAttr)
 
-// RestoreSlicePreferredShard sets the optional preferred_shard attribute to value.
+// ImageSummaryMaxImages sets the optional max_images attribute to value.
 //
-// value: Index of file to open first if multiple files match
-// `file_pattern`. See the documentation for `Restore`.
-// If not specified, defaults to -1
-func RestoreSlicePreferredShard(value int64) RestoreSliceAttr {
+// value: Max number of batch elements to generate images for.
+// If not specified, defaults to 3
+//
+// REQUIRES: value >= 1
+func ImageSummaryMaxImages(value int64) ImageSummaryAttr {
 	return func(m optionalAttr) {
-		m["preferred_shard"] = value
+		m["max_images"] = value
 	}
 }
 
-// Restores a tensor from checkpoint files.
+// ImageSummaryBadColor sets the optional bad_color attribute to value.
 //
-// This is like `Restore` except that restored tensor can be listed as filling
-// only a slice of a larger tensor.  `shape_and_slice` specifies the shape of the
-// larger tensor and the slice that the restored tensor covers.
+// value: Color to use for pixels with non-finite values.
+// If not specified, defaults to <dtype:DT_UINT8 tensor_shape:<dim:<size:4 > > int_val:255 int_val:0 int_val:0 int_val:255 >
+func ImageSummaryBadColor(value tf.Tensor) ImageSummaryAttr {
+	return func(m optionalAttr) {
+		m["bad_color"] = value
+	}
+}
+
+// Outputs a `Summary` protocol buffer with images.
 //
-// The `shape_and_slice` input has the same format as the
-// elements of the `shapes_and_slices` input of the `SaveSlices` op.
+// The summary has up to `max_images` summary values containing images. The
+// images are built from `tensor` which must be 4-D with shape `[batch_size,
+// height, width, channels]` and where `channels` can be:
 //
-// Arguments:
-//	file_pattern: Must have a single element. The pattern of the files from
-// which we read the tensor.
-//	tensor_name: Must have a single element. The name of the tensor to be
-// restored.
-//	shape_and_slice: Scalar. The shapes and slice specifications to use when
-// restoring a tensors.
-//	dt: The type of the tensor to be restored.
+// *  1: `tensor` is interpreted as Grayscale.
+// *  3: `tensor` is interpreted as RGB.
+// *  4: `tensor` is interpreted as RGBA.
 //
-// Returns The restored tensor.
-func RestoreSlice(scope *Scope, file_pattern tf.Output, tensor_name tf.Output, shape_and_slice tf.Output, dt tf.DataType, optional ...RestoreSliceAttr) (tensor tf.Output) {
+// The images have the same number of channels as the input tensor. For float
+// input, the values are normalized one image at a time to fit in the range
+// `[0, 255]`.  `uint8` values are unchanged.  The op uses two different
+// normalization algorithms:
+//
+// *  If the input values are all positive, they are rescaled so the largest one
+//    is 255.
+//
+// *  If any input value is negative, the values are shifted so input value 0.0
+//    is at 127.  They are then rescaled so that either the smallest value is 0,
+//    or the largest one is 255.
+//
+// The `tag` argument is a scalar `Tensor` of type `string`.  It is used to
+// build the `tag` of the summary values:
+//
+// *  If `max_images` is 1, the summary value tag is '*tag*/image'.
+// *  If `max_images` is greater than 1, the summary value tags are
+//    generated sequentially as '*tag*/image/0', '*tag*/image/1', etc.
+//
+// The `bad_color` argument is the color to use in the generated images for
+// non-finite input values.  It is a `unit8` 1-D tensor of length `channels`.
+// Each element must be in the range `[0, 255]` (It represents the value of a
+// pixel in the output image).  Non-finite values in the input tensor are
+// replaced by this tensor in the output image.  The default value is the color
+// red.
+//
+// Arguments:
+//	tag: Scalar. Used to build the `tag` attribute of the summary values.
+//	tensor: 4-D of shape `[batch_size, height, width, channels]` where
+// `channels` is 1, 3, or 4.
+//
+// Returns Scalar. Serialized `Summary` protocol buffer.
+func ImageSummary(scope *Scope, tag tf.Output, tensor tf.Output, optional ...ImageSummaryAttr) (summary tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"dt": dt}
+	attrs := map[string]interface{}{}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "RestoreSlice",
+		Type: "ImageSummary",
 		Input: []tf.Input{
-			file_pattern, tensor_name, shape_and_slice,
+			tag, tensor,
 		},
 		Attrs: attrs,
 	}
@@ -19099,39 +19747,79 @@ func RestoreSlice(scope *Scope, file_pattern tf.Output, tensor_name tf.Output, s
 	return op.Output(0)
 }
 
-// A placeholder op that passes through `input` when its output is not fed.
+// Reshapes a SparseTensor to represent values in a new dense shape.
+//
+// This operation has the same semantics as reshape on the represented dense
+// tensor.  The `input_indices` are recomputed based on the requested `new_shape`.
+//
+// If one component of `new_shape` is the special value -1, the size of that
+// dimension is computed so that the total dense size remains constant.  At
+// most one component of `new_shape` can be -1.  The number of dense elements
+// implied by `new_shape` must be the same as the number of dense elements
+// originally implied by `input_shape`.
+//
+// Reshaping does not affect the order of values in the SparseTensor.
+//
+// If the input tensor has rank `R_in` and `N` non-empty values, and `new_shape`
+// has length `R_out`, then `input_indices` has shape `[N, R_in]`,
+// `input_shape` has length `R_in`, `output_indices` has shape `[N, R_out]`, and
+// `output_shape` has length `R_out`.
 //
 // Arguments:
-//	input: The default value to produce when `output` is not fed.
-//	shape: The (possibly partial) shape of the tensor.
+//	input_indices: 2-D.  `N x R_in` matrix with the indices of non-empty values in a
+// SparseTensor.
+//	input_shape: 1-D.  `R_in` vector with the input SparseTensor's dense shape.
+//	new_shape: 1-D.  `R_out` vector with the requested new dense shape.
 //
-// Returns A placeholder tensor that defaults to `input` if it is not fed.
-func PlaceholderWithDefault(scope *Scope, input tf.Output, shape tf.Shape) (output tf.Output) {
+// Returns 2-D.  `N x R_out` matrix with the updated indices of non-empty
+// values in the output SparseTensor.1-D.  `R_out` vector with the full dense shape of the output
+// SparseTensor.  This is the same as `new_shape` but with any -1 dimensions
+// filled in.
+func SparseReshape(scope *Scope, input_indices tf.Output, input_shape tf.Output, new_shape tf.Output) (output_indices tf.Output, output_shape tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"shape": shape}
 	opspec := tf.OpSpec{
-		Type: "PlaceholderWithDefault",
+		Type: "SparseReshape",
 		Input: []tf.Input{
-			input,
+			input_indices, input_shape, new_shape,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1)
 }
 
-// Deprecated. Use TensorArrayReadV3
-func TensorArrayReadV2(scope *Scope, handle tf.Output, index tf.Output, flow_in tf.Output, dtype tf.DataType) (value tf.Output) {
+// Bucketizes 'input' based on 'boundaries'.
+//
+// For example, if the inputs are
+//     boundaries = [0, 10, 100]
+//     input = [[-5, 10000]
+//              [150,   10]
+//              [5,    100]]
+//
+// then the output will be
+//     output = [[0, 3]
+//               [3, 2]
+//               [1, 3]]
+//
+// Arguments:
+//	input: Any shape of Tensor contains with int or float type.
+//	boundaries: A sorted list of floats gives the boundary of the buckets.
+//
+// Returns Same shape with 'input', each value of input replaced with bucket index.
+//
+// @compatibility(numpy)
+// Equivalent to np.digitize.
+// @end_compatibility
+func Bucketize(scope *Scope, input tf.Output, boundaries []float32) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"dtype": dtype}
+	attrs := map[string]interface{}{"boundaries": boundaries}
 	opspec := tf.OpSpec{
-		Type: "TensorArrayReadV2",
+		Type: "Bucketize",
 		Input: []tf.Input{
-			handle, index, flow_in,
+			input,
 		},
 		Attrs: attrs,
 	}
@@ -19139,308 +19827,297 @@ func TensorArrayReadV2(scope *Scope, handle tf.Output, index tf.Output, flow_in
 	return op.Output(0)
 }
 
-// QuantizedMatMulAttr is an optional argument to QuantizedMatMul.
-type QuantizedMatMulAttr func(optionalAttr)
-
-// QuantizedMatMulToutput sets the optional Toutput attribute to value.
-// If not specified, defaults to DT_QINT32
-func QuantizedMatMulToutput(value tf.DataType) QuantizedMatMulAttr {
-	return func(m optionalAttr) {
-		m["Toutput"] = value
-	}
-}
-
-// QuantizedMatMulTransposeA sets the optional transpose_a attribute to value.
+// Computes the product along segments of a tensor.
 //
-// value: If true, `a` is transposed before multiplication.
-// If not specified, defaults to false
-func QuantizedMatMulTransposeA(value bool) QuantizedMatMulAttr {
-	return func(m optionalAttr) {
-		m["transpose_a"] = value
-	}
-}
-
-// QuantizedMatMulTransposeB sets the optional transpose_b attribute to value.
+// Read @{$math_ops#segmentation$the section on segmentation} for an explanation of
+// segments.
 //
-// value: If true, `b` is transposed before multiplication.
-// If not specified, defaults to false
-func QuantizedMatMulTransposeB(value bool) QuantizedMatMulAttr {
-	return func(m optionalAttr) {
-		m["transpose_b"] = value
-	}
-}
-
-// QuantizedMatMulTactivation sets the optional Tactivation attribute to value.
+// Computes a tensor such that
+// \\(output_i = \prod_j data_j\\) where the product is over `j` such
+// that `segment_ids[j] == i`.
 //
-// value: The type of output produced by activation function
-// following this operation.
-// If not specified, defaults to DT_QUINT8
-func QuantizedMatMulTactivation(value tf.DataType) QuantizedMatMulAttr {
-	return func(m optionalAttr) {
-		m["Tactivation"] = value
-	}
-}
-
-// Perform a quantized matrix multiplication of  `a` by the matrix `b`.
+// If the product is empty for a given segment ID `i`, `output[i] = 1`.
 //
-// The inputs must be two-dimensional matrices and the inner dimension of
-// `a` (after being transposed if `transpose_a` is non-zero) must match the
-// outer dimension of `b` (after being transposed if `transposed_b` is
-// non-zero).
+// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
+// <img style="width:100%" src="https://www.tensorflow.org/images/SegmentProd.png" alt>
+// </div>
 //
 // Arguments:
-//	a: Must be a two-dimensional tensor.
-//	b: Must be a two-dimensional tensor.
-//	min_a: The float value that the lowest quantized `a` value represents.
-//	max_a: The float value that the highest quantized `a` value represents.
-//	min_b: The float value that the lowest quantized `b` value represents.
-//	max_b: The float value that the highest quantized `b` value represents.
 //
-// Returns The float value that the lowest quantized output value represents.The float value that the highest quantized output value represents.
-func QuantizedMatMul(scope *Scope, a tf.Output, b tf.Output, min_a tf.Output, max_a tf.Output, min_b tf.Output, max_b tf.Output, optional ...QuantizedMatMulAttr) (out tf.Output, min_out tf.Output, max_out tf.Output) {
+//	segment_ids: A 1-D tensor whose rank is equal to the rank of `data`'s
+// first dimension.  Values should be sorted and can be repeated.
+//
+// Returns Has same shape as data, except for dimension 0 which
+// has size `k`, the number of segments.
+func SegmentProd(scope *Scope, data tf.Output, segment_ids tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "QuantizedMatMul",
+		Type: "SegmentProd",
 		Input: []tf.Input{
-			a, b, min_a, max_a, min_b, max_b,
+			data, segment_ids,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
-}
-
-// QuantizedMulAttr is an optional argument to QuantizedMul.
-type QuantizedMulAttr func(optionalAttr)
-
-// QuantizedMulToutput sets the optional Toutput attribute to value.
-// If not specified, defaults to DT_QINT32
-func QuantizedMulToutput(value tf.DataType) QuantizedMulAttr {
-	return func(m optionalAttr) {
-		m["Toutput"] = value
-	}
+	return op.Output(0)
 }
 
-// Returns x * y element-wise, working on quantized buffers.
+// Computes the sum along segments of a tensor.
 //
-// Arguments:
+// Read @{$math_ops#segmentation$the section on segmentation} for an explanation of
+// segments.
+//
+// Computes a tensor such that
+// `(output[i] = sum_{j...} data[j...]` where the sum is over tuples `j...` such
+// that `segment_ids[j...] == i`.  Unlike `SegmentSum`, `segment_ids`
+// need not be sorted and need not cover all values in the full
+// range of valid values.
 //
+// If the sum is empty for a given segment ID `i`, `output[i] = 0`.
 //
-//	min_x: The float value that the lowest quantized `x` value represents.
-//	max_x: The float value that the highest quantized `x` value represents.
-//	min_y: The float value that the lowest quantized `y` value represents.
-//	max_y: The float value that the highest quantized `y` value represents.
+// `num_segments` should equal the number of distinct segment IDs.
 //
-// Returns The float value that the lowest quantized output value represents.The float value that the highest quantized output value represents.
+// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
+// <img style="width:100%" src="https://www.tensorflow.org/images/UnsortedSegmentSum.png" alt>
+// </div>
 //
-// *NOTE*: `QuantizedMul` supports limited forms of broadcasting. More about
-// broadcasting [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-func QuantizedMul(scope *Scope, x tf.Output, y tf.Output, min_x tf.Output, max_x tf.Output, min_y tf.Output, max_y tf.Output, optional ...QuantizedMulAttr) (z tf.Output, min_z tf.Output, max_z tf.Output) {
+// Arguments:
+//
+//	segment_ids: A tensor whose shape is a prefix of `data.shape`.
+//
+//
+// Returns Has same shape as data, except for the first `segment_ids.rank`
+// dimensions, which are replaced with a single dimension which has size
+// `num_segments`.
+func UnsortedSegmentSum(scope *Scope, data tf.Output, segment_ids tf.Output, num_segments tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "QuantizedMul",
+		Type: "UnsortedSegmentSum",
 		Input: []tf.Input{
-			x, y, min_x, max_x, min_y, max_y,
+			data, segment_ids, num_segments,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
+	return op.Output(0)
 }
 
-// QueueEnqueueManyV2Attr is an optional argument to QueueEnqueueManyV2.
-type QueueEnqueueManyV2Attr func(optionalAttr)
-
-// QueueEnqueueManyV2TimeoutMs sets the optional timeout_ms attribute to value.
+// Computes the sum along sparse segments of a tensor.
 //
-// value: If the queue is too full, this operation will block for up
-// to timeout_ms milliseconds.
-// Note: This option is not supported yet.
-// If not specified, defaults to -1
-func QueueEnqueueManyV2TimeoutMs(value int64) QueueEnqueueManyV2Attr {
-	return func(m optionalAttr) {
-		m["timeout_ms"] = value
+// Read @{$math_ops#segmentation$the section on segmentation} for an explanation of
+// segments.
+//
+// Like `SegmentSum`, but `segment_ids` can have rank less than `data`'s first
+// dimension, selecting a subset of dimension 0, specified by `indices`.
+//
+// For example:
+//
+// ```prettyprint
+// c = tf.constant([[1,2,3,4], [-1,-2,-3,-4], [5,6,7,8]])
+//
+// # Select two rows, one segment.
+// tf.sparse_segment_sum(c, tf.constant([0, 1]), tf.constant([0, 0]))
+//   ==> [[0 0 0 0]]
+//
+// # Select two rows, two segment.
+// tf.sparse_segment_sum(c, tf.constant([0, 1]), tf.constant([0, 1]))
+//   ==> [[ 1  2  3  4]
+//        [-1 -2 -3 -4]]
+//
+// # Select all rows, two segments.
+// tf.sparse_segment_sum(c, tf.constant([0, 1, 2]), tf.constant([0, 0, 1]))
+//   ==> [[0 0 0 0]
+//        [5 6 7 8]]
+//
+// # Which is equivalent to:
+// tf.segment_sum(c, tf.constant([0, 0, 1]))
+// ```
+//
+// Arguments:
+//
+//	indices: A 1-D tensor. Has same rank as `segment_ids`.
+//	segment_ids: A 1-D tensor. Values should be sorted and can be repeated.
+//
+// Returns Has same shape as data, except for dimension 0 which
+// has size `k`, the number of segments.
+func SparseSegmentSum(scope *Scope, data tf.Output, indices tf.Output, segment_ids tf.Output) (output tf.Output) {
+	if scope.Err() != nil {
+		return
 	}
+	opspec := tf.OpSpec{
+		Type: "SparseSegmentSum",
+		Input: []tf.Input{
+			data, indices, segment_ids,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// Enqueues zero or more tuples of one or more tensors in the given queue.
-//
-// This operation slices each component tensor along the 0th dimension to
-// make multiple queue elements. All of the tuple components must have the
-// same size in the 0th dimension.
+// Counts the number of occurrences of each value in an integer array.
 //
-// The components input has k elements, which correspond to the components of
-// tuples stored in the given queue.
+// Outputs a vector with length `size` and the same dtype as `weights`. If
+// `weights` are empty, then index `i` stores the number of times the value `i` is
+// counted in `arr`. If `weights` are non-empty, then index `i` stores the sum of
+// the value in `weights` at each index where the corresponding value in `arr` is
+// `i`.
 //
-// N.B. If the queue is full, this operation will block until the given
-// elements have been enqueued (or 'timeout_ms' elapses, if specified).
+// Values in `arr` outside of the range [0, size) are ignored.
 //
 // Arguments:
-//	handle: The handle to a queue.
-//	components: One or more tensors from which the enqueued tensors should
-// be taken.
+//	arr: int32 `Tensor`.
+//	size: non-negative int32 scalar `Tensor`.
+//	weights: is an int32, int64, float32, or float64 `Tensor` with the same
+// shape as `arr`, or a length-0 `Tensor`, in which case it acts as all weights
+// equal to 1.
 //
-// Returns the created operation.
-func QueueEnqueueManyV2(scope *Scope, handle tf.Output, components []tf.Output, optional ...QueueEnqueueManyV2Attr) (o *tf.Operation) {
+// Returns 1D `Tensor` with length equal to `size`. The counts or summed weights for
+// each value in the range [0, size).
+func Bincount(scope *Scope, arr tf.Output, size tf.Output, weights tf.Output) (bins tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "QueueEnqueueManyV2",
+		Type: "Bincount",
 		Input: []tf.Input{
-			handle, tf.OutputList(components),
+			arr, size, weights,
 		},
-		Attrs: attrs,
 	}
-	return scope.AddOperation(opspec)
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// Forwards the input to the output.
+// Reshapes a quantized tensor as per the Reshape op.
 //
-// This operator represents the loop termination condition used by the
-// "pivot" switches of a loop.
+// ```
 //
 // Arguments:
-//	input: A boolean scalar, representing the branch predicate of the Switch op.
 //
-// Returns The same tensor as `input`.
-func LoopCond(scope *Scope, input tf.Output) (output tf.Output) {
+//	shape: Defines the shape of the output tensor.
+//	input_min: The minimum value of the input.
+//	input_max: The maximum value of the input.
+//
+// Returns This value is copied from input_min.This value is copied from input_max.
+func QuantizedReshape(scope *Scope, tensor tf.Output, shape tf.Output, input_min tf.Output, input_max tf.Output) (output tf.Output, output_min tf.Output, output_max tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "LoopCond",
+		Type: "QuantizedReshape",
 		Input: []tf.Input{
-			input,
+			tensor, shape, input_min, input_max,
 		},
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// Returns (x - y)(x - y) element-wise.
+// Computes gradients for SparseSegmentSqrtN.
 //
-// *NOTE*: `SquaredDifference` supports broadcasting. More about broadcasting
-// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-func SquaredDifference(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+// Returns tensor "output" with same shape as grad, except for dimension 0 whose
+// value is output_dim0.
+//
+// Arguments:
+//	grad: gradient propagated to the SparseSegmentSqrtN op.
+//	indices: indices passed to the corresponding SparseSegmentSqrtN op.
+//	segment_ids: segment_ids passed to the corresponding SparseSegmentSqrtN op.
+//	output_dim0: dimension 0 of "data" passed to SparseSegmentSqrtN op.
+func SparseSegmentSqrtNGrad(scope *Scope, grad tf.Output, indices tf.Output, segment_ids tf.Output, output_dim0 tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "SquaredDifference",
+		Type: "SparseSegmentSqrtNGrad",
 		Input: []tf.Input{
-			x, y,
+			grad, indices, segment_ids, output_dim0,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Convert the quantized 'input' tensor into a lower-precision 'output', using the
-//
-// actual distribution of the values to maximize the usage of the lower bit depth
-// and adjusting the output min and max ranges accordingly.
+// Creates a sequence of numbers.
 //
-// [input_min, input_max] are scalar floats that specify the range for the float
-// interpretation of the 'input' data. For example, if input_min is -1.0f and
-// input_max is 1.0f, and we are dealing with quint16 quantized data, then a 0
-// value in the 16-bit data should be interpreted as -1.0f, and a 65535 means 1.0f.
+// This operation creates a sequence of numbers that begins at `start` and
+// extends by increments of `delta` up to but not including `limit`.
 //
-// This operator tries to squeeze as much precision as possible into an output with
-// a lower bit depth by calculating the actual min and max values found in the
-// data. For example, maybe that quint16 input has no values lower than 16,384 and
-// none higher than 49,152. That means only half the range is actually needed, all
-// the float interpretations are between -0.5f and 0.5f, so if we want to compress
-// the data into a quint8 output, we can use that range rather than the theoretical
-// -1.0f to 1.0f that is suggested by the input min and max.
+// For example:
 //
-// In practice, this is most useful for taking output from operations like
-// QuantizedMatMul that can produce higher bit-depth outputs than their inputs and
-// may have large potential output ranges, but in practice have a distribution of
-// input values that only uses a small fraction of the possible range. By feeding
-// that output into this operator, we can reduce it from 32 bits down to 8 with
-// minimal loss of accuracy.
+// ```
+// # 'start' is 3
+// # 'limit' is 18
+// # 'delta' is 3
+// tf.range(start, limit, delta) ==> [3, 6, 9, 12, 15]
+// ```
 //
 // Arguments:
+//	start: 0-D (scalar). First entry in the sequence.
+//	limit: 0-D (scalar). Upper limit of sequence, exclusive.
+//	delta: 0-D (scalar). Optional. Default is 1. Number that increments `start`.
 //
-//	input_min: The float value that the minimum quantized input value represents.
-//	input_max: The float value that the maximum quantized input value represents.
-//	out_type: The type of the output. Should be a lower bit depth than Tinput.
-//
-// Returns The float value that the minimum quantized output value represents.The float value that the maximum quantized output value represents.
-func QuantizeDownAndShrinkRange(scope *Scope, input tf.Output, input_min tf.Output, input_max tf.Output, out_type tf.DataType) (output tf.Output, output_min tf.Output, output_max tf.Output) {
+// Returns 1-D.
+func Range(scope *Scope, start tf.Output, limit tf.Output, delta tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"out_type": out_type}
 	opspec := tf.OpSpec{
-		Type: "QuantizeDownAndShrinkRange",
+		Type: "Range",
 		Input: []tf.Input{
-			input, input_min, input_max,
+			start, limit, delta,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
+	return op.Output(0)
 }
 
-// DecodePngAttr is an optional argument to DecodePng.
-type DecodePngAttr func(optionalAttr)
+// ResourceSparseApplyMomentumAttr is an optional argument to ResourceSparseApplyMomentum.
+type ResourceSparseApplyMomentumAttr func(optionalAttr)
 
-// DecodePngChannels sets the optional channels attribute to value.
+// ResourceSparseApplyMomentumUseLocking sets the optional use_locking attribute to value.
 //
-// value: Number of color channels for the decoded image.
-// If not specified, defaults to 0
-func DecodePngChannels(value int64) DecodePngAttr {
+// value: If `True`, updating of the var and accum tensors will be protected
+// by a lock; otherwise the behavior is undefined, but may exhibit less
+// contention.
+// If not specified, defaults to false
+func ResourceSparseApplyMomentumUseLocking(value bool) ResourceSparseApplyMomentumAttr {
 	return func(m optionalAttr) {
-		m["channels"] = value
+		m["use_locking"] = value
 	}
 }
 
-// DecodePngDtype sets the optional dtype attribute to value.
-// If not specified, defaults to DT_UINT8
-func DecodePngDtype(value tf.DataType) DecodePngAttr {
+// ResourceSparseApplyMomentumUseNesterov sets the optional use_nesterov attribute to value.
+//
+// value: If `True`, the tensor passed to compute grad will be
+// var - lr * momentum * accum, so in the end, the var you get is actually
+// var - lr * momentum * accum.
+// If not specified, defaults to false
+func ResourceSparseApplyMomentumUseNesterov(value bool) ResourceSparseApplyMomentumAttr {
 	return func(m optionalAttr) {
-		m["dtype"] = value
+		m["use_nesterov"] = value
 	}
 }
 
-// Decode a PNG-encoded image to a uint8 or uint16 tensor.
-//
-// The attr `channels` indicates the desired number of color channels for the
-// decoded image.
+// Update relevant entries in '*var' and '*accum' according to the momentum scheme.
 //
-// Accepted values are:
+// Set use_nesterov = True if you want to use Nesterov momentum.
 //
-// *   0: Use the number of channels in the PNG-encoded image.
-// *   1: output a grayscale image.
-// *   3: output an RGB image.
-// *   4: output an RGBA image.
+// That is for rows we have grad for, we update var and accum as follows:
 //
-// If needed, the PNG-encoded image is transformed to match the requested number
-// of color channels.
+// accum = accum * momentum + grad
+// var -= lr * accum
 //
 // Arguments:
-//	contents: 0-D.  The PNG-encoded image.
+//	var_: Should be from a Variable().
+//	accum: Should be from a Variable().
+//	lr: Learning rate. Must be a scalar.
+//	grad: The gradient.
+//	indices: A vector of indices into the first dimension of var and accum.
+//	momentum: Momentum. Must be a scalar.
 //
-// Returns 3-D with shape `[height, width, channels]`.
-func DecodePng(scope *Scope, contents tf.Output, optional ...DecodePngAttr) (image tf.Output) {
+// Returns the created operation.
+func ResourceSparseApplyMomentum(scope *Scope, var_ tf.Output, accum tf.Output, lr tf.Output, grad tf.Output, indices tf.Output, momentum tf.Output, optional ...ResourceSparseApplyMomentumAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
@@ -19449,52 +20126,71 @@ func DecodePng(scope *Scope, contents tf.Output, optional ...DecodePngAttr) (ima
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "DecodePng",
+		Type: "ResourceSparseApplyMomentum",
 		Input: []tf.Input{
-			contents,
+			var_, accum, lr, grad, indices, momentum,
 		},
 		Attrs: attrs,
 	}
+	return scope.AddOperation(opspec)
+}
+
+// Returns the complex conjugate of a complex number.
+//
+// Given a tensor `input` of complex numbers, this operation returns a tensor of
+// complex numbers that are the complex conjugate of each element in `input`. The
+// complex numbers in `input` must be of the form \\(a + bj\\), where *a* is the
+// real part and *b* is the imaginary part.
+//
+// The complex conjugate returned by this operation is of the form \\(a - bj\\).
+//
+// For example:
+//
+// ```
+// # tensor 'input' is [-2.25 + 4.75j, 3.25 + 5.75j]
+// tf.conj(input) ==> [-2.25 - 4.75j, 3.25 - 5.75j]
+// ```
+func Conj(scope *Scope, input tf.Output) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "Conj",
+		Input: []tf.Input{
+			input,
+		},
+	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// AudioSummaryV2Attr is an optional argument to AudioSummaryV2.
-type AudioSummaryV2Attr func(optionalAttr)
+// StatelessTruncatedNormalAttr is an optional argument to StatelessTruncatedNormal.
+type StatelessTruncatedNormalAttr func(optionalAttr)
 
-// AudioSummaryV2MaxOutputs sets the optional max_outputs attribute to value.
-//
-// value: Max number of batch elements to generate audio for.
-// If not specified, defaults to 3
+// StatelessTruncatedNormalDtype sets the optional dtype attribute to value.
 //
-// REQUIRES: value >= 1
-func AudioSummaryV2MaxOutputs(value int64) AudioSummaryV2Attr {
+// value: The type of the output.
+// If not specified, defaults to DT_FLOAT
+func StatelessTruncatedNormalDtype(value tf.DataType) StatelessTruncatedNormalAttr {
 	return func(m optionalAttr) {
-		m["max_outputs"] = value
+		m["dtype"] = value
 	}
 }
 
-// Outputs a `Summary` protocol buffer with audio.
-//
-// The summary has up to `max_outputs` summary values containing audio. The
-// audio is built from `tensor` which must be 3-D with shape `[batch_size,
-// frames, channels]` or 2-D with shape `[batch_size, frames]`. The values are
-// assumed to be in the range of `[-1.0, 1.0]` with a sample rate of `sample_rate`.
+// Outputs deterministic pseudorandom values from a truncated normal distribution.
 //
-// The `tag` argument is a scalar `Tensor` of type `string`.  It is used to
-// build the `tag` of the summary values:
+// The generated values follow a normal distribution with mean 0 and standard
+// deviation 1, except that values whose magnitude is more than 2 standard
+// deviations from the mean are dropped and re-picked.
 //
-// *  If `max_outputs` is 1, the summary value tag is '*tag*/audio'.
-// *  If `max_outputs` is greater than 1, the summary value tags are
-//    generated sequentially as '*tag*/audio/0', '*tag*/audio/1', etc.
+// The outputs are a deterministic function of `shape` and `seed`.
 //
 // Arguments:
-//	tag: Scalar. Used to build the `tag` attribute of the summary values.
-//	tensor: 2-D of shape `[batch_size, frames]`.
-//	sample_rate: The sample rate of the signal in hertz.
+//	shape: The shape of the output tensor.
+//	seed: 2 seeds (shape [2]).
 //
-// Returns Scalar. Serialized `Summary` protocol buffer.
-func AudioSummaryV2(scope *Scope, tag tf.Output, tensor tf.Output, sample_rate tf.Output, optional ...AudioSummaryV2Attr) (summary tf.Output) {
+// Returns Random values with specified shape.
+func StatelessTruncatedNormal(scope *Scope, shape tf.Output, seed tf.Output, optional ...StatelessTruncatedNormalAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -19503,9 +20199,9 @@ func AudioSummaryV2(scope *Scope, tag tf.Output, tensor tf.Output, sample_rate t
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "AudioSummaryV2",
+		Type: "StatelessTruncatedNormal",
 		Input: []tf.Input{
-			tag, tensor, sample_rate,
+			shape, seed,
 		},
 		Attrs: attrs,
 	}
@@ -19513,109 +20209,91 @@ func AudioSummaryV2(scope *Scope, tag tf.Output, tensor tf.Output, sample_rate t
 	return op.Output(0)
 }
 
-// QrAttr is an optional argument to Qr.
-type QrAttr func(optionalAttr)
+// RestoreSliceAttr is an optional argument to RestoreSlice.
+type RestoreSliceAttr func(optionalAttr)
 
-// QrFullMatrices sets the optional full_matrices attribute to value.
+// RestoreSlicePreferredShard sets the optional preferred_shard attribute to value.
 //
-// value: If true, compute full-sized `q` and `r`. If false
-// (the default), compute only the leading `P` columns of `q`.
-// If not specified, defaults to false
-func QrFullMatrices(value bool) QrAttr {
+// value: Index of file to open first if multiple files match
+// `file_pattern`. See the documentation for `Restore`.
+// If not specified, defaults to -1
+func RestoreSlicePreferredShard(value int64) RestoreSliceAttr {
 	return func(m optionalAttr) {
-		m["full_matrices"] = value
+		m["preferred_shard"] = value
 	}
 }
 
-// Computes the QR decompositions of one or more matrices.
+// Restores a tensor from checkpoint files.
 //
-// Computes the QR decomposition of each inner matrix in `tensor` such that
-// `tensor[..., :, :] = q[..., :, :] * r[..., :,:])`
+// This is like `Restore` except that restored tensor can be listed as filling
+// only a slice of a larger tensor.  `shape_and_slice` specifies the shape of the
+// larger tensor and the slice that the restored tensor covers.
 //
-// ```prettyprint
-// # a is a tensor.
-// # q is a tensor of orthonormal matrices.
-// # r is a tensor of upper triangular matrices.
-// q, r = qr(a)
-// q_full, r_full = qr(a, full_matrices=True)
-// ```
+// The `shape_and_slice` input has the same format as the
+// elements of the `shapes_and_slices` input of the `SaveSlices` op.
 //
 // Arguments:
-//	input: A tensor of shape `[..., M, N]` whose inner-most 2 dimensions
-// form matrices of size `[M, N]`. Let `P` be the minimum of `M` and `N`.
+//	file_pattern: Must have a single element. The pattern of the files from
+// which we read the tensor.
+//	tensor_name: Must have a single element. The name of the tensor to be
+// restored.
+//	shape_and_slice: Scalar. The shapes and slice specifications to use when
+// restoring a tensors.
+//	dt: The type of the tensor to be restored.
 //
-// Returns Orthonormal basis for range of `a`. If `full_matrices` is `False` then
-// shape is `[..., M, P]`; if `full_matrices` is `True` then shape is
-// `[..., M, M]`.Triangular factor. If `full_matrices` is `False` then shape is
-// `[..., P, N]`. If `full_matrices` is `True` then shape is `[..., M, N]`.
-func Qr(scope *Scope, input tf.Output, optional ...QrAttr) (q tf.Output, r tf.Output) {
+// Returns The restored tensor.
+func RestoreSlice(scope *Scope, file_pattern tf.Output, tensor_name tf.Output, shape_and_slice tf.Output, dt tf.DataType, optional ...RestoreSliceAttr) (tensor tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"dt": dt}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "Qr",
+		Type: "RestoreSlice",
 		Input: []tf.Input{
-			input,
+			file_pattern, tensor_name, shape_and_slice,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1)
+	return op.Output(0)
 }
 
-// AudioSummaryAttr is an optional argument to AudioSummary.
-type AudioSummaryAttr func(optionalAttr)
-
-// AudioSummaryMaxOutputs sets the optional max_outputs attribute to value.
+// A placeholder op that passes through `input` when its output is not fed.
 //
-// value: Max number of batch elements to generate audio for.
-// If not specified, defaults to 3
+// Arguments:
+//	input: The default value to produce when `output` is not fed.
+//	shape: The (possibly partial) shape of the tensor.
 //
-// REQUIRES: value >= 1
-func AudioSummaryMaxOutputs(value int64) AudioSummaryAttr {
-	return func(m optionalAttr) {
-		m["max_outputs"] = value
+// Returns A placeholder tensor that defaults to `input` if it is not fed.
+func PlaceholderWithDefault(scope *Scope, input tf.Output, shape tf.Shape) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"shape": shape}
+	opspec := tf.OpSpec{
+		Type: "PlaceholderWithDefault",
+		Input: []tf.Input{
+			input,
+		},
+		Attrs: attrs,
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// Outputs a `Summary` protocol buffer with audio.
-//
-// DEPRECATED at GraphDef version 15: Use AudioSummaryV2.
-//
-// The summary has up to `max_outputs` summary values containing audio. The
-// audio is built from `tensor` which must be 3-D with shape `[batch_size,
-// frames, channels]` or 2-D with shape `[batch_size, frames]`. The values are
-// assumed to be in the range of `[-1.0, 1.0]` with a sample rate of `sample_rate`.
-//
-// The `tag` argument is a scalar `Tensor` of type `string`.  It is used to
-// build the `tag` of the summary values:
-//
-// *  If `max_outputs` is 1, the summary value tag is '*tag*/audio'.
-// *  If `max_outputs` is greater than 1, the summary value tags are
-//    generated sequentially as '*tag*/audio/0', '*tag*/audio/1', etc.
-//
-// Arguments:
-//	tag: Scalar. Used to build the `tag` attribute of the summary values.
-//	tensor: 2-D of shape `[batch_size, frames]`.
-//	sample_rate: The sample rate of the signal in hertz.
-//
-// Returns Scalar. Serialized `Summary` protocol buffer.
-func AudioSummary(scope *Scope, tag tf.Output, tensor tf.Output, sample_rate float32, optional ...AudioSummaryAttr) (summary tf.Output) {
+// Deprecated. Use TensorArrayReadV3
+func TensorArrayReadV2(scope *Scope, handle tf.Output, index tf.Output, flow_in tf.Output, dtype tf.DataType) (value tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"sample_rate": sample_rate}
-	for _, a := range optional {
-		a(attrs)
-	}
+	attrs := map[string]interface{}{"dtype": dtype}
 	opspec := tf.OpSpec{
-		Type: "AudioSummary",
+		Type: "TensorArrayReadV2",
 		Input: []tf.Input{
-			tag, tensor,
+			handle, index, flow_in,
 		},
 		Attrs: attrs,
 	}
@@ -19623,31 +20301,65 @@ func AudioSummary(scope *Scope, tag tf.Output, tensor tf.Output, sample_rate flo
 	return op.Output(0)
 }
 
-// ResizeNearestNeighborAttr is an optional argument to ResizeNearestNeighbor.
-type ResizeNearestNeighborAttr func(optionalAttr)
+// QuantizedMatMulAttr is an optional argument to QuantizedMatMul.
+type QuantizedMatMulAttr func(optionalAttr)
 
-// ResizeNearestNeighborAlignCorners sets the optional align_corners attribute to value.
+// QuantizedMatMulToutput sets the optional Toutput attribute to value.
+// If not specified, defaults to DT_QINT32
+func QuantizedMatMulToutput(value tf.DataType) QuantizedMatMulAttr {
+	return func(m optionalAttr) {
+		m["Toutput"] = value
+	}
+}
+
+// QuantizedMatMulTransposeA sets the optional transpose_a attribute to value.
 //
-// value: If true, rescale input by (new_height - 1) / (height - 1), which
-// exactly aligns the 4 corners of images and resized images. If false, rescale
-// by new_height / height. Treat similarly the width dimension.
+// value: If true, `a` is transposed before multiplication.
 // If not specified, defaults to false
-func ResizeNearestNeighborAlignCorners(value bool) ResizeNearestNeighborAttr {
+func QuantizedMatMulTransposeA(value bool) QuantizedMatMulAttr {
 	return func(m optionalAttr) {
-		m["align_corners"] = value
+		m["transpose_a"] = value
 	}
 }
 
-// Resize `images` to `size` using nearest neighbor interpolation.
+// QuantizedMatMulTransposeB sets the optional transpose_b attribute to value.
+//
+// value: If true, `b` is transposed before multiplication.
+// If not specified, defaults to false
+func QuantizedMatMulTransposeB(value bool) QuantizedMatMulAttr {
+	return func(m optionalAttr) {
+		m["transpose_b"] = value
+	}
+}
+
+// QuantizedMatMulTactivation sets the optional Tactivation attribute to value.
+//
+// value: The type of output produced by activation function
+// following this operation.
+// If not specified, defaults to DT_QUINT8
+func QuantizedMatMulTactivation(value tf.DataType) QuantizedMatMulAttr {
+	return func(m optionalAttr) {
+		m["Tactivation"] = value
+	}
+}
+
+// Perform a quantized matrix multiplication of  `a` by the matrix `b`.
+//
+// The inputs must be two-dimensional matrices and the inner dimension of
+// `a` (after being transposed if `transpose_a` is non-zero) must match the
+// outer dimension of `b` (after being transposed if `transposed_b` is
+// non-zero).
 //
 // Arguments:
-//	images: 4-D with shape `[batch, height, width, channels]`.
-//	size: = A 1-D int32 Tensor of 2 elements: `new_height, new_width`.  The
-// new size for the images.
+//	a: Must be a two-dimensional tensor.
+//	b: Must be a two-dimensional tensor.
+//	min_a: The float value that the lowest quantized `a` value represents.
+//	max_a: The float value that the highest quantized `a` value represents.
+//	min_b: The float value that the lowest quantized `b` value represents.
+//	max_b: The float value that the highest quantized `b` value represents.
 //
-// Returns 4-D with shape
-// `[batch, new_height, new_width, channels]`.
-func ResizeNearestNeighbor(scope *Scope, images tf.Output, size tf.Output, optional ...ResizeNearestNeighborAttr) (resized_images tf.Output) {
+// Returns The float value that the lowest quantized output value represents.The float value that the highest quantized output value represents.
+func QuantizedMatMul(scope *Scope, a tf.Output, b tf.Output, min_a tf.Output, max_a tf.Output, min_b tf.Output, max_b tf.Output, optional ...QuantizedMatMulAttr) (out tf.Output, min_out tf.Output, max_out tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -19656,68 +20368,94 @@ func ResizeNearestNeighbor(scope *Scope, images tf.Output, size tf.Output, optio
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "ResizeNearestNeighbor",
+		Type: "QuantizedMatMul",
 		Input: []tf.Input{
-			images, size,
+			a, b, min_a, max_a, min_b, max_b,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// Computes the gradient for the sqrt of `x` wrt its input.
+// QuantizedMulAttr is an optional argument to QuantizedMul.
+type QuantizedMulAttr func(optionalAttr)
+
+// QuantizedMulToutput sets the optional Toutput attribute to value.
+// If not specified, defaults to DT_QINT32
+func QuantizedMulToutput(value tf.DataType) QuantizedMulAttr {
+	return func(m optionalAttr) {
+		m["Toutput"] = value
+	}
+}
+
+// Returns x * y element-wise, working on quantized buffers.
 //
-// Specifically, `grad = dy * 0.5 / y`, where `y = sqrt(x)`, and `dy`
-// is the corresponding input gradient.
-func SqrtGrad(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+// Arguments:
+//
+//
+//	min_x: The float value that the lowest quantized `x` value represents.
+//	max_x: The float value that the highest quantized `x` value represents.
+//	min_y: The float value that the lowest quantized `y` value represents.
+//	max_y: The float value that the highest quantized `y` value represents.
+//
+// Returns The float value that the lowest quantized output value represents.The float value that the highest quantized output value represents.
+//
+// *NOTE*: `QuantizedMul` supports limited forms of broadcasting. More about
+// broadcasting [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+func QuantizedMul(scope *Scope, x tf.Output, y tf.Output, min_x tf.Output, max_x tf.Output, min_y tf.Output, max_y tf.Output, optional ...QuantizedMulAttr) (z tf.Output, min_z tf.Output, max_z tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "SqrtGrad",
+		Type: "QuantizedMul",
 		Input: []tf.Input{
-			x, y,
+			x, y, min_x, max_x, min_y, max_y,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// MatrixInverseAttr is an optional argument to MatrixInverse.
-type MatrixInverseAttr func(optionalAttr)
+// QueueEnqueueManyV2Attr is an optional argument to QueueEnqueueManyV2.
+type QueueEnqueueManyV2Attr func(optionalAttr)
 
-// MatrixInverseAdjoint sets the optional adjoint attribute to value.
-// If not specified, defaults to false
-func MatrixInverseAdjoint(value bool) MatrixInverseAttr {
+// QueueEnqueueManyV2TimeoutMs sets the optional timeout_ms attribute to value.
+//
+// value: If the queue is too full, this operation will block for up
+// to timeout_ms milliseconds.
+// Note: This option is not supported yet.
+// If not specified, defaults to -1
+func QueueEnqueueManyV2TimeoutMs(value int64) QueueEnqueueManyV2Attr {
 	return func(m optionalAttr) {
-		m["adjoint"] = value
+		m["timeout_ms"] = value
 	}
 }
 
-// Computes the inverse of one or more square invertible matrices or their
-//
-// adjoints (conjugate transposes).
+// Enqueues zero or more tuples of one or more tensors in the given queue.
 //
-// The input is a tensor of shape `[..., M, M]` whose inner-most 2 dimensions
-// form square matrices. The output is a tensor of the same shape as the input
-// containing the inverse for all input submatrices `[..., :, :]`.
+// This operation slices each component tensor along the 0th dimension to
+// make multiple queue elements. All of the tuple components must have the
+// same size in the 0th dimension.
 //
-// The op uses LU decomposition with partial pivoting to compute the inverses.
+// The components input has k elements, which correspond to the components of
+// tuples stored in the given queue.
 //
-// If a matrix is not invertible there is no guarantee what the op does. It
-// may detect the condition and raise an exception or it may simply return a
-// garbage result.
+// N.B. If the queue is full, this operation will block until the given
+// elements have been enqueued (or 'timeout_ms' elapses, if specified).
 //
 // Arguments:
-//	input: Shape is `[..., M, M]`.
-//
-// Returns Shape is `[..., M, M]`.
+//	handle: The handle to a queue.
+//	components: One or more tensors from which the enqueued tensors should
+// be taken.
 //
-// @compatibility(numpy)
-// Equivalent to np.linalg.inv
-// @end_compatibility
-func MatrixInverse(scope *Scope, input tf.Output, optional ...MatrixInverseAttr) (output tf.Output) {
+// Returns the created operation.
+func QueueEnqueueManyV2(scope *Scope, handle tf.Output, components []tf.Output, optional ...QueueEnqueueManyV2Attr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
@@ -19726,142 +20464,140 @@ func MatrixInverse(scope *Scope, input tf.Output, optional ...MatrixInverseAttr)
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "MatrixInverse",
+		Type: "QueueEnqueueManyV2",
 		Input: []tf.Input{
-			input,
+			handle, tf.OutputList(components),
 		},
 		Attrs: attrs,
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return scope.AddOperation(opspec)
 }
 
-// TFRecordReaderV2Attr is an optional argument to TFRecordReaderV2.
-type TFRecordReaderV2Attr func(optionalAttr)
-
-// TFRecordReaderV2Container sets the optional container attribute to value.
+// Forwards the input to the output.
 //
-// value: If non-empty, this reader is placed in the given container.
-// Otherwise, a default container is used.
-// If not specified, defaults to ""
-func TFRecordReaderV2Container(value string) TFRecordReaderV2Attr {
-	return func(m optionalAttr) {
-		m["container"] = value
-	}
-}
-
-// TFRecordReaderV2SharedName sets the optional shared_name attribute to value.
+// This operator represents the loop termination condition used by the
+// "pivot" switches of a loop.
 //
-// value: If non-empty, this reader is named in the given bucket
-// with this shared_name. Otherwise, the node name is used instead.
-// If not specified, defaults to ""
-func TFRecordReaderV2SharedName(value string) TFRecordReaderV2Attr {
-	return func(m optionalAttr) {
-		m["shared_name"] = value
+// Arguments:
+//	input: A boolean scalar, representing the branch predicate of the Switch op.
+//
+// Returns The same tensor as `input`.
+func LoopCond(scope *Scope, input tf.Output) (output tf.Output) {
+	if scope.Err() != nil {
+		return
 	}
-}
-
-// TFRecordReaderV2CompressionType sets the optional compression_type attribute to value.
-// If not specified, defaults to ""
-func TFRecordReaderV2CompressionType(value string) TFRecordReaderV2Attr {
-	return func(m optionalAttr) {
-		m["compression_type"] = value
+	opspec := tf.OpSpec{
+		Type: "LoopCond",
+		Input: []tf.Input{
+			input,
+		},
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// A Reader that outputs the records from a TensorFlow Records file.
+// Returns (x - y)(x - y) element-wise.
 //
-// Returns The handle to reference the Reader.
-func TFRecordReaderV2(scope *Scope, optional ...TFRecordReaderV2Attr) (reader_handle tf.Output) {
+// *NOTE*: `SquaredDifference` supports broadcasting. More about broadcasting
+// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+func SquaredDifference(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "TFRecordReaderV2",
-
-		Attrs: attrs,
+		Type: "SquaredDifference",
+		Input: []tf.Input{
+			x, y,
+		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// MatrixSolveAttr is an optional argument to MatrixSolve.
-type MatrixSolveAttr func(optionalAttr)
-
-// MatrixSolveAdjoint sets the optional adjoint attribute to value.
+// Convert the quantized 'input' tensor into a lower-precision 'output', using the
 //
-// value: Boolean indicating whether to solve with `matrix` or its (block-wise)
-// adjoint.
-// If not specified, defaults to false
-func MatrixSolveAdjoint(value bool) MatrixSolveAttr {
-	return func(m optionalAttr) {
-		m["adjoint"] = value
-	}
-}
-
-// Solves systems of linear equations.
+// actual distribution of the values to maximize the usage of the lower bit depth
+// and adjusting the output min and max ranges accordingly.
 //
-// `Matrix` is a tensor of shape `[..., M, M]` whose inner-most 2 dimensions
-// form square matrices. `Rhs` is a tensor of shape `[..., M, K]`. The `output` is
-// a tensor shape `[..., M, K]`.  If `adjoint` is `False` then each output matrix
-// satisfies `matrix[..., :, :] * output[..., :, :] = rhs[..., :, :]`.
-// If `adjoint` is `True` then each output matrix satisfies
-// `adjoint(matrix[..., :, :]) * output[..., :, :] = rhs[..., :, :]`.
+// [input_min, input_max] are scalar floats that specify the range for the float
+// interpretation of the 'input' data. For example, if input_min is -1.0f and
+// input_max is 1.0f, and we are dealing with quint16 quantized data, then a 0
+// value in the 16-bit data should be interpreted as -1.0f, and a 65535 means 1.0f.
+//
+// This operator tries to squeeze as much precision as possible into an output with
+// a lower bit depth by calculating the actual min and max values found in the
+// data. For example, maybe that quint16 input has no values lower than 16,384 and
+// none higher than 49,152. That means only half the range is actually needed, all
+// the float interpretations are between -0.5f and 0.5f, so if we want to compress
+// the data into a quint8 output, we can use that range rather than the theoretical
+// -1.0f to 1.0f that is suggested by the input min and max.
+//
+// In practice, this is most useful for taking output from operations like
+// QuantizedMatMul that can produce higher bit-depth outputs than their inputs and
+// may have large potential output ranges, but in practice have a distribution of
+// input values that only uses a small fraction of the possible range. By feeding
+// that output into this operator, we can reduce it from 32 bits down to 8 with
+// minimal loss of accuracy.
 //
 // Arguments:
-//	matrix: Shape is `[..., M, M]`.
-//	rhs: Shape is `[..., M, K]`.
 //
-// Returns Shape is `[..., M, K]`.
-func MatrixSolve(scope *Scope, matrix tf.Output, rhs tf.Output, optional ...MatrixSolveAttr) (output tf.Output) {
+//	input_min: The float value that the minimum quantized input value represents.
+//	input_max: The float value that the maximum quantized input value represents.
+//	out_type: The type of the output. Should be a lower bit depth than Tinput.
+//
+// Returns The float value that the minimum quantized output value represents.The float value that the maximum quantized output value represents.
+func QuantizeDownAndShrinkRange(scope *Scope, input tf.Output, input_min tf.Output, input_max tf.Output, out_type tf.DataType) (output tf.Output, output_min tf.Output, output_max tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
+	attrs := map[string]interface{}{"out_type": out_type}
 	opspec := tf.OpSpec{
-		Type: "MatrixSolve",
+		Type: "QuantizeDownAndShrinkRange",
 		Input: []tf.Input{
-			matrix, rhs,
+			input, input_min, input_max,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// SumAttr is an optional argument to Sum.
-type SumAttr func(optionalAttr)
+// AudioSummaryV2Attr is an optional argument to AudioSummaryV2.
+type AudioSummaryV2Attr func(optionalAttr)
 
-// SumKeepDims sets the optional keep_dims attribute to value.
+// AudioSummaryV2MaxOutputs sets the optional max_outputs attribute to value.
 //
-// value: If true, retain reduced dimensions with length 1.
-// If not specified, defaults to false
-func SumKeepDims(value bool) SumAttr {
+// value: Max number of batch elements to generate audio for.
+// If not specified, defaults to 3
+//
+// REQUIRES: value >= 1
+func AudioSummaryV2MaxOutputs(value int64) AudioSummaryV2Attr {
 	return func(m optionalAttr) {
-		m["keep_dims"] = value
+		m["max_outputs"] = value
 	}
 }
 
-// Computes the sum of elements across dimensions of a tensor.
+// Outputs a `Summary` protocol buffer with audio.
 //
-// Reduces `input` along the dimensions given in `reduction_indices`. Unless
-// `keep_dims` is true, the rank of the tensor is reduced by 1 for each entry in
-// `reduction_indices`. If `keep_dims` is true, the reduced dimensions are
-// retained with length 1.
+// The summary has up to `max_outputs` summary values containing audio. The
+// audio is built from `tensor` which must be 3-D with shape `[batch_size,
+// frames, channels]` or 2-D with shape `[batch_size, frames]`. The values are
+// assumed to be in the range of `[-1.0, 1.0]` with a sample rate of `sample_rate`.
+//
+// The `tag` argument is a scalar `Tensor` of type `string`.  It is used to
+// build the `tag` of the summary values:
+//
+// *  If `max_outputs` is 1, the summary value tag is '*tag*/audio'.
+// *  If `max_outputs` is greater than 1, the summary value tags are
+//    generated sequentially as '*tag*/audio/0', '*tag*/audio/1', etc.
 //
 // Arguments:
-//	input: The tensor to reduce.
-//	reduction_indices: The dimensions to reduce.
+//	tag: Scalar. Used to build the `tag` attribute of the summary values.
+//	tensor: 2-D of shape `[batch_size, frames]`.
+//	sample_rate: The sample rate of the signal in hertz.
 //
-// Returns The reduced tensor.
-func Sum(scope *Scope, input tf.Output, reduction_indices tf.Output, optional ...SumAttr) (output tf.Output) {
+// Returns Scalar. Serialized `Summary` protocol buffer.
+func AudioSummaryV2(scope *Scope, tag tf.Output, tensor tf.Output, sample_rate tf.Output, optional ...AudioSummaryV2Attr) (summary tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -19870,9 +20606,9 @@ func Sum(scope *Scope, input tf.Output, reduction_indices tf.Output, optional ..
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "Sum",
+		Type: "AudioSummaryV2",
 		Input: []tf.Input{
-			input, reduction_indices,
+			tag, tensor, sample_rate,
 		},
 		Attrs: attrs,
 	}
@@ -19880,304 +20616,377 @@ func Sum(scope *Scope, input tf.Output, reduction_indices tf.Output, optional ..
 	return op.Output(0)
 }
 
-// Partitions `data` into `num_partitions` tensors using indices from `partitions`.
-//
-// For each index tuple `js` of size `partitions.ndim`, the slice `data[js, ...]`
-// becomes part of `outputs[partitions[js]]`.  The slices with `partitions[js] = i`
-// are placed in `outputs[i]` in lexicographic order of `js`, and the first
-// dimension of `outputs[i]` is the number of entries in `partitions` equal to `i`.
-// In detail,
-//
-// ```python
-//     outputs[i].shape = [sum(partitions == i)] + data.shape[partitions.ndim:]
-//
-//     outputs[i] = pack([data[js, ...] for js if partitions[js] == i])
-// ```
-//
-// `data.shape` must start with `partitions.shape`.
+// QrAttr is an optional argument to Qr.
+type QrAttr func(optionalAttr)
+
+// QrFullMatrices sets the optional full_matrices attribute to value.
 //
-// For example:
+// value: If true, compute full-sized `q` and `r`. If false
+// (the default), compute only the leading `P` columns of `q`.
+// If not specified, defaults to false
+func QrFullMatrices(value bool) QrAttr {
+	return func(m optionalAttr) {
+		m["full_matrices"] = value
+	}
+}
+
+// Computes the QR decompositions of one or more matrices.
 //
-// ```python
-//     # Scalar partitions.
-//     partitions = 1
-//     num_partitions = 2
-//     data = [10, 20]
-//     outputs[0] = []  # Empty with shape [0, 2]
-//     outputs[1] = [[10, 20]]
+// Computes the QR decomposition of each inner matrix in `tensor` such that
+// `tensor[..., :, :] = q[..., :, :] * r[..., :,:])`
 //
-//     # Vector partitions.
-//     partitions = [0, 0, 1, 1, 0]
-//     num_partitions = 2
-//     data = [10, 20, 30, 40, 50]
-//     outputs[0] = [10, 20, 50]
-//     outputs[1] = [30, 40]
+// ```prettyprint
+// # a is a tensor.
+// # q is a tensor of orthonormal matrices.
+// # r is a tensor of upper triangular matrices.
+// q, r = qr(a)
+// q_full, r_full = qr(a, full_matrices=True)
 // ```
 //
-// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
-// <img style="width:100%" src="../../images/DynamicPartition.png" alt>
-// </div>
-//
 // Arguments:
+//	input: A tensor of shape `[..., M, N]` whose inner-most 2 dimensions
+// form matrices of size `[M, N]`. Let `P` be the minimum of `M` and `N`.
 //
-//	partitions: Any shape.  Indices in the range `[0, num_partitions)`.
-//	num_partitions: The number of partitions to output.
-func DynamicPartition(scope *Scope, data tf.Output, partitions tf.Output, num_partitions int64) (outputs []tf.Output) {
+// Returns Orthonormal basis for range of `a`. If `full_matrices` is `False` then
+// shape is `[..., M, P]`; if `full_matrices` is `True` then shape is
+// `[..., M, M]`.Triangular factor. If `full_matrices` is `False` then shape is
+// `[..., P, N]`. If `full_matrices` is `True` then shape is `[..., M, N]`.
+func Qr(scope *Scope, input tf.Output, optional ...QrAttr) (q tf.Output, r tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"num_partitions": num_partitions}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "DynamicPartition",
+		Type: "Qr",
 		Input: []tf.Input{
-			data, partitions,
+			input,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	if scope.Err() != nil {
-		return
-	}
-	var idx int
-	var err error
-	if outputs, idx, err = makeOutputList(op, idx, "outputs"); err != nil {
-		scope.UpdateErr("DynamicPartition", err)
-		return
+	return op.Output(0), op.Output(1)
+}
+
+// AudioSummaryAttr is an optional argument to AudioSummary.
+type AudioSummaryAttr func(optionalAttr)
+
+// AudioSummaryMaxOutputs sets the optional max_outputs attribute to value.
+//
+// value: Max number of batch elements to generate audio for.
+// If not specified, defaults to 3
+//
+// REQUIRES: value >= 1
+func AudioSummaryMaxOutputs(value int64) AudioSummaryAttr {
+	return func(m optionalAttr) {
+		m["max_outputs"] = value
 	}
-	return outputs
 }
 
-// Serialize a `SparseTensor` into a string 3-vector (1-D `Tensor`) object.
+// Outputs a `Summary` protocol buffer with audio.
+//
+// DEPRECATED at GraphDef version 15: Use AudioSummaryV2.
+//
+// The summary has up to `max_outputs` summary values containing audio. The
+// audio is built from `tensor` which must be 3-D with shape `[batch_size,
+// frames, channels]` or 2-D with shape `[batch_size, frames]`. The values are
+// assumed to be in the range of `[-1.0, 1.0]` with a sample rate of `sample_rate`.
+//
+// The `tag` argument is a scalar `Tensor` of type `string`.  It is used to
+// build the `tag` of the summary values:
+//
+// *  If `max_outputs` is 1, the summary value tag is '*tag*/audio'.
+// *  If `max_outputs` is greater than 1, the summary value tags are
+//    generated sequentially as '*tag*/audio/0', '*tag*/audio/1', etc.
 //
 // Arguments:
-//	sparse_indices: 2-D.  The `indices` of the `SparseTensor`.
-//	sparse_values: 1-D.  The `values` of the `SparseTensor`.
-//	sparse_shape: 1-D.  The `shape` of the `SparseTensor`.
-func SerializeSparse(scope *Scope, sparse_indices tf.Output, sparse_values tf.Output, sparse_shape tf.Output) (serialized_sparse tf.Output) {
+//	tag: Scalar. Used to build the `tag` attribute of the summary values.
+//	tensor: 2-D of shape `[batch_size, frames]`.
+//	sample_rate: The sample rate of the signal in hertz.
+//
+// Returns Scalar. Serialized `Summary` protocol buffer.
+func AudioSummary(scope *Scope, tag tf.Output, tensor tf.Output, sample_rate float32, optional ...AudioSummaryAttr) (summary tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"sample_rate": sample_rate}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "SerializeSparse",
+		Type: "AudioSummary",
 		Input: []tf.Input{
-			sparse_indices, sparse_values, sparse_shape,
+			tag, tensor,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Computes the reciprocal of x element-wise.
+// Replaces the contents of the table with the specified keys and values.
 //
-// DEPRECATED at GraphDef version 17: Use Reciprocal
+// The tensor `keys` must be of the same type as the keys of the table.
+// The tensor `values` must be of the type of the table values.
 //
-// I.e., \\(y = 1 / x\\).
-func Inv(scope *Scope, x tf.Output) (y tf.Output) {
+// Arguments:
+//	table_handle: Handle to the table.
+//	keys: Any shape.  Keys to look up.
+//	values: Values to associate with keys.
+//
+// Returns the created operation.
+func LookupTableImportV2(scope *Scope, table_handle tf.Output, keys tf.Output, values tf.Output) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Inv",
+		Type: "LookupTableImportV2",
 		Input: []tf.Input{
-			x,
+			table_handle, keys, values,
 		},
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return scope.AddOperation(opspec)
 }
 
-// Returns x / y element-wise for integer types.
-//
-// Truncation designates that negative numbers will round fractional quantities
-// toward zero. I.e. -7 / 5 = 1. This matches C semantics but it is different
-// than Python semantics. See `FloorDiv` for a division function that matches
-// Python Semantics.
+// HashTableV2Attr is an optional argument to HashTableV2.
+type HashTableV2Attr func(optionalAttr)
+
+// HashTableV2Container sets the optional container attribute to value.
 //
-// *NOTE*: `TruncateDiv` supports broadcasting. More about broadcasting
-// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-func TruncateDiv(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "TruncateDiv",
-		Input: []tf.Input{
-			x, y,
-		},
+// value: If non-empty, this table is placed in the given container.
+// Otherwise, a default container is used.
+// If not specified, defaults to ""
+func HashTableV2Container(value string) HashTableV2Attr {
+	return func(m optionalAttr) {
+		m["container"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// Restores tensors from a V2 checkpoint.
+// HashTableV2SharedName sets the optional shared_name attribute to value.
 //
-// For backward compatibility with the V1 format, this Op currently allows
-// restoring from a V1 checkpoint as well:
-//   - This Op first attempts to find the V2 index file pointed to by "prefix", and
-//     if found proceed to read it as a V2 checkpoint;
-//   - Otherwise the V1 read path is invoked.
-// Relying on this behavior is not recommended, as the ability to fall back to read
-// V1 might be deprecated and eventually removed.
+// value: If non-empty, this table is shared under the given name across
+// multiple sessions.
+// If not specified, defaults to ""
+func HashTableV2SharedName(value string) HashTableV2Attr {
+	return func(m optionalAttr) {
+		m["shared_name"] = value
+	}
+}
+
+// HashTableV2UseNodeNameSharing sets the optional use_node_name_sharing attribute to value.
 //
-// By default, restores the named tensors in full.  If the caller wishes to restore
-// specific slices of stored tensors, "shape_and_slices" should be non-empty
-// strings and correspondingly well-formed.
+// value: If true and shared_name is empty, the table is shared
+// using the node name.
+// If not specified, defaults to false
+func HashTableV2UseNodeNameSharing(value bool) HashTableV2Attr {
+	return func(m optionalAttr) {
+		m["use_node_name_sharing"] = value
+	}
+}
+
+// Creates a non-initialized hash table.
 //
-// Callers must ensure all the named tensors are indeed stored in the checkpoint.
+// This op creates a hash table, specifying the type of its keys and values.
+// Before using the table you will have to initialize it.  After initialization the
+// table will be immutable.
 //
 // Arguments:
-//	prefix: Must have a single element.  The prefix of a V2 checkpoint.
-//	tensor_names: shape {N}.  The names of the tensors to be restored.
-//	shape_and_slices: shape {N}.  The slice specs of the tensors to be restored.
-// Empty strings indicate that they are non-partitioned tensors.
-//	dtypes: shape {N}.  The list of expected dtype for the tensors.  Must match
-// those stored in the checkpoint.
+//	key_dtype: Type of the table keys.
+//	value_dtype: Type of the table values.
 //
-// Returns shape {N}.  The restored tensors, whose shapes are read from the
-// checkpoint directly.
-func RestoreV2(scope *Scope, prefix tf.Output, tensor_names tf.Output, shape_and_slices tf.Output, dtypes []tf.DataType) (tensors []tf.Output) {
+// Returns Handle to a table.
+func HashTableV2(scope *Scope, key_dtype tf.DataType, value_dtype tf.DataType, optional ...HashTableV2Attr) (table_handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"dtypes": dtypes}
+	attrs := map[string]interface{}{"key_dtype": key_dtype, "value_dtype": value_dtype}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "RestoreV2",
-		Input: []tf.Input{
-			prefix, tensor_names, shape_and_slices,
-		},
+		Type: "HashTableV2",
+
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	if scope.Err() != nil {
-		return
-	}
-	var idx int
-	var err error
-	if tensors, idx, err = makeOutputList(op, idx, "tensors"); err != nil {
-		scope.UpdateErr("RestoreV2", err)
-		return
+	return op.Output(0)
+}
+
+// MutableHashTableV2Attr is an optional argument to MutableHashTableV2.
+type MutableHashTableV2Attr func(optionalAttr)
+
+// MutableHashTableV2Container sets the optional container attribute to value.
+//
+// value: If non-empty, this table is placed in the given container.
+// Otherwise, a default container is used.
+// If not specified, defaults to ""
+func MutableHashTableV2Container(value string) MutableHashTableV2Attr {
+	return func(m optionalAttr) {
+		m["container"] = value
 	}
-	return tensors
 }
 
-// AvgPoolAttr is an optional argument to AvgPool.
-type AvgPoolAttr func(optionalAttr)
+// MutableHashTableV2SharedName sets the optional shared_name attribute to value.
+//
+// value: If non-empty, this table is shared under the given name across
+// multiple sessions.
+// If not specified, defaults to ""
+func MutableHashTableV2SharedName(value string) MutableHashTableV2Attr {
+	return func(m optionalAttr) {
+		m["shared_name"] = value
+	}
+}
 
-// AvgPoolDataFormat sets the optional data_format attribute to value.
+// MutableHashTableV2UseNodeNameSharing sets the optional use_node_name_sharing attribute to value.
 //
-// value: Specify the data format of the input and output data. With the
-// default format "NHWC", the data is stored in the order of:
-//     [batch, in_height, in_width, in_channels].
-// Alternatively, the format could be "NCHW", the data storage order of:
-//     [batch, in_channels, in_height, in_width].
-// If not specified, defaults to "NHWC"
-func AvgPoolDataFormat(value string) AvgPoolAttr {
+// value: If true and shared_name is empty, the table is shared
+// using the node name.
+// If not specified, defaults to false
+func MutableHashTableV2UseNodeNameSharing(value bool) MutableHashTableV2Attr {
 	return func(m optionalAttr) {
-		m["data_format"] = value
+		m["use_node_name_sharing"] = value
 	}
 }
 
-// Performs average pooling on the input.
+// Creates an empty hash table.
 //
-// Each entry in `output` is the mean of the corresponding size `ksize`
-// window in `value`.
+// This op creates a mutable hash table, specifying the type of its keys and
+// values. Each value must be a scalar. Data can be inserted into the table using
+// the insert operations. It does not support the initialization operation.
 //
 // Arguments:
-//	value: 4-D with shape `[batch, height, width, channels]`.
-//	ksize: The size of the sliding window for each dimension of `value`.
-//	strides: The stride of the sliding window for each dimension of `value`.
-//	padding: The type of padding algorithm to use.
+//	key_dtype: Type of the table keys.
+//	value_dtype: Type of the table values.
 //
-// Returns The average pooled output tensor.
-func AvgPool(scope *Scope, value tf.Output, ksize []int64, strides []int64, padding string, optional ...AvgPoolAttr) (output tf.Output) {
+// Returns Handle to a table.
+func MutableHashTableV2(scope *Scope, key_dtype tf.DataType, value_dtype tf.DataType, optional ...MutableHashTableV2Attr) (table_handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"ksize": ksize, "strides": strides, "padding": padding}
+	attrs := map[string]interface{}{"key_dtype": key_dtype, "value_dtype": value_dtype}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "AvgPool",
-		Input: []tf.Input{
-			value,
-		},
+		Type: "MutableHashTableV2",
+
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Computes the mean along sparse segments of a tensor.
+// ResourceApplyProximalAdagradAttr is an optional argument to ResourceApplyProximalAdagrad.
+type ResourceApplyProximalAdagradAttr func(optionalAttr)
+
+// ResourceApplyProximalAdagradUseLocking sets the optional use_locking attribute to value.
 //
-// Read @{$math_ops#segmentation$the section on segmentation} for an explanation of
-// segments.
+// value: If True, updating of the var and accum tensors will be protected by
+// a lock; otherwise the behavior is undefined, but may exhibit less contention.
+// If not specified, defaults to false
+func ResourceApplyProximalAdagradUseLocking(value bool) ResourceApplyProximalAdagradAttr {
+	return func(m optionalAttr) {
+		m["use_locking"] = value
+	}
+}
+
+// Update '*var' and '*accum' according to FOBOS with Adagrad learning rate.
 //
-// Like `SegmentMean`, but `segment_ids` can have rank less than `data`'s first
-// dimension, selecting a subset of dimension 0, specified by `indices`.
+// accum += grad * grad
+// prox_v = var - lr * grad * (1 / sqrt(accum))
+// var = sign(prox_v)/(1+lr*l2) * max{|prox_v|-lr*l1,0}
 //
 // Arguments:
+//	var_: Should be from a Variable().
+//	accum: Should be from a Variable().
+//	lr: Scaling factor. Must be a scalar.
+//	l1: L1 regularization. Must be a scalar.
+//	l2: L2 regularization. Must be a scalar.
+//	grad: The gradient.
 //
-//	indices: A 1-D tensor. Has same rank as `segment_ids`.
-//	segment_ids: A 1-D tensor. Values should be sorted and can be repeated.
-//
-// Returns Has same shape as data, except for dimension 0 which
-// has size `k`, the number of segments.
-func SparseSegmentMean(scope *Scope, data tf.Output, indices tf.Output, segment_ids tf.Output) (output tf.Output) {
+// Returns the created operation.
+func ResourceApplyProximalAdagrad(scope *Scope, var_ tf.Output, accum tf.Output, lr tf.Output, l1 tf.Output, l2 tf.Output, grad tf.Output, optional ...ResourceApplyProximalAdagradAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "SparseSegmentMean",
+		Type: "ResourceApplyProximalAdagrad",
 		Input: []tf.Input{
-			data, indices, segment_ids,
+			var_, accum, lr, l1, l2, grad,
 		},
+		Attrs: attrs,
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return scope.AddOperation(opspec)
 }
 
-// WholeFileReaderV2Attr is an optional argument to WholeFileReaderV2.
-type WholeFileReaderV2Attr func(optionalAttr)
-
-// WholeFileReaderV2Container sets the optional container attribute to value.
+// MutableHashTableOfTensorsV2Attr is an optional argument to MutableHashTableOfTensorsV2.
+type MutableHashTableOfTensorsV2Attr func(optionalAttr)
+
+// MutableHashTableOfTensorsV2Container sets the optional container attribute to value.
 //
-// value: If non-empty, this reader is placed in the given container.
+// value: If non-empty, this table is placed in the given container.
 // Otherwise, a default container is used.
 // If not specified, defaults to ""
-func WholeFileReaderV2Container(value string) WholeFileReaderV2Attr {
+func MutableHashTableOfTensorsV2Container(value string) MutableHashTableOfTensorsV2Attr {
 	return func(m optionalAttr) {
 		m["container"] = value
 	}
 }
 
-// WholeFileReaderV2SharedName sets the optional shared_name attribute to value.
+// MutableHashTableOfTensorsV2SharedName sets the optional shared_name attribute to value.
 //
-// value: If non-empty, this reader is named in the given bucket
-// with this shared_name. Otherwise, the node name is used instead.
+// value: If non-empty, this table is shared under the given name across
+// multiple sessions.
 // If not specified, defaults to ""
-func WholeFileReaderV2SharedName(value string) WholeFileReaderV2Attr {
+func MutableHashTableOfTensorsV2SharedName(value string) MutableHashTableOfTensorsV2Attr {
 	return func(m optionalAttr) {
 		m["shared_name"] = value
 	}
 }
 
-// A Reader that outputs the entire contents of a file as a value.
+// MutableHashTableOfTensorsV2UseNodeNameSharing sets the optional use_node_name_sharing attribute to value.
+// If not specified, defaults to false
+func MutableHashTableOfTensorsV2UseNodeNameSharing(value bool) MutableHashTableOfTensorsV2Attr {
+	return func(m optionalAttr) {
+		m["use_node_name_sharing"] = value
+	}
+}
+
+// MutableHashTableOfTensorsV2ValueShape sets the optional value_shape attribute to value.
+// If not specified, defaults to <>
+func MutableHashTableOfTensorsV2ValueShape(value tf.Shape) MutableHashTableOfTensorsV2Attr {
+	return func(m optionalAttr) {
+		m["value_shape"] = value
+	}
+}
+
+// Creates an empty hash table.
 //
-// To use, enqueue filenames in a Queue.  The output of ReaderRead will
-// be a filename (key) and the contents of that file (value).
+// This op creates a mutable hash table, specifying the type of its keys and
+// values. Each value must be a vector. Data can be inserted into the table using
+// the insert operations. It does not support the initialization operation.
 //
-// Returns The handle to reference the Reader.
-func WholeFileReaderV2(scope *Scope, optional ...WholeFileReaderV2Attr) (reader_handle tf.Output) {
+// Arguments:
+//	key_dtype: Type of the table keys.
+//	value_dtype: Type of the table values.
+//
+// Returns Handle to a table.
+func MutableHashTableOfTensorsV2(scope *Scope, key_dtype tf.DataType, value_dtype tf.DataType, optional ...MutableHashTableOfTensorsV2Attr) (table_handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"key_dtype": key_dtype, "value_dtype": value_dtype}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "WholeFileReaderV2",
+		Type: "MutableHashTableOfTensorsV2",
 
 		Attrs: attrs,
 	}
@@ -20185,137 +20994,142 @@ func WholeFileReaderV2(scope *Scope, optional ...WholeFileReaderV2Attr) (reader_
 	return op.Output(0)
 }
 
-// Returns up to `num_records` (key, value) pairs produced by a Reader.
-//
-// Will dequeue from the input queue if necessary (e.g. when the
-// Reader needs to start reading from a new file since it has finished
-// with the previous file).
-// It may return less than `num_records` even before the last batch.
+// Table initializer that takes two tensors for keys and values respectively.
 //
 // Arguments:
-//	reader_handle: Handle to a `Reader`.
-//	queue_handle: Handle to a `Queue`, with string work items.
-//	num_records: number of records to read from `Reader`.
+//	table_handle: Handle to a table which will be initialized.
+//	keys: Keys of type Tkey.
+//	values: Values of type Tval.
 //
-// Returns A 1-D tensor.A 1-D tensor.
-func ReaderReadUpToV2(scope *Scope, reader_handle tf.Output, queue_handle tf.Output, num_records tf.Output) (keys tf.Output, values tf.Output) {
+// Returns the created operation.
+func InitializeTableV2(scope *Scope, table_handle tf.Output, keys tf.Output, values tf.Output) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "ReaderReadUpToV2",
+		Type: "InitializeTableV2",
 		Input: []tf.Input{
-			reader_handle, queue_handle, num_records,
+			table_handle, keys, values,
 		},
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1)
+	return scope.AddOperation(opspec)
 }
 
-// Conv2DBackpropFilterAttr is an optional argument to Conv2DBackpropFilter.
-type Conv2DBackpropFilterAttr func(optionalAttr)
-
-// Conv2DBackpropFilterUseCudnnOnGpu sets the optional use_cudnn_on_gpu attribute to value.
-// If not specified, defaults to true
-func Conv2DBackpropFilterUseCudnnOnGpu(value bool) Conv2DBackpropFilterAttr {
-	return func(m optionalAttr) {
-		m["use_cudnn_on_gpu"] = value
-	}
-}
+// FakeQuantWithMinMaxVarsGradientAttr is an optional argument to FakeQuantWithMinMaxVarsGradient.
+type FakeQuantWithMinMaxVarsGradientAttr func(optionalAttr)
 
-// Conv2DBackpropFilterDataFormat sets the optional data_format attribute to value.
+// FakeQuantWithMinMaxVarsGradientNumBits sets the optional num_bits attribute to value.
 //
-// value: Specify the data format of the input and output data. With the
-// default format "NHWC", the data is stored in the order of:
-//     [batch, in_height, in_width, in_channels].
-// Alternatively, the format could be "NCHW", the data storage order of:
-//     [batch, in_channels, in_height, in_width].
-// If not specified, defaults to "NHWC"
-func Conv2DBackpropFilterDataFormat(value string) Conv2DBackpropFilterAttr {
+// value: The bitwidth of the quantization; between 2 and 8, inclusive.
+// If not specified, defaults to 8
+func FakeQuantWithMinMaxVarsGradientNumBits(value int64) FakeQuantWithMinMaxVarsGradientAttr {
 	return func(m optionalAttr) {
-		m["data_format"] = value
+		m["num_bits"] = value
 	}
 }
 
-// Computes the gradients of convolution with respect to the filter.
+// Compute gradients for a FakeQuantWithMinMaxVars operation.
 //
 // Arguments:
-//	input: 4-D with shape `[batch, in_height, in_width, in_channels]`.
-//	filter_sizes: An integer vector representing the tensor shape of `filter`,
-// where `filter` is a 4-D
-// `[filter_height, filter_width, in_channels, out_channels]` tensor.
-//	out_backprop: 4-D with shape `[batch, out_height, out_width, out_channels]`.
-// Gradients w.r.t. the output of the convolution.
-//	strides: The stride of the sliding window for each dimension of the input
-// of the convolution. Must be in the same order as the dimension specified with
-// format.
-//	padding: The type of padding algorithm to use.
+//	gradients: Backpropagated gradients above the FakeQuantWithMinMaxVars operation.
+//	inputs: Values passed as inputs to the FakeQuantWithMinMaxVars operation.
+// min, max: Quantization interval, scalar floats.
 //
-// Returns 4-D with shape
-// `[filter_height, filter_width, in_channels, out_channels]`.  Gradient w.r.t.
-// the `filter` input of the convolution.
-func Conv2DBackpropFilter(scope *Scope, input tf.Output, filter_sizes tf.Output, out_backprop tf.Output, strides []int64, padding string, optional ...Conv2DBackpropFilterAttr) (output tf.Output) {
+//
+//
+// Returns Backpropagated gradients w.r.t. inputs:
+// `gradients * (inputs >= min && inputs <= max)`.Backpropagated gradients w.r.t. min parameter:
+// `sum(gradients * (inputs < min))`.Backpropagated gradients w.r.t. max parameter:
+// `sum(gradients * (inputs > max))`.
+func FakeQuantWithMinMaxVarsGradient(scope *Scope, gradients tf.Output, inputs tf.Output, min tf.Output, max tf.Output, optional ...FakeQuantWithMinMaxVarsGradientAttr) (backprops_wrt_input tf.Output, backprop_wrt_min tf.Output, backprop_wrt_max tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"strides": strides, "padding": padding}
+	attrs := map[string]interface{}{}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "Conv2DBackpropFilter",
+		Type: "FakeQuantWithMinMaxVarsGradient",
 		Input: []tf.Input{
-			input, filter_sizes, out_backprop,
+			gradients, inputs, min, max,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1), op.Output(2)
+}
+
+// Returns the min of x and y (i.e. x < y ? x : y) element-wise.
+//
+// *NOTE*: `Minimum` supports broadcasting. More about broadcasting
+// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+func Minimum(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "Minimum",
+		Input: []tf.Input{
+			x, y,
+		},
+	}
+	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Computes Psi, the derivative of Lgamma (the log of the absolute value of
+// Computes the gradient for the sqrt of `x` wrt its input.
 //
-// `Gamma(x)`), element-wise.
-func Digamma(scope *Scope, x tf.Output) (y tf.Output) {
+// Specifically, `grad = dy * 0.5 / y`, where `y = sqrt(x)`, and `dy`
+// is the corresponding input gradient.
+func SqrtGrad(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Digamma",
+		Type: "SqrtGrad",
 		Input: []tf.Input{
-			x,
+			x, y,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// MinAttr is an optional argument to Min.
-type MinAttr func(optionalAttr)
+// MatrixInverseAttr is an optional argument to MatrixInverse.
+type MatrixInverseAttr func(optionalAttr)
 
-// MinKeepDims sets the optional keep_dims attribute to value.
-//
-// value: If true, retain reduced dimensions with length 1.
+// MatrixInverseAdjoint sets the optional adjoint attribute to value.
 // If not specified, defaults to false
-func MinKeepDims(value bool) MinAttr {
+func MatrixInverseAdjoint(value bool) MatrixInverseAttr {
 	return func(m optionalAttr) {
-		m["keep_dims"] = value
+		m["adjoint"] = value
 	}
 }
 
-// Computes the minimum of elements across dimensions of a tensor.
+// Computes the inverse of one or more square invertible matrices or their
 //
-// Reduces `input` along the dimensions given in `reduction_indices`. Unless
-// `keep_dims` is true, the rank of the tensor is reduced by 1 for each entry in
-// `reduction_indices`. If `keep_dims` is true, the reduced dimensions are
-// retained with length 1.
+// adjoints (conjugate transposes).
+//
+// The input is a tensor of shape `[..., M, M]` whose inner-most 2 dimensions
+// form square matrices. The output is a tensor of the same shape as the input
+// containing the inverse for all input submatrices `[..., :, :]`.
+//
+// The op uses LU decomposition with partial pivoting to compute the inverses.
+//
+// If a matrix is not invertible there is no guarantee what the op does. It
+// may detect the condition and raise an exception or it may simply return a
+// garbage result.
 //
 // Arguments:
-//	input: The tensor to reduce.
-//	reduction_indices: The dimensions to reduce.
+//	input: Shape is `[..., M, M]`.
 //
-// Returns The reduced tensor.
-func Min(scope *Scope, input tf.Output, reduction_indices tf.Output, optional ...MinAttr) (output tf.Output) {
+// Returns Shape is `[..., M, M]`.
+//
+// @compatibility(numpy)
+// Equivalent to np.linalg.inv
+// @end_compatibility
+func MatrixInverse(scope *Scope, input tf.Output, optional ...MatrixInverseAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -20324,9 +21138,9 @@ func Min(scope *Scope, input tf.Output, reduction_indices tf.Output, optional ..
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "Min",
+		Type: "MatrixInverse",
 		Input: []tf.Input{
-			input, reduction_indices,
+			input,
 		},
 		Attrs: attrs,
 	}
@@ -20334,66 +21148,119 @@ func Min(scope *Scope, input tf.Output, reduction_indices tf.Output, optional ..
 	return op.Output(0)
 }
 
-// Returns the number of work units this Reader has finished processing.
+// TFRecordReaderV2Attr is an optional argument to TFRecordReaderV2.
+type TFRecordReaderV2Attr func(optionalAttr)
+
+// TFRecordReaderV2Container sets the optional container attribute to value.
 //
-// Arguments:
-//	reader_handle: Handle to a Reader.
-func ReaderNumWorkUnitsCompletedV2(scope *Scope, reader_handle tf.Output) (units_completed tf.Output) {
+// value: If non-empty, this reader is placed in the given container.
+// Otherwise, a default container is used.
+// If not specified, defaults to ""
+func TFRecordReaderV2Container(value string) TFRecordReaderV2Attr {
+	return func(m optionalAttr) {
+		m["container"] = value
+	}
+}
+
+// TFRecordReaderV2SharedName sets the optional shared_name attribute to value.
+//
+// value: If non-empty, this reader is named in the given bucket
+// with this shared_name. Otherwise, the node name is used instead.
+// If not specified, defaults to ""
+func TFRecordReaderV2SharedName(value string) TFRecordReaderV2Attr {
+	return func(m optionalAttr) {
+		m["shared_name"] = value
+	}
+}
+
+// TFRecordReaderV2CompressionType sets the optional compression_type attribute to value.
+// If not specified, defaults to ""
+func TFRecordReaderV2CompressionType(value string) TFRecordReaderV2Attr {
+	return func(m optionalAttr) {
+		m["compression_type"] = value
+	}
+}
+
+// A Reader that outputs the records from a TensorFlow Records file.
+//
+// Returns The handle to reference the Reader.
+func TFRecordReaderV2(scope *Scope, optional ...TFRecordReaderV2Attr) (reader_handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "ReaderNumWorkUnitsCompletedV2",
-		Input: []tf.Input{
-			reader_handle,
-		},
+		Type: "TFRecordReaderV2",
+
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Returns x / y element-wise for real types.
+// Adjust the saturation of one or more images.
 //
-// If `x` and `y` are reals, this will return the floating-point division.
+// `images` is a tensor of at least 3 dimensions.  The last dimension is
+// interpretted as channels, and must be three.
 //
-// *NOTE*: `Div` supports broadcasting. More about broadcasting
-// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-func RealDiv(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+// The input image is considered in the RGB colorspace. Conceptually, the RGB
+// colors are first mapped into HSV. A scale is then applied all the saturation
+// values, and then remapped back to RGB colorspace.
+//
+// Arguments:
+//	images: Images to adjust.  At least 3-D.
+//	scale: A float scale to add to the saturation.
+//
+// Returns The hue-adjusted image or images.
+func AdjustSaturation(scope *Scope, images tf.Output, scale tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "RealDiv",
+		Type: "AdjustSaturation",
 		Input: []tf.Input{
-			x, y,
+			images, scale,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// ShapeAttr is an optional argument to Shape.
-type ShapeAttr func(optionalAttr)
+// SelfAdjointEigV2Attr is an optional argument to SelfAdjointEigV2.
+type SelfAdjointEigV2Attr func(optionalAttr)
 
-// ShapeOutType sets the optional out_type attribute to value.
-// If not specified, defaults to DT_INT32
-func ShapeOutType(value tf.DataType) ShapeAttr {
+// SelfAdjointEigV2ComputeV sets the optional compute_v attribute to value.
+//
+// value: If `True` then eigenvectors will be computed and returned in `v`.
+// Otherwise, only the eigenvalues will be computed.
+// If not specified, defaults to true
+func SelfAdjointEigV2ComputeV(value bool) SelfAdjointEigV2Attr {
 	return func(m optionalAttr) {
-		m["out_type"] = value
+		m["compute_v"] = value
 	}
 }
 
-// Returns the shape of a tensor.
-//
-// This operation returns a 1-D integer tensor representing the shape of `input`.
+// Computes the eigen decomposition of one or more square self-adjoint matrices.
 //
-// For example:
+// Computes the eigenvalues and (optionally) eigenvectors of each inner matrix in
+// `input` such that `input[..., :, :] = v[..., :, :] * diag(e[..., :])`.
 //
 // ```prettyprint
-// # 't' is [[[1, 1, 1], [2, 2, 2]], [[3, 3, 3], [4, 4, 4]]]
-// shape(t) ==> [2, 2, 3]
+// # a is a tensor.
+// # e is a tensor of eigenvalues.
+// # v is a tensor of eigenvectors.
+// e, v = self_adjoint_eig(a)
+// e = self_adjoint_eig(a, compute_v=False)
 // ```
-func Shape(scope *Scope, input tf.Output, optional ...ShapeAttr) (output tf.Output) {
+//
+// Arguments:
+//	input: `Tensor` input of shape `[N, N]`.
+//
+// Returns Eigenvalues. Shape is `[N]`.Eigenvectors. Shape is `[N, N]`.
+func SelfAdjointEigV2(scope *Scope, input tf.Output, optional ...SelfAdjointEigV2Attr) (e tf.Output, v tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -20402,67 +21269,45 @@ func Shape(scope *Scope, input tf.Output, optional ...ShapeAttr) (output tf.Outp
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "Shape",
+		Type: "SelfAdjointEigV2",
 		Input: []tf.Input{
 			input,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Computes softmax cross entropy cost and gradients to backpropagate.
-//
-// Inputs are the logits, not probabilities.
-//
-// Arguments:
-//	features: batch_size x num_classes matrix
-//	labels: batch_size x num_classes matrix
-// The caller must ensure that each batch of labels represents a valid
-// probability distribution.
-//
-// Returns Per example loss (batch_size vector).backpropagated gradients (batch_size x num_classes matrix).
-func SoftmaxCrossEntropyWithLogits(scope *Scope, features tf.Output, labels tf.Output) (loss tf.Output, backprop tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "SoftmaxCrossEntropyWithLogits",
-		Input: []tf.Input{
-			features, labels,
-		},
-	}
-	op := scope.AddOperation(opspec)
 	return op.Output(0), op.Output(1)
 }
 
-// ResizeBilinearGradAttr is an optional argument to ResizeBilinearGrad.
-type ResizeBilinearGradAttr func(optionalAttr)
+// MatrixSolveAttr is an optional argument to MatrixSolve.
+type MatrixSolveAttr func(optionalAttr)
 
-// ResizeBilinearGradAlignCorners sets the optional align_corners attribute to value.
+// MatrixSolveAdjoint sets the optional adjoint attribute to value.
 //
-// value: If true, rescale grads by (orig_height - 1) / (height - 1), which
-// exactly aligns the 4 corners of grads and original_image. If false, rescale by
-// orig_height / height. Treat similarly the width dimension.
+// value: Boolean indicating whether to solve with `matrix` or its (block-wise)
+// adjoint.
 // If not specified, defaults to false
-func ResizeBilinearGradAlignCorners(value bool) ResizeBilinearGradAttr {
+func MatrixSolveAdjoint(value bool) MatrixSolveAttr {
 	return func(m optionalAttr) {
-		m["align_corners"] = value
+		m["adjoint"] = value
 	}
 }
 
-// Computes the gradient of bilinear interpolation.
+// Solves systems of linear equations.
+//
+// `Matrix` is a tensor of shape `[..., M, M]` whose inner-most 2 dimensions
+// form square matrices. `Rhs` is a tensor of shape `[..., M, K]`. The `output` is
+// a tensor shape `[..., M, K]`.  If `adjoint` is `False` then each output matrix
+// satisfies `matrix[..., :, :] * output[..., :, :] = rhs[..., :, :]`.
+// If `adjoint` is `True` then each output matrix satisfies
+// `adjoint(matrix[..., :, :]) * output[..., :, :] = rhs[..., :, :]`.
 //
 // Arguments:
-//	grads: 4-D with shape `[batch, height, width, channels]`.
-//	original_image: 4-D with shape `[batch, orig_height, orig_width, channels]`,
-// The image tensor that was resized.
+//	matrix: Shape is `[..., M, M]`.
+//	rhs: Shape is `[..., M, K]`.
 //
-// Returns 4-D with shape `[batch, orig_height, orig_width, channels]`.
-// Gradients with respect to the input image. Input image must have been
-// float or double.
-func ResizeBilinearGrad(scope *Scope, grads tf.Output, original_image tf.Output, optional ...ResizeBilinearGradAttr) (output tf.Output) {
+// Returns Shape is `[..., M, K]`.
+func MatrixSolve(scope *Scope, matrix tf.Output, rhs tf.Output, optional ...MatrixSolveAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -20471,9 +21316,9 @@ func ResizeBilinearGrad(scope *Scope, grads tf.Output, original_image tf.Output,
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "ResizeBilinearGrad",
+		Type: "MatrixSolve",
 		Input: []tf.Input{
-			grads, original_image,
+			matrix, rhs,
 		},
 		Attrs: attrs,
 	}
@@ -20481,31 +21326,32 @@ func ResizeBilinearGrad(scope *Scope, grads tf.Output, original_image tf.Output,
 	return op.Output(0)
 }
 
-// ResizeNearestNeighborGradAttr is an optional argument to ResizeNearestNeighborGrad.
-type ResizeNearestNeighborGradAttr func(optionalAttr)
+// SumAttr is an optional argument to Sum.
+type SumAttr func(optionalAttr)
 
-// ResizeNearestNeighborGradAlignCorners sets the optional align_corners attribute to value.
+// SumKeepDims sets the optional keep_dims attribute to value.
 //
-// value: If true, rescale grads by (orig_height - 1) / (height - 1), which
-// exactly aligns the 4 corners of grads and original_image. If false, rescale by
-// orig_height / height. Treat similarly the width dimension.
+// value: If true, retain reduced dimensions with length 1.
 // If not specified, defaults to false
-func ResizeNearestNeighborGradAlignCorners(value bool) ResizeNearestNeighborGradAttr {
+func SumKeepDims(value bool) SumAttr {
 	return func(m optionalAttr) {
-		m["align_corners"] = value
+		m["keep_dims"] = value
 	}
 }
 
-// Computes the gradient of nearest neighbor interpolation.
+// Computes the sum of elements across dimensions of a tensor.
+//
+// Reduces `input` along the dimensions given in `reduction_indices`. Unless
+// `keep_dims` is true, the rank of the tensor is reduced by 1 for each entry in
+// `reduction_indices`. If `keep_dims` is true, the reduced dimensions are
+// retained with length 1.
 //
 // Arguments:
-//	grads: 4-D with shape `[batch, height, width, channels]`.
-//	size: = A 1-D int32 Tensor of 2 elements: `orig_height, orig_width`. The
-// original input size.
+//	input: The tensor to reduce.
+//	reduction_indices: The dimensions to reduce.
 //
-// Returns 4-D with shape `[batch, orig_height, orig_width, channels]`. Gradients
-// with respect to the input image.
-func ResizeNearestNeighborGrad(scope *Scope, grads tf.Output, size tf.Output, optional ...ResizeNearestNeighborGradAttr) (output tf.Output) {
+// Returns The reduced tensor.
+func Sum(scope *Scope, input tf.Output, reduction_indices tf.Output, optional ...SumAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -20514,9 +21360,9 @@ func ResizeNearestNeighborGrad(scope *Scope, grads tf.Output, size tf.Output, op
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "ResizeNearestNeighborGrad",
+		Type: "Sum",
 		Input: []tf.Input{
-			grads, size,
+			input, reduction_indices,
 		},
 		Attrs: attrs,
 	}
@@ -20524,167 +21370,192 @@ func ResizeNearestNeighborGrad(scope *Scope, grads tf.Output, size tf.Output, op
 	return op.Output(0)
 }
 
-// DecodeJpegAttr is an optional argument to DecodeJpeg.
-type DecodeJpegAttr func(optionalAttr)
-
-// DecodeJpegChannels sets the optional channels attribute to value.
+// Partitions `data` into `num_partitions` tensors using indices from `partitions`.
 //
-// value: Number of color channels for the decoded image.
-// If not specified, defaults to 0
-func DecodeJpegChannels(value int64) DecodeJpegAttr {
-	return func(m optionalAttr) {
-		m["channels"] = value
-	}
-}
-
-// DecodeJpegRatio sets the optional ratio attribute to value.
+// For each index tuple `js` of size `partitions.ndim`, the slice `data[js, ...]`
+// becomes part of `outputs[partitions[js]]`.  The slices with `partitions[js] = i`
+// are placed in `outputs[i]` in lexicographic order of `js`, and the first
+// dimension of `outputs[i]` is the number of entries in `partitions` equal to `i`.
+// In detail,
 //
-// value: Downscaling ratio.
-// If not specified, defaults to 1
-func DecodeJpegRatio(value int64) DecodeJpegAttr {
-	return func(m optionalAttr) {
-		m["ratio"] = value
-	}
-}
-
-// DecodeJpegFancyUpscaling sets the optional fancy_upscaling attribute to value.
+// ```python
+//     outputs[i].shape = [sum(partitions == i)] + data.shape[partitions.ndim:]
 //
-// value: If true use a slower but nicer upscaling of the
-// chroma planes (yuv420/422 only).
-// If not specified, defaults to true
-func DecodeJpegFancyUpscaling(value bool) DecodeJpegAttr {
-	return func(m optionalAttr) {
-		m["fancy_upscaling"] = value
-	}
-}
-
-// DecodeJpegTryRecoverTruncated sets the optional try_recover_truncated attribute to value.
+//     outputs[i] = pack([data[js, ...] for js if partitions[js] == i])
+// ```
 //
-// value: If true try to recover an image from truncated input.
-// If not specified, defaults to false
-func DecodeJpegTryRecoverTruncated(value bool) DecodeJpegAttr {
-	return func(m optionalAttr) {
-		m["try_recover_truncated"] = value
-	}
-}
-
-// DecodeJpegAcceptableFraction sets the optional acceptable_fraction attribute to value.
+// `data.shape` must start with `partitions.shape`.
 //
-// value: The minimum required fraction of lines before a truncated
-// input is accepted.
-// If not specified, defaults to 1
-func DecodeJpegAcceptableFraction(value float32) DecodeJpegAttr {
-	return func(m optionalAttr) {
-		m["acceptable_fraction"] = value
+// For example:
+//
+// ```python
+//     # Scalar partitions.
+//     partitions = 1
+//     num_partitions = 2
+//     data = [10, 20]
+//     outputs[0] = []  # Empty with shape [0, 2]
+//     outputs[1] = [[10, 20]]
+//
+//     # Vector partitions.
+//     partitions = [0, 0, 1, 1, 0]
+//     num_partitions = 2
+//     data = [10, 20, 30, 40, 50]
+//     outputs[0] = [10, 20, 50]
+//     outputs[1] = [30, 40]
+// ```
+//
+// See `dynamic_stitch` for an example on how to merge partitions back.
+//
+// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
+// <img style="width:100%" src="https://www.tensorflow.org/images/DynamicPartition.png" alt>
+// </div>
+//
+// Arguments:
+//
+//	partitions: Any shape.  Indices in the range `[0, num_partitions)`.
+//	num_partitions: The number of partitions to output.
+func DynamicPartition(scope *Scope, data tf.Output, partitions tf.Output, num_partitions int64) (outputs []tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"num_partitions": num_partitions}
+	opspec := tf.OpSpec{
+		Type: "DynamicPartition",
+		Input: []tf.Input{
+			data, partitions,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	if scope.Err() != nil {
+		return
+	}
+	var idx int
+	var err error
+	if outputs, idx, err = makeOutputList(op, idx, "outputs"); err != nil {
+		scope.UpdateErr("DynamicPartition", err)
+		return
 	}
+	return outputs
 }
 
-// DecodeJpegDctMethod sets the optional dct_method attribute to value.
+// Serialize a `SparseTensor` into a string 3-vector (1-D `Tensor`) object.
 //
-// value: string specifying a hint about the algorithm used for
-// decompression.  Defaults to "" which maps to a system-specific
-// default.  Currently valid values are ["INTEGER_FAST",
-// "INTEGER_ACCURATE"].  The hint may be ignored (e.g., the internal
-// jpeg library changes to a version that does not have that specific
-// option.)
-// If not specified, defaults to ""
-func DecodeJpegDctMethod(value string) DecodeJpegAttr {
-	return func(m optionalAttr) {
-		m["dct_method"] = value
+// Arguments:
+//	sparse_indices: 2-D.  The `indices` of the `SparseTensor`.
+//	sparse_values: 1-D.  The `values` of the `SparseTensor`.
+//	sparse_shape: 1-D.  The `shape` of the `SparseTensor`.
+func SerializeSparse(scope *Scope, sparse_indices tf.Output, sparse_values tf.Output, sparse_shape tf.Output) (serialized_sparse tf.Output) {
+	if scope.Err() != nil {
+		return
 	}
+	opspec := tf.OpSpec{
+		Type: "SerializeSparse",
+		Input: []tf.Input{
+			sparse_indices, sparse_values, sparse_shape,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// Decode a JPEG-encoded image to a uint8 tensor.
-//
-// The attr `channels` indicates the desired number of color channels for the
-// decoded image.
-//
-// Accepted values are:
-//
-// *   0: Use the number of channels in the JPEG-encoded image.
-// *   1: output a grayscale image.
-// *   3: output an RGB image.
-//
-// If needed, the JPEG-encoded image is transformed to match the requested number
-// of color channels.
-//
-// The attr `ratio` allows downscaling the image by an integer factor during
-// decoding.  Allowed values are: 1, 2, 4, and 8.  This is much faster than
-// downscaling the image later.
+// Computes the reciprocal of x element-wise.
 //
-// Arguments:
-//	contents: 0-D.  The JPEG-encoded image.
+// DEPRECATED at GraphDef version 17: Use Reciprocal
 //
-// Returns 3-D with shape `[height, width, channels]`..
-func DecodeJpeg(scope *Scope, contents tf.Output, optional ...DecodeJpegAttr) (image tf.Output) {
+// I.e., \\(y = 1 / x\\).
+func Inv(scope *Scope, x tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "DecodeJpeg",
+		Type: "Inv",
 		Input: []tf.Input{
-			contents,
+			x,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// TensorArrayGatherV3Attr is an optional argument to TensorArrayGatherV3.
-type TensorArrayGatherV3Attr func(optionalAttr)
-
-// TensorArrayGatherV3ElementShape sets the optional element_shape attribute to value.
+// Returns x / y element-wise for integer types.
 //
-// value: The expected shape of an element, if known. Used to
-// validate the shapes of TensorArray elements. If this shape is not
-// fully specified, gathering zero-size TensorArrays is an error.
-// If not specified, defaults to <unknown_rank:true >
-func TensorArrayGatherV3ElementShape(value tf.Shape) TensorArrayGatherV3Attr {
-	return func(m optionalAttr) {
-		m["element_shape"] = value
+// Truncation designates that negative numbers will round fractional quantities
+// toward zero. I.e. -7 / 5 = 1. This matches C semantics but it is different
+// than Python semantics. See `FloorDiv` for a division function that matches
+// Python Semantics.
+//
+// *NOTE*: `TruncateDiv` supports broadcasting. More about broadcasting
+// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+func TruncateDiv(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+	if scope.Err() != nil {
+		return
 	}
+	opspec := tf.OpSpec{
+		Type: "TruncateDiv",
+		Input: []tf.Input{
+			x, y,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// Gather specific elements from the TensorArray into output `value`.
+// Restores tensors from a V2 checkpoint.
 //
-// All elements selected by `indices` must have the same shape.
+// For backward compatibility with the V1 format, this Op currently allows
+// restoring from a V1 checkpoint as well:
+//   - This Op first attempts to find the V2 index file pointed to by "prefix", and
+//     if found proceed to read it as a V2 checkpoint;
+//   - Otherwise the V1 read path is invoked.
+// Relying on this behavior is not recommended, as the ability to fall back to read
+// V1 might be deprecated and eventually removed.
+//
+// By default, restores the named tensors in full.  If the caller wishes to restore
+// specific slices of stored tensors, "shape_and_slices" should be non-empty
+// strings and correspondingly well-formed.
+//
+// Callers must ensure all the named tensors are indeed stored in the checkpoint.
 //
 // Arguments:
-//	handle: The handle to a TensorArray.
-//	indices: The locations in the TensorArray from which to read tensor elements.
-//	flow_in: A float scalar that enforces proper chaining of operations.
-//	dtype: The type of the elem that is returned.
+//	prefix: Must have a single element.  The prefix of a V2 checkpoint.
+//	tensor_names: shape {N}.  The names of the tensors to be restored.
+//	shape_and_slices: shape {N}.  The slice specs of the tensors to be restored.
+// Empty strings indicate that they are non-partitioned tensors.
+//	dtypes: shape {N}.  The list of expected dtype for the tensors.  Must match
+// those stored in the checkpoint.
 //
-// Returns All of the elements in the TensorArray, concatenated along a new
-// axis (the new dimension 0).
-func TensorArrayGatherV3(scope *Scope, handle tf.Output, indices tf.Output, flow_in tf.Output, dtype tf.DataType, optional ...TensorArrayGatherV3Attr) (value tf.Output) {
+// Returns shape {N}.  The restored tensors, whose shapes are read from the
+// checkpoint directly.
+func RestoreV2(scope *Scope, prefix tf.Output, tensor_names tf.Output, shape_and_slices tf.Output, dtypes []tf.DataType) (tensors []tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"dtype": dtype}
-	for _, a := range optional {
-		a(attrs)
-	}
+	attrs := map[string]interface{}{"dtypes": dtypes}
 	opspec := tf.OpSpec{
-		Type: "TensorArrayGatherV3",
+		Type: "RestoreV2",
 		Input: []tf.Input{
-			handle, indices, flow_in,
+			prefix, tensor_names, shape_and_slices,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	if scope.Err() != nil {
+		return
+	}
+	var idx int
+	var err error
+	if tensors, idx, err = makeOutputList(op, idx, "tensors"); err != nil {
+		scope.UpdateErr("RestoreV2", err)
+		return
+	}
+	return tensors
 }
 
-// MaxPoolGradGradAttr is an optional argument to MaxPoolGradGrad.
-type MaxPoolGradGradAttr func(optionalAttr)
+// AvgPoolAttr is an optional argument to AvgPool.
+type AvgPoolAttr func(optionalAttr)
 
-// MaxPoolGradGradDataFormat sets the optional data_format attribute to value.
+// AvgPoolDataFormat sets the optional data_format attribute to value.
 //
 // value: Specify the data format of the input and output data. With the
 // default format "NHWC", the data is stored in the order of:
@@ -20692,25 +21563,25 @@ type MaxPoolGradGradAttr func(optionalAttr)
 // Alternatively, the format could be "NCHW", the data storage order of:
 //     [batch, in_channels, in_height, in_width].
 // If not specified, defaults to "NHWC"
-func MaxPoolGradGradDataFormat(value string) MaxPoolGradGradAttr {
+func AvgPoolDataFormat(value string) AvgPoolAttr {
 	return func(m optionalAttr) {
 		m["data_format"] = value
 	}
 }
 
-// Computes second-order gradients of the maxpooling function.
+// Performs average pooling on the input.
+//
+// Each entry in `output` is the mean of the corresponding size `ksize`
+// window in `value`.
 //
 // Arguments:
-//	orig_input: The original input tensor.
-//	orig_output: The original output tensor.
-//	grad: 4-D.  Gradients of gradients w.r.t. the input of `max_pool`.
-//	ksize: The size of the window for each dimension of the input tensor.
-//	strides: The stride of the sliding window for each dimension of the
-// input tensor.
+//	value: 4-D with shape `[batch, height, width, channels]`.
+//	ksize: The size of the sliding window for each dimension of `value`.
+//	strides: The stride of the sliding window for each dimension of `value`.
 //	padding: The type of padding algorithm to use.
 //
-// Returns Gradients of gradients w.r.t. the input to `max_pool`.
-func MaxPoolGradGrad(scope *Scope, orig_input tf.Output, orig_output tf.Output, grad tf.Output, ksize []int64, strides []int64, padding string, optional ...MaxPoolGradGradAttr) (output tf.Output) {
+// Returns The average pooled output tensor.
+func AvgPool(scope *Scope, value tf.Output, ksize []int64, strides []int64, padding string, optional ...AvgPoolAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -20719,9 +21590,9 @@ func MaxPoolGradGrad(scope *Scope, orig_input tf.Output, orig_output tf.Output,
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "MaxPoolGradGrad",
+		Type: "AvgPool",
 		Input: []tf.Input{
-			orig_input, orig_output, grad,
+			value,
 		},
 		Attrs: attrs,
 	}
@@ -20729,237 +21600,214 @@ func MaxPoolGradGrad(scope *Scope, orig_input tf.Output, orig_output tf.Output,
 	return op.Output(0)
 }
 
-// 3D real-valued fast Fourier transform.
+// Computes the mean along sparse segments of a tensor.
 //
-// Computes the 3-dimensional discrete Fourier transform of a real-valued signal
-// over the inner-most 3 dimensions of `input`.
+// Read @{$math_ops#segmentation$the section on segmentation} for an explanation of
+// segments.
 //
-// Since the DFT of a real signal is Hermitian-symmetric, `RFFT3D` only returns the
-// `fft_length / 2 + 1` unique components of the FFT for the inner-most dimension
-// of `output`: the zero-frequency term, followed by the `fft_length / 2`
-// positive-frequency terms.
+// Like `SegmentMean`, but `segment_ids` can have rank less than `data`'s first
+// dimension, selecting a subset of dimension 0, specified by `indices`.
 //
 // Arguments:
-//	input: A float32 tensor.
-//	fft_length: An int32 tensor of shape [3]. The FFT length for each dimension.
 //
-// Returns A complex64 tensor of the same rank as `input`. The inner-most 3
-//   dimensions of `input` are replaced with the their 3D Fourier transform. The
-//   inner-most dimension contains `fft_length / 2 + 1` unique frequency
-//   components.
+//	indices: A 1-D tensor. Has same rank as `segment_ids`.
+//	segment_ids: A 1-D tensor. Values should be sorted and can be repeated.
 //
-// @compatibility(numpy)
-// Equivalent to np.fft.rfftn with 3 dimensions.
-// @end_compatibility
-func RFFT3D(scope *Scope, input tf.Output, fft_length tf.Output) (output tf.Output) {
+// Returns Has same shape as data, except for dimension 0 which
+// has size `k`, the number of segments.
+func SparseSegmentMean(scope *Scope, data tf.Output, indices tf.Output, segment_ids tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "RFFT3D",
+		Type: "SparseSegmentMean",
 		Input: []tf.Input{
-			input, fft_length,
+			data, indices, segment_ids,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Deprecated. Disallowed in GraphDef version >= 2.
+// WholeFileReaderV2Attr is an optional argument to WholeFileReaderV2.
+type WholeFileReaderV2Attr func(optionalAttr)
+
+// WholeFileReaderV2Container sets the optional container attribute to value.
 //
-// DEPRECATED at GraphDef version 2: Use AdjustContrastv2 instead
-func AdjustContrast(scope *Scope, images tf.Output, contrast_factor tf.Output, min_value tf.Output, max_value tf.Output) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "AdjustContrast",
-		Input: []tf.Input{
-			images, contrast_factor, min_value, max_value,
-		},
+// value: If non-empty, this reader is placed in the given container.
+// Otherwise, a default container is used.
+// If not specified, defaults to ""
+func WholeFileReaderV2Container(value string) WholeFileReaderV2Attr {
+	return func(m optionalAttr) {
+		m["container"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// Store the input tensor in the state of the current session.
-//
-// Arguments:
-//	value: The tensor to be stored.
+// WholeFileReaderV2SharedName sets the optional shared_name attribute to value.
 //
-// Returns The handle for the tensor stored in the session state, represented
-// as a ResourceHandle object.
-func GetSessionHandleV2(scope *Scope, value tf.Output) (handle tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "GetSessionHandleV2",
-		Input: []tf.Input{
-			value,
-		},
+// value: If non-empty, this reader is named in the given bucket
+// with this shared_name. Otherwise, the node name is used instead.
+// If not specified, defaults to ""
+func WholeFileReaderV2SharedName(value string) WholeFileReaderV2Attr {
+	return func(m optionalAttr) {
+		m["shared_name"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// Restore a Reader to its initial clean state.
+// A Reader that outputs the entire contents of a file as a value.
 //
-// Arguments:
-//	reader_handle: Handle to a Reader.
+// To use, enqueue filenames in a Queue.  The output of ReaderRead will
+// be a filename (key) and the contents of that file (value).
 //
-// Returns the created operation.
-func ReaderResetV2(scope *Scope, reader_handle tf.Output) (o *tf.Operation) {
+// Returns The handle to reference the Reader.
+func WholeFileReaderV2(scope *Scope, optional ...WholeFileReaderV2Attr) (reader_handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "ReaderResetV2",
-		Input: []tf.Input{
-			reader_handle,
-		},
+		Type: "WholeFileReaderV2",
+
+		Attrs: attrs,
 	}
-	return scope.AddOperation(opspec)
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// Adjust the hue of one or more images.
-//
-// `images` is a tensor of at least 3 dimensions.  The last dimension is
-// interpretted as channels, and must be three.
-//
-// The input image is considered in the RGB colorspace. Conceptually, the RGB
-// colors are first mapped into HSV. A delta is then applied all the hue values,
-// and then remapped back to RGB colorspace.
+// Returns up to `num_records` (key, value) pairs produced by a Reader.
+//
+// Will dequeue from the input queue if necessary (e.g. when the
+// Reader needs to start reading from a new file since it has finished
+// with the previous file).
+// It may return less than `num_records` even before the last batch.
 //
 // Arguments:
-//	images: Images to adjust.  At least 3-D.
-//	delta: A float delta to add to the hue.
+//	reader_handle: Handle to a `Reader`.
+//	queue_handle: Handle to a `Queue`, with string work items.
+//	num_records: number of records to read from `Reader`.
 //
-// Returns The hue-adjusted image or images.
-func AdjustHue(scope *Scope, images tf.Output, delta tf.Output) (output tf.Output) {
+// Returns A 1-D tensor.A 1-D tensor.
+func ReaderReadUpToV2(scope *Scope, reader_handle tf.Output, queue_handle tf.Output, num_records tf.Output) (keys tf.Output, values tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "AdjustHue",
+		Type: "ReaderReadUpToV2",
 		Input: []tf.Input{
-			images, delta,
+			reader_handle, queue_handle, num_records,
 		},
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1)
 }
 
-// SelfAdjointEigV2Attr is an optional argument to SelfAdjointEigV2.
-type SelfAdjointEigV2Attr func(optionalAttr)
+// Conv2DBackpropFilterAttr is an optional argument to Conv2DBackpropFilter.
+type Conv2DBackpropFilterAttr func(optionalAttr)
 
-// SelfAdjointEigV2ComputeV sets the optional compute_v attribute to value.
-//
-// value: If `True` then eigenvectors will be computed and returned in `v`.
-// Otherwise, only the eigenvalues will be computed.
+// Conv2DBackpropFilterUseCudnnOnGpu sets the optional use_cudnn_on_gpu attribute to value.
 // If not specified, defaults to true
-func SelfAdjointEigV2ComputeV(value bool) SelfAdjointEigV2Attr {
+func Conv2DBackpropFilterUseCudnnOnGpu(value bool) Conv2DBackpropFilterAttr {
 	return func(m optionalAttr) {
-		m["compute_v"] = value
+		m["use_cudnn_on_gpu"] = value
 	}
 }
 
-// Computes the eigen decomposition of one or more square self-adjoint matrices.
-//
-// Computes the eigenvalues and (optionally) eigenvectors of each inner matrix in
-// `input` such that `input[..., :, :] = v[..., :, :] * diag(e[..., :])`.
+// Conv2DBackpropFilterDataFormat sets the optional data_format attribute to value.
 //
-// ```prettyprint
-// # a is a tensor.
-// # e is a tensor of eigenvalues.
-// # v is a tensor of eigenvectors.
-// e, v = self_adjoint_eig(a)
-// e = self_adjoint_eig(a, compute_v=False)
-// ```
+// value: Specify the data format of the input and output data. With the
+// default format "NHWC", the data is stored in the order of:
+//     [batch, in_height, in_width, in_channels].
+// Alternatively, the format could be "NCHW", the data storage order of:
+//     [batch, in_channels, in_height, in_width].
+// If not specified, defaults to "NHWC"
+func Conv2DBackpropFilterDataFormat(value string) Conv2DBackpropFilterAttr {
+	return func(m optionalAttr) {
+		m["data_format"] = value
+	}
+}
+
+// Computes the gradients of convolution with respect to the filter.
 //
 // Arguments:
-//	input: `Tensor` input of shape `[N, N]`.
+//	input: 4-D with shape `[batch, in_height, in_width, in_channels]`.
+//	filter_sizes: An integer vector representing the tensor shape of `filter`,
+// where `filter` is a 4-D
+// `[filter_height, filter_width, in_channels, out_channels]` tensor.
+//	out_backprop: 4-D with shape `[batch, out_height, out_width, out_channels]`.
+// Gradients w.r.t. the output of the convolution.
+//	strides: The stride of the sliding window for each dimension of the input
+// of the convolution. Must be in the same order as the dimension specified with
+// format.
+//	padding: The type of padding algorithm to use.
 //
-// Returns Eigenvalues. Shape is `[N]`.Eigenvectors. Shape is `[N, N]`.
-func SelfAdjointEigV2(scope *Scope, input tf.Output, optional ...SelfAdjointEigV2Attr) (e tf.Output, v tf.Output) {
+// Returns 4-D with shape
+// `[filter_height, filter_width, in_channels, out_channels]`.  Gradient w.r.t.
+// the `filter` input of the convolution.
+func Conv2DBackpropFilter(scope *Scope, input tf.Output, filter_sizes tf.Output, out_backprop tf.Output, strides []int64, padding string, optional ...Conv2DBackpropFilterAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"strides": strides, "padding": padding}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "SelfAdjointEigV2",
+		Type: "Conv2DBackpropFilter",
 		Input: []tf.Input{
-			input,
+			input, filter_sizes, out_backprop,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1)
+	return op.Output(0)
 }
 
-// Adjust the saturation of one or more images.
-//
-// `images` is a tensor of at least 3 dimensions.  The last dimension is
-// interpretted as channels, and must be three.
-//
-// The input image is considered in the RGB colorspace. Conceptually, the RGB
-// colors are first mapped into HSV. A scale is then applied all the saturation
-// values, and then remapped back to RGB colorspace.
-//
-// Arguments:
-//	images: Images to adjust.  At least 3-D.
-//	scale: A float scale to add to the saturation.
+// Computes Psi, the derivative of Lgamma (the log of the absolute value of
 //
-// Returns The hue-adjusted image or images.
-func AdjustSaturation(scope *Scope, images tf.Output, scale tf.Output) (output tf.Output) {
+// `Gamma(x)`), element-wise.
+func Digamma(scope *Scope, x tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "AdjustSaturation",
+		Type: "Digamma",
 		Input: []tf.Input{
-			images, scale,
+			x,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// EncodePngAttr is an optional argument to EncodePng.
-type EncodePngAttr func(optionalAttr)
+// MinAttr is an optional argument to Min.
+type MinAttr func(optionalAttr)
 
-// EncodePngCompression sets the optional compression attribute to value.
+// MinKeepDims sets the optional keep_dims attribute to value.
 //
-// value: Compression level.
-// If not specified, defaults to -1
-func EncodePngCompression(value int64) EncodePngAttr {
+// value: If true, retain reduced dimensions with length 1.
+// If not specified, defaults to false
+func MinKeepDims(value bool) MinAttr {
 	return func(m optionalAttr) {
-		m["compression"] = value
+		m["keep_dims"] = value
 	}
 }
 
-// PNG-encode an image.
-//
-// `image` is a 3-D uint8 or uint16 Tensor of shape `[height, width, channels]`
-// where `channels` is:
-//
-// *   1: for grayscale.
-// *   2: for grayscale + alpha.
-// *   3: for RGB.
-// *   4: for RGBA.
+// Computes the minimum of elements across dimensions of a tensor.
 //
-// The ZLIB compression level, `compression`, can be -1 for the PNG-encoder
-// default or a value from 0 to 9.  9 is the highest compression level, generating
-// the smallest output, but is slower.
+// Reduces `input` along the dimensions given in `reduction_indices`. Unless
+// `keep_dims` is true, the rank of the tensor is reduced by 1 for each entry in
+// `reduction_indices`. If `keep_dims` is true, the reduced dimensions are
+// retained with length 1.
 //
 // Arguments:
-//	image: 3-D with shape `[height, width, channels]`.
+//	input: The tensor to reduce.
+//	reduction_indices: The dimensions to reduce.
 //
-// Returns 0-D. PNG-encoded image.
-func EncodePng(scope *Scope, image tf.Output, optional ...EncodePngAttr) (contents tf.Output) {
+// Returns The reduced tensor.
+func Min(scope *Scope, input tf.Output, reduction_indices tf.Output, optional ...MinAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -20968,9 +21816,9 @@ func EncodePng(scope *Scope, image tf.Output, optional ...EncodePngAttr) (conten
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "EncodePng",
+		Type: "Min",
 		Input: []tf.Input{
-			image,
+			input, reduction_indices,
 		},
 		Attrs: attrs,
 	}
@@ -20978,62 +21826,46 @@ func EncodePng(scope *Scope, image tf.Output, optional ...EncodePngAttr) (conten
 	return op.Output(0)
 }
 
-// MatrixSolveLsAttr is an optional argument to MatrixSolveLs.
-type MatrixSolveLsAttr func(optionalAttr)
+// Returns the number of work units this Reader has finished processing.
+//
+// Arguments:
+//	reader_handle: Handle to a Reader.
+func ReaderNumWorkUnitsCompletedV2(scope *Scope, reader_handle tf.Output) (units_completed tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "ReaderNumWorkUnitsCompletedV2",
+		Input: []tf.Input{
+			reader_handle,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
 
-// MatrixSolveLsFast sets the optional fast attribute to value.
-// If not specified, defaults to true
-func MatrixSolveLsFast(value bool) MatrixSolveLsAttr {
+// ShapeAttr is an optional argument to Shape.
+type ShapeAttr func(optionalAttr)
+
+// ShapeOutType sets the optional out_type attribute to value.
+// If not specified, defaults to DT_INT32
+func ShapeOutType(value tf.DataType) ShapeAttr {
 	return func(m optionalAttr) {
-		m["fast"] = value
+		m["out_type"] = value
 	}
 }
 
-// Solves one or more linear least-squares problems.
-//
-// `matrix` is a tensor of shape `[..., M, N]` whose inner-most 2 dimensions
-// form matrices of size `[M, N]`. Rhs is a tensor of shape `[..., M, K]`.
-// The output is a tensor shape `[..., N, K]` where each output matrix solves
-// each of the equations matrix[..., :, :] * output[..., :, :] = rhs[..., :, :]
-// in the least squares sense.
-//
-// matrix and right-hand sides in the batch:
-//
-// `matrix`=\\(A \in \Re^{m \times n}\\),
-// `rhs`=\\(B  \in \Re^{m \times k}\\),
-// `output`=\\(X  \in \Re^{n \times k}\\),
-// `l2_regularizer`=\\(\lambda\\).
-//
-// If `fast` is `True`, then the solution is computed by solving the normal
-// equations using Cholesky decomposition. Specifically, if \\(m \ge n\\) then
-// \\(X = (A^T A + \lambda I)^{-1} A^T B\\), which solves the least-squares
-// problem \\(X = \mathrm{argmin}_{Z \in \Re^{n \times k} } ||A Z - B||_F^2 +
-// \lambda ||Z||_F^2\\). If \\(m \lt n\\) then `output` is computed as
-// \\(X = A^T (A A^T + \lambda I)^{-1} B\\), which (for \\(\lambda = 0\\)) is the
-// minimum-norm solution to the under-determined linear system, i.e.
-// \\(X = \mathrm{argmin}_{Z \in \Re^{n \times k} } ||Z||_F^2 \\), subject to
-// \\(A Z = B\\). Notice that the fast path is only numerically stable when
-// \\(A\\) is numerically full rank and has a condition number
-// \\(\mathrm{cond}(A) \lt \frac{1}{\sqrt{\epsilon_{mach} } }\\) or\\(\lambda\\) is
-// sufficiently large.
-//
-// If `fast` is `False` an algorithm based on the numerically robust complete
-// orthogonal decomposition is used. This computes the minimum-norm
-// least-squares solution, even when \\(A\\) is rank deficient. This path is
-// typically 6-7 times slower than the fast path. If `fast` is `False` then
-// `l2_regularizer` is ignored.
+// Returns the shape of a tensor.
 //
-// Arguments:
-//	matrix: Shape is `[..., M, N]`.
-//	rhs: Shape is `[..., M, K]`.
-//	l2_regularizer: Scalar tensor.
+// This operation returns a 1-D integer tensor representing the shape of `input`.
 //
-// @compatibility(numpy)
-// Equivalent to np.linalg.lstsq
-// @end_compatibility
+// For example:
 //
-// Returns Shape is `[..., N, K]`.
-func MatrixSolveLs(scope *Scope, matrix tf.Output, rhs tf.Output, l2_regularizer tf.Output, optional ...MatrixSolveLsAttr) (output tf.Output) {
+// ```
+// # 't' is [[[1, 1, 1], [2, 2, 2]], [[3, 3, 3], [4, 4, 4]]]
+// shape(t) ==> [2, 2, 3]
+// ```
+func Shape(scope *Scope, input tf.Output, optional ...ShapeAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -21042,9 +21874,9 @@ func MatrixSolveLs(scope *Scope, matrix tf.Output, rhs tf.Output, l2_regularizer
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "MatrixSolveLs",
+		Type: "Shape",
 		Input: []tf.Input{
-			matrix, rhs, l2_regularizer,
+			input,
 		},
 		Attrs: attrs,
 	}
@@ -21052,30 +21884,27 @@ func MatrixSolveLs(scope *Scope, matrix tf.Output, rhs tf.Output, l2_regularizer
 	return op.Output(0)
 }
 
-// Converts one or more images from RGB to HSV.
-//
-// Outputs a tensor of the same shape as the `images` tensor, containing the HSV
-// value of the pixels. The output is only well defined if the value in `images`
-// are in `[0,1]`.
+// Computes softmax cross entropy cost and gradients to backpropagate.
 //
-// `output[..., 0]` contains hue, `output[..., 1]` contains saturation, and
-// `output[..., 2]` contains value. All HSV values are in `[0,1]`. A hue of 0
-// corresponds to pure red, hue 1/3 is pure green, and 2/3 is pure blue.
+// Inputs are the logits, not probabilities.
 //
 // Arguments:
-//	images: 1-D or higher rank. RGB data to convert. Last dimension must be size 3.
+//	features: batch_size x num_classes matrix
+//	labels: batch_size x num_classes matrix
+// The caller must ensure that each batch of labels represents a valid
+// probability distribution.
 //
-// Returns `images` converted to HSV.
-func RGBToHSV(scope *Scope, images tf.Output) (output tf.Output) {
+// Returns Per example loss (batch_size vector).backpropagated gradients (batch_size x num_classes matrix).
+func SoftmaxCrossEntropyWithLogits(scope *Scope, features tf.Output, labels tf.Output) (loss tf.Output, backprop tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "RGBToHSV",
+		Type: "SoftmaxCrossEntropyWithLogits",
 		Input: []tf.Input{
-			images,
+			features, labels,
 		},
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1)
 }
diff --git a/tensorflow/go/operation.go b/tensorflow/go/operation.go
index e8f67c4f7371d078c4f33c6f62b2af50b0af1d92..8fcad61f4c6eec597d2b14fb8c9b4fa59987a829 100644
--- a/tensorflow/go/operation.go
+++ b/tensorflow/go/operation.go
@@ -113,6 +113,11 @@ func (p Output) Shape() Shape {
 }
 
 func (p Output) c() C.TF_Output {
+	if p.Op == nil {
+		// Attempt to provide a more useful panic message than "nil
+		// pointer dereference".
+		panic("nil-Operation. If the Output was created with a Scope object, see Scope.Err() for details.")
+	}
 	return C.TF_Output{oper: p.Op.c, index: C.int(p.Index)}
 }
 
diff --git a/tensorflow/go/session.go b/tensorflow/go/session.go
index 3add412dcd8626d61d99313297bada677d8e844e..afa73030b8894c00ae2c619254bdfbc5068c9a53 100644
--- a/tensorflow/go/session.go
+++ b/tensorflow/go/session.go
@@ -199,7 +199,7 @@ func (s *Session) NewPartialRun(feeds, fetches []Output, targets []*Operation) (
 		return nil, err
 	}
 	runtime.SetFinalizer(pr, func(pr *PartialRun) {
-		deletePRunHandle(pr.handle)
+		C.TF_DeletePRunHandle(pr.handle)
 	})
 	return pr, nil
 }
diff --git a/tensorflow/java/BUILD b/tensorflow/java/BUILD
index f2904ad5a6942227e1952a1dbbd36c896e15aaa8..a8910248c1381b597f2cb4fc5ffb44896aa5aec2 100644
--- a/tensorflow/java/BUILD
+++ b/tensorflow/java/BUILD
@@ -54,6 +54,18 @@ java_test(
     ],
 )
 
+java_test(
+    name = "OperationTest",
+    size = "small",
+    srcs = ["src/test/java/org/tensorflow/OperationTest.java"],
+    test_class = "org.tensorflow.OperationTest",
+    deps = [
+        ":tensorflow",
+        ":testutil",
+        "@junit",
+    ],
+)
+
 java_test(
     name = "SavedModelBundleTest",
     size = "small",
diff --git a/tensorflow/java/README.md b/tensorflow/java/README.md
index 81e262db93fe9e6d9664cbfc27a03cb680e78dfb..337b55bccf025190d4eab48a72e27fe3b92c9fd5 100644
--- a/tensorflow/java/README.md
+++ b/tensorflow/java/README.md
@@ -1,154 +1,18 @@
 # TensorFlow for Java
 
-Java bindings for TensorFlow. ([Javadoc](https://www.tensorflow.org/api_docs/java/reference/org/tensorflow/package-summary))
-
-[![Maven Central](https://maven-badges.herokuapp.com/maven-central/org.tensorflow/tensorflow/badge.svg)](https://maven-badges.herokuapp.com/maven-central/org.tensorflow/tensorflow)
-
 > *WARNING*: The TensorFlow Java API is not currently covered by the TensorFlow
 > [API stability guarantees](https://www.tensorflow.org/programmers_guide/version_semantics).
 >
-> For using TensorFlow on Android refer to
+> For using TensorFlow on Android refer instead to
 > [contrib/android](https://www.tensorflow.org/code/tensorflow/contrib/android),
 > [makefile](https://www.tensorflow.org/code/tensorflow/contrib/makefile#android)
-> and/or the [Android
-> demo](https://www.tensorflow.org/code/tensorflow/examples/android).
-
-## Quickstart: Using [Apache Maven](https://maven.apache.org)
-
-TensorFlow for Java releases are included in
-[Maven Central](https://search.maven.org/#search%7Cga%7C1%7Cg%3A%22org.tensorflow%22%20AND%20a%3A%22tensorflow%22)
-and support Linux, OS X and Windows. To use it, add the following dependency to
-your project's `pom.xml`:
-
-```xml
-<dependency>
-  <groupId>org.tensorflow</groupId>
-  <artifactId>tensorflow</artifactId>
-  <version>1.1.0-rc2</version>
-</dependency>
-```
-
-That's all. As an example, to create a Maven project for the
-[label image example](https://www.tensorflow.org/code/tensorflow/java/src/main/java/org/tensorflow/examples/LabelImage.java):
-
-1.  Create a `pom.xml`:
-
-    ```xml
-    <project>
-        <modelVersion>4.0.0</modelVersion>
-        <groupId>org.myorg</groupId>
-        <artifactId>label-image</artifactId>
-        <version>1.0-SNAPSHOT</version>
-        <properties>
-          <exec.mainClass>org.tensorflow.examples.LabelImage</exec.mainClass>
-          <!-- The LabelImage example code requires at least JDK 1.7. -->
-          <!-- The maven compiler plugin defaults to a lower version -->
-          <maven.compiler.source>1.7</maven.compiler.source>
-          <maven.compiler.target>1.7</maven.compiler.target>
-        </properties>
-        <dependencies>
-          <dependency>
-            <groupId>org.tensorflow</groupId>
-            <artifactId>tensorflow</artifactId>
-            <version>1.1.0-rc2</version>
-          </dependency>
-        </dependencies>
-    </project>
-    ```
-
-2.  Download the [example source](https://raw.githubusercontent.com/tensorflow/tensorflow/master/tensorflow/java/src/main/java/org/tensorflow/examples/LabelImage.java)
-    into `src/main/java/org/tensorflow/examples`. On Linux and OS X, the following script should work:
-
-    ```sh
-    mkdir -p src/main/java/org/tensorflow/examples
-    curl -L "https://raw.githubusercontent.com/tensorflow/tensorflow/master/tensorflow/java/src/main/java/org/tensorflow/examples/LabelImage.java" -o src/main/java/org/tensorflow/examples/LabelImage.java
-    ```
-
-3.  Compile and execute:
-
-    ```sh
-    mvn compile exec:java
-    ```
-
-## Quickstart: Using `java` and `javac`
-
-This section describes how to use TensorFlow armed with just a JDK installation.
-
-1.  Download the Java archive (JAR):
-    [libtensorflow.jar](https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-1.1.0-rc2.jar)
-    (optionally, the Java sources:
-    [libtensorflow-src.jar](https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-src-1.1.0-rc2.jar)).
-
-2.  Download the native library. GPU-enabled versions required CUDA 8 and cuDNN
-    5.1. For other versions, the native library will need to be built from
-    source (see below).
-
-    -   Linux:
-        [CPU-only](https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow_jni-cpu-linux-x86_64-1.1.0-rc2.tar.gz),
-        [GPU-enabled](https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow_jni-gpu-linux-x86_64-1.1.0-rc2.tar.gz)
-    -   OS X:
-        [CPU-only](https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow_jni-cpu-darwin-x86_64-1.1.0-rc2.tar.gz),
-        [GPU-enabled](https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow_jni-gpu-darwin-x86_64-1.1.0-rc2.tar.gz)
-    -   Windows:
-        [CPU-only](https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow_jni-cpu-windows-x86_64-1.1.0-rc2.zip)
-
-
-    The following shell snippet downloads and extracts the native library on
-    Linux and OS X. For Windows, download and extract manually.
-
-    ```sh
-    TF_TYPE="cpu" # Set to "gpu" to enable GPU support
-    OS=$(uname -s | tr '[:upper:]' '[:lower:]')
-    mkdir -p ./jni
-    curl -L \
-      "https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow_jni-${TF_TYPE}-${OS}-x86_64-1.1.0-rc2.tar.gz" |
-    tar -xz -C ./jni
-    ```
-
-3.  Include the downloaded `.jar` in the classpath during compilation. For
-    example, if your program looks like the following:
-
-    ```java
-    import org.tensorflow.TensorFlow;
-
-    public class MyClass {
-      public static void main(String[] args) {
-        System.out.println("I'm using TensorFlow version: " +  TensorFlow.version());
-      }
-    }
-    ```
-
-    then it should be compiled with:
+> and/or the [Android demo](https://www.tensorflow.org/code/tensorflow/examples/android).
 
-    ```sh
-    javac -cp libtensorflow-1.1.0-rc2.jar MyClass.java
-    ```
+## Quickstart
 
-    For a more sophisticated example, see
-    [LabelImage.java](https://www.tensorflow.org/code/tensorflow/java/src/main/java/org/tensorflow/examples/LabelImage.java),
-    which can be compiled with:
-
-    ```sh
-    javac \
-      -cp libtensorflow-1.1.0-rc2.jar \
-      ./src/main/java/org/tensorflow/examples/LabelImage.java
-    ```
-
-4.  Include the downloaded `.jar` in the classpath and the native library in the
-    library path during execution. For example:
-
-    ```sh
-    java -cp libtensorflow-1.1.0-rc2.jar:. -Djava.library.path=./jni MyClass
-    ```
-
-    or for the `LabelImage` example:
-
-    ```sh
-    java \
-      -Djava.library.path=./jni \
-      -cp libtensorflow-1.1.0-rc2.jar:./src/main/java \
-      org.tensorflow.examples.LabelImage
-    ```
+-   Refer to [Installing TensorFlow for Java](https://www.tensorflow.org/install/install_java)
+-   [Javadoc](https://www.tensorflow.org/api_docs/java/reference/org/tensorflow/package-summary)
+-   [![Maven Central](https://maven-badges.herokuapp.com/maven-central/org.tensorflow/tensorflow/badge.svg)](https://maven-badges.herokuapp.com/maven-central/org.tensorflow/tensorflow)
 
 ## Building from source
 
@@ -172,7 +36,6 @@ native libraries will need to be built from source.
     brew install swig
     ```
 
-
 3.  [Configure](https://www.tensorflow.org/install/install_sources#configure_the_installation)
     (e.g., enable GPU support) and build:
 
@@ -183,20 +46,39 @@ native libraries will need to be built from source.
       //tensorflow/java:libtensorflow_jni
     ```
 
-The JAR (`libtensorflow.jar`) and native library (`libtensorflow_jni.so` on
-Linux, `libtensorflow_jni.dylib` on OS X, `tensorflow_jni.dll` on Windows) will
-be in `bazel-bin/tensorflow/java`. Using these artifacts follow both steps 3
-and 4 in the previous section in order to get your application
-up and running.
+The command above will produce two files in the `bazel-bin/tensorflow/java`
+directory:
+
+*   An archive of Java classes: `libtensorflow.jar`
+*   A native library: `libtensorflow_jni.so` on Linux, `libtensorflow_jni.dylib`
+    on OS X, or `tensorflow_jni.dll` on Windows.
+
+To compile Java code that uses the TensorFlow Java API, include
+`libtensorflow.jar` in the classpath. For example:
+
+```sh
+javac -cp bazel-bin/tensorflow/java/libtensorflow.jar ...
+```
+
+To execute the compiled program, include `libtensorflow.jar` in the classpath
+and the native library in the library path. For example:
+
+```sh
+java -cp bazel-bin/tensorflow/java/libtensorflow.jar \
+  -Djava.library.path=bazel-bin/tensorflow/java \
+  ...
+```
 
-Installation on Windows requires the more experimental [bazel on Windows](https://bazel.build/versions/master/docs/windows.html).
-Details are elided here, but find inspiration in the script used for
-building the release archive:
+Installation on Windows requires the more experimental [bazel on
+Windows](https://bazel.build/versions/master/docs/windows.html). Details are
+omitted here, but find inspiration in the script used for building the release
+archive:
 [`tensorflow/tools/ci_build/windows/libtensorflow_cpu.sh`](https://www.tensorflow.org/code/tensorflow/tools/ci_build/windows/libtensorflow_cpu.sh).
 
 ### Maven
 
-Details of the release process for Maven Central are in [`maven/README.md`](https://www.tensorflow.org/code/tensorflow/java/maven/README.md).
+Details of the release process for Maven Central are in
+[`maven/README.md`](https://www.tensorflow.org/code/tensorflow/java/maven/README.md).
 However, for development, you can push the library built from source to a local
 Maven repository with:
 
@@ -207,14 +89,14 @@ mvn install:install-file \
   -DpomFile=../../bazel-bin/tensorflow/java/pom.xml
 ```
 
-And then rever to this library in a project's `pom.xml` with:
-(replacing 1.0.head with the appropriate version):
+And then refer to this library in a project's `pom.xml` with: (replacing
+VERSION with the appropriate version of TensorFlow):
 
 ```xml
 <dependency>
   <groupId>org.tensorflow</groupId>
   <artifactId>libtensorflow</artifactId>
-  <version>1.0.head</version>
+  <version>VERSION</version>
 </dependency>
 ```
 
diff --git a/tensorflow/java/maven/libtensorflow/pom.xml b/tensorflow/java/maven/libtensorflow/pom.xml
index d1ef7067325a7f8029be0475efe4197c14913bef..e8817c3459dd64dc0c4baa66787aec535269c76b 100644
--- a/tensorflow/java/maven/libtensorflow/pom.xml
+++ b/tensorflow/java/maven/libtensorflow/pom.xml
@@ -6,7 +6,7 @@
   <parent>
     <groupId>org.tensorflow</groupId>
     <artifactId>parentpom</artifactId>
-    <version>1.1.0-rc2</version>
+    <version>1.1.0</version>
     <relativePath>../</relativePath>
   </parent>
   <artifactId>libtensorflow</artifactId>
diff --git a/tensorflow/java/maven/libtensorflow_jni/pom.xml b/tensorflow/java/maven/libtensorflow_jni/pom.xml
index 98e8eba0312a829a2da620ed9cd9bbc49fbfeb1d..65f331979ff23e7a4ee143cd685104234498ecd1 100644
--- a/tensorflow/java/maven/libtensorflow_jni/pom.xml
+++ b/tensorflow/java/maven/libtensorflow_jni/pom.xml
@@ -6,7 +6,7 @@
   <parent>
     <groupId>org.tensorflow</groupId>
     <artifactId>parentpom</artifactId>
-    <version>1.1.0-rc2</version>
+    <version>1.1.0</version>
     <relativePath>../</relativePath>
   </parent>
   <artifactId>libtensorflow_jni</artifactId>
diff --git a/tensorflow/java/maven/pom.xml b/tensorflow/java/maven/pom.xml
index 7bb191c7c70f7020224d24764834b5238a0925d7..59d798effe8df25645056437c132da2318e754b7 100644
--- a/tensorflow/java/maven/pom.xml
+++ b/tensorflow/java/maven/pom.xml
@@ -6,7 +6,7 @@
   <modelVersion>4.0.0</modelVersion>
   <groupId>org.tensorflow</groupId>
   <artifactId>parentpom</artifactId>
-  <version>1.1.0-rc2</version>
+  <version>1.1.0</version>
   <packaging>pom</packaging>
 
   <url>https://www.tensorflow.org</url>
diff --git a/tensorflow/java/maven/proto/pom.xml b/tensorflow/java/maven/proto/pom.xml
index 97feb735a1af4cc5ee122b5975e43883ee329c26..a306dd67691060690346a2cbe807ee43f359b894 100644
--- a/tensorflow/java/maven/proto/pom.xml
+++ b/tensorflow/java/maven/proto/pom.xml
@@ -6,7 +6,7 @@
   <parent>
     <groupId>org.tensorflow</groupId>
     <artifactId>parentpom</artifactId>
-    <version>1.1.0-rc2</version>
+    <version>1.1.0</version>
     <relativePath>../</relativePath>
   </parent>
   <artifactId>proto</artifactId>
diff --git a/tensorflow/java/maven/tensorflow/pom.xml b/tensorflow/java/maven/tensorflow/pom.xml
index 0789cca40b4b59f424da927a05b9b85502e401c3..74adb35ba8d1fc2332acfc8cca41416c02e2c955 100644
--- a/tensorflow/java/maven/tensorflow/pom.xml
+++ b/tensorflow/java/maven/tensorflow/pom.xml
@@ -6,7 +6,7 @@
   <parent>
     <groupId>org.tensorflow</groupId>
     <artifactId>parentpom</artifactId>
-    <version>1.1.0-rc2</version>
+    <version>1.1.0</version>
     <relativePath>../</relativePath>
   </parent>
   <artifactId>tensorflow</artifactId>
diff --git a/tensorflow/java/src/main/java/org/tensorflow/Graph.java b/tensorflow/java/src/main/java/org/tensorflow/Graph.java
index 42d7f484644af9d1ebb0b7b5504430c2ba24fbe8..c08fa9b14574a8a219609c754faaa3a395283fd7 100644
--- a/tensorflow/java/src/main/java/org/tensorflow/Graph.java
+++ b/tensorflow/java/src/main/java/org/tensorflow/Graph.java
@@ -20,7 +20,7 @@ package org.tensorflow;
  *
  * <p>Instances of a Graph are thread-safe.
  *
- * <p><b>WARNING:</b> Resources consumed by the Graph object msut be explicitly freed by invoking
+ * <p><b>WARNING:</b> Resources consumed by the Graph object must be explicitly freed by invoking
  * the {@link #close()} method then the Graph object is no longer needed.
  */
 public final class Graph implements AutoCloseable {
diff --git a/tensorflow/java/src/main/java/org/tensorflow/Operation.java b/tensorflow/java/src/main/java/org/tensorflow/Operation.java
index 48db554e072707ecb33c2008e1af41761f415729..43dbaf125c9b76dcae645b2eb9d8deba42c6c521 100644
--- a/tensorflow/java/src/main/java/org/tensorflow/Operation.java
+++ b/tensorflow/java/src/main/java/org/tensorflow/Operation.java
@@ -70,6 +70,28 @@ public final class Operation {
     }
   }
 
+  /**
+   * Returns the size of the list of Tensors produced by this operation.
+   *
+   * <p>An Operation has multiple named outputs, each of which produces either
+   * a single tensor or a list of tensors. This method returns the size of
+   * the list of tensors for a specific named output of the operation.
+   *
+   * @param name identifier of the list of tensors (of which there may
+   *        be many) produced by this operation.
+   * @returns the size of the list of Tensors produced by this named output.
+   * @throws IllegalArgumentException if this operation has no output
+   *         with the provided name.
+   */
+  public int outputListLength(final String name) {
+    Graph.Reference r = graph.ref();
+    try {
+      return outputListLength(unsafeNativeHandle, name);
+    } finally {
+      r.close();
+    }
+  }
+
   /** Returns a symbolic handle to one of the tensors produced by this operation. */
   public Output output(int idx) {
     return new Output(this, idx);
@@ -108,6 +130,8 @@ public final class Operation {
 
   private static native int numOutputs(long handle);
 
+  private static native int outputListLength(long handle, String name);
+
   private static native long[] shape(long graphHandle, long opHandle, int output);
 
   private static native int dtype(long graphHandle, long opHandle, int output);
diff --git a/tensorflow/java/src/main/java/org/tensorflow/OperationBuilder.java b/tensorflow/java/src/main/java/org/tensorflow/OperationBuilder.java
index cd59cf504a7166540818c36c97a58fdc46214bab..8f2b936eb04aa53ce74e59d42d1f599f57a25830 100644
--- a/tensorflow/java/src/main/java/org/tensorflow/OperationBuilder.java
+++ b/tensorflow/java/src/main/java/org/tensorflow/OperationBuilder.java
@@ -73,6 +73,31 @@ public final class OperationBuilder {
     return this;
   }
 
+  /**
+   * Ensure that the operation does not execute before the control operation does.
+   *
+   * <p>A control input is an Operation that must be executed before
+   * running the operation currently being built.
+   *
+   * <p>For example, an Assert operation may be added as a control
+   * input for this operation. The Assert now behaves as a
+   * pre-condition that will always verify itself before running the
+   * operation.
+   *
+   * @param control operation that must be executed before running this
+   *     operation.
+   * @return the OperationBuilder instance for chaining.
+   */
+  public OperationBuilder addControlInput(Operation control) {
+    Graph.Reference r = graph.ref();
+    try {
+      addControlInput(unsafeNativeHandle, control.getUnsafeNativeHandle());
+    } finally {
+      r.close();
+    }
+    return this;
+  }
+
   public OperationBuilder addInputList(Output[] inputs) {
     Graph.Reference r = graph.ref();
     try {
@@ -244,6 +269,8 @@ public final class OperationBuilder {
 
   private static native void addInputList(long handle, long[] opHandles, int[] indices);
 
+  private static native void addControlInput(long handle, long opHandle);
+
   private static native void setDevice(long handle, String device);
 
   // The names of all the setAttr* family functions below correspond to the C library types, not the
diff --git a/tensorflow/java/src/main/java/org/tensorflow/Session.java b/tensorflow/java/src/main/java/org/tensorflow/Session.java
index 45332bc73f3ecc9148885a1b750d86c4229df86c..0d071e1674e3a7951248742b911f023a0dce0edf 100644
--- a/tensorflow/java/src/main/java/org/tensorflow/Session.java
+++ b/tensorflow/java/src/main/java/org/tensorflow/Session.java
@@ -120,10 +120,15 @@ public final class Session implements AutoCloseable {
     /**
      * Avoid evaluating {@code operation} and substitute {@code t} for the value it produces.
      *
-     * <p>This method is a shorthand for {@code feed(operation, 0, t)}.
+     * @param operation Is either the string name of the operation, in which case this method is a
+     *     shorthand for {@code feed(operation, 0)}, or it is a string of the form
+     *     <tt>operation_name:output_index</tt> , in which case this method acts like {@code
+     *     feed(operation_name, output_index)}. These colon-separated names are commonly used in the
+     *     {@code SignatureDef} protocol buffer messages that are included in {@link
+     *     SavedModelBundle.metaGraphDef()}.
      */
     public Runner feed(String operation, Tensor t) {
-      return feed(operation, 0, t);
+      return feed(parseOutput(operation), t);
     }
 
     /**
@@ -155,10 +160,15 @@ public final class Session implements AutoCloseable {
     /**
      * Make {@link #run()} return the output of {@code operation}.
      *
-     * <p>This method is a shorthand for {@code fetch(operation, 0)}
+     * @param operation Is either the string name of the operation, in which case this method is a
+     *     shorthand for {@code fetch(operation, 0)}, or it is a string of the form
+     *     <tt>operation_name:output_index</tt> , in which case this method acts like {@code
+     *     fetch(operation_name, output_index)}. These colon-separated names are commonly used in
+     *     the {@code SignatureDef} protocol buffer messages that are included in {@link
+     *     SavedModelBundle.metaGraphDef()}.
      */
     public Runner fetch(String operation) {
-      return fetch(operation, 0);
+      return fetch(parseOutput(operation));
     }
 
     /**
@@ -345,6 +355,20 @@ public final class Session implements AutoCloseable {
       return op;
     }
 
+    private Output parseOutput(String opName) {
+      int colon = opName.lastIndexOf(':');
+      if (colon == -1 || colon == opName.length() - 1) {
+        return new Output(operationByName(opName), 0);
+      }
+      try {
+        String op = opName.substring(0, colon);
+        int index = Integer.parseInt(opName.substring(colon + 1));
+        return new Output(operationByName(op), index);
+      } catch (NumberFormatException e) {
+        return new Output(operationByName(opName), 0);
+      }
+    }
+
     private ArrayList<Output> inputs = new ArrayList<Output>();
     private ArrayList<Tensor> inputTensors = new ArrayList<Tensor>();
     private ArrayList<Output> outputs = new ArrayList<Output>();
diff --git a/tensorflow/java/src/main/java/org/tensorflow/TensorFlow.java b/tensorflow/java/src/main/java/org/tensorflow/TensorFlow.java
index ff3113372900abcb2a548385edcbe0cc603934ce..c21214b76311249690237af0753d6e65cbf3e230 100644
--- a/tensorflow/java/src/main/java/org/tensorflow/TensorFlow.java
+++ b/tensorflow/java/src/main/java/org/tensorflow/TensorFlow.java
@@ -20,6 +20,15 @@ public final class TensorFlow {
   /** Returns the version of the underlying TensorFlow runtime. */
   public static native String version();
 
+  /**
+   * All the TensorFlow operations available in this address space.
+   *
+   * @return A serialized representation of an <a
+   *     href="https://www.tensorflow.org/code/tensorflow/core/framework/op_def.proto">OpList</a>
+   *     protocol buffer, which lists all the available TensorFlow operations.
+   */
+  public static native byte[] registeredOpList();
+
   private TensorFlow() {}
 
   /** Load the TensorFlow runtime C library. */
@@ -30,5 +39,4 @@ public final class TensorFlow {
   static {
     init();
   }
-
 }
diff --git a/tensorflow/java/src/main/native/operation_builder_jni.cc b/tensorflow/java/src/main/native/operation_builder_jni.cc
index 5724c54f9116c269efdd842646bfc3da47d57ab0..4c54eecd9b5904c3cdd03f2373ea89bf2f14cf63 100644
--- a/tensorflow/java/src/main/native/operation_builder_jni.cc
+++ b/tensorflow/java/src/main/native/operation_builder_jni.cc
@@ -115,6 +115,20 @@ JNIEXPORT void JNICALL Java_org_tensorflow_OperationBuilder_addInputList(
   TF_AddInputList(d, o.get(), n);
 }
 
+JNIEXPORT void JNICALL Java_org_tensorflow_OperationBuilder_addControlInput(
+    JNIEnv* env, jclass clazz, jlong handle, jlong op_handle) {
+  if (op_handle == 0) {
+    throwException(env, kIllegalStateException,
+                   "control input is not valid, "
+                   "perhaps the Graph containing it has been closed()?");
+    return;
+  }
+  TF_Operation* control = reinterpret_cast<TF_Operation*>(op_handle);
+  TF_OperationDescription* d = requireHandle(env, handle);
+  if (d == nullptr) return;
+  TF_AddControlInput(d, control);
+}
+
 JNIEXPORT void JNICALL Java_org_tensorflow_OperationBuilder_setDevice(
     JNIEnv* env, jclass clazz, jlong handle, jstring device) {
   TF_OperationDescription* d = requireHandle(env, handle);
diff --git a/tensorflow/java/src/main/native/operation_builder_jni.h b/tensorflow/java/src/main/native/operation_builder_jni.h
index ae953c0fd63a473d3336b63ecd2954730cd3aab7..9b64c328203ad406953dea0e9cddcf6f468c043d 100644
--- a/tensorflow/java/src/main/native/operation_builder_jni.h
+++ b/tensorflow/java/src/main/native/operation_builder_jni.h
@@ -55,6 +55,14 @@ JNIEXPORT void JNICALL Java_org_tensorflow_OperationBuilder_addInput(
 JNIEXPORT void JNICALL Java_org_tensorflow_OperationBuilder_addInputList(
     JNIEnv *, jclass, jlong, jlongArray, jintArray);
 
+/*
+ * Class:     org_tensorflow_OperationBuilder
+ * Method:    addControlInput
+ * Signature: (JJ)V
+ */
+JNIEXPORT void JNICALL Java_org_tensorflow_OperationBuilder_addControlInput(
+    JNIEnv *, jclass, jlong, jlong);
+
 /*
  * Class:     org_tensorflow_OperationBuilder
  * Method:    setDevice
diff --git a/tensorflow/java/src/main/native/operation_jni.cc b/tensorflow/java/src/main/native/operation_jni.cc
index 32e59bc0aedf59a12c9b85de79e5d4faef8aaf77..b3d5fc4ec374fe6e5214799581878d94315a7ea7 100644
--- a/tensorflow/java/src/main/native/operation_jni.cc
+++ b/tensorflow/java/src/main/native/operation_jni.cc
@@ -66,6 +66,24 @@ JNIEXPORT jint JNICALL Java_org_tensorflow_Operation_numOutputs(JNIEnv* env,
   return TF_OperationNumOutputs(op);
 }
 
+JNIEXPORT jint JNICALL Java_org_tensorflow_Operation_outputListLength(JNIEnv* env,
+                                                                      jclass clazz,
+                                                                      jlong handle,
+                                                                      jstring name) {
+  TF_Operation* op = requireHandle(env, handle);
+  if (op == nullptr) return 0;
+
+  TF_Status* status = TF_NewStatus();
+
+  const char* cname = env->GetStringUTFChars(name, nullptr);
+  int result = TF_OperationOutputListLength(op, cname, status);
+  env->ReleaseStringUTFChars(name, cname);
+
+  throwExceptionIfNotOK(env, status);
+  TF_DeleteStatus(status);
+  return result;
+}
+
 JNIEXPORT jlongArray JNICALL Java_org_tensorflow_Operation_shape(
     JNIEnv* env, jclass clazz, jlong graph_handle, jlong op_handle,
     jint output_index) {
diff --git a/tensorflow/java/src/main/native/operation_jni.h b/tensorflow/java/src/main/native/operation_jni.h
index 6292a48069c5719760932fb297bf583ce793bf6c..b5d156f7c2749f7fbba3145f79e269f12e53a055 100644
--- a/tensorflow/java/src/main/native/operation_jni.h
+++ b/tensorflow/java/src/main/native/operation_jni.h
@@ -46,6 +46,16 @@ JNIEXPORT jstring JNICALL Java_org_tensorflow_Operation_type(JNIEnv *, jclass,
 JNIEXPORT jint JNICALL Java_org_tensorflow_Operation_numOutputs(JNIEnv *,
                                                                 jclass, jlong);
 
+/*
+ * Class:     org_tensorflow_Operation
+ * Method:    outputListLength
+ * Signature: (JLjava/lang/String;)I
+ */
+JNIEXPORT jint JNICALL Java_org_tensorflow_Operation_outputListLength(JNIEnv *,
+                                                                      jclass,
+                                                                      jlong,
+                                                                      jstring);
+
 /*
  * Class:     org_tensorflow_Operation
  * Method:    shape
diff --git a/tensorflow/java/src/main/native/tensorflow_jni.cc b/tensorflow/java/src/main/native/tensorflow_jni.cc
index 746550adbd24221e122effc11c28a0bb905fb283..c553582e38d34c67d58bf4501d9c1686b29f9a73 100644
--- a/tensorflow/java/src/main/native/tensorflow_jni.cc
+++ b/tensorflow/java/src/main/native/tensorflow_jni.cc
@@ -20,3 +20,13 @@ JNIEXPORT jstring JNICALL Java_org_tensorflow_TensorFlow_version(JNIEnv* env,
                                                                  jclass clazz) {
   return env->NewStringUTF(TF_Version());
 }
+
+JNIEXPORT jbyteArray JNICALL
+Java_org_tensorflow_TensorFlow_registeredOpList(JNIEnv* env, jclass clazz) {
+  TF_Buffer* buf = TF_GetAllOpList();
+  jint length = static_cast<int>(buf->length);
+  jbyteArray ret = env->NewByteArray(length);
+  env->SetByteArrayRegion(ret, 0, length, static_cast<const jbyte*>(buf->data));
+  TF_DeleteBuffer(buf);
+  return ret;
+}
diff --git a/tensorflow/java/src/main/native/tensorflow_jni.h b/tensorflow/java/src/main/native/tensorflow_jni.h
index 102951c472c38d3ce9ad2c4091eae3507fb6f8df..ecd9b15828dea07ab43ac60a0d148ba17a21af11 100644
--- a/tensorflow/java/src/main/native/tensorflow_jni.h
+++ b/tensorflow/java/src/main/native/tensorflow_jni.h
@@ -30,6 +30,14 @@ extern "C" {
 JNIEXPORT jstring JNICALL Java_org_tensorflow_TensorFlow_version(JNIEnv*,
                                                                  jclass);
 
+/*
+ * Class:     org_tensorflow_TensorFlow
+ * Method:    registeredOpList
+ * Signature: ()[B
+ */
+JNIEXPORT jbyteArray JNICALL
+Java_org_tensorflow_TensorFlow_registeredOpList(JNIEnv*, jclass);
+
 #ifdef __cplusplus
 }  // extern "C"
 #endif  // __cplusplus
diff --git a/tensorflow/java/src/test/java/org/tensorflow/OperationBuilderTest.java b/tensorflow/java/src/test/java/org/tensorflow/OperationBuilderTest.java
index 951136180def4401a7c72beb6173ffad54ca5e12..8b468f5beecaafface16b6fa3ad0903a9ef63568 100644
--- a/tensorflow/java/src/test/java/org/tensorflow/OperationBuilderTest.java
+++ b/tensorflow/java/src/test/java/org/tensorflow/OperationBuilderTest.java
@@ -101,7 +101,7 @@ public class OperationBuilderTest {
       assertTrue(hasNode(g, "StringAndBool"));
       // int (TF "int" attributes are 64-bit signed, so a Java long).
       g.opBuilder("RandomUniform", "Int")
-          .addInput(TestUtil.constant(g, "RandomUniformShape", 1))
+          .addInput(TestUtil.constant(g, "RandomUniformShape", new int[]{1}))
           .setAttr("seed", 10)
           .setAttr("dtype", DataType.FLOAT)
           .build();
@@ -149,6 +149,34 @@ public class OperationBuilderTest {
     }
   }
 
+  @Test
+  public void addControlInput() {
+    try (Graph g = new Graph();
+         Session s = new Session(g);
+         Tensor yes = Tensor.create(true);
+         Tensor no = Tensor.create(false)) {
+      Output placeholder = TestUtil.placeholder(g, "boolean", DataType.BOOL);
+      Operation check = g.opBuilder("Assert", "assert")
+          .addInput(placeholder)
+          .addInputList(new Output[]{placeholder})
+          .build();
+      Operation noop = g.opBuilder("NoOp", "noop")
+          .addControlInput(check)
+          .build();
+
+      // No problems when the Assert check succeeds
+      s.runner().feed(placeholder, yes).addTarget(noop).run();
+
+      // Exception thrown by the execution of the Assert node
+      try {
+        s.runner().feed(placeholder, no).addTarget(noop).run();
+        fail("Did not run control operation.");
+      } catch (IllegalArgumentException e) {
+        // expected
+      }
+    }
+  }
+
   private static boolean hasNode(Graph g, String name) {
     return g.operation(name) != null;
   }
diff --git a/tensorflow/java/src/test/java/org/tensorflow/OperationTest.java b/tensorflow/java/src/test/java/org/tensorflow/OperationTest.java
new file mode 100644
index 0000000000000000000000000000000000000000..101839e6d74536d82aa1eaa5b3a9bd11b6462ca6
--- /dev/null
+++ b/tensorflow/java/src/test/java/org/tensorflow/OperationTest.java
@@ -0,0 +1,66 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+package org.tensorflow;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.fail;
+
+import org.junit.Test;
+import org.junit.runner.RunWith;
+import org.junit.runners.JUnit4;
+
+
+/** Unit tests for {@link org.tensorflow.Operation}. */
+@RunWith(JUnit4.class)
+public class OperationTest {
+
+  @Test
+  public void outputListLengthFailsOnInvalidName() {
+    try (Graph g = new Graph()) {
+      Operation op =
+          g.opBuilder("Add", "Add")
+              .addInput(TestUtil.constant(g, "x", 1))
+              .addInput(TestUtil.constant(g, "y", 2))
+              .build();
+      assertEquals(1, op.outputListLength("z"));
+
+      try {
+        op.outputListLength("unknown");
+        fail("Did not catch bad name");
+      } catch (IllegalArgumentException iae) {
+        // expected
+      }
+    }
+  }
+
+  @Test
+  public void outputListLength() {
+    assertEquals(1, split(new int[] {0, 1}, 1));
+    assertEquals(2, split(new int[] {0, 1}, 2));
+    assertEquals(3, split(new int[] {0, 1, 2}, 3));
+  }
+
+  private int split(int[] values, int num_split) {
+    try (Graph g = new Graph()) {
+      return g.opBuilder("Split", "Split")
+          .addInput(TestUtil.constant(g, "split_dim", 0))
+          .addInput(TestUtil.constant(g, "values", values))
+          .setAttr("num_split", num_split)
+          .build()
+          .outputListLength("output");
+    }
+  }
+}
diff --git a/tensorflow/java/src/test/java/org/tensorflow/SessionTest.java b/tensorflow/java/src/test/java/org/tensorflow/SessionTest.java
index 2ccbdf51bc5a8287eb552ebf99925074aaa7fb3d..0d2dbc5b88006d497bfcf8d70c48ed7bb93d5538 100644
--- a/tensorflow/java/src/test/java/org/tensorflow/SessionTest.java
+++ b/tensorflow/java/src/test/java/org/tensorflow/SessionTest.java
@@ -62,6 +62,36 @@ public class SessionTest {
     }
   }
 
+  @Test
+  public void runUsingColonSeparatedNames() {
+    try (Graph g = new Graph();
+        Session s = new Session(g)) {
+      Operation split =
+          g.opBuilder("Split", "Split")
+              .addInput(TestUtil.constant(g, "split_dim", 0))
+              .addInput(TestUtil.constant(g, "value", new int[] {1, 2, 3, 4}))
+              .setAttr("num_split", 2)
+              .build();
+      g.opBuilder("Add", "Add")
+          .addInput(split.output(0))
+          .addInput(split.output(1))
+          .build()
+          .output(0);
+      // Fetch using colon separated names.
+      try (Tensor fetched = s.runner().fetch("Split:1").run().get(0)) {
+        final int[] expected = {3, 4};
+        assertArrayEquals(expected, fetched.copyTo(new int[2]));
+      }
+      // Feed using colon separated names.
+      try (Tensor fed = Tensor.create(new int[] {4, 3, 2, 1});
+          Tensor fetched =
+              s.runner().feed("Split:0", fed).feed("Split:1", fed).fetch("Add").run().get(0)) {
+        final int[] expected = {8, 6, 4, 2};
+        assertArrayEquals(expected, fetched.copyTo(new int[4]));
+      }
+    }
+  }
+
   @Test
   public void runWithMetadata() {
     try (Graph g = new Graph();
diff --git a/tensorflow/java/src/test/java/org/tensorflow/TensorFlowTest.java b/tensorflow/java/src/test/java/org/tensorflow/TensorFlowTest.java
index ff89aeffbbc4042920d476f75c043ea83a9aa490..27e2215f62b2ad30c14005666ade2faa7585b150 100644
--- a/tensorflow/java/src/test/java/org/tensorflow/TensorFlowTest.java
+++ b/tensorflow/java/src/test/java/org/tensorflow/TensorFlowTest.java
@@ -28,4 +28,12 @@ public class TensorFlowTest {
   public void version() {
     assertTrue(TensorFlow.version().length() > 0);
   }
+
+  @Test
+  public void registeredOpList() {
+    // Would be nice to actually parse the output as a tensorflow.OpList protocol buffer message,
+    // but as of May 2017, bazel support for generating Java code from protocol buffer definitions
+    // was not sorted out. Revisit? Till then, at least excercise the code.
+    assertTrue(TensorFlow.registeredOpList().length > 0);
+  }
 }
diff --git a/tensorflow/python/BUILD b/tensorflow/python/BUILD
index 2a61735bb59cb27391c9213c8d5dc82fe54dc572..648745e931e5e697e318ea246afb8722bccc16c8 100644
--- a/tensorflow/python/BUILD
+++ b/tensorflow/python/BUILD
@@ -42,6 +42,7 @@ py_library(
         "//tensorflow/tools/quantization:__pkg__",  # TODO(b/34059704): remove when fixed
     ],
     deps = [
+        ":tf_optimizer",
         ":array_ops",
         ":check_ops",
         ":client",
@@ -80,7 +81,9 @@ py_library(
         ":weights_broadcast_ops",
         "//third_party/py/numpy",
         "//tensorflow/python/estimator:estimator_py",
+        "//tensorflow/python/feature_column:feature_column_py",
         "//tensorflow/python/ops/losses",
+        "//tensorflow/python/ops/distributions",
         "//tensorflow/python/saved_model",
     ] + if_not_windows([
         "//tensorflow/contrib:contrib_py",
@@ -1018,7 +1021,6 @@ tf_gen_op_wrapper_private_py(
     require_shape_functions = True,
     visibility = [
         "//learning/brain/python/ops:__pkg__",
-        "//tensorflow/contrib/lookup:__pkg__",
         "//tensorflow/python/kernel_tests:__pkg__",
     ],
 )
@@ -1053,6 +1055,16 @@ tf_gen_op_wrapper_private_py(
     ],
 )
 
+tf_gen_op_wrapper_private_py(
+    name = "lookup_ops_gen",
+    require_shape_functions = True,
+    visibility = [
+        "//learning/brain/python/ops:__pkg__",
+        "//tensorflow/contrib/lookup:__pkg__",
+        "//tensorflow/python/kernel_tests:__pkg__",
+    ],
+)
+
 tf_gen_op_wrapper_private_py(
     name = "math_ops_gen",
     require_shape_functions = True,
@@ -1320,6 +1332,7 @@ py_library(
     deps = [
         ":array_ops",
         ":clip_ops",
+        ":data_flow_grad",
         ":data_flow_ops",
         ":framework",
         ":framework_for_generated_wrappers",
@@ -1470,6 +1483,20 @@ py_library(
     ],
 )
 
+py_library(
+    name = "lookup_ops",
+    srcs = ["ops/lookup_ops.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":array_ops",
+        ":framework",
+        ":framework_for_generated_wrappers",
+        ":lookup_ops_gen",
+        ":math_ops",
+        "@six_archive//:six",
+    ],
+)
+
 py_library(
     name = "math_grad",
     srcs = ["ops/math_grad.py"],
@@ -1858,6 +1885,7 @@ py_library(
         ":io_ops",
         ":linalg_ops",
         ":logging_ops",
+        ":lookup_ops",
         ":math_grad",
         ":math_ops",
         ":numerics",
@@ -2157,6 +2185,7 @@ cuda_py_test(
     srcs = ["ops/math_ops_test.py"],
     additional_deps = [
         ":array_ops",
+        ":errors",
         ":framework_for_generated_wrappers",
         ":framework_test_lib",
         ":gradients",
@@ -2265,6 +2294,7 @@ py_library(
         ":io_ops",
         ":io_ops_gen",
         ":lib",
+        ":lookup_ops",
         ":math_ops",
         ":platform",
         ":protos_all_py",
@@ -2644,6 +2674,7 @@ py_library(
         ":errors",
         ":pywrap_tensorflow",
         ":util",
+        "@six_archive//:six",
     ],
 )
 
@@ -2987,6 +3018,7 @@ cuda_py_tests(
         ":framework",
         ":framework_for_generated_wrappers",
         ":framework_test_lib",
+        ":lookup_ops",
         ":gradients",
         ":math_ops",
         ":nn_grad",
@@ -3017,7 +3049,7 @@ py_library(
     srcs = ["training/saver_test_utils.py"],
     srcs_version = "PY2AND3",
     deps = [
-        ":data_flow_ops_gen",
+        ":lookup_ops_gen",
         ":training",
     ],
 )
@@ -3580,3 +3612,20 @@ py_test(
         "//third_party/py/numpy",
     ],
 )
+
+py_test(
+    name = "memory_optimizer_test",
+    size = "medium",
+    srcs = [
+        "grappler/memory_optimizer_test.py",
+    ],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":client_testlib",
+        ":framework_for_generated_wrappers",
+        ":math_ops",
+        ":tf_optimizer",
+        "//tensorflow/core:protos_all_py",
+        "//third_party/py/numpy",
+    ],
+)
diff --git a/tensorflow/python/__init__.py b/tensorflow/python/__init__.py
index 864a96ef348741c632f9039db4fe9bc1442352b2..d4f8b8b2f81a9b65b2df311a01fa43d6eeee305d 100644
--- a/tensorflow/python/__init__.py
+++ b/tensorflow/python/__init__.py
@@ -55,6 +55,7 @@ from tensorflow.core.framework.summary_pb2 import *
 from tensorflow.core.framework.attr_value_pb2 import *
 from tensorflow.core.protobuf.meta_graph_pb2 import TensorInfo
 from tensorflow.core.protobuf.config_pb2 import *
+from tensorflow.core.protobuf.tensorflow_server_pb2 import *
 from tensorflow.core.protobuf.rewriter_config_pb2 import *
 from tensorflow.core.util.event_pb2 import *
 
@@ -74,6 +75,7 @@ from tensorflow.python.ops.standard_ops import *
 
 # Bring in subpackages.
 from tensorflow.python.estimator import estimator_lib as estimator
+from tensorflow.python.feature_column import feature_column_lib as feature_column
 from tensorflow.python.layers import layers
 from tensorflow.python.ops import image_ops as image
 from tensorflow.python.ops import metrics
@@ -131,6 +133,7 @@ _allowed_symbols = [
     'AttrValue',
     'AutoParallelOptions',
     'ConfigProto',
+    'ClusterDef',
     'DeviceSpec',
     'Event',
     'GPUOptions',
@@ -210,6 +213,7 @@ _allowed_symbols.extend([
     'compat',
     'errors',
     'estimator',
+    'feature_column',
     'flags',
     'gfile',
     'graph_util',
diff --git a/tensorflow/python/client/session.py b/tensorflow/python/client/session.py
index 7981defe78b0fd9385990548920ecd9893b45e82..700e95c0b9655f4b189f0d65c0f4639fa7bd9c8f 100644
--- a/tensorflow/python/client/session.py
+++ b/tensorflow/python/client/session.py
@@ -594,6 +594,9 @@ class BaseSession(SessionInterface):
       try:
         status = tf_session.TF_NewStatus()
         tf_session.TF_DeleteDeprecatedSession(self._session, status)
+      except AttributeError:
+        # 'NoneType' object has no attribute 'TF_NewStatus'
+        pass
       finally:
         if status is not None:
           tf_session.TF_DeleteStatus(status)
diff --git a/tensorflow/python/client/session_test.py b/tensorflow/python/client/session_test.py
index e6f1c57c7db7f15afe77bdc3e4c5f5f2ee2b5d16..5a42c50fff71bcba6738caac68b4d48d9e2c3338 100644
--- a/tensorflow/python/client/session_test.py
+++ b/tensorflow/python/client/session_test.py
@@ -29,7 +29,9 @@ import six
 from six.moves import xrange  # pylint: disable=redefined-builtin
 
 from tensorflow.core.lib.core import error_codes_pb2
+from tensorflow.core.protobuf import cluster_pb2
 from tensorflow.core.protobuf import config_pb2
+from tensorflow.core.protobuf import rewriter_config_pb2
 from tensorflow.python.client import session
 from tensorflow.python.framework import common_shapes
 from tensorflow.python.framework import constant_op
@@ -44,6 +46,7 @@ from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import data_flow_ops
 from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import random_ops
 from tensorflow.python.ops import state_ops
 from tensorflow.python.ops import variables
 from tensorflow.python.platform import googletest
@@ -1431,6 +1434,55 @@ class SessionTest(test_util.TensorFlowTestCase):
                                  'You must feed a value for placeholder'):
       sess.partial_run(handle, fetches[0])
 
+  def runTestPartialRunUnspecifiedFeed(self, sess):
+    a = array_ops.placeholder(dtypes.float32, shape=[])
+    b = array_ops.placeholder(dtypes.float32, shape=[])
+    c = array_ops.placeholder(dtypes.float32, shape=[])
+    r1 = math_ops.add(a, b)
+
+    h = sess.partial_run_setup([r1], [a, b])
+    with self.assertRaisesRegexp(errors.InvalidArgumentError,
+                                 'was not specified in partial_run_setup.$'):
+      sess.partial_run(h, r1, feed_dict={a: 1, b: 2, c: 3})
+
+  def runTestPartialRunUnspecifiedFetch(self, sess):
+    a = array_ops.placeholder(dtypes.float32, shape=[])
+    b = array_ops.placeholder(dtypes.float32, shape=[])
+    c = array_ops.placeholder(dtypes.float32, shape=[])
+    r1 = math_ops.add(a, b)
+    r2 = math_ops.multiply(a, c)
+
+    h = sess.partial_run_setup([r1], [a, b, c])
+    with self.assertRaisesRegexp(errors.InvalidArgumentError,
+                                 'was not specified in partial_run_setup.$'):
+      sess.partial_run(h, r2, feed_dict={a: 1, c: 3})
+
+  def runTestPartialRunAlreadyFed(self, sess):
+    a = array_ops.placeholder(dtypes.float32, shape=[])
+    b = array_ops.placeholder(dtypes.float32, shape=[])
+    c = array_ops.placeholder(dtypes.float32, shape=[])
+    r1 = math_ops.add(a, b)
+    r2 = math_ops.multiply(a, c)
+
+    h = sess.partial_run_setup([r1, r2], [a, b, c])
+    sess.partial_run(h, r1, feed_dict={a: 1, b: 2})
+    with self.assertRaisesRegexp(errors.InvalidArgumentError,
+                                 'has already been fed.$'):
+      sess.partial_run(h, r2, feed_dict={a: 1, c: 3})
+
+  def runTestPartialRunAlreadyFetched(self, sess):
+    a = array_ops.placeholder(dtypes.float32, shape=[])
+    b = array_ops.placeholder(dtypes.float32, shape=[])
+    c = array_ops.placeholder(dtypes.float32, shape=[])
+    r1 = math_ops.add(a, b)
+    r2 = math_ops.multiply(a, c)
+
+    h = sess.partial_run_setup([r1, r2], [a, b, c])
+    sess.partial_run(h, r1, feed_dict={a: 1, b: 2})
+    with self.assertRaisesRegexp(errors.InvalidArgumentError,
+                                 'has already been fetched.$'):
+      sess.partial_run(h, r1, feed_dict={c: 3})
+
   def testInvalidPartialRunSetup(self):
     sess = session.Session()
     x = array_ops.placeholder(dtypes.float32, shape=[])
@@ -1457,6 +1509,18 @@ class SessionTest(test_util.TensorFlowTestCase):
   def testPartialRunMissingPlaceholderFeedExceptionDirect(self):
     self.runTestPartialRunMissingPlaceholderFeedException(session.Session())
 
+  def testPartialRunUnspecifiedFeedDirect(self):
+    self.runTestPartialRunUnspecifiedFeed(session.Session())
+
+  def testPartialRunUnspecifiedFetchDirect(self):
+    self.runTestPartialRunUnspecifiedFetch(session.Session())
+
+  def testPartialRunAlreadyFedDirect(self):
+    self.runTestPartialRunAlreadyFed(session.Session())
+
+  def testPartialRunAlreadyFetchedDirect(self):
+    self.runTestPartialRunAlreadyFetched(session.Session())
+
   def testPartialRunDist(self):
     server = server_lib.Server.create_local_server()
     self.runTestPartialRun(session.Session(server.target))
@@ -1482,6 +1546,22 @@ class SessionTest(test_util.TensorFlowTestCase):
     self.runTestPartialRunMissingPlaceholderFeedException(
         session.Session(server.target))
 
+  def testPartialRunUnspecifiedFeedDist(self):
+    server = server_lib.Server.create_local_server()
+    self.runTestPartialRunUnspecifiedFeed(session.Session(server.target))
+
+  def testPartialRunUnspecifiedFetchDist(self):
+    server = server_lib.Server.create_local_server()
+    self.runTestPartialRunUnspecifiedFetch(session.Session(server.target))
+
+  def testPartialRunAlreadyFedDist(self):
+    server = server_lib.Server.create_local_server()
+    self.runTestPartialRunAlreadyFed(session.Session(server.target))
+
+  def testPartialRunAlreadyFetchedDist(self):
+    server = server_lib.Server.create_local_server()
+    self.runTestPartialRunAlreadyFetched(session.Session(server.target))
+
   def testFeedDictKeyException(self):
     with session.Session() as sess:
       a = constant_op.constant(1.0, dtypes.float32, name='a')
@@ -1712,7 +1792,7 @@ class SessionTest(test_util.TensorFlowTestCase):
       with CaptureStderr() as log:
         sess.run(c)
       # Ensure that we did log device placement.
-      self.assertTrue('/job:local/replica:0/task:0/cpu:0' in str(log))
+      self.assertTrue('/job:local/replica:0/task:0/cpu:0' in str(log), str(log))
 
   def testLocalMasterSessionTimeout(self):
     # Test that the timeout passed in a config to the session works correctly.
@@ -1757,6 +1837,307 @@ class SessionTest(test_util.TensorFlowTestCase):
     server = server_lib.Server.create_local_server()
     self.runTestBuildGraphError(session.Session(server.target))
 
+  def testClusterSpecPropagationSimple(self):
+    server1 = server_lib.Server.create_local_server()
+    server2 = server_lib.Server.create_local_server()
+    cluster_def = cluster_pb2.ClusterDef()
+    job = cluster_def.job.add()
+    job.name = 'worker'
+    job.tasks[0] = server1.target[len('grpc://'):]
+    job.tasks[1] = server2.target[len('grpc://'):]
+    config = config_pb2.ConfigProto(cluster_def=cluster_def)
+
+    const = constant_op.constant(17)
+    sess = session.Session(server1.target, config=config)
+    output = sess.run(const)
+    self.assertEqual(17, output)
+
+  def testClusterSpecPropagationWorker2Placement(self):
+    server1 = server_lib.Server.create_local_server()
+    server2 = server_lib.Server.create_local_server()
+    cluster_def = cluster_pb2.ClusterDef()
+    job = cluster_def.job.add()
+    job.name = 'worker'
+    job.tasks[0] = server1.target[len('grpc://'):]
+    job.tasks[1] = server2.target[len('grpc://'):]
+    config = config_pb2.ConfigProto(cluster_def=cluster_def)
+
+    with ops.Graph().as_default() as g, ops.device('/job:worker/task:1'):
+      const = constant_op.constant(17)
+    sess = session.Session(server1.target, config=config, graph=g)
+    run_options = config_pb2.RunOptions(
+        trace_level=config_pb2.RunOptions.FULL_TRACE)
+    run_metadata = config_pb2.RunMetadata()
+    output = sess.run(const, options=run_options, run_metadata=run_metadata)
+    self.assertEqual(17, output)
+    self.assertEqual(1,
+                     len([
+                         node_stats
+                         for dev_stats in run_metadata.step_stats.dev_stats
+                         for node_stats in dev_stats.node_stats
+                         if '/job:worker/replica:0/task:1/device:CPU:0' ==
+                         dev_stats.device and 'Const' == node_stats.node_name
+                     ]))
+
+  def testClusterSpecPropagationWorker1Placement(self):
+    server1 = server_lib.Server.create_local_server()
+    server2 = server_lib.Server.create_local_server()
+    cluster_def = cluster_pb2.ClusterDef()
+    job = cluster_def.job.add()
+    job.name = 'worker'
+    job.tasks[0] = server1.target[len('grpc://'):]
+    job.tasks[1] = server2.target[len('grpc://'):]
+    config = config_pb2.ConfigProto(cluster_def=cluster_def)
+
+    with ops.Graph().as_default() as g, ops.device('/job:worker/task:0'):
+      const = constant_op.constant(17)
+    sess = session.Session(server1.target, config=config, graph=g)
+    output = sess.run(const)
+    self.assertEqual(17, output)
+
+  def testClusterSpecPropagationThreeServers2Graphs(self):
+    """Boots 3 servers, creates 2 sessions, ensures appropriate operations.
+
+    We create 2 clusterspecs:
+     1. server2 as the master, server1 as a worker
+     2. server2 as the master, server3 as a worker
+
+    We ensure that variables on the workers are independent.
+    """
+    server1 = server_lib.Server.create_local_server()
+    server2 = server_lib.Server.create_local_server()
+    server3 = server_lib.Server.create_local_server()
+    cluster_def1 = cluster_pb2.ClusterDef()
+    job1 = cluster_def1.job.add()
+    job1.name = 'worker1'
+    job1.tasks[0] = server2.target[len('grpc://'):]
+    job1.tasks[1] = server1.target[len('grpc://'):]
+
+    cluster_def2 = cluster_pb2.ClusterDef()
+    job2 = cluster_def2.job.add()
+    job2.name = 'worker2'
+    job2.tasks[0] = server2.target[len('grpc://'):]
+    job2.tasks[1] = server3.target[len('grpc://'):]
+
+    config1 = config_pb2.ConfigProto(cluster_def=cluster_def1)
+    config2 = config_pb2.ConfigProto(cluster_def=cluster_def2)
+
+    with ops.Graph().as_default() as g1:
+      with ops.device('/job:worker1/task:1'):
+        var1 = variables.Variable(array_ops.zeros([2]), name='var1')
+        update_op1 = state_ops.assign_add(
+            var1, array_ops.ones([2]), name='var1_assign_add')
+        init1 = variables.global_variables_initializer()
+
+    with ops.Graph().as_default() as g2:
+      with ops.device('/job:worker2/task:1'):
+        var2 = variables.Variable(array_ops.zeros([2]), name='var2')
+        update_op2 = state_ops.assign_add(
+            var2, array_ops.ones([2]), name='var2_assign_add')
+        init2 = variables.global_variables_initializer()
+
+    sess1 = session.Session(server2.target, graph=g1, config=config1)
+    sess2 = session.Session(server2.target, graph=g2, config=config2)
+
+    init1.run(session=sess1)
+    init2.run(session=sess2)
+
+    expected_zeros = np.zeros([2])
+    expected_ones = np.ones([2])
+
+    self.assertAllEqual(expected_zeros, sess1.run(var1))
+    self.assertAllEqual(expected_zeros, sess2.run(var2))
+
+    self.assertAllEqual(expected_ones, sess1.run(update_op1))
+    self.assertAllEqual(expected_ones, sess1.run(var1))
+    self.assertAllEqual(expected_zeros, sess2.run(var2))
+    self.assertAllEqual(expected_ones, sess2.run(update_op2))
+    self.assertAllEqual(expected_ones + expected_ones, sess1.run(update_op1))
+    self.assertAllEqual(expected_ones, sess2.run(var2))
+    self.assertAllEqual(expected_ones + expected_ones, sess1.run(var1))
+
+  def testClusterSpecPropagationThreeServers(self):
+    """Boots 3 servers, creates 2 sessions, ensures appropriate operations.
+
+    We create 2 clusterspecs:
+     1. server2 as the master, server1 as a worker
+     2. server2 as the master, server3 as a worker
+
+    We ensure that variables on the workers are independent.
+    """
+    server1 = server_lib.Server.create_local_server()
+    server2 = server_lib.Server.create_local_server()
+    server3 = server_lib.Server.create_local_server()
+    cluster_def1 = cluster_pb2.ClusterDef()
+    job1 = cluster_def1.job.add()
+    job1.name = 'worker'
+    job1.tasks[0] = server2.target[len('grpc://'):]
+    job1.tasks[1] = server1.target[len('grpc://'):]
+
+    cluster_def2 = cluster_pb2.ClusterDef()
+    job2 = cluster_def2.job.add()
+    job2.name = 'worker'
+    job2.tasks[0] = server2.target[len('grpc://'):]
+    job2.tasks[1] = server3.target[len('grpc://'):]
+
+    config1 = config_pb2.ConfigProto(cluster_def=cluster_def1)
+    config2 = config_pb2.ConfigProto(cluster_def=cluster_def2)
+
+    with ops.device('/job:worker/task:1'):
+      var = variables.Variable(array_ops.zeros([2]), name='var')
+      feed = array_ops.placeholder(dtypes.float32, shape=(2))
+      update_op = var.assign_add(feed)
+
+    sess1 = session.Session(server2.target, config=config1)
+    sess2 = session.Session(server2.target, config=config2)
+
+    variables.global_variables_initializer().run(session=sess1)
+    variables.global_variables_initializer().run(session=sess2)
+
+    expected_zeros = np.zeros([2])
+    expected_ones = np.ones([2])
+
+    self.assertAllEqual(expected_zeros, sess1.run(var))
+    self.assertAllEqual(expected_zeros, sess2.run(var))
+    self.assertAllEqual(expected_ones,
+                        sess1.run(update_op, feed_dict={feed: expected_ones}))
+    self.assertAllEqual(expected_ones, sess1.run(var))
+    self.assertAllEqual(expected_zeros, sess2.run(var))
+    self.assertAllEqual(expected_ones,
+                        sess2.run(update_op, feed_dict={feed: expected_ones}))
+    self.assertAllEqual(expected_ones + expected_ones,
+                        sess1.run(update_op, feed_dict={feed: expected_ones}))
+    self.assertAllEqual(expected_ones, sess2.run(var))
+    self.assertAllEqual(expected_ones + expected_ones, sess1.run(var))
+
+  def testClusterSpecPropagationThreeServersOneCluster(self):
+    """Boots 3 servers, ensures appropriate communication across workers.
+
+    Additionally, in this cluster, we ensure the master is not the 0-th worker.
+
+    Note: this test only uses one session.
+    """
+    server1 = server_lib.Server.create_local_server()
+    server2 = server_lib.Server.create_local_server()
+    server3 = server_lib.Server.create_local_server()
+    cluster_def = cluster_pb2.ClusterDef()
+    job = cluster_def.job.add()
+    job.name = 'worker'
+    job.tasks[0] = server3.target[len('grpc://'):]
+    job.tasks[1] = server2.target[len('grpc://'):]
+    job.tasks[2] = server1.target[len('grpc://'):]
+    config = config_pb2.ConfigProto(cluster_def=cluster_def)
+
+    # Add ops to the devices in non-linear order.
+
+    with ops.device('/job:worker/task:1'):
+      feed1 = array_ops.placeholder(dtypes.float32, shape=(2))
+      const1 = constant_op.constant(2.0)
+      mul1 = const1 * feed1
+
+    with ops.device('/job:worker/task:2'):
+      feed2 = array_ops.placeholder(dtypes.float32, shape=(2))
+      const2 = constant_op.constant(2.0)
+      mul2 = const2 * feed2
+
+    with ops.device('/job:worker/task:0'):
+      feed0 = array_ops.placeholder(dtypes.float32, shape=(2))
+      const0 = constant_op.constant(2.0)
+      mul0 = const0 * feed0
+
+    sum_op = mul0 + mul1 + mul2
+
+    ones = np.ones([2])
+    run_options = config_pb2.RunOptions(
+        trace_level=config_pb2.RunOptions.FULL_TRACE)
+    run_metadata = config_pb2.RunMetadata()
+
+    # Run!
+    with session.Session(server1.target, config=config) as sess:
+      output = sess.run(
+          sum_op,
+          options=run_options,
+          run_metadata=run_metadata,
+          feed_dict={feed1: ones,
+                     feed2: ones,
+                     feed0: ones})
+      self.assertAllEqual(6 * ones, output)
+
+      self.assertEqual(
+          3,
+          len([
+              dev_stats.device
+              for dev_stats in run_metadata.step_stats.dev_stats
+              for node_stats in dev_stats.node_stats
+              if '/job:worker/replica:0/task:' in dev_stats.device and
+              node_stats.node_name.startswith('Const')
+          ]), run_metadata)
+
+  def testClusterSpecPropagationPartialRun(self):
+    """Test successful partial run with ClusterSpec propagation."""
+    server1 = server_lib.Server.create_local_server()
+    server2 = server_lib.Server.create_local_server()
+
+    cluster_def = cluster_pb2.ClusterDef()
+    job = cluster_def.job.add()
+    job.name = 'worker'
+    job.tasks[0] = server1.target[len('grpc://'):]
+    job.tasks[1] = server2.target[len('grpc://'):]
+    config = config_pb2.ConfigProto(cluster_def=cluster_def)
+
+    with ops.device('/job:worker/task:0'):
+      a = array_ops.placeholder(dtypes.float32, shape=[])
+    with ops.device('/job:worker/task:1'):
+      b = array_ops.placeholder(dtypes.float32, shape=[])
+      c = array_ops.placeholder(dtypes.float32, shape=[])
+      r1 = math_ops.add(a, b)
+    with ops.device('/job:worker/task:0'):
+      r2 = math_ops.multiply(r1, c)
+
+    with session.Session(server1.target, config=config) as sess:
+      h = sess.partial_run_setup([r1, r2], [a, b, c])
+      res = sess.partial_run(h, r1, feed_dict={a: 1, b: 2})
+      self.assertEqual(3, res)
+      res = sess.partial_run(h, r2, feed_dict={c: 3})
+      self.assertEqual(9, res)
+
+  def testGraphOptimizer(self):
+    rewrite_options = rewriter_config_pb2.RewriterConfig(
+        disable_model_pruning=False, constant_folding=True)
+    graph_options = config_pb2.GraphOptions(
+        rewrite_options=rewrite_options, build_cost_model=1)
+    config = config_pb2.ConfigProto(graph_options=graph_options)
+
+    with ops.Graph().as_default() as g:
+      r1 = random_ops.random_normal(shape=[2, 3], name='R1')
+      r2 = random_ops.random_normal(shape=[2, 3], name='R2')
+      copy1 = array_ops.stop_gradient(r1)
+      copy2 = array_ops.identity(r2)
+      result = copy1 + copy2
+
+      with session.Session(graph=g, config=config) as sess:
+        metadata = config_pb2.RunMetadata()
+        sess.run(result, run_metadata=metadata)
+
+    # Check that we optimized the graph by looking at the cost model: the add
+    # node should have been reconnected directly to the R1 and R2 nodes.
+    found_valid_nodes = 0
+    for node in metadata.cost_graph.node:
+      if node.name == 'R1':
+        r1_cost_id = node.id
+        found_valid_nodes += 1
+      if node.name == 'R2':
+        r2_cost_id = node.id
+        found_valid_nodes += 1
+      if node.name == 'add':
+        if node.input_info[0].preceding_node == r1_cost_id:
+          self.assertEqual(node.input_info[1].preceding_node, r2_cost_id)
+          found_valid_nodes += 1
+        elif node.input_info[0].preceding_node == r2_cost_id:
+          self.assertEqual(node.input_info[1].preceding_node, r1_cost_id)
+          found_valid_nodes += 1
+    self.assertEqual(3, found_valid_nodes)
+
 
 if __name__ == '__main__':
   googletest.main()
diff --git a/tensorflow/python/client/tf_session.i b/tensorflow/python/client/tf_session.i
index 7c6f1cdd5e1ff7593b8a50dff0ca803acb05785d..902f02a256b59d23e2a8ca6f228ad1f89f0f9d75 100644
--- a/tensorflow/python/client/tf_session.i
+++ b/tensorflow/python/client/tf_session.i
@@ -156,35 +156,56 @@ tensorflow::ImportNumpy();
       reinterpret_cast<const char*>($1.data), $1.length);
 }
 
-// Include the functions from c_api.h, except TF_Run.
-%ignoreall
-%unignore TF_Code;
-%unignore TF_Status;
-%unignore TF_Buffer;
-%unignore TF_NewBuffer;
-%unignore TF_NewBufferFromString;
-%unignore TF_DeleteBuffer;
-%unignore TF_GetBuffer;
-%unignore TF_NewStatus;
-%unignore TF_DeleteStatus;
-%unignore TF_GetCode;
-%unignore TF_Message;
-%unignore TF_SessionOptions;
+%inline %{
+// Helper function to convert a Python list of Tensors to a C++ vector of
+// TF_Outputs.
+//
+// Caller should have already checked that `py_tensor_list` is a list (this
+// isn't done in this function to allow for function-specific error messages)
+void PyTensorListToVector(PyObject* py_tensor_list,
+                          std::vector<TF_Output>* vec) {
+  size_t size = PyList_Size(py_tensor_list);
+  for (int i = 0; i < size; ++i) {
+    PyObject* item = PyList_GetItem(py_tensor_list, i);
+    TF_Output* input_ptr;
+    SWIG_ConvertPtr(item, reinterpret_cast<void**>(&input_ptr),
+                    SWIGTYPE_p_TF_Output, 0);
+    vec->push_back(*input_ptr);
+  }
+}
+%}
+
+// Converts input Python list of wrapped TF_Outputs into a single array
+%typemap(in) (const TF_Output* inputs, int num_inputs)
+    (std::vector<TF_Output> inputs) {
+  if (!PyList_Check($input)) {
+    SWIG_exception_fail(
+        SWIG_TypeError, "$symname: expected Python list of wrapped TF_Outputs");
+  }
+  PyTensorListToVector($input, &inputs);
+  $1 = inputs.data();
+  $2 = inputs.size();
+}
+
+// TODO(skyewm): SWIG emits a warning for the const char* in TF_WhileParams,
+// skip for now
+%ignore TF_WhileParams;
+%ignore TF_NewWhile;
+%ignore TF_FinishWhile;
+%ignore TF_AbortWhile;
+
+// These are defined below, avoid duplicate definitions
+%ignore TF_Run;
+%ignore TF_PRun;
+%ignore TF_PRunSetup;
+
 %rename("_TF_SetTarget") TF_SetTarget;
 %rename("_TF_SetConfig") TF_SetConfig;
 %rename("_TF_NewSessionOptions") TF_NewSessionOptions;
-%unignore TF_DeleteSessionOptions;
-%unignore TF_NewDeprecatedSession;
-%unignore TF_CloseDeprecatedSession;
-%unignore TF_DeleteDeprecatedSession;
-%unignore TF_ExtendGraph;
-%unignore TF_NewLibrary;
-%unignore TF_LoadLibrary;
-%unignore TF_DeleteLibraryHandle;
-%unignore TF_GetOpList;
+
 %include "tensorflow/c/c_api.h"
-%ignoreall
 
+%ignoreall
 %insert("python") %{
   def TF_NewSessionOptions(target=None, config=None):
     # NOTE: target and config are validated in the session constructor.
diff --git a/tensorflow/python/debug/cli/cli_shared.py b/tensorflow/python/debug/cli/cli_shared.py
index 3deb6dbad652b73cfeafda79bec546716d6cd275..9164e18bcf582d55502b0f9530dfa51ea416b00e 100644
--- a/tensorflow/python/debug/cli/cli_shared.py
+++ b/tensorflow/python/debug/cli/cli_shared.py
@@ -44,6 +44,11 @@ COLOR_RED = "red"
 COLOR_WHITE = "white"
 COLOR_YELLOW = "yellow"
 
+TIME_UNIT_US = "us"
+TIME_UNIT_MS = "ms"
+TIME_UNIT_S = "s"
+TIME_UNITS = [TIME_UNIT_US, TIME_UNIT_MS, TIME_UNIT_S]
+
 
 def bytes_to_readable_str(num_bytes, include_b=False):
   """Generate a human-readable string representing number of bytes.
@@ -75,12 +80,32 @@ def bytes_to_readable_str(num_bytes, include_b=False):
   return result
 
 
-def time_to_readable_str(value):
-  if not value:
+def time_to_readable_str(value_us, force_time_unit=None):
+  """Convert time value to human-readable string.
+
+  Args:
+    value_us: time value in microseconds.
+    force_time_unit: force the output to use the specified time unit. Must be
+      in TIME_UNITS.
+
+  Returns:
+    Human-readable string representation of the time value.
+
+  Raises:
+    ValueError: if force_time_unit value is not in TIME_UNITS.
+  """
+  if not value_us:
     return "0"
-  suffixes = ["us", "ms", "s"]
-  order = min(len(suffixes) - 1, int(math.log(value, 10) / 3))
-  return "{:.3g}{}".format(value / math.pow(10.0, 3*order), suffixes[order])
+  if force_time_unit:
+    if force_time_unit not in TIME_UNITS:
+      raise ValueError("Invalid time unit: %s" % force_time_unit)
+    order = TIME_UNITS.index(force_time_unit)
+    time_unit = force_time_unit
+    return "{:.10g}{}".format(value_us / math.pow(10.0, 3*order), time_unit)
+  else:
+    order = min(len(TIME_UNITS) - 1, int(math.log(value_us, 10) / 3))
+    time_unit = TIME_UNITS[order]
+    return "{:.3g}{}".format(value_us / math.pow(10.0, 3*order), time_unit)
 
 
 def parse_ranges_highlight(ranges_string):
diff --git a/tensorflow/python/debug/cli/cli_shared_test.py b/tensorflow/python/debug/cli/cli_shared_test.py
index fde1d66998f969c3485ebd8b763dae0788310d60..647bbd5f0f29591a641028342d64b2786e37610c 100644
--- a/tensorflow/python/debug/cli/cli_shared_test.py
+++ b/tensorflow/python/debug/cli/cli_shared_test.py
@@ -84,6 +84,26 @@ class TimeToReadableStrTest(test_util.TensorFlowTestCase):
   def testSecondTime(self):
     self.assertEqual("40s", cli_shared.time_to_readable_str(40e6))
 
+  def testForceTimeUnit(self):
+    self.assertEqual("40s",
+                     cli_shared.time_to_readable_str(
+                         40e6, force_time_unit=cli_shared.TIME_UNIT_S))
+    self.assertEqual("40000ms",
+                     cli_shared.time_to_readable_str(
+                         40e6, force_time_unit=cli_shared.TIME_UNIT_MS))
+    self.assertEqual("40000000us",
+                     cli_shared.time_to_readable_str(
+                         40e6, force_time_unit=cli_shared.TIME_UNIT_US))
+    self.assertEqual("4e-05s",
+                     cli_shared.time_to_readable_str(
+                         40, force_time_unit=cli_shared.TIME_UNIT_S))
+    self.assertEqual("0",
+                     cli_shared.time_to_readable_str(
+                         0, force_time_unit=cli_shared.TIME_UNIT_S))
+
+    with self.assertRaisesRegexp(ValueError, r"Invalid time unit: ks"):
+      cli_shared.time_to_readable_str(100, force_time_unit="ks")
+
 
 class GetRunStartIntroAndDescriptionTest(test_util.TensorFlowTestCase):
 
diff --git a/tensorflow/python/debug/cli/debugger_cli_common.py b/tensorflow/python/debug/cli/debugger_cli_common.py
index 64a22e6be4a9512ec22da465b4a26a9dc0b553f6..889fc6a8f64eea9d80c6bf427f9b29ed35311653 100644
--- a/tensorflow/python/debug/cli/debugger_cli_common.py
+++ b/tensorflow/python/debug/cli/debugger_cli_common.py
@@ -648,7 +648,7 @@ class CommandHandlerRegistry(object):
         3) the handler is found for the prefix, but it fails to return a
           RichTextLines or raise any exception.
       CommandLineExit:
-        If the command handler raises this type of exception, tihs method will
+        If the command handler raises this type of exception, this method will
         simply pass it along.
     """
     if not prefix:
diff --git a/tensorflow/python/debug/cli/profile_analyzer_cli.py b/tensorflow/python/debug/cli/profile_analyzer_cli.py
index 42440521eba9694d763cc1d76f85ce602007c5aa..3837717767e04e5c906d4063e413f1c6140304db 100644
--- a/tensorflow/python/debug/cli/profile_analyzer_cli.py
+++ b/tensorflow/python/debug/cli/profile_analyzer_cli.py
@@ -50,43 +50,58 @@ class ProfileDatum(object):
     self.node_exec_stats = node_exec_stats
     self.file_line = file_line
     self.op_type = op_type
+    self.start_time = self.node_exec_stats.all_start_micros
     self.op_time = (self.node_exec_stats.op_end_rel_micros -
                     self.node_exec_stats.op_start_rel_micros)
 
   @property
   def exec_time(self):
-    """Measures compute function exection time plus pre- and post-processing."""
+    """Measures compute function execution time plus pre- and post-processing."""
     return self.node_exec_stats.all_end_rel_micros
 
 
 class ProfileDataTableView(object):
   """Table View of profiling data."""
 
-  def __init__(self, profile_datum_list):
+  def __init__(self, profile_datum_list, time_unit=cli_shared.TIME_UNIT_US):
     """Constructor.
 
     Args:
       profile_datum_list: List of `ProfileDatum` objects.
+      time_unit: must be in cli_shared.TIME_UNITS.
     """
     self._profile_datum_list = profile_datum_list
+    self.formatted_start_time = [
+        datum.start_time for datum in profile_datum_list]
     self.formatted_op_time = [
-        cli_shared.time_to_readable_str(datum.op_time)
+        cli_shared.time_to_readable_str(datum.op_time,
+                                        force_time_unit=time_unit)
         for datum in profile_datum_list]
     self.formatted_exec_time = [
         cli_shared.time_to_readable_str(
-            datum.node_exec_stats.all_end_rel_micros)
+            datum.node_exec_stats.all_end_rel_micros,
+            force_time_unit=time_unit)
         for datum in profile_datum_list]
-    self._column_sort_ids = [SORT_OPS_BY_OP_NAME, SORT_OPS_BY_OP_TIME,
-                             SORT_OPS_BY_EXEC_TIME, SORT_OPS_BY_LINE]
+
+    self._column_names = ["Node",
+                          "Start Time (us)",
+                          "Op Time (%s)" % time_unit,
+                          "Exec Time (%s)" % time_unit,
+                          "Filename:Lineno(function)"]
+    self._column_sort_ids = [SORT_OPS_BY_OP_NAME, SORT_OPS_BY_START_TIME,
+                             SORT_OPS_BY_OP_TIME, SORT_OPS_BY_EXEC_TIME,
+                             SORT_OPS_BY_LINE]
 
   def value(self, row, col):
     if col == 0:
       return self._profile_datum_list[row].node_exec_stats.node_name
     elif col == 1:
-      return self.formatted_op_time[row]
+      return self.formatted_start_time[row]
     elif col == 2:
-      return self.formatted_exec_time[row]
+      return self.formatted_op_time[row]
     elif col == 3:
+      return self.formatted_exec_time[row]
+    elif col == 4:
       return self._profile_datum_list[row].file_line
     else:
       raise IndexError("Invalid column index %d." % col)
@@ -95,10 +110,10 @@ class ProfileDataTableView(object):
     return len(self._profile_datum_list)
 
   def column_count(self):
-    return 4
+    return len(self._column_names)
 
   def column_names(self):
-    return ["Node", "Op Time", "Exec Time", "Filename:Lineno(function)"]
+    return self._column_names
 
   def column_sort_id(self, col):
     return self._column_sort_ids[col]
@@ -246,6 +261,12 @@ class ProfileAnalyzer(object):
         dest="reverse",
         action="store_true",
         help="sort the data in reverse (descending) order")
+    ap.add_argument(
+        "--time_unit",
+        dest="time_unit",
+        type=str,
+        default=cli_shared.TIME_UNIT_US,
+        help="Time unit (" + " | ".join(cli_shared.TIME_UNITS) + ")")
 
     self._arg_parsers["list_profile"] = ap
 
@@ -294,7 +315,7 @@ class ProfileAnalyzer(object):
         output.extend(
             self._get_list_profile_lines(
                 device_stats.device, index, device_count,
-                profile_data, parsed.sort_by, parsed.reverse))
+                profile_data, parsed.sort_by, parsed.reverse, parsed.time_unit))
     return output
 
   def _get_profile_data_generator(self):
@@ -328,7 +349,7 @@ class ProfileAnalyzer(object):
 
   def _get_list_profile_lines(
       self, device_name, device_index, device_count,
-      profile_datum_list, sort_by, sort_reverse):
+      profile_datum_list, sort_by, sort_reverse, time_unit):
     """Get `RichTextLines` object for list_profile command for a given device.
 
     Args:
@@ -341,20 +362,24 @@ class ProfileAnalyzer(object):
           SORT_OPS_BY_MEMORY or SORT_OPS_BY_LINE.
       sort_reverse: (bool) Whether to sort in descending instead of default
           (ascending) order.
+      time_unit: time unit, must be in cli_shared.TIME_UNITS.
 
     Returns:
       `RichTextLines` object containing a table that displays profiling
       information for each op.
     """
-    profile_data = ProfileDataTableView(profile_datum_list)
+    profile_data = ProfileDataTableView(profile_datum_list, time_unit=time_unit)
 
     # Calculate total time early to calculate column widths.
     total_op_time = sum(datum.op_time for datum in profile_datum_list)
     total_exec_time = sum(datum.node_exec_stats.all_end_rel_micros
                           for datum in profile_datum_list)
     device_total_row = [
-        "Device Total", cli_shared.time_to_readable_str(total_op_time),
-        cli_shared.time_to_readable_str(total_exec_time)]
+        "Device Total", "",
+        cli_shared.time_to_readable_str(total_op_time,
+                                        force_time_unit=time_unit),
+        cli_shared.time_to_readable_str(total_exec_time,
+                                        force_time_unit=time_unit)]
 
     # Calculate column widths.
     column_widths = [
diff --git a/tensorflow/python/debug/cli/profile_analyzer_cli_test.py b/tensorflow/python/debug/cli/profile_analyzer_cli_test.py
index 6cc043e1cb61a1e4e7f635569ad8bf5b80267a65..6880e61d3e2962906bb7677145569bc077154712 100644
--- a/tensorflow/python/debug/cli/profile_analyzer_cli_test.py
+++ b/tensorflow/python/debug/cli/profile_analyzer_cli_test.py
@@ -130,7 +130,8 @@ class ProfileAnalyzerTest(test_util.TensorFlowTestCase):
 
       self._assertAtLeastOneLineMatches("Device 1 of", prof_output)
       expected_headers = [
-          "Node", "Op Time", "Exec Time", r"Filename:Lineno\(function\)"]
+          "Node", r"Start Time \(us\)", r"Op Time \(.*\)", r"Exec Time \(.*\)",
+          r"Filename:Lineno\(function\)"]
       self._assertAtLeastOneLineMatches(
           ".*".join(expected_headers), prof_output)
       self._assertAtLeastOneLineMatches(r"^Add/", prof_output)
@@ -242,10 +243,49 @@ class ProfileAnalyzerTest(test_util.TensorFlowTestCase):
     self._assertAtLeastOneLineMatches(r"Add/123", prof_output)
     self._assertNoLinesMatch(r"Mul/456", prof_output)
 
+  def testSpecifyingTimeUnit(self):
+    node1 = step_stats_pb2.NodeExecStats(
+        node_name="Add/123",
+        all_start_micros=123,
+        op_start_rel_micros=3,
+        op_end_rel_micros=5,
+        all_end_rel_micros=4)
+
+    node2 = step_stats_pb2.NodeExecStats(
+        node_name="Mul/456",
+        all_start_micros=122,
+        op_start_rel_micros=1,
+        op_end_rel_micros=2,
+        all_end_rel_micros=5)
+
+    run_metadata = config_pb2.RunMetadata()
+    device1 = run_metadata.step_stats.dev_stats.add()
+    device1.device = "deviceA"
+    device1.node_stats.extend([node1, node2])
+
+    graph = test.mock.MagicMock()
+    op1 = test.mock.MagicMock()
+    op1.name = "Add/123"
+    op1.traceback = [("a/b/file2", 10, "some_var")]
+    op1.type = "add"
+    op2 = test.mock.MagicMock()
+    op2.name = "Mul/456"
+    op2.traceback = [("a/b/file1", 11, "some_var")]
+    op2.type = "mul"
+    graph.get_operations.return_value = [op1, op2]
+
+    prof_analyzer = profile_analyzer_cli.ProfileAnalyzer(graph, run_metadata)
+
+    # Force time unit.
+    prof_output = prof_analyzer.list_profile(["--time_unit", "ms"]).lines
+    self._assertAtLeastOneLineMatches(r"Add/123.*0\.002ms", prof_output)
+    self._assertAtLeastOneLineMatches(r"Mul/456.*0\.005ms", prof_output)
+    self._assertAtLeastOneLineMatches(r"Device Total.*0\.009ms", prof_output)
+
   def _atLeastOneLineMatches(self, pattern, lines):
     pattern_re = re.compile(pattern)
     for line in lines:
-      if pattern_re.match(line):
+      if pattern_re.search(line):
         return True
     return False
 
diff --git a/tensorflow/python/debug/examples/examples_test.sh b/tensorflow/python/debug/examples/examples_test.sh
index 7830954f0560e646452bbbf3248143f11d5f822c..0b5401a7f29a3e979de157b47f952047cecd6a4e 100755
--- a/tensorflow/python/debug/examples/examples_test.sh
+++ b/tensorflow/python/debug/examples/examples_test.sh
@@ -66,17 +66,18 @@ run
 exit
 EOF
 
-# Use a large enough "run -t" number to let the process end properly.
-cat << EOF | ${DEBUG_MNIST_BIN} --debug --fake_data --ui_type=readline
+cat << EOF | ${DEBUG_MNIST_BIN} --debug --max_steps=1 --fake_data --ui_type=readline
+run -t 1
+run --node_name_filter hidden --op_type_filter MatMul
 run -f has_inf_or_nan
-run -t 1000
 EOF
 
 # Test the custom dump_root option.
 CUSTOM_DUMP_ROOT=$(mktemp -d)
 mkdir -p ${CUSTOM_DUMP_ROOT}
 
-cat << EOF | ${DEBUG_TFLEARN_IRIS_BIN} --debug --fake_data --train_steps=1 --dump_root="${CUSTOM_DUMP_ROOT}" --ui_type=readline
+cat << EOF | ${DEBUG_TFLEARN_IRIS_BIN} --debug --fake_data --train_steps=2 --dump_root="${CUSTOM_DUMP_ROOT}" --ui_type=readline
+run -p
 run -f has_inf_or_nan
 EOF
 
diff --git a/tensorflow/python/debug/lib/debug_data.py b/tensorflow/python/debug/lib/debug_data.py
index ce4bc82e0a87a47f9c2f860a9dc5203ddf76aa3e..de1e1ce017d508f6df78124126fc0579b2bf5a5d 100644
--- a/tensorflow/python/debug/lib/debug_data.py
+++ b/tensorflow/python/debug/lib/debug_data.py
@@ -975,7 +975,7 @@ class DebugDumpDir(object):
       slot = datum.output_slot
       # In some cases (e.g., system clocks with insufficient precision),
       # the upstream and downstream tensors may have identical timestamps, the
-      # following check examines this possibilty and avoids raising an error if
+      # following check examines this possibility and avoids raising an error if
       # that is the case.
       if not self._satisfied_at_timestamp(
           pending_inputs[node], datum.timestamp, start_i=i + 1):
diff --git a/tensorflow/python/debug/wrappers/framework.py b/tensorflow/python/debug/wrappers/framework.py
index 50645c1c874a147cfeb484313d3b58e57052ca2a..ea642adbd1d55fbfed4798fb777d79b527955d95 100644
--- a/tensorflow/python/debug/wrappers/framework.py
+++ b/tensorflow/python/debug/wrappers/framework.py
@@ -219,6 +219,9 @@ class OnRunStartAction(object):
   # Run once with debug tensor-watching.
   DEBUG_RUN = "debug_run"
 
+  # Run once with profiler.
+  PROFILE_RUN = "profile_run"
+
   # Run without debug tensor-watching.
   NON_DEBUG_RUN = "non_debug_run"
 
@@ -348,12 +351,6 @@ class BaseDebugWrapperSession(session.SessionInterface):
 
     _check_type(sess, session.BaseSession)
 
-    # TODO(cais): Remove this check once tfdbg is integrated with GrpcSession.
-    if sess.sess_str:
-      raise NotImplementedError(
-          "Non-DirectSession support is not available from TensorFlow "
-          "Debugger yet (sess_str=%s)" % sess.sess_str)
-
     # The session being wrapped.
     self._sess = sess
     self._thread_name_filter_pattern = (re.compile(thread_name_filter)
@@ -431,7 +428,7 @@ class BaseDebugWrapperSession(session.SessionInterface):
       decorated_run_options = options or config_pb2.RunOptions()
       run_metadata = run_metadata or config_pb2.RunMetadata()
 
-      self._decorate_run_options(
+      self._decorate_run_options_for_debug(
           decorated_run_options,
           run_start_resp.debug_urls,
           debug_ops=run_start_resp.debug_ops,
@@ -460,6 +457,19 @@ class BaseDebugWrapperSession(session.SessionInterface):
           client_graph_def=self._sess.graph.as_graph_def(),
           tf_error=tf_error)
 
+    elif run_start_resp.action == OnRunStartAction.PROFILE_RUN:
+      decorated_run_options = options or config_pb2.RunOptions()
+      run_metadata = run_metadata or config_pb2.RunMetadata()
+      self._decorate_run_options_for_profile(decorated_run_options)
+      retvals = self._sess.run(fetches,
+                               feed_dict=feed_dict,
+                               options=decorated_run_options,
+                               run_metadata=run_metadata)
+      run_end_req = OnRunEndRequest(
+          run_start_resp.action,
+          run_metadata=run_metadata,
+          client_graph_def=self._sess.graph.as_graph_def())
+
     elif (run_start_resp.action == OnRunStartAction.NON_DEBUG_RUN or
           run_start_resp.action == OnRunStartAction.INVOKE_STEPPER):
       if run_start_resp.action == OnRunStartAction.INVOKE_STEPPER:
@@ -502,14 +512,15 @@ class BaseDebugWrapperSession(session.SessionInterface):
     raise NotImplementedError(
         "partial_run is not implemented for debug-wrapper sessions.")
 
-  def _decorate_run_options(self,
-                            run_options,
-                            debug_urls,
-                            debug_ops="DebugIdentity",
-                            node_name_regex_whitelist=None,
-                            op_type_regex_whitelist=None,
-                            tensor_dtype_regex_whitelist=None,
-                            tolerate_debug_op_creation_failures=False):
+  def _decorate_run_options_for_debug(
+      self,
+      run_options,
+      debug_urls,
+      debug_ops="DebugIdentity",
+      node_name_regex_whitelist=None,
+      op_type_regex_whitelist=None,
+      tensor_dtype_regex_whitelist=None,
+      tolerate_debug_op_creation_failures=False):
     """Modify a RunOptions object for debug tensor watching.
 
     Specifies request for outputting partition graphs. Adds
@@ -540,6 +551,15 @@ class BaseDebugWrapperSession(session.SessionInterface):
         tensor_dtype_regex_whitelist=tensor_dtype_regex_whitelist,
         tolerate_debug_op_creation_failures=tolerate_debug_op_creation_failures)
 
+  def _decorate_run_options_for_profile(self, run_options):
+    """Modify a RunOptions object for profiling TensorFlow graph execution.
+
+    Args:
+      run_options: (RunOptions) the modified RunOptions object.
+    """
+
+    run_options.trace_level = config_pb2.RunOptions.FULL_TRACE
+
   @abc.abstractmethod
   def on_session_init(self, request):
     """Callback invoked during construction of the debug-wrapper session.
diff --git a/tensorflow/python/debug/wrappers/framework_test.py b/tensorflow/python/debug/wrappers/framework_test.py
index 1d69c7769a2552a6a06f38d5f515efc09d707a28..fd0efcd925fadb22aed00129e04c74ce31cf2a2a 100644
--- a/tensorflow/python/debug/wrappers/framework_test.py
+++ b/tensorflow/python/debug/wrappers/framework_test.py
@@ -384,18 +384,6 @@ class DebugWrapperSessionTest(test_util.TensorFlowTestCase):
         ["a_init", "b_init"],
         [datum.node_name for datum in dump.dumped_tensor_data])
 
-  def testUsingNonDirectSessionRaisesNotImplementedError(self):
-    # TODO(cais): Remove this test once tfdbg is integrated with GrpcSession.
-    fake_non_direct_session = session.Session()
-    fake_non_direct_session._target = "foo"
-
-    with self.assertRaisesRegexp(
-        NotImplementedError,
-        r"Non-DirectSession support is not available from TensorFlow Debugger "
-        r"yet \(sess_str=foo\)"):
-      TestDebugWrapperSession(
-          fake_non_direct_session, self._dump_root, self._observer)
-
 
 if __name__ == "__main__":
   googletest.main()
diff --git a/tensorflow/python/debug/wrappers/hooks.py b/tensorflow/python/debug/wrappers/hooks.py
index 65713f0b714a995191d9713716858f1815dbdc2f..f6194f5fad78bf809b65f827a60c45bfebb08ce4 100644
--- a/tensorflow/python/debug/wrappers/hooks.py
+++ b/tensorflow/python/debug/wrappers/hooks.py
@@ -66,7 +66,7 @@ class LocalCLIDebugHook(session_run_hook.SessionRunHook,
     """Add a tensor filter.
 
     See doc of `LocalCLIDebugWrapperSession.add_tensor_filter()` for details.
-    Override default behavior to accomodate the possibility of this method being
+    Override default behavior to accommodate the possibility of this method being
     called prior to the initialization of the underlying
     `LocalCLIDebugWrapperSession` object.
 
@@ -130,6 +130,8 @@ class LocalCLIDebugHook(session_run_hook.SessionRunHook,
                   on_run_start_response.tensor_dtype_regex_whitelist),
               tolerate_debug_op_creation_failures=(
                   on_run_start_response.tolerate_debug_op_creation_failures)))
+    elif self._performed_action == framework.OnRunStartAction.PROFILE_RUN:
+      self._decorate_run_options_for_profile(run_args.options)
     elif self._performed_action == framework.OnRunStartAction.INVOKE_STEPPER:
       # The _finalized property must be set to False so that the NodeStepper
       # can insert ops for retrieving TensorHandles.
diff --git a/tensorflow/python/debug/wrappers/local_cli_wrapper.py b/tensorflow/python/debug/wrappers/local_cli_wrapper.py
index 0ba31f986be1ad5b9188345089b418b27b419227..fe822df6ce3ca21a26825abc8385f8c120f55b0c 100644
--- a/tensorflow/python/debug/wrappers/local_cli_wrapper.py
+++ b/tensorflow/python/debug/wrappers/local_cli_wrapper.py
@@ -27,6 +27,7 @@ import tempfile
 from tensorflow.python.debug.cli import analyzer_cli
 from tensorflow.python.debug.cli import cli_shared
 from tensorflow.python.debug.cli import debugger_cli_common
+from tensorflow.python.debug.cli import profile_analyzer_cli
 from tensorflow.python.debug.cli import stepper_cli
 from tensorflow.python.debug.cli import ui_factory
 from tensorflow.python.debug.lib import debug_data
@@ -162,6 +163,12 @@ class LocalCLIDebugWrapperSession(framework.BaseDebugWrapperSession):
         default="",
         help="Regular-expression filter for tensor dtype to be watched in the "
         "run, e.g., (float32|float64), int.*")
+    ap.add_argument(
+        "-p",
+        "--profile",
+        dest="profile",
+        action="store_true",
+        help="Run and profile TensorFlow graph execution.")
     self._argparsers["run"] = ap
 
     ap = argparse.ArgumentParser(
@@ -318,12 +325,16 @@ class LocalCLIDebugWrapperSession(framework.BaseDebugWrapperSession):
           passed_filter = self._active_tensor_filter
           self._active_tensor_filter = None
 
-      self._prep_cli_for_run_end(debug_dump, request.tf_error, passed_filter)
+      self._prep_debug_cli_for_run_end(
+          debug_dump, request.tf_error, passed_filter)
 
       self._run_start_response = self._launch_cli()
 
       # Clean up the dump generated by this run.
       self._remove_dump_root()
+    elif request.performed_action == framework.OnRunStartAction.PROFILE_RUN:
+      self._prep_profile_cli_for_run_end(self._sess.graph, request.run_metadata)
+      self._run_start_response = self._launch_cli()
     else:
       # No debug information to show following a non-debug run() call.
       self._run_start_response = None
@@ -336,7 +347,7 @@ class LocalCLIDebugWrapperSession(framework.BaseDebugWrapperSession):
     if os.path.isdir(self._dump_root):
       shutil.rmtree(self._dump_root)
 
-  def _prep_cli_for_run_end(self, debug_dump, tf_error, passed_filter):
+  def _prep_debug_cli_for_run_end(self, debug_dump, tf_error, passed_filter):
     """Prepare (but not launch) CLI for run-end, with debug dump from the run.
 
     Args:
@@ -391,6 +402,12 @@ class LocalCLIDebugWrapperSession(framework.BaseDebugWrapperSession):
     if help_intro:
       self._run_cli.set_help_intro(help_intro)
 
+  def _prep_profile_cli_for_run_end(self, py_graph, run_metadata):
+    self._init_command = "lp"
+    self._run_cli = profile_analyzer_cli.create_profiler_ui(
+        py_graph, run_metadata, ui_type=self._ui_type)
+    self._title = "run-end (profiler mode): " + self._run_description
+
   def _launch_cli(self):
     """Launch the interactive command-line interface.
 
@@ -425,13 +442,18 @@ class LocalCLIDebugWrapperSession(framework.BaseDebugWrapperSession):
   def _run_handler(self, args, screen_info=None):
     """Command handler for "run" command during on-run-start."""
 
-    _ = screen_info  # Currently unused.
+    del screen_info  # Currently unused.
 
     parsed = self._argparsers["run"].parse_args(args)
     parsed.node_name_filter = parsed.node_name_filter or None
     parsed.op_type_filter = parsed.op_type_filter or None
     parsed.tensor_dtype_filter = parsed.tensor_dtype_filter or None
 
+    if parsed.profile:
+      raise debugger_cli_common.CommandLineExit(
+          exit_token=framework.OnRunStartResponse(
+              framework.OnRunStartAction.PROFILE_RUN, []))
+
     if parsed.till_filter_pass:
       # For the run-till-bad-numerical-value-appears mode, use the DEBUG_RUN
       # option to access the intermediate tensors, and set the corresponding
diff --git a/tensorflow/python/debug/wrappers/local_cli_wrapper_test.py b/tensorflow/python/debug/wrappers/local_cli_wrapper_test.py
index e22f6e783e83bcb81a94c13e510346c3bfe3ec7f..595000fdb61917626ab6dbc32c4e007d48011d8d 100644
--- a/tensorflow/python/debug/wrappers/local_cli_wrapper_test.py
+++ b/tensorflow/python/debug/wrappers/local_cli_wrapper_test.py
@@ -71,15 +71,21 @@ class LocalCLIDebuggerWrapperSessionForTest(
         "tf_errors": [],
         "run_start_cli_run_numbers": [],
         "run_end_cli_run_numbers": [],
+        "profiler_py_graphs": [],
+        "profiler_run_metadata": [],
     }
 
   def _prep_cli_for_run_start(self):
     pass
 
-  def _prep_cli_for_run_end(self, debug_dump, tf_error, passed_filter):
+  def _prep_debug_cli_for_run_end(self, debug_dump, tf_error, passed_filter):
     self.observers["debug_dumps"].append(debug_dump)
     self.observers["tf_errors"].append(tf_error)
 
+  def _prep_profile_cli_for_run_end(self, py_graph, run_metadata):
+    self.observers["profiler_py_graphs"].append(py_graph)
+    self.observers["profiler_run_metadata"].append(run_metadata)
+
   def _launch_cli(self):
     if self._is_run_start:
       self.observers["run_start_cli_run_numbers"].append(self._run_call_count)
@@ -468,6 +474,19 @@ class LocalCLIDebugWrapperSessionTest(test_util.TensorFlowTestCase):
     self.assertEqual(1, dumps.size)
     self.assertEqual("w_int_inner", dumps.dumped_tensor_data[0].node_name)
 
+  def testRunUnderProfilerModeWorks(self):
+    wrapped_sess = LocalCLIDebuggerWrapperSessionForTest(
+        [["-p"], []], self.sess)
+
+    wrapped_sess.run(self.w_int)
+
+    self.assertEqual(1, len(wrapped_sess.observers["profiler_run_metadata"]))
+    self.assertTrue(
+        wrapped_sess.observers["profiler_run_metadata"][0].step_stats)
+    self.assertEqual(1, len(wrapped_sess.observers["profiler_py_graphs"]))
+    self.assertIsInstance(
+        wrapped_sess.observers["profiler_py_graphs"][0], ops.Graph)
+
 
 if __name__ == "__main__":
   googletest.main()
diff --git a/tensorflow/python/estimator/BUILD b/tensorflow/python/estimator/BUILD
index 52194973d21dc0a93a03323368d203d0249ec81a..cf76f8598b5556dacc2ab3137c0443832ce1f531 100644
--- a/tensorflow/python/estimator/BUILD
+++ b/tensorflow/python/estimator/BUILD
@@ -73,6 +73,17 @@ py_library(
     srcs_version = "PY2AND3",
 )
 
+py_test(
+    name = "run_config_test",
+    size = "small",
+    srcs = ["run_config_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":run_config",
+        "//tensorflow/python:client_testlib",
+    ],
+)
+
 py_library(
     name = "estimator",
     srcs = [
diff --git a/tensorflow/python/estimator/estimator.py b/tensorflow/python/estimator/estimator.py
index d32caa8ac7dd950f72b470bdb0fe7de8289717f8..f32567b880866dc9c6a3d9608f997f16ea6f0a12 100644
--- a/tensorflow/python/estimator/estimator.py
+++ b/tensorflow/python/estimator/estimator.py
@@ -91,14 +91,18 @@ class Estimator(object):
 
     Args:
       model_fn: Model function. Follows the signature:
+
         * Args:
-          * `features`: single `Tensor` or `dict` of `Tensor`s
-                 (depending on data passed to `train`),
-          * `labels`: `Tensor` or `dict` of `Tensor`s (for multi-head
-                 models). If mode is `ModeKeys.PREDICT`, `labels=None` will be
-                 passed. If the `model_fn`'s signature does not accept
-                 `mode`, the `model_fn` must still be able to handle
-                 `labels=None`.
+
+          * `features`: This is the first item returned from the `input_fn`
+                 passed to `train`, 'evaluate`, and `predict`. This should be a
+                 single `Tensor` or `dict` of same.
+          * `labels`: This is the second item returned from the `input_fn`
+                 passed to `train`, 'evaluate`, and `predict`. This should be a
+                 single `Tensor` or `dict` of same (for multi-head models). If
+                 mode is `ModeKeys.PREDICT`, `labels=None` will be passed. If
+                 the `model_fn`'s signature does not accept `mode`, the
+                 `model_fn` must still be able to handle `labels=None`.
           * `mode`: Optional. Specifies if this training, evaluation or
                  prediction. See `ModeKeys`.
           * `params`: Optional `dict` of hyperparameters.  Will receive what
@@ -111,6 +115,7 @@ class Estimator(object):
 
         * Returns:
           `EstimatorSpec`
+
       model_dir: Directory to save model parameters, graph and etc. This can
         also be used to load checkpoints from the directory into a estimator to
         continue training a previously saved model. If `None`, the model_dir in
@@ -152,6 +157,8 @@ class Estimator(object):
       self._model_dir = tempfile.mkdtemp()
       logging.warning('Using temporary folder as model directory: %s',
                       self._model_dir)
+    if self._config.model_dir is None:
+      self._config = self._config.replace(model_dir=self._model_dir)
     logging.info('Using config: %s', str(vars(self._config)))
 
     if self._config.session_config is None:
@@ -732,7 +739,7 @@ def _model_fn_args(fn):
 
 def _verify_model_fn_args(model_fn, params):
   """Verifies model fn arguments."""
-  args = _model_fn_args(model_fn)
+  args = set(_model_fn_args(model_fn))
   if 'features' not in args:
     raise ValueError('model_fn (%s) must include features argument.' % model_fn)
   if 'labels' not in args:
@@ -745,7 +752,10 @@ def _verify_model_fn_args(model_fn, params):
     logging.warning('Estimator\'s model_fn (%s) includes params '
                     'argument, but params are not passed to Estimator.',
                     model_fn)
-  non_valid_args = list(set(args) - _VALID_MODEL_FN_ARGS)
+  if tf_inspect.ismethod(model_fn):
+    if 'self' in args:
+      args.remove('self')
+  non_valid_args = list(args - _VALID_MODEL_FN_ARGS)
   if non_valid_args:
     raise ValueError('model_fn (%s) has following not expected args: %s' %
                      (model_fn, non_valid_args))
@@ -807,13 +817,19 @@ def _write_dict_to_summary(output_dir,
   for key in dictionary:
     if dictionary[key] is None:
       continue
+    if key  == "global_step":
+      continue
     value = summary_proto.value.add()
     value.tag = key
-    if (isinstance(dictionary[key], np.float32) or
+    if (isinstance(dictionary[key], np.float32) or 
         isinstance(dictionary[key], float)):
       value.simple_value = float(dictionary[key])
+    elif (isinstance(dictionary[key], np.int64) or
+          isinstance(dictionary[key], np.int32) or
+          isinstance(dictionary[key], int)):
+      value.simple_value = int(dictionary[key])
     else:
-      logging.warn('Skipping summary for %s, must be a float or np.float32.',
+      logging.warn('Skipping summary for %s, must be a float, np.float32, np.int64, np.int32 or int.',
                    key)
   summary_writer.add_summary(summary_proto, current_global_step)
   summary_writer.flush()
diff --git a/tensorflow/python/estimator/estimator_test.py b/tensorflow/python/estimator/estimator_test.py
index 77f7bd36ef6edc26ad200f757c3a63cc8d20f0a2..037357a96147634fd87220afb353ec8da6c7c6ba 100644
--- a/tensorflow/python/estimator/estimator_test.py
+++ b/tensorflow/python/estimator/estimator_test.py
@@ -42,8 +42,8 @@ from tensorflow.python.lib.io import file_io
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import check_ops
 from tensorflow.python.ops import control_flow_ops
-from tensorflow.python.ops import data_flow_ops
 from tensorflow.python.ops import init_ops
+from tensorflow.python.ops import lookup_ops
 from tensorflow.python.ops import metrics as metrics_lib
 from tensorflow.python.ops import parsing_ops
 from tensorflow.python.ops import state_ops
@@ -152,8 +152,10 @@ class EstimatorConstructorTest(test.TestCase):
     def model_fn(features, labels):
       _, _ = features, labels
 
-    est = estimator.Estimator(model_fn=model_fn)
-    self.assertTrue(est.model_dir is not None)
+    with test.mock.patch.object(tempfile, 'mkdtemp', return_value=_TMP_DIR):
+      est = estimator.Estimator(model_fn=model_fn)
+      self.assertEqual(_TMP_DIR, est.config.model_dir)
+      self.assertEqual(_TMP_DIR, est.model_dir)
 
   def test_model_dir_in_constructor(self):
 
@@ -161,6 +163,7 @@ class EstimatorConstructorTest(test.TestCase):
       _, _ = features, labels
 
     est = estimator.Estimator(model_fn=model_fn, model_dir=_TMP_DIR)
+    self.assertEqual(_TMP_DIR, est.config.model_dir)
     self.assertEqual(_TMP_DIR, est.model_dir)
 
   def test_model_dir_in_run_config(self):
@@ -175,6 +178,7 @@ class EstimatorConstructorTest(test.TestCase):
       _, _ = features, labels
 
     est = estimator.Estimator(model_fn=model_fn, config=FakeConfig())
+    self.assertEqual(_TMP_DIR, est.config.model_dir)
     self.assertEqual(_TMP_DIR, est.model_dir)
 
   def test_same_model_dir_in_constructor_and_run_config(self):
@@ -190,6 +194,7 @@ class EstimatorConstructorTest(test.TestCase):
 
     est = estimator.Estimator(
         model_fn=model_fn, config=FakeConfig(), model_dir=_TMP_DIR)
+    self.assertEqual(_TMP_DIR, est.config.model_dir)
     self.assertEqual(_TMP_DIR, est.model_dir)
 
   def test_different_model_dir_in_constructor_and_run_config(self):
@@ -251,6 +256,17 @@ class EstimatorConstructorTest(test.TestCase):
         features, labels, 'something')
     estimator.Estimator(model_fn=new_model_fn)
 
+  def test_if_model_fn_is_a_member_function_of_a_class(self):
+
+    class ModelFnClass(object):
+      def __init__(self):
+        estimator.Estimator(model_fn=self.model_fn)
+
+      def model_fn(self, features, labels, mode):
+        _, _, _ = features, labels, mode
+
+    ModelFnClass()
+
 
 def dummy_input_fn():
   return ({'x': constant_op.constant([[1], [1]])},
@@ -436,13 +452,13 @@ class EstimatorTrainTest(test.TestCase):
         model_dir=model_dir1,
         model_fn=model_fn_global_step_incrementer)
     est1.train(dummy_input_fn, steps=5)
-    
+
     # We have to clear the cache before we can rename the directory,
     # otherwise open file handles will prevent the delete on Windows.
     writer_cache.FileWriterCache.clear()
     model_dir2 = os.path.join(tmpdir, 'model_dir2')
     os.renames(model_dir1, model_dir2)
-    
+
     est2 = estimator.Estimator(
         model_dir=model_dir2,
         model_fn=model_fn_global_step_incrementer)
@@ -545,6 +561,8 @@ class EstimatorTrainTest(test.TestCase):
     # Mocking the SessionManager.wait_for_session, so that worker doesn't wait
     # for chief.
     def get_initialized_session(*args, **kwargs):
+      # Session doesn't take 'max_wait_secs' argument.
+      kwargs.pop('max_wait_secs', None)
       scaffold = training.Scaffold().finalize()
       sess = session.Session(*args, **kwargs)
       sess.run(scaffold.init_op)
@@ -1394,9 +1412,10 @@ class EstimatorExportTest(test.TestCase):
       my_int = variables.Variable(1, name='my_int',
                                   collections=[ops.GraphKeys.LOCAL_VARIABLES])
       scores = constant_op.constant([3.])
-      with ops.control_dependencies(
-          [variables.local_variables_initializer(),
-           data_flow_ops.tables_initializer()]):
+      with ops.control_dependencies([
+          variables.local_variables_initializer(),
+          lookup_ops.tables_initializer()
+      ]):
         assign_op = state_ops.assign(my_int, 12345)
 
       # local_initSop must be an Operation, not a Tensor.
diff --git a/tensorflow/python/estimator/export/export.py b/tensorflow/python/estimator/export/export.py
index 37a98cf481521cc571e60c264b0980077ee01580..a1ecd794df6f114483a1ce4eeacd6fcbd4634392 100644
--- a/tensorflow/python/estimator/export/export.py
+++ b/tensorflow/python/estimator/export/export.py
@@ -23,6 +23,8 @@ import collections
 import os
 import time
 
+import six
+
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import sparse_tensor
@@ -56,7 +58,7 @@ class ServingInputReceiver(collections.namedtuple('ServingInputReceiver',
     if not isinstance(features, dict):
       features = {_SINGLE_FEATURE_DEFAULT_NAME: features}
     for name, tensor in features.items():
-      if not isinstance(name, str):
+      if not isinstance(name, six.string_types):
         raise ValueError('feature keys must be strings: {}.'.format(name))
       if not (isinstance(tensor, ops.Tensor)
               or isinstance(tensor, sparse_tensor.SparseTensor)):
@@ -68,7 +70,7 @@ class ServingInputReceiver(collections.namedtuple('ServingInputReceiver',
     if not isinstance(receiver_tensors, dict):
       receiver_tensors = {_SINGLE_RECEIVER_DEFAULT_NAME: receiver_tensors}
     for name, tensor in receiver_tensors.items():
-      if not isinstance(name, str):
+      if not isinstance(name, six.string_types):
         raise ValueError(
             'receiver_tensors keys must be strings: {}.'.format(name))
       if not isinstance(tensor, ops.Tensor):
diff --git a/tensorflow/python/estimator/export/export_output.py b/tensorflow/python/estimator/export/export_output.py
index 69be0f687c17fb85a2a1f2830360af284006afdf..49bcd06d504bc4b1faa4920b87ebe92510190731 100644
--- a/tensorflow/python/estimator/export/export_output.py
+++ b/tensorflow/python/estimator/export/export_output.py
@@ -20,6 +20,8 @@ from __future__ import print_function
 
 import abc
 
+import six
+
 
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
@@ -171,7 +173,7 @@ class PredictOutput(ExportOutput):
           'Prediction outputs must be given as a dict of string to Tensor; '
           'got {}'.format(outputs))
     for key, value in outputs.items():
-      if not isinstance(key, str):
+      if not isinstance(key, six.string_types):
         raise ValueError(
             'Prediction output key must be a string; got {}.'.format(key))
       if not isinstance(value, ops.Tensor):
diff --git a/tensorflow/python/estimator/export/export_output_test.py b/tensorflow/python/estimator/export/export_output_test.py
index 27a088e551c25062a770f44acb4b3b907b880051..035a9a143e6ffa18ae78ef2544614f342363b22d 100644
--- a/tensorflow/python/estimator/export/export_output_test.py
+++ b/tensorflow/python/estimator/export/export_output_test.py
@@ -22,7 +22,9 @@ from tensorflow.core.framework import tensor_shape_pb2
 from tensorflow.core.framework import types_pb2
 from tensorflow.core.protobuf import meta_graph_pb2
 from tensorflow.python.estimator.export import export_output as export_output_lib
+from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import sparse_tensor
 from tensorflow.python.ops import array_ops
 from tensorflow.python.platform import test
 from tensorflow.python.saved_model import signature_constants
@@ -197,6 +199,33 @@ class ExportOutputTest(test.TestCase):
         signature_constants.CLASSIFY_METHOD_NAME)
     self.assertEqual(actual_signature_def, expected_signature_def)
 
+  def test_predict_output_constructor(self):
+    """Tests that no errors are raised when input is expected."""
+    outputs = {
+        "output0": constant_op.constant([0]),
+        u"output1": constant_op.constant([1]),
+    }
+    export_output_lib.PredictOutput(outputs)
+
+  def test_predict_output_outputs_invalid(self):
+    with self.assertRaisesRegexp(
+        ValueError,
+        "Prediction outputs must be given as a dict of string to Tensor"):
+      export_output_lib.PredictOutput(constant_op.constant([0]))
+
+    with self.assertRaisesRegexp(
+        ValueError,
+        "Prediction output key must be a string"):
+      export_output_lib.PredictOutput({1: constant_op.constant([0])})
+
+    with self.assertRaisesRegexp(
+        ValueError,
+        "Prediction output value must be a Tensor"):
+      export_output_lib.PredictOutput({
+          "prediction1": sparse_tensor.SparseTensor(
+              indices=[[0, 0]], values=[1], dense_shape=[1, 1]),
+      })
+
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/python/estimator/export/export_test.py b/tensorflow/python/estimator/export/export_test.py
index fdd924f2e1cc4936c9655c0b26b85e6f954f016d..7946bd88ba0b577fb9f4885b80829cec6f26c919 100644
--- a/tensorflow/python/estimator/export/export_test.py
+++ b/tensorflow/python/estimator/export/export_test.py
@@ -28,13 +28,11 @@ from tensorflow.core.example import example_pb2
 from tensorflow.python.estimator.export import export
 from tensorflow.python.estimator.export import export_output
 from tensorflow.python.framework import constant_op
-from tensorflow.python.framework import constant_op
-from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import sparse_tensor
 from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import parsing_ops
 from tensorflow.python.platform import test
 from tensorflow.python.saved_model import signature_constants
@@ -43,6 +41,69 @@ from tensorflow.python.saved_model import signature_def_utils
 
 class ExportTest(test_util.TensorFlowTestCase):
 
+  def test_serving_input_receiver_constructor(self):
+    """Tests that no errors are raised when input is expected."""
+    features = {
+        "feature0": constant_op.constant([0]),
+        u"feature1": constant_op.constant([1]),
+        "feature2": sparse_tensor.SparseTensor(
+            indices=[[0, 0]], values=[1], dense_shape=[1, 1]),
+    }
+    receiver_tensors = {
+        "example0": array_ops.placeholder(dtypes.string, name="example0"),
+        u"example1": array_ops.placeholder(dtypes.string, name="example1"),
+    }
+    export.ServingInputReceiver(features, receiver_tensors)
+
+  def test_serving_input_receiver_features_invalid(self):
+    receiver_tensors = {
+        "example0": array_ops.placeholder(dtypes.string, name="example0"),
+        u"example1": array_ops.placeholder(dtypes.string, name="example1"),
+    }
+
+    with self.assertRaisesRegexp(ValueError, "features must be defined"):
+      export.ServingInputReceiver(
+          features=None,
+          receiver_tensors=receiver_tensors)
+
+    with self.assertRaisesRegexp(ValueError, "feature keys must be strings"):
+      export.ServingInputReceiver(
+          features={1: constant_op.constant([1])},
+          receiver_tensors=receiver_tensors)
+
+    with self.assertRaisesRegexp(
+        ValueError, "feature feature1 must be a Tensor or SparseTensor"):
+      export.ServingInputReceiver(
+          features={"feature1": [1]},
+          receiver_tensors=receiver_tensors)
+
+  def test_serving_input_receiver_receiver_tensors_invalid(self):
+    features = {
+        "feature0": constant_op.constant([0]),
+        u"feature1": constant_op.constant([1]),
+        "feature2": sparse_tensor.SparseTensor(
+            indices=[[0, 0]], values=[1], dense_shape=[1, 1]),
+    }
+
+    with self.assertRaisesRegexp(
+        ValueError, "receiver_tensors must be defined"):
+      export.ServingInputReceiver(
+          features=features,
+          receiver_tensors=None)
+
+    with self.assertRaisesRegexp(
+        ValueError, "receiver_tensors keys must be strings"):
+      export.ServingInputReceiver(
+          features=features,
+          receiver_tensors={
+              1: array_ops.placeholder(dtypes.string, name="example0")})
+
+    with self.assertRaisesRegexp(
+        ValueError, "receiver_tensor example1 must be a Tensor"):
+      export.ServingInputReceiver(
+          features=features,
+          receiver_tensors={"example1": [1]})
+
   def test_single_feature_single_receiver(self):
     feature = constant_op.constant(5)
     receiver_tensor = array_ops.placeholder(dtypes.string)
diff --git a/tensorflow/python/estimator/run_config.py b/tensorflow/python/estimator/run_config.py
index 504bf5c3fe56637876d4fde5e2fb6a71a77333e7..30ba18d07dbb804297fbe6d668abf91756867086 100644
--- a/tensorflow/python/estimator/run_config.py
+++ b/tensorflow/python/estimator/run_config.py
@@ -18,6 +18,81 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import copy
+
+import six
+
+from tensorflow.core.protobuf import config_pb2
+
+
+# A list of the property names in RunConfig user allows to change.
+_DEFAULT_REPLACEABLE_LIST = [
+    'model_dir',
+    'tf_random_seed',
+    'save_summary_steps',
+    'save_checkpoints_steps',
+    'save_checkpoints_secs',
+    'session_config',
+    'keep_checkpoint_max',
+    'keep_checkpoint_every_n_hours',
+]
+
+_SAVE_CKPT_ERR = (
+    '`save_checkpoints_steps` and `save_checkpoints_secs` cannot be both set.'
+)
+
+
+def _validate_save_ckpt_with_replaced_keys(new_copy, replaced_keys):
+  """Validates the save ckpt properties."""
+  # Ensure one (and only one) of save_steps and save_secs is not None.
+  # Also, if user sets one save ckpt property, say steps, the other one (secs)
+  # should be set as None to improve usability.
+
+  save_steps = new_copy.save_checkpoints_steps
+  save_secs = new_copy.save_checkpoints_secs
+
+  if ('save_checkpoints_steps' in replaced_keys and
+      'save_checkpoints_secs' in replaced_keys):
+    # If user sets both properties explicitly, we need to error out if both
+    # are set or neither of them are set.
+    if save_steps is not None and save_secs is not None:
+      raise ValueError(_SAVE_CKPT_ERR)
+  elif 'save_checkpoints_steps' in replaced_keys and save_steps is not None:
+    new_copy._save_checkpoints_secs = None  # pylint: disable=protected-access
+  elif 'save_checkpoints_secs' in replaced_keys and save_secs is not None:
+    new_copy._save_checkpoints_steps = None  # pylint: disable=protected-access
+
+
+def _validate_properties(run_config):
+  """Validates the properties."""
+  def _validate(property_name, cond, message):
+    property_value = getattr(run_config, property_name)
+    if property_value is not None and not cond(property_value):
+      raise ValueError(message)
+
+  _validate('model_dir', lambda dir: dir,
+            message='model_dir should be non-empty')
+
+  _validate('save_summary_steps', lambda steps: steps >= 0,
+            message='save_summary_steps should be >= 0')
+
+  _validate('save_checkpoints_steps', lambda steps: steps >= 0,
+            message='save_checkpoints_steps should be >= 0')
+  _validate('save_checkpoints_secs', lambda secs: secs >= 0,
+            message='save_checkpoints_secs should be >= 0')
+
+  _validate('session_config',
+            lambda sc: isinstance(sc, config_pb2.ConfigProto),
+            message='session_config must be instance of ConfigProto')
+
+  _validate('keep_checkpoint_max', lambda keep_max: keep_max >= 0,
+            message='keep_checkpoint_max should be >= 0')
+  _validate('keep_checkpoint_every_n_hours', lambda keep_hours: keep_hours > 0,
+            message='keep_checkpoint_every_n_hours should be > 0')
+
+  _validate('tf_random_seed', lambda seed: isinstance(seed, six.integer_types),
+            message='tf_random_seed must be integer.')
+
 
 class TaskType(object):
   MASTER = 'master'
@@ -28,6 +103,17 @@ class TaskType(object):
 class RunConfig(object):
   """This class specifies the configurations for an `Estimator` run."""
 
+  def __init__(self):
+    self._model_dir = None
+    self._tf_random_seed = 1
+    self._save_summary_steps = 100
+    self._save_checkpoints_secs = 600
+    self._save_checkpoints_steps = None
+    self._session_config = None
+    self._keep_checkpoint_max = 5
+    self._keep_checkpoint_every_n_hours = 10000
+    _validate_properties(self)
+
   @property
   def cluster_spec(self):
     return None
@@ -62,32 +148,98 @@ class RunConfig(object):
 
   @property
   def tf_random_seed(self):
-    return 1
+    return self._tf_random_seed
 
   @property
   def save_summary_steps(self):
-    return 100
+    return self._save_summary_steps
 
   @property
   def save_checkpoints_secs(self):
-    return 600
+    return self._save_checkpoints_secs
 
   @property
   def session_config(self):
-    return None
+    return self._session_config
 
   @property
   def save_checkpoints_steps(self):
-    return None
+    return self._save_checkpoints_steps
 
   @property
   def keep_checkpoint_max(self):
-    return 5
+    return self._keep_checkpoint_max
 
   @property
   def keep_checkpoint_every_n_hours(self):
-    return 10000
+    return self._keep_checkpoint_every_n_hours
 
   @property
   def model_dir(self):
-    return None
+    return self._model_dir
+
+  def replace(self, **kwargs):
+    """Returns a new instance of `RunConfig` replacing specified properties.
+
+    Only the properties in the following list are allowed to be replaced:
+      - `model_dir`.
+      - `tf_random_seed`,
+      - `save_summary_steps`,
+      - `save_checkpoints_steps`,
+      - `save_checkpoints_secs`,
+      - `session_config`,
+      - `keep_checkpoint_max`,
+      - `keep_checkpoint_every_n_hours`,
+
+    In addition, either `save_checkpoints_steps` or `save_checkpoints_secs`
+    can be set (should not be both).
+
+    Args:
+      **kwargs: keyword named properties with new values.
+
+    Raises:
+      ValueError: If any property name in `kwargs` does not exist or is not
+        allowed to be replaced, or both `save_checkpoints_steps` and
+        `save_checkpoints_secs` are set.
+
+    Returns:
+      a new instance of `RunConfig`.
+    """
+    return self._replace(
+        allowed_properties_list=_DEFAULT_REPLACEABLE_LIST, **kwargs)
+
+  def _replace(self, allowed_properties_list=None, **kwargs):
+    """See `replace`.
+
+    N.B.: This implementation assumes that for key named "foo", the underlying
+    property the RunConfig holds is "_foo" (with one leading underscore).
+
+    Args:
+      allowed_properties_list: The property name list allowed to be replaced.
+      **kwargs: keyword named properties with new values.
+
+    Raises:
+      ValueError: If any property name in `kwargs` does not exist or is not
+        allowed to be replaced, or both `save_checkpoints_steps` and
+        `save_checkpoints_secs` are set.
+
+    Returns:
+      a new instance of `RunConfig`.
+    """
+
+    new_copy = copy.deepcopy(self)
+
+    allowed_properties_list = allowed_properties_list or []
+
+    for key, new_value in six.iteritems(kwargs):
+      if key in allowed_properties_list:
+        setattr(new_copy, '_' + key, new_value)
+        continue
+
+      raise ValueError(
+          'Replacing {} is not supported. Allowed properties are {}.'.format(
+              key, allowed_properties_list))
+
+    _validate_save_ckpt_with_replaced_keys(new_copy, kwargs.keys())
+    _validate_properties(new_copy)
+    return new_copy
diff --git a/tensorflow/python/estimator/run_config_test.py b/tensorflow/python/estimator/run_config_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..18d5b2a3f64845fa0f60567e73b346688c482703
--- /dev/null
+++ b/tensorflow/python/estimator/run_config_test.py
@@ -0,0 +1,183 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""RunConfig tests."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.core.protobuf import config_pb2
+from tensorflow.python.estimator import run_config as run_config_lib
+from tensorflow.python.platform import test
+
+_TEST_DIR = 'test_dir'
+_MASTER = 'master_'
+_NOT_SUPPORTED_REPLACE_PROPERTY_MSG = 'Replacing .*is not supported'
+_SAVE_CKPT_ERR = (
+    '`save_checkpoints_steps` and `save_checkpoints_secs` cannot be both set.'
+)
+_MODEL_DIR_ERR = 'model_dir should be non-empty'
+_SAVE_SUMMARY_STEPS_ERR = 'save_summary_steps should be >= 0'
+_SAVE_CKPT_STEPS_ERR = 'save_checkpoints_steps should be >= 0'
+_SAVE_CKPT_SECS_ERR = 'save_checkpoints_secs should be >= 0'
+_SESSION_CONFIG_ERR = 'session_config must be instance of ConfigProto'
+_KEEP_CKPT_MAX_ERR = 'keep_checkpoint_max should be >= 0'
+_KEEP_CKPT_HOURS_ERR = 'keep_checkpoint_every_n_hours should be > 0'
+_TF_RANDOM_SEED_ERR = 'tf_random_seed must be integer'
+
+
+class RunConfigTest(test.TestCase):
+
+  def test_default_property_values(self):
+    config = run_config_lib.RunConfig()
+    self.assertIsNone(config.model_dir)
+    self.assertIsNone(config.session_config)
+    self.assertEqual(1, config.tf_random_seed)
+    self.assertEqual(100, config.save_summary_steps)
+    self.assertEqual(600, config.save_checkpoints_secs)
+    self.assertIsNone(config.save_checkpoints_steps)
+    self.assertEqual(5, config.keep_checkpoint_max)
+    self.assertEqual(10000, config.keep_checkpoint_every_n_hours)
+
+  def test_model_dir(self):
+    empty_config = run_config_lib.RunConfig()
+    self.assertIsNone(empty_config.model_dir)
+
+    new_config = empty_config.replace(model_dir=_TEST_DIR)
+    self.assertEqual(_TEST_DIR, new_config.model_dir)
+
+  def test_replace_with_allowed_properties(self):
+    session_config = config_pb2.ConfigProto(allow_soft_placement=True)
+
+    config = run_config_lib.RunConfig().replace(
+        tf_random_seed=11,
+        save_summary_steps=12,
+        save_checkpoints_secs=14,
+        session_config=session_config,
+        keep_checkpoint_max=16,
+        keep_checkpoint_every_n_hours=17)
+    self.assertEqual(11, config.tf_random_seed)
+    self.assertEqual(12, config.save_summary_steps)
+    self.assertEqual(14, config.save_checkpoints_secs)
+    self.assertEqual(session_config, config.session_config)
+    self.assertEqual(16, config.keep_checkpoint_max)
+    self.assertEqual(17, config.keep_checkpoint_every_n_hours)
+
+  def test_replace_none_value(self):
+    config = run_config_lib.RunConfig().replace(
+        tf_random_seed=None,
+        model_dir=None,
+        save_summary_steps=None,
+        save_checkpoints_secs=None,
+        save_checkpoints_steps=None,
+        session_config=None,
+        keep_checkpoint_max=None,
+        keep_checkpoint_every_n_hours=None)
+    self.assertIsNone(config.tf_random_seed)
+    self.assertIsNone(config.model_dir)
+    self.assertIsNone(config.save_summary_steps)
+    self.assertIsNone(config.save_checkpoints_secs)
+    self.assertIsNone(config.save_checkpoints_steps)
+    self.assertIsNone(config.session_config)
+    self.assertIsNone(config.keep_checkpoint_max)
+    self.assertIsNone(config.keep_checkpoint_every_n_hours)
+
+  def test_replace_with_disallowallowed_properties(self):
+    config = run_config_lib.RunConfig()
+    with self.assertRaises(ValueError):
+      # tf_random_seed is not allowed to be replaced.
+      config.replace(master='_master')
+    with self.assertRaises(ValueError):
+      config.replace(some_undefined_property=123)
+
+  def test_replace(self):
+    config = run_config_lib.RunConfig()
+
+    with self.assertRaisesRegexp(
+        ValueError, _NOT_SUPPORTED_REPLACE_PROPERTY_MSG):
+      # master is not allowed to be replaced.
+      config.replace(master=_MASTER)
+
+    with self.assertRaisesRegexp(
+        ValueError, _NOT_SUPPORTED_REPLACE_PROPERTY_MSG):
+      config.replace(some_undefined_property=_MASTER)
+
+  def test_replace_invalid_values(self):
+    config = run_config_lib.RunConfig()
+
+    with self.assertRaisesRegexp(ValueError, _MODEL_DIR_ERR):
+      config.replace(model_dir='')
+    with self.assertRaisesRegexp(ValueError, _SAVE_SUMMARY_STEPS_ERR):
+      config.replace(save_summary_steps=-1)
+    with self.assertRaisesRegexp(ValueError, _SAVE_CKPT_STEPS_ERR):
+      config.replace(save_checkpoints_steps=-1)
+    with self.assertRaisesRegexp(ValueError, _SAVE_CKPT_SECS_ERR):
+      config.replace(save_checkpoints_secs=-1)
+    with self.assertRaisesRegexp(ValueError, _SESSION_CONFIG_ERR):
+      config.replace(session_config={})
+    with self.assertRaisesRegexp(ValueError, _KEEP_CKPT_MAX_ERR):
+      config.replace(keep_checkpoint_max=-1)
+    with self.assertRaisesRegexp(ValueError, _KEEP_CKPT_HOURS_ERR):
+      config.replace(keep_checkpoint_every_n_hours=0)
+    with self.assertRaisesRegexp(ValueError, _TF_RANDOM_SEED_ERR):
+      config.replace(tf_random_seed=1.0)
+
+
+class RunConfigSaveCheckpointsTest(test.TestCase):
+
+  def test_save_checkpoint(self):
+    empty_config = run_config_lib.RunConfig()
+    self.assertEqual(600, empty_config.save_checkpoints_secs)
+    self.assertIsNone(empty_config.save_checkpoints_steps)
+
+    config_with_steps = empty_config.replace(save_checkpoints_steps=100)
+    del empty_config
+    self.assertEqual(100, config_with_steps.save_checkpoints_steps)
+    self.assertIsNone(config_with_steps.save_checkpoints_secs)
+
+    config_with_secs = config_with_steps.replace(save_checkpoints_secs=200)
+    del config_with_steps
+    self.assertEqual(200, config_with_secs.save_checkpoints_secs)
+    self.assertIsNone(config_with_secs.save_checkpoints_steps)
+
+  def test_save_checkpoint_both_steps_and_secs_are_not_none(self):
+    empty_config = run_config_lib.RunConfig()
+    with self.assertRaisesRegexp(ValueError, _SAVE_CKPT_ERR):
+      empty_config.replace(save_checkpoints_steps=100,
+                           save_checkpoints_secs=200)
+
+  def test_save_checkpoint_both_steps_and_secs_are_none(self):
+    config_with_secs = run_config_lib.RunConfig()
+    config_without_ckpt = config_with_secs.replace(
+        save_checkpoints_steps=None, save_checkpoints_secs=None)
+    self.assertIsNone(config_without_ckpt.save_checkpoints_steps)
+    self.assertIsNone(config_without_ckpt.save_checkpoints_secs)
+
+  def test_save_checkpoint_flip_secs_to_none(self):
+    config_with_secs = run_config_lib.RunConfig()
+    config_without_ckpt = config_with_secs.replace(save_checkpoints_secs=None)
+    self.assertIsNone(config_without_ckpt.save_checkpoints_steps)
+    self.assertIsNone(config_without_ckpt.save_checkpoints_secs)
+
+  def test_save_checkpoint_flip_steps_to_none(self):
+    config_with_steps = run_config_lib.RunConfig().replace(
+        save_checkpoints_steps=100)
+    config_without_ckpt = config_with_steps.replace(save_checkpoints_steps=None)
+    self.assertIsNone(config_without_ckpt.save_checkpoints_steps)
+    self.assertIsNone(config_without_ckpt.save_checkpoints_secs)
+
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/python/feature_column/BUILD b/tensorflow/python/feature_column/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..e86e4dcab1ac6fc67be9efc0e36ba1a431175bc3
--- /dev/null
+++ b/tensorflow/python/feature_column/BUILD
@@ -0,0 +1,84 @@
+package(
+    default_visibility = [
+        "//tensorflow:internal",
+    ],
+    features = [
+        "-layering_check",
+        "-parse_headers",
+    ],
+)
+
+licenses(["notice"])  # Apache 2.0
+
+load("//tensorflow:tensorflow.bzl", "py_test")
+
+filegroup(
+    name = "all_files",
+    srcs = glob(
+        ["**/*"],
+        exclude = [
+            "**/METADATA",
+            "**/OWNERS",
+        ],
+    ),
+    visibility = ["//tensorflow:__subpackages__"],
+)
+
+py_library(
+    name = "feature_column_py",
+    srcs = ["feature_column_lib.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":feature_column",
+        "//tensorflow/python:util",
+    ],
+)
+
+py_library(
+    name = "feature_column",
+    srcs = ["feature_column.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        "//tensorflow/python:embedding_ops",
+        "//tensorflow/python:framework",
+        "//tensorflow/python:init_ops",
+        "//tensorflow/python:lookup_ops",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:parsing_ops",
+        "//tensorflow/python:platform",
+        "//tensorflow/python:sparse_ops",
+        "//tensorflow/python:sparse_tensor",
+        "//tensorflow/python:string_ops",
+        "//tensorflow/python:training",
+        "//tensorflow/python:util",
+        "//tensorflow/python:variable_scope",
+        "//tensorflow/python:variables",
+    ],
+)
+
+filegroup(
+    name = "vocabulary_testdata",
+    srcs = [
+        "testdata/embedding.ckpt.data-00000-of-00001",
+        "testdata/embedding.ckpt.index",
+        "testdata/embedding.ckpt.meta",
+        "testdata/warriors_vocabulary.txt",
+        "testdata/wire_vocabulary.txt",
+    ],
+)
+
+py_test(
+    name = "feature_column_test",
+    srcs = ["feature_column_test.py"],
+    data = [":vocabulary_testdata"],
+    srcs_version = "PY2AND3",
+    tags = ["no_pip"],
+    deps = [
+        ":feature_column_py",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:framework",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:platform_test",
+        "//tensorflow/python:training",
+    ],
+)
diff --git a/.gitmodules b/tensorflow/python/feature_column/__init__.py
similarity index 100%
rename from .gitmodules
rename to tensorflow/python/feature_column/__init__.py
diff --git a/tensorflow/python/feature_column/feature_column.py b/tensorflow/python/feature_column/feature_column.py
new file mode 100644
index 0000000000000000000000000000000000000000..61c02a2fc64449fbee0b86d5224081f53d0e6c7d
--- /dev/null
+++ b/tensorflow/python/feature_column/feature_column.py
@@ -0,0 +1,2412 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""This API defines FeatureColumn abstraction.
+
+FeatureColumns provide a high level abstraction for ingesting and representing
+features. FeatureColumns are also the primary way of encoding features for
+canned ${tf.estimator.Estimator}s.
+
+When using FeatureColumns with `Estimators`, the type of feature column you
+should choose depends on (1) the feature type and (2) the model type.
+
+1. Feature type:
+
+  * Continuous features can be represented by `numeric_column`.
+  * Categorical features can be represented by any `categorical_column_with_*`
+  column:
+    - `categorical_column_with_vocabulary_list`
+    - `categorical_column_with_vocabulary_file`
+    - `categorical_column_with_hash_bucket`
+    - `categorical_column_with_identity`
+    - `weighted_categorical_column`
+
+2. Model type:
+
+  * Deep neural network models (`DNNClassifier`, `DNNRegressor`).
+
+    Continuous features can be directly fed into deep neural network models.
+
+      age_column = numeric_column("age")
+
+    To feed sparse features into DNN models, wrap the column with
+    `embedding_column` or `indicator_column`. `indicator_column` is recommended
+    for features with only a few possible values. For features with many
+    possible values, to reduce the size of your model, `embedding_column` is
+    recommended.
+
+      embedded_dept_column = embedding_column(
+          categorical_column_with_vocabulary_list(
+              "department", ["math", "philosphy", ...]), dimension=10)
+
+  * Wide (aka linear) models (`LinearClassifier`, `LinearRegressor`).
+
+    Sparse features can be fed directly into linear models. They behave like an
+    indicator column but with an efficient implementation.
+
+      dept_column = categorical_column_with_vocabulary_list("department",
+          ["math", "philosophy", "english"])
+
+    It is recommended that continuous features be bucketized before being
+    fed into linear models.
+
+      bucketized_age_column = bucketized_column(
+          source_column=age_column,
+          boundaries=[18, 25, 30, 35, 40, 45, 50, 55, 60, 65])
+
+    Sparse features can be crossed (also known as conjuncted or combined) in
+    order to form non-linearities, and then fed into linear models.
+
+      cross_dept_age_column = crossed_column(
+          columns=["department", bucketized_age_column],
+          hash_bucket_size=1000)
+
+Example of building canned `Estimator`s using FeatureColumns:
+
+  ```python
+  # Define features and transformations
+  deep_feature_columns = [age_column, embedded_dept_column]
+  wide_feature_columns = [dept_column, bucketized_age_column,
+      cross_dept_age_column]
+
+  # Build deep model
+  estimator = DNNClassifier(
+      feature_columns=deep_feature_columns,
+      hidden_units=[500, 250, 50])
+  estimator.train(...)
+
+  # Or build a wide model
+  estimator = LinearClassifier(
+      feature_columns=wide_feature_columns)
+  estimator.train(...)
+
+  # Or build a wide and deep model!
+  estimator = DNNLinearCombinedClassifier(
+      linear_feature_columns=wide_feature_columns,
+      dnn_feature_columns=deep_feature_columns,
+      dnn_hidden_units=[500, 250, 50])
+  estimator.train(...)
+  ```
+
+
+FeatureColumns can also be transformed into a generic input layer for
+custom models using `input_layer`.
+
+Example of building model using FeatureColumns, this can be used in a
+`model_fn` which is given to the {tf.estimator.Estimator}:
+
+  ```python
+  # Building model via layers
+
+  deep_feature_columns = [age_column, embedded_dept_column]
+  columns_to_tensor = parse_feature_columns_from_examples(
+      serialized=my_data,
+      feature_columns=deep_feature_columns)
+  first_layer = input_layer(
+      features=columns_to_tensor,
+      feature_columns=deep_feature_columns)
+  second_layer = fully_connected(first_layer, ...)
+  ```
+
+NOTE: Functions prefixed with "_" indicate experimental or private parts of
+the API subject to change, and should not be relied upon!
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import abc
+import collections
+import math
+
+import numpy as np
+import six
+
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import sparse_tensor as sparse_tensor_lib
+from tensorflow.python.framework import tensor_shape
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import check_ops
+from tensorflow.python.ops import embedding_ops
+from tensorflow.python.ops import init_ops
+from tensorflow.python.ops import lookup_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import nn_ops
+from tensorflow.python.ops import parsing_ops
+from tensorflow.python.ops import sparse_ops
+from tensorflow.python.ops import string_ops
+from tensorflow.python.ops import variable_scope
+from tensorflow.python.ops import variables
+from tensorflow.python.platform import tf_logging as logging
+from tensorflow.python.training import checkpoint_utils
+from tensorflow.python.util import nest
+
+
+def input_layer(features,
+                feature_columns,
+                weight_collections=None,
+                trainable=True):
+  """Returns a dense `Tensor` as input layer based on given `feature_columns`.
+
+  Generally a single example in training data is described with FeatureColumns.
+  At the first layer of the model, this column oriented data should be converted
+  to a single `Tensor`.
+
+  Example:
+
+  ```python
+  price = numeric_column('price')
+  keywords_embedded = embedding_column(
+      categorical_column_with_hash_bucket("keywords", 10K), dimensions=16)
+  columns = [price, keywords_embedded, ...]
+  features = tf.parse_example(..., features=make_parse_example_spec(columns))
+  dense_tensor = input_layer(features, columns)
+  for units in [128, 64, 32]:
+    dense_tensor = tf.layers.dense(dense_tensor, units, tf.nn.relu)
+  prediction = tf.layers.dense(dense_tensor, 1)
+  ```
+
+  Args:
+    features: A mapping from key to tensors. `_FeatureColumn`s look up via these
+      keys. For example `numeric_column('price')` will look at 'price' key in
+      this dict. Values can be a `SparseTensor` or a `Tensor` depends on
+      corresponding `_FeatureColumn`.
+    feature_columns: An iterable containing the FeatureColumns to use as inputs
+      to your model. All items should be instances of classes derived from
+      `_DenseColumn` such as `numeric_column`, `embedding_column`,
+      `bucketized_column`, `indicator_column`. If you have categorical features,
+      you can wrap them with an `embedding_column` or `indicator_column`.
+    weight_collections: A list of collection names to which the Variable will be
+      added. Note that, variables will also be added to collections
+      `tf.GraphKeys.GLOBAL_VARIABLES` and `ops.GraphKeys.MODEL_VARIABLES`.
+    trainable: If `True` also add the variable to the graph collection
+      `GraphKeys.TRAINABLE_VARIABLES` (see `tf.Variable`).
+
+  Returns:
+    A `Tensor` which represents input layer of a model. Its shape
+    is (batch_size, first_layer_dimension) and its dtype is `float32`.
+    first_layer_dimension is determined based on given `feature_columns`.
+
+  Raises:
+    ValueError: if an item in `feature_columns` is not a `_DenseColumn`.
+  """
+  _check_feature_columns(feature_columns)
+  for column in feature_columns:
+    if not isinstance(column, _DenseColumn):
+      raise ValueError(
+          'Items of feature_columns must be a _DenseColumn. '
+          'You can wrap a categorical column with an '
+          'embedding_column or indicator_column. Given: {}'.format(column))
+  weight_collections = list(weight_collections or [])
+  if ops.GraphKeys.GLOBAL_VARIABLES not in weight_collections:
+    weight_collections.append(ops.GraphKeys.GLOBAL_VARIABLES)
+  if ops.GraphKeys.MODEL_VARIABLES not in weight_collections:
+    weight_collections.append(ops.GraphKeys.MODEL_VARIABLES)
+  with variable_scope.variable_scope(
+      None, default_name='input_layer', values=features.values()):
+    builder = _LazyBuilder(features)
+    output_tensors = []
+    ordered_columns = []
+    for column in sorted(feature_columns, key=lambda x: x.name):
+      ordered_columns.append(column)
+      with variable_scope.variable_scope(None, default_name=column.name):
+        tensor = column._get_dense_tensor(  # pylint: disable=protected-access
+            builder,
+            weight_collections=weight_collections,
+            trainable=trainable)
+        num_elements = column._variable_shape.num_elements()  # pylint: disable=protected-access
+        batch_size = array_ops.shape(tensor)[0]
+        tensor = array_ops.reshape(tensor, shape=(batch_size, num_elements))
+        output_tensors.append(tensor)
+    _verify_static_batch_size_equality(output_tensors, ordered_columns)
+    return array_ops.concat(output_tensors, 1)
+
+
+def linear_model(features,
+                 feature_columns,
+                 units=1,
+                 sparse_combiner='sum',
+                 weight_collections=None,
+                 trainable=True):
+  """Returns a linear prediction `Tensor` based on given `feature_columns`.
+
+  This function generates a weighted sum based on output dimension `units`.
+  Weighted sum refers to logits in classification problems. It refers to the
+  prediction itself for linear regression problems.
+
+  Note on supported columns: `linear_model` treats categorical columns as
+  `indicator_column`s while `input_layer` explicitly requires wrapping each
+  of them with an `embedding_column` or an `indicator_column`.
+
+  Example:
+
+  ```python
+  price = numeric_column('price')
+  price_buckets = bucketized_column(price, boundaries=[0., 10., 100., 1000.])
+  keywords = categorical_column_with_hash_bucket("keywords", 10K)
+  keywords_price = crossed_column('keywords', price_buckets, ...)
+  columns = [price_buckets, keywords, keywords_price ...]
+  features = tf.parse_example(..., features=make_parse_example_spec(columns))
+  prediction = linear_model(features, columns)
+  ```
+
+  Args:
+    features: A mapping from key to tensors. `_FeatureColumn`s look up via these
+      keys. For example `numeric_column('price')` will look at 'price' key in
+      this dict. Values are `Tensor` or `SparseTensor` depending on
+      corresponding `_FeatureColumn`.
+    feature_columns: An iterable containing the FeatureColumns to use as inputs
+      to your model. All items should be instances of classes derived from
+      `_FeatureColumn`s.
+    units: An integer, dimensionality of the output space. Default value is 1.
+    sparse_combiner: A string specifying how to reduce if a sparse column is
+      multivalent. Currently "mean", "sqrtn" and "sum" are supported, with "sum"
+      the default. "sqrtn" often achieves good accuracy, in particular with
+      bag-of-words columns. It combines each sparse columns independently.
+        * "sum": do not normalize features in the column
+        * "mean": do l1 normalization on features in the column
+        * "sqrtn": do l2 normalization on features in the column
+    weight_collections: A list of collection names to which the Variable will be
+      added. Note that, variables will also be added to collections
+      `tf.GraphKeys.GLOBAL_VARIABLES` and `ops.GraphKeys.MODEL_VARIABLES`.
+    trainable: If `True` also add the variable to the graph collection
+      `GraphKeys.TRAINABLE_VARIABLES` (see `tf.Variable`).
+
+  Returns:
+    A `Tensor` which represents predictions/logits of a linear model. Its shape
+    is (batch_size, units) and its dtype is `float32`.
+
+  Raises:
+    ValueError: if an item in `feature_columns` is neither a `_DenseColumn`
+      nor `_CategoricalColumn`.
+  """
+  _check_feature_columns(feature_columns)
+  for column in feature_columns:
+    if not isinstance(column, (_DenseColumn, _CategoricalColumn)):
+      raise ValueError('Items of feature_columns must be either a _DenseColumn '
+                       'or _CategoricalColumn. Given: {}'.format(column))
+  weight_collections = list(weight_collections or [])
+  if ops.GraphKeys.GLOBAL_VARIABLES not in weight_collections:
+    weight_collections.append(ops.GraphKeys.GLOBAL_VARIABLES)
+  if ops.GraphKeys.MODEL_VARIABLES not in weight_collections:
+    weight_collections.append(ops.GraphKeys.MODEL_VARIABLES)
+  with variable_scope.variable_scope(
+      None, default_name='linear_model', values=features.values()):
+    weighted_sums = []
+    ordered_columns = []
+    builder = _LazyBuilder(features)
+    for column in sorted(feature_columns, key=lambda x: x.name):
+      with variable_scope.variable_scope(None, default_name=column.name):
+        ordered_columns.append(column)
+        if isinstance(column, _CategoricalColumn):
+          weighted_sums.append(_create_categorical_column_weighted_sum(
+              column, builder, units, sparse_combiner, weight_collections,
+              trainable))
+        else:
+          weighted_sums.append(_create_dense_column_weighted_sum(
+              column, builder, units, weight_collections, trainable))
+    _verify_static_batch_size_equality(weighted_sums, ordered_columns)
+    predictions_no_bias = math_ops.add_n(
+        weighted_sums, name='weighted_sum_no_bias')
+    bias = variable_scope.get_variable(
+        'bias_weights',
+        shape=[units],
+        initializer=init_ops.zeros_initializer(),
+        trainable=trainable,
+        collections=weight_collections)
+    predictions = nn_ops.bias_add(
+        predictions_no_bias, bias, name='weighted_sum')
+
+    return predictions
+
+
+def _transform_features(features, feature_columns):
+  """Returns transformed features based on features columns passed in.
+
+  Please note that most probably you would not need to use this function. Please
+  check `input_layer` and `linear_model` to see whether they will
+  satisfy your use case or not.
+
+  Example:
+
+  ```python
+  # Define features and transformations
+  crosses_a_x_b = crossed_column(
+      columns=["sparse_feature_a", "sparse_feature_b"], hash_bucket_size=10000)
+  price_buckets = bucketized_column(
+      source_column=numeric_column("price"), boundaries=[...])
+
+  columns = [crosses_a_x_b, price_buckets]
+  features = tf.parse_example(..., features=make_parse_example_spec(columns))
+  transformed = transform_features(features=features, feature_columns=columns)
+
+  assertCountEqual(columns, transformed.keys())
+  ```
+
+  Args:
+    features: A mapping from key to tensors. `_FeatureColumn`s look up via these
+      keys. For example `numeric_column('price')` will look at 'price' key in
+      this dict. Values can be a `SparseTensor` or a `Tensor` depends on
+      corresponding `_FeatureColumn`.
+    feature_columns: An iterable containing all the `_FeatureColumn`s.
+
+  Returns:
+    A `dict` mapping `_FeatureColumn` to `Tensor` and `SparseTensor` values.
+  """
+  _check_feature_columns(feature_columns)
+  outputs = {}
+  with ops.name_scope(
+      None, default_name='transform_features', values=features.values()):
+    builder = _LazyBuilder(features)
+    for column in sorted(feature_columns, key=lambda x: x.name):
+      with ops.name_scope(None, default_name=column.name):
+        outputs[column] = builder.get(column)
+  return outputs
+
+
+def make_parse_example_spec(feature_columns):
+  """Creates parsing spec dictionary from input feature_columns.
+
+  The returned dictionary can be used as arg 'features' in `tf.parse_example`.
+
+  Typical usage example:
+
+  ```python
+  # Define features and transformations
+  feature_b = numeric_column(...)
+  feature_c_bucketized = bucketized_column(numeric_column("feature_c"), ...)
+  feature_a_x_feature_c = crossed_column(
+      columns=["feature_a", feature_c_bucketized], ...)
+
+  feature_columns = set(
+      [feature_b, feature_c_bucketized, feature_a_x_feature_c])
+  features = tf.parse_example(
+      serialized=serialized_examples,
+      features=make_parse_example_spec(feature_columns))
+  ```
+
+  For the above example, make_parse_example_spec would return the dict:
+  {
+    "feature_a": parsing_ops.VarLenFeature(tf.string),
+    "feature_b": parsing_ops.FixedLenFeature([1], dtype=tf.float32),
+    "feature_c": parsing_ops.FixedLenFeature([1], dtype=tf.float32)
+  }
+
+  Args:
+    feature_columns: An iterable containing all feature columns. All items
+      should be instances of classes derived from `_FeatureColumn`.
+
+  Returns:
+    A dict mapping each feature key to a `FixedLenFeature` or `VarLenFeature`
+    value.
+
+  Raises:
+    ValueError: If any of the given `feature_columns` is not a `_FeatureColumn`
+      instance.
+  """
+  result = {}
+  for column in feature_columns:
+    if not isinstance(column, _FeatureColumn):
+      raise ValueError(
+          'All feature_columns must be _FeatureColumn instances. '
+          'Given: {}'.format(column))
+    config = column._parse_example_spec  # pylint: disable=protected-access
+    for key, value in six.iteritems(config):
+      if key in result and value != result[key]:
+        raise ValueError(
+            'feature_columns contain different parse_spec for key '
+            '{}. Given {} and {}'.format(key, value, result[key]))
+    result.update(config)
+  return result
+
+
+def embedding_column(
+    categorical_column, dimension, combiner='mean', initializer=None,
+    ckpt_to_load_from=None, tensor_name_in_ckpt=None, max_norm=None,
+    trainable=True):
+  """`_DenseColumn` that converts from sparse, categorical input.
+
+  Use this when your inputs are sparse, but you want to convert them to a dense
+  representation (e.g., to feed to a DNN).
+
+  Inputs must be a `_CategoricalColumn` created by any of the
+  `categorical_column_*` function. Here is an example embedding of an identity
+  column for a DNN model:
+
+  ```python
+  video_id = categorical_column_with_identity(
+      key='video_id', num_buckets=1000000, default_value=0)
+  columns = [embedding_column(video_id, 9),...]
+  features = tf.parse_example(..., features=make_parse_example_spec(columns))
+  dense_tensor = input_layer(features, columns)
+  ```
+
+  Args:
+    categorical_column: A `_CategoricalColumn` created by a
+      `categorical_column_with_*` function. This column produces the sparse IDs
+      that are inputs to the embedding lookup.
+    dimension: An integer specifying dimension of the embedding, must be > 0.
+    combiner: A string specifying how to reduce if there are multiple entries
+      in a single row. Currently 'mean', 'sqrtn' and 'sum' are supported, with
+      'mean' the default. 'sqrtn' often achieves good accuracy, in particular
+      with bag-of-words columns. Each of this can be thought as example level
+      normalizations on the column. For more information, see
+      `tf.embedding_lookup_sparse`.
+    initializer: A variable initializer function to be used in embedding
+      variable initialization. If not specified, defaults to
+      `tf.truncated_normal_initializer` with mean `0.0` and standard deviation
+      `1/sqrt(dimension)`.
+    ckpt_to_load_from: String representing checkpoint name/pattern from which to
+      restore column weights. Required if `tensor_name_in_ckpt` is not `None`.
+    tensor_name_in_ckpt: Name of the `Tensor` in `ckpt_to_load_from` from
+      which to restore the column weights. Required if `ckpt_to_load_from` is
+      not `None`.
+    max_norm: If not `None`, embedding values are l2-normalized to this value.
+    trainable: Whether or not the embedding is trainable. Default is True.
+
+  Returns:
+    `_DenseColumn` that converts from sparse input.
+
+  Raises:
+    ValueError: if `dimension` not > 0.
+    ValueError: if exactly one of `ckpt_to_load_from` and `tensor_name_in_ckpt`
+      is specified.
+    ValueError: if `initializer` is specified and is not callable.
+  """
+  if (dimension is None) or (dimension < 1):
+    raise ValueError('Invalid dimension {}.'.format(dimension))
+  if (ckpt_to_load_from is None) != (tensor_name_in_ckpt is None):
+    raise ValueError('Must specify both `ckpt_to_load_from` and '
+                     '`tensor_name_in_ckpt` or none of them.')
+
+  if (initializer is not None) and (not callable(initializer)):
+    raise ValueError('initializer must be callable if specified. '
+                     'Embedding of column_name: {}'.format(
+                         categorical_column.name))
+  if initializer is None:
+    initializer = init_ops.truncated_normal_initializer(
+        mean=0.0, stddev=1 / math.sqrt(dimension))
+
+  return _EmbeddingColumn(
+      categorical_column=categorical_column,
+      dimension=dimension,
+      combiner=combiner,
+      initializer=initializer,
+      ckpt_to_load_from=ckpt_to_load_from,
+      tensor_name_in_ckpt=tensor_name_in_ckpt,
+      max_norm=max_norm,
+      trainable=trainable)
+
+
+def numeric_column(key,
+                   shape=(1,),
+                   default_value=None,
+                   dtype=dtypes.float32,
+                   normalizer_fn=None):
+  """Represents real valued or numerical features.
+
+  Example:
+
+  ```python
+  price = numeric_column('price')
+  columns = [price, ...]
+  features = tf.parse_example(..., features=make_parse_example_spec(columns))
+  dense_tensor = input_layer(features, columns)
+
+  # or
+  bucketized_price = bucketized_column(price, boundaries=[...])
+  columns = [bucketized_price, ...]
+  features = tf.parse_example(..., features=make_parse_example_spec(columns))
+  linear_prediction = linear_model(features, columns)
+  ```
+
+  Args:
+    key: A unique string identifying the input feature. It is used as the
+      column name and the dictionary key for feature parsing configs, feature
+      `Tensor` objects, and feature columns.
+    shape: An iterable of integers specifies the shape of the `Tensor`. An
+      integer can be given which means a single dimension `Tensor` with given
+      width. The `Tensor` representing the column will have the shape of
+      [batch_size] + `shape`.
+    default_value: A single value compatible with `dtype` or an iterable of
+      values compatible with `dtype` which the column takes on during
+      `tf.Example` parsing if data is missing. A default value of `None` will
+      cause `tf.parse_example` to fail if an example does not contain this
+      column. If a single value is provided, the same value will be applied as
+      the default value for every item. If an iterable of values is provided,
+      the shape of the `default_value` should be equal to the given `shape`.
+    dtype: defines the type of values. Default value is `tf.float32`. Must be a
+      non-quantized, real integer or floating point type.
+    normalizer_fn: If not `None`, a function that can be used to normalize the
+      value of the tensor after `default_value` is applied for parsing.
+      Normalizer function takes the input `Tensor` as its argument, and returns
+      the output `Tensor`. (e.g. lambda x: (x - 3.0) / 4.2). Please note that
+      even though the most common use case of this function is normalization, it
+      can be used for any kind of Tensorflow transformations.
+
+  Returns:
+    A `_NumericColumn`.
+
+  Raises:
+    TypeError: if any dimension in shape is not an int
+    ValueError: if any dimension in shape is not a positive integer
+    TypeError: if `default_value` is an iterable but not compatible with `shape`
+    TypeError: if `default_value` is not compatible with `dtype`.
+    ValueError: if `dtype` is not convertible to `tf.float32`.
+  """
+  shape = _check_shape(shape, key)
+  if not (dtype.is_integer or dtype.is_floating):
+    raise ValueError('dtype must be convertible to float. '
+                     'dtype: {}, key: {}'.format(dtype, key))
+  default_value = _check_default_value(shape, default_value, dtype, key)
+
+  if normalizer_fn is not None and not callable(normalizer_fn):
+    raise TypeError(
+        'normalizer_fn must be a callable. Given: {}'.format(normalizer_fn))
+
+  return _NumericColumn(
+      key,
+      shape=shape,
+      default_value=default_value,
+      dtype=dtype,
+      normalizer_fn=normalizer_fn)
+
+
+def bucketized_column(source_column, boundaries):
+  """Represents discretized dense input.
+
+  Buckets include the left boundary, and exclude the right boundary. Namely,
+  `boundaries=[0., 1., 2.]` generates buckets `(-inf, 0.)`, `[0., 1.)`,
+  `[1., 2.)`, and `[2., +inf)`.
+
+  For example, if the inputs are
+    `boundaries` = [0, 10, 100]
+    input tensor = [[-5, 10000]
+                    [150,   10]
+                    [5,    100]]
+
+  then the output will be
+    output = [[0, 3]
+              [3, 2]
+              [1, 3]]
+
+  Example:
+
+  ```python
+  price = numeric_column('price')
+  bucketized_price = bucketized_column(price, boundaries=[...])
+  columns = [bucketized_price, ...]
+  features = tf.parse_example(..., features=make_parse_example_spec(columns))
+  linear_prediction = linear_model(features, columns)
+
+  # or
+  columns = [bucketized_price, ...]
+  features = tf.parse_example(..., features=make_parse_example_spec(columns))
+  dense_tensor = input_layer(features, columns)
+  ```
+
+  `bucketized_column` can also be crossed with another categorical column using
+  `crossed_column`:
+  ```python
+  price = numeric_column('price')
+  # bucketized_column converts numerical feature to a categorical one.
+  bucketized_price = bucketized_column(price, boundaries=[...])
+  # 'keywords' is a string feature.
+  price_x_keywords = crossed_column([bucketized_price, 'keywords'], 50K)
+  columns = [price_x_keywords, ...]
+  features = tf.parse_example(..., features=make_parse_example_spec(columns))
+  linear_prediction = linear_model(features, columns)
+  ```
+
+  Args:
+    source_column: A one-dimensional dense column which is generated with
+      `numeric_column`.
+    boundaries: A sorted list or tuple of floats specifying the boundaries.
+
+  Returns:
+    A `_BucketizedColumn`.
+
+  Raises:
+    ValueError: If `source_column` is not a numeric column, or if it is not
+      one-dimensional.
+    ValueError: If `boundaries` is not a sorted list or tuple.
+  """
+  if not isinstance(source_column, _NumericColumn):
+    raise ValueError(
+        'source_column must be a column generated with numeric_column(). '
+        'Given: {}'.format(source_column))
+  if len(source_column.shape) > 1:
+    raise ValueError(
+        'source_column must be one-dimensional column. '
+        'Given: {}'.format(source_column))
+  if (not boundaries or
+      not (isinstance(boundaries, list) or isinstance(boundaries, tuple))):
+    raise ValueError('boundaries must be a sorted list.')
+  for i in range(len(boundaries) - 1):
+    if boundaries[i] >= boundaries[i + 1]:
+      raise ValueError('boundaries must be a sorted list.')
+  return _BucketizedColumn(source_column, tuple(boundaries))
+
+
+def _assert_string_or_int(dtype, prefix):
+  if (dtype != dtypes.string) and (not dtype.is_integer):
+    raise ValueError(
+        '{} dtype must be string or integer. dtype: {}.'.format(prefix, dtype))
+
+
+def categorical_column_with_hash_bucket(key,
+                                        hash_bucket_size,
+                                        dtype=dtypes.string):
+  """Represents sparse feature where ids are set by hashing.
+
+  Use this when your sparse features are in string or integer format, and you
+  want to distribute your inputs into a finite number of buckets by hashing.
+  output_id = Hash(input_feature_string) % bucket_size
+
+  For input dictionary `features`, `features[key]` is either `Tensor` or
+  `SparseTensor`. If `Tensor`, missing values can be represented by `-1` for int
+  and `''` for string. Note that these values are independent of the
+  `default_value` argument.
+
+  Example:
+
+  ```python
+  keywords = categorical_column_with_hash_bucket("keywords", 10K)
+  columns = [keywords, ...]
+  features = tf.parse_example(..., features=make_parse_example_spec(columns))
+  linear_prediction = linear_model(features, columns)
+
+  # or
+  keywords_embedded = embedding_column(keywords, 16)
+  columns = [keywords_embedded, ...]
+  features = tf.parse_example(..., features=make_parse_example_spec(columns))
+  dense_tensor = input_layer(features, columns)
+  ```
+
+  Args:
+    key: A unique string identifying the input feature. It is used as the
+      column name and the dictionary key for feature parsing configs, feature
+      `Tensor` objects, and feature columns.
+    hash_bucket_size: An int > 1. The number of buckets.
+    dtype: The type of features. Only string and integer types are supported.
+
+  Returns:
+    A `_HashedCategoricalColumn`.
+
+  Raises:
+    ValueError: `hash_bucket_size` is not greater than 1.
+    ValueError: `dtype` is neither string nor integer.
+  """
+  if hash_bucket_size is None:
+    raise ValueError('hash_bucket_size must be set. ' 'key: {}'.format(key))
+
+  if hash_bucket_size < 1:
+    raise ValueError('hash_bucket_size must be at least 1. '
+                     'hash_bucket_size: {}, key: {}'.format(
+                         hash_bucket_size, key))
+
+  _assert_string_or_int(dtype, prefix='column_name: {}'.format(key))
+
+  return _HashedCategoricalColumn(key, hash_bucket_size, dtype)
+
+
+def categorical_column_with_vocabulary_file(
+    key, vocabulary_file, vocabulary_size, num_oov_buckets=0,
+    default_value=None, dtype=dtypes.string):
+  """A `_CategoricalColumn` with a vocabulary file.
+
+  Use this when your inputs are in string or integer format, and you have a
+  vocabulary file that maps each value to an integer ID. By default,
+  out-of-vocabulary values are ignored. Use either (but not both) of
+  `num_oov_buckets` and `default_value` to specify how to include
+  out-of-vocabulary values.
+
+  For input dictionary `features`, `features[key]` is either `Tensor` or
+  `SparseTensor`. If `Tensor`, missing values can be represented by `-1` for int
+  and `''` for string. Note that these values are independent of the
+  `default_value` argument.
+
+  Example with `num_oov_buckets`:
+  File '/us/states.txt' contains 50 lines, each with a 2-character U.S. state
+  abbreviation. All inputs with values in that file are assigned an ID 0-49,
+  corresponding to its line number. All other values are hashed and assigned an
+  ID 50-54.
+  ```python
+  states = categorical_column_with_vocabulary_file(
+      key='states', vocabulary_file='/us/states.txt', vocabulary_size=50,
+      num_oov_buckets=5)
+  columns = [states, ...]
+  features = tf.parse_example(..., features=make_parse_example_spec(columns))
+  linear_prediction = linear_model(features, columns)
+  ```
+
+  Example with `default_value`:
+  File '/us/states.txt' contains 51 lines - the first line is 'XX', and the
+  other 50 each have a 2-character U.S. state abbreviation. Both a literal 'XX'
+  in input, and other values missing from the file, will be assigned ID 0. All
+  others are assigned the corresponding line number 1-50.
+  ```python
+  states = categorical_column_with_vocabulary_file(
+      key='states', vocabulary_file='/us/states.txt', vocabulary_size=51,
+      default_value=0)
+  columns = [states, ...]
+  features = tf.parse_example(..., features=make_parse_example_spec(columns))
+  linear_prediction, _, _ = linear_model(features, columns)
+  ```
+
+  And to make an embedding with either:
+  ```python
+  columns = [embedding_column(states, 3),...]
+  features = tf.parse_example(..., features=make_parse_example_spec(columns))
+  dense_tensor = input_layer(features, columns)
+  ```
+
+  Args:
+    key: A unique string identifying the input feature. It is used as the
+      column name and the dictionary key for feature parsing configs, feature
+      `Tensor` objects, and feature columns.
+    vocabulary_file: The vocabulary file name.
+    vocabulary_size: Number of the elements in the vocabulary. This must be no
+      greater than length of `vocabulary_file`, if less than length, later
+      values are ignored.
+    num_oov_buckets: Non-negative integer, the number of out-of-vocabulary
+      buckets. All out-of-vocabulary inputs will be assigned IDs in the range
+      `[vocabulary_size, vocabulary_size+num_oov_buckets)` based on a hash of
+      the input value. A positive `num_oov_buckets` can not be specified with
+      `default_value`.
+    default_value: The integer ID value to return for out-of-vocabulary feature
+      values, defaults to `-1`. This can not be specified with a positive
+      `num_oov_buckets`.
+    dtype: The type of features. Only string and integer types are supported.
+
+  Returns:
+    A `_CategoricalColumn` with a vocabulary file.
+
+  Raises:
+    ValueError: `vocabulary_file` is missing.
+    ValueError: `vocabulary_size` is missing or < 1.
+    ValueError: `num_oov_buckets` is not a non-negative integer.
+    ValueError: `dtype` is neither string nor integer.
+  """
+  if not vocabulary_file:
+    raise ValueError('Missing vocabulary_file in {}.'.format(key))
+  # `vocabulary_size` isn't required for lookup, but it is for `_num_buckets`.
+  if (vocabulary_size is None) or (vocabulary_size < 1):
+    raise ValueError('Invalid vocabulary_size in {}.'.format(key))
+  if num_oov_buckets:
+    if default_value is not None:
+      raise ValueError(
+          'Can\'t specify both num_oov_buckets and default_value in {}.'.format(
+              key))
+    if num_oov_buckets < 0:
+      raise ValueError('Invalid num_oov_buckets {} in {}.'.format(
+          num_oov_buckets, key))
+  _assert_string_or_int(dtype, prefix='column_name: {}'.format(key))
+  return _VocabularyFileCategoricalColumn(
+      key=key,
+      vocabulary_file=vocabulary_file,
+      vocabulary_size=vocabulary_size,
+      num_oov_buckets=0 if num_oov_buckets is None else num_oov_buckets,
+      default_value=-1 if default_value is None else default_value,
+      dtype=dtype)
+
+
+def categorical_column_with_vocabulary_list(
+    key, vocabulary_list, dtype=None, default_value=-1):
+  """A `_CategoricalColumn` with in-memory vocabulary.
+
+  Logic for feature f is:
+  id = vocabulary_list.index_of(f) if f in vocabulary_list else default_value
+
+  Use this when your inputs are in string or integer format, and you have an
+  in-memory vocabulary mapping each value to an integer ID. By default,
+  out-of-vocabulary values are ignored. Use `default_value` to specify how to
+  include out-of-vocabulary values.
+
+  For input dictionary `features`, `features[key]` is either `Tensor` or
+  `SparseTensor`. If `Tensor`, missing values can be represented by `-1` for int
+  and `''` for string. Note that these values are independent of the
+  `default_value` argument.
+
+  In the following examples, each input in `vocabulary_list` is assigned an ID
+  0-4 corresponding to its index (e.g., input 'B' produces output 2). All other
+  inputs are assigned `default_value` 0.
+
+  Linear model:
+  ```python
+  colors = categorical_column_with_vocabulary_list(
+      key='colors', vocabulary_list=('X', 'R', 'G', 'B', 'Y'), default_value=0)
+  columns = [colors, ...]
+  features = tf.parse_example(..., features=make_parse_example_spec(columns))
+  linear_prediction, _, _ = linear_model(features, columns)
+  ```
+
+  Embedding for a DNN model:
+  ```python
+  columns = [embedding_column(colors, 3),...]
+  features = tf.parse_example(..., features=make_parse_example_spec(columns))
+  dense_tensor = input_layer(features, columns)
+  ```
+
+  Args:
+    key: A unique string identifying the input feature. It is used as the
+      column name and the dictionary key for feature parsing configs, feature
+      `Tensor` objects, and feature columns.
+    vocabulary_list: An ordered iterable defining the vocabulary. Each feature
+      is mapped to the index of its value (if present) in `vocabulary_list`.
+      Must be castable to `dtype`.
+    dtype: The type of features. Only string and integer types are supported.
+      If `None`, it will be inferred from `vocabulary_list`.
+    default_value: The value to use for values not in `vocabulary_list`.
+
+  Returns:
+    A `_CategoricalColumn` with in-memory vocabulary.
+
+  Raises:
+    ValueError: if `vocabulary_list` is empty, or contains duplicate keys.
+    ValueError: if `dtype` is not integer or string.
+  """
+  if (vocabulary_list is None) or (len(vocabulary_list) < 1):
+    raise ValueError(
+        'vocabulary_list {} must be non-empty, column_name: {}'.format(
+            vocabulary_list, key))
+  if len(set(vocabulary_list)) != len(vocabulary_list):
+    raise ValueError(
+        'Duplicate keys in vocabulary_list {}, column_name: {}'.format(
+            vocabulary_list, key))
+  vocabulary_dtype = dtypes.as_dtype(np.array(vocabulary_list).dtype)
+  _assert_string_or_int(
+      vocabulary_dtype, prefix='column_name: {} vocabulary'.format(key))
+  if dtype is None:
+    dtype = vocabulary_dtype
+  elif dtype.is_integer != vocabulary_dtype.is_integer:
+    raise ValueError(
+        'dtype {} and vocabulary dtype {} do not match, column_name: {}'.format(
+            dtype, vocabulary_dtype, key))
+  _assert_string_or_int(dtype, prefix='column_name: {}'.format(key))
+
+  return _VocabularyListCategoricalColumn(
+      key=key, vocabulary_list=tuple(vocabulary_list), dtype=dtype,
+      default_value=default_value)
+
+
+def categorical_column_with_identity(key, num_buckets, default_value=None):
+  """A `_CategoricalColumn` that returns identity values.
+
+  Use this when your inputs are integers in the range `[0, num_buckets)`, and
+  you want to use the input value itself as the categorical ID. Values outside
+  this range will result in `default_value` if specified, otherwise it will
+  fail.
+
+  Typically, this is used for contiguous ranges of integer indexes, but
+  it doesn't have to be. This might be inefficient, however, if many of IDs
+  are unused. Consider `categorical_column_with_hash_bucket` in that case.
+
+  For input dictionary `features`, `features[key]` is either `Tensor` or
+  `SparseTensor`. If `Tensor`, missing values can be represented by `-1` for int
+  and `''` for string. Note that these values are independent of the
+  `default_value` argument.
+
+  In the following examples, each input in the range `[0, 1000000)` is assigned
+  the same value. All other inputs are assigned `default_value` 0. Note that a
+  literal 0 in inputs will result in the same default ID.
+
+  Linear model:
+  ```python
+  video_id = categorical_column_with_identity(
+      key='video_id', num_buckets=1000000, default_value=0)
+  columns = [video_id, ...]
+  features = tf.parse_example(..., features=make_parse_example_spec(columns))
+  linear_prediction, _, _ = linear_model(features, columns)
+  ```
+
+  Embedding for a DNN model:
+  ```python
+  columns = [embedding_column(video_id, 9),...]
+  features = tf.parse_example(..., features=make_parse_example_spec(columns))
+  dense_tensor = input_layer(features, columns)
+  ```
+
+  Args:
+    key: A unique string identifying the input feature. It is used as the
+      column name and the dictionary key for feature parsing configs, feature
+      `Tensor` objects, and feature columns.
+    num_buckets: Range of inputs and outputs is `[0, num_buckets)`.
+    default_value: If `None`, this column's graph operations will fail for
+      out-of-range inputs. Otherwise, this value must be in the range
+      `[0, num_buckets)`, and will replace inputs in that range.
+
+  Returns:
+    A `_CategoricalColumn` that returns identity values.
+
+  Raises:
+    ValueError: if `num_buckets` is less than one.
+    ValueError: if `default_value` is not in range `[0, num_buckets)`.
+  """
+  if num_buckets < 1:
+    raise ValueError(
+        'num_buckets {} < 1, column_name {}'.format(num_buckets, key))
+  if (default_value is not None) and (
+      (default_value < 0) or (default_value >= num_buckets)):
+    raise ValueError(
+        'default_value {} not in range [0, {}), column_name {}'.format(
+            default_value, num_buckets, key))
+  return _IdentityCategoricalColumn(
+      key=key, num_buckets=num_buckets, default_value=default_value)
+
+
+def indicator_column(categorical_column):
+  """Represents multi-hot representation of given categorical column.
+
+  Used to wrap any `categorical_column_*` (e.g., to feed to DNN). Use
+  `embedding_column` if the inputs are sparse.
+
+  ```python
+  name = indicator_column(categorical_column_with_vocabulary_list('name',
+      ['bob', 'george', 'wanda'])
+  columns = [name, ...]
+  features = tf.parse_example(..., features=make_parse_example_spec(columns))
+  dense_tensor = input_layer(features, columns)
+
+  dense_tensor == [[1, 0, 0]]  # If "name" bytes_list is ["bob"]
+  dense_tensor == [[1, 0, 1]]  # If "name" bytes_list is ["bob", "wanda"]
+  dense_tensor == [[2, 0, 0]]  # If "name" bytes_list is ["bob", "bob"]
+  ```
+
+  Args:
+    categorical_column: A `_CategoricalColumn` which is created by
+      `categorical_column_with_*` or `crossed_column` functions.
+
+  Returns:
+    An `_IndicatorColumn`.
+  """
+  return _IndicatorColumn(categorical_column)
+
+
+def weighted_categorical_column(
+    categorical_column, weight_feature_key, dtype=dtypes.float32):
+  """Applies weight values to a `_CategoricalColumn`.
+
+  Use this when each of your sparse inputs has both an ID and a value. For
+  example, if you're representing text documents as a collection of word
+  frequencies, you can provide 2 parallel sparse input features ('terms' and
+  'frequencies' below).
+
+  Example:
+
+  Input `tf.Example` objects:
+  [
+    features {
+      feature {
+        key: "terms"
+        value {bytes_list {value: "very" value: "model"}}
+      }
+      feature {
+        key: "frequencies"
+        value {float_list {value: 0.3 value: 0.1}}
+      }
+    },
+    features {
+      feature {
+        key: "terms"
+        value {bytes_list {value: "when" value: "course" value: "human"}}
+      }
+      feature {
+        key: "frequencies"
+        value {float_list {value: 0.4 value: 0.1 value: 0.2}}
+      }
+    }
+  ]
+
+  ```python
+  categorical_column = categorical_column_with_hash_bucket(
+      column_name='terms', hash_bucket_size=1000)
+  weighted_column = weighted_categorical_column(
+      categorical_column=categorical_column, weight_feature_key='frequencies')
+  columns = [weighted_column, ...]
+  features = tf.parse_example(..., features=make_parse_example_spec(columns))
+  linear_prediction, _, _ = linear_model(features, columns)
+  ```
+
+  This assumes the input dictionary contains a `SparseTensor` for key
+  'terms', and a `SparseTensor` for key 'frequencies'. These 2 tensors must have
+  the same indices and dense shape.
+
+  Args:
+    categorical_column: A `_CategoricalColumn` created by
+      `categorical_column_with_*` functions.
+    weight_feature_key: String key for weight values.
+    dtype: Type of weights, such as `tf.float32`. Only float and integer weights
+      are supported.
+
+  Returns:
+    A `_CategoricalColumn` composed of two sparse features: one represents id,
+    the other represents weight (value) of the id feature in that example.
+
+  Raises:
+    ValueError: if `dtype` is not convertible to float.
+  """
+  if (dtype is None) or not (dtype.is_integer or dtype.is_floating):
+    raise ValueError('dtype {} is not convertible to float.'.format(dtype))
+  return _WeightedCategoricalColumn(
+      categorical_column=categorical_column,
+      weight_feature_key=weight_feature_key,
+      dtype=dtype)
+
+
+def crossed_column(keys, hash_bucket_size, hash_key=None):
+  """Returns a column for performing crosses of categorical features.
+
+  Crossed features will be hashed according to `hash_bucket_size`. Conceptually,
+  the transformation can be thought of as:
+    Hash(cartesian product of features) % `hash_bucket_size`
+
+  For example, if the input features are:
+  * SparseTensor referred by first key: shape = [2, 2]
+      [0, 0]: "a"
+      [1, 0]: "b"
+      [1, 1]: "c"
+
+  * SparseTensor referred by second key: shape = [2, 1]
+      [0, 0]: "d"
+      [1, 0]: "e"
+
+  then crossed feature will look like:
+      shape = [2, 2]
+      [0, 0]: Hash64("d", Hash64("a")) % hash_bucket_size
+      [1, 0]: Hash64("e", Hash64("b")) % hash_bucket_size
+      [1, 1]: Hash64("e", Hash64("c")) % hash_bucket_size
+
+  Here is an example to create a linear model with crosses of string features:
+  ```python
+  keywords_x_doc_terms = crossed_column(['keywords', 'doc_terms'], 50K)
+  columns = [keywords_x_doc_terms, ...]
+  features = tf.parse_example(..., features=make_parse_example_spec(columns))
+  linear_prediction = linear_model(features, columns)
+  ```
+
+  You could also use vocabulary lookup before crossing:
+  ```python
+  keywords = categorical_column_with_vocabulary_file(
+      'keywords', '/path/to/vocabulary/file', vocabulary_size=1K)
+  keywords_x_doc_terms = crossed_column([keywords, 'doc_terms'], 50K)
+  columns = [keywords_x_doc_terms, ...]
+  features = tf.parse_example(..., features=make_parse_example_spec(columns))
+  linear_prediction = linear_model(features, columns)
+  ```
+
+  If an input feature is of numeric type, you can use
+  `categorical_column_with_identity`, or `bucketized_column`, as in the example:
+  ```python
+  # vertical_id is an integer categorical feature.
+  vertical_id = categorical_column_with_identity('vertical_id', 10K)
+  price = numeric_column('price')
+  # bucketized_column converts numerical feature to a categorical one.
+  bucketized_price = bucketized_column(price, boundaries=[...])
+  vertical_id_x_price = crossed_column([vertical_id, bucketized_price], 50K)
+  columns = [vertical_id_x_price, ...]
+  features = tf.parse_example(..., features=make_parse_example_spec(columns))
+  linear_prediction = linear_model(features, columns)
+  ```
+
+  To use crossed column in DNN model, you need to add it in an embedding column
+  as in this example:
+  ```python
+  vertical_id_x_price = crossed_column([vertical_id, bucketized_price], 50K)
+  vertical_id_x_price_embedded = embedding_column(vertical_id_x_price, 10)
+  dense_tensor = input_layer(features, [vertical_id_x_price_embedded, ...])
+  ```
+
+  Args:
+    keys: An iterable identifying the features to be crossed. Each element can
+      be either:
+      * string: Will use the corresponding feature which must be of string type.
+      * `_CategoricalColumn`: Will use the transformed tensor produced by this
+        column. Does not support hashed categorical column.
+    hash_bucket_size: An int > 1. The number of buckets.
+    hash_key: Specify the hash_key that will be used by the `FingerprintCat64`
+      function to combine the crosses fingerprints on SparseCrossOp (optional).
+
+  Returns:
+    A `_CrossedColumn`.
+
+  Raises:
+    ValueError: If `len(keys) < 2`.
+    ValueError: If any of the keys is neither a string nor `_CategoricalColumn`.
+    ValueError: If any of the keys is `_HashedCategoricalColumn`.
+    ValueError: If `hash_bucket_size < 1`.
+  """
+  if not hash_bucket_size or hash_bucket_size < 1:
+    raise ValueError('hash_bucket_size must be > 1. '
+                     'hash_bucket_size: {}'.format(hash_bucket_size))
+  if not keys or len(keys) < 2:
+    raise ValueError(
+        'keys must be a list with length > 1. Given: {}'.format(keys))
+  for key in keys:
+    if (not isinstance(key, six.string_types) and
+        not isinstance(key, _CategoricalColumn)):
+      raise ValueError(
+          'Unsupported key type. All keys must be either string, or '
+          'categorical column except _HashedCategoricalColumn. '
+          'Given: {}'.format(key))
+    if isinstance(key, _HashedCategoricalColumn):
+      raise ValueError(
+          '_HashedCategoricalColumn is not supported. Instead, use the feature '
+          'name as a string. Given: {}'.format(key))
+  return _CrossedColumn(
+      keys=tuple(keys), hash_bucket_size=hash_bucket_size,
+      hash_key=hash_key)
+
+
+class _FeatureColumn(object):
+  """Represents a feature column abstraction.
+
+  WARNING: Do not subclass this layer unless you know what you are doing:
+  the API is subject to future changes.
+
+  To distinguish the concept of a feature family and a specific binary feature
+  within a family, we refer to a feature family like "country" as a feature
+  column. Following is an example feature in a `tf.Example` format:
+    {key: "country",  value: [ "US" ]}
+  In this example the value of feature is "US" and "country" refers to the
+  column of the feature.
+
+  This class is an abstract class. User should not create instances of this.
+  """
+  __metaclass__ = abc.ABCMeta
+
+  @abc.abstractproperty
+  def name(self):
+    """Returns string. used for variable_scope and naming."""
+    pass
+
+  @abc.abstractmethod
+  def _transform_feature(self, inputs):
+    """Returns intermediate representation (usually a `Tensor`).
+
+    Uses `inputs` to create an intermediate representation (usually a `Tensor`)
+    that other feature columns can use.
+
+    Example usage of `inputs`:
+    Let's say a Feature column depends on raw feature ('raw') and another
+    `_FeatureColumn` (input_fc). To access corresponding `Tensor`s, inputs will
+    be used as follows:
+
+    ```python
+    raw_tensor = inputs.get('raw')
+    fc_tensor = inputs.get(input_fc)
+    ```
+
+    Args:
+      inputs: A `_LazyBuilder` object to access inputs.
+
+    Returns:
+      Transformed feature `Tensor`.
+    """
+    pass
+
+  @abc.abstractproperty
+  def _parse_example_spec(self):
+    """Returns a `tf.Example` parsing spec as dict.
+
+    It is used for get_parsing_spec for `tf.parse_example`. Returned spec is a
+    dict from keys ('string') to `VarLenFeature`, `FixedLenFeature`, and other
+    supported objects. Please check documentation of ${tf.parse_example} for all
+    supported spec objects.
+
+    Let's say a Feature column depends on raw feature ('raw') and another
+    `_FeatureColumn` (input_fc). One possible implementation of
+    _parse_example_spec is as follows:
+
+    ```python
+    spec = {'raw': tf.FixedLenFeature(...)}
+    spec.update(input_fc._parse_example_spec)
+    return spec
+    ```
+    """
+    pass
+
+
+class _DenseColumn(_FeatureColumn):
+  """Represents a column which can be represented as `Tensor`.
+
+  WARNING: Do not subclass this layer unless you know what you are doing:
+  the API is subject to future changes.
+
+  Some examples of this type are: numeric_column, embedding_column,
+  indicator_column.
+  """
+
+  __metaclass__ = abc.ABCMeta
+
+  @abc.abstractproperty
+  def _variable_shape(self):
+    """`TensorShape` of `_get_dense_tensor`, without batch dimension."""
+    pass
+
+  @abc.abstractmethod
+  def _get_dense_tensor(self, inputs, weight_collections=None, trainable=None):
+    """Returns a `Tensor`.
+
+    The output of this function will be used by model-builder-functions. For
+    example the pseudo code of `input_layer` will be like:
+
+    ```python
+    def input_layer(features, feature_columns, ...):
+      outputs = [fc._get_dense_tensor(...) for fc in feature_columns]
+      return tf.concat(outputs)
+    ```
+
+    Args:
+      inputs: A `_LazyBuilder` object to access inputs.
+      weight_collections: List of graph collections to which Variables (if any
+        will be created) are added.
+      trainable: If `True` also add variables to the graph collection
+        `GraphKeys.TRAINABLE_VARIABLES` (see ${tf.Variable}).
+
+    Returns:
+      `Tensor` of shape [batch_size] + `_variable_shape`.
+    """
+    pass
+
+
+def _create_dense_column_weighted_sum(
+    column, builder, units, weight_collections, trainable):
+  """Create a weighted sum of a dense column for linear_model."""
+  tensor = column._get_dense_tensor(  # pylint: disable=protected-access
+      builder,
+      weight_collections=weight_collections,
+      trainable=trainable)
+  num_elements = column._variable_shape.num_elements()  # pylint: disable=protected-access
+  batch_size = array_ops.shape(tensor)[0]
+  tensor = array_ops.reshape(tensor, shape=(batch_size, num_elements))
+  weight = variable_scope.get_variable(
+      name='weights',
+      shape=[num_elements, units],
+      initializer=init_ops.zeros_initializer(),
+      trainable=trainable,
+      collections=weight_collections)
+  return math_ops.matmul(tensor, weight, name='weighted_sum')
+
+
+class _CategoricalColumn(_FeatureColumn):
+  """Represents a categorical feature.
+
+  WARNING: Do not subclass this layer unless you know what you are doing:
+  the API is subject to future changes.
+
+  A categorical feature typically handled with a ${tf.SparseTensor} of IDs.
+  """
+  __metaclass__ = abc.ABCMeta
+
+  IdWeightPair = collections.namedtuple(  # pylint: disable=invalid-name
+      'IdWeightPair', ['id_tensor', 'weight_tensor'])
+
+  @abc.abstractproperty
+  def _num_buckets(self):
+    """Returns number of buckets in this sparse feature."""
+    pass
+
+  @abc.abstractmethod
+  def _get_sparse_tensors(self,
+                          inputs,
+                          weight_collections=None,
+                          trainable=None):
+    """Returns an IdWeightPair.
+
+    `IdWeightPair` is a pair of `SparseTensor`s which represents ids and
+    weights.
+
+    `IdWeightPair.id_tensor` is typically a `batch_size` x `num_buckets`
+    `SparseTensor` of `int64`. `IdWeightPair.weight_tensor` is either a
+    `SparseTensor` of `float` or `None` to indicate all weights should be
+    taken to be 1. If specified, `weight_tensor` must have exactly the same
+    shape and indices as `sp_ids`. Expected `SparseTensor` is same as parsing
+    output of a `VarLenFeature` which is a ragged matrix.
+
+    Args:
+      inputs: A `LazyBuilder` as a cache to get input tensors required to
+        create `IdWeightPair`.
+      weight_collections: List of graph collections to which variables (if any
+        will be created) are added.
+      trainable: If `True` also add variables to the graph collection
+        `GraphKeys.TRAINABLE_VARIABLES` (see ${tf.get_variable}).
+    """
+    pass
+
+
+def _create_categorical_column_weighted_sum(
+    column, builder, units, sparse_combiner, weight_collections, trainable):
+  """Create a weighted sum of a categorical column for linear_model."""
+  sparse_tensors = column._get_sparse_tensors(  # pylint: disable=protected-access
+      builder,
+      weight_collections=weight_collections,
+      trainable=trainable)
+  id_tensor = sparse_ops.sparse_reshape(sparse_tensors.id_tensor, [
+      array_ops.shape(sparse_tensors.id_tensor)[0], -1
+  ])
+  weight_tensor = sparse_tensors.weight_tensor
+  if weight_tensor is not None:
+    weight_tensor = sparse_ops.sparse_reshape(
+        weight_tensor, [array_ops.shape(weight_tensor)[0], -1])
+
+  weight = variable_scope.get_variable(
+      name='weights',
+      shape=(column._num_buckets, units),  # pylint: disable=protected-access
+      initializer=init_ops.zeros_initializer(),
+      trainable=trainable,
+      collections=weight_collections)
+  return _safe_embedding_lookup_sparse(
+      weight,
+      id_tensor,
+      sparse_weights=weight_tensor,
+      combiner=sparse_combiner,
+      name='weighted_sum')
+
+
+class _LazyBuilder(object):
+  """Handles caching of transformations while building the model.
+
+  `_FeatureColumn` specifies how to digest an input column to the network. Some
+  feature columns require data transformations. This class caches those
+  transformations.
+
+  Some features may be used in more than one place. For example, one can use a
+  bucketized feature by itself and a cross with it. In that case we
+  should create only one bucketization op instead of creating ops for each
+  feature column separately. To handle re-use of transformed columns,
+  `_LazyBuilder` caches all previously transformed columns.
+
+  Example:
+  We're trying to use the following `_FeatureColumn`s:
+
+  ```python
+    bucketized_age = fc.bucketized_column(fc.numeric_column("age"), ...)
+    keywords = fc.categorical_column_with_hash_buckets("keywords", ...)
+    age_X_keywords = fc.crossed_column([bucketized_age, "keywords"])
+    ... = linear_model(features,
+                            [bucketized_age, keywords, age_X_keywords]
+  ```
+
+  If we transform each column independently, then we'll get duplication of
+  bucketization (one for cross, one for bucketization itself).
+  The `_LazyBuilder` eliminates this duplication.
+  """
+
+  def __init__(self, features):
+    """Creates a `_LazyBuilder`.
+
+    Args:
+      features: A mapping from feature column to objects that are `Tensor` or
+        `SparseTensor`, or can be converted to same via
+        `sparse_tensor.convert_to_tensor_or_sparse_tensor`. A `string` key
+        signifies a base feature (not-transformed). A `_FeatureColumn` key
+        means that this `Tensor` is the output of an existing `_FeatureColumn`
+        which can be reused.
+    """
+    self._features = features.copy()
+    self._feature_tensors = {}
+
+  def get(self, key):
+    """Returns a `Tensor` for the given key.
+
+    A `str` key is used to access a base feature (not-transformed). When a
+    `_FeatureColumn` is passed, the transformed feature is returned if it
+    already exists, otherwise the given `_FeatureColumn` is asked to provide its
+    transformed output, which is then cached.
+
+    Args:
+      key: a `str` or a `_FeatureColumn`.
+
+    Returns:
+      The transformed `Tensor` corresponding to the `key`.
+
+    Raises:
+      ValueError: if key is not found or a transformed `Tensor` cannot be
+        computed.
+    """
+    if key in self._feature_tensors:
+      # FeatureColumn is already transformed or converted.
+      return self._feature_tensors[key]
+
+    if key in self._features:
+      # FeatureColumn is a raw feature.
+      feature_tensor = sparse_tensor_lib.convert_to_tensor_or_sparse_tensor(
+          self._features[key])
+      self._feature_tensors[key] = feature_tensor
+      return feature_tensor
+
+    if not isinstance(key, (str, _FeatureColumn)):
+      raise TypeError('"key" must be either a "str" or "_FeatureColumn". '
+                      'Provided: {}'.format(key))
+
+    if not isinstance(key, _FeatureColumn):
+      raise ValueError('Feature {} is not in features dictionary.'.format(key))
+
+    column = key
+    logging.debug('Transforming feature_column %s.', column)
+    transformed = column._transform_feature(self)  # pylint: disable=protected-access
+    if transformed is None:
+      raise ValueError('Column {} is not supported.'.format(column.name))
+    self._feature_tensors[column] = transformed
+    return transformed
+
+
+# TODO(ptucker): Move to third_party/tensorflow/python/ops/sparse_ops.py
+def _shape_offsets(shape):
+  """Returns moving offset for each dimension given shape."""
+  offsets = []
+  for dim in reversed(shape):
+    if offsets:
+      offsets.append(dim * offsets[-1])
+    else:
+      offsets.append(dim)
+  offsets.reverse()
+  return offsets
+
+
+# TODO(ptucker): Move to third_party/tensorflow/python/ops/sparse_ops.py
+def _to_sparse_input(input_tensor, ignore_value=None):
+  """Converts a `Tensor` to a `SparseTensor`, dropping ignore_value cells.
+
+  If `input_tensor` is already a `SparseTensor`, just return it.
+
+  Args:
+    input_tensor: A string or integer `Tensor`.
+    ignore_value: Entries in `dense_tensor` equal to this value will be
+      absent from the resulting `SparseTensor`. If `None`, default value of
+      `dense_tensor`'s dtype will be used ('' for `str`, -1 for `int`).
+
+  Returns:
+    A `SparseTensor` with the same shape as `input_tensor`.
+
+  Raises:
+    ValueError: when `input_tensor`'s rank is `None`.
+  """
+  input_tensor = sparse_tensor_lib.convert_to_tensor_or_sparse_tensor(
+      input_tensor)
+  if isinstance(input_tensor, sparse_tensor_lib.SparseTensor):
+    return input_tensor
+  with ops.name_scope(None, 'to_sparse_input', (input_tensor, ignore_value,)):
+    input_rank = input_tensor.get_shape().ndims
+    if input_rank is None:
+      # TODO(b/32318825): Implement dense_to_sparse_tensor for undefined rank.
+      raise ValueError('Undefined input_tensor shape.')
+    if ignore_value is None:
+      ignore_value = '' if input_tensor.dtype == dtypes.string else -1
+    dense_shape = math_ops.cast(array_ops.shape(input_tensor), dtypes.int64)
+    indices = array_ops.where(math_ops.not_equal(
+        input_tensor, math_ops.cast(ignore_value, input_tensor.dtype)))
+    # Flattens the tensor and indices for use with gather.
+    flat_tensor = array_ops.reshape(input_tensor, [-1])
+    flat_indices = indices[:, input_rank - 1]
+    # Computes the correct flattened indices for 2d (or higher) tensors.
+    if input_rank > 1:
+      higher_dims = indices[:, :input_rank - 1]
+      shape_offsets = array_ops.stack(
+          _shape_offsets(array_ops.unstack(dense_shape)[1:]))
+      offsets = math_ops.reduce_sum(
+          math_ops.multiply(higher_dims, shape_offsets),
+          reduction_indices=[1])
+      flat_indices = math_ops.add(flat_indices, offsets)
+    values = array_ops.gather(flat_tensor, flat_indices)
+    return sparse_tensor_lib.SparseTensor(indices, values, dense_shape)
+
+
+def _check_feature_columns(feature_columns):
+  """Verifies feature_columns input."""
+  if isinstance(feature_columns, dict):
+    raise ValueError('Expected feature_columns to be iterable, found dict.')
+  for column in feature_columns:
+    if not isinstance(column, _FeatureColumn):
+      raise ValueError('Items of feature_columns must be a _FeatureColumn.'
+                       'Given: {}.'.format(column))
+  if not feature_columns:
+    raise ValueError('feature_columns must not be empty.')
+  name_to_column = dict()
+  for column in feature_columns:
+    if column.name in name_to_column:
+      raise ValueError('Duplicate feature column name found for columns: {} '
+                       'and {}. This usually means that these columns refer to '
+                       'same base feature. Either one must be discarded or a '
+                       'duplicated but renamed item must be inserted in '
+                       'features dict.'.format(column,
+                                               name_to_column[column.name]))
+    name_to_column[column.name] = column
+
+
+class _NumericColumn(_DenseColumn,
+                     collections.namedtuple('_NumericColumn', [
+                         'key', 'shape', 'default_value', 'dtype',
+                         'normalizer_fn'
+                     ])):
+  """see `numeric_column`."""
+
+  @property
+  def name(self):
+    return self.key
+
+  @property
+  def _parse_example_spec(self):
+    return {
+        self.key:
+            parsing_ops.FixedLenFeature(self.shape, self.dtype,
+                                        self.default_value)
+    }
+
+  def _transform_feature(self, inputs):
+    input_tensor = inputs.get(self.key)
+    if isinstance(input_tensor, sparse_tensor_lib.SparseTensor):
+      raise ValueError(
+          'The corresponding Tensor of numerical column must be a Tensor. '
+          'SparseTensor is not supported. key: {}'.format(self.key))
+    if self.normalizer_fn is not None:
+      input_tensor = self.normalizer_fn(input_tensor)
+    return math_ops.to_float(input_tensor)
+
+  @property
+  def _variable_shape(self):
+    return tensor_shape.TensorShape(self.shape)
+
+  def _get_dense_tensor(self, inputs, weight_collections=None, trainable=None):
+    """Returns dense `Tensor` representing numeric feature.
+
+    Args:
+      inputs: A `_LazyBuilder` object to access inputs.
+      weight_collections: Unused `weight_collections` since no variables are
+        created in this function.
+      trainable: Unused `trainable` bool since no variables are created in
+        this function.
+
+    Returns:
+      Dense `Tensor` created within `_transform_feature`.
+    """
+    # Do nothing with weight_collections and trainable since no variables are
+    # created in this function.
+    del weight_collections
+    del trainable
+    # Feature has been already transformed. Return the intermediate
+    # representation created by _transform_feature.
+    return inputs.get(self)
+
+
+class _BucketizedColumn(_DenseColumn, _CategoricalColumn,
+                        collections.namedtuple('_BucketizedColumn', [
+                            'source_column', 'boundaries'])):
+  """See `bucketized_column`."""
+
+  @property
+  def name(self):
+    return '{}_bucketized'.format(self.source_column.name)
+
+  @property
+  def _parse_example_spec(self):
+    return self.source_column._parse_example_spec  # pylint: disable=protected-access
+
+  def _transform_feature(self, inputs):
+    source_tensor = inputs.get(self.source_column)
+    return math_ops._bucketize(  # pylint: disable=protected-access
+        source_tensor,
+        boundaries=self.boundaries)
+
+  @property
+  def _variable_shape(self):
+    return tensor_shape.TensorShape(
+        tuple(self.source_column.shape) + (len(self.boundaries) + 1,))
+
+  def _get_dense_tensor(self, inputs, weight_collections=None, trainable=None):
+    del weight_collections
+    del trainable
+    input_tensor = inputs.get(self)
+    return array_ops.one_hot(
+        indices=math_ops.to_int64(input_tensor),
+        depth=len(self.boundaries) + 1,
+        on_value=1.,
+        off_value=0.)
+
+  @property
+  def _num_buckets(self):
+    # By construction, source_column is always one-dimensional.
+    return (len(self.boundaries) + 1) * self.source_column.shape[0]
+
+  def _get_sparse_tensors(self, inputs, weight_collections=None,
+                          trainable=None):
+    input_tensor = inputs.get(self)
+    batch_size = array_ops.shape(input_tensor)[0]
+    # By construction, source_column is always one-dimensional.
+    source_dimension = self.source_column.shape[0]
+
+    i1 = array_ops.reshape(
+        array_ops.tile(
+            array_ops.expand_dims(math_ops.range(0, batch_size), 1),
+            [1, source_dimension]),
+        (-1,))
+    i2 = array_ops.tile(math_ops.range(0, source_dimension), [batch_size])
+    # Flatten the bucket indices and unique them across dimensions
+    # E.g. 2nd dimension indices will range from k to 2*k-1 with k buckets
+    bucket_indices = (
+        array_ops.reshape(input_tensor, (-1,)) +
+        (len(self.boundaries) + 1) * i2)
+
+    indices = math_ops.to_int64(array_ops.transpose(array_ops.stack((i1, i2))))
+    dense_shape = math_ops.to_int64(array_ops.stack(
+        [batch_size, source_dimension]))
+    sparse_tensor = sparse_tensor_lib.SparseTensor(
+        indices=indices,
+        values=bucket_indices,
+        dense_shape=dense_shape)
+    return _CategoricalColumn.IdWeightPair(sparse_tensor, None)
+
+
+class _EmbeddingColumn(
+    _DenseColumn,
+    collections.namedtuple('_EmbeddingColumn', (
+        'categorical_column', 'dimension', 'combiner', 'initializer',
+        'ckpt_to_load_from', 'tensor_name_in_ckpt', 'max_norm', 'trainable'
+    ))):
+  """See `_embedding_column`."""
+
+  @property
+  def name(self):
+    if not hasattr(self, '_name'):
+      self._name = '{}_embedding'.format(self.categorical_column.name)
+    return self._name
+
+  @property
+  def _parse_example_spec(self):
+    return self.categorical_column._parse_example_spec  # pylint: disable=protected-access
+
+  def _transform_feature(self, inputs):
+    return inputs.get(self.categorical_column)
+
+  @property
+  def _variable_shape(self):
+    if not hasattr(self, '_shape'):
+      self._shape = tensor_shape.vector(self.dimension)
+    return self._shape
+
+  def _get_dense_tensor(self, inputs, weight_collections=None, trainable=None):
+    # Get sparse IDs and weights.
+    sparse_tensors = self.categorical_column._get_sparse_tensors(  # pylint: disable=protected-access
+        inputs, weight_collections=weight_collections, trainable=trainable)
+    sparse_ids = sparse_tensors.id_tensor
+    sparse_weights = sparse_tensors.weight_tensor
+
+    # Create embedding weight, and restore from checkpoint if necessary.
+    embedding_weights = variable_scope.get_variable(
+        name='embedding_weights',
+        shape=(self.categorical_column._num_buckets, self.dimension),  # pylint: disable=protected-access
+        dtype=dtypes.float32,
+        initializer=self.initializer,
+        trainable=self.trainable and trainable,
+        collections=weight_collections)
+    if self.ckpt_to_load_from is not None:
+      to_restore = embedding_weights
+      if isinstance(to_restore, variables.PartitionedVariable):
+        to_restore = to_restore._get_variable_list()  # pylint: disable=protected-access
+      checkpoint_utils.init_from_checkpoint(self.ckpt_to_load_from, {
+          self.tensor_name_in_ckpt: to_restore
+      })
+
+    # Return embedding lookup result.
+    return _safe_embedding_lookup_sparse(
+        embedding_weights=embedding_weights,
+        sparse_ids=sparse_ids,
+        sparse_weights=sparse_weights,
+        combiner=self.combiner,
+        name='%s_weights' % self.name,
+        max_norm=self.max_norm)
+
+
+def _create_tuple(shape, value):
+  """Returns a tuple with given shape and filled with value."""
+  if shape:
+    return tuple([_create_tuple(shape[1:], value) for _ in range(shape[0])])
+  return value
+
+
+def _as_tuple(value):
+  if not nest.is_sequence(value):
+    return value
+  return tuple([_as_tuple(v) for v in value])
+
+
+def _check_shape(shape, key):
+  """Returns shape if it's valid, raises error otherwise."""
+  assert shape is not None
+  if not nest.is_sequence(shape):
+    shape = [shape]
+  shape = tuple(shape)
+  for dimension in shape:
+    if not isinstance(dimension, int):
+      raise TypeError('shape dimensions must be integer. '
+                      'shape: {}, key: {}'.format(shape, key))
+    if dimension < 1:
+      raise ValueError('shape dimensions must be greater than 0. '
+                       'shape: {}, key: {}'.format(shape, key))
+  return shape
+
+
+def _is_shape_and_default_value_compatible(default_value, shape):
+  """Verifies compatibility of shape and default_value."""
+  # Invalid condition:
+  #  * if default_value is not a scalar and shape is empty
+  #  * or if default_value is an iterable and shape is not empty
+  if nest.is_sequence(default_value) != bool(shape):
+    return False
+  if not shape:
+    return True
+  if len(default_value) != shape[0]:
+    return False
+  for i in range(shape[0]):
+    if not _is_shape_and_default_value_compatible(default_value[i], shape[1:]):
+      return False
+  return True
+
+
+def _check_default_value(shape, default_value, dtype, key):
+  """Returns default value as tuple if it's valid, otherwise raises errors.
+
+  This function verifies that `default_value` is compatible with both `shape`
+  and `dtype`. If it is not compatible, it raises an error. If it is compatible,
+  it casts default_value to a tuple and returns it. `key` is used only
+  for error message.
+
+  Args:
+    shape: An iterable of integers specifies the shape of the `Tensor`.
+    default_value: If a single value is provided, the same value will be applied
+      as the default value for every item. If an iterable of values is
+      provided, the shape of the `default_value` should be equal to the given
+      `shape`.
+    dtype: defines the type of values. Default value is `tf.float32`. Must be a
+      non-quantized, real integer or floating point type.
+    key: Column name, used only for error messages.
+
+  Returns:
+    A tuple which will be used as default value.
+
+  Raises:
+    TypeError: if `default_value` is an iterable but not compatible with `shape`
+    TypeError: if `default_value` is not compatible with `dtype`.
+    ValueError: if `dtype` is not convertible to `tf.float32`.
+  """
+  if default_value is None:
+    return None
+
+  if isinstance(default_value, int):
+    return _create_tuple(shape, default_value)
+
+  if isinstance(default_value, float) and dtype.is_floating:
+    return _create_tuple(shape, default_value)
+
+  if callable(getattr(default_value, 'tolist', None)):  # Handles numpy arrays
+    default_value = default_value.tolist()
+
+  if nest.is_sequence(default_value):
+    if not _is_shape_and_default_value_compatible(default_value, shape):
+      raise ValueError(
+          'The shape of default_value must be equal to given shape. '
+          'default_value: {}, shape: {}, key: {}'.format(
+              default_value, shape, key))
+    # Check if the values in the list are all integers or are convertible to
+    # floats.
+    is_list_all_int = all(
+        isinstance(v, int) for v in nest.flatten(default_value))
+    is_list_has_float = any(
+        isinstance(v, float) for v in nest.flatten(default_value))
+    if is_list_all_int:
+      return _as_tuple(default_value)
+    if is_list_has_float and dtype.is_floating:
+      return _as_tuple(default_value)
+  raise TypeError('default_value must be compatible with dtype. '
+                  'default_value: {}, dtype: {}, key: {}'.format(
+                      default_value, dtype, key))
+
+
+class _HashedCategoricalColumn(
+    _CategoricalColumn,
+    collections.namedtuple('_HashedCategoricalColumn',
+                           ['key', 'hash_bucket_size', 'dtype'])):
+  """see `categorical_column_with_hash_bucket`."""
+
+  @property
+  def name(self):
+    return self.key
+
+  @property
+  def _parse_example_spec(self):
+    return {self.key: parsing_ops.VarLenFeature(self.dtype)}
+
+  def _transform_feature(self, inputs):
+    input_tensor = _to_sparse_input(inputs.get(self.key))
+    if not isinstance(input_tensor, sparse_tensor_lib.SparseTensor):
+      raise ValueError('SparseColumn input must be a SparseTensor.')
+
+    _assert_string_or_int(
+        input_tensor.dtype,
+        prefix='column_name: {} input_tensor'.format(self.key))
+
+    if self.dtype.is_integer != input_tensor.dtype.is_integer:
+      raise ValueError(
+          'Column dtype and SparseTensors dtype must be compatible. '
+          'key: {}, column dtype: {}, tensor dtype: {}'.format(
+              self.key, self.dtype, input_tensor.dtype))
+
+    if self.dtype == dtypes.string:
+      sparse_values = input_tensor.values
+    else:
+      sparse_values = string_ops.as_string(input_tensor.values)
+
+    sparse_id_values = string_ops.string_to_hash_bucket_fast(
+        sparse_values, self.hash_bucket_size, name='lookup')
+    return sparse_tensor_lib.SparseTensor(
+        input_tensor.indices, sparse_id_values, input_tensor.dense_shape)
+
+  @property
+  def _num_buckets(self):
+    """Returns number of buckets in this sparse feature."""
+    return self.hash_bucket_size
+
+  def _get_sparse_tensors(self, inputs, weight_collections=None,
+                          trainable=None):
+    return _CategoricalColumn.IdWeightPair(inputs.get(self), None)
+
+
+class _VocabularyFileCategoricalColumn(
+    _CategoricalColumn,
+    collections.namedtuple('_VocabularyFileCategoricalColumn', (
+        'key', 'vocabulary_file', 'vocabulary_size', 'num_oov_buckets', 'dtype',
+        'default_value'
+    ))):
+  """See `categorical_column_with_vocabulary_file`."""
+
+  @property
+  def name(self):
+    return self.key
+
+  @property
+  def _parse_example_spec(self):
+    return {self.key: parsing_ops.VarLenFeature(self.dtype)}
+
+  def _transform_feature(self, inputs):
+    input_tensor = _to_sparse_input(inputs.get(self.key))
+
+    if self.dtype.is_integer != input_tensor.dtype.is_integer:
+      raise ValueError(
+          'Column dtype and SparseTensors dtype must be compatible. '
+          'key: {}, column dtype: {}, tensor dtype: {}'.format(
+              self.key, self.dtype, input_tensor.dtype))
+
+    _assert_string_or_int(
+        input_tensor.dtype,
+        prefix='column_name: {} input_tensor'.format(self.key))
+
+    key_dtype = self.dtype
+    if input_tensor.dtype.is_integer:
+      # `index_table_from_file` requires 64-bit integer keys.
+      key_dtype = dtypes.int64
+      input_tensor = math_ops.to_int64(input_tensor)
+
+    return lookup_ops.index_table_from_file(
+        vocabulary_file=self.vocabulary_file,
+        num_oov_buckets=self.num_oov_buckets,
+        vocab_size=self.vocabulary_size,
+        default_value=self.default_value,
+        key_dtype=key_dtype,
+        name='{}_lookup'.format(self.key)).lookup(input_tensor)
+
+  @property
+  def _num_buckets(self):
+    """Returns number of buckets in this sparse feature."""
+    return self.vocabulary_size + self.num_oov_buckets
+
+  def _get_sparse_tensors(
+      self, inputs, weight_collections=None, trainable=None):
+    return _CategoricalColumn.IdWeightPair(inputs.get(self), None)
+
+
+class _VocabularyListCategoricalColumn(
+    _CategoricalColumn,
+    collections.namedtuple('_VocabularyListCategoricalColumn', (
+        'key', 'vocabulary_list', 'dtype', 'default_value'
+    ))):
+  """See `categorical_column_with_vocabulary_list`."""
+
+  @property
+  def name(self):
+    return self.key
+
+  @property
+  def _parse_example_spec(self):
+    return {self.key: parsing_ops.VarLenFeature(self.dtype)}
+
+  def _transform_feature(self, inputs):
+    input_tensor = _to_sparse_input(inputs.get(self.key))
+
+    if self.dtype.is_integer != input_tensor.dtype.is_integer:
+      raise ValueError(
+          'Column dtype and SparseTensors dtype must be compatible. '
+          'key: {}, column dtype: {}, tensor dtype: {}'.format(
+              self.key, self.dtype, input_tensor.dtype))
+
+    _assert_string_or_int(
+        input_tensor.dtype,
+        prefix='column_name: {} input_tensor'.format(self.key))
+
+    key_dtype = self.dtype
+    if input_tensor.dtype.is_integer:
+      # `index_table_from_tensor` requires 64-bit integer keys.
+      key_dtype = dtypes.int64
+      input_tensor = math_ops.to_int64(input_tensor)
+
+    return lookup_ops.index_table_from_tensor(
+        vocabulary_list=tuple(self.vocabulary_list),
+        default_value=self.default_value,
+        dtype=key_dtype,
+        name='{}_lookup'.format(self.key)).lookup(input_tensor)
+
+  @property
+  def _num_buckets(self):
+    """Returns number of buckets in this sparse feature."""
+    return len(self.vocabulary_list)
+
+  def _get_sparse_tensors(
+      self, inputs, weight_collections=None, trainable=None):
+    return _CategoricalColumn.IdWeightPair(inputs.get(self), None)
+
+
+class _IdentityCategoricalColumn(
+    _CategoricalColumn,
+    collections.namedtuple('_IdentityCategoricalColumn', (
+        'key', 'num_buckets', 'default_value'
+    ))):
+
+  """See `categorical_column_with_identity`."""
+
+  @property
+  def name(self):
+    return self.key
+
+  @property
+  def _parse_example_spec(self):
+    return {self.key: parsing_ops.VarLenFeature(dtypes.int64)}
+
+  def _transform_feature(self, inputs):
+    input_tensor = _to_sparse_input(inputs.get(self.key))
+
+    if not input_tensor.dtype.is_integer:
+      raise ValueError(
+          'Invalid input, not integer. key: {} dtype: {}'.format(
+              self.key, input_tensor.dtype))
+
+    values = math_ops.to_int64(input_tensor.values, name='values')
+    num_buckets = math_ops.to_int64(self.num_buckets, name='num_buckets')
+    zero = math_ops.to_int64(0, name='zero')
+    if self.default_value is None:
+      # Fail if values are out-of-range.
+      assert_less = check_ops.assert_less(
+          values, num_buckets, data=(values, num_buckets),
+          name='assert_less_than_num_buckets')
+      assert_greater = check_ops.assert_greater_equal(
+          values, zero, data=(values,),
+          name='assert_greater_or_equal_0')
+      with ops.control_dependencies((assert_less, assert_greater)):
+        values = array_ops.identity(values)
+    else:
+      # Assign default for out-of-range values.
+      values = array_ops.where(
+          math_ops.logical_or(
+              values < zero, values >= num_buckets, name='out_of_range'),
+          array_ops.fill(
+              dims=array_ops.shape(values),
+              value=math_ops.to_int64(self.default_value),
+              name='default_values'),
+          values)
+
+    return sparse_tensor_lib.SparseTensor(
+        indices=input_tensor.indices,
+        values=values,
+        dense_shape=input_tensor.dense_shape)
+
+  @property
+  def _num_buckets(self):
+    """Returns number of buckets in this sparse feature."""
+    return self.num_buckets
+
+  def _get_sparse_tensors(
+      self, inputs, weight_collections=None, trainable=None):
+    return _CategoricalColumn.IdWeightPair(inputs.get(self), None)
+
+
+class _WeightedCategoricalColumn(
+    _CategoricalColumn,
+    collections.namedtuple('_WeightedCategoricalColumn', (
+        'categorical_column', 'weight_feature_key', 'dtype'
+    ))):
+  """See `weighted_categorical_column`."""
+
+  @property
+  def name(self):
+    return '{}_weighted_by_{}'.format(
+        self.categorical_column.name, self.weight_feature_key)
+
+  @property
+  def _parse_example_spec(self):
+    config = self.categorical_column._parse_example_spec  # pylint: disable=protected-access
+    if self.weight_feature_key in config:
+      raise ValueError('Parse config {} already exists for {}.'.format(
+          config[self.weight_feature_key], self.weight_feature_key))
+    config[self.weight_feature_key] = parsing_ops.VarLenFeature(self.dtype)
+    return config
+
+  @property
+  def _num_buckets(self):
+    return self.categorical_column._num_buckets  # pylint: disable=protected-access
+
+  def _transform_feature(self, inputs):
+    weight_tensor = inputs.get(self.weight_feature_key)
+    if weight_tensor is None:
+      raise ValueError('Missing weights {}.'.format(self.weight_feature_key))
+    weight_tensor = sparse_tensor_lib.convert_to_tensor_or_sparse_tensor(
+        weight_tensor)
+    if self.dtype != weight_tensor.dtype.base_dtype:
+      raise ValueError('Bad dtype, expected {}, but got {}.'.format(
+          self.dtype, weight_tensor.dtype))
+    if not isinstance(weight_tensor, sparse_tensor_lib.SparseTensor):
+      # The weight tensor can be a regular Tensor. In this case, sparsify it.
+      weight_tensor = _to_sparse_input(weight_tensor, ignore_value=0.0)
+    if not weight_tensor.dtype.is_floating:
+      weight_tensor = math_ops.to_float(weight_tensor)
+    return (inputs.get(self.categorical_column), weight_tensor)
+
+  def _get_sparse_tensors(
+      self, inputs, weight_collections=None, trainable=None):
+    del weight_collections
+    del trainable
+    tensors = inputs.get(self)
+    return _CategoricalColumn.IdWeightPair(tensors[0], tensors[1])
+
+
+class _CrossedColumn(
+    _CategoricalColumn,
+    collections.namedtuple('_CrossedColumn',
+                           ['keys', 'hash_bucket_size', 'hash_key'])):
+  """See `crossed_column`."""
+
+  @property
+  def name(self):
+    feature_names = []
+    for key in _collect_leaf_level_keys(self):
+      if isinstance(key, _FeatureColumn):
+        feature_names.append(key.name)
+      else:  # key must be a string
+        feature_names.append(key)
+    return '_X_'.join(sorted(feature_names))
+
+  @property
+  def _parse_example_spec(self):
+    config = {}
+    for key in self.keys:
+      if isinstance(key, _FeatureColumn):
+        config.update(key._parse_example_spec)  # pylint: disable=protected-access
+      else:  # key must be a string
+        config.update({key: parsing_ops.VarLenFeature(dtypes.string)})
+    return config
+
+  def _transform_feature(self, inputs):
+    feature_tensors = []
+    for key in _collect_leaf_level_keys(self):
+      if isinstance(key, six.string_types):
+        feature_tensors.append(inputs.get(key))
+      elif isinstance(key, _CategoricalColumn):
+        ids_and_weights = key._get_sparse_tensors(inputs)  # pylint: disable=protected-access
+        if ids_and_weights.weight_tensor is not None:
+          raise ValueError(
+              'crossed_column does not support weight_tensor, but the given '
+              'column populates weight_tensor. '
+              'Given column: {}'.format(key.name))
+        feature_tensors.append(ids_and_weights.id_tensor)
+      else:
+        raise ValueError('Unsupported column type. Given: {}'.format(key))
+    return sparse_ops._sparse_cross_hashed(  # pylint: disable=protected-access
+        inputs=feature_tensors,
+        num_buckets=self.hash_bucket_size,
+        hash_key=self.hash_key)
+
+  @property
+  def _num_buckets(self):
+    """Returns number of buckets in this sparse feature."""
+    return self.hash_bucket_size
+
+  def _get_sparse_tensors(self, inputs, weight_collections=None,
+                          trainable=None):
+    return _CategoricalColumn.IdWeightPair(inputs.get(self), None)
+
+
+def _collect_leaf_level_keys(cross):
+  """Collects base keys by expanding all nested crosses.
+
+  Args:
+    cross: A `_CrossedColumn`.
+
+  Returns:
+    A list of strings or `_CategoricalColumn` instances.
+  """
+  leaf_level_keys = []
+  for k in cross.keys:
+    if isinstance(k, _CrossedColumn):
+      leaf_level_keys.extend(_collect_leaf_level_keys(k))
+    else:
+      leaf_level_keys.append(k)
+  return leaf_level_keys
+
+
+# TODO(zakaria): Move this to embedding_ops and make it public.
+def _safe_embedding_lookup_sparse(embedding_weights,
+                                  sparse_ids,
+                                  sparse_weights=None,
+                                  combiner='mean',
+                                  default_id=None,
+                                  name=None,
+                                  partition_strategy='div',
+                                  max_norm=None):
+  """Lookup embedding results, accounting for invalid IDs and empty features.
+
+  The partitioned embedding in `embedding_weights` must all be the same shape
+  except for the first dimension. The first dimension is allowed to vary as the
+  vocabulary size is not necessarily a multiple of `P`.  `embedding_weights`
+  may be a `PartitionedVariable` as returned by using `tf.get_variable()` with a
+  partitioner.
+
+  Invalid IDs (< 0) are pruned from input IDs and weights, as well as any IDs
+  with non-positive weight. For an entry with no features, the embedding vector
+  for `default_id` is returned, or the 0-vector if `default_id` is not supplied.
+
+  The ids and weights may be multi-dimensional. Embeddings are always aggregated
+  along the last dimension.
+
+  Args:
+    embedding_weights:  A list of `P` float `Tensor`s or values representing
+        partitioned embedding `Tensor`s.  Alternatively, a `PartitionedVariable`
+        created by partitioning along dimension 0.  The total unpartitioned
+        shape should be `[e_0, e_1, ..., e_m]`, where `e_0` represents the
+        vocab size and `e_1, ..., e_m` are the embedding dimensions.
+    sparse_ids: `SparseTensor` of shape `[d_0, d_1, ..., d_n]` containing the
+        ids. `d_0` is typically batch size.
+    sparse_weights: `SparseTensor` of same shape as `sparse_ids`, containing
+        float weights corresponding to `sparse_ids`, or `None` if all weights
+        are be assumed to be 1.0.
+    combiner: A string specifying how to combine embedding results for each
+        entry. Currently "mean", "sqrtn" and "sum" are supported, with "mean"
+        the default.
+    default_id: The id to use for an entry with no features.
+    name: A name for this operation (optional).
+    partition_strategy: A string specifying the partitioning strategy.
+        Currently `"div"` and `"mod"` are supported. Default is `"div"`.
+    max_norm: If not `None`, all embeddings are l2-normalized to max_norm before
+        combining.
+
+
+  Returns:
+    Dense `Tensor` of shape `[d_0, d_1, ..., d_{n-1}, e_1, ..., e_m]`.
+
+  Raises:
+    ValueError: if `embedding_weights` is empty.
+  """
+  if embedding_weights is None:
+    raise ValueError('Missing embedding_weights %s.' % embedding_weights)
+  if isinstance(embedding_weights, variables.PartitionedVariable):
+    embedding_weights = list(embedding_weights)  # get underlying Variables.
+  if not isinstance(embedding_weights, list):
+    embedding_weights = [embedding_weights]
+  if len(embedding_weights) < 1:
+    raise ValueError('Missing embedding_weights %s.' % embedding_weights)
+
+  dtype = sparse_weights.dtype if sparse_weights is not None else None
+  embedding_weights = [
+      ops.convert_to_tensor(w, dtype=dtype) for w in embedding_weights
+  ]
+
+  with ops.name_scope(name, 'embedding_lookup',
+                      embedding_weights + [sparse_ids,
+                                           sparse_weights]) as scope:
+    # Reshape higher-rank sparse ids and weights to linear segment ids.
+    original_shape = sparse_ids.dense_shape
+    original_rank_dim = sparse_ids.dense_shape.get_shape()[0]
+    original_rank = (
+        array_ops.size(original_shape)
+        if original_rank_dim.value is None
+        else original_rank_dim.value)
+    sparse_ids = sparse_ops.sparse_reshape(sparse_ids, [
+        math_ops.reduce_prod(
+            array_ops.slice(original_shape, [0], [original_rank - 1])),
+        array_ops.gather(original_shape, original_rank - 1)])
+    if sparse_weights is not None:
+      sparse_weights = sparse_tensor_lib.SparseTensor(
+          sparse_ids.indices,
+          sparse_weights.values, sparse_ids.dense_shape)
+
+    # Prune invalid ids and weights.
+    sparse_ids, sparse_weights = _prune_invalid_ids(sparse_ids, sparse_weights)
+
+    # Fill in dummy values for empty features, if necessary.
+    sparse_ids, is_row_empty = sparse_ops.sparse_fill_empty_rows(sparse_ids,
+                                                                 default_id or
+                                                                 0)
+    if sparse_weights is not None:
+      sparse_weights, _ = sparse_ops.sparse_fill_empty_rows(sparse_weights, 1.0)
+
+    result = embedding_ops.embedding_lookup_sparse(
+        embedding_weights,
+        sparse_ids,
+        sparse_weights,
+        combiner=combiner,
+        partition_strategy=partition_strategy,
+        name=None if default_id is None else scope,
+        max_norm=max_norm)
+
+    if default_id is None:
+      # Broadcast is_row_empty to the same shape as embedding_lookup_result,
+      # for use in Select.
+      is_row_empty = array_ops.tile(
+          array_ops.reshape(is_row_empty, [-1, 1]),
+          array_ops.stack([1, array_ops.shape(result)[1]]))
+
+      result = array_ops.where(is_row_empty,
+                               array_ops.zeros_like(result),
+                               result,
+                               name=scope)
+
+    # Reshape back from linear ids back into higher-dimensional dense result.
+    final_result = array_ops.reshape(
+        result,
+        array_ops.concat([
+            array_ops.slice(
+                math_ops.cast(original_shape, dtypes.int32), [0],
+                [original_rank - 1]),
+            array_ops.slice(array_ops.shape(result), [1], [-1])
+        ], 0))
+    final_result.set_shape(tensor_shape.unknown_shape(
+        (original_rank_dim - 1).value).concatenate(result.get_shape()[1:]))
+    return final_result
+
+
+def _prune_invalid_ids(sparse_ids, sparse_weights):
+  """Prune invalid IDs (< 0) from the input ids and weights."""
+  is_id_valid = math_ops.greater_equal(sparse_ids.values, 0)
+  if sparse_weights is not None:
+    is_id_valid = math_ops.logical_and(
+        is_id_valid, math_ops.greater(sparse_weights.values, 0))
+  sparse_ids = sparse_ops.sparse_retain(sparse_ids, is_id_valid)
+  if sparse_weights is not None:
+    sparse_weights = sparse_ops.sparse_retain(sparse_weights, is_id_valid)
+  return sparse_ids, sparse_weights
+
+
+class _IndicatorColumn(_DenseColumn,
+                       collections.namedtuple('_IndicatorColumn',
+                                              ['categorical_column'])):
+  """Represents a one-hot column for use in deep networks.
+
+  Args:
+    categorical_column: A `_CategoricalColumn` which is created by
+      `categorical_column_with_*` function.
+  """
+
+  @property
+  def name(self):
+    return '{}_indicator'.format(self.categorical_column.name)
+
+  def _transform_feature(self, inputs):
+    """Returns dense `Tensor` representing feature.
+
+    Args:
+      inputs: A `_LazyBuilder` object to access inputs.
+
+    Returns:
+      Transformed feature `Tensor`.
+    """
+    id_weight_pair = self.categorical_column._get_sparse_tensors(inputs)  # pylint: disable=protected-access
+    id_tensor = id_weight_pair.id_tensor
+    weight_tensor = id_weight_pair.weight_tensor
+
+    # If the underlying column is weighted, return the input as a dense tensor.
+    if weight_tensor is not None:
+      weighted_column = sparse_ops.sparse_merge(
+          sp_ids=id_tensor,
+          sp_values=weight_tensor,
+          vocab_size=self._variable_shape[-1])
+      return sparse_ops.sparse_tensor_to_dense(weighted_column)
+
+    dense_id_tensor = sparse_ops.sparse_tensor_to_dense(
+        id_tensor, default_value=-1)
+
+    # One hot must be float for tf.concat reasons since all other inputs to
+    # input_layer are float32.
+    one_hot_id_tensor = array_ops.one_hot(
+        dense_id_tensor,
+        depth=self._variable_shape[-1],
+        on_value=1.0,
+        off_value=0.0)
+
+    # Reduce to get a multi-hot per example.
+    return math_ops.reduce_sum(one_hot_id_tensor, axis=[1])
+
+  @property
+  def _parse_example_spec(self):
+    return self.categorical_column._parse_example_spec  # pylint: disable=protected-access
+
+  @property
+  def _variable_shape(self):
+    """Returns a `TensorShape` representing the shape of the dense `Tensor`."""
+    return tensor_shape.TensorShape([1, self.categorical_column._num_buckets])  # pylint: disable=protected-access
+
+  def _get_dense_tensor(self, inputs, weight_collections=None, trainable=None):
+    """Returns dense `Tensor` representing feature.
+
+    Args:
+      inputs: A `_LazyBuilder` object to access inputs.
+      weight_collections: Unused `weight_collections` since no variables are
+        created in this function.
+      trainable: Unused `trainable` bool since no variables are created in
+        this function.
+
+    Returns:
+      Dense `Tensor` created within `_transform_feature`.
+    """
+    # Do nothing with weight_collections and trainable since no variables are
+    # created in this function.
+    del weight_collections
+    del trainable
+    # Feature has been already transformed. Return the intermediate
+    # representation created by _transform_feature.
+    return inputs.get(self)
+
+
+def _verify_static_batch_size_equality(tensors, columns):
+  # bath_size is a tf.Dimension object.
+  expected_batch_size = None
+  for i in range(0, len(tensors)):
+    if tensors[i].shape[0].value is not None:
+      if expected_batch_size is None:
+        bath_size_column_index = i
+        expected_batch_size = tensors[i].shape[0]
+      elif not expected_batch_size.is_compatible_with(tensors[i].shape[0]):
+        raise ValueError(
+            'Batch size (first dimension) of each feature must be same. '
+            'Batch size of columns ({}, {}): ({}, {})'.format(
+                columns[bath_size_column_index].name, columns[i].name,
+                expected_batch_size, tensors[i].shape[0]))
diff --git a/tensorflow/python/feature_column/feature_column_lib.py b/tensorflow/python/feature_column/feature_column_lib.py
new file mode 100644
index 0000000000000000000000000000000000000000..8a57986764f9f5e2cff788817cc7706089dc73b0
--- /dev/null
+++ b/tensorflow/python/feature_column/feature_column_lib.py
@@ -0,0 +1,43 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""FeatureColumns: tools for ingesting and representing features."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+# pylint: disable=unused-import,line-too-long,wildcard-import
+from tensorflow.python.feature_column.feature_column import *
+
+from tensorflow.python.util.all_util import remove_undocumented
+# pylint: enable=unused-import,line-too-long
+
+_allowed_symbols = [
+    'input_layer',
+    'linear_model',
+    'make_parse_example_spec',
+    'embedding_column',
+    'crossed_column',
+    'numeric_column',
+    'bucketized_column',
+    'categorical_column_with_hash_bucket',
+    'categorical_column_with_vocabulary_file',
+    'categorical_column_with_vocabulary_list',
+    'categorical_column_with_identity',
+    'weighted_categorical_column',
+    'indicator_column',
+]
+
+remove_undocumented(__name__, allowed_exception_list=_allowed_symbols)
diff --git a/tensorflow/python/feature_column/feature_column_test.py b/tensorflow/python/feature_column/feature_column_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..83250b5d1db4912c41a46a96497c75b7729bfb4e
--- /dev/null
+++ b/tensorflow/python/feature_column/feature_column_test.py
@@ -0,0 +1,3765 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for feature_column."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import collections
+import copy
+
+import numpy as np
+
+from tensorflow.core.example import example_pb2
+from tensorflow.core.example import feature_pb2
+from tensorflow.python.client import session
+from tensorflow.python.feature_column import feature_column_lib as fc
+from tensorflow.python.feature_column.feature_column import _CategoricalColumn
+from tensorflow.python.feature_column.feature_column import _DenseColumn
+from tensorflow.python.feature_column.feature_column import _FeatureColumn
+from tensorflow.python.feature_column.feature_column import _LazyBuilder
+from tensorflow.python.feature_column.feature_column import _transform_features
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import errors
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import sparse_tensor
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import lookup_ops
+from tensorflow.python.ops import parsing_ops
+from tensorflow.python.ops import variable_scope
+from tensorflow.python.ops import variables as variables_lib
+from tensorflow.python.platform import test
+
+
+def _initialized_session():
+  sess = session.Session()
+  sess.run(variables_lib.global_variables_initializer())
+  sess.run(lookup_ops.tables_initializer())
+  return sess
+
+
+class LazyColumnTest(test.TestCase):
+
+  def test_transormations_called_once(self):
+
+    class TransformCounter(_FeatureColumn):
+
+      def __init__(self):
+        self.num_transform = 0
+
+      @property
+      def name(self):
+        return 'TransformCounter'
+
+      def _transform_feature(self, cache):
+        self.num_transform += 1  # Count transform calls.
+        return cache.get('a')
+
+      @property
+      def _parse_example_spec(self):
+        pass
+
+    builder = _LazyBuilder(features={'a': [[2], [3.]]})
+    column = TransformCounter()
+    self.assertEqual(0, column.num_transform)
+    builder.get(column)
+    self.assertEqual(1, column.num_transform)
+    builder.get(column)
+    self.assertEqual(1, column.num_transform)
+
+  def test_returns_transform_output(self):
+
+    class Transformer(_FeatureColumn):
+
+      @property
+      def name(self):
+        return 'Transformer'
+
+      def _transform_feature(self, cache):
+        return 'Output'
+
+      @property
+      def _parse_example_spec(self):
+        pass
+
+    builder = _LazyBuilder(features={'a': [[2], [3.]]})
+    column = Transformer()
+    self.assertEqual('Output', builder.get(column))
+    self.assertEqual('Output', builder.get(column))
+
+  def test_does_not_pollute_given_features_dict(self):
+
+    class Transformer(_FeatureColumn):
+
+      @property
+      def name(self):
+        return 'Transformer'
+
+      def _transform_feature(self, cache):
+        return 'Output'
+
+      @property
+      def _parse_example_spec(self):
+        pass
+
+    features = {'a': [[2], [3.]]}
+    builder = _LazyBuilder(features=features)
+    builder.get(Transformer())
+    self.assertEqual(['a'], list(features.keys()))
+
+  def test_error_if_feature_is_not_found(self):
+    builder = _LazyBuilder(features={'a': [[2], [3.]]})
+    with self.assertRaisesRegexp(ValueError,
+                                 'bbb is not in features dictionary'):
+      builder.get('bbb')
+
+  def test_not_supported_feature_column(self):
+
+    class NotAProperColumn(_FeatureColumn):
+
+      @property
+      def name(self):
+        return 'NotAProperColumn'
+
+      def _transform_feature(self, cache):
+        # It should return not None.
+        pass
+
+      @property
+      def _parse_example_spec(self):
+        pass
+
+    builder = _LazyBuilder(features={'a': [[2], [3.]]})
+    with self.assertRaisesRegexp(ValueError,
+                                 'NotAProperColumn is not supported'):
+      builder.get(NotAProperColumn())
+
+  def test_key_should_be_string_or_feature_colum(self):
+
+    class NotAFeatureColumn(object):
+      pass
+
+    builder = _LazyBuilder(features={'a': [[2], [3.]]})
+    with self.assertRaisesRegexp(
+        TypeError, '"key" must be either a "str" or "_FeatureColumn".'):
+      builder.get(NotAFeatureColumn())
+
+
+class NumericColumnTest(test.TestCase):
+
+  def test_defaults(self):
+    a = fc.numeric_column('aaa')
+    self.assertEqual('aaa', a.key)
+    self.assertEqual((1,), a.shape)
+    self.assertIsNone(a.default_value)
+    self.assertEqual(dtypes.float32, a.dtype)
+    self.assertIsNone(a.normalizer_fn)
+
+  def test_shape_saved_as_tuple(self):
+    a = fc.numeric_column('aaa', shape=[1, 2], default_value=[[3, 2.]])
+    self.assertEqual((1, 2), a.shape)
+
+  def test_default_value_saved_as_tuple(self):
+    a = fc.numeric_column('aaa', default_value=4.)
+    self.assertEqual((4.,), a.default_value)
+    a = fc.numeric_column('aaa', shape=[1, 2], default_value=[[3, 2.]])
+    self.assertEqual(((3., 2.),), a.default_value)
+
+  def test_shape_and_default_value_compatibility(self):
+    fc.numeric_column('aaa', shape=[2], default_value=[1, 2.])
+    with self.assertRaisesRegexp(ValueError, 'The shape of default_value'):
+      fc.numeric_column('aaa', shape=[2], default_value=[1, 2, 3.])
+    fc.numeric_column(
+        'aaa', shape=[3, 2], default_value=[[2, 3], [1, 2], [2, 3.]])
+    with self.assertRaisesRegexp(ValueError, 'The shape of default_value'):
+      fc.numeric_column(
+          'aaa', shape=[3, 1], default_value=[[2, 3], [1, 2], [2, 3.]])
+    with self.assertRaisesRegexp(ValueError, 'The shape of default_value'):
+      fc.numeric_column(
+          'aaa', shape=[3, 3], default_value=[[2, 3], [1, 2], [2, 3.]])
+
+  def test_default_value_type_check(self):
+    fc.numeric_column(
+        'aaa', shape=[2], default_value=[1, 2.], dtype=dtypes.float32)
+    fc.numeric_column(
+        'aaa', shape=[2], default_value=[1, 2], dtype=dtypes.int32)
+    with self.assertRaisesRegexp(TypeError, 'must be compatible with dtype'):
+      fc.numeric_column(
+          'aaa', shape=[2], default_value=[1, 2.], dtype=dtypes.int32)
+    with self.assertRaisesRegexp(TypeError,
+                                 'default_value must be compatible with dtype'):
+      fc.numeric_column('aaa', default_value=['string'])
+
+  def test_shape_must_be_positive_integer(self):
+    with self.assertRaisesRegexp(TypeError, 'shape dimensions must be integer'):
+      fc.numeric_column(
+          'aaa', shape=[
+              1.0,
+          ])
+
+    with self.assertRaisesRegexp(ValueError,
+                                 'shape dimensions must be greater than 0'):
+      fc.numeric_column(
+          'aaa', shape=[
+              0,
+          ])
+
+  def test_dtype_is_convertable_to_float(self):
+    with self.assertRaisesRegexp(ValueError,
+                                 'dtype must be convertible to float'):
+      fc.numeric_column('aaa', dtype=dtypes.string)
+
+  def test_scalar_deafult_value_fills_the_shape(self):
+    a = fc.numeric_column('aaa', shape=[2, 3], default_value=2.)
+    self.assertEqual(((2., 2., 2.), (2., 2., 2.)), a.default_value)
+
+  def test_parse_spec(self):
+    a = fc.numeric_column('aaa', shape=[2, 3], dtype=dtypes.int32)
+    self.assertEqual({
+        'aaa': parsing_ops.FixedLenFeature((2, 3), dtype=dtypes.int32)
+    }, a._parse_example_spec)
+
+  def test_parse_example_no_default_value(self):
+    price = fc.numeric_column('price', shape=[2])
+    data = example_pb2.Example(features=feature_pb2.Features(
+        feature={
+            'price':
+                feature_pb2.Feature(float_list=feature_pb2.FloatList(
+                    value=[20., 110.]))
+        }))
+    features = parsing_ops.parse_example(
+        serialized=[data.SerializeToString()],
+        features=fc.make_parse_example_spec([price]))
+    self.assertIn('price', features)
+    with self.test_session():
+      self.assertAllEqual([[20., 110.]], features['price'].eval())
+
+  def test_parse_example_with_default_value(self):
+    price = fc.numeric_column('price', shape=[2], default_value=11.)
+    data = example_pb2.Example(features=feature_pb2.Features(
+        feature={
+            'price':
+                feature_pb2.Feature(float_list=feature_pb2.FloatList(
+                    value=[20., 110.]))
+        }))
+    no_data = example_pb2.Example(features=feature_pb2.Features(
+        feature={
+            'something_else':
+                feature_pb2.Feature(float_list=feature_pb2.FloatList(
+                    value=[20., 110.]))
+        }))
+    features = parsing_ops.parse_example(
+        serialized=[data.SerializeToString(),
+                    no_data.SerializeToString()],
+        features=fc.make_parse_example_spec([price]))
+    self.assertIn('price', features)
+    with self.test_session():
+      self.assertAllEqual([[20., 110.], [11., 11.]], features['price'].eval())
+
+  def test_normalizer_fn_must_be_callable(self):
+    with self.assertRaisesRegexp(TypeError, 'must be a callable'):
+      fc.numeric_column('price', normalizer_fn='NotACallable')
+
+  def test_normalizer_fn_transform_feature(self):
+
+    def _increment_two(input_tensor):
+      return input_tensor + 2.
+
+    price = fc.numeric_column('price', shape=[2], normalizer_fn=_increment_two)
+    output = _transform_features({'price': [[1., 2.], [5., 6.]]}, [price])
+    with self.test_session():
+      self.assertAllEqual([[3., 4.], [7., 8.]], output[price].eval())
+
+  def test_get_dense_tensor(self):
+
+    def _increment_two(input_tensor):
+      return input_tensor + 2.
+
+    price = fc.numeric_column('price', shape=[2], normalizer_fn=_increment_two)
+    builder = _LazyBuilder({'price': [[1., 2.], [5., 6.]]})
+    self.assertEqual(builder.get(price), price._get_dense_tensor(builder))
+
+  def test_sparse_tensor_not_supported(self):
+    price = fc.numeric_column('price')
+    builder = _LazyBuilder({
+        'price':
+            sparse_tensor.SparseTensor(
+                indices=[[0, 0]], values=[0.3], dense_shape=[1, 1])
+    })
+    with self.assertRaisesRegexp(ValueError, 'must be a Tensor'):
+      price._transform_feature(builder)
+
+  def test_deep_copy(self):
+    a = fc.numeric_column('aaa', shape=[1, 2], default_value=[[3., 2.]])
+    a_copy = copy.deepcopy(a)
+    self.assertEqual(a_copy.name, 'aaa')
+    self.assertEqual(a_copy.shape, (1, 2))
+    self.assertEqual(a_copy.default_value, ((3., 2.),))
+
+  def test_numpy_default_value(self):
+    a = fc.numeric_column(
+        'aaa', shape=[1, 2], default_value=np.array([[3., 2.]]))
+    self.assertEqual(a.default_value, ((3., 2.),))
+
+  def test_linear_model(self):
+    price = fc.numeric_column('price')
+    with ops.Graph().as_default():
+      features = {'price': [[1.], [5.]]}
+      predictions = fc.linear_model(features, [price])
+      bias = get_linear_model_bias()
+      price_var = get_linear_model_column_var(price)
+      with _initialized_session() as sess:
+        self.assertAllClose([0.], bias.eval())
+        self.assertAllClose([[0.]], price_var.eval())
+        self.assertAllClose([[0.], [0.]], predictions.eval())
+        sess.run(price_var.assign([[10.]]))
+        self.assertAllClose([[10.], [50.]], predictions.eval())
+
+
+class BucketizedColumnTest(test.TestCase):
+
+  def test_invalid_source_column_type(self):
+    a = fc.categorical_column_with_hash_bucket('aaa', hash_bucket_size=10)
+    with self.assertRaisesRegexp(
+        ValueError,
+        'source_column must be a column generated with numeric_column'):
+      fc.bucketized_column(a, boundaries=[0, 1])
+
+  def test_invalid_source_column_shape(self):
+    a = fc.numeric_column('aaa', shape=[2, 3])
+    with self.assertRaisesRegexp(
+        ValueError, 'source_column must be one-dimensional column'):
+      fc.bucketized_column(a, boundaries=[0, 1])
+
+  def test_invalid_boundaries(self):
+    a = fc.numeric_column('aaa')
+    with self.assertRaisesRegexp(
+        ValueError, 'boundaries must be a sorted list'):
+      fc.bucketized_column(a, boundaries=None)
+    with self.assertRaisesRegexp(
+        ValueError, 'boundaries must be a sorted list'):
+      fc.bucketized_column(a, boundaries=1.)
+    with self.assertRaisesRegexp(
+        ValueError, 'boundaries must be a sorted list'):
+      fc.bucketized_column(a, boundaries=[1, 0])
+    with self.assertRaisesRegexp(
+        ValueError, 'boundaries must be a sorted list'):
+      fc.bucketized_column(a, boundaries=[1, 1])
+
+  def test_name(self):
+    a = fc.numeric_column('aaa', dtype=dtypes.int32)
+    b = fc.bucketized_column(a, boundaries=[0, 1])
+    self.assertEqual('aaa_bucketized', b.name)
+
+  def test_parse_spec(self):
+    a = fc.numeric_column('aaa', shape=[2], dtype=dtypes.int32)
+    b = fc.bucketized_column(a, boundaries=[0, 1])
+    self.assertEqual({
+        'aaa': parsing_ops.FixedLenFeature((2,), dtype=dtypes.int32)
+    }, b._parse_example_spec)
+
+  def test_variable_shape(self):
+    a = fc.numeric_column('aaa', shape=[2], dtype=dtypes.int32)
+    b = fc.bucketized_column(a, boundaries=[0, 1])
+    # Column 'aaa` has shape [2] times three buckets -> variable_shape=[2, 3].
+    self.assertAllEqual((2, 3), b._variable_shape)
+
+  def test_num_buckets(self):
+    a = fc.numeric_column('aaa', shape=[2], dtype=dtypes.int32)
+    b = fc.bucketized_column(a, boundaries=[0, 1])
+    # Column 'aaa` has shape [2] times three buckets -> num_buckets=6.
+    self.assertEqual(6, b._num_buckets)
+
+  def test_parse_example(self):
+    price = fc.numeric_column('price', shape=[2])
+    bucketized_price = fc.bucketized_column(price, boundaries=[0, 50])
+    data = example_pb2.Example(features=feature_pb2.Features(
+        feature={
+            'price':
+                feature_pb2.Feature(float_list=feature_pb2.FloatList(
+                    value=[20., 110.]))
+        }))
+    features = parsing_ops.parse_example(
+        serialized=[data.SerializeToString()],
+        features=fc.make_parse_example_spec([bucketized_price]))
+    self.assertIn('price', features)
+    with self.test_session():
+      self.assertAllEqual([[20., 110.]], features['price'].eval())
+
+  def test_transform_feature(self):
+    price = fc.numeric_column('price', shape=[2])
+    bucketized_price = fc.bucketized_column(price, boundaries=[0, 2, 4, 6])
+    with ops.Graph().as_default():
+      transformed_tensor = _transform_features({
+          'price': [[-1., 1.], [5., 6.]]
+      }, [bucketized_price])
+      with _initialized_session():
+        self.assertAllEqual([[0, 1], [3, 4]],
+                            transformed_tensor[bucketized_price].eval())
+
+  def test_get_dense_tensor_one_input_value(self):
+    """Tests _get_dense_tensor() for input with shape=[1]."""
+    price = fc.numeric_column('price', shape=[1])
+    bucketized_price = fc.bucketized_column(price, boundaries=[0, 2, 4, 6])
+    with ops.Graph().as_default():
+      builder = _LazyBuilder({'price': [[-1.], [1.], [5.], [6.]]})
+      with _initialized_session():
+        bucketized_price_tensor = bucketized_price._get_dense_tensor(builder)
+        self.assertAllClose(
+            # One-hot tensor.
+            [[[1., 0., 0., 0., 0.]],
+             [[0., 1., 0., 0., 0.]],
+             [[0., 0., 0., 1., 0.]],
+             [[0., 0., 0., 0., 1.]]],
+            bucketized_price_tensor.eval())
+
+  def test_get_dense_tensor_two_input_values(self):
+    """Tests _get_dense_tensor() for input with shape=[2]."""
+    price = fc.numeric_column('price', shape=[2])
+    bucketized_price = fc.bucketized_column(price, boundaries=[0, 2, 4, 6])
+    with ops.Graph().as_default():
+      builder = _LazyBuilder({'price': [[-1., 1.], [5., 6.]]})
+      with _initialized_session():
+        bucketized_price_tensor = bucketized_price._get_dense_tensor(builder)
+        self.assertAllClose(
+            # One-hot tensor.
+            [[[1., 0., 0., 0., 0.], [0., 1., 0., 0., 0.]],
+             [[0., 0., 0., 1., 0.], [0., 0., 0., 0., 1.]]],
+            bucketized_price_tensor.eval())
+
+  def test_get_sparse_tensors_one_input_value(self):
+    """Tests _get_sparse_tensors() for input with shape=[1]."""
+    price = fc.numeric_column('price', shape=[1])
+    bucketized_price = fc.bucketized_column(price, boundaries=[0, 2, 4, 6])
+    with ops.Graph().as_default():
+      builder = _LazyBuilder({'price': [[-1.], [1.], [5.], [6.]]})
+      with _initialized_session() as sess:
+        id_weight_pair = bucketized_price._get_sparse_tensors(builder)
+        self.assertIsNone(id_weight_pair.weight_tensor)
+        id_tensor_value = sess.run(id_weight_pair.id_tensor)
+        self.assertAllEqual(
+            [[0, 0], [1, 0], [2, 0], [3, 0]], id_tensor_value.indices)
+        self.assertAllEqual([0, 1, 3, 4], id_tensor_value.values)
+        self.assertAllEqual([4, 1], id_tensor_value.dense_shape)
+
+  def test_get_sparse_tensors_two_input_values(self):
+    """Tests _get_sparse_tensors() for input with shape=[2]."""
+    price = fc.numeric_column('price', shape=[2])
+    bucketized_price = fc.bucketized_column(price, boundaries=[0, 2, 4, 6])
+    with ops.Graph().as_default():
+      builder = _LazyBuilder({'price': [[-1., 1.], [5., 6.]]})
+      with _initialized_session() as sess:
+        id_weight_pair = bucketized_price._get_sparse_tensors(builder)
+        self.assertIsNone(id_weight_pair.weight_tensor)
+        id_tensor_value = sess.run(id_weight_pair.id_tensor)
+        self.assertAllEqual(
+            [[0, 0], [0, 1], [1, 0], [1, 1]], id_tensor_value.indices)
+        # Values 0-4 correspond to the first column of the input price.
+        # Values 5-9 correspond to the second column of the input price.
+        self.assertAllEqual([0, 6, 3, 9], id_tensor_value.values)
+        self.assertAllEqual([2, 2], id_tensor_value.dense_shape)
+
+  def test_sparse_tensor_input_not_supported(self):
+    price = fc.numeric_column('price')
+    bucketized_price = fc.bucketized_column(price, boundaries=[0, 1])
+    builder = _LazyBuilder({
+        'price':
+            sparse_tensor.SparseTensor(
+                indices=[[0, 0]], values=[0.3], dense_shape=[1, 1])
+    })
+    with self.assertRaisesRegexp(ValueError, 'must be a Tensor'):
+      bucketized_price._transform_feature(builder)
+
+  def test_deep_copy(self):
+    a = fc.numeric_column('aaa', shape=[2])
+    a_bucketized = fc.bucketized_column(a, boundaries=[0, 1])
+    a_bucketized_copy = copy.deepcopy(a_bucketized)
+    self.assertEqual(a_bucketized_copy.name, 'aaa_bucketized')
+    self.assertAllEqual(a_bucketized_copy._variable_shape, (2, 3))
+    self.assertEqual(a_bucketized_copy.boundaries, (0, 1))
+
+  def test_linear_model_one_input_value(self):
+    """Tests linear_model() for input with shape=[1]."""
+    price = fc.numeric_column('price', shape=[1])
+    bucketized_price = fc.bucketized_column(price, boundaries=[0, 2, 4, 6])
+    with ops.Graph().as_default():
+      features = {'price': [[-1.], [1.], [5.], [6.]]}
+      predictions = fc.linear_model(features, [bucketized_price])
+      bias = get_linear_model_bias()
+      bucketized_price_var = get_linear_model_column_var(bucketized_price)
+      with _initialized_session() as sess:
+        self.assertAllClose([0.], bias.eval())
+        # One weight variable per bucket, all initialized to zero.
+        self.assertAllClose(
+            [[0.], [0.], [0.], [0.], [0.]], bucketized_price_var.eval())
+        self.assertAllClose([[0.], [0.], [0.], [0.]], predictions.eval())
+        sess.run(bucketized_price_var.assign(
+            [[10.], [20.], [30.], [40.], [50.]]))
+        # price -1. is in the 0th bucket, whose weight is 10.
+        # price 1. is in the 1st bucket, whose weight is 20.
+        # price 5. is in the 3rd bucket, whose weight is 40.
+        # price 6. is in the 4th bucket, whose weight is 50.
+        self.assertAllClose([[10.], [20.], [40.], [50.]], predictions.eval())
+        sess.run(bias.assign([1.]))
+        self.assertAllClose([[11.], [21.], [41.], [51.]], predictions.eval())
+
+  def test_linear_model_two_input_values(self):
+    """Tests linear_model() for input with shape=[2]."""
+    price = fc.numeric_column('price', shape=[2])
+    bucketized_price = fc.bucketized_column(price, boundaries=[0, 2, 4, 6])
+    with ops.Graph().as_default():
+      features = {'price': [[-1., 1.], [5., 6.]]}
+      predictions = fc.linear_model(features, [bucketized_price])
+      bias = get_linear_model_bias()
+      bucketized_price_var = get_linear_model_column_var(bucketized_price)
+      with _initialized_session() as sess:
+        self.assertAllClose([0.], bias.eval())
+        # One weight per bucket per input column, all initialized to zero.
+        self.assertAllClose(
+            [[0.], [0.], [0.], [0.], [0.], [0.], [0.], [0.], [0.], [0.]],
+            bucketized_price_var.eval())
+        self.assertAllClose([[0.], [0.]], predictions.eval())
+        sess.run(bucketized_price_var.assign(
+            [[10.], [20.], [30.], [40.], [50.],
+             [60.], [70.], [80.], [90.], [100.]]))
+        # 1st example:
+        #   price -1. is in the 0th bucket, whose weight is 10.
+        #   price 1. is in the 6th bucket, whose weight is 70.
+        # 2nd example:
+        #   price 5. is in the 3rd bucket, whose weight is 40.
+        #   price 6. is in the 9th bucket, whose weight is 100.
+        self.assertAllClose([[80.], [140.]], predictions.eval())
+        sess.run(bias.assign([1.]))
+        self.assertAllClose([[81.], [141.]], predictions.eval())
+
+
+class HashedCategoricalColumnTest(test.TestCase):
+
+  def test_defaults(self):
+    a = fc.categorical_column_with_hash_bucket('aaa', 10)
+    self.assertEqual('aaa', a.name)
+    self.assertEqual('aaa', a.key)
+    self.assertEqual(10, a.hash_bucket_size)
+    self.assertEqual(dtypes.string, a.dtype)
+
+  def test_bucket_size_should_be_given(self):
+    with self.assertRaisesRegexp(ValueError, 'hash_bucket_size must be set.'):
+      fc.categorical_column_with_hash_bucket('aaa', None)
+
+  def test_bucket_size_should_be_positive(self):
+    with self.assertRaisesRegexp(ValueError,
+                                 'hash_bucket_size must be at least 1'):
+      fc.categorical_column_with_hash_bucket('aaa', 0)
+
+  def test_dtype_should_be_string_or_integer(self):
+    fc.categorical_column_with_hash_bucket('aaa', 10, dtype=dtypes.string)
+    fc.categorical_column_with_hash_bucket('aaa', 10, dtype=dtypes.int32)
+    with self.assertRaisesRegexp(ValueError, 'dtype must be string or integer'):
+      fc.categorical_column_with_hash_bucket('aaa', 10, dtype=dtypes.float32)
+
+  def test_deep_copy(self):
+    original = fc.categorical_column_with_hash_bucket('aaa', 10)
+    for column in (original, copy.deepcopy(original)):
+      self.assertEqual('aaa', column.name)
+      self.assertEqual(10, column.hash_bucket_size)
+      self.assertEqual(10, column._num_buckets)
+      self.assertEqual(dtypes.string, column.dtype)
+
+  def test_parse_spec_string(self):
+    a = fc.categorical_column_with_hash_bucket('aaa', 10)
+    self.assertEqual({
+        'aaa': parsing_ops.VarLenFeature(dtypes.string)
+    }, a._parse_example_spec)
+
+  def test_parse_spec_int(self):
+    a = fc.categorical_column_with_hash_bucket('aaa', 10, dtype=dtypes.int32)
+    self.assertEqual({
+        'aaa': parsing_ops.VarLenFeature(dtypes.int32)
+    }, a._parse_example_spec)
+
+  def test_parse_example(self):
+    a = fc.categorical_column_with_hash_bucket('aaa', 10)
+    data = example_pb2.Example(features=feature_pb2.Features(
+        feature={
+            'aaa':
+                feature_pb2.Feature(bytes_list=feature_pb2.BytesList(
+                    value=[b'omar', b'stringer']))
+        }))
+    features = parsing_ops.parse_example(
+        serialized=[data.SerializeToString()],
+        features=fc.make_parse_example_spec([a]))
+    self.assertIn('aaa', features)
+    with self.test_session():
+      _assert_sparse_tensor_value(
+          self,
+          sparse_tensor.SparseTensorValue(
+              indices=[[0, 0], [0, 1]],
+              values=np.array([b'omar', b'stringer'], dtype=np.object_),
+              dense_shape=[1, 2]),
+          features['aaa'].eval())
+
+  def test_strings_should_be_hashed(self):
+    hashed_sparse = fc.categorical_column_with_hash_bucket('wire', 10)
+    wire_tensor = sparse_tensor.SparseTensor(
+        values=['omar', 'stringer', 'marlo'],
+        indices=[[0, 0], [1, 0], [1, 1]],
+        dense_shape=[2, 2])
+    outputs = _transform_features({'wire': wire_tensor}, [hashed_sparse])
+    output = outputs[hashed_sparse]
+    # Check exact hashed output. If hashing changes this test will break.
+    expected_values = [6, 4, 1]
+    with self.test_session():
+      self.assertEqual(dtypes.int64, output.values.dtype)
+      self.assertAllEqual(expected_values, output.values.eval())
+      self.assertAllEqual(wire_tensor.indices.eval(), output.indices.eval())
+      self.assertAllEqual(wire_tensor.dense_shape.eval(),
+                          output.dense_shape.eval())
+
+  def test_tensor_dtype_should_be_string_or_integer(self):
+    string_fc = fc.categorical_column_with_hash_bucket(
+        'a_string', 10, dtype=dtypes.string)
+    int_fc = fc.categorical_column_with_hash_bucket(
+        'a_int', 10, dtype=dtypes.int32)
+    float_fc = fc.categorical_column_with_hash_bucket(
+        'a_float', 10, dtype=dtypes.string)
+    int_tensor = sparse_tensor.SparseTensor(
+        values=[101],
+        indices=[[0, 0]],
+        dense_shape=[1, 1])
+    string_tensor = sparse_tensor.SparseTensor(
+        values=['101'],
+        indices=[[0, 0]],
+        dense_shape=[1, 1])
+    float_tensor = sparse_tensor.SparseTensor(
+        values=[101.],
+        indices=[[0, 0]],
+        dense_shape=[1, 1])
+    builder = _LazyBuilder({
+        'a_int': int_tensor,
+        'a_string': string_tensor,
+        'a_float': float_tensor
+    })
+    builder.get(string_fc)
+    builder.get(int_fc)
+    with self.assertRaisesRegexp(ValueError, 'dtype must be string or integer'):
+      builder.get(float_fc)
+
+  def test_dtype_should_match_with_tensor(self):
+    hashed_sparse = fc.categorical_column_with_hash_bucket(
+        'wire', 10, dtype=dtypes.int64)
+    wire_tensor = sparse_tensor.SparseTensor(
+        values=['omar'], indices=[[0, 0]], dense_shape=[1, 1])
+    builder = _LazyBuilder({'wire': wire_tensor})
+    with self.assertRaisesRegexp(ValueError, 'dtype must be compatible'):
+      builder.get(hashed_sparse)
+
+  def test_ints_should_be_hashed(self):
+    hashed_sparse = fc.categorical_column_with_hash_bucket(
+        'wire', 10, dtype=dtypes.int64)
+    wire_tensor = sparse_tensor.SparseTensor(
+        values=[101, 201, 301],
+        indices=[[0, 0], [1, 0], [1, 1]],
+        dense_shape=[2, 2])
+    builder = _LazyBuilder({'wire': wire_tensor})
+    output = builder.get(hashed_sparse)
+    # Check exact hashed output. If hashing changes this test will break.
+    expected_values = [3, 7, 5]
+    with self.test_session():
+      self.assertAllEqual(expected_values, output.values.eval())
+
+  def test_int32_64_is_compatible(self):
+    hashed_sparse = fc.categorical_column_with_hash_bucket(
+        'wire', 10, dtype=dtypes.int64)
+    wire_tensor = sparse_tensor.SparseTensor(
+        values=constant_op.constant([101, 201, 301], dtype=dtypes.int32),
+        indices=[[0, 0], [1, 0], [1, 1]],
+        dense_shape=[2, 2])
+    builder = _LazyBuilder({'wire': wire_tensor})
+    output = builder.get(hashed_sparse)
+    # Check exact hashed output. If hashing changes this test will break.
+    expected_values = [3, 7, 5]
+    with self.test_session():
+      self.assertAllEqual(expected_values, output.values.eval())
+
+  def test_get_sparse_tensors(self):
+    hashed_sparse = fc.categorical_column_with_hash_bucket('wire', 10)
+    builder = _LazyBuilder({
+        'wire':
+            sparse_tensor.SparseTensor(
+                values=['omar', 'stringer', 'marlo'],
+                indices=[[0, 0], [1, 0], [1, 1]],
+                dense_shape=[2, 2])
+    })
+    id_weight_pair = hashed_sparse._get_sparse_tensors(builder)
+    self.assertIsNone(id_weight_pair.weight_tensor)
+    self.assertEqual(builder.get(hashed_sparse), id_weight_pair.id_tensor)
+
+  def test_get_sparse_tensors_weight_collections(self):
+    column = fc.categorical_column_with_hash_bucket('aaa', 10)
+    inputs = sparse_tensor.SparseTensor(
+        values=['omar', 'stringer', 'marlo'],
+        indices=[[0, 0], [1, 0], [1, 1]],
+        dense_shape=[2, 2])
+    column._get_sparse_tensors(
+        _LazyBuilder({
+            'aaa': inputs
+        }), weight_collections=('my_weights',))
+
+    self.assertItemsEqual(
+        [], ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES))
+    self.assertItemsEqual([], ops.get_collection('my_weights'))
+
+  def test_get_sparse_tensors_dense_input(self):
+    hashed_sparse = fc.categorical_column_with_hash_bucket('wire', 10)
+    builder = _LazyBuilder({'wire': (('omar', ''), ('stringer', 'marlo'))})
+    id_weight_pair = hashed_sparse._get_sparse_tensors(builder)
+    self.assertIsNone(id_weight_pair.weight_tensor)
+    self.assertEqual(builder.get(hashed_sparse), id_weight_pair.id_tensor)
+
+  def test_linear_model(self):
+    wire_column = fc.categorical_column_with_hash_bucket('wire', 4)
+    self.assertEqual(4, wire_column._num_buckets)
+    with ops.Graph().as_default():
+      predictions = fc.linear_model({
+          wire_column.name: sparse_tensor.SparseTensorValue(
+              indices=((0, 0), (1, 0), (1, 1)),
+              values=('marlo', 'skywalker', 'omar'),
+              dense_shape=(2, 2))
+      }, (wire_column,))
+      bias = get_linear_model_bias()
+      wire_var = get_linear_model_column_var(wire_column)
+      with _initialized_session():
+        self.assertAllClose((0.,), bias.eval())
+        self.assertAllClose(((0.,), (0.,), (0.,), (0.,)), wire_var.eval())
+        self.assertAllClose(((0.,), (0.,)), predictions.eval())
+        wire_var.assign(((1.,), (2.,), (3.,), (4.,))).eval()
+        # 'marlo' -> 3: wire_var[3] = 4
+        # 'skywalker' -> 2, 'omar' -> 2: wire_var[2] + wire_var[2] = 3+3 = 6
+        self.assertAllClose(((4.,), (6.,)), predictions.eval())
+
+
+class CrossedColumnTest(test.TestCase):
+
+  def test_keys_empty(self):
+    with self.assertRaisesRegexp(
+        ValueError, 'keys must be a list with length > 1'):
+      fc.crossed_column([], 10)
+
+  def test_keys_length_one(self):
+    with self.assertRaisesRegexp(
+        ValueError, 'keys must be a list with length > 1'):
+      fc.crossed_column(['a'], 10)
+
+  def test_key_type_unsupported(self):
+    with self.assertRaisesRegexp(ValueError, 'Unsupported key type'):
+      fc.crossed_column(['a', fc.numeric_column('c')], 10)
+
+    with self.assertRaisesRegexp(
+        ValueError, '_HashedCategoricalColumn is not supported'):
+      fc.crossed_column(
+          ['a', fc.categorical_column_with_hash_bucket('c', 10)], 10)
+
+  def test_hash_bucket_size_negative(self):
+    with self.assertRaisesRegexp(
+        ValueError, 'hash_bucket_size must be > 1'):
+      fc.crossed_column(['a', 'c'], -1)
+
+  def test_hash_bucket_size_zero(self):
+    with self.assertRaisesRegexp(
+        ValueError, 'hash_bucket_size must be > 1'):
+      fc.crossed_column(['a', 'c'], 0)
+
+  def test_hash_bucket_size_none(self):
+    with self.assertRaisesRegexp(
+        ValueError, 'hash_bucket_size must be > 1'):
+      fc.crossed_column(['a', 'c'], None)
+
+  def test_name(self):
+    a = fc.numeric_column('a', dtype=dtypes.int32)
+    b = fc.bucketized_column(a, boundaries=[0, 1])
+    crossed1 = fc.crossed_column(['d1', 'd2'], 10)
+
+    crossed2 = fc.crossed_column([b, 'c', crossed1], 10)
+    self.assertEqual('a_bucketized_X_c_X_d1_X_d2', crossed2.name)
+
+  def test_name_ordered_alphabetically(self):
+    """Tests that the name does not depend on the order of given columns."""
+    a = fc.numeric_column('a', dtype=dtypes.int32)
+    b = fc.bucketized_column(a, boundaries=[0, 1])
+    crossed1 = fc.crossed_column(['d1', 'd2'], 10)
+
+    crossed2 = fc.crossed_column([crossed1, 'c', b], 10)
+    self.assertEqual('a_bucketized_X_c_X_d1_X_d2', crossed2.name)
+
+  def test_name_leaf_keys_ordered_alphabetically(self):
+    """Tests that the name does not depend on the order of given columns."""
+    a = fc.numeric_column('a', dtype=dtypes.int32)
+    b = fc.bucketized_column(a, boundaries=[0, 1])
+    crossed1 = fc.crossed_column(['d2', 'c'], 10)
+
+    crossed2 = fc.crossed_column([crossed1, 'd1', b], 10)
+    self.assertEqual('a_bucketized_X_c_X_d1_X_d2', crossed2.name)
+
+  def test_parse_spec(self):
+    a = fc.numeric_column('a', shape=[2], dtype=dtypes.int32)
+    b = fc.bucketized_column(a, boundaries=[0, 1])
+    crossed = fc.crossed_column([b, 'c'], 10)
+    self.assertEqual({
+        'a': parsing_ops.FixedLenFeature((2,), dtype=dtypes.int32),
+        'c': parsing_ops.VarLenFeature(dtypes.string),
+    }, crossed._parse_example_spec)
+
+  def test_num_buckets(self):
+    a = fc.numeric_column('a', shape=[2], dtype=dtypes.int32)
+    b = fc.bucketized_column(a, boundaries=[0, 1])
+    crossed = fc.crossed_column([b, 'c'], 15)
+    self.assertEqual(15, crossed._num_buckets)
+
+  def test_deep_copy(self):
+    a = fc.numeric_column('a', dtype=dtypes.int32)
+    b = fc.bucketized_column(a, boundaries=[0, 1])
+    crossed1 = fc.crossed_column(['d1', 'd2'], 10)
+    crossed2 = fc.crossed_column([b, 'c', crossed1], 15, hash_key=5)
+    crossed2_copy = copy.deepcopy(crossed2)
+    self.assertEqual('a_bucketized_X_c_X_d1_X_d2', crossed2_copy.name,)
+    self.assertEqual(15, crossed2_copy.hash_bucket_size)
+    self.assertEqual(5, crossed2_copy.hash_key)
+
+  def test_parse_example(self):
+    price = fc.numeric_column('price', shape=[2])
+    bucketized_price = fc.bucketized_column(price, boundaries=[0, 50])
+    price_cross_wire = fc.crossed_column([bucketized_price, 'wire'], 10)
+    data = example_pb2.Example(features=feature_pb2.Features(
+        feature={
+            'price':
+                feature_pb2.Feature(float_list=feature_pb2.FloatList(
+                    value=[20., 110.])),
+            'wire':
+                feature_pb2.Feature(bytes_list=feature_pb2.BytesList(
+                    value=[b'omar', b'stringer'])),
+        }))
+    features = parsing_ops.parse_example(
+        serialized=[data.SerializeToString()],
+        features=fc.make_parse_example_spec([price_cross_wire]))
+    self.assertIn('price', features)
+    self.assertIn('wire', features)
+    with self.test_session():
+      self.assertAllEqual([[20., 110.]], features['price'].eval())
+      wire_sparse = features['wire']
+      self.assertAllEqual([[0, 0], [0, 1]], wire_sparse.indices.eval())
+      # Use byte constants to pass the open-source test.
+      self.assertAllEqual([b'omar', b'stringer'], wire_sparse.values.eval())
+      self.assertAllEqual([1, 2], wire_sparse.dense_shape.eval())
+
+  def test_transform_feature(self):
+    price = fc.numeric_column('price', shape=[2])
+    bucketized_price = fc.bucketized_column(price, boundaries=[0, 50])
+    hash_bucket_size = 10
+    price_cross_wire = fc.crossed_column(
+        [bucketized_price, 'wire'], hash_bucket_size)
+    features = {
+        'price': constant_op.constant([[1., 2.], [5., 6.]]),
+        'wire': sparse_tensor.SparseTensor(
+            values=['omar', 'stringer', 'marlo'],
+            indices=[[0, 0], [1, 0], [1, 1]],
+            dense_shape=[2, 2]),
+    }
+    outputs = _transform_features(features, [price_cross_wire])
+    output = outputs[price_cross_wire]
+    with self.test_session() as sess:
+      output_val = sess.run(output)
+      self.assertAllEqual(
+          [[0, 0], [0, 1], [1, 0], [1, 1], [1, 2], [1, 3]], output_val.indices)
+      for val in output_val.values:
+        self.assertIn(val, list(range(hash_bucket_size)))
+      self.assertAllEqual([2, 4], output_val.dense_shape)
+
+  def test_get_sparse_tensors(self):
+    a = fc.numeric_column('a', dtype=dtypes.int32, shape=(2,))
+    b = fc.bucketized_column(a, boundaries=(0, 1))
+    crossed1 = fc.crossed_column(['d1', 'd2'], 10)
+    crossed2 = fc.crossed_column([b, 'c', crossed1], 15, hash_key=5)
+    with ops.Graph().as_default():
+      builder = _LazyBuilder({
+          'a':
+              constant_op.constant(((-1., .5), (.5, 1.))),
+          'c':
+              sparse_tensor.SparseTensor(
+                  indices=((0, 0), (1, 0), (1, 1)),
+                  values=['cA', 'cB', 'cC'],
+                  dense_shape=(2, 2)),
+          'd1':
+              sparse_tensor.SparseTensor(
+                  indices=((0, 0), (1, 0), (1, 1)),
+                  values=['d1A', 'd1B', 'd1C'],
+                  dense_shape=(2, 2)),
+          'd2':
+              sparse_tensor.SparseTensor(
+                  indices=((0, 0), (1, 0), (1, 1)),
+                  values=['d2A', 'd2B', 'd2C'],
+                  dense_shape=(2, 2)),
+      })
+      id_weight_pair = crossed2._get_sparse_tensors(builder)
+      with _initialized_session():
+        id_tensor_eval = id_weight_pair.id_tensor.eval()
+        self.assertAllEqual(
+            ((0, 0), (0, 1), (1, 0), (1, 1), (1, 2), (1, 3), (1, 4), (1, 5),
+             (1, 6), (1, 7), (1, 8), (1, 9), (1, 10), (1, 11), (1, 12), (1, 13),
+             (1, 14), (1, 15)),
+            id_tensor_eval.indices)
+        # Check exact hashed output. If hashing changes this test will break.
+        # All values are within [0, hash_bucket_size).
+        expected_values = (
+            6, 14, 0, 13, 8, 8, 10, 12, 2, 0, 1, 9, 8, 12, 2, 0, 10, 11)
+        self.assertAllEqual(expected_values, id_tensor_eval.values)
+        self.assertAllEqual((2, 16), id_tensor_eval.dense_shape)
+
+  def test_get_sparse_tensors_simple(self):
+    """Same as test_get_sparse_tensors, but with simpler values."""
+    a = fc.numeric_column('a', dtype=dtypes.int32, shape=(2,))
+    b = fc.bucketized_column(a, boundaries=(0, 1))
+    crossed = fc.crossed_column([b, 'c'], hash_bucket_size=5, hash_key=5)
+    with ops.Graph().as_default():
+      builder = _LazyBuilder({
+          'a':
+              constant_op.constant(((-1., .5), (.5, 1.))),
+          'c':
+              sparse_tensor.SparseTensor(
+                  indices=((0, 0), (1, 0), (1, 1)),
+                  values=['cA', 'cB', 'cC'],
+                  dense_shape=(2, 2)),
+      })
+      id_weight_pair = crossed._get_sparse_tensors(builder)
+      with _initialized_session():
+        id_tensor_eval = id_weight_pair.id_tensor.eval()
+        self.assertAllEqual(
+            ((0, 0), (0, 1), (1, 0), (1, 1), (1, 2), (1, 3)),
+            id_tensor_eval.indices)
+        # Check exact hashed output. If hashing changes this test will break.
+        # All values are within [0, hash_bucket_size).
+        expected_values = (1, 0, 1, 3, 4, 2)
+        self.assertAllEqual(expected_values, id_tensor_eval.values)
+        self.assertAllEqual((2, 4), id_tensor_eval.dense_shape)
+
+  def test_linear_model(self):
+    """Tests linear_model.
+
+    Uses data from test_get_sparse_tesnsors_simple.
+    """
+    a = fc.numeric_column('a', dtype=dtypes.int32, shape=(2,))
+    b = fc.bucketized_column(a, boundaries=(0, 1))
+    crossed = fc.crossed_column([b, 'c'], hash_bucket_size=5, hash_key=5)
+    with ops.Graph().as_default():
+      predictions = fc.linear_model({
+          'a': constant_op.constant(((-1., .5), (.5, 1.))),
+          'c': sparse_tensor.SparseTensor(
+              indices=((0, 0), (1, 0), (1, 1)),
+              values=['cA', 'cB', 'cC'],
+              dense_shape=(2, 2)),
+      }, (crossed,))
+      bias = get_linear_model_bias()
+      crossed_var = get_linear_model_column_var(crossed)
+      with _initialized_session() as sess:
+        self.assertAllClose((0.,), bias.eval())
+        self.assertAllClose(
+            ((0.,), (0.,), (0.,), (0.,), (0.,)), crossed_var.eval())
+        self.assertAllClose(((0.,), (0.,)), predictions.eval())
+        sess.run(crossed_var.assign(((1.,), (2.,), (3.,), (4.,), (5.,))))
+        # Expected ids after cross = (1, 0, 1, 3, 4, 2)
+        self.assertAllClose(((3.,), (14.,)), predictions.eval())
+        sess.run(bias.assign((.1,)))
+        self.assertAllClose(((3.1,), (14.1,)), predictions.eval())
+
+  def test_linear_model_with_weights(self):
+    class _TestColumnWithWeights(_CategoricalColumn):
+      """Produces sparse IDs and sparse weights."""
+
+      @property
+      def name(self):
+        return 'test_column'
+
+      @property
+      def _parse_example_spec(self):
+        return {
+            self.name: parsing_ops.VarLenFeature(dtypes.int32),
+            '{}_weights'.format(self.name): parsing_ops.VarLenFeature(
+                dtypes.float32),
+            }
+
+      @property
+      def _num_buckets(self):
+        return 5
+
+      def _transform_feature(self, inputs):
+        return (inputs.get(self.name),
+                inputs.get('{}_weights'.format(self.name)))
+
+      def _get_sparse_tensors(self, inputs, weight_collections=None,
+                              trainable=None):
+        """Populates both id_tensor and weight_tensor."""
+        ids_and_weights = inputs.get(self)
+        return _CategoricalColumn.IdWeightPair(
+            id_tensor=ids_and_weights[0], weight_tensor=ids_and_weights[1])
+
+    t = _TestColumnWithWeights()
+    crossed = fc.crossed_column([t, 'c'], hash_bucket_size=5, hash_key=5)
+    with ops.Graph().as_default():
+      with self.assertRaisesRegexp(
+          ValueError,
+          'crossed_column does not support weight_tensor.*{}'.format(t.name)):
+        fc.linear_model({
+            t.name: sparse_tensor.SparseTensor(
+                indices=((0, 0), (1, 0), (1, 1)),
+                values=[0, 1, 2],
+                dense_shape=(2, 2)),
+            '{}_weights'.format(t.name): sparse_tensor.SparseTensor(
+                indices=((0, 0), (1, 0), (1, 1)),
+                values=[1., 10., 2.],
+                dense_shape=(2, 2)),
+            'c': sparse_tensor.SparseTensor(
+                indices=((0, 0), (1, 0), (1, 1)),
+                values=['cA', 'cB', 'cC'],
+                dense_shape=(2, 2)),
+        }, (crossed,))
+
+
+def get_linear_model_bias():
+  with variable_scope.variable_scope('linear_model', reuse=True):
+    return variable_scope.get_variable('bias_weights')
+
+
+def get_linear_model_column_var(column):
+  return ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES,
+                            'linear_model/' + column.name)[0]
+
+
+class LinearModelTest(test.TestCase):
+
+  def test_raises_if_empty_feature_columns(self):
+    with self.assertRaisesRegexp(ValueError,
+                                 'feature_columns must not be empty'):
+      fc.linear_model(features={}, feature_columns=[])
+
+  def test_should_be_feature_column(self):
+    with self.assertRaisesRegexp(ValueError, 'must be a _FeatureColumn'):
+      fc.linear_model(features={'a': [[0]]}, feature_columns='NotSupported')
+
+  def test_should_be_dense_or_categorical_column(self):
+
+    class NotSupportedColumn(_FeatureColumn):
+
+      @property
+      def name(self):
+        return 'NotSupportedColumn'
+
+      def _transform_feature(self, cache):
+        pass
+
+      @property
+      def _parse_example_spec(self):
+        pass
+
+    with self.assertRaisesRegexp(
+        ValueError, 'must be either a _DenseColumn or _CategoricalColumn'):
+      fc.linear_model(
+          features={'a': [[0]]}, feature_columns=[NotSupportedColumn()])
+
+  def test_does_not_support_dict_columns(self):
+    with self.assertRaisesRegexp(
+        ValueError, 'Expected feature_columns to be iterable, found dict.'):
+      fc.linear_model(
+          features={'a': [[0]]}, feature_columns={'a': fc.numeric_column('a')})
+
+  def test_raises_if_duplicate_name(self):
+    with self.assertRaisesRegexp(
+        ValueError, 'Duplicate feature column name found for columns'):
+      fc.linear_model(
+          features={'a': [[0]]},
+          feature_columns=[fc.numeric_column('a'),
+                           fc.numeric_column('a')])
+
+  def test_dense_bias(self):
+    price = fc.numeric_column('price')
+    with ops.Graph().as_default():
+      features = {'price': [[1.], [5.]]}
+      predictions = fc.linear_model(features, [price])
+      bias = get_linear_model_bias()
+      price_var = get_linear_model_column_var(price)
+      with _initialized_session() as sess:
+        self.assertAllClose([0.], bias.eval())
+        sess.run(price_var.assign([[10.]]))
+        sess.run(bias.assign([5.]))
+        self.assertAllClose([[15.], [55.]], predictions.eval())
+
+  def test_sparse_bias(self):
+    wire_cast = fc.categorical_column_with_hash_bucket('wire_cast', 4)
+    with ops.Graph().as_default():
+      wire_tensor = sparse_tensor.SparseTensor(
+          values=['omar', 'stringer', 'marlo'],  # hashed to = [2, 0, 3]
+          indices=[[0, 0], [1, 0], [1, 1]],
+          dense_shape=[2, 2])
+      features = {'wire_cast': wire_tensor}
+      predictions = fc.linear_model(features, [wire_cast])
+      bias = get_linear_model_bias()
+      wire_cast_var = get_linear_model_column_var(wire_cast)
+      with _initialized_session() as sess:
+        self.assertAllClose([0.], bias.eval())
+        self.assertAllClose([[0.], [0.], [0.], [0.]], wire_cast_var.eval())
+        sess.run(wire_cast_var.assign([[10.], [100.], [1000.], [10000.]]))
+        sess.run(bias.assign([5.]))
+        self.assertAllClose([[1005.], [10015.]], predictions.eval())
+
+  def test_dense_and_sparse_bias(self):
+    wire_cast = fc.categorical_column_with_hash_bucket('wire_cast', 4)
+    price = fc.numeric_column('price')
+    with ops.Graph().as_default():
+      wire_tensor = sparse_tensor.SparseTensor(
+          values=['omar', 'stringer', 'marlo'],  # hashed to = [2, 0, 3]
+          indices=[[0, 0], [1, 0], [1, 1]],
+          dense_shape=[2, 2])
+      features = {'wire_cast': wire_tensor, 'price': [[1.], [5.]]}
+      predictions = fc.linear_model(features, [wire_cast, price])
+      bias = get_linear_model_bias()
+      wire_cast_var = get_linear_model_column_var(wire_cast)
+      price_var = get_linear_model_column_var(price)
+      with _initialized_session() as sess:
+        sess.run(wire_cast_var.assign([[10.], [100.], [1000.], [10000.]]))
+        sess.run(bias.assign([5.]))
+        sess.run(price_var.assign([[10.]]))
+        self.assertAllClose([[1015.], [10065.]], predictions.eval())
+
+  def test_dense_and_sparse_column(self):
+    """When the column is both dense and sparse, uses sparse tensors."""
+
+    class _DenseAndSparseColumn(_DenseColumn, _CategoricalColumn):
+
+      @property
+      def name(self):
+        return 'dense_and_sparse_column'
+
+      @property
+      def _parse_example_spec(self):
+        return {self.name: parsing_ops.VarLenFeature(self.dtype)}
+
+      def _transform_feature(self, inputs):
+        return inputs.get(self.name)
+
+      @property
+      def _variable_shape(self):
+        raise ValueError('Should not use this method.')
+
+      def _get_dense_tensor(self, inputs, weight_collections=None,
+                            trainable=None):
+        raise ValueError('Should not use this method.')
+
+      @property
+      def _num_buckets(self):
+        return 4
+
+      def _get_sparse_tensors(self, inputs, weight_collections=None,
+                              trainable=None):
+        sp_tensor = sparse_tensor.SparseTensor(
+            indices=[[0, 0], [1, 0], [1, 1]],
+            values=[2, 0, 3],
+            dense_shape=[2, 2])
+        return _CategoricalColumn.IdWeightPair(sp_tensor, None)
+
+    dense_and_sparse_column = _DenseAndSparseColumn()
+    with ops.Graph().as_default():
+      sp_tensor = sparse_tensor.SparseTensor(
+          values=['omar', 'stringer', 'marlo'],
+          indices=[[0, 0], [1, 0], [1, 1]],
+          dense_shape=[2, 2])
+      features = {dense_and_sparse_column.name: sp_tensor}
+      predictions = fc.linear_model(features, [dense_and_sparse_column])
+      bias = get_linear_model_bias()
+      dense_and_sparse_column_var = get_linear_model_column_var(
+          dense_and_sparse_column)
+      with _initialized_session() as sess:
+        sess.run(dense_and_sparse_column_var.assign(
+            [[10.], [100.], [1000.], [10000.]]))
+        sess.run(bias.assign([5.]))
+        self.assertAllClose([[1005.], [10015.]], predictions.eval())
+
+  def test_dense_multi_output(self):
+    price = fc.numeric_column('price')
+    with ops.Graph().as_default():
+      features = {'price': [[1.], [5.]]}
+      predictions = fc.linear_model(features, [price], units=3)
+      bias = get_linear_model_bias()
+      price_var = get_linear_model_column_var(price)
+      with _initialized_session() as sess:
+        self.assertAllClose(np.zeros((3,)), bias.eval())
+        self.assertAllClose(np.zeros((1, 3)), price_var.eval())
+        sess.run(price_var.assign([[10., 100., 1000.]]))
+        sess.run(bias.assign([5., 6., 7.]))
+        self.assertAllClose([[15., 106., 1007.], [55., 506., 5007.]],
+                            predictions.eval())
+
+  def test_sparse_multi_output(self):
+    wire_cast = fc.categorical_column_with_hash_bucket('wire_cast', 4)
+    with ops.Graph().as_default():
+      wire_tensor = sparse_tensor.SparseTensor(
+          values=['omar', 'stringer', 'marlo'],  # hashed to = [2, 0, 3]
+          indices=[[0, 0], [1, 0], [1, 1]],
+          dense_shape=[2, 2])
+      features = {'wire_cast': wire_tensor}
+      predictions = fc.linear_model(features, [wire_cast], units=3)
+      bias = get_linear_model_bias()
+      wire_cast_var = get_linear_model_column_var(wire_cast)
+      with _initialized_session() as sess:
+        self.assertAllClose(np.zeros((3,)), bias.eval())
+        self.assertAllClose(np.zeros((4, 3)), wire_cast_var.eval())
+        sess.run(
+            wire_cast_var.assign([[10., 11., 12.], [100., 110., 120.], [
+                1000., 1100., 1200.
+            ], [10000., 11000., 12000.]]))
+        sess.run(bias.assign([5., 6., 7.]))
+        self.assertAllClose([[1005., 1106., 1207.], [10015., 11017., 12019.]],
+                            predictions.eval())
+
+  def test_dense_multi_dimension(self):
+    price = fc.numeric_column('price', shape=2)
+    with ops.Graph().as_default():
+      features = {'price': [[1., 2.], [5., 6.]]}
+      predictions = fc.linear_model(features, [price])
+      price_var = get_linear_model_column_var(price)
+      with _initialized_session() as sess:
+        self.assertAllClose([[0.], [0.]], price_var.eval())
+        sess.run(price_var.assign([[10.], [100.]]))
+        self.assertAllClose([[210.], [650.]], predictions.eval())
+
+  def test_sparse_multi_rank(self):
+    wire_cast = fc.categorical_column_with_hash_bucket('wire_cast', 4)
+    with ops.Graph().as_default():
+      wire_tensor = array_ops.sparse_placeholder(dtypes.string)
+      wire_value = sparse_tensor.SparseTensorValue(
+          values=['omar', 'stringer', 'marlo', 'omar'],  # hashed = [2, 0, 3, 2]
+          indices=[[0, 0, 0], [0, 1, 0], [1, 0, 0], [1, 0, 1]],
+          dense_shape=[2, 2, 2])
+      features = {'wire_cast': wire_tensor}
+      predictions = fc.linear_model(features, [wire_cast])
+      wire_cast_var = get_linear_model_column_var(wire_cast)
+      with _initialized_session() as sess:
+        self.assertAllClose(np.zeros((4, 1)), wire_cast_var.eval())
+        self.assertAllClose(
+            np.zeros((2, 1)),
+            predictions.eval(feed_dict={wire_tensor: wire_value}))
+        sess.run(wire_cast_var.assign([[10.], [100.], [1000.], [10000.]]))
+        self.assertAllClose(
+            [[1010.], [11000.]],
+            predictions.eval(feed_dict={wire_tensor: wire_value}))
+
+  def test_sparse_combiner(self):
+    wire_cast = fc.categorical_column_with_hash_bucket('wire_cast', 4)
+    with ops.Graph().as_default():
+      wire_tensor = sparse_tensor.SparseTensor(
+          values=['omar', 'stringer', 'marlo'],  # hashed to = [2, 0, 3]
+          indices=[[0, 0], [1, 0], [1, 1]],
+          dense_shape=[2, 2])
+      features = {'wire_cast': wire_tensor}
+      predictions = fc.linear_model(
+          features, [wire_cast], sparse_combiner='mean')
+      bias = get_linear_model_bias()
+      wire_cast_var = get_linear_model_column_var(wire_cast)
+      with _initialized_session() as sess:
+        sess.run(wire_cast_var.assign([[10.], [100.], [1000.], [10000.]]))
+        sess.run(bias.assign([5.]))
+        self.assertAllClose([[1005.], [5010.]], predictions.eval())
+
+  def test_dense_multi_dimension_multi_output(self):
+    price = fc.numeric_column('price', shape=2)
+    with ops.Graph().as_default():
+      features = {'price': [[1., 2.], [5., 6.]]}
+      predictions = fc.linear_model(features, [price], units=3)
+      bias = get_linear_model_bias()
+      price_var = get_linear_model_column_var(price)
+      with _initialized_session() as sess:
+        self.assertAllClose(np.zeros((3,)), bias.eval())
+        self.assertAllClose(np.zeros((2, 3)), price_var.eval())
+        sess.run(price_var.assign([[1., 2., 3.], [10., 100., 1000.]]))
+        sess.run(bias.assign([2., 3., 4.]))
+        self.assertAllClose([[23., 205., 2007.], [67., 613., 6019.]],
+                            predictions.eval())
+
+  def test_raises_if_shape_mismatch(self):
+    price = fc.numeric_column('price', shape=2)
+    with ops.Graph().as_default():
+      features = {'price': [[1.], [5.]]}
+      predictions = fc.linear_model(features, [price])
+      with _initialized_session():
+        with self.assertRaisesRegexp(Exception, 'requested shape has 4'):
+          predictions.eval()
+
+  def test_dense_reshaping(self):
+    price = fc.numeric_column('price', shape=[1, 2])
+    with ops.Graph().as_default():
+      features = {'price': [[[1., 2.]], [[5., 6.]]]}
+      predictions = fc.linear_model(features, [price])
+      bias = get_linear_model_bias()
+      price_var = get_linear_model_column_var(price)
+      with _initialized_session() as sess:
+        self.assertAllClose([0.], bias.eval())
+        self.assertAllClose([[0.], [0.]], price_var.eval())
+        self.assertAllClose([[0.], [0.]], predictions.eval())
+        sess.run(price_var.assign([[10.], [100.]]))
+        self.assertAllClose([[210.], [650.]], predictions.eval())
+
+  def test_dense_multi_column(self):
+    price1 = fc.numeric_column('price1', shape=2)
+    price2 = fc.numeric_column('price2')
+    with ops.Graph().as_default():
+      features = {
+          'price1': [[1., 2.], [5., 6.]],
+          'price2': [[3.], [4.]]
+      }
+      predictions = fc.linear_model(features, [price1, price2])
+      bias = get_linear_model_bias()
+      price1_var = get_linear_model_column_var(price1)
+      price2_var = get_linear_model_column_var(price2)
+      with _initialized_session() as sess:
+        self.assertAllClose([0.], bias.eval())
+        self.assertAllClose([[0.], [0.]], price1_var.eval())
+        self.assertAllClose([[0.]], price2_var.eval())
+        self.assertAllClose([[0.], [0.]], predictions.eval())
+        sess.run(price1_var.assign([[10.], [100.]]))
+        sess.run(price2_var.assign([[1000.]]))
+        sess.run(bias.assign([7.]))
+        self.assertAllClose([[3217.], [4657.]], predictions.eval())
+
+  def test_dense_collection(self):
+    price = fc.numeric_column('price')
+    with ops.Graph().as_default() as g:
+      features = {'price': [[1.], [5.]]}
+      fc.linear_model(features, [price], weight_collections=['my-vars'])
+      my_vars = g.get_collection('my-vars')
+      bias = get_linear_model_bias()
+      price_var = get_linear_model_column_var(price)
+      self.assertIn(bias, my_vars)
+      self.assertIn(price_var, my_vars)
+
+  def test_sparse_collection(self):
+    wire_cast = fc.categorical_column_with_hash_bucket('wire_cast', 4)
+    with ops.Graph().as_default() as g:
+      wire_tensor = sparse_tensor.SparseTensor(
+          values=['omar'], indices=[[0, 0]], dense_shape=[1, 1])
+      features = {'wire_cast': wire_tensor}
+      fc.linear_model(
+          features, [wire_cast], weight_collections=['my-vars'])
+      my_vars = g.get_collection('my-vars')
+      bias = get_linear_model_bias()
+      wire_cast_var = get_linear_model_column_var(wire_cast)
+      self.assertIn(bias, my_vars)
+      self.assertIn(wire_cast_var, my_vars)
+
+  def test_dense_trainable_default(self):
+    price = fc.numeric_column('price')
+    with ops.Graph().as_default() as g:
+      features = {'price': [[1.], [5.]]}
+      fc.linear_model(features, [price])
+      bias = get_linear_model_bias()
+      price_var = get_linear_model_column_var(price)
+      trainable_vars = g.get_collection(ops.GraphKeys.TRAINABLE_VARIABLES)
+      self.assertIn(bias, trainable_vars)
+      self.assertIn(price_var, trainable_vars)
+
+  def test_sparse_trainable_default(self):
+    wire_cast = fc.categorical_column_with_hash_bucket('wire_cast', 4)
+    with ops.Graph().as_default() as g:
+      wire_tensor = sparse_tensor.SparseTensor(
+          values=['omar'], indices=[[0, 0]], dense_shape=[1, 1])
+      features = {'wire_cast': wire_tensor}
+      fc.linear_model(features, [wire_cast])
+      trainable_vars = g.get_collection(ops.GraphKeys.TRAINABLE_VARIABLES)
+      bias = get_linear_model_bias()
+      wire_cast_var = get_linear_model_column_var(wire_cast)
+      self.assertIn(bias, trainable_vars)
+      self.assertIn(wire_cast_var, trainable_vars)
+
+  def test_dense_trainable_false(self):
+    price = fc.numeric_column('price')
+    with ops.Graph().as_default() as g:
+      features = {'price': [[1.], [5.]]}
+      fc.linear_model(features, [price], trainable=False)
+      trainable_vars = g.get_collection(ops.GraphKeys.TRAINABLE_VARIABLES)
+      self.assertEqual([], trainable_vars)
+
+  def test_sparse_trainable_false(self):
+    wire_cast = fc.categorical_column_with_hash_bucket('wire_cast', 4)
+    with ops.Graph().as_default() as g:
+      wire_tensor = sparse_tensor.SparseTensor(
+          values=['omar'], indices=[[0, 0]], dense_shape=[1, 1])
+      features = {'wire_cast': wire_tensor}
+      fc.linear_model(features, [wire_cast], trainable=False)
+      trainable_vars = g.get_collection(ops.GraphKeys.TRAINABLE_VARIABLES)
+      self.assertEqual([], trainable_vars)
+
+  def test_column_order(self):
+    price_a = fc.numeric_column('price_a')
+    price_b = fc.numeric_column('price_b')
+    wire_cast = fc.categorical_column_with_hash_bucket('wire_cast', 4)
+    with ops.Graph().as_default() as g:
+      features = {
+          'price_a': [[1.]],
+          'price_b': [[3.]],
+          'wire_cast':
+              sparse_tensor.SparseTensor(
+                  values=['omar'], indices=[[0, 0]], dense_shape=[1, 1])
+      }
+      fc.linear_model(
+          features, [price_a, wire_cast, price_b],
+          weight_collections=['my-vars'])
+      my_vars = g.get_collection('my-vars')
+      self.assertIn('price_a', my_vars[0].name)
+      self.assertIn('price_b', my_vars[1].name)
+      self.assertIn('wire_cast', my_vars[2].name)
+
+    with ops.Graph().as_default() as g:
+      features = {
+          'price_a': [[1.]],
+          'price_b': [[3.]],
+          'wire_cast':
+              sparse_tensor.SparseTensor(
+                  values=['omar'], indices=[[0, 0]], dense_shape=[1, 1])
+      }
+      fc.linear_model(
+          features, [wire_cast, price_b, price_a],
+          weight_collections=['my-vars'])
+      my_vars = g.get_collection('my-vars')
+      self.assertIn('price_a', my_vars[0].name)
+      self.assertIn('price_b', my_vars[1].name)
+      self.assertIn('wire_cast', my_vars[2].name)
+
+  def test_static_batch_size_mismatch(self):
+    price1 = fc.numeric_column('price1')
+    price2 = fc.numeric_column('price2')
+    with ops.Graph().as_default():
+      features = {
+          'price1': [[1.], [5.], [7.]],  # batchsize = 3
+          'price2': [[3.], [4.]]  # batchsize = 2
+      }
+    with self.assertRaisesRegexp(
+        ValueError,
+        'Batch size \(first dimension\) of each feature must be same.'):  # pylint: disable=anomalous-backslash-in-string
+      fc.linear_model(features, [price1, price2])
+
+  def test_subset_of_static_batch_size_mismatch(self):
+    price1 = fc.numeric_column('price1')
+    price2 = fc.numeric_column('price2')
+    price3 = fc.numeric_column('price3')
+    with ops.Graph().as_default():
+      features = {
+          'price1': array_ops.placeholder(dtype=dtypes.int64),  # batchsize = 3
+          'price2': [[3.], [4.]],  # batchsize = 2
+          'price3': [[3.], [4.], [5.]]  # batchsize = 3
+      }
+      with self.assertRaisesRegexp(
+          ValueError,
+          'Batch size \(first dimension\) of each feature must be same.'):  # pylint: disable=anomalous-backslash-in-string
+        fc.linear_model(features, [price1, price2, price3])
+
+  def test_runtime_batch_size_mismatch(self):
+    price1 = fc.numeric_column('price1')
+    price2 = fc.numeric_column('price2')
+    with ops.Graph().as_default():
+      features = {
+          'price1': array_ops.placeholder(dtype=dtypes.int64),  # batchsize = 3
+          'price2': [[3.], [4.]]  # batchsize = 2
+      }
+      predictions = fc.linear_model(features, [price1, price2])
+      with _initialized_session() as sess:
+        with self.assertRaisesRegexp(errors.OpError,
+                                     'must have the same size and shape'):
+          sess.run(
+              predictions, feed_dict={features['price1']: [[1.], [5.], [7.]]})
+
+  def test_runtime_batch_size_matches(self):
+    price1 = fc.numeric_column('price1')
+    price2 = fc.numeric_column('price2')
+    with ops.Graph().as_default():
+      features = {
+          'price1': array_ops.placeholder(dtype=dtypes.int64),  # batchsize = 2
+          'price2': array_ops.placeholder(dtype=dtypes.int64),  # batchsize = 2
+      }
+      predictions = fc.linear_model(features, [price1, price2])
+      with _initialized_session() as sess:
+        sess.run(
+            predictions,
+            feed_dict={
+                features['price1']: [[1.], [5.]],
+                features['price2']: [[1.], [5.]],
+            })
+
+
+class InputLayerTest(test.TestCase):
+
+  def test_raises_if_empty_feature_columns(self):
+    with self.assertRaisesRegexp(ValueError,
+                                 'feature_columns must not be empty'):
+      fc.input_layer(features={}, feature_columns=[])
+
+  def test_should_be_dense_column(self):
+    with self.assertRaisesRegexp(ValueError, 'must be a _DenseColumn'):
+      fc.input_layer(
+          features={'a': [[0]]},
+          feature_columns=[
+              fc.categorical_column_with_hash_bucket('wire_cast', 4)
+          ])
+
+  def test_does_not_support_dict_columns(self):
+    with self.assertRaisesRegexp(
+        ValueError, 'Expected feature_columns to be iterable, found dict.'):
+      fc.input_layer(
+          features={'a': [[0]]}, feature_columns={'a': fc.numeric_column('a')})
+
+  def test_raises_if_duplicate_name(self):
+    with self.assertRaisesRegexp(
+        ValueError, 'Duplicate feature column name found for columns'):
+      fc.input_layer(
+          features={'a': [[0]]},
+          feature_columns=[fc.numeric_column('a'),
+                           fc.numeric_column('a')])
+
+  def test_one_column(self):
+    price = fc.numeric_column('price')
+    with ops.Graph().as_default():
+      features = {'price': [[1.], [5.]]}
+      net = fc.input_layer(features, [price])
+      with _initialized_session():
+        self.assertAllClose([[1.], [5.]], net.eval())
+
+  def test_multi_dimension(self):
+    price = fc.numeric_column('price', shape=2)
+    with ops.Graph().as_default():
+      features = {'price': [[1., 2.], [5., 6.]]}
+      net = fc.input_layer(features, [price])
+      with _initialized_session():
+        self.assertAllClose([[1., 2.], [5., 6.]], net.eval())
+
+  def test_raises_if_shape_mismatch(self):
+    price = fc.numeric_column('price', shape=2)
+    with ops.Graph().as_default():
+      features = {'price': [[1.], [5.]]}
+      net = fc.input_layer(features, [price])
+      with _initialized_session():
+        with self.assertRaisesRegexp(Exception, 'requested shape has 4'):
+          net.eval()
+
+  def test_reshaping(self):
+    price = fc.numeric_column('price', shape=[1, 2])
+    with ops.Graph().as_default():
+      features = {'price': [[[1., 2.]], [[5., 6.]]]}
+      net = fc.input_layer(features, [price])
+      with _initialized_session():
+        self.assertAllClose([[1., 2.], [5., 6.]], net.eval())
+
+  def test_multi_column(self):
+    price1 = fc.numeric_column('price1', shape=2)
+    price2 = fc.numeric_column('price2')
+    with ops.Graph().as_default():
+      features = {
+          'price1': [[1., 2.], [5., 6.]],
+          'price2': [[3.], [4.]]
+      }
+      net = fc.input_layer(features, [price1, price2])
+      with _initialized_session():
+        self.assertAllClose([[1., 2., 3.], [5., 6., 4.]], net.eval())
+
+  def test_column_order(self):
+    price_a = fc.numeric_column('price_a')
+    price_b = fc.numeric_column('price_b')
+    with ops.Graph().as_default():
+      features = {
+          'price_a': [[1.]],
+          'price_b': [[3.]],
+      }
+      net1 = fc.input_layer(features, [price_a, price_b])
+      net2 = fc.input_layer(features, [price_b, price_a])
+      with _initialized_session():
+        self.assertAllClose([[1., 3.]], net1.eval())
+        self.assertAllClose([[1., 3.]], net2.eval())
+
+  def test_fails_for_categorical_column(self):
+    animal = fc.categorical_column_with_identity('animal', num_buckets=4)
+    with ops.Graph().as_default():
+      features = {
+          'animal':
+              sparse_tensor.SparseTensor(
+                  indices=[[0, 0], [0, 1]], values=[1, 2], dense_shape=[1, 2])
+      }
+      with self.assertRaisesRegexp(Exception, 'must be a _DenseColumn'):
+        fc.input_layer(features, [animal])
+
+  def test_static_batch_size_mismatch(self):
+    price1 = fc.numeric_column('price1')
+    price2 = fc.numeric_column('price2')
+    with ops.Graph().as_default():
+      features = {
+          'price1': [[1.], [5.], [7.]],  # batchsize = 3
+          'price2': [[3.], [4.]]  # batchsize = 2
+      }
+      with self.assertRaisesRegexp(
+          ValueError,
+          'Batch size \(first dimension\) of each feature must be same.'):  # pylint: disable=anomalous-backslash-in-string
+        fc.input_layer(features, [price1, price2])
+
+  def test_subset_of_static_batch_size_mismatch(self):
+    price1 = fc.numeric_column('price1')
+    price2 = fc.numeric_column('price2')
+    price3 = fc.numeric_column('price3')
+    with ops.Graph().as_default():
+      features = {
+          'price1': array_ops.placeholder(dtype=dtypes.int64),  # batchsize = 3
+          'price2': [[3.], [4.]],  # batchsize = 2
+          'price3': [[3.], [4.], [5.]]  # batchsize = 3
+      }
+      with self.assertRaisesRegexp(
+          ValueError,
+          'Batch size \(first dimension\) of each feature must be same.'):  # pylint: disable=anomalous-backslash-in-string
+        fc.input_layer(features, [price1, price2, price3])
+
+  def test_runtime_batch_size_mismatch(self):
+    price1 = fc.numeric_column('price1')
+    price2 = fc.numeric_column('price2')
+    with ops.Graph().as_default():
+      features = {
+          'price1': array_ops.placeholder(dtype=dtypes.int64),  # batchsize = 3
+          'price2': [[3.], [4.]]  # batchsize = 2
+      }
+      net = fc.input_layer(features, [price1, price2])
+      with _initialized_session() as sess:
+        with self.assertRaisesRegexp(errors.OpError,
+                                     'Dimensions of inputs should match'):
+          sess.run(net, feed_dict={features['price1']: [[1.], [5.], [7.]]})
+
+  def test_runtime_batch_size_matches(self):
+    price1 = fc.numeric_column('price1')
+    price2 = fc.numeric_column('price2')
+    with ops.Graph().as_default():
+      features = {
+          'price1': array_ops.placeholder(dtype=dtypes.int64),  # batchsize = 2
+          'price2': array_ops.placeholder(dtype=dtypes.int64),  # batchsize = 2
+      }
+      net = fc.input_layer(features, [price1, price2])
+      with _initialized_session() as sess:
+        sess.run(
+            net,
+            feed_dict={
+                features['price1']: [[1.], [5.]],
+                features['price2']: [[1.], [5.]],
+            })
+
+
+class MakeParseExampleSpecTest(test.TestCase):
+
+  class _TestFeatureColumn(_FeatureColumn,
+                           collections.namedtuple('_TestFeatureColumn',
+                                                  ['parse_spec'])):
+
+    @property
+    def _parse_example_spec(self):
+      return self.parse_spec
+
+  def test_no_feature_columns(self):
+    actual = fc.make_parse_example_spec([])
+    self.assertDictEqual({}, actual)
+
+  def test_invalid_type(self):
+    key1 = 'key1'
+    parse_spec1 = parsing_ops.FixedLenFeature(
+        shape=(2,), dtype=dtypes.float32, default_value=0.)
+    with self.assertRaisesRegexp(
+        ValueError,
+        'All feature_columns must be _FeatureColumn instances.*invalid_column'):
+      fc.make_parse_example_spec(
+          (self._TestFeatureColumn({key1: parse_spec1}), 'invalid_column'))
+
+  def test_one_feature_column(self):
+    key1 = 'key1'
+    parse_spec1 = parsing_ops.FixedLenFeature(
+        shape=(2,), dtype=dtypes.float32, default_value=0.)
+    actual = fc.make_parse_example_spec(
+        (self._TestFeatureColumn({key1: parse_spec1}),))
+    self.assertDictEqual({key1: parse_spec1}, actual)
+
+  def test_two_feature_columns(self):
+    key1 = 'key1'
+    parse_spec1 = parsing_ops.FixedLenFeature(
+        shape=(2,), dtype=dtypes.float32, default_value=0.)
+    key2 = 'key2'
+    parse_spec2 = parsing_ops.VarLenFeature(dtype=dtypes.string)
+    actual = fc.make_parse_example_spec(
+        (self._TestFeatureColumn({key1: parse_spec1}),
+         self._TestFeatureColumn({key2: parse_spec2})))
+    self.assertDictEqual({key1: parse_spec1, key2: parse_spec2}, actual)
+
+  def test_equal_keys_different_parse_spec(self):
+    key1 = 'key1'
+    parse_spec1 = parsing_ops.FixedLenFeature(
+        shape=(2,), dtype=dtypes.float32, default_value=0.)
+    parse_spec2 = parsing_ops.VarLenFeature(dtype=dtypes.string)
+    with self.assertRaisesRegexp(
+        ValueError,
+        'feature_columns contain different parse_spec for key key1'):
+      fc.make_parse_example_spec(
+          (self._TestFeatureColumn({key1: parse_spec1}),
+           self._TestFeatureColumn({key1: parse_spec2})))
+
+  def test_equal_keys_equal_parse_spec(self):
+    key1 = 'key1'
+    parse_spec1 = parsing_ops.FixedLenFeature(
+        shape=(2,), dtype=dtypes.float32, default_value=0.)
+    actual = fc.make_parse_example_spec(
+        (self._TestFeatureColumn({key1: parse_spec1}),
+         self._TestFeatureColumn({key1: parse_spec1})))
+    self.assertDictEqual({key1: parse_spec1}, actual)
+
+  def test_multiple_features_dict(self):
+    """parse_spc for one column is a dict with length > 1."""
+    key1 = 'key1'
+    parse_spec1 = parsing_ops.FixedLenFeature(
+        shape=(2,), dtype=dtypes.float32, default_value=0.)
+    key2 = 'key2'
+    parse_spec2 = parsing_ops.VarLenFeature(dtype=dtypes.string)
+    key3 = 'key3'
+    parse_spec3 = parsing_ops.VarLenFeature(dtype=dtypes.int32)
+    actual = fc.make_parse_example_spec(
+        (self._TestFeatureColumn({key1: parse_spec1}),
+         self._TestFeatureColumn({key2: parse_spec2, key3: parse_spec3})))
+    self.assertDictEqual(
+        {key1: parse_spec1, key2: parse_spec2, key3: parse_spec3}, actual)
+
+
+def _assert_sparse_tensor_value(test_case, expected, actual):
+  test_case.assertEqual(np.int64, np.array(actual.indices).dtype)
+  test_case.assertAllEqual(expected.indices, actual.indices)
+
+  test_case.assertEqual(
+      np.array(expected.values).dtype, np.array(actual.values).dtype)
+  test_case.assertAllEqual(expected.values, actual.values)
+
+  test_case.assertEqual(np.int64, np.array(actual.dense_shape).dtype)
+  test_case.assertAllEqual(expected.dense_shape, actual.dense_shape)
+
+
+class VocabularyFileCategoricalColumnTest(test.TestCase):
+
+  def setUp(self):
+    super(VocabularyFileCategoricalColumnTest, self).setUp()
+
+    # Contains ints, Golden State Warriors jersey numbers: 30, 35, 11, 23, 22
+    self._warriors_vocabulary_file_name = test.test_src_dir_path(
+        'python/feature_column/testdata/warriors_vocabulary.txt')
+    self._warriors_vocabulary_size = 5
+
+    # Contains strings, character names from 'The Wire': omar, stringer, marlo
+    self._wire_vocabulary_file_name = test.test_src_dir_path(
+        'python/feature_column/testdata/wire_vocabulary.txt')
+    self._wire_vocabulary_size = 3
+
+  def test_defaults(self):
+    column = fc.categorical_column_with_vocabulary_file(
+        key='aaa', vocabulary_file='path_to_file', vocabulary_size=3)
+    self.assertEqual('aaa', column.name)
+    self.assertEqual(3, column._num_buckets)
+    self.assertEqual({
+        'aaa': parsing_ops.VarLenFeature(dtypes.string)
+    }, column._parse_example_spec)
+
+  def test_all_constructor_args(self):
+    column = fc.categorical_column_with_vocabulary_file(
+        key='aaa', vocabulary_file='path_to_file', vocabulary_size=3,
+        num_oov_buckets=4, dtype=dtypes.int32)
+    self.assertEqual(7, column._num_buckets)
+    self.assertEqual({
+        'aaa': parsing_ops.VarLenFeature(dtypes.int32)
+    }, column._parse_example_spec)
+
+  def test_deep_copy(self):
+    original = fc.categorical_column_with_vocabulary_file(
+        key='aaa', vocabulary_file='path_to_file', vocabulary_size=3,
+        num_oov_buckets=4, dtype=dtypes.int32)
+    for column in (original, copy.deepcopy(original)):
+      self.assertEqual('aaa', column.name)
+      self.assertEqual(7, column._num_buckets)
+      self.assertEqual({
+          'aaa': parsing_ops.VarLenFeature(dtypes.int32)
+      }, column._parse_example_spec)
+
+  def test_vocabulary_file_none(self):
+    with self.assertRaisesRegexp(ValueError, 'Missing vocabulary_file'):
+      fc.categorical_column_with_vocabulary_file(
+          key='aaa', vocabulary_file=None, vocabulary_size=3)
+
+  def test_vocabulary_file_empty_string(self):
+    with self.assertRaisesRegexp(ValueError, 'Missing vocabulary_file'):
+      fc.categorical_column_with_vocabulary_file(
+          key='aaa', vocabulary_file='', vocabulary_size=3)
+
+  def test_invalid_vocabulary_file(self):
+    column = fc.categorical_column_with_vocabulary_file(
+        key='aaa', vocabulary_file='file_does_not_exist', vocabulary_size=10)
+    inputs = sparse_tensor.SparseTensorValue(
+        indices=((0, 0), (1, 0), (1, 1)),
+        values=('marlo', 'skywalker', 'omar'),
+        dense_shape=(2, 2))
+    column._get_sparse_tensors(_LazyBuilder({'aaa': inputs}))
+    with self.assertRaisesRegexp(errors.OpError, 'file_does_not_exist'):
+      with self.test_session():
+        lookup_ops.tables_initializer().run()
+
+  def test_invalid_vocabulary_size(self):
+    with self.assertRaisesRegexp(ValueError, 'Invalid vocabulary_size'):
+      fc.categorical_column_with_vocabulary_file(
+          key='aaa', vocabulary_file=self._wire_vocabulary_file_name,
+          vocabulary_size=None)
+    with self.assertRaisesRegexp(ValueError, 'Invalid vocabulary_size'):
+      fc.categorical_column_with_vocabulary_file(
+          key='aaa', vocabulary_file=self._wire_vocabulary_file_name,
+          vocabulary_size=-1)
+    with self.assertRaisesRegexp(ValueError, 'Invalid vocabulary_size'):
+      fc.categorical_column_with_vocabulary_file(
+          key='aaa', vocabulary_file=self._wire_vocabulary_file_name,
+          vocabulary_size=0)
+
+  def test_too_large_vocabulary_size(self):
+    column = fc.categorical_column_with_vocabulary_file(
+        key='aaa',
+        vocabulary_file=self._wire_vocabulary_file_name,
+        vocabulary_size=self._wire_vocabulary_size + 1)
+    inputs = sparse_tensor.SparseTensorValue(
+        indices=((0, 0), (1, 0), (1, 1)),
+        values=('marlo', 'skywalker', 'omar'),
+        dense_shape=(2, 2))
+    column._get_sparse_tensors(_LazyBuilder({'aaa': inputs}))
+    with self.assertRaisesRegexp(errors.OpError, 'Invalid vocab_size'):
+      with self.test_session():
+        lookup_ops.tables_initializer().run()
+
+  def test_invalid_num_oov_buckets(self):
+    with self.assertRaisesRegexp(ValueError, 'Invalid num_oov_buckets'):
+      fc.categorical_column_with_vocabulary_file(
+          key='aaa', vocabulary_file='path', vocabulary_size=3,
+          num_oov_buckets=-1)
+
+  def test_invalid_dtype(self):
+    with self.assertRaisesRegexp(ValueError, 'dtype must be string or integer'):
+      fc.categorical_column_with_vocabulary_file(
+          key='aaa', vocabulary_file='path', vocabulary_size=3,
+          dtype=dtypes.float64)
+
+  def test_invalid_buckets_and_default_value(self):
+    with self.assertRaisesRegexp(
+        ValueError, 'both num_oov_buckets and default_value'):
+      fc.categorical_column_with_vocabulary_file(
+          key='aaa',
+          vocabulary_file=self._wire_vocabulary_file_name,
+          vocabulary_size=self._wire_vocabulary_size,
+          num_oov_buckets=100,
+          default_value=2)
+
+  def test_invalid_input_dtype_int32(self):
+    column = fc.categorical_column_with_vocabulary_file(
+        key='aaa',
+        vocabulary_file=self._wire_vocabulary_file_name,
+        vocabulary_size=self._wire_vocabulary_size,
+        dtype=dtypes.string)
+    inputs = sparse_tensor.SparseTensorValue(
+        indices=((0, 0), (1, 0), (1, 1)),
+        values=(12, 24, 36),
+        dense_shape=(2, 2))
+    with self.assertRaisesRegexp(ValueError, 'dtype must be compatible'):
+      column._get_sparse_tensors(_LazyBuilder({'aaa': inputs}))
+
+  def test_invalid_input_dtype_string(self):
+    column = fc.categorical_column_with_vocabulary_file(
+        key='aaa',
+        vocabulary_file=self._warriors_vocabulary_file_name,
+        vocabulary_size=self._warriors_vocabulary_size,
+        dtype=dtypes.int32)
+    inputs = sparse_tensor.SparseTensorValue(
+        indices=((0, 0), (1, 0), (1, 1)),
+        values=('omar', 'stringer', 'marlo'),
+        dense_shape=(2, 2))
+    with self.assertRaisesRegexp(ValueError, 'dtype must be compatible'):
+      column._get_sparse_tensors(_LazyBuilder({'aaa': inputs}))
+
+  def test_parse_example(self):
+    a = fc.categorical_column_with_vocabulary_file(
+        key='aaa', vocabulary_file='path_to_file', vocabulary_size=3)
+    data = example_pb2.Example(features=feature_pb2.Features(
+        feature={
+            'aaa':
+                feature_pb2.Feature(bytes_list=feature_pb2.BytesList(
+                    value=[b'omar', b'stringer']))
+        }))
+    features = parsing_ops.parse_example(
+        serialized=[data.SerializeToString()],
+        features=fc.make_parse_example_spec([a]))
+    self.assertIn('aaa', features)
+    with self.test_session():
+      _assert_sparse_tensor_value(
+          self,
+          sparse_tensor.SparseTensorValue(
+              indices=[[0, 0], [0, 1]],
+              values=np.array([b'omar', b'stringer'], dtype=np.object_),
+              dense_shape=[1, 2]),
+          features['aaa'].eval())
+
+  def test_get_sparse_tensors(self):
+    column = fc.categorical_column_with_vocabulary_file(
+        key='aaa',
+        vocabulary_file=self._wire_vocabulary_file_name,
+        vocabulary_size=self._wire_vocabulary_size)
+    inputs = sparse_tensor.SparseTensorValue(
+        indices=((0, 0), (1, 0), (1, 1)),
+        values=('marlo', 'skywalker', 'omar'),
+        dense_shape=(2, 2))
+    id_weight_pair = column._get_sparse_tensors(_LazyBuilder({'aaa': inputs}))
+    self.assertIsNone(id_weight_pair.weight_tensor)
+    with _initialized_session():
+      _assert_sparse_tensor_value(
+          self,
+          sparse_tensor.SparseTensorValue(
+              indices=inputs.indices,
+              values=np.array((2, -1, 0), dtype=np.int64),
+              dense_shape=inputs.dense_shape),
+          id_weight_pair.id_tensor.eval())
+
+  def test_transform_feature(self):
+    column = fc.categorical_column_with_vocabulary_file(
+        key='aaa',
+        vocabulary_file=self._wire_vocabulary_file_name,
+        vocabulary_size=self._wire_vocabulary_size)
+    inputs = sparse_tensor.SparseTensorValue(
+        indices=((0, 0), (1, 0), (1, 1)),
+        values=('marlo', 'skywalker', 'omar'),
+        dense_shape=(2, 2))
+    id_tensor = _transform_features({'aaa': inputs}, [column])[column]
+    with _initialized_session():
+      _assert_sparse_tensor_value(self,
+                                  sparse_tensor.SparseTensorValue(
+                                      indices=inputs.indices,
+                                      values=np.array(
+                                          (2, -1, 0), dtype=np.int64),
+                                      dense_shape=inputs.dense_shape),
+                                  id_tensor.eval())
+
+  def test_get_sparse_tensors_weight_collections(self):
+    column = fc.categorical_column_with_vocabulary_file(
+        key='aaa',
+        vocabulary_file=self._wire_vocabulary_file_name,
+        vocabulary_size=self._wire_vocabulary_size)
+    inputs = sparse_tensor.SparseTensor(
+        values=['omar', 'stringer', 'marlo'],
+        indices=[[0, 0], [1, 0], [1, 1]],
+        dense_shape=[2, 2])
+    column._get_sparse_tensors(
+        _LazyBuilder({
+            'aaa': inputs
+        }), weight_collections=('my_weights',))
+
+    self.assertItemsEqual(
+        [], ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES))
+    self.assertItemsEqual([], ops.get_collection('my_weights'))
+
+  def test_get_sparse_tensors_dense_input(self):
+    column = fc.categorical_column_with_vocabulary_file(
+        key='aaa',
+        vocabulary_file=self._wire_vocabulary_file_name,
+        vocabulary_size=self._wire_vocabulary_size)
+    id_weight_pair = column._get_sparse_tensors(
+        _LazyBuilder({
+            'aaa': (('marlo', ''), ('skywalker', 'omar'))
+        }))
+    self.assertIsNone(id_weight_pair.weight_tensor)
+    with _initialized_session():
+      _assert_sparse_tensor_value(
+          self,
+          sparse_tensor.SparseTensorValue(
+              indices=((0, 0), (1, 0), (1, 1)),
+              values=np.array((2, -1, 0), dtype=np.int64),
+              dense_shape=(2, 2)),
+          id_weight_pair.id_tensor.eval())
+
+  def test_get_sparse_tensors_default_value_in_vocabulary(self):
+    column = fc.categorical_column_with_vocabulary_file(
+        key='aaa',
+        vocabulary_file=self._wire_vocabulary_file_name,
+        vocabulary_size=self._wire_vocabulary_size,
+        default_value=2)
+    inputs = sparse_tensor.SparseTensorValue(
+        indices=((0, 0), (1, 0), (1, 1)),
+        values=('marlo', 'skywalker', 'omar'),
+        dense_shape=(2, 2))
+    id_weight_pair = column._get_sparse_tensors(_LazyBuilder({'aaa': inputs}))
+    self.assertIsNone(id_weight_pair.weight_tensor)
+    with _initialized_session():
+      _assert_sparse_tensor_value(
+          self,
+          sparse_tensor.SparseTensorValue(
+              indices=inputs.indices,
+              values=np.array((2, 2, 0), dtype=np.int64),
+              dense_shape=inputs.dense_shape),
+          id_weight_pair.id_tensor.eval())
+
+  def test_get_sparse_tensors_with_oov_buckets(self):
+    column = fc.categorical_column_with_vocabulary_file(
+        key='aaa',
+        vocabulary_file=self._wire_vocabulary_file_name,
+        vocabulary_size=self._wire_vocabulary_size,
+        num_oov_buckets=100)
+    inputs = sparse_tensor.SparseTensorValue(
+        indices=((0, 0), (1, 0), (1, 1), (1, 2)),
+        values=('marlo', 'skywalker', 'omar', 'heisenberg'),
+        dense_shape=(2, 3))
+    id_weight_pair = column._get_sparse_tensors(_LazyBuilder({'aaa': inputs}))
+    self.assertIsNone(id_weight_pair.weight_tensor)
+    with _initialized_session():
+      _assert_sparse_tensor_value(
+          self,
+          sparse_tensor.SparseTensorValue(
+              indices=inputs.indices,
+              values=np.array((2, 33, 0, 62), dtype=np.int64),
+              dense_shape=inputs.dense_shape),
+          id_weight_pair.id_tensor.eval())
+
+  def test_get_sparse_tensors_small_vocabulary_size(self):
+    # 'marlo' is the last entry in our vocabulary file, so be setting
+    # `vocabulary_size` to 1 less than number of entries in file, we take
+    # 'marlo' out of the vocabulary.
+    column = fc.categorical_column_with_vocabulary_file(
+        key='aaa',
+        vocabulary_file=self._wire_vocabulary_file_name,
+        vocabulary_size=self._wire_vocabulary_size - 1)
+    inputs = sparse_tensor.SparseTensorValue(
+        indices=((0, 0), (1, 0), (1, 1)),
+        values=('marlo', 'skywalker', 'omar'),
+        dense_shape=(2, 2))
+    id_weight_pair = column._get_sparse_tensors(_LazyBuilder({'aaa': inputs}))
+    self.assertIsNone(id_weight_pair.weight_tensor)
+    with _initialized_session():
+      _assert_sparse_tensor_value(
+          self,
+          sparse_tensor.SparseTensorValue(
+              indices=inputs.indices,
+              values=np.array((-1, -1, 0), dtype=np.int64),
+              dense_shape=inputs.dense_shape),
+          id_weight_pair.id_tensor.eval())
+
+  def test_get_sparse_tensors_int32(self):
+    column = fc.categorical_column_with_vocabulary_file(
+        key='aaa',
+        vocabulary_file=self._warriors_vocabulary_file_name,
+        vocabulary_size=self._warriors_vocabulary_size,
+        dtype=dtypes.int32)
+    inputs = sparse_tensor.SparseTensorValue(
+        indices=((0, 0), (1, 0), (1, 1), (2, 2)),
+        values=(11, 100, 30, 22),
+        dense_shape=(3, 3))
+    id_weight_pair = column._get_sparse_tensors(_LazyBuilder({'aaa': inputs}))
+    self.assertIsNone(id_weight_pair.weight_tensor)
+    with _initialized_session():
+      _assert_sparse_tensor_value(
+          self,
+          sparse_tensor.SparseTensorValue(
+              indices=inputs.indices,
+              values=np.array((2, -1, 0, 4), dtype=np.int64),
+              dense_shape=inputs.dense_shape),
+          id_weight_pair.id_tensor.eval())
+
+  def test_get_sparse_tensors_int32_dense_input(self):
+    default_value = -100
+    column = fc.categorical_column_with_vocabulary_file(
+        key='aaa',
+        vocabulary_file=self._warriors_vocabulary_file_name,
+        vocabulary_size=self._warriors_vocabulary_size,
+        dtype=dtypes.int32,
+        default_value=default_value)
+    id_weight_pair = column._get_sparse_tensors(
+        _LazyBuilder({
+            'aaa': ((11, -1, -1), (100, 30, -1), (-1, -1, 22))
+        }))
+    self.assertIsNone(id_weight_pair.weight_tensor)
+    with _initialized_session():
+      _assert_sparse_tensor_value(
+          self,
+          sparse_tensor.SparseTensorValue(
+              indices=((0, 0), (1, 0), (1, 1), (2, 2)),
+              values=np.array((2, default_value, 0, 4), dtype=np.int64),
+              dense_shape=(3, 3)),
+          id_weight_pair.id_tensor.eval())
+
+  def test_get_sparse_tensors_int32_with_oov_buckets(self):
+    column = fc.categorical_column_with_vocabulary_file(
+        key='aaa',
+        vocabulary_file=self._warriors_vocabulary_file_name,
+        vocabulary_size=self._warriors_vocabulary_size,
+        dtype=dtypes.int32,
+        num_oov_buckets=100)
+    inputs = sparse_tensor.SparseTensorValue(
+        indices=((0, 0), (1, 0), (1, 1), (2, 2)),
+        values=(11, 100, 30, 22),
+        dense_shape=(3, 3))
+    id_weight_pair = column._get_sparse_tensors(_LazyBuilder({'aaa': inputs}))
+    self.assertIsNone(id_weight_pair.weight_tensor)
+    with _initialized_session():
+      _assert_sparse_tensor_value(
+          self,
+          sparse_tensor.SparseTensorValue(
+              indices=inputs.indices,
+              values=np.array((2, 60, 0, 4), dtype=np.int64),
+              dense_shape=inputs.dense_shape),
+          id_weight_pair.id_tensor.eval())
+
+  def test_linear_model(self):
+    wire_column = fc.categorical_column_with_vocabulary_file(
+        key='wire',
+        vocabulary_file=self._wire_vocabulary_file_name,
+        vocabulary_size=self._wire_vocabulary_size,
+        num_oov_buckets=1)
+    self.assertEqual(4, wire_column._num_buckets)
+    with ops.Graph().as_default():
+      predictions = fc.linear_model({
+          wire_column.name: sparse_tensor.SparseTensorValue(
+              indices=((0, 0), (1, 0), (1, 1)),
+              values=('marlo', 'skywalker', 'omar'),
+              dense_shape=(2, 2))
+      }, (wire_column,))
+      bias = get_linear_model_bias()
+      wire_var = get_linear_model_column_var(wire_column)
+      with _initialized_session():
+        self.assertAllClose((0.,), bias.eval())
+        self.assertAllClose(((0.,), (0.,), (0.,), (0.,)), wire_var.eval())
+        self.assertAllClose(((0.,), (0.,)), predictions.eval())
+        wire_var.assign(((1.,), (2.,), (3.,), (4.,))).eval()
+        # 'marlo' -> 2: wire_var[2] = 3
+        # 'skywalker' -> 3, 'omar' -> 0: wire_var[3] + wire_var[0] = 4+1 = 5
+        self.assertAllClose(((3.,), (5.,)), predictions.eval())
+
+
+class VocabularyListCategoricalColumnTest(test.TestCase):
+
+  def test_defaults_string(self):
+    column = fc.categorical_column_with_vocabulary_list(
+        key='aaa', vocabulary_list=('omar', 'stringer', 'marlo'))
+    self.assertEqual('aaa', column.name)
+    self.assertEqual(3, column._num_buckets)
+    self.assertEqual({
+        'aaa': parsing_ops.VarLenFeature(dtypes.string)
+    }, column._parse_example_spec)
+
+  def test_defaults_int(self):
+    column = fc.categorical_column_with_vocabulary_list(
+        key='aaa', vocabulary_list=(12, 24, 36))
+    self.assertEqual('aaa', column.name)
+    self.assertEqual(3, column._num_buckets)
+    self.assertEqual({
+        'aaa': parsing_ops.VarLenFeature(dtypes.int64)
+    }, column._parse_example_spec)
+
+  def test_all_constructor_args(self):
+    column = fc.categorical_column_with_vocabulary_list(
+        key='aaa', vocabulary_list=(12, 24, 36), dtype=dtypes.int32,
+        default_value=-99)
+    self.assertEqual(3, column._num_buckets)
+    self.assertEqual({
+        'aaa': parsing_ops.VarLenFeature(dtypes.int32)
+    }, column._parse_example_spec)
+
+  def test_deep_copy(self):
+    original = fc.categorical_column_with_vocabulary_list(
+        key='aaa', vocabulary_list=(12, 24, 36), dtype=dtypes.int32)
+    for column in (original, copy.deepcopy(original)):
+      self.assertEqual('aaa', column.name)
+      self.assertEqual(3, column._num_buckets)
+      self.assertEqual({
+          'aaa': parsing_ops.VarLenFeature(dtypes.int32)
+      }, column._parse_example_spec)
+
+  def test_invalid_dtype(self):
+    with self.assertRaisesRegexp(ValueError, 'dtype must be string or integer'):
+      fc.categorical_column_with_vocabulary_list(
+          key='aaa', vocabulary_list=('omar', 'stringer', 'marlo'),
+          dtype=dtypes.float32)
+
+  def test_invalid_mapping_dtype(self):
+    with self.assertRaisesRegexp(
+        ValueError, r'vocabulary dtype must be string or integer'):
+      fc.categorical_column_with_vocabulary_list(
+          key='aaa', vocabulary_list=(12., 24., 36.))
+
+  def test_mismatched_int_dtype(self):
+    with self.assertRaisesRegexp(
+        ValueError, r'dtype.*and vocabulary dtype.*do not match'):
+      fc.categorical_column_with_vocabulary_list(
+          key='aaa', vocabulary_list=('omar', 'stringer', 'marlo'),
+          dtype=dtypes.int32)
+
+  def test_mismatched_string_dtype(self):
+    with self.assertRaisesRegexp(
+        ValueError, r'dtype.*and vocabulary dtype.*do not match'):
+      fc.categorical_column_with_vocabulary_list(
+          key='aaa', vocabulary_list=(12, 24, 36), dtype=dtypes.string)
+
+  def test_none_mapping(self):
+    with self.assertRaisesRegexp(
+        ValueError, r'vocabulary_list.*must be non-empty'):
+      fc.categorical_column_with_vocabulary_list(
+          key='aaa', vocabulary_list=None)
+
+  def test_empty_mapping(self):
+    with self.assertRaisesRegexp(
+        ValueError, r'vocabulary_list.*must be non-empty'):
+      fc.categorical_column_with_vocabulary_list(
+          key='aaa', vocabulary_list=tuple([]))
+
+  def test_duplicate_mapping(self):
+    with self.assertRaisesRegexp(ValueError, 'Duplicate keys'):
+      fc.categorical_column_with_vocabulary_list(
+          key='aaa', vocabulary_list=(12, 24, 12))
+
+  def test_invalid_input_dtype_int32(self):
+    column = fc.categorical_column_with_vocabulary_list(
+        key='aaa',
+        vocabulary_list=('omar', 'stringer', 'marlo'))
+    inputs = sparse_tensor.SparseTensorValue(
+        indices=((0, 0), (1, 0), (1, 1)),
+        values=(12, 24, 36),
+        dense_shape=(2, 2))
+    with self.assertRaisesRegexp(ValueError, 'dtype must be compatible'):
+      column._get_sparse_tensors(_LazyBuilder({'aaa': inputs}))
+
+  def test_invalid_input_dtype_string(self):
+    column = fc.categorical_column_with_vocabulary_list(
+        key='aaa',
+        vocabulary_list=(12, 24, 36))
+    inputs = sparse_tensor.SparseTensorValue(
+        indices=((0, 0), (1, 0), (1, 1)),
+        values=('omar', 'stringer', 'marlo'),
+        dense_shape=(2, 2))
+    with self.assertRaisesRegexp(ValueError, 'dtype must be compatible'):
+      column._get_sparse_tensors(_LazyBuilder({'aaa': inputs}))
+
+  def test_parse_example_string(self):
+    a = fc.categorical_column_with_vocabulary_list(
+        key='aaa', vocabulary_list=('omar', 'stringer', 'marlo'))
+    data = example_pb2.Example(features=feature_pb2.Features(
+        feature={
+            'aaa':
+                feature_pb2.Feature(bytes_list=feature_pb2.BytesList(
+                    value=[b'omar', b'stringer']))
+        }))
+    features = parsing_ops.parse_example(
+        serialized=[data.SerializeToString()],
+        features=fc.make_parse_example_spec([a]))
+    self.assertIn('aaa', features)
+    with self.test_session():
+      _assert_sparse_tensor_value(
+          self,
+          sparse_tensor.SparseTensorValue(
+              indices=[[0, 0], [0, 1]],
+              values=np.array([b'omar', b'stringer'], dtype=np.object_),
+              dense_shape=[1, 2]),
+          features['aaa'].eval())
+
+  def test_parse_example_int(self):
+    a = fc.categorical_column_with_vocabulary_list(
+        key='aaa', vocabulary_list=(11, 21, 31))
+    data = example_pb2.Example(features=feature_pb2.Features(
+        feature={
+            'aaa':
+                feature_pb2.Feature(int64_list=feature_pb2.Int64List(
+                    value=[11, 21]))
+        }))
+    features = parsing_ops.parse_example(
+        serialized=[data.SerializeToString()],
+        features=fc.make_parse_example_spec([a]))
+    self.assertIn('aaa', features)
+    with self.test_session():
+      _assert_sparse_tensor_value(
+          self,
+          sparse_tensor.SparseTensorValue(
+              indices=[[0, 0], [0, 1]],
+              values=[11, 21],
+              dense_shape=[1, 2]),
+          features['aaa'].eval())
+
+  def test_get_sparse_tensors(self):
+    column = fc.categorical_column_with_vocabulary_list(
+        key='aaa',
+        vocabulary_list=('omar', 'stringer', 'marlo'))
+    inputs = sparse_tensor.SparseTensorValue(
+        indices=((0, 0), (1, 0), (1, 1)),
+        values=('marlo', 'skywalker', 'omar'),
+        dense_shape=(2, 2))
+    id_weight_pair = column._get_sparse_tensors(_LazyBuilder({'aaa': inputs}))
+    self.assertIsNone(id_weight_pair.weight_tensor)
+    with _initialized_session():
+      _assert_sparse_tensor_value(
+          self,
+          sparse_tensor.SparseTensorValue(
+              indices=inputs.indices,
+              values=np.array((2, -1, 0), dtype=np.int64),
+              dense_shape=inputs.dense_shape),
+          id_weight_pair.id_tensor.eval())
+
+  def test_transform_feature(self):
+    column = fc.categorical_column_with_vocabulary_list(
+        key='aaa',
+        vocabulary_list=('omar', 'stringer', 'marlo'))
+    inputs = sparse_tensor.SparseTensorValue(
+        indices=((0, 0), (1, 0), (1, 1)),
+        values=('marlo', 'skywalker', 'omar'),
+        dense_shape=(2, 2))
+    id_tensor = _transform_features({'aaa': inputs}, [column])[column]
+    with _initialized_session():
+      _assert_sparse_tensor_value(
+          self,
+          sparse_tensor.SparseTensorValue(
+              indices=inputs.indices,
+              values=np.array((2, -1, 0), dtype=np.int64),
+              dense_shape=inputs.dense_shape),
+          id_tensor.eval())
+
+  def test_get_sparse_tensors_weight_collections(self):
+    column = fc.categorical_column_with_vocabulary_list(
+        key='aaa',
+        vocabulary_list=('omar', 'stringer', 'marlo'))
+    inputs = sparse_tensor.SparseTensor(
+        values=['omar', 'stringer', 'marlo'],
+        indices=[[0, 0], [1, 0], [1, 1]],
+        dense_shape=[2, 2])
+    column._get_sparse_tensors(
+        _LazyBuilder({
+            'aaa': inputs
+        }), weight_collections=('my_weights',))
+
+    self.assertItemsEqual(
+        [], ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES))
+    self.assertItemsEqual([], ops.get_collection('my_weights'))
+
+  def test_get_sparse_tensors_dense_input(self):
+    column = fc.categorical_column_with_vocabulary_list(
+        key='aaa',
+        vocabulary_list=('omar', 'stringer', 'marlo'))
+    id_weight_pair = column._get_sparse_tensors(
+        _LazyBuilder({
+            'aaa': (('marlo', ''), ('skywalker', 'omar'))
+        }))
+    self.assertIsNone(id_weight_pair.weight_tensor)
+    with _initialized_session():
+      _assert_sparse_tensor_value(
+          self,
+          sparse_tensor.SparseTensorValue(
+              indices=((0, 0), (1, 0), (1, 1)),
+              values=np.array((2, -1, 0), dtype=np.int64),
+              dense_shape=(2, 2)),
+          id_weight_pair.id_tensor.eval())
+
+  def test_get_sparse_tensors_default_value_in_vocabulary(self):
+    column = fc.categorical_column_with_vocabulary_list(
+        key='aaa',
+        vocabulary_list=('omar', 'stringer', 'marlo'),
+        default_value=2)
+    inputs = sparse_tensor.SparseTensorValue(
+        indices=((0, 0), (1, 0), (1, 1)),
+        values=('marlo', 'skywalker', 'omar'),
+        dense_shape=(2, 2))
+    id_weight_pair = column._get_sparse_tensors(_LazyBuilder({'aaa': inputs}))
+    self.assertIsNone(id_weight_pair.weight_tensor)
+    with _initialized_session():
+      _assert_sparse_tensor_value(
+          self,
+          sparse_tensor.SparseTensorValue(
+              indices=inputs.indices,
+              values=np.array((2, 2, 0), dtype=np.int64),
+              dense_shape=inputs.dense_shape),
+          id_weight_pair.id_tensor.eval())
+
+  def test_get_sparse_tensors_int32(self):
+    column = fc.categorical_column_with_vocabulary_list(
+        key='aaa',
+        vocabulary_list=np.array((30, 35, 11, 23, 22), dtype=np.int32),
+        dtype=dtypes.int32)
+    inputs = sparse_tensor.SparseTensorValue(
+        indices=((0, 0), (1, 0), (1, 1), (2, 2)),
+        values=np.array((11, 100, 30, 22), dtype=np.int32),
+        dense_shape=(3, 3))
+    id_weight_pair = column._get_sparse_tensors(_LazyBuilder({'aaa': inputs}))
+    self.assertIsNone(id_weight_pair.weight_tensor)
+    with _initialized_session():
+      _assert_sparse_tensor_value(
+          self,
+          sparse_tensor.SparseTensorValue(
+              indices=inputs.indices,
+              values=np.array((2, -1, 0, 4), dtype=np.int64),
+              dense_shape=inputs.dense_shape),
+          id_weight_pair.id_tensor.eval())
+
+  def test_get_sparse_tensors_int32_dense_input(self):
+    default_value = -100
+    column = fc.categorical_column_with_vocabulary_list(
+        key='aaa',
+        vocabulary_list=np.array((30, 35, 11, 23, 22), dtype=np.int32),
+        dtype=dtypes.int32,
+        default_value=default_value)
+    id_weight_pair = column._get_sparse_tensors(
+        _LazyBuilder({
+            'aaa':
+                np.array(
+                    ((11, -1, -1), (100, 30, -1), (-1, -1, 22)), dtype=np.int32)
+        }))
+    self.assertIsNone(id_weight_pair.weight_tensor)
+    with _initialized_session():
+      _assert_sparse_tensor_value(
+          self,
+          sparse_tensor.SparseTensorValue(
+              indices=((0, 0), (1, 0), (1, 1), (2, 2)),
+              values=np.array((2, default_value, 0, 4), dtype=np.int64),
+              dense_shape=(3, 3)),
+          id_weight_pair.id_tensor.eval())
+
+  def test_linear_model(self):
+    wire_column = fc.categorical_column_with_vocabulary_list(
+        key='aaa',
+        vocabulary_list=('omar', 'stringer', 'marlo'))
+    self.assertEqual(3, wire_column._num_buckets)
+    with ops.Graph().as_default():
+      predictions = fc.linear_model({
+          wire_column.name: sparse_tensor.SparseTensorValue(
+              indices=((0, 0), (1, 0), (1, 1)),
+              values=('marlo', 'skywalker', 'omar'),
+              dense_shape=(2, 2))
+      }, (wire_column,))
+      bias = get_linear_model_bias()
+      wire_var = get_linear_model_column_var(wire_column)
+      with _initialized_session():
+        self.assertAllClose((0.,), bias.eval())
+        self.assertAllClose(((0.,), (0.,), (0.,)), wire_var.eval())
+        self.assertAllClose(((0.,), (0.,)), predictions.eval())
+        wire_var.assign(((1.,), (2.,), (3.,))).eval()
+        # 'marlo' -> 2: wire_var[2] = 3
+        # 'skywalker' -> None, 'omar' -> 0: wire_var[0] = 1
+        self.assertAllClose(((3.,), (1.,)), predictions.eval())
+
+
+class IdentityCategoricalColumnTest(test.TestCase):
+
+  def test_constructor(self):
+    column = fc.categorical_column_with_identity(key='aaa', num_buckets=3)
+    self.assertEqual('aaa', column.name)
+    self.assertEqual(3, column._num_buckets)
+    self.assertEqual({
+        'aaa': parsing_ops.VarLenFeature(dtypes.int64)
+    }, column._parse_example_spec)
+
+  def test_deep_copy(self):
+    original = fc.categorical_column_with_identity(key='aaa', num_buckets=3)
+    for column in (original, copy.deepcopy(original)):
+      self.assertEqual('aaa', column.name)
+      self.assertEqual(3, column._num_buckets)
+      self.assertEqual({
+          'aaa': parsing_ops.VarLenFeature(dtypes.int64)
+      }, column._parse_example_spec)
+
+  def test_invalid_num_buckets_zero(self):
+    with self.assertRaisesRegexp(ValueError, 'num_buckets 0 < 1'):
+      fc.categorical_column_with_identity(key='aaa', num_buckets=0)
+
+  def test_invalid_num_buckets_negative(self):
+    with self.assertRaisesRegexp(ValueError, 'num_buckets -1 < 1'):
+      fc.categorical_column_with_identity(key='aaa', num_buckets=-1)
+
+  def test_invalid_default_value_too_small(self):
+    with self.assertRaisesRegexp(ValueError, 'default_value -1 not in range'):
+      fc.categorical_column_with_identity(
+          key='aaa', num_buckets=3, default_value=-1)
+
+  def test_invalid_default_value_too_big(self):
+    with self.assertRaisesRegexp(ValueError, 'default_value 3 not in range'):
+      fc.categorical_column_with_identity(
+          key='aaa', num_buckets=3, default_value=3)
+
+  def test_invalid_input_dtype(self):
+    column = fc.categorical_column_with_identity(key='aaa', num_buckets=3)
+    inputs = sparse_tensor.SparseTensorValue(
+        indices=((0, 0), (1, 0), (1, 1)),
+        values=('omar', 'stringer', 'marlo'),
+        dense_shape=(2, 2))
+    with self.assertRaisesRegexp(ValueError, 'Invalid input, not integer'):
+      column._get_sparse_tensors(_LazyBuilder({'aaa': inputs}))
+
+  def test_parse_example(self):
+    a = fc.categorical_column_with_identity(key='aaa', num_buckets=30)
+    data = example_pb2.Example(features=feature_pb2.Features(
+        feature={
+            'aaa':
+                feature_pb2.Feature(int64_list=feature_pb2.Int64List(
+                    value=[11, 21]))
+        }))
+    features = parsing_ops.parse_example(
+        serialized=[data.SerializeToString()],
+        features=fc.make_parse_example_spec([a]))
+    self.assertIn('aaa', features)
+    with self.test_session():
+      _assert_sparse_tensor_value(
+          self,
+          sparse_tensor.SparseTensorValue(
+              indices=[[0, 0], [0, 1]],
+              values=np.array([11, 21], dtype=np.int64),
+              dense_shape=[1, 2]),
+          features['aaa'].eval())
+
+  def test_get_sparse_tensors(self):
+    column = fc.categorical_column_with_identity(key='aaa', num_buckets=3)
+    inputs = sparse_tensor.SparseTensorValue(
+        indices=((0, 0), (1, 0), (1, 1)),
+        values=(0, 1, 0),
+        dense_shape=(2, 2))
+    id_weight_pair = column._get_sparse_tensors(_LazyBuilder({'aaa': inputs}))
+    self.assertIsNone(id_weight_pair.weight_tensor)
+    with _initialized_session():
+      _assert_sparse_tensor_value(
+          self,
+          sparse_tensor.SparseTensorValue(
+              indices=inputs.indices,
+              values=np.array((0, 1, 0), dtype=np.int64),
+              dense_shape=inputs.dense_shape),
+          id_weight_pair.id_tensor.eval())
+
+  def test_transform_feature(self):
+    column = fc.categorical_column_with_identity(key='aaa', num_buckets=3)
+    inputs = sparse_tensor.SparseTensorValue(
+        indices=((0, 0), (1, 0), (1, 1)),
+        values=(0, 1, 0),
+        dense_shape=(2, 2))
+    id_tensor = _transform_features({'aaa': inputs}, [column])[column]
+    with _initialized_session():
+      _assert_sparse_tensor_value(
+          self,
+          sparse_tensor.SparseTensorValue(
+              indices=inputs.indices,
+              values=np.array((0, 1, 0), dtype=np.int64),
+              dense_shape=inputs.dense_shape),
+          id_tensor.eval())
+
+  def test_get_sparse_tensors_weight_collections(self):
+    column = fc.categorical_column_with_identity(key='aaa', num_buckets=3)
+    inputs = sparse_tensor.SparseTensorValue(
+        indices=((0, 0), (1, 0), (1, 1)),
+        values=(0, 1, 0),
+        dense_shape=(2, 2))
+    column._get_sparse_tensors(
+        _LazyBuilder({
+            'aaa': inputs
+        }), weight_collections=('my_weights',))
+
+    self.assertItemsEqual(
+        [], ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES))
+    self.assertItemsEqual([], ops.get_collection('my_weights'))
+
+  def test_get_sparse_tensors_dense_input(self):
+    column = fc.categorical_column_with_identity(key='aaa', num_buckets=3)
+    id_weight_pair = column._get_sparse_tensors(
+        _LazyBuilder({
+            'aaa': ((0, -1), (1, 0))
+        }))
+    self.assertIsNone(id_weight_pair.weight_tensor)
+    with _initialized_session():
+      _assert_sparse_tensor_value(
+          self,
+          sparse_tensor.SparseTensorValue(
+              indices=((0, 0), (1, 0), (1, 1)),
+              values=np.array((0, 1, 0), dtype=np.int64),
+              dense_shape=(2, 2)),
+          id_weight_pair.id_tensor.eval())
+
+  def test_get_sparse_tensors_with_inputs_too_small(self):
+    column = fc.categorical_column_with_identity(key='aaa', num_buckets=3)
+    inputs = sparse_tensor.SparseTensorValue(
+        indices=((0, 0), (1, 0), (1, 1)),
+        values=(1, -1, 0),
+        dense_shape=(2, 2))
+    id_weight_pair = column._get_sparse_tensors(_LazyBuilder({'aaa': inputs}))
+    self.assertIsNone(id_weight_pair.weight_tensor)
+    with _initialized_session():
+      with self.assertRaisesRegexp(
+          errors.OpError, 'assert_greater_or_equal_0'):
+        id_weight_pair.id_tensor.eval()
+
+  def test_get_sparse_tensors_with_inputs_too_big(self):
+    column = fc.categorical_column_with_identity(key='aaa', num_buckets=3)
+    inputs = sparse_tensor.SparseTensorValue(
+        indices=((0, 0), (1, 0), (1, 1)),
+        values=(1, 99, 0),
+        dense_shape=(2, 2))
+    id_weight_pair = column._get_sparse_tensors(_LazyBuilder({'aaa': inputs}))
+    self.assertIsNone(id_weight_pair.weight_tensor)
+    with _initialized_session():
+      with self.assertRaisesRegexp(
+          errors.OpError, 'assert_less_than_num_buckets'):
+        id_weight_pair.id_tensor.eval()
+
+  def test_get_sparse_tensors_with_default_value(self):
+    column = fc.categorical_column_with_identity(
+        key='aaa', num_buckets=4, default_value=3)
+    inputs = sparse_tensor.SparseTensorValue(
+        indices=((0, 0), (1, 0), (1, 1)),
+        values=(1, -1, 99),
+        dense_shape=(2, 2))
+    id_weight_pair = column._get_sparse_tensors(_LazyBuilder({'aaa': inputs}))
+    self.assertIsNone(id_weight_pair.weight_tensor)
+    with _initialized_session():
+      _assert_sparse_tensor_value(
+          self,
+          sparse_tensor.SparseTensorValue(
+              indices=inputs.indices,
+              values=np.array((1, 3, 3), dtype=np.int64),
+              dense_shape=inputs.dense_shape),
+          id_weight_pair.id_tensor.eval())
+
+  def test_get_sparse_tensors_with_default_value_and_placeholder_inputs(self):
+    column = fc.categorical_column_with_identity(
+        key='aaa', num_buckets=4, default_value=3)
+    input_indices = array_ops.placeholder(dtype=dtypes.int64)
+    input_values = array_ops.placeholder(dtype=dtypes.int32)
+    input_shape = array_ops.placeholder(dtype=dtypes.int64)
+    inputs = sparse_tensor.SparseTensorValue(
+        indices=input_indices,
+        values=input_values,
+        dense_shape=input_shape)
+    id_weight_pair = column._get_sparse_tensors(_LazyBuilder({'aaa': inputs}))
+    self.assertIsNone(id_weight_pair.weight_tensor)
+    with _initialized_session():
+      _assert_sparse_tensor_value(
+          self,
+          sparse_tensor.SparseTensorValue(
+              indices=np.array(((0, 0), (1, 0), (1, 1)), dtype=np.int64),
+              values=np.array((1, 3, 3), dtype=np.int64),
+              dense_shape=np.array((2, 2), dtype=np.int64)),
+          id_weight_pair.id_tensor.eval(feed_dict={
+              input_indices: ((0, 0), (1, 0), (1, 1)),
+              input_values: (1, -1, 99),
+              input_shape: (2, 2),
+          }))
+
+  def test_linear_model(self):
+    column = fc.categorical_column_with_identity(key='aaa', num_buckets=3)
+    self.assertEqual(3, column._num_buckets)
+    with ops.Graph().as_default():
+      predictions = fc.linear_model({
+          column.name: sparse_tensor.SparseTensorValue(
+              indices=((0, 0), (1, 0), (1, 1)),
+              values=(0, 2, 1),
+              dense_shape=(2, 2))
+      }, (column,))
+      bias = get_linear_model_bias()
+      weight_var = get_linear_model_column_var(column)
+      with _initialized_session():
+        self.assertAllClose((0.,), bias.eval())
+        self.assertAllClose(((0.,), (0.,), (0.,)), weight_var.eval())
+        self.assertAllClose(((0.,), (0.,)), predictions.eval())
+        weight_var.assign(((1.,), (2.,), (3.,))).eval()
+        # weight_var[0] = 1
+        # weight_var[2] + weight_var[1] = 3+2 = 5
+        self.assertAllClose(((1.,), (5.,)), predictions.eval())
+
+
+class TransformFeaturesTest(test.TestCase):
+
+  # All transform tests are distributed in column test.
+  # Here we only test multi column case and naming
+  def transform_multi_column(self):
+    bucketized_price = fc.bucketized_column(
+        fc.numeric_column('price'), boundaries=[0, 2, 4, 6])
+    hashed_sparse = fc.categorical_column_with_hash_bucket('wire', 10)
+    with ops.Graph().as_default():
+      features = {
+          'price': [[-1.], [5.]],
+          'wire':
+              sparse_tensor.SparseTensor(
+                  values=['omar', 'stringer', 'marlo'],
+                  indices=[[0, 0], [1, 0], [1, 1]],
+                  dense_shape=[2, 2])
+      }
+      transformed = _transform_features(features,
+                                        [bucketized_price, hashed_sparse])
+      with _initialized_session():
+        self.assertIn(bucketized_price.name, transformed[bucketized_price].name)
+        self.assertAllEqual([[0], [3]], transformed[bucketized_price].eval())
+        self.assertIn(hashed_sparse.name, transformed[hashed_sparse].name)
+        self.assertAllEqual([6, 4, 1], transformed[hashed_sparse].values.eval())
+
+  def test_column_order(self):
+    """When the column is both dense and sparse, uses sparse tensors."""
+
+    class _LoggerColumn(_FeatureColumn):
+
+      def __init__(self, name):
+        self._name = name
+
+      @property
+      def name(self):
+        return self._name
+
+      def _transform_feature(self, inputs):
+        del inputs
+        self.call_order = call_logger['count']
+        call_logger['count'] += 1
+        return 'Anything'
+
+      @property
+      def _parse_example_spec(self):
+        pass
+
+    with ops.Graph().as_default():
+      column1 = _LoggerColumn('1')
+      column2 = _LoggerColumn('2')
+      call_logger = {'count': 0}
+      _transform_features({}, [column1, column2])
+      self.assertEqual(0, column1.call_order)
+      self.assertEqual(1, column2.call_order)
+
+      call_logger = {'count': 0}
+      _transform_features({}, [column2, column1])
+      self.assertEqual(0, column1.call_order)
+      self.assertEqual(1, column2.call_order)
+
+
+class IndicatorColumnTest(test.TestCase):
+
+  def test_indicator_column(self):
+    a = fc.categorical_column_with_hash_bucket('a', 4)
+    indicator_a = fc.indicator_column(a)
+    self.assertEqual(indicator_a.categorical_column.name, 'a')
+    self.assertEqual(indicator_a._variable_shape, [1, 4])
+
+    b = fc.categorical_column_with_hash_bucket('b', hash_bucket_size=100)
+    indicator_b = fc.indicator_column(b)
+    self.assertEqual(indicator_b.categorical_column.name, 'b')
+    self.assertEqual(indicator_b._variable_shape, [1, 100])
+
+  def test_1D_shape_fails(self):
+    with self.assertRaisesRegexp(ValueError, 'must have rank 2'):
+      _LazyBuilder({
+          'animal':
+              sparse_tensor.SparseTensor(
+                  indices=[0], values=['fox'], dense_shape=[1])
+      })
+
+  def test_2D_shape_succeeds(self):
+    # TODO(ispir/cassandrax): Swith to categorical_column_with_keys when ready.
+    animal = fc.indicator_column(
+        fc.categorical_column_with_hash_bucket('animal', 4))
+    builder = _LazyBuilder({
+        'animal':
+            sparse_tensor.SparseTensor(
+                indices=[[0, 0], [1, 0]],
+                values=['fox', 'fox'],
+                dense_shape=[2, 1])
+    })
+    output = builder.get(animal)
+    with self.test_session():
+      self.assertAllEqual([[0., 0., 1., 0.], [0., 0., 1., 0.]], output.eval())
+
+  def test_multi_hot(self):
+    animal = fc.indicator_column(
+        fc.categorical_column_with_identity('animal', num_buckets=4))
+
+    builder = _LazyBuilder({
+        'animal':
+            sparse_tensor.SparseTensor(
+                indices=[[0, 0], [0, 1]], values=[1, 1], dense_shape=[1, 2])
+    })
+    output = builder.get(animal)
+    with self.test_session():
+      self.assertAllEqual([[0., 2., 0., 0.]], output.eval())
+
+  def test_multi_hot2(self):
+    animal = fc.indicator_column(
+        fc.categorical_column_with_identity('animal', num_buckets=4))
+    builder = _LazyBuilder({
+        'animal':
+            sparse_tensor.SparseTensor(
+                indices=[[0, 0], [0, 1]], values=[1, 2], dense_shape=[1, 2])
+    })
+    output = builder.get(animal)
+    with self.test_session():
+      self.assertAllEqual([[0., 1., 1., 0.]], output.eval())
+
+  def test_deep_copy(self):
+    a = fc.categorical_column_with_hash_bucket('a', 4)
+    column = fc.indicator_column(a)
+    column_copy = copy.deepcopy(column)
+    self.assertEqual(column_copy.categorical_column.name, 'a')
+    self.assertEqual(column.name, 'a_indicator')
+    self.assertEqual(column._variable_shape, [1, 4])
+
+  def test_parse_example(self):
+    a = fc.categorical_column_with_vocabulary_list(
+        key='aaa', vocabulary_list=('omar', 'stringer', 'marlo'))
+    a_indicator = fc.indicator_column(a)
+    data = example_pb2.Example(features=feature_pb2.Features(
+        feature={
+            'aaa':
+                feature_pb2.Feature(bytes_list=feature_pb2.BytesList(
+                    value=[b'omar', b'stringer']))
+        }))
+    features = parsing_ops.parse_example(
+        serialized=[data.SerializeToString()],
+        features=fc.make_parse_example_spec([a_indicator]))
+    self.assertIn('aaa', features)
+    with self.test_session():
+      _assert_sparse_tensor_value(
+          self,
+          sparse_tensor.SparseTensorValue(
+              indices=[[0, 0], [0, 1]],
+              values=np.array([b'omar', b'stringer'], dtype=np.object_),
+              dense_shape=[1, 2]),
+          features['aaa'].eval())
+
+  def test_transform(self):
+    a = fc.categorical_column_with_vocabulary_list(
+        key='aaa', vocabulary_list=('omar', 'stringer', 'marlo'))
+    a_indicator = fc.indicator_column(a)
+    features = {
+        'aaa': sparse_tensor.SparseTensorValue(
+            indices=((0, 0), (1, 0), (1, 1)),
+            values=('marlo', 'skywalker', 'omar'),
+            dense_shape=(2, 2))
+    }
+    indicator_tensor = _transform_features(features, [a_indicator])[a_indicator]
+    with _initialized_session():
+      self.assertAllEqual([[0, 0, 1], [1, 0, 0]], indicator_tensor.eval())
+
+  def test_linear_model(self):
+    animal = fc.indicator_column(
+        fc.categorical_column_with_identity('animal', num_buckets=4))
+    with ops.Graph().as_default():
+      features = {
+          'animal':
+              sparse_tensor.SparseTensor(
+                  indices=[[0, 0], [0, 1]], values=[1, 2], dense_shape=[1, 2])
+      }
+
+      predictions = fc.linear_model(features, [animal])
+      weight_var = get_linear_model_column_var(animal)
+      with _initialized_session():
+        # All should be zero-initialized.
+        self.assertAllClose([[0.], [0.], [0.], [0.]], weight_var.eval())
+        self.assertAllClose([[0.]], predictions.eval())
+        weight_var.assign([[1.], [2.], [3.], [4.]]).eval()
+        self.assertAllClose([[2. + 3.]], predictions.eval())
+
+  def test_input_layer(self):
+    animal = fc.indicator_column(
+        fc.categorical_column_with_identity('animal', num_buckets=4))
+    with ops.Graph().as_default():
+      features = {
+          'animal':
+              sparse_tensor.SparseTensor(
+                  indices=[[0, 0], [0, 1]], values=[1, 2], dense_shape=[1, 2])
+      }
+      net = fc.input_layer(features, [animal])
+      with _initialized_session():
+        self.assertAllClose([[0., 1., 1., 0.]], net.eval())
+
+
+class EmbeddingColumnTest(test.TestCase):
+
+  def test_defaults(self):
+    categorical_column = fc.categorical_column_with_identity(
+        key='aaa', num_buckets=3)
+    embedding_dimension = 2
+    embedding_column = fc.embedding_column(
+        categorical_column, dimension=embedding_dimension)
+    self.assertIs(categorical_column, embedding_column.categorical_column)
+    self.assertEqual(embedding_dimension, embedding_column.dimension)
+    self.assertEqual('mean', embedding_column.combiner)
+    self.assertIsNotNone(embedding_column.initializer)
+    self.assertIsNone(embedding_column.ckpt_to_load_from)
+    self.assertIsNone(embedding_column.tensor_name_in_ckpt)
+    self.assertIsNone(embedding_column.max_norm)
+    self.assertTrue(embedding_column.trainable)
+    self.assertEqual('aaa_embedding', embedding_column.name)
+    self.assertEqual(
+        (embedding_dimension,), embedding_column._variable_shape)
+    self.assertEqual({
+        'aaa': parsing_ops.VarLenFeature(dtypes.int64)
+    }, embedding_column._parse_example_spec)
+
+  def test_all_constructor_args(self):
+    categorical_column = fc.categorical_column_with_identity(
+        key='aaa', num_buckets=3)
+    embedding_dimension = 2
+    embedding_column = fc.embedding_column(
+        categorical_column, dimension=embedding_dimension,
+        combiner='my_combiner', initializer=lambda: 'my_initializer',
+        ckpt_to_load_from='my_ckpt', tensor_name_in_ckpt='my_ckpt_tensor',
+        max_norm=42., trainable=False)
+    self.assertIs(categorical_column, embedding_column.categorical_column)
+    self.assertEqual(embedding_dimension, embedding_column.dimension)
+    self.assertEqual('my_combiner', embedding_column.combiner)
+    self.assertEqual('my_initializer', embedding_column.initializer())
+    self.assertEqual('my_ckpt', embedding_column.ckpt_to_load_from)
+    self.assertEqual('my_ckpt_tensor', embedding_column.tensor_name_in_ckpt)
+    self.assertEqual(42., embedding_column.max_norm)
+    self.assertFalse(embedding_column.trainable)
+    self.assertEqual('aaa_embedding', embedding_column.name)
+    self.assertEqual(
+        (embedding_dimension,), embedding_column._variable_shape)
+    self.assertEqual({
+        'aaa': parsing_ops.VarLenFeature(dtypes.int64)
+    }, embedding_column._parse_example_spec)
+
+  def test_deep_copy(self):
+    categorical_column = fc.categorical_column_with_identity(
+        key='aaa', num_buckets=3)
+    embedding_dimension = 2
+    original = fc.embedding_column(
+        categorical_column, dimension=embedding_dimension,
+        combiner='my_combiner', initializer=lambda: 'my_initializer',
+        ckpt_to_load_from='my_ckpt', tensor_name_in_ckpt='my_ckpt_tensor',
+        max_norm=42., trainable=False)
+    for embedding_column in (original, copy.deepcopy(original)):
+      self.assertEqual('aaa', embedding_column.categorical_column.name)
+      self.assertEqual(3, embedding_column.categorical_column._num_buckets)
+      self.assertEqual({
+          'aaa': parsing_ops.VarLenFeature(dtypes.int64)
+      }, embedding_column.categorical_column._parse_example_spec)
+
+      self.assertEqual(embedding_dimension, embedding_column.dimension)
+      self.assertEqual('my_combiner', embedding_column.combiner)
+      self.assertEqual('my_initializer', embedding_column.initializer())
+      self.assertEqual('my_ckpt', embedding_column.ckpt_to_load_from)
+      self.assertEqual('my_ckpt_tensor', embedding_column.tensor_name_in_ckpt)
+      self.assertEqual(42., embedding_column.max_norm)
+      self.assertFalse(embedding_column.trainable)
+      self.assertEqual('aaa_embedding', embedding_column.name)
+      self.assertEqual(
+          (embedding_dimension,), embedding_column._variable_shape)
+      self.assertEqual({
+          'aaa': parsing_ops.VarLenFeature(dtypes.int64)
+      }, embedding_column._parse_example_spec)
+
+  def test_invalid_initializer(self):
+    categorical_column = fc.categorical_column_with_identity(
+        key='aaa', num_buckets=3)
+    with self.assertRaisesRegexp(ValueError, 'initializer must be callable'):
+      fc.embedding_column(categorical_column, dimension=2, initializer='not_fn')
+
+  def test_parse_example(self):
+    a = fc.categorical_column_with_vocabulary_list(
+        key='aaa', vocabulary_list=('omar', 'stringer', 'marlo'))
+    a_embedded = fc.embedding_column(a, dimension=2)
+    data = example_pb2.Example(features=feature_pb2.Features(
+        feature={
+            'aaa':
+                feature_pb2.Feature(bytes_list=feature_pb2.BytesList(
+                    value=[b'omar', b'stringer']))
+        }))
+    features = parsing_ops.parse_example(
+        serialized=[data.SerializeToString()],
+        features=fc.make_parse_example_spec([a_embedded]))
+    self.assertIn('aaa', features)
+    with self.test_session():
+      _assert_sparse_tensor_value(
+          self,
+          sparse_tensor.SparseTensorValue(
+              indices=[[0, 0], [0, 1]],
+              values=np.array([b'omar', b'stringer'], dtype=np.object_),
+              dense_shape=[1, 2]),
+          features['aaa'].eval())
+
+  def test_transform_feature(self):
+    a = fc.categorical_column_with_identity(key='aaa', num_buckets=3)
+    a_embedded = fc.embedding_column(a, dimension=2)
+    features = {
+        'aaa': sparse_tensor.SparseTensor(
+            indices=((0, 0), (1, 0), (1, 1)),
+            values=(0, 1, 0),
+            dense_shape=(2, 2))
+    }
+    outputs = _transform_features(features, [a, a_embedded])
+    output_a = outputs[a]
+    output_embedded = outputs[a_embedded]
+    with _initialized_session():
+      _assert_sparse_tensor_value(
+          self, output_a.eval(), output_embedded.eval())
+
+  def test_get_dense_tensor(self):
+    # Inputs.
+    vocabulary_size = 3
+    sparse_input = sparse_tensor.SparseTensorValue(
+        # example 0, ids [2]
+        # example 1, ids [0, 1]
+        # example 2, ids []
+        # example 3, ids [1]
+        indices=((0, 0), (1, 0), (1, 4), (3, 0)),
+        values=(2, 0, 1, 1),
+        dense_shape=(4, 5))
+
+    # Embedding variable.
+    embedding_dimension = 2
+    embedding_values = (
+        (1., 2.),  # id 0
+        (3., 5.),  # id 1
+        (7., 11.)  # id 2
+    )
+    def _initializer(shape, dtype, partition_info):
+      self.assertAllEqual((vocabulary_size, embedding_dimension), shape)
+      self.assertEqual(dtypes.float32, dtype)
+      self.assertIsNone(partition_info)
+      return embedding_values
+
+    # Expected lookup result, using combiner='mean'.
+    expected_lookups = (
+        # example 0, ids [2], embedding = [7, 11]
+        (7., 11.),
+        # example 1, ids [0, 1], embedding = mean([1, 2] + [3, 5]) = [2, 3.5]
+        (2., 3.5),
+        # example 2, ids [], embedding = [0, 0]
+        (0., 0.),
+        # example 3, ids [1], embedding = [3, 5]
+        (3., 5.),
+    )
+
+    # Build columns.
+    categorical_column = fc.categorical_column_with_identity(
+        key='aaa', num_buckets=vocabulary_size)
+    embedding_column = fc.embedding_column(
+        categorical_column, dimension=embedding_dimension,
+        initializer=_initializer)
+
+    # Provide sparse input and get dense result.
+    embedding_lookup = embedding_column._get_dense_tensor(
+        _LazyBuilder({
+            'aaa': sparse_input
+        }))
+
+    # Assert expected embedding variable and lookups.
+    global_vars = ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)
+    self.assertItemsEqual(
+        ('embedding_weights:0',), tuple([v.name for v in global_vars]))
+    with _initialized_session():
+      self.assertAllEqual(embedding_values, global_vars[0].eval())
+      self.assertAllEqual(expected_lookups, embedding_lookup.eval())
+
+  def test_get_dense_tensor_3d(self):
+    # Inputs.
+    vocabulary_size = 4
+    sparse_input = sparse_tensor.SparseTensorValue(
+        # example 0, ids [2]
+        # example 1, ids [0, 1]
+        # example 2, ids []
+        # example 3, ids [1]
+        indices=((0, 0, 0), (1, 1, 0), (1, 1, 4), (3, 0, 0), (3, 1, 2)),
+        values=(2, 0, 1, 1, 2),
+        dense_shape=(4, 2, 5))
+
+    # Embedding variable.
+    embedding_dimension = 3
+    embedding_values = (
+        (1., 2., 4.),   # id 0
+        (3., 5., 1.),   # id 1
+        (7., 11., 2.),  # id 2
+        (2., 7., 12.)   # id 3
+    )
+    def _initializer(shape, dtype, partition_info):
+      self.assertAllEqual((vocabulary_size, embedding_dimension), shape)
+      self.assertEqual(dtypes.float32, dtype)
+      self.assertIsNone(partition_info)
+      return embedding_values
+
+    # Expected lookup result, using combiner='mean'.
+    expected_lookups = (
+        # example 0, ids [[2], []], embedding = [[7, 11, 2], [0, 0, 0]]
+        ((7., 11., 2.), (0., 0., 0.)),
+        # example 1, ids [[], [0, 1]], embedding
+        # = mean([[], [1, 2, 4] + [3, 5, 1]]) = [[0, 0, 0], [2, 3.5, 2.5]]
+        ((0., 0., 0.), (2., 3.5, 2.5)),
+        # example 2, ids [[], []], embedding = [[0, 0, 0], [0, 0, 0]]
+        ((0., 0., 0.), (0., 0., 0.)),
+        # example 3, ids [[1], [2]], embedding = [[3, 5, 1], [7, 11, 2]]
+        ((3., 5., 1.), (7., 11., 2.)),
+    )
+
+    # Build columns.
+    categorical_column = fc.categorical_column_with_identity(
+        key='aaa', num_buckets=vocabulary_size)
+    embedding_column = fc.embedding_column(
+        categorical_column, dimension=embedding_dimension,
+        initializer=_initializer)
+
+    # Provide sparse input and get dense result.
+    embedding_lookup = embedding_column._get_dense_tensor(
+        _LazyBuilder({
+            'aaa': sparse_input
+        }))
+
+    # Assert expected embedding variable and lookups.
+    global_vars = ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)
+    self.assertItemsEqual(
+        ('embedding_weights:0',), tuple([v.name for v in global_vars]))
+    with _initialized_session():
+      self.assertAllEqual(embedding_values, global_vars[0].eval())
+      self.assertAllEqual(expected_lookups, embedding_lookup.eval())
+
+  def test_get_dense_tensor_weight_collections(self):
+    sparse_input = sparse_tensor.SparseTensorValue(
+        # example 0, ids [2]
+        # example 1, ids [0, 1]
+        # example 2, ids []
+        # example 3, ids [1]
+        indices=((0, 0), (1, 0), (1, 4), (3, 0)),
+        values=(2, 0, 1, 1),
+        dense_shape=(4, 5))
+
+    # Build columns.
+    categorical_column = fc.categorical_column_with_identity(
+        key='aaa', num_buckets=3)
+    embedding_column = fc.embedding_column(categorical_column, dimension=2)
+
+    # Provide sparse input and get dense result.
+    embedding_column._get_dense_tensor(
+        _LazyBuilder({
+            'aaa': sparse_input
+        }), weight_collections=('my_vars',))
+
+    # Assert expected embedding variable and lookups.
+    self.assertItemsEqual(
+        [], ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES))
+    my_vars = ops.get_collection('my_vars')
+    self.assertItemsEqual(
+        ('embedding_weights:0',), tuple([v.name for v in my_vars]))
+
+  def test_get_dense_tensor_placeholder_inputs(self):
+    # Inputs.
+    vocabulary_size = 3
+    sparse_input = sparse_tensor.SparseTensorValue(
+        # example 0, ids [2]
+        # example 1, ids [0, 1]
+        # example 2, ids []
+        # example 3, ids [1]
+        indices=((0, 0), (1, 0), (1, 4), (3, 0)),
+        values=(2, 0, 1, 1),
+        dense_shape=(4, 5))
+
+    # Embedding variable.
+    embedding_dimension = 2
+    embedding_values = (
+        (1., 2.),  # id 0
+        (3., 5.),  # id 1
+        (7., 11.)  # id 2
+    )
+    def _initializer(shape, dtype, partition_info):
+      self.assertAllEqual((vocabulary_size, embedding_dimension), shape)
+      self.assertEqual(dtypes.float32, dtype)
+      self.assertIsNone(partition_info)
+      return embedding_values
+
+    # Expected lookup result, using combiner='mean'.
+    expected_lookups = (
+        # example 0, ids [2], embedding = [7, 11]
+        (7., 11.),
+        # example 1, ids [0, 1], embedding = mean([1, 2] + [3, 5]) = [2, 3.5]
+        (2., 3.5),
+        # example 2, ids [], embedding = [0, 0]
+        (0., 0.),
+        # example 3, ids [1], embedding = [3, 5]
+        (3., 5.),
+    )
+
+    # Build columns.
+    categorical_column = fc.categorical_column_with_identity(
+        key='aaa', num_buckets=vocabulary_size)
+    embedding_column = fc.embedding_column(
+        categorical_column, dimension=embedding_dimension,
+        initializer=_initializer)
+
+    # Provide sparse input and get dense result.
+    input_indices = array_ops.placeholder(dtype=dtypes.int64)
+    input_values = array_ops.placeholder(dtype=dtypes.int64)
+    input_shape = array_ops.placeholder(dtype=dtypes.int64)
+    embedding_lookup = embedding_column._get_dense_tensor(
+        _LazyBuilder({
+            'aaa':
+                sparse_tensor.SparseTensorValue(
+                    indices=input_indices,
+                    values=input_values,
+                    dense_shape=input_shape)
+        }))
+
+    # Assert expected embedding variable and lookups.
+    global_vars = ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)
+    self.assertItemsEqual(
+        ('embedding_weights:0',), tuple([v.name for v in global_vars]))
+    with _initialized_session():
+      self.assertAllEqual(embedding_values, global_vars[0].eval())
+      self.assertAllEqual(expected_lookups, embedding_lookup.eval(
+          feed_dict={
+              input_indices: sparse_input.indices,
+              input_values: sparse_input.values,
+              input_shape: sparse_input.dense_shape,
+          }))
+
+  def test_get_dense_tensor_restore_from_ckpt(self):
+    # Inputs.
+    vocabulary_size = 3
+    sparse_input = sparse_tensor.SparseTensorValue(
+        # example 0, ids [2]
+        # example 1, ids [0, 1]
+        # example 2, ids []
+        # example 3, ids [1]
+        indices=((0, 0), (1, 0), (1, 4), (3, 0)),
+        values=(2, 0, 1, 1),
+        dense_shape=(4, 5))
+
+    # Embedding variable. The checkpoint file contains _embedding_values.
+    embedding_dimension = 2
+    embedding_values = (
+        (1., 2.),  # id 0
+        (3., 5.),  # id 1
+        (7., 11.)  # id 2
+    )
+    ckpt_path = test.test_src_dir_path(
+        'python/feature_column/testdata/embedding.ckpt')
+    ckpt_tensor = 'my_embedding'
+
+    # Expected lookup result, using combiner='mean'.
+    expected_lookups = (
+        # example 0, ids [2], embedding = [7, 11]
+        (7., 11.),
+        # example 1, ids [0, 1], embedding = mean([1, 2] + [3, 5]) = [2, 3.5]
+        (2., 3.5),
+        # example 2, ids [], embedding = [0, 0]
+        (0., 0.),
+        # example 3, ids [1], embedding = [3, 5]
+        (3., 5.),
+    )
+
+    # Build columns.
+    categorical_column = fc.categorical_column_with_identity(
+        key='aaa', num_buckets=vocabulary_size)
+    embedding_column = fc.embedding_column(
+        categorical_column, dimension=embedding_dimension,
+        ckpt_to_load_from=ckpt_path,
+        tensor_name_in_ckpt=ckpt_tensor)
+
+    # Provide sparse input and get dense result.
+    embedding_lookup = embedding_column._get_dense_tensor(
+        _LazyBuilder({
+            'aaa': sparse_input
+        }))
+
+    # Assert expected embedding variable and lookups.
+    global_vars = ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)
+    self.assertItemsEqual(
+        ('embedding_weights:0',), tuple([v.name for v in global_vars]))
+    with _initialized_session():
+      self.assertAllEqual(embedding_values, global_vars[0].eval())
+      self.assertAllEqual(expected_lookups, embedding_lookup.eval())
+
+  def test_linear_model(self):
+    # Inputs.
+    batch_size = 4
+    vocabulary_size = 3
+    sparse_input = sparse_tensor.SparseTensorValue(
+        # example 0, ids [2]
+        # example 1, ids [0, 1]
+        # example 2, ids []
+        # example 3, ids [1]
+        indices=((0, 0), (1, 0), (1, 4), (3, 0)),
+        values=(2, 0, 1, 1),
+        dense_shape=(batch_size, 5))
+
+    # Embedding variable.
+    embedding_dimension = 2
+    embedding_shape = (vocabulary_size, embedding_dimension)
+    zeros_embedding_values = np.zeros(embedding_shape)
+    def _initializer(shape, dtype, partition_info):
+      self.assertAllEqual(embedding_shape, shape)
+      self.assertEqual(dtypes.float32, dtype)
+      self.assertIsNone(partition_info)
+      return zeros_embedding_values
+
+    # Build columns.
+    categorical_column = fc.categorical_column_with_identity(
+        key='aaa', num_buckets=vocabulary_size)
+    embedding_column = fc.embedding_column(
+        categorical_column, dimension=embedding_dimension,
+        initializer=_initializer)
+
+    with ops.Graph().as_default():
+      predictions = fc.linear_model({
+          categorical_column.name: sparse_input
+      }, (embedding_column,))
+      expected_var_names = (
+          'linear_model/bias_weights:0',
+          'linear_model/aaa_embedding/weights:0',
+          'linear_model/aaa_embedding/embedding_weights:0',
+      )
+      self.assertItemsEqual(
+          expected_var_names,
+          [v.name for v in ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)])
+      trainable_vars = {
+          v.name: v for v in ops.get_collection(
+              ops.GraphKeys.TRAINABLE_VARIABLES)
+      }
+      self.assertItemsEqual(expected_var_names, trainable_vars.keys())
+      bias = trainable_vars['linear_model/bias_weights:0']
+      embedding_weights = trainable_vars[
+          'linear_model/aaa_embedding/embedding_weights:0']
+      linear_weights = trainable_vars[
+          'linear_model/aaa_embedding/weights:0']
+      with _initialized_session():
+        # Predictions with all zero weights.
+        self.assertAllClose(np.zeros((1,)), bias.eval())
+        self.assertAllClose(zeros_embedding_values, embedding_weights.eval())
+        self.assertAllClose(
+            np.zeros((embedding_dimension, 1)), linear_weights.eval())
+        self.assertAllClose(np.zeros((batch_size, 1)), predictions.eval())
+
+        # Predictions with all non-zero weights.
+        embedding_weights.assign((
+            (1., 2.),  # id 0
+            (3., 5.),  # id 1
+            (7., 11.)  # id 2
+        )).eval()
+        linear_weights.assign(((4.,), (6.,))).eval()
+        # example 0, ids [2], embedding[0] = [7, 11]
+        # example 1, ids [0, 1], embedding[1] = mean([1, 2] + [3, 5]) = [2, 3.5]
+        # example 2, ids [], embedding[2] = [0, 0]
+        # example 3, ids [1], embedding[3] = [3, 5]
+        # sum(embeddings * linear_weights)
+        # = [4*7 + 6*11, 4*2 + 6*3.5, 4*0 + 6*0, 4*3 + 6*5] = [94, 29, 0, 42]
+        self.assertAllClose(((94.,), (29.,), (0.,), (42.,)), predictions.eval())
+
+  def test_input_layer(self):
+    # Inputs.
+    vocabulary_size = 3
+    sparse_input = sparse_tensor.SparseTensorValue(
+        # example 0, ids [2]
+        # example 1, ids [0, 1]
+        # example 2, ids []
+        # example 3, ids [1]
+        indices=((0, 0), (1, 0), (1, 4), (3, 0)),
+        values=(2, 0, 1, 1),
+        dense_shape=(4, 5))
+
+    # Embedding variable.
+    embedding_dimension = 2
+    embedding_values = (
+        (1., 2.),  # id 0
+        (3., 5.),  # id 1
+        (7., 11.)  # id 2
+    )
+    def _initializer(shape, dtype, partition_info):
+      self.assertAllEqual((vocabulary_size, embedding_dimension), shape)
+      self.assertEqual(dtypes.float32, dtype)
+      self.assertIsNone(partition_info)
+      return embedding_values
+
+    # Expected lookup result, using combiner='mean'.
+    expected_lookups = (
+        # example 0, ids [2], embedding = [7, 11]
+        (7., 11.),
+        # example 1, ids [0, 1], embedding = mean([1, 2] + [3, 5]) = [2, 3.5]
+        (2., 3.5),
+        # example 2, ids [], embedding = [0, 0]
+        (0., 0.),
+        # example 3, ids [1], embedding = [3, 5]
+        (3., 5.),
+    )
+
+    # Build columns.
+    categorical_column = fc.categorical_column_with_identity(
+        key='aaa', num_buckets=vocabulary_size)
+    embedding_column = fc.embedding_column(
+        categorical_column, dimension=embedding_dimension,
+        initializer=_initializer)
+
+    # Provide sparse input and get dense result.
+    input_layer = fc.input_layer({'aaa': sparse_input}, (embedding_column,))
+
+    # Assert expected embedding variable and lookups.
+    global_vars = ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)
+    self.assertItemsEqual(
+        ('input_layer/aaa_embedding/embedding_weights:0',),
+        tuple([v.name for v in global_vars]))
+    trainable_vars = ops.get_collection(ops.GraphKeys.TRAINABLE_VARIABLES)
+    self.assertItemsEqual(
+        ('input_layer/aaa_embedding/embedding_weights:0',),
+        tuple([v.name for v in trainable_vars]))
+    with _initialized_session():
+      self.assertAllEqual(embedding_values, trainable_vars[0].eval())
+      self.assertAllEqual(expected_lookups, input_layer.eval())
+
+  def test_input_layer_not_trainable(self):
+    # Inputs.
+    vocabulary_size = 3
+    sparse_input = sparse_tensor.SparseTensorValue(
+        # example 0, ids [2]
+        # example 1, ids [0, 1]
+        # example 2, ids []
+        # example 3, ids [1]
+        indices=((0, 0), (1, 0), (1, 4), (3, 0)),
+        values=(2, 0, 1, 1),
+        dense_shape=(4, 5))
+
+    # Embedding variable.
+    embedding_dimension = 2
+    embedding_values = (
+        (1., 2.),  # id 0
+        (3., 5.),  # id 1
+        (7., 11.)  # id 2
+    )
+    def _initializer(shape, dtype, partition_info):
+      self.assertAllEqual((vocabulary_size, embedding_dimension), shape)
+      self.assertEqual(dtypes.float32, dtype)
+      self.assertIsNone(partition_info)
+      return embedding_values
+
+    # Expected lookup result, using combiner='mean'.
+    expected_lookups = (
+        # example 0, ids [2], embedding = [7, 11]
+        (7., 11.),
+        # example 1, ids [0, 1], embedding = mean([1, 2] + [3, 5]) = [2, 3.5]
+        (2., 3.5),
+        # example 2, ids [], embedding = [0, 0]
+        (0., 0.),
+        # example 3, ids [1], embedding = [3, 5]
+        (3., 5.),
+    )
+
+    # Build columns.
+    categorical_column = fc.categorical_column_with_identity(
+        key='aaa', num_buckets=vocabulary_size)
+    embedding_column = fc.embedding_column(
+        categorical_column, dimension=embedding_dimension,
+        initializer=_initializer, trainable=False)
+
+    # Provide sparse input and get dense result.
+    input_layer = fc.input_layer({'aaa': sparse_input}, (embedding_column,))
+
+    # Assert expected embedding variable and lookups.
+    global_vars = ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)
+    self.assertItemsEqual(
+        ('input_layer/aaa_embedding/embedding_weights:0',),
+        tuple([v.name for v in global_vars]))
+    self.assertItemsEqual(
+        [], ops.get_collection(ops.GraphKeys.TRAINABLE_VARIABLES))
+    with _initialized_session():
+      self.assertAllEqual(embedding_values, global_vars[0].eval())
+      self.assertAllEqual(expected_lookups, input_layer.eval())
+
+
+class WeightedCategoricalColumnTest(test.TestCase):
+
+  def test_defaults(self):
+    column = fc.weighted_categorical_column(
+        categorical_column=fc.categorical_column_with_identity(
+            key='ids', num_buckets=3),
+        weight_feature_key='values')
+    self.assertEqual('ids_weighted_by_values', column.name)
+    self.assertEqual(3, column._num_buckets)
+    self.assertEqual({
+        'ids': parsing_ops.VarLenFeature(dtypes.int64),
+        'values': parsing_ops.VarLenFeature(dtypes.float32)
+    }, column._parse_example_spec)
+
+  def test_deep_copy(self):
+    """Tests deepcopy of categorical_column_with_hash_bucket."""
+    original = fc.weighted_categorical_column(
+        categorical_column=fc.categorical_column_with_identity(
+            key='ids', num_buckets=3),
+        weight_feature_key='values')
+    for column in (original, copy.deepcopy(original)):
+      self.assertEqual('ids_weighted_by_values', column.name)
+      self.assertEqual(3, column._num_buckets)
+      self.assertEqual({
+          'ids': parsing_ops.VarLenFeature(dtypes.int64),
+          'values': parsing_ops.VarLenFeature(dtypes.float32)
+      }, column._parse_example_spec)
+
+  def test_invalid_dtype_none(self):
+    with self.assertRaisesRegexp(ValueError, 'is not convertible to float'):
+      fc.weighted_categorical_column(
+          categorical_column=fc.categorical_column_with_identity(
+              key='ids', num_buckets=3),
+          weight_feature_key='values',
+          dtype=None)
+
+  def test_invalid_dtype_string(self):
+    with self.assertRaisesRegexp(ValueError, 'is not convertible to float'):
+      fc.weighted_categorical_column(
+          categorical_column=fc.categorical_column_with_identity(
+              key='ids', num_buckets=3),
+          weight_feature_key='values',
+          dtype=dtypes.string)
+
+  def test_invalid_input_dtype(self):
+    column = fc.weighted_categorical_column(
+        categorical_column=fc.categorical_column_with_identity(
+            key='ids', num_buckets=3),
+        weight_feature_key='values')
+    strings = sparse_tensor.SparseTensorValue(
+        indices=((0, 0), (1, 0), (1, 1)),
+        values=('omar', 'stringer', 'marlo'),
+        dense_shape=(2, 2))
+    with self.assertRaisesRegexp(ValueError, 'Bad dtype'):
+      _transform_features({'ids': strings, 'values': strings}, (column,))
+
+  def test_column_name_collision(self):
+    with self.assertRaisesRegexp(ValueError, r'Parse config.*already exists'):
+      fc.weighted_categorical_column(
+          categorical_column=fc.categorical_column_with_identity(
+              key='aaa', num_buckets=3),
+          weight_feature_key='aaa')._parse_example_spec()
+
+  def test_missing_weights(self):
+    column = fc.weighted_categorical_column(
+        categorical_column=fc.categorical_column_with_identity(
+            key='ids', num_buckets=3),
+        weight_feature_key='values')
+    inputs = sparse_tensor.SparseTensorValue(
+        indices=((0, 0), (1, 0), (1, 1)),
+        values=('omar', 'stringer', 'marlo'),
+        dense_shape=(2, 2))
+    with self.assertRaisesRegexp(
+        ValueError, 'values is not in features dictionary'):
+      _transform_features({'ids': inputs}, (column,))
+
+  def test_parse_example(self):
+    a = fc.categorical_column_with_vocabulary_list(
+        key='aaa', vocabulary_list=('omar', 'stringer', 'marlo'))
+    a_weighted = fc.weighted_categorical_column(a, weight_feature_key='weights')
+    data = example_pb2.Example(features=feature_pb2.Features(
+        feature={
+            'aaa':
+                feature_pb2.Feature(bytes_list=feature_pb2.BytesList(
+                    value=[b'omar', b'stringer'])),
+            'weights':
+                feature_pb2.Feature(float_list=feature_pb2.FloatList(
+                    value=[1., 10.]))
+        }))
+    features = parsing_ops.parse_example(
+        serialized=[data.SerializeToString()],
+        features=fc.make_parse_example_spec([a_weighted]))
+    self.assertIn('aaa', features)
+    self.assertIn('weights', features)
+    with self.test_session():
+      _assert_sparse_tensor_value(
+          self,
+          sparse_tensor.SparseTensorValue(
+              indices=[[0, 0], [0, 1]],
+              values=np.array([b'omar', b'stringer'], dtype=np.object_),
+              dense_shape=[1, 2]),
+          features['aaa'].eval())
+      _assert_sparse_tensor_value(
+          self,
+          sparse_tensor.SparseTensorValue(
+              indices=[[0, 0], [0, 1]],
+              values=np.array([1., 10.], dtype=np.float32),
+              dense_shape=[1, 2]),
+          features['weights'].eval())
+
+  def test_transform_features(self):
+    column = fc.weighted_categorical_column(
+        categorical_column=fc.categorical_column_with_identity(
+            key='ids', num_buckets=3),
+        weight_feature_key='values')
+    inputs = sparse_tensor.SparseTensorValue(
+        indices=((0, 0), (1, 0), (1, 1)),
+        values=(0, 1, 0),
+        dense_shape=(2, 2))
+    weights = sparse_tensor.SparseTensorValue(
+        indices=((0, 0), (1, 0), (1, 1)),
+        values=(0.5, 1.0, 0.1),
+        dense_shape=(2, 2))
+    id_tensor, weight_tensor = _transform_features({
+        'ids': inputs,
+        'values': weights,
+    }, (column,))[column]
+    with _initialized_session():
+      _assert_sparse_tensor_value(
+          self,
+          sparse_tensor.SparseTensorValue(
+              indices=inputs.indices,
+              values=np.array(inputs.values, dtype=np.int64),
+              dense_shape=inputs.dense_shape),
+          id_tensor.eval())
+      _assert_sparse_tensor_value(
+          self,
+          sparse_tensor.SparseTensorValue(
+              indices=weights.indices,
+              values=np.array(weights.values, dtype=np.float32),
+              dense_shape=weights.dense_shape),
+          weight_tensor.eval())
+
+  def test_transform_features_dense_input(self):
+    column = fc.weighted_categorical_column(
+        categorical_column=fc.categorical_column_with_identity(
+            key='ids', num_buckets=3),
+        weight_feature_key='values')
+    weights = sparse_tensor.SparseTensorValue(
+        indices=((0, 0), (1, 0), (1, 1)),
+        values=(0.5, 1.0, 0.1),
+        dense_shape=(2, 2))
+    id_tensor, weight_tensor = _transform_features({
+        'ids': ((0, -1), (1, 0)),
+        'values': weights,
+    }, (column,))[column]
+    with _initialized_session():
+      _assert_sparse_tensor_value(
+          self,
+          sparse_tensor.SparseTensorValue(
+              indices=((0, 0), (1, 0), (1, 1)),
+              values=np.array((0, 1, 0), dtype=np.int64),
+              dense_shape=(2, 2)),
+          id_tensor.eval())
+      _assert_sparse_tensor_value(
+          self,
+          sparse_tensor.SparseTensorValue(
+              indices=weights.indices,
+              values=np.array(weights.values, dtype=np.float32),
+              dense_shape=weights.dense_shape),
+          weight_tensor.eval())
+
+  def test_transform_features_dense_weights(self):
+    column = fc.weighted_categorical_column(
+        categorical_column=fc.categorical_column_with_identity(
+            key='ids', num_buckets=3),
+        weight_feature_key='values')
+    inputs = sparse_tensor.SparseTensorValue(
+        indices=((0, 0), (1, 0), (1, 1)),
+        values=(2, 1, 0),
+        dense_shape=(2, 2))
+    id_tensor, weight_tensor = _transform_features({
+        'ids': inputs,
+        'values': ((.5, 0.), (1., .1)),
+    }, (column,))[column]
+    with _initialized_session():
+      _assert_sparse_tensor_value(
+          self,
+          sparse_tensor.SparseTensorValue(
+              indices=inputs.indices,
+              values=np.array(inputs.values, dtype=np.int64),
+              dense_shape=inputs.dense_shape),
+          id_tensor.eval())
+      _assert_sparse_tensor_value(
+          self,
+          sparse_tensor.SparseTensorValue(
+              indices=((0, 0), (1, 0), (1, 1)),
+              values=np.array((.5, 1., .1), dtype=np.float32),
+              dense_shape=(2, 2)),
+          weight_tensor.eval())
+
+  def test_linear_model(self):
+    column = fc.weighted_categorical_column(
+        categorical_column=fc.categorical_column_with_identity(
+            key='ids', num_buckets=3),
+        weight_feature_key='values')
+    with ops.Graph().as_default():
+      predictions = fc.linear_model({
+          'ids': sparse_tensor.SparseTensorValue(
+              indices=((0, 0), (1, 0), (1, 1)),
+              values=(0, 2, 1),
+              dense_shape=(2, 2)),
+          'values': sparse_tensor.SparseTensorValue(
+              indices=((0, 0), (1, 0), (1, 1)),
+              values=(.5, 1., .1),
+              dense_shape=(2, 2))
+      }, (column,))
+      bias = get_linear_model_bias()
+      weight_var = get_linear_model_column_var(column)
+      with _initialized_session():
+        self.assertAllClose((0.,), bias.eval())
+        self.assertAllClose(((0.,), (0.,), (0.,)), weight_var.eval())
+        self.assertAllClose(((0.,), (0.,)), predictions.eval())
+        weight_var.assign(((1.,), (2.,), (3.,))).eval()
+        # weight_var[0] * weights[0, 0] = 1 * .5 = .5
+        # weight_var[2] * weights[1, 0] + weight_var[1] * weights[1, 1]
+        # = 3*1 + 2*.1 = 3+.2 = 3.2
+        self.assertAllClose(((.5,), (3.2,)), predictions.eval())
+
+  def test_linear_model_mismatched_shape(self):
+    column = fc.weighted_categorical_column(
+        categorical_column=fc.categorical_column_with_identity(
+            key='ids', num_buckets=3),
+        weight_feature_key='values')
+    with ops.Graph().as_default():
+      with self.assertRaisesRegexp(
+          ValueError, r'Dimensions.*are not compatible'):
+        fc.linear_model({
+            'ids': sparse_tensor.SparseTensorValue(
+                indices=((0, 0), (1, 0), (1, 1)),
+                values=(0, 2, 1),
+                dense_shape=(2, 2)),
+            'values': sparse_tensor.SparseTensorValue(
+                indices=((0, 0), (0, 1), (1, 0), (1, 1)),
+                values=(.5, 11., 1., .1),
+                dense_shape=(2, 2))
+        }, (column,))
+
+  def test_linear_model_mismatched_dense_values(self):
+    column = fc.weighted_categorical_column(
+        categorical_column=fc.categorical_column_with_identity(
+            key='ids', num_buckets=3),
+        weight_feature_key='values')
+    with ops.Graph().as_default():
+      predictions = fc.linear_model({
+          'ids': sparse_tensor.SparseTensorValue(
+              indices=((0, 0), (1, 0), (1, 1)),
+              values=(0, 2, 1),
+              dense_shape=(2, 2)),
+          'values': ((.5,), (1.,))
+      }, (column,))
+      with _initialized_session():
+        with self.assertRaisesRegexp(errors.OpError, 'Incompatible shapes'):
+          predictions.eval()
+
+  def test_linear_model_mismatched_dense_shape(self):
+    column = fc.weighted_categorical_column(
+        categorical_column=fc.categorical_column_with_identity(
+            key='ids', num_buckets=3),
+        weight_feature_key='values')
+    with ops.Graph().as_default():
+      predictions = fc.linear_model({
+          'ids': sparse_tensor.SparseTensorValue(
+              indices=((0, 0), (1, 0), (1, 1)),
+              values=(0, 2, 1),
+              dense_shape=(2, 2)),
+          'values': ((.5,), (1.,), (.1,))
+      }, (column,))
+      bias = get_linear_model_bias()
+      weight_var = get_linear_model_column_var(column)
+      with _initialized_session():
+        self.assertAllClose((0.,), bias.eval())
+        self.assertAllClose(((0.,), (0.,), (0.,)), weight_var.eval())
+        self.assertAllClose(((0.,), (0.,)), predictions.eval())
+        weight_var.assign(((1.,), (2.,), (3.,))).eval()
+        # weight_var[0] * weights[0, 0] = 1 * .5 = .5
+        # weight_var[2] * weights[1, 0] + weight_var[1] * weights[1, 1]
+        # = 3*1 + 2*.1 = 3+.2 = 3.2
+        self.assertAllClose(((.5,), (3.2,)), predictions.eval())
+
+  # TODO(ptucker): Add test with embedding of weighted categorical.
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/python/feature_column/testdata/embedding.ckpt.data-00000-of-00001 b/tensorflow/python/feature_column/testdata/embedding.ckpt.data-00000-of-00001
new file mode 100644
index 0000000000000000000000000000000000000000..5cc36d86d60d4a76b1cf005fe207e0d8af0f3f06
Binary files /dev/null and b/tensorflow/python/feature_column/testdata/embedding.ckpt.data-00000-of-00001 differ
diff --git a/tensorflow/python/feature_column/testdata/embedding.ckpt.index b/tensorflow/python/feature_column/testdata/embedding.ckpt.index
new file mode 100644
index 0000000000000000000000000000000000000000..c1f35a8fcfffed90eb44b3d784f998cafb59d3aa
Binary files /dev/null and b/tensorflow/python/feature_column/testdata/embedding.ckpt.index differ
diff --git a/tensorflow/python/feature_column/testdata/embedding.ckpt.meta b/tensorflow/python/feature_column/testdata/embedding.ckpt.meta
new file mode 100644
index 0000000000000000000000000000000000000000..65bc3f2becb000010273d8e9835e7e39d553f5c7
Binary files /dev/null and b/tensorflow/python/feature_column/testdata/embedding.ckpt.meta differ
diff --git a/tensorflow/python/feature_column/testdata/warriors_vocabulary.txt b/tensorflow/python/feature_column/testdata/warriors_vocabulary.txt
new file mode 100644
index 0000000000000000000000000000000000000000..6c917fa699903c367734f220953f0c97a39bc9ef
--- /dev/null
+++ b/tensorflow/python/feature_column/testdata/warriors_vocabulary.txt
@@ -0,0 +1,5 @@
+30
+35
+11
+23
+22
diff --git a/tensorflow/python/feature_column/testdata/wire_vocabulary.txt b/tensorflow/python/feature_column/testdata/wire_vocabulary.txt
new file mode 100644
index 0000000000000000000000000000000000000000..32c6b5692a0d4c8b2935cd7b32f3a5396857ee3d
--- /dev/null
+++ b/tensorflow/python/feature_column/testdata/wire_vocabulary.txt
@@ -0,0 +1,3 @@
+omar
+stringer
+marlo
diff --git a/tensorflow/python/framework/dtypes.py b/tensorflow/python/framework/dtypes.py
index d373bac47a2a44e64ce989c7d5150cd42fea219a..3e6c04982b4b1c1ca219cfd1bc1a1954e2b520a1 100644
--- a/tensorflow/python/framework/dtypes.py
+++ b/tensorflow/python/framework/dtypes.py
@@ -270,6 +270,9 @@ class DType(object):
     """Returns the string name for this `DType`."""
     return _TYPE_TO_STRING[self._type_enum]
 
+  def __int__(self):
+    return self._type_enum
+
   def __str__(self):
     return "<dtype: %r>" % self.name
 
diff --git a/tensorflow/python/framework/dtypes_test.py b/tensorflow/python/framework/dtypes_test.py
index f04f67ffedd3663e44d836fd3faf046798f00509..5bb60763b6e30d23c622b1a281f62e3577c77692 100644
--- a/tensorflow/python/framework/dtypes_test.py
+++ b/tensorflow/python/framework/dtypes_test.py
@@ -45,8 +45,8 @@ class TypesTest(test_util.TensorFlowTestCase):
     for datatype_enum in types_pb2.DataType.values():
       if datatype_enum == types_pb2.DT_INVALID:
         continue
-      self.assertEqual(datatype_enum,
-                       dtypes.as_dtype(datatype_enum).as_datatype_enum)
+      dt = dtypes.as_dtype(datatype_enum)
+      self.assertEqual(datatype_enum, dt.as_datatype_enum)
 
   def testAllTypesConvertibleToNumpyDtype(self):
     for datatype_enum in types_pb2.DataType.values():
diff --git a/tensorflow/python/framework/function.py b/tensorflow/python/framework/function.py
index 2a1389b91ffaeaf2fd19ee46ea704b0dcf383928..ac8aee2c83dcb99b93d3a7b2d527591e7f0b20b4 100644
--- a/tensorflow/python/framework/function.py
+++ b/tensorflow/python/framework/function.py
@@ -113,18 +113,19 @@ def _add_op_node(op, func, input_dict):
   node_def = func.node_def[-1]
   for i in range(len(node_def.input)):
     if not node_def.input[i].startswith("^"):
-      assert node_def.input[i] in input_dict, (
-          "%s missing from %s" % (node_def.input[i], input_dict.items()))
+      assert node_def.input[i] in input_dict, ("%s missing from %s" %
+                                               (node_def.input[i],
+                                                input_dict.items()))
       node_def.input[i] = input_dict[node_def.input[i]]
 
 
-def _graph_to_function_def(graph, inputs, outputs, out_names=None):
+def _graph_to_function_def(graph, operations, inputs, outputs, out_names=None):
   """Returns `graph` as a `FunctionDef` protocol buffer.
 
   This method creates a [`FunctionDef`](
   https://www.tensorflow.org/code/tensorflow/core/framework/function.proto)
-  protocol buffer that contains all the ops present in the graph.  The
-  graph effectively becomes the body of the function.
+  protocol buffer that contains all the ops in `operations`.  The
+  operations become the body of the function.
 
   The arguments `inputs` and `outputs` will be listed as the inputs
   and outputs tensors of the function.  They must be lists of
@@ -132,6 +133,8 @@ def _graph_to_function_def(graph, inputs, outputs, out_names=None):
 
   Args:
     graph: Graph.
+    operations: the operations to put in the function. Must be a subset of
+     the operations in the graph.
     inputs: List of tensors. Inputs to the function.
     outputs: List of tensors. Outputs of the function.
     out_names: Optional list of string names for the outputs.
@@ -145,12 +148,12 @@ def _graph_to_function_def(graph, inputs, outputs, out_names=None):
   func = function_pb2.FunctionDef()
   func.signature.name = "_"
   used_names = set()
-  func.signature.input_arg.extend([_tensor_to_argdef(i, used_names=used_names)
-                                   for i in inputs])
+  func.signature.input_arg.extend(
+      [_tensor_to_argdef(i, used_names=used_names) for i in inputs])
   if out_names is None:
     used_names = set()
-    func.signature.output_arg.extend([
-        _tensor_to_argdef(o, used_names=used_names) for o in outputs])
+    func.signature.output_arg.extend(
+        [_tensor_to_argdef(o, used_names=used_names) for o in outputs])
   elif len(outputs) != len(out_names):
     raise ValueError(
         "Length of out_names (%d) does not match number of outputs (%d): %s" %
@@ -159,12 +162,12 @@ def _graph_to_function_def(graph, inputs, outputs, out_names=None):
     raise ValueError(
         "Must not have duplicates in out_names: %s" % ", ".join(out_names))
   else:
-    func.signature.output_arg.extend([
-        _tensor_to_argdef(o, name=n) for o, n in zip(outputs, out_names)])
+    func.signature.output_arg.extend(
+        [_tensor_to_argdef(o, name=n) for o, n in zip(outputs, out_names)])
   func_arg_placeholders = set([i.name for i in inputs])
   input_dict = _create_input_dict(graph, func_arg_placeholders)
 
-  for op in graph.get_operations():
+  for op in operations:
     if _is_in_placeholders(op, func_arg_placeholders):
       continue
     _add_op_node(op, func, input_dict)
@@ -295,17 +298,18 @@ class _FuncGraph(ops.Graph):
     self.extra_args = []
     self.extra_vars = []
 
-  def getvar(self,
-             getter,
-             name,
-             shape=None,
-             dtype=None,
-             initializer=None,
-             reuse=None,
-             trainable=True,
-             collections=None,  # pylint: disable=redefined-outer-name
-             use_resource=None,
-             **kwargs):
+  def getvar(
+      self,
+      getter,
+      name,
+      shape=None,
+      dtype=None,
+      initializer=None,
+      reuse=None,
+      trainable=True,
+      collections=None,  # pylint: disable=redefined-outer-name
+      use_resource=None,
+      **kwargs):
     """A custom variable getter."""
     # Here, we switch the default graph to the outer graph and ask the
     # variable scope in which the function is defined to give us the
@@ -538,20 +542,23 @@ class _DefinedFunction(object):
 
     # Build the FunctionDef
     self._definition = _graph_to_function_def(
-        temp_graph, inputs, outputs, out_names=self._out_names)
+        temp_graph,
+        temp_graph.get_operations(),
+        inputs,
+        outputs,
+        out_names=self._out_names)
 
     # Extra kwargs are treated as attrs on the function def.
     sig_pre_func_name = self._func_name or _get_func_name(self._func)
-    kwargs_attr = _parse_kwargs_as_attrs(
-        sig_pre_func_name, **self._extra_kwargs)
+    kwargs_attr = _parse_kwargs_as_attrs(sig_pre_func_name,
+                                         **self._extra_kwargs)
     for k in kwargs_attr:
       self._definition.attr[k].CopyFrom(kwargs_attr[k])
 
     # Hash the definition and its dependencies.
     self._hash_str = self._create_hash_str(
         self._definition.signature.input_arg,
-        self._definition.signature.output_arg,
-        self._definition.node_def)
+        self._definition.signature.output_arg, self._definition.node_def)
 
     # Finally, we decide the function name to use.  If not specified,
     # make up something which is almost certainly unique (but deterministic).
@@ -658,8 +665,8 @@ def _from_definition(fdef, grad_func=None):
   # have access to such a callable here).
   func = None
   argnames = [arg.name for arg in fdef.signature.input_arg]
-  input_types = tuple(dtypes.as_dtype(arg.type)
-                      for arg in fdef.signature.input_arg)
+  input_types = tuple(
+      dtypes.as_dtype(arg.type) for arg in fdef.signature.input_arg)
   func_name = fdef.signature.name
   # Note: FunctionDefs do not include python gradient functions, so if the
   # original _DefinedFunction included one it will not be reflected here.
@@ -675,8 +682,7 @@ def _from_definition(fdef, grad_func=None):
   result._extra_inputs = []
   result._hash_str = result._create_hash_str(
       result._definition.signature.input_arg,
-      result._definition.signature.output_arg,
-      result._definition.node_def)
+      result._definition.signature.output_arg, result._definition.node_def)
   # pylint: enable=protected-access
   return result
 
@@ -696,7 +702,8 @@ def _from_library(lib):
   Raises:
     ValueError: `lib` is invalid
   """
-  if not lib.function and not lib.gradient: return []
+  if not lib.function and not lib.gradient:
+    return []
 
   # function name -> FunctionDef proto
   funcs = {fdef.signature.name: fdef for fdef in lib.function}
@@ -720,8 +727,9 @@ def _from_library(lib):
     grad_to_funcs[gdef.gradient_func].append(gdef.function_name)
 
   # Start with functions without gradients
-  ready = [fdef for fdef in lib.function
-           if func_to_grad[fdef.signature.name] is None]
+  ready = [
+      fdef for fdef in lib.function if func_to_grad[fdef.signature.name] is None
+  ]
   if not ready:
     raise ValueError("FunctionDefLibrary contains cyclic gradient functions!\n"
                      + str(lib))
@@ -733,7 +741,8 @@ def _from_library(lib):
     name = fdef.signature.name
 
     grad = initialized.get(func_to_grad[name])
-    if func_to_grad[name]: assert grad
+    if func_to_grad[name]:
+      assert grad
     defined_func = _from_definition(fdef, grad_func=grad)
     initialized[name] = defined_func
 
@@ -835,10 +844,15 @@ class _OverloadedFunction(object):
       name = self._func_name
       if name is not None:
         name = "_".join([name, key])
-      defined = _DefinedFunction(self._func, self._argnames, input_types, name,
-                                 None, self._python_grad_func,
-                                 out_names=self._out_names,
-                                 **self._extra_kwargs)
+      defined = _DefinedFunction(
+          self._func,
+          self._argnames,
+          input_types,
+          name,
+          None,
+          self._python_grad_func,
+          out_names=self._out_names,
+          **self._extra_kwargs)
       _ = defined.name  # Fully instantiate the function definition.
       if self._grad_func:
         # If _grad_func is given, it is another
@@ -849,8 +863,8 @@ class _OverloadedFunction(object):
             for _ in defined.definition.signature.output_arg
         ]
         # pylint: disable=protected-access
-        defined._grad_func = self._grad_func.instantiate(input_types +
-                                                         output_types)
+        defined._grad_func = self._grad_func.instantiate(
+            input_types + output_types)
         # pylint: enable=protected-access
       self._overload[key] = defined
     return defined
@@ -981,22 +995,36 @@ class Defun(object):
         raise ValueError(
             "The function has fewer arguments than the number of specified "
             "input types.")
-      return _DefinedFunction(func, argnames, self._input_types,
-                              self._func_name, self._grad_func,
-                              self._python_grad_func,
-                              out_names=self._out_names, **self._extra_kwargs)
+      return _DefinedFunction(
+          func,
+          argnames,
+          self._input_types,
+          self._func_name,
+          self._grad_func,
+          self._python_grad_func,
+          out_names=self._out_names,
+          **self._extra_kwargs)
 
     # 'func' expects no arguments and input types is an empty list.
     if min_args == 0 and max_args == 0:
-      return _DefinedFunction(func, [], [], self._func_name, self._grad_func,
-                              self._python_grad_func,
-                              out_names=self._out_names, **self._extra_kwargs)
+      return _DefinedFunction(
+          func, [], [],
+          self._func_name,
+          self._grad_func,
+          self._python_grad_func,
+          out_names=self._out_names,
+          **self._extra_kwargs)
 
     # Input types are unknown. It's an overloaded function and hence
     # its definition needs to be deferred until it's called.
-    return _OverloadedFunction(func, argnames, self._func_name, self._grad_func,
-                               self._python_grad_func,
-                               out_names=self._out_names, **self._extra_kwargs)
+    return _OverloadedFunction(
+        func,
+        argnames,
+        self._func_name,
+        self._grad_func,
+        self._python_grad_func,
+        out_names=self._out_names,
+        **self._extra_kwargs)
 
 
 class Declare(object):
@@ -1039,8 +1067,10 @@ class Declare(object):
       names = [n for n, t in args]
       if len(names) != len(set(names)):
         raise ValueError("Expected names to all be unique: %s" % str(names))
-      return [op_def_pb2.OpDef.ArgDef(type=t.as_datatype_enum, name=n)
-              for n, t in args]
+      return [
+          op_def_pb2.OpDef.ArgDef(type=t.as_datatype_enum, name=n)
+          for n, t in args
+      ]
 
     self._sig.input_arg.extend(_to_argdef_list(inputs))
     self._sig.output_arg.extend(_to_argdef_list(outputs))
diff --git a/tensorflow/python/framework/function_test.py b/tensorflow/python/framework/function_test.py
index 39f00e52169a7e23d65fb96a767a519a69900acf..416ab263afc685aef50fd22a8cdff6c754f94d8b 100644
--- a/tensorflow/python/framework/function_test.py
+++ b/tensorflow/python/framework/function_test.py
@@ -324,6 +324,48 @@ class FunctionTest(test.TestCase):
                                    "assertion"):
         _ = MyFn(100.0).eval()
 
+  def testControlFlowStrictness(self):
+    """Inlined functions must not execute in a untaken control flow branch."""
+
+    @function.Defun(dtypes.int32)
+    def AssertFail(x):
+      # Assertion that always fails and does not have a data dependency on `x`.
+      assert_false = control_flow_ops.Assert(False, [42])
+      with ops.control_dependencies([assert_false]):
+        return array_ops.identity(x)
+
+    with ops.device("CPU"):
+      pred = array_ops.placeholder(dtypes.bool)
+      x = array_ops.placeholder(dtypes.int32)
+      cond = control_flow_ops.cond(pred, lambda: x + 1, lambda: AssertFail(x))
+      # pylint: disable=unnecessary-lambda
+      loop = control_flow_ops.while_loop(lambda y: pred,
+                                         lambda y: AssertFail(y), [x])
+      # pylint: enable=unnecessary-lambda
+
+    # Enables inlining.
+    config = config_pb2.ConfigProto(graph_options=config_pb2.GraphOptions(
+        optimizer_options=config_pb2.OptimizerOptions(
+            opt_level=config_pb2.OptimizerOptions.L0,
+            do_common_subexpression_elimination=True,
+            do_function_inlining=True,
+            do_constant_folding=True)))
+
+    with session.Session(config=config) as sess:
+      # Since the 'False' branch is not taken, the assertion should not fire.
+      self.assertEqual(4, sess.run(cond, {pred: True, x: 3}))
+
+      # The assertion should still fire if the False branch is taken.
+      with self.assertRaisesRegexp(errors_impl.InvalidArgumentError,
+                                   "assertion"):
+        sess.run(cond, {pred: False, x: 3})
+
+      # Similarly for loops.
+      self.assertEqual(3, sess.run(loop, {pred: False, x: 3}))
+      with self.assertRaisesRegexp(errors_impl.InvalidArgumentError,
+                                   "assertion"):
+        sess.run(loop, {pred: True, x: 3})
+
   def testVar(self):
 
     @function.Defun(dtypes.float32)
diff --git a/tensorflow/python/framework/importer.py b/tensorflow/python/framework/importer.py
index fcddd9546d94e0a3e5aa76daef5819b8143eebec..ed579224d32562be6ec64c51c49bce2862f928be 100644
--- a/tensorflow/python/framework/importer.py
+++ b/tensorflow/python/framework/importer.py
@@ -275,6 +275,9 @@ def import_graph_def(graph_def, input_map=None, return_elements=None,
 
     # 1. Add operations without their inputs.
     for node in graph_def.node:
+      # Check to see if this op's name matches a previously seen op
+      if node.name in name_to_op:
+        raise ValueError('Duplicate name \'%s\' in GraphDef.' % node.name)
       # Set any default attr values that aren't present.
       if node.op not in op_dict:
         raise ValueError('No op named %s in defined operations.' % node.op)
diff --git a/tensorflow/python/framework/importer_test.py b/tensorflow/python/framework/importer_test.py
index c4ccc3d1892c4991648be0d03af1191f95c94096..2b2398f83329b373f9d6363be036ae99597c76a2 100644
--- a/tensorflow/python/framework/importer_test.py
+++ b/tensorflow/python/framework/importer_test.py
@@ -685,6 +685,17 @@ class ImportGraphDefTest(test.TestCase):
       self.assertEqual("return_elements must be a list of strings.",
                        str(e.exception))
 
+  def testDuplicateOperationNames(self):
+    with ops.Graph().as_default():
+      with self.assertRaises(ValueError) as e:
+        importer.import_graph_def(
+            self._MakeGraphDef("""
+            node { name: 'A' op: 'Oi' }
+            node { name: 'B' op: 'Oi' }
+            node { name: 'A' op: 'Oi' }
+            """))
+      self.assertEqual("Duplicate name 'A' in GraphDef.", str(e.exception))
+
   def testWithExtensionAndAttr(self):
     with ops.Graph().as_default() as g:
       c = constant_op.constant(5.0, dtype=dtypes.float32, name="c")
diff --git a/tensorflow/python/framework/meta_graph.py b/tensorflow/python/framework/meta_graph.py
index 26344d38528c0f937ca36b6bbeceae478cf2f30c..783612c942fc299dc120382ffe7740258ea008a1 100644
--- a/tensorflow/python/framework/meta_graph.py
+++ b/tensorflow/python/framework/meta_graph.py
@@ -422,14 +422,15 @@ def import_scoped_meta_graph(meta_graph_or_file,
                              graph=None,
                              import_scope=None,
                              input_map=None,
-                             unbound_inputs_col_name="unbound_inputs"):
-  """Recreates a`Graph` saved in a `MetaGraphDef` proto.
+                             unbound_inputs_col_name="unbound_inputs",
+                             restore_collections_predicate=(lambda key: True)):
+  """Recreates a `Graph` saved in a `MetaGraphDef` proto.
 
   This function takes a `MetaGraphDef` protocol buffer as input. If
   the argument is a file containing a `MetaGraphDef` protocol buffer ,
   it constructs a protocol buffer from the file content. The function
   then adds all the nodes from the `graph_def` field to the
-  current graph, recreates all the collections, and returns a saver
+  current graph, recreates the desired collections, and returns a saver
   constructed from the `saver_def` field.
 
   In combination with `export_scoped_meta_graph()`, this function can be used to
@@ -453,6 +454,10 @@ def import_scoped_meta_graph(meta_graph_or_file,
       `Tensor` objects. The values of the named input tensors in the imported
       graph will be re-mapped to the respective `Tensor` values.
     unbound_inputs_col_name: Collection name for looking up unbound inputs.
+    restore_collections_predicate: a predicate on collection names. A collection
+      named c (i.e whose key is c) will be restored iff
+      1) `restore_collections_predicate(c)` is True, and
+      2) `c != unbound_inputs_col_name`.
 
   Returns:
     A dictionary of all the `Variables` imported into the name scope.
@@ -498,11 +503,16 @@ def import_scoped_meta_graph(meta_graph_or_file,
         input_graph_def, name=(import_scope or ""), input_map=input_map,
         producer_op_list=producer_op_list)
 
+    scope_to_prepend_to_names = "/".join(
+        [part for part in [graph.get_name_scope(), import_scope] if part])
+
     # Restores all the other collections.
     for key, col_def in meta_graph_def.collection_def.items():
       # Don't add unbound_inputs to the new graph.
       if key == unbound_inputs_col_name:
         continue
+      if not restore_collections_predicate(key):
+        continue
 
       kind = col_def.WhichOneof("kind")
       if kind is None:
@@ -517,13 +527,13 @@ def import_scoped_meta_graph(meta_graph_or_file,
           proto = proto_type()
           proto.ParseFromString(value)
           graph.add_to_collection(
-              key, from_proto(proto, import_scope=import_scope))
+              key, from_proto(proto, import_scope=scope_to_prepend_to_names))
       else:
         field = getattr(col_def, kind)
         if kind == "node_list":
           for value in field.value:
             col_op = graph.as_graph_element(
-                ops.prepend_name_scope(value, import_scope))
+                ops.prepend_name_scope(value, scope_to_prepend_to_names))
             graph.add_to_collection(key, col_op)
         elif kind == "int64_list":
           # NOTE(opensource): This force conversion is to work around the fact
@@ -534,13 +544,13 @@ def import_scoped_meta_graph(meta_graph_or_file,
         else:
           for value in field.value:
             graph.add_to_collection(
-                key, ops.prepend_name_scope(value, import_scope))
+                key, ops.prepend_name_scope(value, scope_to_prepend_to_names))
 
     var_list = {}
     variables = graph.get_collection(ops.GraphKeys.GLOBAL_VARIABLES,
-                                     scope=import_scope)
+                                     scope=scope_to_prepend_to_names)
     for v in variables:
-      var_list[ops.strip_name_scope(v.name, import_scope)] = v
+      var_list[ops.strip_name_scope(v.name, scope_to_prepend_to_names)] = v
 
   return var_list
 
diff --git a/tensorflow/python/framework/meta_graph_test.py b/tensorflow/python/framework/meta_graph_test.py
index f8056ade3e4f45b82cceed5a3e5e47d14b20e7e3..10236576eafe296f0a20235688778a2e6c7a4495 100644
--- a/tensorflow/python/framework/meta_graph_test.py
+++ b/tensorflow/python/framework/meta_graph_test.py
@@ -335,6 +335,81 @@ class ScopedMetaGraphTest(test.TestCase):
     for a, b in zip(orig_meta_graphs, new_meta_graphs):
       test_util.assert_meta_graph_protos_equal(self, a, b)
 
+  def testScopedImportUnderNameScope(self):
+    graph = ops.Graph()
+    with graph.as_default():
+      variables.Variable(initial_value=1.0, trainable=True, name="myvar")
+    meta_graph_def, _ = meta_graph.export_scoped_meta_graph(graph=graph)
+
+    graph = ops.Graph()
+    with graph.as_default():
+      with ops.name_scope("foo"):
+        imported_variables = meta_graph.import_scoped_meta_graph(
+            meta_graph_def, import_scope="bar")
+        self.assertEqual(len(imported_variables), 1)
+        self.assertEqual(list(imported_variables.values())[0].name,
+                         "foo/bar/myvar:0")
+
+  def testScopedImportWithSelectedCollections(self):
+    meta_graph_filename = os.path.join(
+        _TestDir("selected_collections_import"), "meta_graph.pb")
+
+    graph = ops.Graph()
+    # Add a variable to populate two collections. The functionality tested is
+    # not specific to variables, but using variables in the test is convenient.
+    with graph.as_default():
+      variables.Variable(initial_value=1.0, trainable=True)
+    self.assertTrue(
+        all([
+            graph.get_collection(key)
+            for key in
+            [ops.GraphKeys.GLOBAL_VARIABLES, ops.GraphKeys.TRAINABLE_VARIABLES]
+        ]))
+    meta_graph.export_scoped_meta_graph(
+        filename=meta_graph_filename, graph=graph)
+
+    def _test_import(include_collection_keys, omit_collection_keys):
+      assert set(include_collection_keys).isdisjoint(omit_collection_keys)
+      newgraph = ops.Graph()
+      import_scope = "some_scope_name"
+
+      def _restore_collections_predicate(collection_key):
+        return (collection_key in include_collection_keys and
+                collection_key not in omit_collection_keys)
+
+      meta_graph.import_scoped_meta_graph(
+          meta_graph_filename,
+          graph=newgraph,
+          import_scope=import_scope,
+          restore_collections_predicate=_restore_collections_predicate)
+      collection_values = [
+          newgraph.get_collection(name=key, scope=import_scope)
+          for key in include_collection_keys
+      ]
+      self.assertTrue(all(collection_values))
+      collection_values = [
+          newgraph.get_collection(name=key, scope=import_scope)
+          for key in omit_collection_keys
+      ]
+      self.assertFalse(any(collection_values))
+
+    _test_import(
+        include_collection_keys=[
+            ops.GraphKeys.GLOBAL_VARIABLES, ops.GraphKeys.TRAINABLE_VARIABLES
+        ],
+        omit_collection_keys=[])
+    _test_import(
+        include_collection_keys=[ops.GraphKeys.GLOBAL_VARIABLES],
+        omit_collection_keys=[ops.GraphKeys.TRAINABLE_VARIABLES])
+    _test_import(
+        include_collection_keys=[ops.GraphKeys.TRAINABLE_VARIABLES],
+        omit_collection_keys=[ops.GraphKeys.GLOBAL_VARIABLES])
+    _test_import(
+        include_collection_keys=[],
+        omit_collection_keys=[
+            ops.GraphKeys.GLOBAL_VARIABLES, ops.GraphKeys.TRAINABLE_VARIABLES
+        ])
+
   def _testScopedExportWithQueue(self, test_dir, exported_filename):
     graph = ops.Graph()
     with graph.as_default():
diff --git a/tensorflow/python/framework/op_def_library.py b/tensorflow/python/framework/op_def_library.py
index 2c39f5b0e37223a14e47dd23a58490e6075a6695..662c2c679c8113cbba3ba4bbfcd6c587fb0fbc2c 100644
--- a/tensorflow/python/framework/op_def_library.py
+++ b/tensorflow/python/framework/op_def_library.py
@@ -328,7 +328,7 @@ class OpDefLibrary(object):
       # Need to flatten all the arguments into a list.
       # pylint: disable=protected-access
       g = ops._get_graph_from_inputs(_Flatten(keywords.values()))
-      # pyline: enable=protected-access
+      # pylint: enable=protected-access
     except AssertionError as e:
       raise RuntimeError(
           "Cannot determine graph for Op '%s' due to: %s"
diff --git a/tensorflow/python/framework/ops.py b/tensorflow/python/framework/ops.py
index 410c27b621f04555099ea1d9f54411b448683176..c481726429ed2c567f0eef6212350e87ea2a529e 100644
--- a/tensorflow/python/framework/ops.py
+++ b/tensorflow/python/framework/ops.py
@@ -34,8 +34,10 @@ from tensorflow.core.framework import node_def_pb2
 from tensorflow.core.framework import tensor_shape_pb2
 from tensorflow.core.framework import types_pb2
 from tensorflow.core.framework import versions_pb2
+from tensorflow.python import pywrap_tensorflow as c_api
 from tensorflow.python.framework import device as pydev
 from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import errors
 from tensorflow.python.framework import op_def_registry
 from tensorflow.python.framework import registry
 from tensorflow.python.framework import tensor_shape
@@ -46,6 +48,24 @@ from tensorflow.python.util import decorator_utils
 from tensorflow.python.util import tf_contextlib
 
 
+# Temporary global switch determining if we should enable the work-in-progress
+# calls to the C API. Currently disabled by default but can be manually enabled
+# e.g. in tests. This will be removed once all functionality is supported and
+# there's no performance penalty with it enabled.
+#
+# TODO(skyewm) before we can remove this:
+# - functions
+# - import_graph_def() incrementally adds inputs to ops (i.e. creates an
+#   Operation and then calls _add_input()). The current code requires that all
+#   inputs be specified when creating the Operation (since we call
+#   TF_FinishOperation()).
+# - ops_test.py (and others?) create unregistered op types
+# - while loop
+# - performance (e.g. delete/refactor redundant Python functionality, switch to
+#   new session API)
+_USE_C_API = False
+
+
 def _override_helper(clazz_object, operator, func):
   """Overrides (string) operator on Tensors to call func.
 
@@ -70,25 +90,33 @@ def _override_helper(clazz_object, operator, func):
   setattr(clazz_object, operator, func)
 
 
-def _convert_stack(stack):
+def _convert_stack(stack, include_func_start_lineno=False):
   """Converts a stack extracted using _extract_stack() to a traceback stack.
 
   Args:
-    stack: A list of n 4-tuples, (filename, lineno, name, frame_globals).
+    stack: A list of n 5-tuples,
+      (filename, lineno, name, frame_globals, func_start_lineno).
+    include_func_start_lineno: True if function start line number should be
+      included as the 5th entry in return tuples.
 
   Returns:
-    A list of n 4-tuples (filename, lineno, name, code), where the code tuple
-    element is calculated from the corresponding elements of the input tuple.
+    A list of n 4-tuples or 5-tuples
+    (filename, lineno, name, code, [optional: func_start_lineno]), where the
+    code tuple element is calculated from the corresponding elements of the
+    input tuple.
   """
   ret = []
-  for filename, lineno, name, frame_globals in stack:
+  for filename, lineno, name, frame_globals, func_start_lineno in stack:
     linecache.checkcache(filename)
     line = linecache.getline(filename, lineno, frame_globals)
     if line:
       line = line.strip()
     else:
       line = None
-    ret.append((filename, lineno, name, line))
+    if include_func_start_lineno:
+      ret.append((filename, lineno, name, line, func_start_lineno))
+    else:
+      ret.append((filename, lineno, name, line))
   return ret
 
 
@@ -103,7 +131,8 @@ def _extract_stack():
     be formatted etc. using traceback methods.
 
   Returns:
-    A list of 4-tuples (filename, lineno, name, frame_globals) corresponding to
+    A list of 5-tuples
+    (filename, lineno, name, frame_globals, func_start_lineno) corresponding to
     the call stack of the current thread.
   """
   # pylint: enable=line-too-long
@@ -118,7 +147,8 @@ def _extract_stack():
     filename = co.co_filename
     name = co.co_name
     frame_globals = f.f_globals
-    ret.append((filename, lineno, name, frame_globals))
+    func_start_lineno = co.co_firstlineno
+    ret.append((filename, lineno, name, frame_globals, func_start_lineno))
     f = f.f_back
   ret.reverse()
   return ret
@@ -457,6 +487,13 @@ class Tensor(_TensorLike):
     else:
       return "%s:%d" % (self._op.name, self._value_index)
 
+  def _as_tf_output(self):
+    assert self.op._c_op  # pylint: disable=protected-access
+    tf_output = c_api.TF_Output()
+    tf_output.oper = self.op._c_op  # pylint: disable=protected-access
+    tf_output.index = self.value_index
+    return tf_output
+
   def __str__(self):
     return "Tensor(\"%s\"%s%s%s)" % (
         self.name,
@@ -1242,6 +1279,104 @@ class Operation(object):
     self._id_value = self._graph._next_id()  # pylint: disable=protected-access
     self._recompute_node_def()
 
+    if _USE_C_API:
+      assert self._graph._c_graph, (  # pylint: disable=protected-access
+          "_USE_C_API set to False when creating Graph, you may need to "
+          "manually set 'ops._USE_C_API = True' before creating the Graph")
+      if self._op_def:
+        # TODO(skyewm): op_def_library.apply_op() flattens the incoming
+        # inputs. Refactor so we don't have to do this here.
+        grouped_inputs = self._reconstruct_sequence_inputs(
+            self._op_def, self._inputs, self._node_def.attr)
+      else:
+        # If no OpDef is specified, assume all inputs are scalar.
+        grouped_inputs = self._inputs
+
+      self._c_op = self._create_c_op(self._graph, self._node_def,
+                                     grouped_inputs, self._control_inputs)
+    else:
+      self._c_op = None
+
+  def _create_c_op(self, graph, node_def, inputs, control_inputs):
+    """Creates a TF_Operation.
+
+    Arguments:
+      graph: a `Graph`.
+      node_def: `node_def_pb2.NodeDef` for the operation to create.
+      inputs: A list of `Tensor`s (corresponding to scalar inputs) and lists of
+        `Tensor`s (corresponding to sequence inputs, e.g. "int64 * N",
+        "list(int64)"). The length of the list should be equal to the number of
+        inputs specified by this operation's op def.
+      control_inputs: A list of `Operation`s to set as control dependencies.
+
+    Returns:
+      A wrapped TF_Operation*.
+    """
+    # pylint: disable=protected-access
+    op_desc = c_api.TF_NewOperation(graph._c_graph.g,
+                                    compat.as_str(node_def.op),
+                                    compat.as_str(node_def.name))
+    # Add inputs
+    for op_input in inputs:
+      if isinstance(op_input, (list, tuple)):
+        c_api.TF_AddInputList(op_desc, [t._as_tf_output() for t in op_input])
+      else:
+        c_api.TF_AddInput(op_desc, op_input._as_tf_output())
+
+    # Add control inputs
+    for control_input in control_inputs:
+      c_api.TF_AddControlInput(op_desc, control_input._c_op)
+    # pylint: enable=protected-access
+
+    # Add attrs
+    for name, attr_value in node_def.attr.items():
+      serialized = attr_value.SerializeToString()
+      # TODO(skyewm): this creates and deletes a new TF_Status for every attr.
+      # It might be worth creating a convenient way to re-use the same status.
+      with errors.raise_exception_on_not_ok_status() as status:
+        c_api.TF_SetAttrValueProto(op_desc, compat.as_str(name), serialized,
+                                   status)
+
+    with errors.raise_exception_on_not_ok_status() as status:
+      c_op = c_api.TF_FinishOperation(op_desc, status)
+
+    return c_op
+
+  def _reconstruct_sequence_inputs(self, op_def, inputs, attrs):
+    """Regroups a flat list of input tensors into scalar and sequence inputs.
+
+    Arguments:
+      op_def: The `op_def_pb2.OpDef` (for knowing the input types)
+      inputs: a list of input `Tensor`s to the op.
+      attrs: mapping from attr name to `attr_value_pb2.AttrValue` (these define
+        how long each sequence is)
+
+    Returns:
+      A list of `Tensor`s (corresponding to scalar inputs) and lists of
+      `Tensor`s (corresponding to sequence inputs).
+    """
+    grouped_inputs = []
+    i = 0
+    for input_arg in op_def.input_arg:
+      if input_arg.number_attr:
+        input_len = attrs[input_arg.number_attr].i
+        is_sequence = True
+      elif input_arg.type_list_attr:
+        input_len = len(attrs[input_arg.type_list_attr].list.type)
+        is_sequence = True
+      else:
+        input_len = 1
+        is_sequence = False
+
+      if is_sequence:
+        grouped_inputs.append(inputs[i:i + input_len])
+      else:
+        grouped_inputs.append(inputs[i])
+      i += input_len
+
+    assert i == len(inputs)
+    return grouped_inputs
+
   def colocation_groups(self):
     """Returns the list of colocation groups of the op."""
     default_colocation_group = [compat.as_bytes("loc:@%s" %
@@ -1505,6 +1640,15 @@ class Operation(object):
     """Returns the call stack from when this operation was constructed."""
     return _convert_stack(self._traceback)
 
+  @property
+  def traceback_with_start_lines(self):
+    """Same as traceback but includes start line of function definition.
+
+    Returns:
+      A list of 5-tuples (filename, lineno, name, code, func_start_lineno).
+    """
+    return _convert_stack(self._traceback, include_func_start_lineno=True)
+
   def get_attr(self, name):
     """Returns the value of the attr of this op with the given `name`.
 
@@ -1528,12 +1672,18 @@ class Operation(object):
     if x.HasField("list"):
       for f in fields:
         if getattr(x.list, f):
-          return list(getattr(x.list, f))
+          if f == "type":
+            return [dtypes.as_dtype(x) for x in list(getattr(x.list, f))]
+          else:
+            return list(getattr(x.list, f))
       return []
     else:
       for f in fields:
         if x.HasField(f):
-          return getattr(x, f)
+          if f == "type":
+            return dtypes.as_dtype(getattr(x, f))
+          else:
+            return getattr(x, f)
       assert False, "Unsupported field type in " + str(x)
 
   def run(self, feed_dict=None, session=None):
@@ -1892,6 +2042,15 @@ def _name_from_scope_name(name):
   return name[:-1] if name[-1] == "/" else name
 
 
+class _ScopedTF_Graph(object):
+
+  def __init__(self):
+    self.g = c_api.TF_NewGraph()
+
+  def __del__(self):
+    c_api.TF_DeleteGraph(self.g)
+
+
 class Graph(object):
   """A TensorFlow computation, represented as a dataflow graph.
 
@@ -2005,6 +2164,13 @@ class Graph(object):
     self._container = ""
     self._registered_ops = op_def_registry.get_registered_ops()
 
+    # TODO(skyewm): fold as much of the above as possible into the C
+    # implementation
+    if _USE_C_API:
+      self._c_graph = _ScopedTF_Graph()
+    else:
+      self._c_graph = None
+
   def _check_not_finalized(self):
     """Check if the graph is finalized.
 
@@ -3981,9 +4147,13 @@ class GraphKeys(object):
     for more details.
   * `REGULARIZATION_LOSSES`: regularization losses collected during graph
     construction.
-  * `WEIGHTS`: weights inside neural network layers
-  * `BIASES`: biases inside neural network layers
-  * `ACTIVATIONS`: activations of neural network layers
+
+  The following standard keys are _defined_, but their collections are **not**
+  automatically populated as many of the others are:
+
+  * `WEIGHTS`
+  * `BIASES`
+  * `ACTIVATIONS`
   """
 
   # Key to collect Variable objects that are global (shared across machines).
@@ -4203,10 +4373,15 @@ def strip_name_scope(name, export_scope):
     is None.
   """
   if export_scope:
-    # Strips export_scope/, export_scope///,
-    # ^export_scope/, loc:@export_scope/.
-    str_to_replace = r"([\^]|loc:@|^)" + export_scope + r"[\/]+(.*)"
-    return re.sub(str_to_replace, r"\1\2", compat.as_str(name), count=1)
+    try:
+      # Strips export_scope/, export_scope///,
+      # ^export_scope/, loc:@export_scope/.
+      str_to_replace = r"([\^]|loc:@|^)" + export_scope + r"[\/]+(.*)"
+      return re.sub(str_to_replace, r"\1\2", compat.as_str(name), count=1)
+    except TypeError as e:
+      # If the name is not of a type we can process, simply return it.
+      logging.warning(e)
+      return name
   else:
     return name
 
@@ -4223,9 +4398,14 @@ def prepend_name_scope(name, import_scope):
     is None.
   """
   if import_scope:
-    str_to_replace = r"([\^]|loc:@|^)(.*)"
-    return re.sub(str_to_replace, r"\1" + import_scope + r"/\2",
-                  compat.as_str(name))
+    try:
+      str_to_replace = r"([\^]|loc:@|^)(.*)"
+      return re.sub(str_to_replace, r"\1" + import_scope + r"/\2",
+                    compat.as_str(name))
+    except TypeError as e:
+      # If the name is not of a type we can process, simply return it.
+      logging.warning(e)
+      return name
   else:
     return name
 
diff --git a/tensorflow/python/framework/ops_test.py b/tensorflow/python/framework/ops_test.py
index 2cff66dfb7453851fdb8430a4b40fd5ca1742177..32d9d52d00cf5e250dd3eac66bd53910ff19290a 100644
--- a/tensorflow/python/framework/ops_test.py
+++ b/tensorflow/python/framework/ops_test.py
@@ -18,7 +18,13 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import gc
+import weakref
+
 from tensorflow.core.framework import attr_value_pb2
+from tensorflow.core.framework import types_pb2
+from tensorflow.core.protobuf import config_pb2
+from tensorflow.python.client import session
 from tensorflow.python.framework import common_shapes
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import device as pydev
@@ -31,6 +37,7 @@ from tensorflow.python.framework import test_ops
 from tensorflow.python.framework import test_ops_2
 from tensorflow.python.framework import test_util
 from tensorflow.python.framework import versions
+from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import resources
@@ -351,6 +358,32 @@ class OperationTest(test_util.TensorFlowTestCase):
         ops._NodeDef("noop", "op1"), ops.Graph(), [], [dtypes.float32])
     self.assertEqual("<tf.Operation 'op1' type=noop>", repr(op))
 
+  def testGetAttr(self):
+    list_value = attr_value_pb2.AttrValue.ListValue()
+    list_value.type.append(types_pb2.DT_STRING)
+    list_value.type.append(types_pb2.DT_DOUBLE)
+    op = ops.Operation(
+        ops._NodeDef(
+            "noop",
+            "op1",
+            attrs={
+                "value": attr_value_pb2.AttrValue(i=32),
+                "dtype": attr_value_pb2.AttrValue(type=types_pb2.DT_INT32),
+                "list": attr_value_pb2.AttrValue(list=list_value)
+            }), ops.Graph(), [], [dtypes.int32])
+    self.assertEqual(32, op.get_attr("value"))
+
+    d = op.get_attr("dtype")
+    # First check that d is a DType, because the assertEquals will
+    # work no matter what since DType overrides __eq__
+    self.assertIsInstance(d, dtypes.DType)
+    self.assertEqual(dtypes.int32, d)
+
+    l = op.get_attr("list")
+    for x in l:
+      self.assertIsInstance(x, dtypes.DType)
+    self.assertEqual([dtypes.string, dtypes.double], l)
+
 
 class CreateOpTest(test_util.TensorFlowTestCase):
 
@@ -1021,18 +1054,28 @@ class ComparisonTest(test_util.TensorFlowTestCase):
 class ControlDependenciesTest(test_util.TensorFlowTestCase):
 
   def testBasic(self):
-    g = ops.Graph()
-    a = _apply_op(g, "const", [], [dtypes.float32])
-    b = _apply_op(g, "const", [], [dtypes.float32])
-    with g.control_dependencies([a]):
-      c = _apply_op(g, "const", [], [dtypes.float32])
-      d = _apply_op(g, "identity", [b], [dtypes.float32])
-      e = _apply_op(g, "identity", [c], [dtypes.float32])
-
-    self.assertEqual(c.op.control_inputs, [a.op])
-    self.assertEqual(d.op.control_inputs, [a.op])
-    # e should be dominated by c.
-    self.assertEqual(e.op.control_inputs, [])
+    ops._USE_C_API = True
+    try:
+      g = ops.Graph()
+      with g.as_default():
+        # Creating unregistered ops with _apply_op() doesn't work with the C API
+        # TODO(skyewm): address this more consistently. Possible solutions are
+        # to use registered ops in all tests, create a way to register ops in
+        # Python tests, or conditionally disable the op registration check in
+        # the C API.
+        a = constant_op.constant(1.0)
+        b = constant_op.constant(1.0)
+        with g.control_dependencies([a]):
+          c = constant_op.constant(1.0)
+          d = array_ops.identity(b)
+          e = array_ops.identity(c)
+
+      self.assertEqual(c.op.control_inputs, [a.op])
+      self.assertEqual(d.op.control_inputs, [a.op])
+      # e should be dominated by c.
+      self.assertEqual(e.op.control_inputs, [])
+    finally:
+      ops._USE_C_API = False
 
   def testBasicWithConversion(self):
     g = ops.Graph()
@@ -1298,6 +1341,32 @@ class GraphTest(test_util.TensorFlowTestCase):
     with self.assertRaises(TypeError):
       g.as_graph_element(NonConvertibleObj())
 
+  # Regression test against creating custom __del__ functions in classes
+  # involved in cyclic references, e.g. Graph and Operation. (Python won't gc
+  # cycles that require calling a __del__ method, because the __del__ method can
+  # theoretically increase the object's refcount to "save" it from gc, and any
+  # already-deleted objects in the cycle would have be to restored.)
+  def testGarbageCollected(self):
+    # Create a graph we can delete and a weak reference to monitor if it's gc'd
+    g = ops.Graph()
+    g_ref = weakref.ref(g)
+    # Create some ops
+    with g.as_default():
+      a = constant_op.constant(2.0)
+      b = constant_op.constant(3.0)
+      c = math_ops.add(a, b)
+    # Create a session we can delete
+    with session.Session(graph=g) as sess:
+      sess.run(c)
+    # Delete all references and trigger gc
+    del g
+    del a
+    del b
+    del c
+    del sess
+    gc.collect()
+    self.assertIsNone(g_ref())
+
 
 class AttrScopeTest(test_util.TensorFlowTestCase):
 
@@ -1673,5 +1742,26 @@ class NameScopeTest(test_util.TensorFlowTestCase):
       self.assertEqual("", g.get_name_scope())
 
 
+class TracebackTest(test_util.TensorFlowTestCase):
+
+  def testTracebackWithStartLines(self):
+    with self.test_session() as sess:
+      a = constant_op.constant(2.0)
+      sess.run(
+          a,
+          options=config_pb2.RunOptions(
+              trace_level=config_pb2.RunOptions.FULL_TRACE))
+      self.assertTrue(sess.graph.get_operations())
+
+      # Tests that traceback_with_start_lines is the same as traceback
+      # but includes one more element at the end.
+      for op in sess.graph.get_operations():
+        self.assertEquals(len(op.traceback), len(op.traceback_with_start_lines))
+        for frame, frame_with_start_line in zip(
+            op.traceback, op.traceback_with_start_lines):
+          self.assertEquals(5, len(frame_with_start_line))
+          self.assertEquals(frame, frame_with_start_line[:-1])
+
+
 if __name__ == "__main__":
   googletest.main()
diff --git a/tensorflow/python/framework/python_op_gen.cc b/tensorflow/python/framework/python_op_gen.cc
index 64be2c70a11df09b96a5818445a440bcb2fd5286..a3168a008834bd9144b21e852e04d42bf3afcd90 100644
--- a/tensorflow/python/framework/python_op_gen.cc
+++ b/tensorflow/python/framework/python_op_gen.cc
@@ -21,8 +21,11 @@ limitations under the License.
 #include "tensorflow/core/framework/attr_value.pb.h"
 #include "tensorflow/core/framework/op.h"
 #include "tensorflow/core/framework/op_def.pb.h"
+#include "tensorflow/core/framework/op_def.pb_text.h"
 #include "tensorflow/core/framework/op_def_util.h"
 #include "tensorflow/core/framework/op_gen_lib.h"
+#include "tensorflow/core/framework/tensor.pb.h"
+#include "tensorflow/core/framework/tensor.pb_text.h"
 #include "tensorflow/core/framework/types.h"
 #include "tensorflow/core/framework/types.pb.h"
 #include "tensorflow/core/lib/gtl/map_util.h"
@@ -66,7 +69,8 @@ bool IsPythonReserved(const string& s) {
        "ZeroDivisionError", "__debug__", "__doc__", "__import__", "__name__",
        "__package__",
        // Imports and symbols used in the generated code:
-       "_op_def_lib", "text_format", "op_def_pb2", "op_def_library", "ops"});
+       "_text_format", "_op_def_pb2", "_common_shapes", "_op_def_registry",
+       "_ops", "_op_def_library"});
 
   return kPythonReserved->count(s) > 0;
 }
@@ -175,13 +179,12 @@ string ArgTypeName(const OpDef& op_def, const OpDef::ArgDef& arg,
         prefix = "A list of";
       }
     } else {
-      prefix = strings::StrCat(
-          "A list with the same number of `Tensor` objects as `",
-          AvoidPythonReserved(*original_arg), "` of");
+      prefix = strings::StrCat("A list with the same length as `",
+                               AvoidPythonReserved(*original_arg), "` of");
     }
 
     if (arg.type() != DT_INVALID) {
-      return strings::StrCat(prefix, " `Tensor` objects of type ",
+      return strings::StrCat(prefix, " `Tensor` objects with type ",
                              TypeString(arg.type(), arg.is_ref()), ".");
     } else {
       original_arg = gtl::FindOrNull(inferred_attrs, arg.type_attr());
@@ -189,20 +192,22 @@ string ArgTypeName(const OpDef& op_def, const OpDef::ArgDef& arg,
         strings::StrAppend(&prefix, " mutable");
       }
       if (original_arg == nullptr) {
-        return strings::StrCat(prefix, " `Tensor` objects of type ",
-                               arg.type_attr(), ".");
+        return strings::StrCat(prefix, " `Tensor` objects with type `",
+                               arg.type_attr(), "`.");
       } else if (*original_arg == arg.name()) {
         const OpDef::AttrDef* attr = FindAttr(arg.type_attr(), op_def);
         if (attr->has_allowed_values()) {
           return strings::StrCat(prefix,
-                                 " `Tensor` objects of the same type in: ",
+                                 " `Tensor` objects with the same type in: ",
                                  TypeListString(attr->allowed_values()), ".");
         } else {
-          return strings::StrCat(prefix, " `Tensor` objects of the same type.");
+          return strings::StrCat(prefix,
+                                 " `Tensor` objects with the same type.");
         }
       } else {
-        return strings::StrCat(prefix, " `Tensor` objects of the same type as ",
-                               AvoidPythonReserved(*original_arg), ".");
+        return strings::StrCat(prefix,
+                               " `Tensor` objects with the same type as `",
+                               AvoidPythonReserved(*original_arg), "`.");
       }
     }
   } else if (!arg.type_attr().empty() || !arg.type_list_attr().empty()) {
@@ -241,19 +246,19 @@ string ArgTypeName(const OpDef& op_def, const OpDef::ArgDef& arg,
   }
 }
 
-static string GetReturns(const OpDef& op_def,
-                         const std::vector<string>& output_type_string) {
+string GetReturns(const OpDef& op_def,
+                  const std::vector<string>& output_type_string) {
   string result;
   DCHECK_EQ(op_def.output_arg_size(), output_type_string.size());
   const int num_outs = op_def.output_arg_size();
-  strings::Appendf(&result, "\n  Returns:\n");
+  strings::StrAppend(&result, "\n  Returns:\n");
   if (num_outs == 0) {
-    strings::Appendf(&result, "    The created Operation.\n");
+    strings::StrAppend(&result, "    The created Operation.\n");
   } else {
     if (num_outs == 1) {
       StringPiece description = op_def.output_arg(0).description();
       if (ConsumeEquals(&description)) {  // Skip the generated type info.
-        strings::Appendf(&result, "%s", Indent(4, 4, description).c_str());
+        strings::StrAppend(&result, Indent(4, 4, description));
       } else {
         // Special case of one output, don't use the name of the output unless
         // there is no description.
@@ -272,7 +277,7 @@ static string GetReturns(const OpDef& op_def,
         } else if (!description.empty()) {
           AppendWithinWidth(&desc, description, kRightMargin - 4 /* indent */);
         }
-        strings::Appendf(&result, "%s", Indent(4, 4, desc).c_str());
+        strings::StrAppend(&result, Indent(4, 4, desc));
       }
     } else {
       std::vector<string> out_names(num_outs);
@@ -283,8 +288,8 @@ static string GetReturns(const OpDef& op_def,
           out_names[i] = strings::StrCat("output", i);
         }
       }
-      strings::Appendf(&result, "    A tuple of `Tensor` objects (%s).\n",
-                       str_util::Join(out_names, ", ").c_str());
+      strings::StrAppend(&result, "    A tuple of `Tensor` objects (",
+                         str_util::Join(out_names, ", "), ").\n\n");
       for (int i = 0; i < num_outs; ++i) {
         string desc = strings::StrCat(out_names[i], ": ");
         StringPiece description = op_def.output_arg(i).description();
@@ -307,7 +312,7 @@ static string GetReturns(const OpDef& op_def,
             strings::StrAppend(&desc, type);
           }
         }
-        strings::Appendf(&result, "%s", Indent(4, 6, desc).c_str());
+        strings::StrAppend(&result, Indent(4, 6, desc));
       }
     }
   }
@@ -337,6 +342,10 @@ string ShapeToPython(const TensorShapeProto& shape) {
   return python;
 }
 
+string TensorToPython(const TensorProto& proto) {
+  return ProtoShortDebugString(proto);
+}
+
 string AttrListToPython(const AttrValue& value) {
   string ret;
   if (value.list().s_size() > 0) {
@@ -369,6 +378,16 @@ string AttrListToPython(const AttrValue& value) {
       if (i > 0) strings::StrAppend(&ret, ", ");
       strings::StrAppend(&ret, ShapeToPython(value.list().shape(i)));
     }
+  } else if (value.list().tensor_size() > 0) {
+    for (int i = 0; i < value.list().tensor_size(); ++i) {
+      if (i > 0) strings::StrAppend(&ret, ", ");
+      strings::StrAppend(&ret, TensorToPython(value.list().tensor(i)));
+    }
+  } else if (value.list().func_size() > 0) {
+    for (int i = 0; i < value.list().func_size(); ++i) {
+      if (i > 0) strings::StrAppend(&ret, ", ");
+      strings::StrAppend(&ret, StringToPython(value.list().func(i).name()));
+    }
   }
   return ret;
 }
@@ -386,12 +405,36 @@ string AttrValueToPython(const string& type, const AttrValue& value) {
     return DataTypeToPython(value.type());
   } else if (type == "shape") {
     return ShapeToPython(value.shape());
-  } else {
+  } else if (type == "tensor") {
+    return TensorToPython(value.tensor());
+  } else if (type == "func") {
+    return StringToPython(value.func().name());
+  } else if (StringPiece(type).starts_with("list(")) {
     return strings::StrCat("[", AttrListToPython(value), "]");
+  } else {
+    return "?";
   }
 }
 
-static string GetPythonOp(const OpDef& op_def, bool is_hidden, string op_name) {
+void GenerateLowerCaseOpName(const string& str, string* result) {
+  const char joiner = '_';
+  const int last_index = str.size() - 1;
+  for (int i = 0; i <= last_index; ++i) {
+    const char c = str[i];
+    // Emit a joiner only if a previous-lower-to-now-upper or a
+    // now-upper-to-next-lower transition happens.
+    if (isupper(c) && (i > 0)) {
+      if (islower(str[i - 1]) || ((i < last_index) && islower(str[i + 1]))) {
+        result->push_back(joiner);
+      }
+    }
+    result->push_back(tolower(c));
+  }
+}
+
+}  // namespace
+
+string GetPythonOp(const OpDef& op_def, bool is_hidden, const string& op_name) {
   string result;
   // Map from attr name to the first input arg it is inferred from.
   std::unordered_map<string, string> inferred_attrs;
@@ -399,7 +442,7 @@ static string GetPythonOp(const OpDef& op_def, bool is_hidden, string op_name) {
   // defaults.
   std::vector<string> args_no_default;
   // The parameters with defaults (these have to be listed after those without).
-  // No input args are included, just attrs and the graph ("g") parameter.
+  // No input args are included, just attrs.
   std::vector<string> args_with_defaults;
   for (int i = 0; i < op_def.input_arg_size(); ++i) {
     const auto& arg(op_def.input_arg(i));
@@ -430,8 +473,7 @@ static string GetPythonOp(const OpDef& op_def, bool is_hidden, string op_name) {
   // those with defaults go at the end.
   std::vector<string> attrs;
   // Get the attrs in the order we want by taking the attrs without defaults
-  // from the end of args_no_default, and adding args_no_default (before
-  // "g" gets added to args_no_default, so it only has attrs).
+  // from the end of args_no_default, and adding args_no_default.
   attrs.reserve(args_no_default.size() - op_def.input_arg_size() +
                 args_with_defaults.size());
   attrs.insert(attrs.end(), args_no_default.begin() + op_def.input_arg_size(),
@@ -454,51 +496,51 @@ static string GetPythonOp(const OpDef& op_def, bool is_hidden, string op_name) {
     strings::StrAppend(&parameters, param, "=None");
     param_names.push_back(param);
   }
-  const bool has_args = args_no_default.size() + args_with_defaults.size() > 0;
 
   const string lower_op_name = strings::StrCat(is_hidden ? "_" : "", op_name);
 
-  // Prepare the list of output names
   const int num_outs = op_def.output_arg_size();
-  std::vector<string> out_names(num_outs);
-  for (int i = 0; i < num_outs; ++i) {
-    if (!op_def.output_arg(i).name().empty()) {
-      out_names[i] = op_def.output_arg(i).name();
-    } else {
-      out_names[i] = strings::StrCat("output", i);
-    }
-  }
-  string out_names_list =
-      strings::StrCat("[\"", str_util::Join(out_names, "\", \""), "\"]");
-
-  // Provide the output names as a Python list
-  string lower_op_name_outputs =
-      strings::StrCat("_", lower_op_name, "_outputs");
-  const string outputs_prefix = strings::StrCat(lower_op_name_outputs, " = ");
-  strings::Appendf(
-      &result, "%s\n",
-      WordWrap(outputs_prefix, out_names_list, kRightMargin).c_str());
-  strings::Appendf(&result, "\n\n");
-
   // Prepare a NamedTuple type to hold the outputs, if there are multiple
   if (num_outs > 1) {
-    const string tuple_type_prefix = strings::StrCat(
-        "_", op_def.name(), "Output = _collections.namedtuple(");
+    // Prepare the list of output names
+    std::vector<string> out_names(num_outs);
+    for (int i = 0; i < num_outs; ++i) {
+      if (!op_def.output_arg(i).name().empty()) {
+        out_names[i] = op_def.output_arg(i).name();
+      } else {
+        out_names[i] = strings::StrCat("output", i);
+      }
+    }
+    string out_names_list =
+        strings::StrCat("[\"", str_util::Join(out_names, "\", \""), "\"]");
+
+    // Provide the output names as a Python list
+    string lower_op_name_outputs =
+        strings::StrCat("_", lower_op_name, "_outputs");
+    const string outputs_prefix = strings::StrCat(lower_op_name_outputs, " = ");
+    strings::StrAppend(&result, "\n",
+                       WordWrap(outputs_prefix, out_names_list, kRightMargin),
+                       "\n");
+
+    strings::StrAppend(&result, "_", op_def.name(),
+                       "Output = _collections.namedtuple(\n");
+    const string tuple_type_prefix = "    ";
     const string tuple_type_suffix = strings::StrCat(
         "\"", op_def.name(), "\", ", lower_op_name_outputs, ")");
-    strings::Appendf(
-        &result, "%s\n",
-        WordWrap(tuple_type_prefix, tuple_type_suffix, kRightMargin).c_str());
-    strings::Appendf(&result, "\n\n");
+    strings::StrAppend(
+        &result, WordWrap(tuple_type_prefix, tuple_type_suffix, kRightMargin),
+        "\n\n");
   }
+  strings::StrAppend(&result, "\n");
 
   // Print: def Function(parameters):
   const string def_prefix = strings::StrCat("def ", lower_op_name, "(");
+  const bool has_args = args_no_default.size() + args_with_defaults.size() > 0;
   const string def_suffix =
       strings::StrCat(parameters, has_args ? ", " : "", "name=None):");
 
-  strings::Appendf(&result, "%s\n",
-                   WordWrap(def_prefix, def_suffix, kRightMargin).c_str());
+  strings::StrAppend(&result, WordWrap(def_prefix, def_suffix, kRightMargin),
+                     "\n");
 
   // Format the Op's descriptions so that it can be a Python docstring.
   string comment;
@@ -511,7 +553,7 @@ static string GetPythonOp(const OpDef& op_def, bool is_hidden, string op_name) {
     }
   }
 
-  strings::Appendf(&result, "  r\"\"\"%s\n  Args:\n", comment.c_str());
+  strings::StrAppend(&result, "  r\"\"\"", comment, "\n  Args:\n");
 
   // Inputs
   for (int i = 0; i < op_def.input_arg_size(); ++i) {
@@ -527,7 +569,7 @@ static string GetPythonOp(const OpDef& op_def, bool is_hidden, string op_name) {
     if (!description.empty()) {
       AppendWithinWidth(&desc, description, kRightMargin - 4 /* indent */);
     }
-    strings::Appendf(&result, "%s", Indent(4, 6, desc).c_str());
+    strings::StrAppend(&result, Indent(4, 6, desc));
   }
 
   // Attrs
@@ -549,6 +591,10 @@ static string GetPythonOp(const OpDef& op_def, bool is_hidden, string op_name) {
         {"shape", "`tf.TensorShape` or list of `ints`"},
         {"list(shape)",
          "list of shapes (each a `tf.TensorShape` or list of `ints`)"},
+        {"tensor", "`tf.TensorProto`"},
+        {"list(tensor)", "list of `tf.TensorProto` objects"},
+        {"func", "function decorated with @Defun"},
+        {"list(func)", "list of functions decorated with @Defun"},
     };
     for (size_t i = 0; i < TF_ARRAYSIZE(kAttrTypeName); ++i) {
       if (attr.type() == kAttrTypeName[i][0]) {
@@ -592,14 +638,15 @@ static string GetPythonOp(const OpDef& op_def, bool is_hidden, string op_name) {
       AppendWithinWidth(&desc, attr.description(),
                         kRightMargin - 4 /* indent */);
     }
-    strings::Appendf(&result, "%s", Indent(4, 6, desc).c_str());
+    strings::StrAppend(&result, Indent(4, 6, desc));
   }
 
-  strings::Appendf(&result, "    name: A name for the operation (optional).\n");
+  strings::StrAppend(&result,
+                     "    name: A name for the operation (optional).\n");
 
   std::vector<string> output_type_string;
-  output_type_string.reserve(op_def.output_arg_size());
-  for (int i = 0; i < op_def.output_arg_size(); ++i) {
+  output_type_string.reserve(num_outs);
+  for (int i = 0; i < num_outs; ++i) {
     output_type_string.push_back(
         ArgTypeName(op_def, op_def.output_arg(i), inferred_attrs, true));
   }
@@ -612,46 +659,27 @@ static string GetPythonOp(const OpDef& op_def, bool is_hidden, string op_name) {
   }
   strings::StrAppend(&return_args, "name=name)");
 
-  strings::Appendf(&result, "  \"\"\"\n%s\n",
-                   // Wrap the arguments, and indent to the (.
-                   WordWrap(return_prefix, return_args, kRightMargin).c_str());
+  strings::StrAppend(&result, "  \"\"\"\n",
+                     // Wrap the arguments, and indent to the (.
+                     WordWrap(return_prefix, return_args, kRightMargin), "\n");
 
   if (num_outs <= 1) {
-    strings::Appendf(&result, "  return result\n");
+    strings::StrAppend(&result, "  return result\n");
   } else {
-    string return_tuple =
-        strings::StrCat("  return _", op_def.name(), "Output._make(result)\n");
-    strings::Appendf(&result, "%s", return_tuple.c_str());
+    strings::StrAppend(&result, "  return _", op_def.name(),
+                       "Output._make(result)\n");
   }
+  strings::StrAppend(&result, "\n\n");
 
-  strings::Appendf(&result, "\n\n");
   return result;
 }
 
-void GenerateLowerCaseOpName(const string& str, string* result) {
-  char joiner = '_';
-  int last_index = str.size() - 1;
-  for (int i = 0; i <= last_index; ++i) {
-    char c = str[i];
-    // Emit a joiner only if a previous-lower-to-now-upper or a
-    // now-upper-to-next-lower transition happens.
-    if (isupper(c) && (i > 0)) {
-      if (islower(str[i - 1]) || ((i < last_index) && islower(str[i + 1]))) {
-        result->push_back(joiner);
-      }
-    }
-    result->push_back(tolower(c));
-  }
-}
-
-}  // namespace
-
 string GetPythonOps(const OpList& ops, const std::vector<string>& hidden_ops,
                     bool require_shapes) {
   string result;
   // Header
   // TODO(josh11b): Mention the library for which wrappers are being generated.
-  strings::Appendf(&result, R"("""Python wrappers around Brain.
+  strings::StrAppend(&result, R"("""Python wrappers around TensorFlow ops.
 
 This file is MACHINE GENERATED! Do not edit.
 """
@@ -699,8 +727,8 @@ from tensorflow.python.framework import op_def_library as _op_def_library
                        GetPythonOp(op_def, is_hidden, lower_case_name));
 
     if (!require_shapes) {
-      strings::Appendf(&result, "_ops.RegisterShape(\"%s\")(None)\n",
-                       op_def.name().c_str());
+      strings::StrAppend(&result, "_ops.RegisterShape(\"", op_def.name(),
+                         "\")(None)\n");
     }
 
     auto added = out->Add();
@@ -722,7 +750,7 @@ _InitOpDefLibrary.op_list_ascii = """%s"""
 
 _op_def_lib = _InitOpDefLibrary()
 )",
-                   cleaned_ops.DebugString().c_str());
+                   ProtoDebugString(cleaned_ops).c_str());
   return result;
 }
 
@@ -731,8 +759,8 @@ void PrintPythonOps(const OpList& ops, const std::vector<string>& hidden_ops,
   printf("%s", GetPythonOps(ops, hidden_ops, require_shapes).c_str());
 }
 
-string GetPythonWrappers(const char* op_wrapper_buf, size_t op_wrapper_len) {
-  string op_list_str(op_wrapper_buf, op_wrapper_len);
+string GetPythonWrappers(const char* op_list_buf, size_t op_list_len) {
+  string op_list_str(op_list_buf, op_list_len);
   OpList ops;
   ops.ParseFromString(op_list_str);
   return GetPythonOps(ops, {}, false);
diff --git a/tensorflow/python/framework/python_op_gen.h b/tensorflow/python/framework/python_op_gen.h
index 424244fcc55006943340ed865e97b9572a14102e..d865c238743ae7b8a5dc5a0101e2f154fca9baed 100644
--- a/tensorflow/python/framework/python_op_gen.h
+++ b/tensorflow/python/framework/python_op_gen.h
@@ -31,11 +31,13 @@ void PrintPythonOps(const OpList& ops, const std::vector<string>& hidden_ops,
                     bool require_shapes);
 string GetPythonOps(const OpList& ops, const std::vector<string>& hidden_ops,
                     bool require_shapes);
+string GetPythonOp(const OpDef& op_def, bool is_hidden, const string& op_name);
 
 // Get the python wrappers for a list of ops in a OpList.
-// buf should be a pointer to a buffer containing the binary encoded OpList
-// proto, and len should be the length of that buffer.
-string GetPythonWrappers(const char* op_wrapper_buf, size_t op_wrapper_len);
+// `op_list_buf` should be a pointer to a buffer containing
+// the binary encoded OpList proto, and `op_list_len` should be the
+// length of that buffer.
+string GetPythonWrappers(const char* op_list_buf, size_t op_list_len);
 
 }  // namespace tensorflow
 
diff --git a/tensorflow/python/framework/python_op_gen.i b/tensorflow/python/framework/python_op_gen.i
index 740eff4ecdb24a933abceeec11e5be1c60037e5b..26ec4e8e66b5d4e3be433c9e59f9b6034109d153 100644
--- a/tensorflow/python/framework/python_op_gen.i
+++ b/tensorflow/python/framework/python_op_gen.i
@@ -25,7 +25,7 @@ limitations under the License.
 // going from python bytes to const char* tries to decode the
 // contents from utf-8 to unicode for Python version >= 3, but
 // we want the bytes to be uninterpreted.
-%typemap(in) (const char* op_wrapper_buf, size_t op_wrapper_len) {
+%typemap(in) (const char* op_list_buf, size_t op_list_len) {
   char* c_string;
   Py_ssize_t py_size;
   if (PyBytes_AsStringAndSize($input, &c_string, &py_size) == -1) {
diff --git a/tensorflow/python/framework/subscribe.py b/tensorflow/python/framework/subscribe.py
index 91c6e33f22c6a7cee267950b4acf7cb077e7416d..2654bca31c8b13474f8c6e547a03ba33c75260d2 100644
--- a/tensorflow/python/framework/subscribe.py
+++ b/tensorflow/python/framework/subscribe.py
@@ -276,7 +276,7 @@ def subscribe(tensors, side_effects):
     Subscribed tensors, which are identity copies of the passed in tensors
       in the same passed in structure, but the graph has been modified
       such that these are downstream of the control dependencies for
-      the side effect graphs. Use these functionally equivelant tensors
+      the side effect graphs. Use these functionally equivalent tensors
       instead of the passed in tensors for further construction or running.
   """
   if not hasattr(side_effects, '__iter__'):
diff --git a/tensorflow/python/framework/tensor_shape.py b/tensorflow/python/framework/tensor_shape.py
index 3664710caa331a1d0960e95f7728db1c68d1706d..73c810711f4ffc96d6e5e69e8fd5a5e9f991568a 100644
--- a/tensorflow/python/framework/tensor_shape.py
+++ b/tensorflow/python/framework/tensor_shape.py
@@ -12,7 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-
 """Helper classes for tensor shape inference."""
 from __future__ import absolute_import
 from __future__ import division
@@ -31,8 +30,8 @@ class Dimension(object):
       self._value = None
     else:
       self._value = int(value)
-      if (not isinstance(value, compat.bytes_or_text_types)
-          and self._value != value):
+      if (not isinstance(value, compat.bytes_or_text_types) and
+          self._value != value):
         raise ValueError("Ambiguous dimension: %s" % value)
       if self._value < 0:
         raise ValueError("Dimension %d must be >= 0" % self._value)
@@ -89,9 +88,8 @@ class Dimension(object):
       True if this Dimension and `other` are compatible.
     """
     other = as_dimension(other)
-    return (self._value is None
-            or other.value is None
-            or self._value == other.value)
+    return (self._value is None or other.value is None or
+            self._value == other.value)
 
   def assert_is_compatible_with(self, other):
     """Raises an exception if `other` is not compatible with this Dimension.
@@ -104,8 +102,8 @@ class Dimension(object):
         is_compatible_with).
     """
     if not self.is_compatible_with(other):
-      raise ValueError("Dimensions %s and %s are not compatible"
-                       % (self, other))
+      raise ValueError("Dimensions %s and %s are not compatible" % (self,
+                                                                    other))
 
   def merge_with(self, other):
     """Returns a Dimension that combines the information in `self` and `other`.
@@ -385,18 +383,17 @@ class TensorShape(object):
   `Tensor`. It may be one of the following:
 
   * *Fully-known shape:* has a known number of dimensions and a known size
-    for each dimension.
+    for each dimension. e.g. `TensorShape([16, 256])`
   * *Partially-known shape:* has a known number of dimensions, and an unknown
-    size for one or more dimension.
+    size for one or more dimension. e.g. `TensorShape([None, 256])`
   * *Unknown shape:* has an unknown number of dimensions, and an unknown
-    size in all dimensions.
+    size in all dimensions. e.g. `TensorShape(None)`
 
   If a tensor is produced by an operation of type `"Foo"`, its shape
   may be inferred if there is a registered shape function for
-  `"Foo"`. See @{$adding_an_op#shape-functions-in-c$`Shape functions in   C++`} for
-  details of shape functions and how to register them. Alternatively,
-  the shape may be set explicitly using
-  @{tf.Tensor.set_shape}.
+  `"Foo"`. See @{$adding_an_op#shape-functions-in-c$`Shape functions in C++`}
+  for details of shape functions and how to register them. Alternatively,
+  the shape may be set explicitly using @{tf.Tensor.set_shape}.
   """
 
   def __init__(self, dims):
@@ -414,7 +411,7 @@ class TensorShape(object):
       self._dims = None
     elif isinstance(dims, compat.bytes_or_text_types):
       raise TypeError("A string has ambiguous TensorShape, please wrap in a "
-                       "list or convert to an int: %s" % dims)
+                      "list or convert to an int: %s" % dims)
     elif isinstance(dims, tensor_shape_pb2.TensorShapeProto):
       if dims.unknown_rank:
         self._dims = None
@@ -422,7 +419,8 @@ class TensorShape(object):
         self._dims = [
             # Protos store variable-size dimensions as -1
             as_dimension(dim.size if dim.size != -1 else None)
-            for dim in dims.dim]
+            for dim in dims.dim
+        ]
     elif isinstance(dims, TensorShape):
       self._dims = dims.dims
     else:
@@ -519,7 +517,7 @@ class TensorShape(object):
           # suffixes of otherwise unknown shapes.
           return unknown_shape()
         else:
-          return unknown_shape(ndims=stop-start)
+          return unknown_shape(ndims=stop - start)
       else:
         return Dimension(None)
 
@@ -560,8 +558,7 @@ class TensorShape(object):
           new_dims.append(dim.merge_with(other[i]))
         return TensorShape(new_dims)
       except ValueError:
-        raise ValueError("Shapes %s and %s are not compatible" %
-                         (self, other))
+        raise ValueError("Shapes %s and %s are not compatible" % (self, other))
 
   def concatenate(self, other):
     """Returns the concatenation of the dimension in `self` and `other`.
@@ -599,8 +596,8 @@ class TensorShape(object):
     other = as_shape(other)
     if self.ndims is not None and other.ndims is not None:
       if self.ndims != other.ndims:
-        raise ValueError(
-            "Shapes %s and %s must have the same rank" % (self, other))
+        raise ValueError("Shapes %s and %s must have the same rank" % (self,
+                                                                       other))
 
   def assert_has_rank(self, rank):
     """Raises an exception if `self` is not compatible with the given `rank`.
@@ -736,8 +733,8 @@ class TensorShape(object):
 
   def is_fully_defined(self):
     """Returns True iff `self` is fully defined in every dimension."""
-    return (self._dims is not None
-            and all(dim.value is not None for dim in self._dims))
+    return (self._dims is not None and all(dim.value is not None
+                                           for dim in self._dims))
 
   def assert_is_fully_defined(self):
     """Raises an exception if `self` is not fully defined in every dimension.
@@ -767,9 +764,10 @@ class TensorShape(object):
       return tensor_shape_pb2.TensorShapeProto(unknown_rank=True)
     else:
       return tensor_shape_pb2.TensorShapeProto(dim=[
-          tensor_shape_pb2.TensorShapeProto.Dim(
-              size=-1 if d.value is None else d.value)
-          for d in self._dims])
+          tensor_shape_pb2.TensorShapeProto.Dim(size=-1
+                                                if d.value is None else d.value)
+          for d in self._dims
+      ])
 
   def __eq__(self, other):
     """Returns True if `self` is equivalent to `other`."""
diff --git a/tensorflow/python/framework/test_util.py b/tensorflow/python/framework/test_util.py
index c3169e23a5c8df1a691bd74e3998168e21533956..ac551a6e1a4307756071335195234f20910a312b 100644
--- a/tensorflow/python/framework/test_util.py
+++ b/tensorflow/python/framework/test_util.py
@@ -250,7 +250,7 @@ class TensorFlowTestCase(googletest.TestCase):
     """Returns a unique temporary directory for the test to use.
 
     If you call this method multiple times during in a test, it will return the
-    same folder. However, accross different runs the directories will be
+    same folder. However, across different runs the directories will be
     different. This will ensure that across different runs tests will not be
     able to pollute each others environment.
     If you need multiple unique directories within a single test, you should
diff --git a/tensorflow/python/framework/test_util_test.py b/tensorflow/python/framework/test_util_test.py
index cccba94fb0fe3b6bd5603e92dcbafe9884809665..6129fa2e0d06e3ac271ace515a0e3ab8fb98ac9d 100644
--- a/tensorflow/python/framework/test_util_test.py
+++ b/tensorflow/python/framework/test_util_test.py
@@ -252,8 +252,7 @@ class TestUtilTest(test_util.TensorFlowTestCase):
     self.assertArrayNear(a, b, 0.001)
 
   def testForceGPU(self):
-    with self.assertRaisesRegexp(errors.InvalidArgumentError,
-                                 "Cannot assign a device to node"):
+    with self.assertRaises(errors.InvalidArgumentError):
       with self.test_session(force_gpu=True):
         # this relies on us not having a GPU implementation for assert, which
         # seems sensible
diff --git a/tensorflow/python/grappler/memory_optimizer_test.py b/tensorflow/python/grappler/memory_optimizer_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..581f17c2ca21d2d1634bdbc695156f66dd1d4b35
--- /dev/null
+++ b/tensorflow/python/grappler/memory_optimizer_test.py
@@ -0,0 +1,89 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for the swig wrapper tf_optimizer."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.core.protobuf import rewriter_config_pb2
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import meta_graph
+from tensorflow.python.framework import ops
+from tensorflow.python.grappler import tf_optimizer
+from tensorflow.python.ops import math_ops
+from tensorflow.python.platform import test
+
+
+class MemoryOptimizerTest(test.TestCase):
+  """Tests the Grappler memory optimizer."""
+
+  def testNoSwapping(self):
+    """Make sure the graph is preserved when there is nothing to swap."""
+    a = constant_op.constant(10, name='a')
+    b = constant_op.constant(20, name='b')
+    c = math_ops.add_n([a, b], name='c')
+    d = math_ops.add_n([b, c], name='d')
+    train_op = ops.get_collection_ref(ops.GraphKeys.TRAIN_OP)
+    train_op.append(d)
+    mg = meta_graph.create_meta_graph_def(graph=ops.get_default_graph())
+
+    rewriter_config = rewriter_config_pb2.RewriterConfig(
+        memory_optimization=rewriter_config_pb2.RewriterConfig.MANUAL)
+    graph = tf_optimizer.OptimizeGraph(rewriter_config, mg)
+
+    self.assertEqual(len(graph.node), 4)
+    self.assertItemsEqual([node.name
+                           for node in graph.node], ['a', 'b', 'c', 'd'])
+
+  def testSimpleSwap(self):
+    """Check that the swap annotations are followed."""
+    a = constant_op.constant(10, name='a')
+    b = constant_op.constant(20, name='b')
+    c = math_ops.add_n([a, b], name='c')
+    d = math_ops.add_n([b, c], name='d')
+    train_op = ops.get_collection_ref(ops.GraphKeys.TRAIN_OP)
+    train_op.append(d)
+
+    d.op.node_def.attr['_swap_to_host'].i = 0
+
+    mg = meta_graph.create_meta_graph_def(graph=ops.get_default_graph())
+
+    rewriter_config = rewriter_config_pb2.RewriterConfig(
+        memory_optimization=rewriter_config_pb2.RewriterConfig.MANUAL)
+    graph = tf_optimizer.OptimizeGraph(rewriter_config, mg)
+
+    self.assertEqual(len(graph.node), 6)
+    self.assertItemsEqual([node.name for node in graph.node], [
+        'a',
+        'b',
+        'c',
+        'd',
+        'swap_in_d_0',
+        'swap_out_d_0',
+    ])
+    for node in graph.node:
+      if node.name == 'swap_in_d_0':
+        self.assertEqual('swap_out_d_0', node.input[0])
+        self.assertEqual('^b', node.input[1])
+      elif node.name == 'swap_out_d_0':
+        self.assertEqual('b', node.input[0])
+      elif node.name == 'd':
+        self.assertEqual('swap_in_d_0', node.input[0])
+        self.assertEqual('c', node.input[1])
+
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/python/grappler/tf_optimizer.i b/tensorflow/python/grappler/tf_optimizer.i
index ab887e63e5f6e0283ca4c57e809722d4136cc555..404ce351801464ce9941505b7b51c3b9f009ba2c 100644
--- a/tensorflow/python/grappler/tf_optimizer.i
+++ b/tensorflow/python/grappler/tf_optimizer.i
@@ -58,6 +58,7 @@ limitations under the License.
   #include "tensorflow/core/framework/graph.pb.h"
   #include "tensorflow/core/grappler/grappler_item.h"
   #include "tensorflow/core/grappler/grappler_item_builder.h"
+  #include "tensorflow/core/grappler/clusters/virtual_cluster.h"
   #include "tensorflow/core/grappler/optimizers/meta_optimizer.h"
   #include "tensorflow/core/protobuf/meta_graph.pb.h"
   #include "tensorflow/core/protobuf/rewriter_config.pb.h"
@@ -69,9 +70,11 @@ PyObject* TF_OptimizeGraph(
     const tensorflow::grappler::ItemConfig item_config;
     std::unique_ptr<tensorflow::grappler::GrapplerItem> grappler_item =
         tensorflow::grappler::GrapplerItemFromMetaGraphDef(graph_id, metagraph, item_config);
+    std::unordered_map<string, tensorflow::DeviceProperties> device_map;
+    tensorflow::grappler::VirtualCluster cluster(device_map);
     tensorflow::GraphDef out_graph;
     tensorflow::Status status = tensorflow::grappler::RunMetaOptimizer(
-        *grappler_item, rewriter_config, &out_graph);
+        *grappler_item, rewriter_config, &cluster, &out_graph);
     tensorflow::Set_TF_Status_from_Status(out_status, status);
     string out_graph_str = out_graph.SerializeAsString();
     PyObject* ret = PyBytes_FromStringAndSize(out_graph_str.data(),
diff --git a/tensorflow/python/kernel_tests/BUILD b/tensorflow/python/kernel_tests/BUILD
index 06a0aa468a1c35634373e8bd20651f5500a1b313..5d9534a206b237a4f7b89dbb4ca896bae52a2b08 100644
--- a/tensorflow/python/kernel_tests/BUILD
+++ b/tensorflow/python/kernel_tests/BUILD
@@ -402,6 +402,22 @@ tf_py_test(
     ],
 )
 
+tf_py_test(
+    name = "lookup_ops_test",
+    size = "small",
+    srcs = ["lookup_ops_test.py"],
+    additional_deps = [
+        "//tensorflow/python:client",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:errors",
+        "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:lookup_ops",
+        "//tensorflow/python:sparse_tensor",
+        "//tensorflow/python:training",
+    ],
+)
+
 tf_py_test(
     name = "losses_test",
     size = "medium",
@@ -1548,6 +1564,7 @@ cuda_py_test(
         "//tensorflow/python:framework_for_generated_wrappers",
         "//tensorflow/python:math_ops",
     ],
+    shard_count = 4,
 )
 
 cuda_py_test(
@@ -2363,7 +2380,6 @@ cuda_py_test(
         "//third_party/py/numpy",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:client_testlib",
-        "//tensorflow/python:data_flow_grad",
         "//tensorflow/python:data_flow_ops",
         "//tensorflow/python:embedding_ops",
         "//tensorflow/python:framework",
@@ -2569,6 +2585,30 @@ tf_py_test(
     ],
 )
 
+tf_py_test(
+    name = "bucketize_op_test",
+    size = "small",
+    srcs = ["bucketize_op_test.py"],
+    additional_deps = [
+        "//third_party/py/numpy",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:math_ops",
+    ],
+)
+
+tf_py_test(
+    name = "sparse_cross_op_test",
+    size = "small",
+    srcs = ["sparse_cross_op_test.py"],
+    additional_deps = [
+        "//third_party/py/numpy",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:sparse_ops",
+    ],
+)
+
 filegroup(
     name = "all_files",
     srcs = glob(
diff --git a/tensorflow/python/kernel_tests/array_ops_test.py b/tensorflow/python/kernel_tests/array_ops_test.py
index 23d8e8928327410abc4bd2d98398f78f988bc50c..7b8cd256643c27754724e5110797068cbcc6cc0d 100644
--- a/tensorflow/python/kernel_tests/array_ops_test.py
+++ b/tensorflow/python/kernel_tests/array_ops_test.py
@@ -33,6 +33,7 @@ from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import gradients_impl
 from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import resource_variable_ops
 from tensorflow.python.ops import state_ops
 from tensorflow.python.ops import variables
 from tensorflow.python.platform import test as test_lib
@@ -239,7 +240,7 @@ class ReverseV2Test(test_util.TensorFlowTestCase):
         self.assertAllEqual(x_tf, x_np)
 
   def _reverse1DimAuto(self, np_dtype):
-    x_np = np.array([1, 2, 3, 4, 5], dtype=np_dtype)
+    x_np = np.array([1, 200, 3, 40, 5], dtype=np_dtype)
 
     for use_gpu in [False, True]:
       with self.test_session(use_gpu=use_gpu):
@@ -247,7 +248,7 @@ class ReverseV2Test(test_util.TensorFlowTestCase):
         self.assertAllEqual(x_tf, np.asarray(x_np)[::-1])
 
   def _reverse2DimAuto(self, np_dtype):
-    x_np = np.array([[1, 2, 3], [4, 5, 6]], dtype=np_dtype)
+    x_np = np.array([[1, 200, 3], [4, 5, 60]], dtype=np_dtype)
 
     for reverse_f in [array_ops.reverse_v2, array_ops.reverse]:
       for use_gpu in [False, True]:
@@ -282,14 +283,14 @@ class ReverseV2Test(test_util.TensorFlowTestCase):
   def testReverse1DimAuto(self):
     for dtype in [
         np.uint8, np.int8, np.int32, np.int64, np.bool, np.float16, np.float32,
-        np.float64, np.complex64, np.complex128
+        np.float64, np.complex64, np.complex128, np.array(b"").dtype.type
     ]:
       self._reverse1DimAuto(dtype)
 
   def testReverse2DimAuto(self):
     for dtype in [
         np.uint8, np.int8, np.int32, np.int64, np.bool, np.float16, np.float32,
-        np.float64, np.complex64, np.complex128
+        np.float64, np.complex64, np.complex128, np.array(b"").dtype.type
     ]:
       self._reverse2DimAuto(dtype)
 
@@ -807,9 +808,10 @@ class StridedSliceBenchmark(test_lib.Benchmark):
 
 class StridedSliceAssignChecker(object):
 
-  def __init__(self, test, x, tensor_type=dtypes.float32):
+  def __init__(self, test, x, tensor_type=dtypes.float32, use_resource=False):
     self.tensor_type = tensor_type
     self.test = test
+    self._use_resource = use_resource
 
     self.x_np = np.array(x).astype(tensor_type.as_numpy_dtype)
     # Give the value a non-zero imaginary component for complex types.
@@ -824,7 +826,10 @@ class StridedSliceAssignChecker(object):
       value -= 1j * value
 
     with self.test.test_session(use_gpu=True) as sess:
-      var = variables.Variable(self.x)
+      if self._use_resource:
+        var = resource_variable_ops.ResourceVariable(self.x)
+      else:
+        var = variables.Variable(self.x)
       sess.run(variables.initialize_variables([var]))
       val = sess.run(var[index].assign(value))
       # val_copy is used to check that tf.assign works equivalently to the
@@ -846,9 +851,10 @@ class SliceAssignTest(test_util.TensorFlowTestCase):
         bar = foo[:2].assign(constant_op.constant([1, 2]))
         sess.run(bar)
 
-  def testSliceAssign(self):
+  def doTestSliceAssign(self, use_resource):
     for dtype in STRIDED_SLICE_TYPES:
       checker = StridedSliceAssignChecker(self, [[1, 2, 3], [4, 5, 6]],
+                                          use_resource=use_resource,
                                           tensor_type=dtype)
       # Check if equal
       checker[:] = [[10, 20, 30], [40, 50, 60]]
@@ -873,6 +879,12 @@ class SliceAssignTest(test_util.TensorFlowTestCase):
     checker2[...] = 6  # ellipsis
     checker2[None] = [6]  # new axis
 
+  def testSliceAssign(self):
+    self.doTestSliceAssign(use_resource=False)
+
+  def testSliceAssignResource(self):
+    self.doTestSliceAssign(use_resource=True)
+
   def testUninitialized(self):
     with self.assertRaisesRegexp(
         errors.FailedPreconditionError,
diff --git a/tensorflow/contrib/layers/python/kernel_tests/bucketization_op_test.py b/tensorflow/python/kernel_tests/bucketize_op_test.py
similarity index 55%
rename from tensorflow/contrib/layers/python/kernel_tests/bucketization_op_test.py
rename to tensorflow/python/kernel_tests/bucketize_op_test.py
index abc6cc5674ce69fa2d7b27cdad773e9d29ee938e..ed53cc62940650c7312ea49afebf585ca2d705d0 100644
--- a/tensorflow/contrib/layers/python/kernel_tests/bucketization_op_test.py
+++ b/tensorflow/python/kernel_tests/bucketize_op_test.py
@@ -12,35 +12,57 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Tests for bucketization_op."""
+"""Tests for bucketize_op."""
 
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.contrib.layers.python.ops import bucketization_op
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import errors_impl
+from tensorflow.python.ops import math_ops
 from tensorflow.python.platform import test
 
 
 class BucketizationOpTest(test.TestCase):
 
-  def test_normal_usecase(self):
-    op = bucketization_op.bucketize(
+  def testInt(self):
+    op = math_ops._bucketize(
         constant_op.constant([-5, 0, 2, 3, 5, 8, 10, 11, 12]),
         boundaries=[0, 3, 8, 11])
     expected_out = [0, 1, 1, 2, 2, 3, 3, 4, 4]
     with self.test_session() as sess:
       self.assertAllEqual(expected_out, sess.run(op))
 
-  def test_invalid_boundaries_order(self):
-    op = bucketization_op.bucketize(
+  def testFloat(self):
+    op = math_ops._bucketize(
+        constant_op.constant([-5., 0., 2., 3., 5., 8., 10., 11., 12.]),
+        boundaries=[0., 3., 8., 11.])
+    expected_out = [0, 1, 1, 2, 2, 3, 3, 4, 4]
+    with self.test_session() as sess:
+      self.assertAllEqual(expected_out, sess.run(op))
+
+  def test2DInput(self):
+    op = math_ops._bucketize(
+        constant_op.constant([[-5, 0, 2, 3, 5], [8, 10, 11, 12, 0]]),
+        boundaries=[0, 3, 8, 11])
+    expected_out = [[0, 1, 1, 2, 2], [3, 3, 4, 4, 1]]
+    with self.test_session() as sess:
+      self.assertAllEqual(expected_out, sess.run(op))
+
+  def testInvalidBoundariesOrder(self):
+    op = math_ops._bucketize(
         constant_op.constant([-5, 0]), boundaries=[0, 8, 3, 11])
     with self.test_session() as sess:
-      with self.assertRaises(errors_impl.InvalidArgumentError):
+      with self.assertRaisesRegexp(
+          errors_impl.InvalidArgumentError, "Expected sorted boundaries"):
         sess.run(op)
 
+  def testBoundariesNotList(self):
+    with self.assertRaisesRegexp(
+        TypeError, "Expected list for attr boundaries"):
+      math_ops._bucketize(constant_op.constant([-5, 0]), boundaries=0)
+
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/python/kernel_tests/control_flow_ops_py_test.py b/tensorflow/python/kernel_tests/control_flow_ops_py_test.py
index 00f6cc0d6d94b0f2eafb98481ea924fa86f1afa8..405c003cabed08c145369e4e29a2ca460c658509 100644
--- a/tensorflow/python/kernel_tests/control_flow_ops_py_test.py
+++ b/tensorflow/python/kernel_tests/control_flow_ops_py_test.py
@@ -45,6 +45,8 @@ from tensorflow.python.ops import gen_data_flow_ops
 from tensorflow.python.ops import gen_logging_ops
 from tensorflow.python.ops import gen_state_ops
 from tensorflow.python.ops import gradients_impl
+from tensorflow.python.ops import init_ops
+from tensorflow.python.ops import linalg_ops
 from tensorflow.python.ops import logging_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import resource_variable_ops
@@ -2139,6 +2141,29 @@ class ControlFlowTest(test.TestCase):
       r = gradients_impl.gradients(r, y)[0]
       self.assertEqual(388.0, r.eval())
 
+  def testStopGradMultiFlows(self):
+    with self.test_session():
+      def body(i, y, r):
+        x = variable_scope.get_variable(
+            "x", shape=(), dtype=dtypes.float32,
+            initializer=init_ops.ones_initializer())
+        y *= x
+        return [i + 1, y, r + math_ops.reduce_sum(y)]
+
+      i0 = constant_op.constant(0)
+      y0 = array_ops.ones(5)
+      r0 = constant_op.constant(0.0)
+      cond = lambda i, y, r: i < 1
+      _, _, r = control_flow_ops.while_loop(
+          cond, body, [i0, y0, r0], back_prop=True)
+
+      vars_ = variables.global_variables()
+      grads = linalg_ops.norm(gradients_impl.gradients(r, vars_)[0])
+      z = math_ops.add(r, array_ops.stop_gradient(math_ops.reduce_sum(grads)))
+      result = gradients_impl.gradients(z, vars_)[0]
+      variables.global_variables_initializer().run()
+      self.assertEqual(5.0, result.eval())
+
   def testOneValueCond(self):
     with self.test_session():
       c = array_ops.placeholder(dtypes.int32, shape=[])
diff --git a/tensorflow/python/kernel_tests/cwise_ops_test.py b/tensorflow/python/kernel_tests/cwise_ops_test.py
index 2f2f1da09051558ca4e2c48866d05d69252d1292..0846470abc6c0be452a836da93f66dea803ea5c0 100644
--- a/tensorflow/python/kernel_tests/cwise_ops_test.py
+++ b/tensorflow/python/kernel_tests/cwise_ops_test.py
@@ -1106,16 +1106,18 @@ class BinaryOpTest(test.TestCase):
 
   def testAtan2SpecialValues(self):
     x1l, x2l = zip((+0.0, +0.0), (+0.0, -0.0), (-0.0, +0.0), (-0.0, -0.0),
-                    (1.2345, float('inf')), (1.2345, -float('inf')),
-                    (-4.321, float('inf')), (-4.125, -float('inf')),
-                    (float('inf'), float('inf')), (float('inf'), -float('inf')),
-                    (-float('inf'), float('inf')), (-float('inf'), -float('inf')))
+                   (1.2345, float("inf")), (1.2345, -float("inf")),
+                   (-4.321, float("inf")), (-4.125, -float("inf")),
+                   (float("inf"), float("inf")), (float("inf"), -float("inf")),
+                   (-float("inf"), float("inf")), (-float("inf"),
+                                                   -float("inf")))
     for dtype in np.float32, np.float64:
       x1 = np.array(x1l).astype(dtype)
       x2 = np.array(x2l).astype(dtype)
       self._compareCpu(x1, x2, np.arctan2, math_ops.atan2)
       self._compareGpu(x1, x2, np.arctan2, math_ops.atan2)
 
+
 class ComparisonOpTest(test.TestCase):
 
   def _compareScalar(self, func, x, y, dtype):
diff --git a/tensorflow/python/kernel_tests/decode_image_op_test.py b/tensorflow/python/kernel_tests/decode_image_op_test.py
index 52f48c3368be8c372db581e48ed8f05927f45a34..b457b5cc866e2218d835260ee0b07395b2abf3d1 100644
--- a/tensorflow/python/kernel_tests/decode_image_op_test.py
+++ b/tensorflow/python/kernel_tests/decode_image_op_test.py
@@ -36,10 +36,10 @@ class DecodeImageOpTest(test.TestCase):
   def testGif(self):
     # Read some real GIFs
     path = os.path.join(prefix_path, "gif", "testdata", "scan.gif")
-    WIDTH = 20
-    HEIGHT = 40
-    STRIDE = 5
-    shape = (12, HEIGHT, WIDTH, 3)
+    width = 20
+    height = 40
+    stride = 5
+    shape = (12, height, width, 3)
 
     with self.test_session(use_gpu=True) as sess:
       gif0 = io_ops.read_file(path)
@@ -52,13 +52,13 @@ class DecodeImageOpTest(test.TestCase):
 
       for frame_idx, frame in enumerate(image0):
         gt = np.zeros(shape[1:], dtype=np.uint8)
-        start = frame_idx * STRIDE
-        end = (frame_idx + 1) * STRIDE
-        if end <= WIDTH:
+        start = frame_idx * stride
+        end = (frame_idx + 1) * stride
+        if end <= width:
           gt[:, start:end, :] = 255
         else:
-          start -= WIDTH
-          end -= WIDTH
+          start -= width
+          end -= width
           gt[start:end, :, :] = 255
 
         self.assertAllClose(frame, gt)
@@ -79,11 +79,15 @@ class DecodeImageOpTest(test.TestCase):
       self.assertEqual(image0.shape, (256, 128, 3))
       self.assertAllEqual(image0, image1)
 
+      bad_channels = image_ops.decode_image(jpeg0, channels=4)
+      with self.assertRaises(errors_impl.InvalidArgumentError):
+        bad_channels.eval()
+
   def testPng(self):
     # Read some real PNGs, converting to different channel numbers
     inputs = [(1, "lena_gray.png")]
     for channels_in, filename in inputs:
-      for channels in 0, 1, 3:
+      for channels in 0, 1, 3, 4:
         with self.test_session(use_gpu=True) as sess:
           path = os.path.join(prefix_path, "png", "testdata", filename)
           png0 = io_ops.read_file(path)
@@ -100,11 +104,6 @@ class DecodeImageOpTest(test.TestCase):
       with self.assertRaises(errors_impl.InvalidArgumentError):
         decode.eval()
 
-  def testInvalidChannels(self):
-    image_bytes = b"unused"
-    with self.assertRaises(ValueError):
-      decode = image_ops.decode_image(image_bytes, channels=4)
-
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/python/kernel_tests/decode_raw_op_test.py b/tensorflow/python/kernel_tests/decode_raw_op_test.py
index cd7216c52796f868623e94b2c61a3bde18204f08..fbaf335efb8ae3b12b2014e4298df12c2967a583 100644
--- a/tensorflow/python/kernel_tests/decode_raw_op_test.py
+++ b/tensorflow/python/kernel_tests/decode_raw_op_test.py
@@ -77,6 +77,14 @@ class DecodeRawOpTest(test.TestCase):
 
       self.assertAllEqual(expected_result, result)
 
+  def testEmptyStringInput(self):
+    with self.test_session():
+      in_bytes = array_ops.placeholder(dtypes.string, shape=[None])
+      decode = parsing_ops.decode_raw(in_bytes, out_type=dtypes.float16)
+
+      result = decode.eval(feed_dict={in_bytes: [""]})
+      self.assertEqual(len(result), 1)
+
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/python/kernel_tests/depthwise_conv_op_test.py b/tensorflow/python/kernel_tests/depthwise_conv_op_test.py
index a881ed0dc9adda6418a1712656caf2d63d24ae05..2fc34bd4d17860e57d66e5eda1218d430cfc6b4a 100644
--- a/tensorflow/python/kernel_tests/depthwise_conv_op_test.py
+++ b/tensorflow/python/kernel_tests/depthwise_conv_op_test.py
@@ -113,10 +113,9 @@ class DepthwiseConv2DTest(test.TestCase):
       total_size_1 *= s
     for s in filter_in_sizes:
       total_size_2 *= s
-    # Initializes the input tensor with array containing incrementing
-    # numbers from 1.
+    # Initializes the input and filter tensor with numbers incrementing from 1.
     x1 = [f * 1.0 for f in range(1, total_size_1 + 1)]
-    x2 = [1.0 for f in range(1, total_size_2 + 1)]
+    x2 = [f * 1.0 for f in range(1, total_size_2 + 1)]
     with self.test_session(use_gpu=use_gpu) as sess:
       t1 = constant_op.constant(x1, shape=tensor_in_sizes)
       t1.set_shape(tensor_in_sizes)
@@ -147,8 +146,9 @@ class DepthwiseConv2DTest(test.TestCase):
       native_result = sess.run(conv_native)
       interface_result = sess.run(conv_interface)
 
-    print("diff matrix:",
-          np.amax(np.ravel(native_result) - np.ravel(interface_result)))
+    print("depthwise conv_2d: ", tensor_in_sizes, "*", filter_in_sizes,
+          ", stride:", stride, ", padding: ", padding, ", max diff: ",
+          np.amax(np.absolute(native_result - interface_result)))
     self.assertArrayNear(
         np.ravel(native_result), np.ravel(interface_result), 1e-5)
     self.assertShapeEqual(native_result, conv_native)
diff --git a/tensorflow/python/kernel_tests/distributions/BUILD b/tensorflow/python/kernel_tests/distributions/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..50a079520048f8b2fc1ae0769b26507bb452d8b1
--- /dev/null
+++ b/tensorflow/python/kernel_tests/distributions/BUILD
@@ -0,0 +1,279 @@
+# Tests of TensorFlow kernels written using the Python API.
+
+package(
+    default_visibility = ["//tensorflow:internal"],
+    features = [
+        "-layering_check",
+        "-parse_headers",
+    ],
+)
+
+licenses(["notice"])  # Apache 2.0
+
+load("//tensorflow:tensorflow.bzl", "cuda_py_test")
+
+cuda_py_test(
+    name = "bijector_test",
+    size = "small",
+    srcs = ["bijector_test.py"],
+    additional_deps = [
+        "//tensorflow/python/ops/distributions",
+        "//third_party/py/numpy",
+        "@six_archive//:six",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:platform_test",
+    ],
+)
+
+cuda_py_test(
+    name = "kullback_leibler_test",
+    size = "small",
+    srcs = ["kullback_leibler_test.py"],
+    additional_deps = [
+        "//tensorflow/python/ops/distributions",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:platform_test",
+    ],
+)
+
+cuda_py_test(
+    name = "beta_test",
+    size = "small",
+    srcs = ["beta_test.py"],
+    additional_deps = [
+        "//tensorflow/python/ops/distributions",
+        "//third_party/py/numpy",
+        "//tensorflow/python:client",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:framework",
+        "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:nn_ops",
+        "//tensorflow/python:platform_test",
+    ],
+)
+
+cuda_py_test(
+    name = "bernoulli_test",
+    size = "small",
+    srcs = ["bernoulli_test.py"],
+    additional_deps = [
+        "//tensorflow/python/ops/distributions",
+        "//third_party/py/numpy",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:platform_test",
+    ],
+)
+
+cuda_py_test(
+    name = "categorical_test",
+    size = "small",
+    srcs = ["categorical_test.py"],
+    additional_deps = [
+        "//tensorflow/python/ops/distributions",
+        "//third_party/py/numpy",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:framework",
+        "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:platform_test",
+        "//tensorflow/python:random_ops",
+    ],
+)
+
+cuda_py_test(
+    name = "dirichlet_test",
+    size = "small",
+    srcs = ["dirichlet_test.py"],
+    additional_deps = [
+        "//tensorflow/python/ops/distributions",
+        "//third_party/py/numpy",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:platform_test",
+    ],
+)
+
+cuda_py_test(
+    name = "dirichlet_multinomial_test",
+    size = "medium",
+    srcs = ["dirichlet_multinomial_test.py"],
+    additional_deps = [
+        "//tensorflow/python/ops/distributions",
+        "//third_party/py/numpy",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:platform_test",
+    ],
+)
+
+cuda_py_test(
+    name = "exponential_test",
+    srcs = ["exponential_test.py"],
+    additional_deps = [
+        "//tensorflow/python/ops/distributions",
+        "//third_party/py/numpy",
+        "//tensorflow/python:client",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:nn_ops",
+        "//tensorflow/python:platform_test",
+    ],
+)
+
+cuda_py_test(
+    name = "gamma_test",
+    srcs = ["gamma_test.py"],
+    additional_deps = [
+        "//tensorflow/python/ops/distributions",
+        "//third_party/py/numpy",
+        "//tensorflow/python:client",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:nn_ops",
+        "//tensorflow/python:platform_test",
+    ],
+)
+
+cuda_py_test(
+    name = "laplace_test",
+    srcs = ["laplace_test.py"],
+    additional_deps = [
+        "//tensorflow/python/ops/distributions",
+        "//third_party/py/numpy",
+        "//tensorflow/python:client",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:nn_ops",
+        "//tensorflow/python:platform_test",
+    ],
+)
+
+cuda_py_test(
+    name = "multinomial_test",
+    srcs = ["multinomial_test.py"],
+    additional_deps = [
+        "//tensorflow/python/ops/distributions",
+        "//third_party/py/numpy",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:platform_test",
+    ],
+)
+
+cuda_py_test(
+    name = "student_t_test",
+    size = "small",
+    srcs = ["student_t_test.py"],
+    additional_deps = [
+        "//tensorflow/python/ops/distributions",
+        "//third_party/py/numpy",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:framework",
+        "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:nn_ops",
+        "//tensorflow/python:platform_test",
+    ],
+    tags = ["nomsan"],  # disable to avoid false positives from scipy.
+)
+
+cuda_py_test(
+    name = "uniform_test",
+    size = "small",
+    srcs = ["uniform_test.py"],
+    additional_deps = [
+        "//tensorflow/python/ops/distributions",
+        "//third_party/py/numpy",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:errors",
+        "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:math_ops",
+    ],
+)
+
+cuda_py_test(
+    name = "normal_test",
+    size = "medium",
+    srcs = ["normal_test.py"],
+    additional_deps = [
+        "//tensorflow/python/ops/distributions",
+        "//third_party/py/numpy",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:gradients",
+        "//tensorflow/python:nn_ops",
+        "//tensorflow/python:platform_test",
+        "//tensorflow/python:variables",
+    ],
+)
+
+cuda_py_test(
+    name = "special_math_test",
+    size = "medium",
+    srcs = ["special_math_test.py"],
+    additional_deps = [
+        "//tensorflow/python/ops/distributions",
+        "//third_party/py/numpy",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:gradients",
+        "//tensorflow/python:platform_test",
+        "//tensorflow/python:variables",
+    ],
+)
+
+cuda_py_test(
+    name = "identity_bijector_test",
+    size = "small",
+    srcs = ["identity_bijector_test.py"],
+    additional_deps = [
+        "//tensorflow/python/ops/distributions",
+        "//third_party/py/numpy",
+        "@six_archive//:six",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:platform_test",
+    ],
+)
+
+filegroup(
+    name = "all_files",
+    srcs = glob(
+        ["**/*"],
+        exclude = [
+            "**/METADATA",
+            "**/OWNERS",
+        ],
+    ),
+    visibility = ["//tensorflow:__subpackages__"],
+)
diff --git a/tensorflow/python/kernel_tests/distributions/__init__.py b/tensorflow/python/kernel_tests/distributions/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..94dd13c89057795d4f5182cbf25867441f76c8b3
--- /dev/null
+++ b/tensorflow/python/kernel_tests/distributions/__init__.py
@@ -0,0 +1,18 @@
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Kernel tests for tf.distributions."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
diff --git a/tensorflow/contrib/distributions/python/kernel_tests/bernoulli_test.py b/tensorflow/python/kernel_tests/distributions/bernoulli_test.py
similarity index 93%
rename from tensorflow/contrib/distributions/python/kernel_tests/bernoulli_test.py
rename to tensorflow/python/kernel_tests/distributions/bernoulli_test.py
index 87b2331a1d425aa6d5d56d074aed026ed8ef1c2b..ef93c4dab088c1e8bcb8ba1673d964eabb79835d 100644
--- a/tensorflow/contrib/distributions/python/kernel_tests/bernoulli_test.py
+++ b/tensorflow/python/kernel_tests/distributions/bernoulli_test.py
@@ -18,15 +18,30 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import importlib
+
 import numpy as np
-import scipy.special
-from tensorflow.contrib.distributions.python.ops import bernoulli
-from tensorflow.contrib.distributions.python.ops import kullback_leibler
+
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import math_ops
+from tensorflow.python.ops.distributions import bernoulli
+from tensorflow.python.ops.distributions import kullback_leibler
 from tensorflow.python.platform import test
+from tensorflow.python.platform import tf_logging
+
+
+def try_import(name):  # pylint: disable=invalid-name
+  module = None
+  try:
+    module = importlib.import_module(name)
+  except ImportError as e:
+    tf_logging.warning("Could not import %s: %s" % (name, str(e)))
+  return module
+
+
+special = try_import("scipy.special")
 
 
 def make_bernoulli(batch_shape, dtype=dtypes.int32):
@@ -54,13 +69,16 @@ class BernoulliTest(test.TestCase):
     with self.test_session():
       self.assertAllClose(logits, dist.logits.eval())
 
+    if not special:
+      return
+
     with self.test_session():
-      self.assertAllClose(scipy.special.expit(logits), dist.probs.eval())
+      self.assertAllClose(special.expit(logits), dist.probs.eval())
 
     p = [0.01, 0.99, 0.42]
     dist = bernoulli.Bernoulli(probs=p)
     with self.test_session():
-      self.assertAllClose(scipy.special.logit(p), dist.logits.eval())
+      self.assertAllClose(special.logit(p), dist.logits.eval())
 
   def testInvalidP(self):
     invalid_ps = [1.01, 2.]
@@ -160,7 +178,9 @@ class BernoulliTest(test.TestCase):
   def testPmfWithP(self):
     p = [[0.2, 0.4], [0.3, 0.6]]
     self._testPmf(probs=p)
-    self._testPmf(logits=scipy.special.logit(p))
+    if not special:
+      return
+    self._testPmf(logits=special.logit(p))
 
   def testBroadcasting(self):
     with self.test_session():
@@ -286,7 +306,7 @@ class BernoulliTest(test.TestCase):
       a = bernoulli.Bernoulli(probs=a_p)
       b = bernoulli.Bernoulli(probs=b_p)
 
-      kl = kullback_leibler.kl(a, b)
+      kl = kullback_leibler.kl_divergence(a, b)
       kl_val = sess.run(kl)
 
       kl_expected = (a_p * np.log(a_p / b_p) + (1. - a_p) * np.log(
diff --git a/tensorflow/contrib/distributions/python/kernel_tests/beta_test.py b/tensorflow/python/kernel_tests/distributions/beta_test.py
similarity index 93%
rename from tensorflow/contrib/distributions/python/kernel_tests/beta_test.py
rename to tensorflow/python/kernel_tests/distributions/beta_test.py
index f524986cec8d881b262a2a2009da021d7e1e91e9..91a451f033ffbb01d54c3dacce952b406564b7b4 100644
--- a/tensorflow/contrib/distributions/python/kernel_tests/beta_test.py
+++ b/tensorflow/python/kernel_tests/distributions/beta_test.py
@@ -16,18 +16,33 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import importlib
+
 import numpy as np
-from scipy import special
-from scipy import stats
-from tensorflow.contrib.distributions.python.ops import beta as beta_lib
-from tensorflow.contrib.distributions.python.ops import kullback_leibler
+
 from tensorflow.python.client import session
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import random_seed
 from tensorflow.python.framework import tensor_shape
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import nn_ops
+from tensorflow.python.ops.distributions import beta as beta_lib
+from tensorflow.python.ops.distributions import kullback_leibler
 from tensorflow.python.platform import test
+from tensorflow.python.platform import tf_logging
+
+
+def try_import(name):  # pylint: disable=invalid-name
+  module = None
+  try:
+    module = importlib.import_module(name)
+  except ImportError as e:
+    tf_logging.warning("Could not import %s: %s" % (name, str(e)))
+  return module
+
+
+special = try_import("scipy.special")
+stats = try_import("scipy.stats")
 
 
 class BetaTest(test.TestCase):
@@ -167,18 +182,22 @@ class BetaTest(test.TestCase):
     with session.Session():
       a = [1., 2, 3]
       b = [2., 4, 1.2]
-      expected_mean = stats.beta.mean(a, b)
       dist = beta_lib.Beta(a, b)
       self.assertEqual(dist.mean().get_shape(), (3,))
+      if not stats:
+        return
+      expected_mean = stats.beta.mean(a, b)
       self.assertAllClose(expected_mean, dist.mean().eval())
 
   def testBetaVariance(self):
     with session.Session():
       a = [1., 2, 3]
       b = [2., 4, 1.2]
-      expected_variance = stats.beta.var(a, b)
       dist = beta_lib.Beta(a, b)
       self.assertEqual(dist.variance().get_shape(), (3,))
+      if not stats:
+        return
+      expected_variance = stats.beta.var(a, b)
       self.assertAllClose(expected_variance, dist.variance().eval())
 
   def testBetaMode(self):
@@ -228,9 +247,11 @@ class BetaTest(test.TestCase):
     with session.Session():
       a = [1., 2, 3]
       b = [2., 4, 1.2]
-      expected_entropy = stats.beta.entropy(a, b)
       dist = beta_lib.Beta(a, b)
       self.assertEqual(dist.entropy().get_shape(), (3,))
+      if not stats:
+        return
+      expected_entropy = stats.beta.entropy(a, b)
       self.assertAllClose(expected_entropy, dist.entropy().eval())
 
   def testBetaSample(self):
@@ -243,6 +264,8 @@ class BetaTest(test.TestCase):
       sample_values = samples.eval()
       self.assertEqual(sample_values.shape, (100000,))
       self.assertFalse(np.any(sample_values < 0.0))
+      if not stats:
+        return
       self.assertLess(
           stats.kstest(
               # Beta is a univariate distribution.
@@ -286,6 +309,8 @@ class BetaTest(test.TestCase):
       sample_values = samples.eval()
       self.assertEqual(sample_values.shape, (100000, 3, 2, 2))
       self.assertFalse(np.any(sample_values < 0.0))
+      if not stats:
+        return
       self.assertAllClose(
           sample_values[:, 1, :].mean(axis=0),
           stats.beta.mean(a, b)[1, :],
@@ -301,6 +326,8 @@ class BetaTest(test.TestCase):
         actual = beta_lib.Beta(a, b).cdf(x).eval()
         self.assertAllEqual(np.ones(shape, dtype=np.bool), 0. <= x)
         self.assertAllEqual(np.ones(shape, dtype=np.bool), 1. >= x)
+        if not stats:
+          return
         self.assertAllClose(stats.beta.cdf(x, a, b), actual, rtol=1e-4, atol=0)
 
   def testBetaLogCdf(self):
@@ -313,6 +340,8 @@ class BetaTest(test.TestCase):
         actual = math_ops.exp(beta_lib.Beta(a, b).log_cdf(x)).eval()
         self.assertAllEqual(np.ones(shape, dtype=np.bool), 0. <= x)
         self.assertAllEqual(np.ones(shape, dtype=np.bool), 1. >= x)
+        if not stats:
+          return
         self.assertAllClose(stats.beta.cdf(x, a, b), actual, rtol=1e-4, atol=0)
 
   def testBetaWithSoftplusConcentration(self):
@@ -342,6 +371,8 @@ class BetaTest(test.TestCase):
         d2_sp = beta_lib.BetaWithSoftplusConcentration(concentration1=a2_sp,
                                                        concentration0=b2_sp)
 
+        if not special:
+          return
         kl_expected = (special.betaln(a2, b2) - special.betaln(a1, b1) +
                        (a1 - a2) * special.digamma(a1) +
                        (b1 - b2) * special.digamma(b1) +
@@ -349,13 +380,13 @@ class BetaTest(test.TestCase):
 
         for dist1 in [d1, d1_sp]:
           for dist2 in [d2, d2_sp]:
-            kl = kullback_leibler.kl(dist1, dist2)
+            kl = kullback_leibler.kl_divergence(dist1, dist2)
             kl_val = sess.run(kl)
             self.assertEqual(kl.get_shape(), shape)
             self.assertAllClose(kl_val, kl_expected)
 
         # Make sure KL(d1||d1) is 0
-        kl_same = sess.run(kullback_leibler.kl(d1, d1))
+        kl_same = sess.run(kullback_leibler.kl_divergence(d1, d1))
         self.assertAllClose(kl_same, np.zeros_like(kl_expected))
 
 
diff --git a/tensorflow/contrib/distributions/python/kernel_tests/bijectors/bijector_test.py b/tensorflow/python/kernel_tests/distributions/bijector_test.py
similarity index 95%
rename from tensorflow/contrib/distributions/python/kernel_tests/bijectors/bijector_test.py
rename to tensorflow/python/kernel_tests/distributions/bijector_test.py
index 3ba6aa529332246c8770a8f6bd32af7bd8ac049c..9f9fb5c0bb4c0e9d68ddf6034a8649ad5a6bd8e9 100644
--- a/tensorflow/contrib/distributions/python/kernel_tests/bijectors/bijector_test.py
+++ b/tensorflow/python/kernel_tests/distributions/bijector_test.py
@@ -22,9 +22,9 @@ import abc
 
 import six
 
-from tensorflow.contrib.distributions.python.ops.bijectors.bijector import Bijector
 from tensorflow.python.framework import constant_op
 from tensorflow.python.ops import math_ops
+from tensorflow.python.ops.distributions import bijector
 from tensorflow.python.platform import test
 
 
@@ -36,10 +36,10 @@ class BaseBijectorTest(test.TestCase):
       with self.assertRaisesRegexp(TypeError,
                                    ("Can't instantiate abstract class Bijector "
                                     "with abstract methods __init__")):
-        Bijector()
+        bijector.Bijector()  # pylint: disable=abstract-class-instantiated
 
   def testDefaults(self):
-    class _BareBonesBijector(Bijector):
+    class _BareBonesBijector(bijector.Bijector):
       """Minimal specification of a `Bijector`."""
 
       def __init__(self):
@@ -80,7 +80,7 @@ class IntentionallyMissingError(Exception):
   pass
 
 
-class BrokenBijector(Bijector):
+class BrokenBijector(bijector.Bijector):
   """Forward and inverse are not inverses of each other."""
 
   def __init__(self, forward_missing=False, inverse_missing=False):
diff --git a/tensorflow/contrib/distributions/python/kernel_tests/categorical_test.py b/tensorflow/python/kernel_tests/distributions/categorical_test.py
similarity index 97%
rename from tensorflow/contrib/distributions/python/kernel_tests/categorical_test.py
rename to tensorflow/python/kernel_tests/distributions/categorical_test.py
index 0b42581e79f15827f74749094bd50ceba5de50c2..bfdb5fa9fe7e46da0372e0f40c5b1e68e7e4963f 100644
--- a/tensorflow/contrib/distributions/python/kernel_tests/categorical_test.py
+++ b/tensorflow/python/kernel_tests/distributions/categorical_test.py
@@ -20,8 +20,6 @@ from __future__ import print_function
 
 import numpy as np
 
-from tensorflow.contrib.distributions.python.ops import categorical
-from tensorflow.contrib.distributions.python.ops import kullback_leibler
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import tensor_util
@@ -30,6 +28,8 @@ from tensorflow.python.ops import gradients_impl
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import nn_ops
 from tensorflow.python.ops import random_ops
+from tensorflow.python.ops.distributions import categorical
+from tensorflow.python.ops.distributions import kullback_leibler
 from tensorflow.python.platform import test
 
 
@@ -278,10 +278,10 @@ class CategoricalTest(test.TestCase):
           a = categorical.Categorical(logits=a_logits)
           b = categorical.Categorical(logits=b_logits)
 
-          kl = kullback_leibler.kl(a, b)
+          kl = kullback_leibler.kl_divergence(a, b)
           kl_val = sess.run(kl)
           # Make sure KL(a||a) is 0
-          kl_same = sess.run(kullback_leibler.kl(a, a))
+          kl_same = sess.run(kullback_leibler.kl_divergence(a, a))
 
           prob_a = np_softmax(a_logits)
           prob_b = np_softmax(b_logits)
diff --git a/tensorflow/contrib/distributions/python/kernel_tests/dirichlet_multinomial_test.py b/tensorflow/python/kernel_tests/distributions/dirichlet_multinomial_test.py
similarity index 99%
rename from tensorflow/contrib/distributions/python/kernel_tests/dirichlet_multinomial_test.py
rename to tensorflow/python/kernel_tests/distributions/dirichlet_multinomial_test.py
index bc25366cfa4f1db459ee8a83d26be9ddeda975ef..d009f4e9319293c636f90a76d49f8b90d473cb0d 100644
--- a/tensorflow/contrib/distributions/python/kernel_tests/dirichlet_multinomial_test.py
+++ b/tensorflow/python/kernel_tests/distributions/dirichlet_multinomial_test.py
@@ -17,14 +17,15 @@ from __future__ import division
 from __future__ import print_function
 
 import numpy as np
-from tensorflow.contrib import distributions
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import tensor_shape
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import math_ops
+from tensorflow.python.ops.distributions import dirichlet_multinomial
 from tensorflow.python.platform import test
 
-ds = distributions
+
+ds = dirichlet_multinomial
 
 
 class DirichletMultinomialTest(test.TestCase):
diff --git a/tensorflow/contrib/distributions/python/kernel_tests/dirichlet_test.py b/tensorflow/python/kernel_tests/distributions/dirichlet_test.py
similarity index 94%
rename from tensorflow/contrib/distributions/python/kernel_tests/dirichlet_test.py
rename to tensorflow/python/kernel_tests/distributions/dirichlet_test.py
index cd634da09dde3227cb09ef68150790fc67eec747..a2f1de5aaf3a75c1cfac820cc4494af34d082250 100644
--- a/tensorflow/contrib/distributions/python/kernel_tests/dirichlet_test.py
+++ b/tensorflow/python/kernel_tests/distributions/dirichlet_test.py
@@ -16,14 +16,29 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import importlib
+
 import numpy as np
-from scipy import stats
-from tensorflow.contrib.distributions.python.ops import dirichlet as dirichlet_lib
+
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import tensor_shape
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import math_ops
+from tensorflow.python.ops.distributions import dirichlet as dirichlet_lib
 from tensorflow.python.platform import test
+from tensorflow.python.platform import tf_logging
+
+
+def try_import(name):  # pylint: disable=invalid-name
+  module = None
+  try:
+    module = importlib.import_module(name)
+  except ImportError as e:
+    tf_logging.warning("Could not import %s: %s" % (name, str(e)))
+  return module
+
+
+stats = try_import("scipy.stats")
 
 
 class DirichletTest(test.TestCase):
@@ -132,9 +147,11 @@ class DirichletTest(test.TestCase):
   def testMean(self):
     with self.test_session():
       alpha = [1., 2, 3]
-      expected_mean = stats.dirichlet.mean(alpha)
       dirichlet = dirichlet_lib.Dirichlet(concentration=alpha)
       self.assertEqual(dirichlet.mean().get_shape(), [3])
+      if not stats:
+        return
+      expected_mean = stats.dirichlet.mean(alpha)
       self.assertAllClose(dirichlet.mean().eval(), expected_mean)
 
   def testCovarianceFromSampling(self):
@@ -177,11 +194,13 @@ class DirichletTest(test.TestCase):
     with self.test_session():
       alpha = [1., 2, 3]
       denominator = np.sum(alpha)**2 * (np.sum(alpha) + 1)
+      dirichlet = dirichlet_lib.Dirichlet(concentration=alpha)
+      self.assertEqual(dirichlet.covariance().get_shape(), (3, 3))
+      if not stats:
+        return
       expected_covariance = np.diag(stats.dirichlet.var(alpha))
       expected_covariance += [[0., -2, -3], [-2, 0, -6],
                               [-3, -6, 0]] / denominator
-      dirichlet = dirichlet_lib.Dirichlet(concentration=alpha)
-      self.assertEqual(dirichlet.covariance().get_shape(), (3, 3))
       self.assertAllClose(dirichlet.covariance().eval(), expected_covariance)
 
   def testMode(self):
@@ -213,9 +232,11 @@ class DirichletTest(test.TestCase):
   def testEntropy(self):
     with self.test_session():
       alpha = [1., 2, 3]
-      expected_entropy = stats.dirichlet.entropy(alpha)
       dirichlet = dirichlet_lib.Dirichlet(concentration=alpha)
       self.assertEqual(dirichlet.entropy().get_shape(), ())
+      if not stats:
+        return
+      expected_entropy = stats.dirichlet.entropy(alpha)
       self.assertAllClose(dirichlet.entropy().eval(), expected_entropy)
 
   def testSample(self):
@@ -227,6 +248,8 @@ class DirichletTest(test.TestCase):
       sample_values = samples.eval()
       self.assertEqual(sample_values.shape, (100000, 2))
       self.assertTrue(np.all(sample_values > 0.0))
+      if not stats:
+        return
       self.assertLess(
           stats.kstest(
               # Beta is a univariate distribution.
diff --git a/tensorflow/contrib/distributions/python/kernel_tests/exponential_test.py b/tensorflow/python/kernel_tests/distributions/exponential_test.py
similarity index 88%
rename from tensorflow/contrib/distributions/python/kernel_tests/exponential_test.py
rename to tensorflow/python/kernel_tests/distributions/exponential_test.py
index 617120241383c1574ae88ce5b7ee5a95bbc94eba..7afdf0f947605c6b982e8bf7defdd6224180e089 100644
--- a/tensorflow/contrib/distributions/python/kernel_tests/exponential_test.py
+++ b/tensorflow/python/kernel_tests/distributions/exponential_test.py
@@ -18,13 +18,28 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import importlib
+
 import numpy as np
-from scipy import stats
-from tensorflow.contrib.distributions.python.ops import exponential as exponential_lib
+
 from tensorflow.python.client import session
 from tensorflow.python.framework import constant_op
 from tensorflow.python.ops import nn_ops
+from tensorflow.python.ops.distributions import exponential as exponential_lib
 from tensorflow.python.platform import test
+from tensorflow.python.platform import tf_logging
+
+
+def try_import(name):  # pylint: disable=invalid-name
+  module = None
+  try:
+    module = importlib.import_module(name)
+  except ImportError as e:
+    tf_logging.warning("Could not import %s: %s" % (name, str(e)))
+  return module
+
+
+stats = try_import("scipy.stats")
 
 
 class ExponentialTest(test.TestCase):
@@ -36,14 +51,17 @@ class ExponentialTest(test.TestCase):
       lam_v = 2.0
       x = np.array([2.5, 2.5, 4.0, 0.1, 1.0, 2.0], dtype=np.float32)
       exponential = exponential_lib.Exponential(rate=lam)
-      expected_log_pdf = stats.expon.logpdf(x, scale=1 / lam_v)
 
       log_pdf = exponential.log_prob(x)
       self.assertEqual(log_pdf.get_shape(), (6,))
-      self.assertAllClose(log_pdf.eval(), expected_log_pdf)
 
       pdf = exponential.prob(x)
       self.assertEqual(pdf.get_shape(), (6,))
+
+      if not stats:
+        return
+      expected_log_pdf = stats.expon.logpdf(x, scale=1 / lam_v)
+      self.assertAllClose(log_pdf.eval(), expected_log_pdf)
       self.assertAllClose(pdf.eval(), np.exp(expected_log_pdf))
 
   def testExponentialCDF(self):
@@ -54,34 +72,43 @@ class ExponentialTest(test.TestCase):
       x = np.array([2.5, 2.5, 4.0, 0.1, 1.0, 2.0], dtype=np.float32)
 
       exponential = exponential_lib.Exponential(rate=lam)
-      expected_cdf = stats.expon.cdf(x, scale=1 / lam_v)
 
       cdf = exponential.cdf(x)
       self.assertEqual(cdf.get_shape(), (6,))
+
+      if not stats:
+        return
+      expected_cdf = stats.expon.cdf(x, scale=1 / lam_v)
       self.assertAllClose(cdf.eval(), expected_cdf)
 
   def testExponentialMean(self):
     with session.Session():
       lam_v = np.array([1.0, 4.0, 2.5])
-      expected_mean = stats.expon.mean(scale=1 / lam_v)
       exponential = exponential_lib.Exponential(rate=lam_v)
       self.assertEqual(exponential.mean().get_shape(), (3,))
+      if not stats:
+        return
+      expected_mean = stats.expon.mean(scale=1 / lam_v)
       self.assertAllClose(exponential.mean().eval(), expected_mean)
 
   def testExponentialVariance(self):
     with session.Session():
       lam_v = np.array([1.0, 4.0, 2.5])
-      expected_variance = stats.expon.var(scale=1 / lam_v)
       exponential = exponential_lib.Exponential(rate=lam_v)
       self.assertEqual(exponential.variance().get_shape(), (3,))
+      if not stats:
+        return
+      expected_variance = stats.expon.var(scale=1 / lam_v)
       self.assertAllClose(exponential.variance().eval(), expected_variance)
 
   def testExponentialEntropy(self):
     with session.Session():
       lam_v = np.array([1.0, 4.0, 2.5])
-      expected_entropy = stats.expon.entropy(scale=1 / lam_v)
       exponential = exponential_lib.Exponential(rate=lam_v)
       self.assertEqual(exponential.entropy().get_shape(), (3,))
+      if not stats:
+        return
+      expected_entropy = stats.expon.entropy(scale=1 / lam_v)
       self.assertAllClose(exponential.entropy().eval(), expected_entropy)
 
   def testExponentialSample(self):
@@ -95,6 +122,8 @@ class ExponentialTest(test.TestCase):
       sample_values = samples.eval()
       self.assertEqual(sample_values.shape, (100000, 2))
       self.assertFalse(np.any(sample_values < 0.0))
+      if not stats:
+        return
       for i in range(2):
         self.assertLess(
             stats.kstest(
@@ -116,6 +145,8 @@ class ExponentialTest(test.TestCase):
       sample_values = samples.eval()
 
       self.assertFalse(np.any(sample_values < 0.0))
+      if not stats:
+        return
       for i in range(2):
         self.assertLess(
             stats.kstest(
diff --git a/tensorflow/contrib/distributions/python/kernel_tests/gamma_test.py b/tensorflow/python/kernel_tests/distributions/gamma_test.py
similarity index 92%
rename from tensorflow/contrib/distributions/python/kernel_tests/gamma_test.py
rename to tensorflow/python/kernel_tests/distributions/gamma_test.py
index fd627102372d1fc9c19646729a715290a5851449..5e4813ac0762d2855d7fbe6754fe1466c29c06c9 100644
--- a/tensorflow/contrib/distributions/python/kernel_tests/gamma_test.py
+++ b/tensorflow/python/kernel_tests/distributions/gamma_test.py
@@ -17,18 +17,32 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import importlib
+
 import numpy as np
-from scipy import special
-from scipy import stats
 
-from tensorflow.contrib.distributions.python.ops import gamma as gamma_lib
-from tensorflow.contrib.distributions.python.ops import kullback_leibler
 from tensorflow.python.client import session
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import tensor_shape
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import nn_ops
+from tensorflow.python.ops.distributions import gamma as gamma_lib
+from tensorflow.python.ops.distributions import kullback_leibler
 from tensorflow.python.platform import test
+from tensorflow.python.platform import tf_logging
+
+
+def try_import(name):  # pylint: disable=invalid-name
+  module = None
+  try:
+    module = importlib.import_module(name)
+  except ImportError as e:
+    tf_logging.warning("Could not import %s: %s" % (name, str(e)))
+  return module
+
+
+special = try_import("scipy.special")
+stats = try_import("scipy.stats")
 
 
 class GammaTest(test.TestCase):
@@ -53,13 +67,14 @@ class GammaTest(test.TestCase):
       beta_v = 3.0
       x = np.array([2.5, 2.5, 4.0, 0.1, 1.0, 2.0], dtype=np.float32)
       gamma = gamma_lib.Gamma(concentration=alpha, rate=beta)
-      expected_log_pdf = stats.gamma.logpdf(x, alpha_v, scale=1 / beta_v)
       log_pdf = gamma.log_prob(x)
       self.assertEqual(log_pdf.get_shape(), (6,))
-      self.assertAllClose(log_pdf.eval(), expected_log_pdf)
-
       pdf = gamma.prob(x)
       self.assertEqual(pdf.get_shape(), (6,))
+      if not stats:
+        return
+      expected_log_pdf = stats.gamma.logpdf(x, alpha_v, scale=1 / beta_v)
+      self.assertAllClose(log_pdf.eval(), expected_log_pdf)
       self.assertAllClose(pdf.eval(), np.exp(expected_log_pdf))
 
   def testGammaLogPDFMultidimensional(self):
@@ -71,15 +86,16 @@ class GammaTest(test.TestCase):
       beta_v = np.array([3.0, 4.0])
       x = np.array([[2.5, 2.5, 4.0, 0.1, 1.0, 2.0]], dtype=np.float32).T
       gamma = gamma_lib.Gamma(concentration=alpha, rate=beta)
-      expected_log_pdf = stats.gamma.logpdf(x, alpha_v, scale=1 / beta_v)
       log_pdf = gamma.log_prob(x)
       log_pdf_values = log_pdf.eval()
       self.assertEqual(log_pdf.get_shape(), (6, 2))
-      self.assertAllClose(log_pdf_values, expected_log_pdf)
-
       pdf = gamma.prob(x)
       pdf_values = pdf.eval()
       self.assertEqual(pdf.get_shape(), (6, 2))
+      if not stats:
+        return
+      expected_log_pdf = stats.gamma.logpdf(x, alpha_v, scale=1 / beta_v)
+      self.assertAllClose(log_pdf_values, expected_log_pdf)
       self.assertAllClose(pdf_values, np.exp(expected_log_pdf))
 
   def testGammaLogPDFMultidimensionalBroadcasting(self):
@@ -91,15 +107,17 @@ class GammaTest(test.TestCase):
       beta_v = 3.0
       x = np.array([[2.5, 2.5, 4.0, 0.1, 1.0, 2.0]], dtype=np.float32).T
       gamma = gamma_lib.Gamma(concentration=alpha, rate=beta)
-      expected_log_pdf = stats.gamma.logpdf(x, alpha_v, scale=1 / beta_v)
       log_pdf = gamma.log_prob(x)
       log_pdf_values = log_pdf.eval()
       self.assertEqual(log_pdf.get_shape(), (6, 2))
-      self.assertAllClose(log_pdf_values, expected_log_pdf)
-
       pdf = gamma.prob(x)
       pdf_values = pdf.eval()
       self.assertEqual(pdf.get_shape(), (6, 2))
+
+      if not stats:
+        return
+      expected_log_pdf = stats.gamma.logpdf(x, alpha_v, scale=1 / beta_v)
+      self.assertAllClose(log_pdf_values, expected_log_pdf)
       self.assertAllClose(pdf_values, np.exp(expected_log_pdf))
 
   def testGammaCDF(self):
@@ -112,10 +130,11 @@ class GammaTest(test.TestCase):
       x = np.array([2.5, 2.5, 4.0, 0.1, 1.0, 2.0], dtype=np.float32)
 
       gamma = gamma_lib.Gamma(concentration=alpha, rate=beta)
-      expected_cdf = stats.gamma.cdf(x, alpha_v, scale=1 / beta_v)
-
       cdf = gamma.cdf(x)
       self.assertEqual(cdf.get_shape(), (6,))
+      if not stats:
+        return
+      expected_cdf = stats.gamma.cdf(x, alpha_v, scale=1 / beta_v)
       self.assertAllClose(cdf.eval(), expected_cdf)
 
   def testGammaMean(self):
@@ -123,8 +142,10 @@ class GammaTest(test.TestCase):
       alpha_v = np.array([1.0, 3.0, 2.5])
       beta_v = np.array([1.0, 4.0, 5.0])
       gamma = gamma_lib.Gamma(concentration=alpha_v, rate=beta_v)
-      expected_means = stats.gamma.mean(alpha_v, scale=1 / beta_v)
       self.assertEqual(gamma.mean().get_shape(), (3,))
+      if not stats:
+        return
+      expected_means = stats.gamma.mean(alpha_v, scale=1 / beta_v)
       self.assertAllClose(gamma.mean().eval(), expected_means)
 
   def testGammaModeAllowNanStatsIsFalseWorksWhenAllBatchMembersAreDefined(self):
@@ -165,8 +186,10 @@ class GammaTest(test.TestCase):
       alpha_v = np.array([1.0, 3.0, 2.5])
       beta_v = np.array([1.0, 4.0, 5.0])
       gamma = gamma_lib.Gamma(concentration=alpha_v, rate=beta_v)
-      expected_variances = stats.gamma.var(alpha_v, scale=1 / beta_v)
       self.assertEqual(gamma.variance().get_shape(), (3,))
+      if not stats:
+        return
+      expected_variances = stats.gamma.var(alpha_v, scale=1 / beta_v)
       self.assertAllClose(gamma.variance().eval(), expected_variances)
 
   def testGammaStd(self):
@@ -174,17 +197,21 @@ class GammaTest(test.TestCase):
       alpha_v = np.array([1.0, 3.0, 2.5])
       beta_v = np.array([1.0, 4.0, 5.0])
       gamma = gamma_lib.Gamma(concentration=alpha_v, rate=beta_v)
-      expected_stddev = stats.gamma.std(alpha_v, scale=1. / beta_v)
       self.assertEqual(gamma.stddev().get_shape(), (3,))
+      if not stats:
+        return
+      expected_stddev = stats.gamma.std(alpha_v, scale=1. / beta_v)
       self.assertAllClose(gamma.stddev().eval(), expected_stddev)
 
   def testGammaEntropy(self):
     with self.test_session():
       alpha_v = np.array([1.0, 3.0, 2.5])
       beta_v = np.array([1.0, 4.0, 5.0])
-      expected_entropy = stats.gamma.entropy(alpha_v, scale=1 / beta_v)
       gamma = gamma_lib.Gamma(concentration=alpha_v, rate=beta_v)
       self.assertEqual(gamma.entropy().get_shape(), (3,))
+      if not stats:
+        return
+      expected_entropy = stats.gamma.entropy(alpha_v, scale=1 / beta_v)
       self.assertAllClose(gamma.entropy().eval(), expected_entropy)
 
   def testGammaSampleSmallAlpha(self):
@@ -199,6 +226,9 @@ class GammaTest(test.TestCase):
       sample_values = samples.eval()
       self.assertEqual(samples.get_shape(), (n,))
       self.assertEqual(sample_values.shape, (n,))
+      self.assertTrue(self._kstest(alpha_v, beta_v, sample_values))
+      if not stats:
+        return
       self.assertAllClose(
           sample_values.mean(),
           stats.gamma.mean(
@@ -208,7 +238,6 @@ class GammaTest(test.TestCase):
           sample_values.var(),
           stats.gamma.var(alpha_v, scale=1 / beta_v),
           atol=.15)
-      self.assertTrue(self._kstest(alpha_v, beta_v, sample_values))
 
   def testGammaSample(self):
     with session.Session():
@@ -222,6 +251,9 @@ class GammaTest(test.TestCase):
       sample_values = samples.eval()
       self.assertEqual(samples.get_shape(), (n,))
       self.assertEqual(sample_values.shape, (n,))
+      self.assertTrue(self._kstest(alpha_v, beta_v, sample_values))
+      if not stats:
+        return
       self.assertAllClose(
           sample_values.mean(),
           stats.gamma.mean(
@@ -231,7 +263,6 @@ class GammaTest(test.TestCase):
           sample_values.var(),
           stats.gamma.var(alpha_v, scale=1 / beta_v),
           atol=.15)
-      self.assertTrue(self._kstest(alpha_v, beta_v, sample_values))
 
   def testGammaSampleMultiDimensional(self):
     with session.Session():
@@ -246,6 +277,8 @@ class GammaTest(test.TestCase):
       zeros = np.zeros_like(alpha_v + beta_v)  # 10 x 100
       alpha_bc = alpha_v + zeros
       beta_bc = beta_v + zeros
+      if not stats:
+        return
       self.assertAllClose(
           sample_values.mean(axis=0),
           stats.gamma.mean(
@@ -266,6 +299,8 @@ class GammaTest(test.TestCase):
 
   def _kstest(self, alpha, beta, samples):
     # Uses the Kolmogorov-Smirnov test for goodness of fit.
+    if not stats:
+      return True  # If we can't test, return that the test passes.
     ks, _ = stats.kstest(samples, stats.gamma(alpha, scale=1 / beta).cdf)
     # Return True when the test passes.
     return ks < 0.02
@@ -279,6 +314,12 @@ class GammaTest(test.TestCase):
       sample_vals, pdf_vals = sess.run([samples, pdfs])
       self.assertEqual(samples.get_shape(), (num, 2, 2))
       self.assertEqual(pdfs.get_shape(), (num, 2, 2))
+      self._assertIntegral(sample_vals[:, 0, 0], pdf_vals[:, 0, 0], err=0.02)
+      self._assertIntegral(sample_vals[:, 0, 1], pdf_vals[:, 0, 1], err=0.02)
+      self._assertIntegral(sample_vals[:, 1, 0], pdf_vals[:, 1, 0], err=0.02)
+      self._assertIntegral(sample_vals[:, 1, 1], pdf_vals[:, 1, 1], err=0.02)
+      if not stats:
+        return
       self.assertAllClose(
           stats.gamma.mean(
               [[7., 11.], [7., 11.]], scale=1 / np.array([[5., 5.], [6., 6.]])),
@@ -289,10 +330,6 @@ class GammaTest(test.TestCase):
                           scale=1 / np.array([[5., 5.], [6., 6.]])),
           sample_vals.var(axis=0),
           atol=.1)
-      self._assertIntegral(sample_vals[:, 0, 0], pdf_vals[:, 0, 0], err=0.02)
-      self._assertIntegral(sample_vals[:, 0, 1], pdf_vals[:, 0, 1], err=0.02)
-      self._assertIntegral(sample_vals[:, 1, 0], pdf_vals[:, 1, 0], err=0.02)
-      self._assertIntegral(sample_vals[:, 1, 1], pdf_vals[:, 1, 1], err=0.02)
 
   def _assertIntegral(self, sample_vals, pdf_vals, err=1e-3):
     s_p = zip(sample_vals, pdf_vals)
@@ -345,11 +382,15 @@ class GammaTest(test.TestCase):
       g1 = gamma_lib.Gamma(concentration=alpha1, rate=beta1)
       x = g0.sample(int(1e4), seed=0)
       kl_sample = math_ops.reduce_mean(g0.log_prob(x) - g1.log_prob(x), 0)
-      kl_actual = kullback_leibler.kl(g0, g1)
+      kl_actual = kullback_leibler.kl_divergence(g0, g1)
 
     # Execute graph.
     [kl_sample_, kl_actual_] = sess.run([kl_sample, kl_actual])
 
+    self.assertEqual(beta0.shape, kl_actual.get_shape())
+
+    if not special:
+      return
     kl_expected = ((alpha0 - alpha1) * special.digamma(alpha0)
                    + special.gammaln(alpha1)
                    - special.gammaln(alpha0)
@@ -357,7 +398,6 @@ class GammaTest(test.TestCase):
                    - alpha1 * np.log(beta1)
                    + alpha0 * (beta1 / beta0 - 1.))
 
-    self.assertEqual(beta0.shape, kl_actual.get_shape())
     self.assertAllClose(kl_expected, kl_actual_, atol=0., rtol=1e-6)
     self.assertAllClose(kl_sample_, kl_actual_, atol=0., rtol=1e-2)
 
diff --git a/tensorflow/contrib/distributions/python/kernel_tests/bijectors/identity_test.py b/tensorflow/python/kernel_tests/distributions/identity_bijector_test.py
similarity index 84%
rename from tensorflow/contrib/distributions/python/kernel_tests/bijectors/identity_test.py
rename to tensorflow/python/kernel_tests/distributions/identity_bijector_test.py
index 0969c293d408ac0d643a9718ff7850571c6558eb..e8f9d0b728d8f831becc82cdba0ae2bf3d5da52a 100644
--- a/tensorflow/contrib/distributions/python/kernel_tests/bijectors/identity_test.py
+++ b/tensorflow/python/kernel_tests/distributions/identity_bijector_test.py
@@ -18,8 +18,8 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.contrib.distributions.python.ops.bijectors.bijector_test_util import assert_scalar_congruency
-from tensorflow.contrib.distributions.python.ops.bijectors.identity import Identity
+from tensorflow.python.ops.distributions import bijector_test_util
+from tensorflow.python.ops.distributions import identity_bijector
 from tensorflow.python.platform import test
 
 
@@ -28,7 +28,7 @@ class IdentityBijectorTest(test.TestCase):
 
   def testBijector(self):
     with self.test_session():
-      bijector = Identity()
+      bijector = identity_bijector.Identity()
       self.assertEqual("identity", bijector.name)
       x = [[[0.], [1.]]]
       self.assertAllEqual(x, bijector.forward(x).eval())
@@ -38,8 +38,8 @@ class IdentityBijectorTest(test.TestCase):
 
   def testScalarCongruency(self):
     with self.test_session():
-      bijector = Identity()
-      assert_scalar_congruency(
+      bijector = identity_bijector.Identity()
+      bijector_test_util.assert_scalar_congruency(
           bijector, lower_x=-2., upper_x=2.)
 
 
diff --git a/tensorflow/contrib/distributions/python/kernel_tests/kullback_leibler_test.py b/tensorflow/python/kernel_tests/distributions/kullback_leibler_test.py
similarity index 76%
rename from tensorflow/contrib/distributions/python/kernel_tests/kullback_leibler_test.py
rename to tensorflow/python/kernel_tests/distributions/kullback_leibler_test.py
index 6b3d886e01b92099bb6476779564f7a5953d550d..b1d8da771612fe42a153a1a11b6cb26bdcb983a0 100644
--- a/tensorflow/contrib/distributions/python/kernel_tests/kullback_leibler_test.py
+++ b/tensorflow/python/kernel_tests/distributions/kullback_leibler_test.py
@@ -18,9 +18,9 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.contrib.distributions.python.ops import kullback_leibler
-from tensorflow.contrib.distributions.python.ops import normal
 from tensorflow.python.ops import array_ops
+from tensorflow.python.ops.distributions import kullback_leibler
+from tensorflow.python.ops.distributions import normal
 from tensorflow.python.platform import test
 
 # pylint: disable=protected-access
@@ -43,7 +43,7 @@ class KLTest(test.TestCase):
       return name
 
     a = MyDist(loc=0.0, scale=1.0)
-    self.assertEqual("OK", kullback_leibler.kl(a, a, name="OK"))
+    self.assertEqual("OK", kullback_leibler.kl_divergence(a, a, name="OK"))
 
   def testDomainErrorExceptions(self):
 
@@ -60,11 +60,11 @@ class KLTest(test.TestCase):
 
     with self.test_session():
       a = MyDistException(loc=0.0, scale=1.0)
-      kl = kullback_leibler.kl(a, a, allow_nan_stats=False)
+      kl = kullback_leibler.kl_divergence(a, a, allow_nan_stats=False)
       with self.assertRaisesOpError(
           "KL calculation between .* and .* returned NaN values"):
         kl.eval()
-      kl_ok = kullback_leibler.kl(a, a)
+      kl_ok = kullback_leibler.kl_divergence(a, a)
       self.assertAllEqual([float("nan")], kl_ok.eval())
 
   def testRegistrationFailures(self):
@@ -116,16 +116,16 @@ class KLTest(test.TestCase):
     sub2 = Sub2(loc=0.0, scale=1.0)
     sub11 = Sub11(loc=0.0, scale=1.0)
 
-    self.assertEqual("sub1-1", kullback_leibler.kl(sub1, sub1))
-    self.assertEqual("sub1-2", kullback_leibler.kl(sub1, sub2))
-    self.assertEqual("sub2-1", kullback_leibler.kl(sub2, sub1))
-    self.assertEqual("sub1-1", kullback_leibler.kl(sub11, sub11))
-    self.assertEqual("sub1-1", kullback_leibler.kl(sub11, sub1))
-    self.assertEqual("sub1-2", kullback_leibler.kl(sub11, sub2))
-    self.assertEqual("sub1-1", kullback_leibler.kl(sub11, sub1))
-    self.assertEqual("sub1-2", kullback_leibler.kl(sub11, sub2))
-    self.assertEqual("sub2-1", kullback_leibler.kl(sub2, sub11))
-    self.assertEqual("sub1-1", kullback_leibler.kl(sub1, sub11))
+    self.assertEqual("sub1-1", kullback_leibler.kl_divergence(sub1, sub1))
+    self.assertEqual("sub1-2", kullback_leibler.kl_divergence(sub1, sub2))
+    self.assertEqual("sub2-1", kullback_leibler.kl_divergence(sub2, sub1))
+    self.assertEqual("sub1-1", kullback_leibler.kl_divergence(sub11, sub11))
+    self.assertEqual("sub1-1", kullback_leibler.kl_divergence(sub11, sub1))
+    self.assertEqual("sub1-2", kullback_leibler.kl_divergence(sub11, sub2))
+    self.assertEqual("sub1-1", kullback_leibler.kl_divergence(sub11, sub1))
+    self.assertEqual("sub1-2", kullback_leibler.kl_divergence(sub11, sub2))
+    self.assertEqual("sub2-1", kullback_leibler.kl_divergence(sub2, sub11))
+    self.assertEqual("sub1-1", kullback_leibler.kl_divergence(sub1, sub11))
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/contrib/distributions/python/kernel_tests/laplace_test.py b/tensorflow/python/kernel_tests/distributions/laplace_test.py
similarity index 92%
rename from tensorflow/contrib/distributions/python/kernel_tests/laplace_test.py
rename to tensorflow/python/kernel_tests/distributions/laplace_test.py
index 1f58d495f02f5d9f894ff4cccc3ae6f32b21441b..55577386c450c7ac63f62c8a6dfd277af50e2387 100644
--- a/tensorflow/contrib/distributions/python/kernel_tests/laplace_test.py
+++ b/tensorflow/python/kernel_tests/distributions/laplace_test.py
@@ -17,15 +17,31 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import importlib
+
 import numpy as np
-from scipy import stats
-from tensorflow.contrib.distributions.python.ops import laplace as laplace_lib
+
 from tensorflow.python.client import session
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import tensor_shape
 from tensorflow.python.ops import nn_ops
+from tensorflow.python.ops.distributions import laplace as laplace_lib
 from tensorflow.python.platform import test
 
+from tensorflow.python.platform import tf_logging
+
+
+def try_import(name):  # pylint: disable=invalid-name
+  module = None
+  try:
+    module = importlib.import_module(name)
+  except ImportError as e:
+    tf_logging.warning("Could not import %s: %s" % (name, str(e)))
+  return module
+
+
+stats = try_import("scipy.stats")
+
 
 class LaplaceTest(test.TestCase):
 
@@ -49,9 +65,11 @@ class LaplaceTest(test.TestCase):
       scale_v = 3.0
       x = np.array([2.5, 2.5, 4.0, 0.1, 1.0, 2.0], dtype=np.float32)
       laplace = laplace_lib.Laplace(loc=loc, scale=scale)
-      expected_log_pdf = stats.laplace.logpdf(x, loc_v, scale=scale_v)
       log_pdf = laplace.log_prob(x)
       self.assertEqual(log_pdf.get_shape(), (6,))
+      if not stats:
+        return
+      expected_log_pdf = stats.laplace.logpdf(x, loc_v, scale=scale_v)
       self.assertAllClose(log_pdf.eval(), expected_log_pdf)
 
       pdf = laplace.prob(x)
@@ -67,15 +85,17 @@ class LaplaceTest(test.TestCase):
       scale_v = np.array([3.0, 4.0])
       x = np.array([[2.5, 2.5, 4.0, 0.1, 1.0, 2.0]], dtype=np.float32).T
       laplace = laplace_lib.Laplace(loc=loc, scale=scale)
-      expected_log_pdf = stats.laplace.logpdf(x, loc_v, scale=scale_v)
       log_pdf = laplace.log_prob(x)
       log_pdf_values = log_pdf.eval()
       self.assertEqual(log_pdf.get_shape(), (6, 2))
-      self.assertAllClose(log_pdf_values, expected_log_pdf)
 
       pdf = laplace.prob(x)
       pdf_values = pdf.eval()
       self.assertEqual(pdf.get_shape(), (6, 2))
+      if not stats:
+        return
+      expected_log_pdf = stats.laplace.logpdf(x, loc_v, scale=scale_v)
+      self.assertAllClose(log_pdf_values, expected_log_pdf)
       self.assertAllClose(pdf_values, np.exp(expected_log_pdf))
 
   def testLaplaceLogPDFMultidimensionalBroadcasting(self):
@@ -87,15 +107,17 @@ class LaplaceTest(test.TestCase):
       scale_v = 3.0
       x = np.array([[2.5, 2.5, 4.0, 0.1, 1.0, 2.0]], dtype=np.float32).T
       laplace = laplace_lib.Laplace(loc=loc, scale=scale)
-      expected_log_pdf = stats.laplace.logpdf(x, loc_v, scale=scale_v)
       log_pdf = laplace.log_prob(x)
       log_pdf_values = log_pdf.eval()
       self.assertEqual(log_pdf.get_shape(), (6, 2))
-      self.assertAllClose(log_pdf_values, expected_log_pdf)
 
       pdf = laplace.prob(x)
       pdf_values = pdf.eval()
       self.assertEqual(pdf.get_shape(), (6, 2))
+      if not stats:
+        return
+      expected_log_pdf = stats.laplace.logpdf(x, loc_v, scale=scale_v)
+      self.assertAllClose(log_pdf_values, expected_log_pdf)
       self.assertAllClose(pdf_values, np.exp(expected_log_pdf))
 
   def testLaplaceCDF(self):
@@ -108,10 +130,12 @@ class LaplaceTest(test.TestCase):
       x = np.array([2.5, 2.5, 4.0, 0.1, 1.0, 2.0], dtype=np.float32)
 
       laplace = laplace_lib.Laplace(loc=loc, scale=scale)
-      expected_cdf = stats.laplace.cdf(x, loc_v, scale=scale_v)
 
       cdf = laplace.cdf(x)
       self.assertEqual(cdf.get_shape(), (6,))
+      if not stats:
+        return
+      expected_cdf = stats.laplace.cdf(x, loc_v, scale=scale_v)
       self.assertAllClose(cdf.eval(), expected_cdf)
 
   def testLaplaceLogCDF(self):
@@ -124,10 +148,12 @@ class LaplaceTest(test.TestCase):
       x = np.array([-2.5, 2.5, -4.0, 0.1, 1.0, 2.0], dtype=np.float32)
 
       laplace = laplace_lib.Laplace(loc=loc, scale=scale)
-      expected_cdf = stats.laplace.logcdf(x, loc_v, scale=scale_v)
 
       cdf = laplace.log_cdf(x)
       self.assertEqual(cdf.get_shape(), (6,))
+      if not stats:
+        return
+      expected_cdf = stats.laplace.logcdf(x, loc_v, scale=scale_v)
       self.assertAllClose(cdf.eval(), expected_cdf)
 
   def testLaplaceLogSurvivalFunction(self):
@@ -140,10 +166,12 @@ class LaplaceTest(test.TestCase):
       x = np.array([-2.5, 2.5, -4.0, 0.1, 1.0, 2.0], dtype=np.float32)
 
       laplace = laplace_lib.Laplace(loc=loc, scale=scale)
-      expected_sf = stats.laplace.logsf(x, loc_v, scale=scale_v)
 
       sf = laplace.log_survival_function(x)
       self.assertEqual(sf.get_shape(), (6,))
+      if not stats:
+        return
+      expected_sf = stats.laplace.logsf(x, loc_v, scale=scale_v)
       self.assertAllClose(sf.eval(), expected_sf)
 
   def testLaplaceMean(self):
@@ -151,8 +179,10 @@ class LaplaceTest(test.TestCase):
       loc_v = np.array([1.0, 3.0, 2.5])
       scale_v = np.array([1.0, 4.0, 5.0])
       laplace = laplace_lib.Laplace(loc=loc_v, scale=scale_v)
-      expected_means = stats.laplace.mean(loc_v, scale=scale_v)
       self.assertEqual(laplace.mean().get_shape(), (3,))
+      if not stats:
+        return
+      expected_means = stats.laplace.mean(loc_v, scale=scale_v)
       self.assertAllClose(laplace.mean().eval(), expected_means)
 
   def testLaplaceMode(self):
@@ -168,8 +198,10 @@ class LaplaceTest(test.TestCase):
       loc_v = np.array([1.0, 3.0, 2.5])
       scale_v = np.array([1.0, 4.0, 5.0])
       laplace = laplace_lib.Laplace(loc=loc_v, scale=scale_v)
-      expected_variances = stats.laplace.var(loc_v, scale=scale_v)
       self.assertEqual(laplace.variance().get_shape(), (3,))
+      if not stats:
+        return
+      expected_variances = stats.laplace.var(loc_v, scale=scale_v)
       self.assertAllClose(laplace.variance().eval(), expected_variances)
 
   def testLaplaceStd(self):
@@ -177,17 +209,21 @@ class LaplaceTest(test.TestCase):
       loc_v = np.array([1.0, 3.0, 2.5])
       scale_v = np.array([1.0, 4.0, 5.0])
       laplace = laplace_lib.Laplace(loc=loc_v, scale=scale_v)
-      expected_stddev = stats.laplace.std(loc_v, scale=scale_v)
       self.assertEqual(laplace.stddev().get_shape(), (3,))
+      if not stats:
+        return
+      expected_stddev = stats.laplace.std(loc_v, scale=scale_v)
       self.assertAllClose(laplace.stddev().eval(), expected_stddev)
 
   def testLaplaceEntropy(self):
     with self.test_session():
       loc_v = np.array([1.0, 3.0, 2.5])
       scale_v = np.array([1.0, 4.0, 5.0])
-      expected_entropy = stats.laplace.entropy(loc_v, scale=scale_v)
       laplace = laplace_lib.Laplace(loc=loc_v, scale=scale_v)
       self.assertEqual(laplace.entropy().get_shape(), (3,))
+      if not stats:
+        return
+      expected_entropy = stats.laplace.entropy(loc_v, scale=scale_v)
       self.assertAllClose(laplace.entropy().eval(), expected_entropy)
 
   def testLaplaceSample(self):
@@ -202,6 +238,8 @@ class LaplaceTest(test.TestCase):
       sample_values = samples.eval()
       self.assertEqual(samples.get_shape(), (n,))
       self.assertEqual(sample_values.shape, (n,))
+      if not stats:
+        return
       self.assertAllClose(
           sample_values.mean(),
           stats.laplace.mean(
@@ -228,6 +266,8 @@ class LaplaceTest(test.TestCase):
       zeros = np.zeros_like(loc_v + scale_v)  # 10 x 100
       loc_bc = loc_v + zeros
       scale_bc = scale_v + zeros
+      if not stats:
+        return
       self.assertAllClose(
           sample_values.mean(axis=0),
           stats.laplace.mean(
@@ -250,6 +290,8 @@ class LaplaceTest(test.TestCase):
 
   def _kstest(self, loc, scale, samples):
     # Uses the Kolmogorov-Smirnov test for goodness of fit.
+    if not stats:
+      return True  # If scipy isn't available, return "True" for passing
     ks, _ = stats.kstest(samples, stats.laplace(loc, scale=scale).cdf)
     # Return True when the test passes.
     return ks < 0.02
@@ -263,6 +305,12 @@ class LaplaceTest(test.TestCase):
       sample_vals, pdf_vals = sess.run([samples, pdfs])
       self.assertEqual(samples.get_shape(), (num, 2, 2))
       self.assertEqual(pdfs.get_shape(), (num, 2, 2))
+      self._assertIntegral(sample_vals[:, 0, 0], pdf_vals[:, 0, 0], err=0.02)
+      self._assertIntegral(sample_vals[:, 0, 1], pdf_vals[:, 0, 1], err=0.02)
+      self._assertIntegral(sample_vals[:, 1, 0], pdf_vals[:, 1, 0], err=0.02)
+      self._assertIntegral(sample_vals[:, 1, 1], pdf_vals[:, 1, 1], err=0.02)
+      if not stats:
+        return
       self.assertAllClose(
           stats.laplace.mean(
               [[7., 11.], [7., 11.]], scale=np.array([[5., 5.], [6., 6.]])),
@@ -275,10 +323,6 @@ class LaplaceTest(test.TestCase):
           sample_vals.var(axis=0),
           rtol=0.05,
           atol=0.)
-      self._assertIntegral(sample_vals[:, 0, 0], pdf_vals[:, 0, 0], err=0.02)
-      self._assertIntegral(sample_vals[:, 0, 1], pdf_vals[:, 0, 1], err=0.02)
-      self._assertIntegral(sample_vals[:, 1, 0], pdf_vals[:, 1, 0], err=0.02)
-      self._assertIntegral(sample_vals[:, 1, 1], pdf_vals[:, 1, 1], err=0.02)
 
   def _assertIntegral(self, sample_vals, pdf_vals, err=1e-3):
     s_p = zip(sample_vals, pdf_vals)
diff --git a/tensorflow/contrib/distributions/python/kernel_tests/multinomial_test.py b/tensorflow/python/kernel_tests/distributions/multinomial_test.py
similarity index 87%
rename from tensorflow/contrib/distributions/python/kernel_tests/multinomial_test.py
rename to tensorflow/python/kernel_tests/distributions/multinomial_test.py
index b1c0c9f7a9db849acc9cd6c9446ed5fed02e603d..80caf10391d7e9e9735b71a48c6676812f4d637e 100644
--- a/tensorflow/contrib/distributions/python/kernel_tests/multinomial_test.py
+++ b/tensorflow/python/kernel_tests/distributions/multinomial_test.py
@@ -17,15 +17,14 @@ from __future__ import division
 from __future__ import print_function
 
 import numpy as np
-from tensorflow.contrib import distributions
+
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import tensor_shape
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import math_ops
+from tensorflow.python.ops.distributions import multinomial
 from tensorflow.python.platform import test
 
-ds = distributions
-
 
 class MultinomialTest(test.TestCase):
 
@@ -35,7 +34,7 @@ class MultinomialTest(test.TestCase):
   def testSimpleShapes(self):
     with self.test_session():
       p = [.1, .3, .6]
-      dist = ds.Multinomial(total_count=1., probs=p)
+      dist = multinomial.Multinomial(total_count=1., probs=p)
       self.assertEqual(3, dist.event_shape_tensor().eval())
       self.assertAllEqual([], dist.batch_shape_tensor().eval())
       self.assertEqual(tensor_shape.TensorShape([3]), dist.event_shape)
@@ -45,7 +44,7 @@ class MultinomialTest(test.TestCase):
     with self.test_session():
       p = 0.5 * np.ones([3, 2, 2], dtype=np.float32)
       n = [[3., 2], [4, 5], [6, 7]]
-      dist = ds.Multinomial(total_count=n, probs=p)
+      dist = multinomial.Multinomial(total_count=n, probs=p)
       self.assertEqual(2, dist.event_shape_tensor().eval())
       self.assertAllEqual([3, 2], dist.batch_shape_tensor().eval())
       self.assertEqual(tensor_shape.TensorShape([2]), dist.event_shape)
@@ -55,14 +54,14 @@ class MultinomialTest(test.TestCase):
     p = [[0.1, 0.2, 0.7], [0.2, 0.3, 0.5]]
     n = [[3.], [4]]
     with self.test_session():
-      dist = ds.Multinomial(total_count=n, probs=p)
+      dist = multinomial.Multinomial(total_count=n, probs=p)
       self.assertEqual((2, 1), dist.total_count.get_shape())
       self.assertAllClose(n, dist.total_count.eval())
 
   def testP(self):
     p = [[0.1, 0.2, 0.7]]
     with self.test_session():
-      dist = ds.Multinomial(total_count=3., probs=p)
+      dist = multinomial.Multinomial(total_count=3., probs=p)
       self.assertEqual((1, 3), dist.probs.get_shape())
       self.assertEqual((1, 3), dist.logits.get_shape())
       self.assertAllClose(p, dist.probs.eval())
@@ -71,7 +70,7 @@ class MultinomialTest(test.TestCase):
     p = np.array([[0.1, 0.2, 0.7]], dtype=np.float32)
     logits = np.log(p) - 50.
     with self.test_session():
-      multinom = ds.Multinomial(total_count=3., logits=logits)
+      multinom = multinomial.Multinomial(total_count=3., logits=logits)
       self.assertEqual((1, 3), multinom.probs.get_shape())
       self.assertEqual((1, 3), multinom.logits.get_shape())
       self.assertAllClose(p, multinom.probs.eval())
@@ -81,7 +80,7 @@ class MultinomialTest(test.TestCase):
     p = [[0.1, 0.2, 0.7]]
     n = [[5.]]
     with self.test_session():
-      dist = ds.Multinomial(total_count=n, probs=p, validate_args=True)
+      dist = multinomial.Multinomial(total_count=n, probs=p, validate_args=True)
       dist.prob([2., 3, 0]).eval()
       dist.prob([3., 0, 2]).eval()
       with self.assertRaisesOpError("must be non-negative"):
@@ -94,7 +93,8 @@ class MultinomialTest(test.TestCase):
     n = [[5.]]
     with self.test_session():
       # No errors with integer n.
-      multinom = ds.Multinomial(total_count=n, probs=p, validate_args=True)
+      multinom = multinomial.Multinomial(
+          total_count=n, probs=p, validate_args=True)
       multinom.prob([2., 1, 2]).eval()
       multinom.prob([3., 0, 2]).eval()
       # Counts don't sum to n.
@@ -106,7 +106,8 @@ class MultinomialTest(test.TestCase):
           "cannot contain fractional components."):
         multinom.prob(x).eval(feed_dict={x: [1.0, 2.5, 1.5]})
 
-      multinom = ds.Multinomial(total_count=n, probs=p, validate_args=False)
+      multinom = multinomial.Multinomial(
+          total_count=n, probs=p, validate_args=False)
       multinom.prob([1., 2., 2.]).eval()
       # Non-integer arguments work.
       multinom.prob([1.0, 2.5, 1.5]).eval()
@@ -116,7 +117,7 @@ class MultinomialTest(test.TestCase):
       # Both zero-batches.  No broadcast
       p = [0.5, 0.5]
       counts = [1., 0]
-      pmf = ds.Multinomial(total_count=1., probs=p).prob(counts)
+      pmf = multinomial.Multinomial(total_count=1., probs=p).prob(counts)
       self.assertAllClose(0.5, pmf.eval())
       self.assertEqual((), pmf.get_shape())
 
@@ -125,7 +126,7 @@ class MultinomialTest(test.TestCase):
       # Both zero-batches.  No broadcast
       p = [0.1, 0.9]
       counts = [3., 2]
-      dist = ds.Multinomial(total_count=5., probs=p)
+      dist = multinomial.Multinomial(total_count=5., probs=p)
       pmf = dist.prob(counts)
       # 5 choose 3 = 5 choose 2 = 10. 10 * (.9)^2 * (.1)^3 = 81/10000.
       self.assertAllClose(81. / 10000, pmf.eval())
@@ -135,7 +136,7 @@ class MultinomialTest(test.TestCase):
     with self.test_session():
       p = [[0.1, 0.9]]
       counts = [[1., 0], [0, 1]]
-      pmf = ds.Multinomial(total_count=1., probs=p).prob(counts)
+      pmf = multinomial.Multinomial(total_count=1., probs=p).prob(counts)
       self.assertAllClose([0.1, 0.9], pmf.eval())
       self.assertEqual((2), pmf.get_shape())
 
@@ -143,7 +144,7 @@ class MultinomialTest(test.TestCase):
     with self.test_session():
       p = [0.1, 0.9]
       counts = [[1., 0], [0, 1]]
-      pmf = ds.Multinomial(total_count=1., probs=p).prob(counts)
+      pmf = multinomial.Multinomial(total_count=1., probs=p).prob(counts)
       self.assertAllClose([0.1, 0.9], pmf.eval())
       self.assertEqual((2), pmf.get_shape())
 
@@ -151,7 +152,7 @@ class MultinomialTest(test.TestCase):
     with self.test_session():
       p = [[0.1, 0.9], [0.7, 0.3]]
       counts = [[1., 0]]
-      pmf = ds.Multinomial(total_count=1., probs=p).prob(counts)
+      pmf = multinomial.Multinomial(total_count=1., probs=p).prob(counts)
       self.assertAllClose(pmf.eval(), [0.1, 0.7])
       self.assertEqual((2), pmf.get_shape())
 
@@ -159,7 +160,7 @@ class MultinomialTest(test.TestCase):
     with self.test_session():
       p = [[0.1, 0.9], [0.7, 0.3]]
       counts = [1., 0]
-      pmf = ds.Multinomial(total_count=1., probs=p).prob(counts)
+      pmf = multinomial.Multinomial(total_count=1., probs=p).prob(counts)
       self.assertAllClose(pmf.eval(), [0.1, 0.7])
       self.assertEqual(pmf.get_shape(), (2))
 
@@ -171,7 +172,7 @@ class MultinomialTest(test.TestCase):
       n = [[3., 3], [3, 3]]
       # [2]
       counts = [2., 1]
-      pmf = ds.Multinomial(total_count=n, probs=p).prob(counts)
+      pmf = multinomial.Multinomial(total_count=n, probs=p).prob(counts)
       pmf.eval()
       self.assertEqual(pmf.get_shape(), (2, 2))
 
@@ -180,7 +181,7 @@ class MultinomialTest(test.TestCase):
       p = [0.1, 0.9]
       counts = [3., 2]
       n = np.full([4, 3], 5., dtype=np.float32)
-      pmf = ds.Multinomial(total_count=n, probs=p).prob(counts)
+      pmf = multinomial.Multinomial(total_count=n, probs=p).prob(counts)
       pmf.eval()
       self.assertEqual((4, 3), pmf.get_shape())
 
@@ -188,7 +189,7 @@ class MultinomialTest(test.TestCase):
     with self.test_session():
       n = 5.
       p = [0.1, 0.2, 0.7]
-      dist = ds.Multinomial(total_count=n, probs=p)
+      dist = multinomial.Multinomial(total_count=n, probs=p)
       expected_means = 5 * np.array(p, dtype=np.float32)
       self.assertEqual((3,), dist.mean().get_shape())
       self.assertAllClose(expected_means, dist.mean().eval())
@@ -197,7 +198,7 @@ class MultinomialTest(test.TestCase):
     with self.test_session():
       n = 5.
       p = [0.1, 0.2, 0.7]
-      dist = ds.Multinomial(total_count=n, probs=p)
+      dist = multinomial.Multinomial(total_count=n, probs=p)
       expected_covariances = [[9. / 20, -1 / 10, -7 / 20],
                               [-1 / 10, 4 / 5, -7 / 10],
                               [-7 / 20, -7 / 10, 21 / 20]]
@@ -210,7 +211,7 @@ class MultinomialTest(test.TestCase):
       n = [5.] * 2
       # Shape [4, 1, 2]
       p = [[[0.1, 0.9]], [[0.1, 0.9]]] * 2
-      dist = ds.Multinomial(total_count=n, probs=p)
+      dist = multinomial.Multinomial(total_count=n, probs=p)
       # Shape [2, 2]
       inner_var = [[9. / 20, -9 / 20], [-9 / 20, 9 / 20]]
       # Shape [4, 2, 2, 2]
@@ -228,8 +229,8 @@ class MultinomialTest(test.TestCase):
     ns2 = np.random.randint(low=1, high=11, size=[6, 1]).astype(np.float32)
 
     with self.test_session():
-      dist = ds.Multinomial(ns, p)
-      dist2 = ds.Multinomial(ns2, p2)
+      dist = multinomial.Multinomial(ns, p)
+      dist2 = multinomial.Multinomial(ns2, p2)
 
       covariance = dist.covariance()
       covariance2 = dist2.covariance()
@@ -246,7 +247,8 @@ class MultinomialTest(test.TestCase):
     # doesn't support different total counts.
     n = np.float32(5)
     with self.test_session() as sess:
-      dist = ds.Multinomial(n, theta)  # batch_shape=[2], event_shape=[3]
+      # batch_shape=[2], event_shape=[3]
+      dist = multinomial.Multinomial(n, theta)
       x = dist.sample(int(250e3), seed=1)
       sample_mean = math_ops.reduce_mean(x, 0)
       x_centered = x - sample_mean[array_ops.newaxis, ...]
@@ -281,7 +283,7 @@ class MultinomialTest(test.TestCase):
 
   def testSampleUnbiasedNonScalarBatch(self):
     with self.test_session() as sess:
-      dist = ds.Multinomial(
+      dist = multinomial.Multinomial(
           total_count=5.,
           logits=math_ops.log(2. * self._rng.rand(4, 3, 2).astype(np.float32)))
       n = int(3e3)
@@ -310,7 +312,7 @@ class MultinomialTest(test.TestCase):
 
   def testSampleUnbiasedScalarBatch(self):
     with self.test_session() as sess:
-      dist = ds.Multinomial(
+      dist = multinomial.Multinomial(
           total_count=5.,
           logits=math_ops.log(2. * self._rng.rand(4).astype(np.float32)))
       n = int(5e3)
diff --git a/tensorflow/contrib/distributions/python/kernel_tests/normal_test.py b/tensorflow/python/kernel_tests/distributions/normal_test.py
similarity index 96%
rename from tensorflow/contrib/distributions/python/kernel_tests/normal_test.py
rename to tensorflow/python/kernel_tests/distributions/normal_test.py
index bda06df0f79e681db9ccc7942185fd0b43257c63..07c7d6d11d0f3bcecfd1029295d3249c3ea8584b 100644
--- a/tensorflow/contrib/distributions/python/kernel_tests/normal_test.py
+++ b/tensorflow/python/kernel_tests/distributions/normal_test.py
@@ -18,13 +18,11 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import importlib
 import math
 
 import numpy as np
-from scipy import stats
 
-from tensorflow.contrib.distributions.python.ops import kullback_leibler
-from tensorflow.contrib.distributions.python.ops import normal as normal_lib
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
@@ -33,7 +31,21 @@ from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import gradients_impl
 from tensorflow.python.ops import nn_ops
 from tensorflow.python.ops import variables
+from tensorflow.python.ops.distributions import kullback_leibler
+from tensorflow.python.ops.distributions import normal as normal_lib
 from tensorflow.python.platform import test
+from tensorflow.python.platform import tf_logging
+
+
+def try_import(name):  # pylint: disable=invalid-name
+  module = None
+  try:
+    module = importlib.import_module(name)
+  except ImportError as e:
+    tf_logging.warning("Could not import %s: %s" % (name, str(e)))
+  return module
+
+stats = try_import("scipy.stats")
 
 
 class NormalTest(test.TestCase):
@@ -90,10 +102,8 @@ class NormalTest(test.TestCase):
       sigma = constant_op.constant([math.sqrt(10.0)] * batch_size)
       x = np.array([-2.5, 2.5, 4.0, 0.0, -1.0, 2.0], dtype=np.float32)
       normal = normal_lib.Normal(loc=mu, scale=sigma)
-      expected_log_pdf = stats.norm(mu.eval(), sigma.eval()).logpdf(x)
 
       log_pdf = normal.log_prob(x)
-      self.assertAllClose(expected_log_pdf, log_pdf.eval())
       self.assertAllEqual(normal.batch_shape_tensor().eval(),
                           log_pdf.get_shape())
       self.assertAllEqual(normal.batch_shape_tensor().eval(),
@@ -102,12 +112,17 @@ class NormalTest(test.TestCase):
       self.assertAllEqual(normal.batch_shape, log_pdf.eval().shape)
 
       pdf = normal.prob(x)
-      self.assertAllClose(np.exp(expected_log_pdf), pdf.eval())
       self.assertAllEqual(normal.batch_shape_tensor().eval(), pdf.get_shape())
       self.assertAllEqual(normal.batch_shape_tensor().eval(), pdf.eval().shape)
       self.assertAllEqual(normal.batch_shape, pdf.get_shape())
       self.assertAllEqual(normal.batch_shape, pdf.eval().shape)
 
+      if not stats:
+        return
+      expected_log_pdf = stats.norm(mu.eval(), sigma.eval()).logpdf(x)
+      self.assertAllClose(expected_log_pdf, log_pdf.eval())
+      self.assertAllClose(np.exp(expected_log_pdf), pdf.eval())
+
   def testNormalLogPDFMultidimensional(self):
     with self.test_session():
       batch_size = 6
@@ -116,12 +131,10 @@ class NormalTest(test.TestCase):
                                    batch_size)
       x = np.array([[-2.5, 2.5, 4.0, 0.0, -1.0, 2.0]], dtype=np.float32).T
       normal = normal_lib.Normal(loc=mu, scale=sigma)
-      expected_log_pdf = stats.norm(mu.eval(), sigma.eval()).logpdf(x)
 
       log_pdf = normal.log_prob(x)
       log_pdf_values = log_pdf.eval()
       self.assertEqual(log_pdf.get_shape(), (6, 2))
-      self.assertAllClose(expected_log_pdf, log_pdf_values)
       self.assertAllEqual(normal.batch_shape_tensor().eval(),
                           log_pdf.get_shape())
       self.assertAllEqual(normal.batch_shape_tensor().eval(),
@@ -132,12 +145,17 @@ class NormalTest(test.TestCase):
       pdf = normal.prob(x)
       pdf_values = pdf.eval()
       self.assertEqual(pdf.get_shape(), (6, 2))
-      self.assertAllClose(np.exp(expected_log_pdf), pdf_values)
       self.assertAllEqual(normal.batch_shape_tensor().eval(), pdf.get_shape())
       self.assertAllEqual(normal.batch_shape_tensor().eval(), pdf_values.shape)
       self.assertAllEqual(normal.batch_shape, pdf.get_shape())
       self.assertAllEqual(normal.batch_shape, pdf_values.shape)
 
+      if not stats:
+        return
+      expected_log_pdf = stats.norm(mu.eval(), sigma.eval()).logpdf(x)
+      self.assertAllClose(expected_log_pdf, log_pdf_values)
+      self.assertAllClose(np.exp(expected_log_pdf), pdf_values)
+
   def testNormalCDF(self):
     with self.test_session():
       batch_size = 50
@@ -146,14 +164,15 @@ class NormalTest(test.TestCase):
       x = np.linspace(-8.0, 8.0, batch_size).astype(np.float64)
 
       normal = normal_lib.Normal(loc=mu, scale=sigma)
-      expected_cdf = stats.norm(mu, sigma).cdf(x)
-
       cdf = normal.cdf(x)
-      self.assertAllClose(expected_cdf, cdf.eval(), atol=0)
       self.assertAllEqual(normal.batch_shape_tensor().eval(), cdf.get_shape())
       self.assertAllEqual(normal.batch_shape_tensor().eval(), cdf.eval().shape)
       self.assertAllEqual(normal.batch_shape, cdf.get_shape())
       self.assertAllEqual(normal.batch_shape, cdf.eval().shape)
+      if not stats:
+        return
+      expected_cdf = stats.norm(mu, sigma).cdf(x)
+      self.assertAllClose(expected_cdf, cdf.eval(), atol=0)
 
   def testNormalSurvivalFunction(self):
     with self.test_session():
@@ -163,14 +182,16 @@ class NormalTest(test.TestCase):
       x = np.linspace(-8.0, 8.0, batch_size).astype(np.float64)
 
       normal = normal_lib.Normal(loc=mu, scale=sigma)
-      expected_sf = stats.norm(mu, sigma).sf(x)
 
       sf = normal.survival_function(x)
-      self.assertAllClose(expected_sf, sf.eval(), atol=0)
       self.assertAllEqual(normal.batch_shape_tensor().eval(), sf.get_shape())
       self.assertAllEqual(normal.batch_shape_tensor().eval(), sf.eval().shape)
       self.assertAllEqual(normal.batch_shape, sf.get_shape())
       self.assertAllEqual(normal.batch_shape, sf.eval().shape)
+      if not stats:
+        return
+      expected_sf = stats.norm(mu, sigma).sf(x)
+      self.assertAllClose(expected_sf, sf.eval(), atol=0)
 
   def testNormalLogCDF(self):
     with self.test_session():
@@ -180,15 +201,18 @@ class NormalTest(test.TestCase):
       x = np.linspace(-100.0, 10.0, batch_size).astype(np.float64)
 
       normal = normal_lib.Normal(loc=mu, scale=sigma)
-      expected_cdf = stats.norm(mu, sigma).logcdf(x)
 
       cdf = normal.log_cdf(x)
-      self.assertAllClose(expected_cdf, cdf.eval(), atol=0, rtol=1e-5)
       self.assertAllEqual(normal.batch_shape_tensor().eval(), cdf.get_shape())
       self.assertAllEqual(normal.batch_shape_tensor().eval(), cdf.eval().shape)
       self.assertAllEqual(normal.batch_shape, cdf.get_shape())
       self.assertAllEqual(normal.batch_shape, cdf.eval().shape)
 
+      if not stats:
+        return
+      expected_cdf = stats.norm(mu, sigma).logcdf(x)
+      self.assertAllClose(expected_cdf, cdf.eval(), atol=0, rtol=1e-5)
+
   def testFiniteGradientAtDifficultPoints(self):
     for dtype in [np.float32, np.float64]:
       g = ops.Graph()
@@ -217,15 +241,18 @@ class NormalTest(test.TestCase):
       x = np.linspace(-10.0, 100.0, batch_size).astype(np.float64)
 
       normal = normal_lib.Normal(loc=mu, scale=sigma)
-      expected_sf = stats.norm(mu, sigma).logsf(x)
 
       sf = normal.log_survival_function(x)
-      self.assertAllClose(expected_sf, sf.eval(), atol=0, rtol=1e-5)
       self.assertAllEqual(normal.batch_shape_tensor().eval(), sf.get_shape())
       self.assertAllEqual(normal.batch_shape_tensor().eval(), sf.eval().shape)
       self.assertAllEqual(normal.batch_shape, sf.get_shape())
       self.assertAllEqual(normal.batch_shape, sf.eval().shape)
 
+      if not stats:
+        return
+      expected_sf = stats.norm(mu, sigma).logsf(x)
+      self.assertAllClose(expected_sf, sf.eval(), atol=0, rtol=1e-5)
+
   def testNormalEntropyWithScalarInputs(self):
     # Scipy.stats.norm cannot deal with the shapes in the other test.
     with self.test_session():
@@ -233,16 +260,18 @@ class NormalTest(test.TestCase):
       sigma_v = 4.56
       normal = normal_lib.Normal(loc=mu_v, scale=sigma_v)
 
-      # scipy.stats.norm cannot deal with these shapes.
-      expected_entropy = stats.norm(mu_v, sigma_v).entropy()
       entropy = normal.entropy()
-      self.assertAllClose(expected_entropy, entropy.eval())
       self.assertAllEqual(normal.batch_shape_tensor().eval(),
                           entropy.get_shape())
       self.assertAllEqual(normal.batch_shape_tensor().eval(),
                           entropy.eval().shape)
       self.assertAllEqual(normal.batch_shape, entropy.get_shape())
       self.assertAllEqual(normal.batch_shape, entropy.eval().shape)
+      # scipy.stats.norm cannot deal with these shapes.
+      if not stats:
+        return
+      expected_entropy = stats.norm(mu_v, sigma_v).entropy()
+      self.assertAllClose(expected_entropy, entropy.eval())
 
   def testNormalEntropy(self):
     with self.test_session():
@@ -288,15 +317,18 @@ class NormalTest(test.TestCase):
       p = np.hstack((p, np.exp(-33), 1. - np.exp(-33)))
 
       normal = normal_lib.Normal(loc=mu, scale=sigma)
-      expected_x = stats.norm(mu, sigma).ppf(p)
       x = normal.quantile(p)
 
-      self.assertAllClose(expected_x, x.eval(), atol=0.)
       self.assertAllEqual(normal.batch_shape_tensor().eval(), x.get_shape())
       self.assertAllEqual(normal.batch_shape_tensor().eval(), x.eval().shape)
       self.assertAllEqual(normal.batch_shape, x.get_shape())
       self.assertAllEqual(normal.batch_shape, x.eval().shape)
 
+      if not stats:
+        return
+      expected_x = stats.norm(mu, sigma).ppf(p)
+      self.assertAllClose(expected_x, x.eval(), atol=0.)
+
   def _baseQuantileFiniteGradientAtDifficultPoints(self, dtype):
     g = ops.Graph()
     with g.as_default():
@@ -450,7 +482,7 @@ class NormalTest(test.TestCase):
       n_a = normal_lib.Normal(loc=mu_a, scale=sigma_a)
       n_b = normal_lib.Normal(loc=mu_b, scale=sigma_b)
 
-      kl = kullback_leibler.kl(n_a, n_b)
+      kl = kullback_leibler.kl_divergence(n_a, n_b)
       kl_val = sess.run(kl)
 
       kl_expected = ((mu_a - mu_b)**2 / (2 * sigma_b**2) + 0.5 * (
diff --git a/tensorflow/contrib/distributions/python/kernel_tests/special_math_test.py b/tensorflow/python/kernel_tests/distributions/special_math_test.py
similarity index 95%
rename from tensorflow/contrib/distributions/python/kernel_tests/special_math_test.py
rename to tensorflow/python/kernel_tests/distributions/special_math_test.py
index 6ccbeb71222ab4e77d8c86ed011bda8cb1ac3a07..dc462bae56b5fbc18036e80f6bbd4177b7b9fff2 100644
--- a/tensorflow/contrib/distributions/python/kernel_tests/special_math_test.py
+++ b/tensorflow/python/kernel_tests/distributions/special_math_test.py
@@ -19,18 +19,30 @@ from __future__ import division
 from __future__ import print_function
 
 import collections
+import importlib
 
 import numpy as np
-from scipy import special
-from scipy import stats
 
-from tensorflow.contrib.distributions.python.ops import special_math
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import gradient_checker
 from tensorflow.python.ops import gradients_impl
 from tensorflow.python.ops import variables
+from tensorflow.python.ops.distributions import special_math
 from tensorflow.python.platform import test
+from tensorflow.python.platform import tf_logging
 
+
+def try_import(name):  # pylint: disable=invalid-name
+  module = None
+  try:
+    module = importlib.import_module(name)
+  except ImportError as e:
+    tf_logging.warning("Could not import %s: %s" % (name, str(e)))
+  return module
+
+
+special = try_import("scipy.special")
+stats = try_import("scipy.stats")
 sm = special_math
 
 
@@ -66,6 +78,9 @@ class NdtriTest(test.TestCase):
   def testNdtri(self):
     """Verifies that ndtri computation is correct."""
     with self.test_session():
+      if not special:
+        return
+
       p = np.linspace(0., 1.0, 50).astype(np.float64)
       # Quantile performs piecewise rational approximation so adding some
       # special input values to make sure we hit all the pieces.
@@ -113,6 +128,9 @@ class NdtrTest(test.TestCase):
       self._test_grid_no_log(dtype, grid_spec, error_spec)
 
   def _test_grid_log(self, dtype, grid_spec, error_spec):
+    if not special:
+      return
+
     with self.test_session():
       grid = _make_grid(dtype, grid_spec)
       actual = sm.log_ndtr(grid).eval()
@@ -137,6 +155,9 @@ class NdtrTest(test.TestCase):
           atol=error_spec.atol)
 
   def _test_grid_no_log(self, dtype, grid_spec, error_spec):
+    if not special:
+      return
+
     with self.test_session():
       grid = _make_grid(dtype, grid_spec)
       actual = sm.ndtr(grid).eval()
@@ -267,6 +288,9 @@ class NdtrGradientTest(test.TestCase):
       self.assert_all_true(np.isfinite(grad_eval))
 
       # Versus scipy.
+      if not (special and stats):
+        return
+
       expected = stats.norm.pdf(raw_grid)
       if self._use_log:
         expected /= special.ndtr(raw_grid)
@@ -323,6 +347,9 @@ class LogCDFLaplaceTest(test.TestCase):
       _check_strictly_increasing(actual)
 
       # Versus scipy.
+      if not stats:
+        return
+
       scipy_dist = stats.laplace(loc=0., scale=1.)
       expected = scipy_dist.logcdf(grid.astype(scipy_dtype))
       self.assertAllClose(
diff --git a/tensorflow/contrib/distributions/python/kernel_tests/student_t_test.py b/tensorflow/python/kernel_tests/distributions/student_t_test.py
similarity index 83%
rename from tensorflow/contrib/distributions/python/kernel_tests/student_t_test.py
rename to tensorflow/python/kernel_tests/distributions/student_t_test.py
index 209ef696caa96411210a054dd473da88db80c76f..f1150de58e0dae5da25f74f95fb391c340a01262 100644
--- a/tensorflow/contrib/distributions/python/kernel_tests/student_t_test.py
+++ b/tensorflow/python/kernel_tests/distributions/student_t_test.py
@@ -18,19 +18,30 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import importlib
 import math
 
 import numpy as np
-from scipy import stats
-from tensorflow.contrib import distributions
-from tensorflow.contrib.distributions.python.ops import student_t
+
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import random_seed
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import nn_ops
+from tensorflow.python.ops.distributions import student_t
 from tensorflow.python.platform import test
+from tensorflow.python.platform import tf_logging
+
+
+def try_import(name):  # pylint: disable=invalid-name
+  module = None
+  try:
+    module = importlib.import_module(name)
+  except ImportError as e:
+    tf_logging.warning("Could not import %s: %s" % (name, str(e)))
+  return module
 
-ds = distributions
+
+stats = try_import("scipy.stats")
 
 
 class StudentTTest(test.TestCase):
@@ -45,7 +56,7 @@ class StudentTTest(test.TestCase):
       mu_v = 7.
       sigma_v = 8.
       t = np.array([-2.5, 2.5, 8., 0., -1., 2.], dtype=np.float32)
-      student = ds.StudentT(df, loc=mu, scale=-sigma)
+      student = student_t.StudentT(df, loc=mu, scale=-sigma)
 
       log_pdf = student.log_prob(t)
       self.assertEquals(log_pdf.get_shape(), (6,))
@@ -54,6 +65,9 @@ class StudentTTest(test.TestCase):
       self.assertEquals(pdf.get_shape(), (6,))
       pdf_values = pdf.eval()
 
+      if not stats:
+        return
+
       expected_log_pdf = stats.t.logpdf(t, df_v, loc=mu_v, scale=sigma_v)
       expected_pdf = stats.t.pdf(t, df_v, loc=mu_v, scale=sigma_v)
       self.assertAllClose(expected_log_pdf, log_pdf_values)
@@ -72,13 +86,16 @@ class StudentTTest(test.TestCase):
       mu_v = np.array([3., -3.])
       sigma_v = np.array([np.sqrt(10.), np.sqrt(15.)])
       t = np.array([[-2.5, 2.5, 4., 0., -1., 2.]], dtype=np.float32).T
-      student = ds.StudentT(df, loc=mu, scale=sigma)
+      student = student_t.StudentT(df, loc=mu, scale=sigma)
       log_pdf = student.log_prob(t)
       log_pdf_values = log_pdf.eval()
       self.assertEqual(log_pdf.get_shape(), (6, 2))
       pdf = student.prob(t)
       pdf_values = pdf.eval()
       self.assertEqual(pdf.get_shape(), (6, 2))
+
+      if not stats:
+        return
       expected_log_pdf = stats.t.logpdf(t, df_v, loc=mu_v, scale=sigma_v)
       expected_pdf = stats.t.pdf(t, df_v, loc=mu_v, scale=sigma_v)
       self.assertAllClose(expected_log_pdf, log_pdf_values)
@@ -105,6 +122,8 @@ class StudentTTest(test.TestCase):
       self.assertEquals(cdf.get_shape(), (6,))
       cdf_values = cdf.eval()
 
+      if not stats:
+        return
       expected_log_cdf = stats.t.logcdf(t, df_v, loc=mu_v, scale=sigma_v)
       expected_cdf = stats.t.cdf(t, df_v, loc=mu_v, scale=sigma_v)
       self.assertAllClose(expected_log_cdf, log_cdf_values, atol=0., rtol=1e-5)
@@ -119,7 +138,7 @@ class StudentTTest(test.TestCase):
     mu_v = np.array([[1., -1, 0]])  # 1x3
     sigma_v = np.array([[1., -2., 3.]]).T  # transposed => 3x1
     with self.test_session():
-      student = ds.StudentT(df=df_v, loc=mu_v, scale=sigma_v)
+      student = student_t.StudentT(df=df_v, loc=mu_v, scale=sigma_v)
       ent = student.entropy()
       ent_values = ent.eval()
 
@@ -128,6 +147,8 @@ class StudentTTest(test.TestCase):
     sigma_bc = np.abs(sigma_v) * ones
     mu_bc = ones.T * mu_v
     df_bc = ones.T * df_v
+    if not stats:
+      return
     expected_entropy = stats.t.entropy(
         np.reshape(df_bc, [-1]),
         loc=np.reshape(mu_bc, [-1]),
@@ -144,7 +165,7 @@ class StudentTTest(test.TestCase):
       mu_v = 3.
       sigma_v = np.sqrt(10.)
       n = constant_op.constant(200000)
-      student = ds.StudentT(df=df, loc=mu, scale=sigma)
+      student = student_t.StudentT(df=df, loc=mu, scale=sigma)
       samples = student.sample(n, seed=123456)
       sample_values = samples.eval()
       n_val = 200000
@@ -166,11 +187,13 @@ class StudentTTest(test.TestCase):
       n = constant_op.constant(100)
 
       random_seed.set_random_seed(654321)
-      student = ds.StudentT(df=df, loc=mu, scale=sigma, name="student_t1")
+      student = student_t.StudentT(
+          df=df, loc=mu, scale=sigma, name="student_t1")
       samples1 = student.sample(n, seed=123456).eval()
 
       random_seed.set_random_seed(654321)
-      student2 = ds.StudentT(df=df, loc=mu, scale=sigma, name="student_t2")
+      student2 = student_t.StudentT(
+          df=df, loc=mu, scale=sigma, name="student_t2")
       samples2 = student2.sample(n, seed=123456).eval()
 
       self.assertAllClose(samples1, samples2)
@@ -180,7 +203,7 @@ class StudentTTest(test.TestCase):
       df_v = [1e-1, 1e-5, 1e-10, 1e-20]
       df = constant_op.constant(df_v)
       n = constant_op.constant(200000)
-      student = ds.StudentT(df=df, loc=1., scale=1.)
+      student = student_t.StudentT(df=df, loc=1., scale=1.)
       samples = student.sample(n, seed=123456)
       sample_values = samples.eval()
       n_val = 200000
@@ -198,7 +221,7 @@ class StudentTTest(test.TestCase):
       mu_v = [3., -3.]
       sigma_v = [np.sqrt(10.), np.sqrt(15.)]
       n = constant_op.constant(200000)
-      student = ds.StudentT(df=df, loc=mu, scale=sigma)
+      student = student_t.StudentT(df=df, loc=mu, scale=sigma)
       samples = student.sample(n, seed=123456)
       sample_values = samples.eval()
       self.assertEqual(samples.get_shape(), (200000, batch_size, 2))
@@ -222,6 +245,8 @@ class StudentTTest(test.TestCase):
   def _checkKLApprox(self, df, mu, sigma, samples):
     n = samples.size
     np.random.seed(137)
+    if not stats:
+      return
     sample_scipy = stats.t.rvs(df, loc=mu, scale=sigma, size=n)
     covg = 0.99
     r = stats.t.interval(covg, df, loc=mu, scale=sigma)
@@ -247,9 +272,9 @@ class StudentTTest(test.TestCase):
       self.assertEqual(student.prob(2.).get_shape(), (3,))
       self.assertEqual(student.sample(37, seed=123456).get_shape(), (37, 3,))
 
-    _check(ds.StudentT(df=[2., 3., 4.,], loc=2., scale=1.))
-    _check(ds.StudentT(df=7., loc=[2., 3., 4.,], scale=1.))
-    _check(ds.StudentT(df=7., loc=3., scale=[2., 3., 4.,]))
+    _check(student_t.StudentT(df=[2., 3., 4.,], loc=2., scale=1.))
+    _check(student_t.StudentT(df=7., loc=[2., 3., 4.,], scale=1.))
+    _check(student_t.StudentT(df=7., loc=3., scale=[2., 3., 4.,]))
 
   def testBroadcastingPdfArgs(self):
 
@@ -266,9 +291,9 @@ class StudentTTest(test.TestCase):
       xs = xs.T
       _assert_shape(student, xs, (3, 3))
 
-    _check(ds.StudentT(df=[2., 3., 4.,], loc=2., scale=1.))
-    _check(ds.StudentT(df=7., loc=[2., 3., 4.,], scale=1.))
-    _check(ds.StudentT(df=7., loc=3., scale=[2., 3., 4.,]))
+    _check(student_t.StudentT(df=[2., 3., 4.,], loc=2., scale=1.))
+    _check(student_t.StudentT(df=7., loc=[2., 3., 4.,], scale=1.))
+    _check(student_t.StudentT(df=7., loc=3., scale=[2., 3., 4.,]))
 
     def _check2d(student):
       _assert_shape(student, 2., (1, 3))
@@ -279,9 +304,9 @@ class StudentTTest(test.TestCase):
       xs = xs.T
       _assert_shape(student, xs, (3, 3))
 
-    _check2d(ds.StudentT(df=[[2., 3., 4.,]], loc=2., scale=1.))
-    _check2d(ds.StudentT(df=7., loc=[[2., 3., 4.,]], scale=1.))
-    _check2d(ds.StudentT(df=7., loc=3., scale=[[2., 3., 4.,]]))
+    _check2d(student_t.StudentT(df=[[2., 3., 4.,]], loc=2., scale=1.))
+    _check2d(student_t.StudentT(df=7., loc=[[2., 3., 4.,]], scale=1.))
+    _check2d(student_t.StudentT(df=7., loc=3., scale=[[2., 3., 4.,]]))
 
     def _check2d_rows(student):
       _assert_shape(student, 2., (3, 1))
@@ -292,22 +317,23 @@ class StudentTTest(test.TestCase):
       xs = xs.T  # (3,1)
       _assert_shape(student, xs, (3, 1))
 
-    _check2d_rows(ds.StudentT(df=[[2.], [3.], [4.]], loc=2., scale=1.))
-    _check2d_rows(ds.StudentT(df=7., loc=[[2.], [3.], [4.]], scale=1.))
-    _check2d_rows(ds.StudentT(df=7., loc=3., scale=[[2.], [3.], [4.]]))
+    _check2d_rows(student_t.StudentT(df=[[2.], [3.], [4.]], loc=2., scale=1.))
+    _check2d_rows(student_t.StudentT(df=7., loc=[[2.], [3.], [4.]], scale=1.))
+    _check2d_rows(student_t.StudentT(df=7., loc=3., scale=[[2.], [3.], [4.]]))
 
   def testMeanAllowNanStatsIsFalseWorksWhenAllBatchMembersAreDefined(self):
     with self.test_session():
       mu = [1., 3.3, 4.4]
-      student = ds.StudentT(df=[3., 5., 7.], loc=mu, scale=[3., 2., 1.])
+      student = student_t.StudentT(df=[3., 5., 7.], loc=mu, scale=[3., 2., 1.])
       mean = student.mean().eval()
       self.assertAllClose([1., 3.3, 4.4], mean)
 
   def testMeanAllowNanStatsIsFalseRaisesWhenBatchMemberIsUndefined(self):
     with self.test_session():
       mu = [1., 3.3, 4.4]
-      student = ds.StudentT(df=[0.5, 5., 7.], loc=mu, scale=[3., 2., 1.],
-                            allow_nan_stats=False)
+      student = student_t.StudentT(
+          df=[0.5, 5., 7.], loc=mu, scale=[3., 2., 1.],
+          allow_nan_stats=False)
       with self.assertRaisesOpError("x < y"):
         student.mean().eval()
 
@@ -315,8 +341,9 @@ class StudentTTest(test.TestCase):
     with self.test_session():
       mu = [-2, 0., 1., 3.3, 4.4]
       sigma = [5., 4., 3., 2., 1.]
-      student = ds.StudentT(df=[0.5, 1., 3., 5., 7.], loc=mu, scale=sigma,
-                            allow_nan_stats=True)
+      student = student_t.StudentT(
+          df=[0.5, 1., 3., 5., 7.], loc=mu, scale=sigma,
+          allow_nan_stats=True)
       mean = student.mean().eval()
       self.assertAllClose([np.nan, np.nan, 1., 3.3, 4.4], mean)
 
@@ -327,7 +354,8 @@ class StudentTTest(test.TestCase):
       df = [0.5, 1.5, 3., 5., 7.]
       mu = [-2, 0., 1., 3.3, 4.4]
       sigma = [5., 4., 3., 2., 1.]
-      student = ds.StudentT(df=df, loc=mu, scale=sigma, allow_nan_stats=True)
+      student = student_t.StudentT(
+          df=df, loc=mu, scale=sigma, allow_nan_stats=True)
       var = student.variance().eval()
       ## scipy uses inf for variance when the mean is undefined.  When mean is
       # undefined we say variance is undefined as well.  So test the first
@@ -336,6 +364,8 @@ class StudentTTest(test.TestCase):
       self.assertTrue(np.isnan(var[0]))
       var[0] = np.inf
 
+      if not stats:
+        return
       expected_var = [
           stats.t.var(d, loc=m, scale=s) for (d, m, s) in zip(df, mu, sigma)
       ]
@@ -348,9 +378,11 @@ class StudentTTest(test.TestCase):
       df = [1.5, 3., 5., 7.]
       mu = [0., 1., 3.3, 4.4]
       sigma = [4., 3., 2., 1.]
-      student = ds.StudentT(df=df, loc=mu, scale=sigma)
+      student = student_t.StudentT(df=df, loc=mu, scale=sigma)
       var = student.variance().eval()
 
+      if not stats:
+        return
       expected_var = [
           stats.t.var(d, loc=m, scale=s) for (d, m, s) in zip(df, mu, sigma)
       ]
@@ -359,13 +391,15 @@ class StudentTTest(test.TestCase):
   def testVarianceAllowNanStatsFalseRaisesForUndefinedBatchMembers(self):
     with self.test_session():
       # df <= 1 ==> variance not defined
-      student = ds.StudentT(df=1., loc=0., scale=1., allow_nan_stats=False)
+      student = student_t.StudentT(
+          df=1., loc=0., scale=1., allow_nan_stats=False)
       with self.assertRaisesOpError("x < y"):
         student.variance().eval()
 
     with self.test_session():
       # df <= 1 ==> variance not defined
-      student = ds.StudentT(df=0.5, loc=0., scale=1., allow_nan_stats=False)
+      student = student_t.StudentT(
+          df=0.5, loc=0., scale=1., allow_nan_stats=False)
       with self.assertRaisesOpError("x < y"):
         student.variance().eval()
 
@@ -375,11 +409,13 @@ class StudentTTest(test.TestCase):
       df = [3.5, 5., 3., 5., 7.]
       mu = [-2.2]
       sigma = [5., 4., 3., 2., 1.]
-      student = ds.StudentT(df=df, loc=mu, scale=sigma)
+      student = student_t.StudentT(df=df, loc=mu, scale=sigma)
       # Test broadcast of mu across shape of df/sigma
       stddev = student.stddev().eval()
       mu *= len(df)
 
+      if not stats:
+        return
       expected_stddev = [
           stats.t.std(d, loc=m, scale=s) for (d, m, s) in zip(df, mu, sigma)
       ]
@@ -390,14 +426,14 @@ class StudentTTest(test.TestCase):
       df = [0.5, 1., 3]
       mu = [-1, 0., 1]
       sigma = [5., 4., 3.]
-      student = ds.StudentT(df=df, loc=mu, scale=sigma)
+      student = student_t.StudentT(df=df, loc=mu, scale=sigma)
       # Test broadcast of mu across shape of df/sigma
       mode = student.mode().eval()
       self.assertAllClose([-1., 0, 1], mode)
 
   def testPdfOfSample(self):
     with self.test_session() as sess:
-      student = ds.StudentT(df=3., loc=np.pi, scale=1.)
+      student = student_t.StudentT(df=3., loc=np.pi, scale=1.)
       num = 20000
       samples = student.sample(num, seed=123456)
       pdfs = student.prob(samples)
@@ -410,13 +446,15 @@ class StudentTTest(test.TestCase):
       self.assertEqual(mean.get_shape(), ())
       self.assertNear(np.pi, np.mean(sample_vals), err=0.02)
       self.assertNear(np.pi, mean_val, err=1e-6)
-      self.assertNear(stats.t.pdf(np.pi, 3., loc=np.pi), mean_pdf_val, err=1e-6)
       # Verify integral over sample*pdf ~= 1.
       self._assertIntegral(sample_vals, pdf_vals, err=2e-3)
+      if not stats:
+        return
+      self.assertNear(stats.t.pdf(np.pi, 3., loc=np.pi), mean_pdf_val, err=1e-6)
 
   def testPdfOfSampleMultiDims(self):
     with self.test_session() as sess:
-      student = ds.StudentT(df=[7., 11.], loc=[[5.], [6.]], scale=3.)
+      student = student_t.StudentT(df=[7., 11.], loc=[[5.], [6.]], scale=3.)
       self.assertAllEqual([], student.event_shape)
       self.assertAllEqual([], student.event_shape_tensor().eval())
       self.assertAllEqual([2, 2], student.batch_shape)
@@ -429,6 +467,12 @@ class StudentTTest(test.TestCase):
       self.assertEqual(pdfs.get_shape(), (num, 2, 2))
       self.assertNear(5., np.mean(sample_vals[:, 0, :]), err=.03)
       self.assertNear(6., np.mean(sample_vals[:, 1, :]), err=.03)
+      self._assertIntegral(sample_vals[:, 0, 0], pdf_vals[:, 0, 0], err=0.02)
+      self._assertIntegral(sample_vals[:, 0, 1], pdf_vals[:, 0, 1], err=0.02)
+      self._assertIntegral(sample_vals[:, 1, 0], pdf_vals[:, 1, 0], err=0.02)
+      self._assertIntegral(sample_vals[:, 1, 1], pdf_vals[:, 1, 1], err=0.02)
+      if not stats:
+        return
       self.assertNear(
           stats.t.var(7., loc=0., scale=3.),  # loc d.n. effect var
           np.var(sample_vals[:, :, 0]),
@@ -437,10 +481,6 @@ class StudentTTest(test.TestCase):
           stats.t.var(11., loc=0., scale=3.),  # loc d.n. effect var
           np.var(sample_vals[:, :, 1]),
           err=.4)
-      self._assertIntegral(sample_vals[:, 0, 0], pdf_vals[:, 0, 0], err=0.02)
-      self._assertIntegral(sample_vals[:, 0, 1], pdf_vals[:, 0, 1], err=0.02)
-      self._assertIntegral(sample_vals[:, 1, 0], pdf_vals[:, 1, 0], err=0.02)
-      self._assertIntegral(sample_vals[:, 1, 1], pdf_vals[:, 1, 1], err=0.02)
 
   def _assertIntegral(self, sample_vals, pdf_vals, err=1.5e-3):
     s_p = zip(sample_vals, pdf_vals)
@@ -454,8 +494,8 @@ class StudentTTest(test.TestCase):
 
   def testNegativeDofFails(self):
     with self.test_session():
-      student = ds.StudentT(df=[2, -5.], loc=0., scale=1.,
-                            validate_args=True, name="S")
+      student = student_t.StudentT(df=[2, -5.], loc=0., scale=1.,
+                                   validate_args=True, name="S")
       with self.assertRaisesOpError(r"Condition x > 0 did not hold"):
         student.mean().eval()
 
@@ -464,7 +504,8 @@ class StudentTTest(test.TestCase):
       df = constant_op.constant([-3.2, -4.6])
       mu = constant_op.constant([-4.2, 3.4])
       sigma = constant_op.constant([-6.4, -8.8])
-      student = ds.StudentTWithAbsDfSoftplusScale(df=df, loc=mu, scale=sigma)
+      student = student_t.StudentTWithAbsDfSoftplusScale(
+          df=df, loc=mu, scale=sigma)
       self.assertAllClose(
           math_ops.floor(math_ops.abs(df)).eval(), student.df.eval())
       self.assertAllClose(mu.eval(), student.loc.eval())
diff --git a/tensorflow/contrib/distributions/python/kernel_tests/uniform_test.py b/tensorflow/python/kernel_tests/distributions/uniform_test.py
similarity index 93%
rename from tensorflow/contrib/distributions/python/kernel_tests/uniform_test.py
rename to tensorflow/python/kernel_tests/distributions/uniform_test.py
index c3c97b98f0d59fe3e7d632e40e61c3e4738a50b5..df99a0ed257da20179909eb44eacf7d44528dad2 100644
--- a/tensorflow/contrib/distributions/python/kernel_tests/uniform_test.py
+++ b/tensorflow/python/kernel_tests/distributions/uniform_test.py
@@ -18,15 +18,30 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import importlib
+
 import numpy as np
-from scipy import stats
-from tensorflow.contrib.distributions.python.ops import uniform as uniform_lib
+
 from tensorflow.python.framework import constant_op
-from tensorflow.python.framework import errors_impl
+from tensorflow.python.framework import errors
 from tensorflow.python.framework import tensor_shape
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import math_ops
+from tensorflow.python.ops.distributions import uniform as uniform_lib
 from tensorflow.python.platform import test
+from tensorflow.python.platform import tf_logging
+
+
+def try_import(name):  # pylint: disable=invalid-name
+  module = None
+  try:
+    module = importlib.import_module(name)
+  except ImportError as e:
+    tf_logging.warning("Could not import %s: %s" % (name, str(e)))
+  return module
+
+
+stats = try_import("scipy.stats")
 
 
 class UniformTest(test.TestCase):
@@ -126,7 +141,7 @@ class UniformTest(test.TestCase):
       b_v = np.array([1.0, 2.0, 3.0], dtype=np.float32)
       uniform = uniform_lib.Uniform(low=a_v, high=b_v, validate_args=True)
 
-      with self.assertRaisesWithPredicateMatch(errors_impl.InvalidArgumentError,
+      with self.assertRaisesWithPredicateMatch(errors.InvalidArgumentError,
                                                "x < y"):
         uniform.low.eval()
 
@@ -187,6 +202,8 @@ class UniformTest(test.TestCase):
       a = 10.0
       b = 100.0
       uniform = uniform_lib.Uniform(low=a, high=b)
+      if not stats:
+        return
       s_uniform = stats.uniform(loc=a, scale=b - a)
       self.assertAllClose(uniform.mean().eval(), s_uniform.mean())
 
@@ -195,6 +212,8 @@ class UniformTest(test.TestCase):
       a = 10.0
       b = 100.0
       uniform = uniform_lib.Uniform(low=a, high=b)
+      if not stats:
+        return
       s_uniform = stats.uniform(loc=a, scale=b - a)
       self.assertAllClose(uniform.variance().eval(), s_uniform.var())
 
@@ -203,6 +222,8 @@ class UniformTest(test.TestCase):
       a = 10.0
       b = 100.0
       uniform = uniform_lib.Uniform(low=a, high=b)
+      if not stats:
+        return
       s_uniform = stats.uniform(loc=a, scale=b - a)
       self.assertAllClose(uniform.stddev().eval(), s_uniform.std())
 
diff --git a/tensorflow/python/kernel_tests/embedding_ops_test.py b/tensorflow/python/kernel_tests/embedding_ops_test.py
index 8cd378825701e2aea2251a76095e89bb18eed1ba..2bd21fb01d1f187af9cf4cf9670d0fd3948a7df8 100644
--- a/tensorflow/python/kernel_tests/embedding_ops_test.py
+++ b/tensorflow/python/kernel_tests/embedding_ops_test.py
@@ -36,7 +36,6 @@ from tensorflow.python.ops import partitioned_variables
 from tensorflow.python.ops import state_ops
 from tensorflow.python.ops import variable_scope
 from tensorflow.python.ops import variables
-import tensorflow.python.ops.data_flow_grad  # pylint: disable=unused-import
 from tensorflow.python.platform import test
 from tensorflow.python.platform import tf_logging
 from tensorflow.python.util import compat
diff --git a/tensorflow/python/kernel_tests/linalg_ops_test.py b/tensorflow/python/kernel_tests/linalg_ops_test.py
index 153d4ab66231ae391e5220ac928606b183dab46f..2d31ac85b02d688ab260f840cb62e38435764f23 100644
--- a/tensorflow/python/kernel_tests/linalg_ops_test.py
+++ b/tensorflow/python/kernel_tests/linalg_ops_test.py
@@ -28,7 +28,7 @@ from tensorflow.python.platform import test
 
 
 def _random_pd_matrix(n, rng):
-  """Random postive definite matrix."""
+  """Random positive definite matrix."""
   temp = rng.randn(n, n)
   return temp.dot(temp.T)
 
diff --git a/tensorflow/python/kernel_tests/lookup_ops_test.py b/tensorflow/python/kernel_tests/lookup_ops_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..2a90bc539bb6127993872651a99458daccdc71ad
--- /dev/null
+++ b/tensorflow/python/kernel_tests/lookup_ops_test.py
@@ -0,0 +1,1321 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for lookup ops."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+import numpy as np
+
+from tensorflow.python.client import session
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import errors_impl
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import sparse_tensor
+from tensorflow.python.framework import test_util
+from tensorflow.python.ops import lookup_ops
+from tensorflow.python.platform import test
+from tensorflow.python.training import server_lib
+
+
+class HashTableOpTest(test.TestCase):
+
+  def testHashTable(self):
+    with self.test_session():
+      default_val = -1
+      keys = constant_op.constant(["brain", "salad", "surgery"])
+      values = constant_op.constant([0, 1, 2], dtypes.int64)
+      table = lookup_ops.HashTable(
+          lookup_ops.KeyValueTensorInitializer(keys, values), default_val)
+      table.init.run()
+
+      self.assertAllEqual(3, table.size().eval())
+
+      input_string = constant_op.constant(["brain", "salad", "tank"])
+      output = table.lookup(input_string)
+      self.assertAllEqual([3], output.get_shape())
+
+      result = output.eval()
+      self.assertAllEqual([0, 1, -1], result)
+
+  def testHashTableFindHighRank(self):
+    with self.test_session():
+      default_val = -1
+      keys = constant_op.constant(["brain", "salad", "surgery"])
+      values = constant_op.constant([0, 1, 2], dtypes.int64)
+      table = lookup_ops.HashTable(
+          lookup_ops.KeyValueTensorInitializer(keys, values), default_val)
+      table.init.run()
+
+      self.assertAllEqual(3, table.size().eval())
+
+      input_string = constant_op.constant(
+          [["brain", "salad"], ["tank", "tarkus"]])
+      output = table.lookup(input_string)
+
+      result = output.eval()
+      self.assertAllEqual([[0, 1], [-1, -1]], result)
+
+  def testHashTableInitWithPythonArrays(self):
+    with self.test_session():
+      default_val = -1
+      keys = ["brain", "salad", "surgery"]
+      values = [0, 1, 2]
+      table = lookup_ops.HashTable(
+          lookup_ops.KeyValueTensorInitializer(
+              keys, values, value_dtype=dtypes.int64), default_val)
+      table.init.run()
+
+      self.assertAllEqual(3, table.size().eval())
+
+      input_string = constant_op.constant(["brain", "salad", "tank"])
+      output = table.lookup(input_string)
+
+      result = output.eval()
+      self.assertAllEqual([0, 1, -1], result)
+
+  def testHashTableInitWithNumPyArrays(self):
+    with self.test_session():
+      default_val = -1
+      keys = np.array(["brain", "salad", "surgery"], dtype=np.str)
+      values = np.array([0, 1, 2], dtype=np.int64)
+      table = lookup_ops.HashTable(
+          lookup_ops.KeyValueTensorInitializer(keys, values), default_val)
+      table.init.run()
+
+      self.assertAllEqual(3, table.size().eval())
+
+      input_string = constant_op.constant(["brain", "salad", "tank"])
+      output = table.lookup(input_string)
+
+      result = output.eval()
+      self.assertAllEqual([0, 1, -1], result)
+
+  def testMultipleHashTables(self):
+    with self.test_session() as sess:
+      default_val = -1
+      keys = constant_op.constant(["brain", "salad", "surgery"])
+      values = constant_op.constant([0, 1, 2], dtypes.int64)
+
+      table1 = lookup_ops.HashTable(
+          lookup_ops.KeyValueTensorInitializer(keys, values), default_val)
+      table2 = lookup_ops.HashTable(
+          lookup_ops.KeyValueTensorInitializer(keys, values), default_val)
+      table3 = lookup_ops.HashTable(
+          lookup_ops.KeyValueTensorInitializer(keys, values), default_val)
+
+      lookup_ops.tables_initializer().run()
+      self.assertAllEqual(3, table1.size().eval())
+      self.assertAllEqual(3, table2.size().eval())
+      self.assertAllEqual(3, table3.size().eval())
+
+      input_string = constant_op.constant(["brain", "salad", "tank"])
+      output1 = table1.lookup(input_string)
+      output2 = table2.lookup(input_string)
+      output3 = table3.lookup(input_string)
+
+      out1, out2, out3 = sess.run([output1, output2, output3])
+      self.assertAllEqual([0, 1, -1], out1)
+      self.assertAllEqual([0, 1, -1], out2)
+      self.assertAllEqual([0, 1, -1], out3)
+
+  def testHashTableWithTensorDefault(self):
+    with self.test_session():
+      default_val = constant_op.constant(-1, dtypes.int64)
+      keys = constant_op.constant(["brain", "salad", "surgery"])
+      values = constant_op.constant([0, 1, 2], dtypes.int64)
+      table = lookup_ops.HashTable(
+          lookup_ops.KeyValueTensorInitializer(keys, values), default_val)
+      table.init.run()
+
+      input_string = constant_op.constant(["brain", "salad", "tank"])
+      output = table.lookup(input_string)
+
+      result = output.eval()
+      self.assertAllEqual([0, 1, -1], result)
+
+  def testHashTableWithSparseTensorInput(self):
+    with self.test_session() as sess:
+      default_val = constant_op.constant(-1, dtypes.int64)
+      keys = constant_op.constant(["brain", "salad", "surgery"])
+      values = constant_op.constant([0, 1, 2], dtypes.int64)
+      table = lookup_ops.HashTable(
+          lookup_ops.KeyValueTensorInitializer(keys, values), default_val)
+      table.init.run()
+
+      sp_indices = [[0, 0], [0, 1], [1, 0]]
+      sp_shape = [2, 2]
+      input_tensor = sparse_tensor.SparseTensor(
+          constant_op.constant(sp_indices, dtypes.int64),
+          constant_op.constant(["brain", "salad", "tank"]),
+          constant_op.constant(sp_shape, dtypes.int64))
+      output = table.lookup(input_tensor)
+
+      out_indices, out_values, out_shape = sess.run(output)
+
+      self.assertAllEqual([0, 1, -1], out_values)
+      self.assertAllEqual(sp_indices, out_indices)
+      self.assertAllEqual(sp_shape, out_shape)
+
+  def testSignatureMismatch(self):
+    with self.test_session():
+      default_val = -1
+      keys = constant_op.constant(["brain", "salad", "surgery"])
+      values = constant_op.constant([0, 1, 2], dtypes.int64)
+      table = lookup_ops.HashTable(
+          lookup_ops.KeyValueTensorInitializer(keys, values), default_val)
+      table.init.run()
+
+      input_string = constant_op.constant([1, 2, 3], dtypes.int64)
+      with self.assertRaises(TypeError):
+        table.lookup(input_string)
+
+      with self.assertRaises(TypeError):
+        lookup_ops.HashTable(
+            lookup_ops.KeyValueTensorInitializer(keys, values), "UNK")
+
+  def testDTypes(self):
+    with self.test_session():
+      default_val = -1
+      with self.assertRaises(TypeError):
+        lookup_ops.HashTable(
+            lookup_ops.KeyValueTensorInitializer(["a"], [1], [dtypes.string],
+                                                 dtypes.int64), default_val)
+
+  def testNotInitialized(self):
+    with self.test_session():
+      default_val = -1
+      table = lookup_ops.HashTable(
+          lookup_ops.KeyValueTensorInitializer(
+              ["a"], [1], value_dtype=dtypes.int64), default_val)
+
+      input_string = constant_op.constant(["brain", "salad", "surgery"])
+      output = table.lookup(input_string)
+
+      with self.assertRaisesOpError("Table not initialized"):
+        output.eval()
+
+  def testInitializeTwice(self):
+    with self.test_session():
+      default_val = -1
+      keys = constant_op.constant(["brain", "salad", "surgery"])
+      values = constant_op.constant([0, 1, 2], dtypes.int64)
+      table = lookup_ops.HashTable(
+          lookup_ops.KeyValueTensorInitializer(keys, values), default_val)
+      table.init.run()
+
+      with self.assertRaisesOpError("Table already initialized"):
+        table.init.run()
+
+  def testInitializationWithInvalidDimensions(self):
+    with self.test_session():
+      default_val = -1
+      keys = constant_op.constant(["brain", "salad", "surgery"])
+      values = constant_op.constant([0, 1, 2, 3, 4], dtypes.int64)
+
+      with self.assertRaises(ValueError):
+        lookup_ops.HashTable(
+            lookup_ops.KeyValueTensorInitializer(keys, values), default_val)
+
+  def testMultipleSessions(self):
+    # Start a server
+    server = server_lib.Server(
+        {
+            "local0": ["localhost:0"]
+        }, protocol="grpc", start=True)
+    # Create two sessions sharing the same state
+    session1 = session.Session(server.target)
+    session2 = session.Session(server.target)
+
+    default_val = -1
+    keys = constant_op.constant(["brain", "salad", "surgery"])
+    values = constant_op.constant([0, 1, 2], dtypes.int64)
+    table = lookup_ops.HashTable(
+        lookup_ops.KeyValueTensorInitializer(keys, values),
+        default_val,
+        name="t1")
+
+    # Init the table in the first session.
+    with session1:
+      table.init.run()
+      self.assertAllEqual(3, table.size().eval())
+
+    # Init the table in the second session and verify that we do not get a
+    # "Table already initialized" error.
+    with session2:
+      table.init.run()
+      self.assertAllEqual(3, table.size().eval())
+
+
+class IndexTableFromFile(test.TestCase):
+
+  def _createVocabFile(self, basename, values=("brain", "salad", "surgery")):
+    vocabulary_file = os.path.join(self.get_temp_dir(), basename)
+    with open(vocabulary_file, "w") as f:
+      f.write("\n".join(values) + "\n")
+    return vocabulary_file
+
+  def test_string_index_table_from_file(self):
+    vocabulary_file = self._createVocabFile("f2i_vocab1.txt")
+    with self.test_session():
+      table = lookup_ops.index_table_from_file(
+          vocabulary_file=vocabulary_file, num_oov_buckets=1)
+      ids = table.lookup(constant_op.constant(["salad", "surgery", "tarkus"]))
+
+      self.assertRaises(errors_impl.OpError, ids.eval)
+      lookup_ops.tables_initializer().run()
+      self.assertAllEqual((1, 2, 3), ids.eval())
+
+  def test_int32_index_table_from_file(self):
+    vocabulary_file = self._createVocabFile(
+        "f2i_vocab2.txt", values=("42", "1", "-1000"))
+    with self.test_session():
+      table = lookup_ops.index_table_from_file(
+          vocabulary_file=vocabulary_file,
+          num_oov_buckets=1,
+          key_dtype=dtypes.int32)
+      ids = table.lookup(
+          constant_op.constant((1, -1000, 11), dtype=dtypes.int32))
+
+      self.assertRaises(errors_impl.OpError, ids.eval)
+      lookup_ops.tables_initializer().run()
+      self.assertAllEqual((1, 2, 3), ids.eval())
+
+  def test_int64_index_table_from_file(self):
+    vocabulary_file = self._createVocabFile(
+        "f2i_vocab3.txt", values=("42", "1", "-1000"))
+    with self.test_session():
+      table = lookup_ops.index_table_from_file(
+          vocabulary_file=vocabulary_file,
+          num_oov_buckets=1,
+          key_dtype=dtypes.int64)
+      ids = table.lookup(
+          constant_op.constant((1, -1000, 11), dtype=dtypes.int64))
+
+      self.assertRaises(errors_impl.OpError, ids.eval)
+      lookup_ops.tables_initializer().run()
+      self.assertAllEqual((1, 2, 3), ids.eval())
+
+  def test_index_table_from_file_with_default_value(self):
+    default_value = -42
+    vocabulary_file = self._createVocabFile("f2i_vocab4.txt")
+    with self.test_session():
+      table = lookup_ops.index_table_from_file(
+          vocabulary_file=vocabulary_file, default_value=default_value)
+      ids = table.lookup(constant_op.constant(["salad", "surgery", "tarkus"]))
+
+      self.assertRaises(errors_impl.OpError, ids.eval)
+      lookup_ops.tables_initializer().run()
+      self.assertAllEqual((1, 2, default_value), ids.eval())
+
+  def test_index_table_from_file_with_oov_buckets(self):
+    vocabulary_file = self._createVocabFile("f2i_vocab5.txt")
+    with self.test_session():
+      table = lookup_ops.index_table_from_file(
+          vocabulary_file=vocabulary_file, num_oov_buckets=1000)
+      ids = table.lookup(
+          constant_op.constant(["salad", "surgery", "tarkus", "toccata"]))
+
+      self.assertRaises(errors_impl.OpError, ids.eval)
+      lookup_ops.tables_initializer().run()
+      self.assertAllEqual(
+          (
+              1,  # From vocabulary file.
+              2,  # From vocabulary file.
+              867,  # 3 + fingerprint("tarkus") mod 300.
+              860),  # 3 + fingerprint("toccata") mod 300.
+          ids.eval())
+
+  def test_index_table_from_file_with_only_oov_buckets(self):
+    self.assertRaises(
+        ValueError, lookup_ops.index_table_from_file, vocabulary_file=None)
+
+  def test_index_table_from_file_with_vocab_size_too_small(self):
+    vocabulary_file = self._createVocabFile("f2i_vocab6.txt")
+    with self.test_session():
+      table = lookup_ops.index_table_from_file(
+          vocabulary_file=vocabulary_file, vocab_size=2)
+      ids = table.lookup(constant_op.constant(["salad", "surgery", "tarkus"]))
+
+      self.assertRaises(errors_impl.OpError, ids.eval)
+      lookup_ops.tables_initializer().run()
+      self.assertAllEqual((1, -1, -1), ids.eval())
+      self.assertEqual(2, table.size().eval())
+
+  def test_index_table_from_file_with_vocab_size_too_large(self):
+    vocabulary_file = self._createVocabFile("f2i_vocab7.txt")
+    with self.test_session():
+      table = lookup_ops.index_table_from_file(
+          vocabulary_file=vocabulary_file, vocab_size=4)
+      self.assertRaisesRegexp(errors_impl.InvalidArgumentError,
+                              "Invalid vocab_size", table.init.run)
+
+  def test_index_table_from_file_with_vocab_size(self):
+    vocabulary_file = self._createVocabFile("f2i_vocab8.txt")
+
+    self.assertRaises(
+        ValueError,
+        lookup_ops.index_table_from_file,
+        vocabulary_file=vocabulary_file,
+        vocab_size=0)
+
+    with self.test_session():
+      table = lookup_ops.index_table_from_file(
+          vocabulary_file=vocabulary_file, vocab_size=3)
+      ids = table.lookup(constant_op.constant(["salad", "surgery", "tarkus"]))
+
+      self.assertRaises(errors_impl.OpError, ids.eval)
+      lookup_ops.tables_initializer().run()
+      self.assertAllEqual((1, 2, -1), ids.eval())
+      self.assertEqual(3, table.size().eval())
+
+  def test_index_table_from_file_with_invalid_hashers(self):
+    vocabulary_file = self._createVocabFile("invalid_hasher.txt")
+    with self.test_session():
+      with self.assertRaises(TypeError):
+        lookup_ops.index_table_from_file(
+            vocabulary_file=vocabulary_file,
+            vocab_size=3,
+            num_oov_buckets=1,
+            hasher_spec=1)
+
+      table = lookup_ops.index_table_from_file(
+          vocabulary_file=vocabulary_file,
+          vocab_size=3,
+          num_oov_buckets=1,
+          hasher_spec=lookup_ops.HasherSpec("my-awesome-hash", None))
+
+      self.assertRaises(ValueError, table.lookup,
+                        constant_op.constant(["salad", "surgery", "tarkus"]))
+
+
+class KeyValueTensorInitializerTest(test.TestCase):
+
+  def test_string(self):
+    with ops.Graph().as_default(), self.test_session():
+      init = lookup_ops.KeyValueTensorInitializer(
+          ("brain", "salad", "surgery"), (0, 1, 2), dtypes.string, dtypes.int64)
+      table = lookup_ops.HashTable(init, default_value=-1)
+      table.init.run()
+
+  def test_int64(self):
+    with ops.Graph().as_default(), self.test_session():
+      init = lookup_ops.KeyValueTensorInitializer((42, 1, -1000), (0, 1, 2),
+                                                  dtypes.int64, dtypes.int64)
+      table = lookup_ops.HashTable(init, default_value=-1)
+      table.init.run()
+
+  def test_int32(self):
+    with ops.Graph().as_default(), self.test_session():
+      init = lookup_ops.KeyValueTensorInitializer((42, 1, -1000), (0, 1, 2),
+                                                  dtypes.int32, dtypes.int64)
+      table = lookup_ops.HashTable(init, default_value=-1)
+      with self.assertRaisesRegexp(
+          errors_impl.OpError, "No OpKernel was registered"):
+        table.init.run()
+
+
+class IndexTableFromTensor(test.TestCase):
+
+  def test_index_table_from_tensor_with_tensor_init(self):
+    with self.test_session():
+      table = lookup_ops.index_table_from_tensor(
+          vocabulary_list=("brain", "salad", "surgery"), num_oov_buckets=1)
+      ids = table.lookup(constant_op.constant(("salad", "surgery", "tarkus")))
+
+      self.assertRaises(errors_impl.OpError, ids.eval)
+      lookup_ops.tables_initializer().run()
+      self.assertAllEqual((1, 2, 3), ids.eval())
+
+  def test_int32_index_table_from_tensor_with_tensor_init(self):
+    with self.test_session():
+      table = lookup_ops.index_table_from_tensor(
+          vocabulary_list=(42, 1, -1000), num_oov_buckets=1, dtype=dtypes.int32)
+      ids = table.lookup(
+          constant_op.constant((1, -1000, 11), dtype=dtypes.int32))
+
+      self.assertRaises(errors_impl.OpError, ids.eval)
+      lookup_ops.tables_initializer().run()
+      self.assertAllEqual((1, 2, 3), ids.eval())
+
+  def test_int64_index_table_from_tensor_with_tensor_init(self):
+    with self.test_session():
+      table = lookup_ops.index_table_from_tensor(
+          vocabulary_list=(42, 1, -1000), num_oov_buckets=1, dtype=dtypes.int64)
+      ids = table.lookup(
+          constant_op.constant((1, -1000, 11), dtype=dtypes.int64))
+
+      self.assertRaises(errors_impl.OpError, ids.eval)
+      lookup_ops.tables_initializer().run()
+      self.assertAllEqual((1, 2, 3), ids.eval())
+
+  def test_index_table_from_tensor_with_default_value(self):
+    default_value = -42
+    with self.test_session():
+      table = lookup_ops.index_table_from_tensor(
+          vocabulary_list=["brain", "salad", "surgery"],
+          default_value=default_value)
+      ids = table.lookup(constant_op.constant(["salad", "surgery", "tarkus"]))
+
+      self.assertRaises(errors_impl.OpError, ids.eval)
+      lookup_ops.tables_initializer().run()
+      self.assertAllEqual((1, 2, default_value), ids.eval())
+
+  def test_index_table_from_tensor_missing_vocabulary_list(self):
+    with self.test_session():
+      with self.assertRaisesRegexp(ValueError,
+                                   "vocabulary_list must be specified"):
+        lookup_ops.index_table_from_tensor(
+            vocabulary_list=None, num_oov_buckets=1)
+
+  def test_index_table_from_tensor_empty_vocabulary_list(self):
+    with self.test_session():
+      table = lookup_ops.index_table_from_tensor(
+          vocabulary_list=np.array([], dtype=np.str_), num_oov_buckets=1)
+      ids = table.lookup(constant_op.constant(["salad", "surgery", "brain"]))
+      self.assertRaises(errors_impl.OpError, ids.eval)
+      with self.assertRaisesRegexp(
+          errors_impl.OpError, "keys and values cannot be empty"):
+        lookup_ops.tables_initializer().run()
+
+  def test_index_table_from_tensor_with_invalid_hashers(self):
+    with self.test_session():
+      with self.assertRaises(TypeError):
+        lookup_ops.index_table_from_tensor(
+            vocabulary_list=["brain", "salad", "surgery"],
+            num_oov_buckets=1,
+            hasher_spec=1)
+
+      table = lookup_ops.index_table_from_tensor(
+          vocabulary_list=["brain", "salad", "surgery"],
+          num_oov_buckets=1,
+          hasher_spec=lookup_ops.HasherSpec("my-awesome-hash", None))
+
+      self.assertRaises(ValueError, table.lookup,
+                        constant_op.constant(["salad", "surgery", "tarkus"]))
+
+
+class IndexToStringTableFromFileTest(test.TestCase):
+
+  def _createVocabFile(self, basename):
+    vocabulary_file = os.path.join(self.get_temp_dir(), basename)
+    with open(vocabulary_file, "w") as f:
+      f.write("\n".join(["brain", "salad", "surgery"]) + "\n")
+    return vocabulary_file
+
+  def test_index_to_string_table(self):
+    vocabulary_file = self._createVocabFile("i2f_vocab1.txt")
+    with self.test_session():
+      table = lookup_ops.index_to_string_table_from_file(
+          vocabulary_file=vocabulary_file)
+      features = table.lookup(constant_op.constant([0, 1, 2, 3], dtypes.int64))
+      self.assertRaises(errors_impl.OpError, features.eval)
+      lookup_ops.tables_initializer().run()
+      self.assertAllEqual((b"brain", b"salad", b"surgery", b"UNK"),
+                          features.eval())
+
+  def test_index_to_string_table_with_default_value(self):
+    default_value = b"NONE"
+    vocabulary_file = self._createVocabFile("f2i_vocab2.txt")
+    with self.test_session():
+      table = lookup_ops.index_to_string_table_from_file(
+          vocabulary_file=vocabulary_file, default_value=default_value)
+      features = table.lookup(constant_op.constant([1, 2, 4], dtypes.int64))
+      self.assertRaises(errors_impl.OpError, features.eval)
+      lookup_ops.tables_initializer().run()
+      self.assertAllEqual((b"salad", b"surgery", default_value),
+                          features.eval())
+
+  def test_index_to_string_table_with_vocab_size_too_small(self):
+    default_value = b"NONE"
+    vocabulary_file = self._createVocabFile("f2i_vocab2.txt")
+    with self.test_session():
+      table = lookup_ops.index_to_string_table_from_file(
+          vocabulary_file=vocabulary_file,
+          vocab_size=2,
+          default_value=default_value)
+      features = table.lookup(constant_op.constant([1, 2, 4], dtypes.int64))
+      self.assertRaises(errors_impl.OpError, features.eval)
+      lookup_ops.tables_initializer().run()
+      self.assertAllEqual((b"salad", default_value, default_value),
+                          features.eval())
+
+  def test_index_to_string_table_with_vocab_size_too_large(self):
+    vocabulary_file = self._createVocabFile("f2i_vocab6.txt")
+    with self.test_session():
+      table = lookup_ops.index_to_string_table_from_file(
+          vocabulary_file=vocabulary_file, vocab_size=4)
+      features = table.lookup(constant_op.constant([1, 2, 4], dtypes.int64))
+
+      self.assertRaises(errors_impl.OpError, features.eval)
+      init = lookup_ops.tables_initializer()
+      self.assertRaisesRegexp(errors_impl.InvalidArgumentError,
+                              "Invalid vocab_size", init.run)
+
+  def test_index_to_string_table_with_vocab_size(self):
+    vocabulary_file = self._createVocabFile("f2i_vocab7.txt")
+    with self.test_session():
+      table = lookup_ops.index_to_string_table_from_file(
+          vocabulary_file=vocabulary_file, vocab_size=3)
+      features = table.lookup(constant_op.constant([1, 2, 4], dtypes.int64))
+
+      self.assertRaises(errors_impl.OpError, features.eval)
+      lookup_ops.tables_initializer().run()
+      self.assertAllEqual((b"salad", b"surgery", b"UNK"), features.eval())
+
+
+class IndexToStringTableFromTensorTest(test.TestCase):
+
+  def test_index_to_string_table_from_tensor(self):
+    with self.test_session():
+      vocabulary_list = constant_op.constant(["brain", "salad", "surgery"])
+      table = lookup_ops.index_to_string_table_from_tensor(
+          vocabulary_list=vocabulary_list)
+
+      indices = constant_op.constant([0, 1, 2, 3], dtypes.int64)
+      features = table.lookup(indices)
+      self.assertRaises(errors_impl.OpError, features.eval)
+      lookup_ops.tables_initializer().run()
+
+      self.assertAllEqual((b"brain", b"salad", b"surgery", b"UNK"),
+                          features.eval())
+
+  def test_duplicate_entries(self):
+    with self.test_session():
+      vocabulary_list = constant_op.constant(["hello", "hello"])
+      table = lookup_ops.index_to_string_table_from_tensor(
+          vocabulary_list=vocabulary_list)
+      indices = constant_op.constant([0, 1, 4], dtypes.int64)
+      features = table.lookup(indices)
+      lookup_ops.tables_initializer().run()
+      self.assertAllEqual((b"hello", b"hello", b"UNK"), features.eval())
+
+  def test_index_to_string_with_default_value(self):
+    default_value = b"NONE"
+    with self.test_session():
+      vocabulary_list = constant_op.constant(["brain", "salad", "surgery"])
+      table = lookup_ops.index_to_string_table_from_tensor(
+          vocabulary_list=vocabulary_list, default_value=default_value)
+      indices = constant_op.constant([1, 2, 4], dtypes.int64)
+      features = table.lookup(indices)
+      self.assertRaises(errors_impl.OpError, features.eval)
+
+      lookup_ops.tables_initializer().run()
+      self.assertAllEqual((b"salad", b"surgery", default_value),
+                          features.eval())
+
+
+class InitializeTableFromFileOpTest(test.TestCase):
+
+  def _createVocabFile(self, basename, values=("brain", "salad", "surgery")):
+    vocabulary_file = os.path.join(self.get_temp_dir(), basename)
+    with open(vocabulary_file, "w") as f:
+      f.write("\n".join(values) + "\n")
+    return vocabulary_file
+
+  def testInitializeStringTable(self):
+    vocabulary_file = self._createVocabFile("one_column_1.txt")
+
+    with self.test_session():
+      default_value = -1
+      table = lookup_ops.HashTable(
+          lookup_ops.TextFileInitializer(
+              vocabulary_file, dtypes.string,
+              lookup_ops.TextFileIndex.WHOLE_LINE, dtypes.int64,
+              lookup_ops.TextFileIndex.LINE_NUMBER), default_value)
+      table.init.run()
+
+      output = table.lookup(constant_op.constant(["brain", "salad", "tank"]))
+
+      result = output.eval()
+      self.assertAllEqual([0, 1, -1], result)
+
+  def testInitializeInt64Table(self):
+    vocabulary_file = self._createVocabFile(
+        "one_column_int64.txt", values=("42", "1", "-1000"))
+
+    with self.test_session():
+      default_value = -1
+      table = lookup_ops.HashTable(
+          lookup_ops.TextFileInitializer(
+              vocabulary_file, dtypes.int64,
+              lookup_ops.TextFileIndex.WHOLE_LINE, dtypes.int64,
+              lookup_ops.TextFileIndex.LINE_NUMBER), default_value)
+      table.init.run()
+
+      output = table.lookup(
+          constant_op.constant((42, 1, 11), dtype=dtypes.int64))
+
+      result = output.eval()
+      self.assertAllEqual([0, 1, -1], result)
+
+  def testInitializeIndexTable(self):
+    vocabulary_file = self._createVocabFile("one_column_2.txt")
+
+    with self.test_session():
+      default_value = "UNK"
+      key_index = lookup_ops.TextFileIndex.LINE_NUMBER
+      value_index = lookup_ops.TextFileIndex.WHOLE_LINE
+      table = lookup_ops.HashTable(
+          lookup_ops.TextFileInitializer(vocabulary_file, dtypes.int64,
+                                         key_index, dtypes.string, value_index),
+          default_value)
+      table.init.run()
+
+      input_values = constant_op.constant([0, 1, 2, 3], dtypes.int64)
+      output = table.lookup(input_values)
+
+      result = output.eval()
+      self.assertAllEqual([b"brain", b"salad", b"surgery", b"UNK"], result)
+
+  def testMultiColumn(self):
+    vocabulary_file = os.path.join(self.get_temp_dir(), "three_columns.txt")
+    with open(vocabulary_file, "w") as f:
+      f.write("\n".join(["0\tbrain\t1", "1\tsalad\t5", "2\tsurgery\t6"]) + "\n")
+
+    with self.test_session():
+      default_value = -1
+      key_index = 1
+      value_index = 2
+
+      table = lookup_ops.HashTable(
+          lookup_ops.TextFileInitializer(vocabulary_file, dtypes.string,
+                                         key_index, dtypes.int64, value_index),
+          default_value)
+      table.init.run()
+
+      input_string = constant_op.constant(["brain", "salad", "surgery"])
+      output = table.lookup(input_string)
+
+      result = output.eval()
+      self.assertAllEqual([1, 5, 6], result)
+
+  def testInvalidDataTypeInMultiColumn(self):
+    vocabulary_file = os.path.join(self.get_temp_dir(), "three_columns.txt")
+    with open(vocabulary_file, "w") as f:
+      f.write("\n".join(["0\tbrain\t1", "1\tsalad\t5", "2\tsurgery\t6"]) + "\n")
+
+    with self.test_session():
+      default_value = -1
+      key_index = 2
+      value_index = 1
+      table = lookup_ops.HashTable(
+          lookup_ops.TextFileInitializer(vocabulary_file, dtypes.string,
+                                         key_index, dtypes.int64, value_index),
+          default_value)
+      with self.assertRaisesOpError("is not a valid"):
+        table.init.run()
+
+  def testInvalidDataType(self):
+    vocabulary_file = self._createVocabFile("one_column_3.txt")
+
+    with self.test_session():
+      default_value = "UNK"
+      key_index = lookup_ops.TextFileIndex.WHOLE_LINE
+      value_index = lookup_ops.TextFileIndex.LINE_NUMBER
+
+      with self.assertRaises(ValueError):
+        lookup_ops.HashTable(
+            lookup_ops.TextFileInitializer(vocabulary_file, dtypes.int64,
+                                           key_index, dtypes.string,
+                                           value_index), default_value)
+
+  def testInvalidIndex(self):
+    vocabulary_file = self._createVocabFile("one_column_4.txt")
+    with self.test_session():
+      default_value = -1
+      key_index = 1  # second column of the line
+      value_index = lookup_ops.TextFileIndex.LINE_NUMBER
+      table = lookup_ops.HashTable(
+          lookup_ops.TextFileInitializer(vocabulary_file, dtypes.string,
+                                         key_index, dtypes.int64, value_index),
+          default_value)
+
+      with self.assertRaisesOpError("Invalid number of columns"):
+        table.init.run()
+
+  def testInitializeSameTableWithMultipleNodes(self):
+    vocabulary_file = self._createVocabFile("one_column_5.txt")
+
+    with self.test_session() as sess:
+      shared_name = "shared-one-columm"
+      default_value = -1
+      table1 = lookup_ops.HashTable(
+          lookup_ops.TextFileInitializer(vocabulary_file, dtypes.string,
+                                         lookup_ops.TextFileIndex.WHOLE_LINE,
+                                         dtypes.int64,
+                                         lookup_ops.TextFileIndex.LINE_NUMBER),
+          default_value,
+          shared_name=shared_name)
+      table2 = lookup_ops.HashTable(
+          lookup_ops.TextFileInitializer(vocabulary_file, dtypes.string,
+                                         lookup_ops.TextFileIndex.WHOLE_LINE,
+                                         dtypes.int64,
+                                         lookup_ops.TextFileIndex.LINE_NUMBER),
+          default_value,
+          shared_name=shared_name)
+      table3 = lookup_ops.HashTable(
+          lookup_ops.TextFileInitializer(vocabulary_file, dtypes.string,
+                                         lookup_ops.TextFileIndex.WHOLE_LINE,
+                                         dtypes.int64,
+                                         lookup_ops.TextFileIndex.LINE_NUMBER),
+          default_value,
+          shared_name=shared_name)
+
+      lookup_ops.tables_initializer().run()
+
+      input_string = constant_op.constant(["brain", "salad", "tank"])
+
+      output1 = table1.lookup(input_string)
+      output2 = table2.lookup(input_string)
+      output3 = table3.lookup(input_string)
+
+      out1, out2, out3 = sess.run([output1, output2, output3])
+      self.assertAllEqual([0, 1, -1], out1)
+      self.assertAllEqual([0, 1, -1], out2)
+      self.assertAllEqual([0, 1, -1], out3)
+
+  def testInitializeTableWithNoFilename(self):
+    with self.test_session():
+      default_value = -1
+      with self.assertRaises(ValueError):
+        lookup_ops.HashTable(
+            lookup_ops.TextFileInitializer(
+                "", dtypes.string, lookup_ops.TextFileIndex.WHOLE_LINE,
+                dtypes.int64, lookup_ops.TextFileIndex.LINE_NUMBER),
+            default_value)
+
+  def testInitializeWithVocabSize(self):
+    with self.test_session():
+      default_value = -1
+      vocab_size = 3
+      vocabulary_file1 = self._createVocabFile("one_column6.txt")
+      table1 = lookup_ops.HashTable(
+          lookup_ops.TextFileInitializer(
+              vocabulary_file1,
+              dtypes.string,
+              lookup_ops.TextFileIndex.WHOLE_LINE,
+              dtypes.int64,
+              lookup_ops.TextFileIndex.LINE_NUMBER,
+              vocab_size=vocab_size), default_value)
+
+      # Initialize from file.
+      table1.init.run()
+      self.assertEquals(vocab_size, table1.size().eval())
+
+      vocabulary_file2 = self._createVocabFile("one_column7.txt")
+      vocab_size = 5
+      table2 = lookup_ops.HashTable(
+          lookup_ops.TextFileInitializer(
+              vocabulary_file2,
+              dtypes.string,
+              lookup_ops.TextFileIndex.WHOLE_LINE,
+              dtypes.int64,
+              lookup_ops.TextFileIndex.LINE_NUMBER,
+              vocab_size=vocab_size), default_value)
+      with self.assertRaisesOpError("Invalid vocab_size"):
+        table2.init.run()
+
+      vocab_size = 1
+      vocabulary_file3 = self._createVocabFile("one_column3.txt")
+      table3 = lookup_ops.HashTable(
+          lookup_ops.TextFileInitializer(
+              vocabulary_file3,
+              dtypes.string,
+              lookup_ops.TextFileIndex.WHOLE_LINE,
+              dtypes.int64,
+              lookup_ops.TextFileIndex.LINE_NUMBER,
+              vocab_size=vocab_size), default_value)
+
+      # Smaller vocab size reads only vocab_size records.
+      table3.init.run()
+      self.assertEquals(vocab_size, table3.size().eval())
+
+  def testFeedVocabularyName(self):
+    vocabulary_file = self._createVocabFile("feed_vocabulary.txt")
+
+    with self.test_session():
+      default_value = -1
+      table = lookup_ops.HashTable(
+          lookup_ops.TextFileInitializer(
+              "old_file.txt", dtypes.string,
+              lookup_ops.TextFileIndex.WHOLE_LINE, dtypes.int64,
+              lookup_ops.TextFileIndex.LINE_NUMBER), default_value)
+
+      # Initialize with non existing file (old_file.txt) should fail.
+      # TODO(yleon): Update message, which might change per FileSystem.
+      with self.assertRaisesOpError("old_file.txt"):
+        table.init.run()
+
+      # Initialize the model feeding the vocabulary file.
+      filenames = ops.get_collection(ops.GraphKeys.ASSET_FILEPATHS)
+      table.init.run(feed_dict={filenames[0]: vocabulary_file})
+
+      input_string = constant_op.constant(["brain", "salad", "tank"])
+      output = table.lookup(input_string)
+
+      result = output.eval()
+      self.assertAllEqual([0, 1, -1], result)
+
+  def testInvalidFilenames(self):
+    vocabulary_file = self._createVocabFile("filename_shape.txt")
+
+    with self.test_session():
+      default_value = -1
+
+      # Invalid data type
+      other_type = constant_op.constant(1)
+      with self.assertRaises(ValueError):
+        lookup_ops.HashTable(
+            lookup_ops.TextFileInitializer(
+                other_type, dtypes.string, lookup_ops.TextFileIndex.WHOLE_LINE,
+                dtypes.int64, lookup_ops.TextFileIndex.LINE_NUMBER),
+            default_value)
+
+      # Non-scalar filename
+      filenames = constant_op.constant([vocabulary_file, vocabulary_file])
+      with self.assertRaises(ValueError):
+        lookup_ops.HashTable(
+            lookup_ops.TextFileInitializer(
+                filenames, dtypes.string, lookup_ops.TextFileIndex.WHOLE_LINE,
+                dtypes.int64, lookup_ops.TextFileIndex.LINE_NUMBER),
+            default_value)
+
+  def testIdToStringTable(self):
+    vocab_file = self._createVocabFile("feat_to_id_1.txt")
+    with self.test_session():
+      default_value = "UNK"
+      vocab_size = 3
+      table = lookup_ops.HashTable(
+          lookup_ops.TextFileStringTableInitializer(
+              vocab_file, vocab_size=vocab_size), default_value)
+
+      table.init.run()
+
+      input_values = constant_op.constant([0, 1, 2, 3], dtypes.int64)
+
+      out = table.lookup(input_values)
+      self.assertAllEqual([b"brain", b"salad", b"surgery", b"UNK"], out.eval())
+      self.assertEquals(vocab_size, table.size().eval())
+
+  def testStringToIdTable(self):
+    vocab_file = self._createVocabFile("feat_to_id_2.txt")
+    with self.test_session():
+      default_value = -1
+      vocab_size = 3
+      table = lookup_ops.HashTable(
+          lookup_ops.TextFileIdTableInitializer(
+              vocab_file, vocab_size=vocab_size), default_value)
+      table.init.run()
+
+      input_string = constant_op.constant(["brain", "salad", "surgery", "UNK"])
+
+      out = table.lookup(input_string)
+      self.assertAllEqual([0, 1, 2, -1], out.eval())
+      self.assertEquals(vocab_size, table.size().eval())
+
+  def testInt64ToIdTable(self):
+    vocab_file = self._createVocabFile(
+        "feat_to_id_3.txt", values=("42", "1", "-1000"))
+    with self.test_session():
+      default_value = -1
+      vocab_size = 3
+      table = lookup_ops.HashTable(
+          lookup_ops.TextFileIdTableInitializer(
+              vocab_file, vocab_size=vocab_size, key_dtype=dtypes.int64),
+          default_value)
+      table.init.run()
+
+      out = table.lookup(
+          constant_op.constant((42, 1, -1000, 11), dtype=dtypes.int64))
+      self.assertAllEqual((0, 1, 2, -1), out.eval())
+      self.assertEquals(vocab_size, table.size().eval())
+
+
+class IdTableWithHashBucketsTest(test.TestCase):
+
+  def _createVocabFile(self, basename, values=("brain", "salad", "surgery")):
+    vocabulary_file = os.path.join(self.get_temp_dir(), basename)
+    with open(vocabulary_file, "w") as f:
+      f.write("\n".join(values) + "\n")
+    return vocabulary_file
+
+  def testStringIdTableWithHashBuckets(self):
+    vocab_file = self._createVocabFile("feat_to_id_1.txt")
+    with self.test_session():
+      default_value = -1
+      vocab_size = 3
+      oov_buckets = 1
+      table = lookup_ops.IdTableWithHashBuckets(
+          lookup_ops.HashTable(
+              lookup_ops.TextFileIdTableInitializer(
+                  vocab_file, vocab_size=vocab_size), default_value),
+          oov_buckets)
+
+      table.init.run()
+
+      input_string = constant_op.constant(["brain", "salad", "surgery", "UNK"])
+
+      out = table.lookup(input_string)
+      self.assertAllEqual([0, 1, 2, 3], out.eval())
+      self.assertEquals(vocab_size + oov_buckets, table.size().eval())
+
+  def testInt32IdTableWithHashBuckets(self):
+    vocab_file = self._createVocabFile("feat_to_id_2.txt", ("42", "1", "-1000"))
+    with self.test_session():
+      default_value = -1
+      vocab_size = 3
+      oov_buckets = 1
+      table = lookup_ops.IdTableWithHashBuckets(
+          lookup_ops.HashTable(
+              lookup_ops.TextFileIdTableInitializer(
+                  vocab_file, vocab_size=vocab_size, key_dtype=dtypes.int64),
+              default_value),
+          oov_buckets,
+          key_dtype=dtypes.int32)
+
+      table.init.run()
+
+      values = constant_op.constant((42, 1, -1000, 11), dtype=dtypes.int32)
+
+      out = table.lookup(values)
+      self.assertAllEqual([0, 1, 2, 3], out.eval())
+      self.assertEquals(vocab_size + oov_buckets, table.size().eval())
+
+  def testInt64IdTableWithHashBuckets(self):
+    vocab_file = self._createVocabFile("feat_to_id_3.txt", ("42", "1", "-1000"))
+    with self.test_session():
+      default_value = -1
+      vocab_size = 3
+      oov_buckets = 1
+      table = lookup_ops.IdTableWithHashBuckets(
+          lookup_ops.HashTable(
+              lookup_ops.TextFileIdTableInitializer(
+                  vocab_file, vocab_size=vocab_size, key_dtype=dtypes.int64),
+              default_value), oov_buckets)
+
+      table.init.run()
+
+      values = constant_op.constant((42, 1, -1000, 11), dtype=dtypes.int64)
+
+      out = table.lookup(values)
+      self.assertAllEqual([0, 1, 2, 3], out.eval())
+      self.assertEquals(vocab_size + oov_buckets, table.size().eval())
+
+  def testStringIdTableWithOnlyHashBucket(self):
+    with self.test_session():
+      oov_buckets = 5
+
+      # Set a table that only uses hash buckets, for each input value returns
+      # an id calculated by fingerprint("input") mod oov_buckets.
+      table = lookup_ops.IdTableWithHashBuckets(None, oov_buckets)
+      table.init.run()
+
+      values = constant_op.constant(("brain", "salad", "surgery"))
+
+      out = table.lookup(values)
+      self.assertAllEqual(
+          [
+              3,  # fingerprint("brain") mod 5.
+              1,  # fingerprint("salad") mod 5.
+              4  # fingerprint("surgery") mod 5
+          ],
+          out.eval())
+      self.assertEquals(oov_buckets, table.size().eval())
+
+  def testInt32IdTableWithOnlyHashBucket(self):
+    with self.test_session():
+      oov_buckets = 5
+
+      # Set a table that only uses hash buckets, for each input value returns
+      # an id calculated by fingerprint("input") mod oov_buckets.
+      table = lookup_ops.IdTableWithHashBuckets(
+          None, oov_buckets, key_dtype=dtypes.int32)
+      table.init.run()
+
+      input_string = constant_op.constant([42, 1, -1000], dtype=dtypes.int32)
+
+      out = table.lookup(input_string)
+      self.assertAllEqual(
+          [
+              1,  # fingerprint("42") mod 5.
+              4,  # fingerprint("1") mod 5.
+              2  # fingerprint("-1000") mod 5
+          ],
+          out.eval())
+      self.assertEquals(oov_buckets, table.size().eval())
+
+  def testFloat64IdTableWithOnlyHashBucket(self):
+    with self.test_session():
+      with self.assertRaisesRegexp(TypeError, "Invalid key_dtype"):
+        lookup_ops.IdTableWithHashBuckets(
+            None, num_oov_buckets=5, key_dtype=dtypes.float64)
+
+  def testBoolIdTableWithOnlyHashBucket(self):
+    with self.test_session():
+      with self.assertRaisesRegexp(TypeError, "Invalid key_dtype"):
+        lookup_ops.IdTableWithHashBuckets(
+            None, num_oov_buckets=5, key_dtype=dtypes.bool)
+
+  def testIdTableWithHashBucketsWithMultipleInitializers(self):
+    vocab_file = self._createVocabFile("feat_to_id_4.txt")
+    with self.test_session() as sess:
+      default_value = -1
+      vocab_size = 3
+      oov_buckets = 3
+
+      vocab_table = lookup_ops.HashTable(
+          lookup_ops.TextFileIdTableInitializer(
+              vocab_file, vocab_size=vocab_size), default_value)
+      table1 = lookup_ops.IdTableWithHashBuckets(
+          vocab_table,
+          oov_buckets,
+          hasher_spec=lookup_ops.FastHashSpec,
+          name="table1")
+
+      table2 = lookup_ops.IdTableWithHashBuckets(
+          vocab_table,
+          oov_buckets,
+          hasher_spec=lookup_ops.StrongHashSpec((1, 2)),
+          name="table2")
+
+      lookup_ops.tables_initializer().run()
+
+      input_string = constant_op.constant(
+          ["fruit", "brain", "salad", "surgery", "UNK"])
+
+      out1 = table1.lookup(input_string)
+      out2 = table2.lookup(input_string)
+
+      out1, out2 = sess.run([out1, out2])
+      self.assertAllEqual([5, 0, 1, 2, 5], out1)
+      self.assertAllEqual([5, 0, 1, 2, 3], out2)
+      self.assertEquals(vocab_size + oov_buckets, table1.size().eval())
+      self.assertEquals(vocab_size + oov_buckets, table2.size().eval())
+      test_util.assert_ops_in_graph({
+          "table1_Lookup/hash_bucket": "StringToHashBucketFast",
+          "table2_Lookup/hash_bucket": "StringToHashBucketStrong",
+      }, sess.graph)
+
+  def testIdTableWithHashBucketsInitializationAcrossSessions(self):
+    vocab_file = self._createVocabFile("feat_to_id_5.txt")
+    shared_name = "across-sessions"
+    with self.test_session():
+      default_value = -1
+      vocab_size = 3
+      oov_buckets = 1
+      table1 = lookup_ops.IdTableWithHashBuckets(
+          lookup_ops.HashTable(
+              lookup_ops.TextFileIdTableInitializer(
+                  vocab_file, vocab_size=vocab_size),
+              default_value,
+              shared_name=shared_name), oov_buckets)
+
+      table1.init.run()
+
+      input_string_1 = constant_op.constant(
+          ["brain", "salad", "surgery", "UNK"])
+
+      out1 = table1.lookup(input_string_1)
+
+      self.assertAllEqual([0, 1, 2, 3], out1.eval())
+      self.assertEquals(vocab_size + oov_buckets, table1.size().eval())
+
+    with self.test_session():
+      default_value = -1
+      vocab_size = 3
+      oov_buckets = 1
+
+      # Underlying lookup table already initialized in previous session.
+      # No need to call table2.init.run()
+      table2 = lookup_ops.IdTableWithHashBuckets(
+          lookup_ops.HashTable(
+              lookup_ops.TextFileIdTableInitializer(
+                  vocab_file, vocab_size=vocab_size),
+              default_value,
+              shared_name=shared_name), oov_buckets)
+
+      input_string_2 = constant_op.constant(["fruit", "salad", "UNK"])
+
+      out2 = table2.lookup(input_string_2)
+
+      self.assertAllEqual([3, 1, 3], out2.eval())
+      self.assertEquals(vocab_size + oov_buckets, table2.size().eval())
+
+  def testIdTableWithHashBucketsWithMultipleInitializersDifferentDefault(self):
+    vocab_file = self._createVocabFile("feat_to_id_6.txt")
+    with self.test_session() as sess:
+      default_value1 = -1
+      vocab_size = 3
+      oov_buckets = 0
+      table1 = lookup_ops.IdTableWithHashBuckets(
+          lookup_ops.HashTable(
+              lookup_ops.TextFileIdTableInitializer(
+                  vocab_file, vocab_size=vocab_size), default_value1),
+          oov_buckets)
+
+      default_value2 = -2
+      table2 = lookup_ops.IdTableWithHashBuckets(
+          lookup_ops.HashTable(
+              lookup_ops.TextFileIdTableInitializer(
+                  vocab_file, vocab_size=vocab_size), default_value2),
+          oov_buckets)
+
+      lookup_ops.tables_initializer().run()
+
+      input_string_1 = constant_op.constant(
+          ["brain", "salad", "surgery", "UNK"])
+      input_string_2 = constant_op.constant(["fruit", "salad", "UNK"])
+
+      out1 = table1.lookup(input_string_1)
+      out2 = table2.lookup(input_string_2)
+
+      out1, out2 = sess.run([out1, out2])
+      self.assertAllEqual([0, 1, 2, -1], out1)
+      self.assertAllEqual([-2, 1, -2], out2)
+      self.assertEquals(vocab_size + oov_buckets, table1.size().eval())
+      self.assertEquals(vocab_size + oov_buckets, table2.size().eval())
+
+  def testSparseTensor(self):
+    vocab_file = self._createVocabFile("feat_to_id_7.txt")
+    input_indices = [[0, 0], [0, 1], [2, 0], [2, 2], [3, 0]]
+    input_shape = [4, 4]
+    with self.test_session() as sess:
+      sp_features = sparse_tensor.SparseTensor(
+          constant_op.constant(input_indices, dtypes.int64),
+          constant_op.constant(["brain", "salad", "brain", "surgery", "tarkus"],
+                               dtypes.string),
+          constant_op.constant(input_shape, dtypes.int64))
+
+      table = lookup_ops.IdTableWithHashBuckets(
+          lookup_ops.HashTable(
+              lookup_ops.TextFileIdTableInitializer(vocab_file, vocab_size=3),
+              -1), 1)
+      table.init.run()
+
+      sp_ids = table.lookup(sp_features)
+
+      self.assertAllEqual([5], sp_ids.values._shape_as_list())
+
+      sp_ids_ind, sp_ids_val, sp_ids_shape = sess.run(
+          [sp_ids.indices, sp_ids.values, sp_ids.dense_shape])
+
+      self.assertAllEqual(input_indices, sp_ids_ind)
+      self.assertAllEqual([0, 1, 0, 2, 3], sp_ids_val)
+      self.assertAllEqual(input_shape, sp_ids_shape)
+
+  def testInt32SparseTensor(self):
+    input_indices = [[0, 0], [0, 1], [2, 0], [2, 2], [3, 0]]
+    input_shape = [4, 4]
+    with self.test_session() as sess:
+      sp_features = sparse_tensor.SparseTensor(
+          constant_op.constant(input_indices, dtypes.int64),
+          constant_op.constant([42, 1, 42, -1000, 11], dtypes.int32),
+          constant_op.constant(input_shape, dtypes.int64))
+
+      table = lookup_ops.IdTableWithHashBuckets(
+          lookup_ops.HashTable(
+              lookup_ops.KeyValueTensorInitializer(
+                  (42, 1, -1000), (0, 1, 2), dtypes.int64, dtypes.int64), -1),
+          1,
+          key_dtype=dtypes.int32)
+      table.init.run()
+
+      sp_ids = table.lookup(sp_features)
+
+      self.assertAllEqual([5], sp_ids.values._shape_as_list())
+
+      sp_ids_ind, sp_ids_val, sp_ids_shape = sess.run(
+          [sp_ids.indices, sp_ids.values, sp_ids.dense_shape])
+
+      self.assertAllEqual(input_indices, sp_ids_ind)
+      self.assertAllEqual([0, 1, 0, 2, 3], sp_ids_val)
+      self.assertAllEqual(input_shape, sp_ids_shape)
+
+  def testInt64SparseTensor(self):
+    input_indices = [[0, 0], [0, 1], [2, 0], [2, 2], [3, 0]]
+    input_shape = [4, 4]
+    with self.test_session() as sess:
+      sp_features = sparse_tensor.SparseTensor(
+          constant_op.constant(input_indices, dtypes.int64),
+          constant_op.constant([42, 1, 42, -1000, 11], dtypes.int64),
+          constant_op.constant(input_shape, dtypes.int64))
+
+      table = lookup_ops.IdTableWithHashBuckets(
+          lookup_ops.HashTable(
+              lookup_ops.KeyValueTensorInitializer(
+                  (42, 1, -1000), (0, 1, 2), dtypes.int64, dtypes.int64), -1),
+          1,
+          key_dtype=dtypes.int64)
+      table.init.run()
+
+      sp_ids = table.lookup(sp_features)
+
+      self.assertAllEqual([5], sp_ids.values._shape_as_list())
+
+      sp_ids_ind, sp_ids_val, sp_ids_shape = sess.run(
+          [sp_ids.indices, sp_ids.values, sp_ids.dense_shape])
+
+      self.assertAllEqual(input_indices, sp_ids_ind)
+      self.assertAllEqual([0, 1, 0, 2, 3], sp_ids_val)
+      self.assertAllEqual(input_shape, sp_ids_shape)
+
+  def testIdTableWithHashBucketsWithInvalidHashers(self):
+    vocab_file = self._createVocabFile("feat_to_id_4.txt")
+    with self.test_session():
+      default_value = -1
+      vocab_size = 3
+      oov_buckets = 1
+      lookup_table = lookup_ops.HashTable(
+          lookup_ops.TextFileIdTableInitializer(
+              vocab_file, vocab_size=vocab_size), default_value)
+
+      with self.assertRaises(TypeError):
+        lookup_ops.IdTableWithHashBuckets(
+            lookup_table, oov_buckets, hasher_spec=1)
+
+      table = lookup_ops.IdTableWithHashBuckets(
+          lookup_table,
+          oov_buckets,
+          hasher_spec=lookup_ops.HasherSpec("my-awesome-hash", None))
+
+      input_string = constant_op.constant(["brain", "salad", "surgery", "UNK"])
+
+      with self.assertRaises(ValueError):
+        table.lookup(input_string)
+
+      with self.assertRaises(ValueError):
+        table = lookup_ops.IdTableWithHashBuckets(
+            lookup_table,
+            oov_buckets,
+            hasher_spec=lookup_ops.StrongHashSpec([]))
+
+      with self.assertRaises(ValueError):
+        table = lookup_ops.IdTableWithHashBuckets(
+            lookup_table,
+            oov_buckets,
+            hasher_spec=lookup_ops.StrongHashSpec([1, 2, 3]))
+
+      with self.assertRaises(TypeError):
+        table = lookup_ops.IdTableWithHashBuckets(
+            lookup_table,
+            oov_buckets,
+            hasher_spec=lookup_ops.StrongHashSpec([None, 2]))
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/python/kernel_tests/matrix_inverse_op_test.py b/tensorflow/python/kernel_tests/matrix_inverse_op_test.py
index 263f90c4f1f2cb16228e38f516e7818e4b4ed1fc..c4418dfd43a66e3b1b30fa0ddd07e8ea7933e727 100644
--- a/tensorflow/python/kernel_tests/matrix_inverse_op_test.py
+++ b/tensorflow/python/kernel_tests/matrix_inverse_op_test.py
@@ -32,7 +32,7 @@ class InverseOpTest(test.TestCase):
     for np_type in [np.float32, np.float64]:
       for adjoint in False, True:
         y = x.astype(np_type)
-        with self.test_session():
+        with self.test_session(use_gpu=True):
           # Verify that x^{-1} * x == Identity matrix.
           inv = linalg_ops.matrix_inverse(y, adjoint=adjoint)
           tf_ans = math_ops.matmul(inv, y, adjoint_b=adjoint)
@@ -86,8 +86,8 @@ class InverseOpTest(test.TestCase):
     with self.test_session():
       with self.assertRaisesOpError("Input is not invertible."):
         # All rows of the matrix below add to zero.
-        tensor3 = constant_op.constant(
-            [[1., 0., -1.], [-1., 1., 0.], [0., -1., 1.]])
+        tensor3 = constant_op.constant([[1., 0., -1.], [-1., 1., 0.],
+                                        [0., -1., 1.]])
         linalg_ops.matrix_inverse(tensor3).eval()
 
   def testEmpty(self):
diff --git a/tensorflow/python/kernel_tests/py_func_test.py b/tensorflow/python/kernel_tests/py_func_test.py
index c7fc7dd5826e3a5c27b3e1e5546b8645582a7d3f..e098cf3ff9ca88bfee7746b2916e8dd947f664f2 100644
--- a/tensorflow/python/kernel_tests/py_func_test.py
+++ b/tensorflow/python/kernel_tests/py_func_test.py
@@ -186,9 +186,9 @@ class PyOpTest(test.TestCase):
 
       def bad():
         # Non-string python objects aren't supported.
-        return dtypes.float32
+        return {"foo": dtypes.float32}
 
-      z, = script_ops.py_func(bad, [], [dtypes.float64])
+      z, = script_ops.py_func(bad, [], [dtypes.int64])
 
       with self.assertRaisesRegexp(errors.UnimplementedError,
                                    "Unsupported object type"):
diff --git a/tensorflow/python/kernel_tests/shape_ops_test.py b/tensorflow/python/kernel_tests/shape_ops_test.py
index dbc52acb79f1f0875ae58fbc2bb1c203cf39acfa..97d61d52af5ccbf51ceb3ab6934ebe14c1165063 100644
--- a/tensorflow/python/kernel_tests/shape_ops_test.py
+++ b/tensorflow/python/kernel_tests/shape_ops_test.py
@@ -147,6 +147,14 @@ class ShapeOpsTest(test.TestCase):
     self._testAll(np.random.randn(2, 3, 5, 7, 11))
     self._testAll(np.random.randn(2, 3, 5, 7, 11, 13))
 
+  def testBool(self):
+    self._testAll(np.random.choice((False, True), size=(2,)))
+    self._testAll(np.random.choice((False, True), size=(2, 3)))
+    self._testAll(np.random.choice((False, True), size=(2, 3, 5)))
+    self._testAll(np.random.choice((False, True), size=(2, 3, 5, 7)))
+    self._testAll(np.random.choice((False, True), size=(2, 3, 5, 7, 11)))
+    self._testAll(np.random.choice((False, True), size=(2, 3, 5, 7, 11, 13)))
+
   # Disabled because it takes too long to run, but manually verified
   # as passing at time of writing.
   def _test64BitOutput(self):
@@ -197,12 +205,38 @@ class ShapeOpsTest(test.TestCase):
     self._compareExpandDimsAll(np.zeros([2, 3, 5]), -3)
     self._compareExpandDimsAll(np.zeros([2, 3, 5]), -4)
 
+  def testExpandDimsBool(self):
+    choice = lambda s: np.random.choice((False, True), size=s)
+    self._compareExpandDimsAll(choice([2]), 0)
+    self._compareExpandDimsAll(choice([2]), 1)
+    self._compareExpandDimsAll(choice([2]), -1)
+
+    self._compareExpandDimsAll(choice([2, 3]), 0)
+    self._compareExpandDimsAll(choice([2, 3]), 1)
+    self._compareExpandDimsAll(choice([2, 3]), 2)
+    self._compareExpandDimsAll(choice([2, 3]), -1)
+    self._compareExpandDimsAll(choice([2, 3]), -2)
+
+    self._compareExpandDimsAll(choice([2, 3, 5]), 0)
+    self._compareExpandDimsAll(choice([2, 3, 5]), 1)
+    self._compareExpandDimsAll(choice([2, 3, 5]), 2)
+    self._compareExpandDimsAll(choice([2, 3, 5]), 3)
+
+    self._compareExpandDimsAll(choice([2, 3, 5]), -1)
+    self._compareExpandDimsAll(choice([2, 3, 5]), -2)
+    self._compareExpandDimsAll(choice([2, 3, 5]), -3)
+    self._compareExpandDimsAll(choice([2, 3, 5]), -4)
+
   def testExpandDimsErrors(self):
     with self.test_session():
       self.assertRaises(ValueError, array_ops.expand_dims,
                         np.zeros([2, 3, 5]), -5)
+      self.assertRaises(ValueError, array_ops.expand_dims,
+                        [False, True, True], -5)
       self.assertRaises(ValueError, array_ops.expand_dims,
                         np.zeros([2, 3, 5]), 4)
+      self.assertRaises(ValueError, array_ops.expand_dims,
+                        [False, True, True], 4)
 
   def testExpandDimsGradient(self):
     with self.test_session():
@@ -220,6 +254,10 @@ class ShapeOpsTest(test.TestCase):
       self.assertAllEqual([7], array_ops.expand_dims(inp, 0).eval())
       self.assertAllEqual([7], array_ops.expand_dims(inp, -1).eval())
 
+      inp = constant_op.constant(True)
+      self.assertAllEqual([True], array_ops.expand_dims(inp, 0).eval())
+      self.assertAllEqual([True], array_ops.expand_dims(inp, -1).eval())
+
   def _compareSqueeze(self, x, squeeze_dims, use_gpu):
     with self.test_session(use_gpu=use_gpu):
       if squeeze_dims:
@@ -250,6 +288,18 @@ class ShapeOpsTest(test.TestCase):
     # Squeeze on both ends.
     self._compareSqueezeAll(np.zeros([1, 2, 1, 3, 1]))
 
+  def testSqueezeBool(self):
+    choice = lambda s: np.random.choice((False, True), size=s)
+    # Nothing to squeeze.
+    self._compareSqueezeAll(choice([2]))
+    self._compareSqueezeAll(choice([2, 3]))
+
+    # Squeeze the middle element away.
+    self._compareSqueezeAll(choice([2, 1, 2]))
+
+    # Squeeze on both ends.
+    self._compareSqueezeAll(choice([1, 2, 1, 3, 1]))
+
   def testSqueezeSpecificDimension(self):
     # Positive squeeze dim index.
     self._compareSqueezeAll(np.zeros([1, 2, 1, 3, 1]), [0])
@@ -261,6 +311,18 @@ class ShapeOpsTest(test.TestCase):
     self._compareSqueezeAll(np.zeros([1, 2, 1, 3, 1]), [-3, -5])
     self._compareSqueezeAll(np.zeros([1, 2, 1, 3, 1]), [-3, -5, -1])
 
+  def testSqueezeSpecificDimensionBool(self):
+    choice = lambda s: np.random.choice((False, True), size=s)
+    # Positive squeeze dim index.
+    self._compareSqueezeAll(choice([1, 2, 1, 3, 1]), [0])
+    self._compareSqueezeAll(choice([1, 2, 1, 3, 1]), [2, 4])
+    self._compareSqueezeAll(choice([1, 2, 1, 3, 1]), [0, 4, 2])
+
+    # Negative squeeze dim index.
+    self._compareSqueezeAll(choice([1, 2, 1, 3, 1]), [-1])
+    self._compareSqueezeAll(choice([1, 2, 1, 3, 1]), [-3, -5])
+    self._compareSqueezeAll(choice([1, 2, 1, 3, 1]), [-3, -5, -1])
+
   def testSqueezeAllOnes(self):
     # Numpy squeezes a 1 element tensor into a zero dimensional tensor.
     # Verify that we do the same.
@@ -271,6 +333,16 @@ class ShapeOpsTest(test.TestCase):
         tf_ans = tensor.eval()
         self.assertEqual(np.shape(1), tf_ans.shape)
 
+  def testSqueezeAllOnesBool(self):
+    # Numpy squeezes a 1 element tensor into a zero dimensional tensor.
+    # Verify that we do the same.
+    for use_gpu in [False, True]:
+      with self.test_session(use_gpu=use_gpu):
+        tensor = array_ops.squeeze([[[False]]], [])
+        self.assertEqual(np.shape(1), tensor.get_shape())
+        tf_ans = tensor.eval()
+        self.assertEqual(np.shape(1), tf_ans.shape)
+
   def testSqueezeOnlyOnes(self):
     for use_gpu in [False, True]:
       with self.test_session(use_gpu=use_gpu):
diff --git a/tensorflow/python/kernel_tests/softplus_op_test.py b/tensorflow/python/kernel_tests/softplus_op_test.py
index f70f60c0f5ef6d0d238412ab8c0bcbba577b180c..b8e7c50a378317636fe184abc411483c96c6ebbf 100644
--- a/tensorflow/python/kernel_tests/softplus_op_test.py
+++ b/tensorflow/python/kernel_tests/softplus_op_test.py
@@ -22,6 +22,7 @@ import numpy as np
 
 from tensorflow.python.framework import constant_op
 from tensorflow.python.ops import gradient_checker
+from tensorflow.python.ops import gradients_impl
 from tensorflow.python.ops import nn_ops
 import tensorflow.python.ops.nn_grad  # pylint: disable=unused-import
 from tensorflow.python.platform import test
@@ -85,6 +86,45 @@ class SoftplusTest(test.TestCase):
     print("softplus (float) gradient err = ", err)
     self.assertLess(err, 1e-4)
 
+  def testGradGrad(self):
+    with self.test_session():
+      x = constant_op.constant(
+          [-0.9, -0.7, -0.5, -0.3, -0.1, 0.1, 0.3, 0.5, 0.7, 0.9],
+          shape=[2, 5],
+          name="x")
+      y = nn_ops.softplus(x, name="softplus")
+      (grad,) = gradients_impl.gradients(y, x)
+      x_init = np.asarray(
+          [[-0.9, -0.7, -0.5, -0.3, -0.1], [0.1, 0.3, 0.5, 0.7, 0.9]],
+          dtype=np.float32,
+          order="F")
+      err = gradient_checker.compute_gradient_error(
+          x, [2, 5], grad, [2, 5], x_init_value=x_init)
+    print("softplus (float) gradient of gradient err = ", err)
+    self.assertLess(err, 5e-5)
+
+  def testGradGradGrad(self):
+    with self.test_session():
+      x = constant_op.constant(
+          [-0.9, -0.7, -0.5, -0.3, -0.1, 0.1, 0.3, 0.5, 0.7, 0.9],
+          shape=[2, 5],
+          name="x")
+      y = nn_ops.softplus(x, name="softplus")
+      (grad,) = gradients_impl.gradients(y, x)
+      (grad_grad,) = gradients_impl.gradients(grad, x)
+      x_init = np.asarray(
+          [[-0.9, -0.7, -0.5, -0.3, -0.1], [0.1, 0.3, 0.5, 0.7, 0.9]],
+          dtype=np.float32,
+          order="F")
+      err = gradient_checker.compute_gradient_error(
+          x, [2, 5], grad_grad, [2, 5], x_init_value=x_init)
+    print("softplus (float) third-order gradient err = ", err)
+    self.assertLess(err, 5e-5)
+
+  def testWarnInts(self):
+    # Running the op triggers address sanitizer errors, so we just make it
+    nn_ops.softplus(constant_op.constant(7))
+
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/python/kernel_tests/softsign_op_test.py b/tensorflow/python/kernel_tests/softsign_op_test.py
index 5fd5253c092d71f66d31d1411d306004a5ff1666..371f86ff151f35764e5f976aba8301d250e199a9 100644
--- a/tensorflow/python/kernel_tests/softsign_op_test.py
+++ b/tensorflow/python/kernel_tests/softsign_op_test.py
@@ -65,6 +65,12 @@ class SoftsignTest(test.TestCase):
     print("softsign (float) gradient err = ", err)
     self.assertLess(err, 1e-4)
 
+  def testWarnInts(self):
+    # NOTE(irving): Actually I don't know how to intercept the warning, but
+    # let's make sure it runs.  I promised I've looked, and there was a warning.
+    with self.test_session():
+      nn_ops.softsign(constant_op.constant(7)).eval()
+
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/python/kernel_tests/sparse_add_op_test.py b/tensorflow/python/kernel_tests/sparse_add_op_test.py
index 874dcbabf10911fff5dfa1257b5310b2b60494a9..555c16194e10105eb7c28344f688ad643d3aae4b 100644
--- a/tensorflow/python/kernel_tests/sparse_add_op_test.py
+++ b/tensorflow/python/kernel_tests/sparse_add_op_test.py
@@ -88,6 +88,7 @@ class SparseAddTest(test.TestCase):
       for sp_a in (self._SparseTensorValue_3x3(), self._SparseTensor_3x3()):
         for sp_b in (self._SparseTensorValue_3x3(), self._SparseTensor_3x3()):
           sp_sum = sparse_ops.sparse_add(sp_a, sp_b)
+          self.assertAllEqual((3, 3), sp_sum.get_shape())
 
           sum_out = sess.run(sp_sum)
 
diff --git a/tensorflow/python/kernel_tests/sparse_cross_op_test.py b/tensorflow/python/kernel_tests/sparse_cross_op_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..3d09badf27e621ec244730eb6c1f6b637546219f
--- /dev/null
+++ b/tensorflow/python/kernel_tests/sparse_cross_op_test.py
@@ -0,0 +1,398 @@
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for sparse_cross_op."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy
+
+from tensorflow.python.client import session
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import sparse_tensor
+from tensorflow.python.ops import sparse_ops
+from tensorflow.python.platform import test
+
+
+class SparseCrossOpTest(test.TestCase):
+
+  def test_simple(self):
+    """Tests a simple scenario."""
+    op = sparse_ops._sparse_cross([
+        self._sparse_tensor([['batch1-FC1-F1'],
+                             ['batch2-FC1-F1', 'batch2-FC1-F2']]),
+        self._sparse_tensor([['batch1-FC2-F1'],
+                             ['batch2-FC2-F1', 'batch2-FC2-F2']])
+    ])
+    expected_out = self._sparse_tensor([['batch1-FC1-F1_X_batch1-FC2-F1'], [
+        'batch2-FC1-F1_X_batch2-FC2-F1', 'batch2-FC1-F1_X_batch2-FC2-F2',
+        'batch2-FC1-F2_X_batch2-FC2-F1', 'batch2-FC1-F2_X_batch2-FC2-F2'
+    ]])
+    with self.test_session() as sess:
+      self._assert_sparse_tensor_equals(expected_out, sess.run(op))
+
+  def test_dense(self):
+    """Tests only dense inputs."""
+    op = sparse_ops._sparse_cross([
+        constant_op.constant([['batch1-FC1-F1', 'batch1-FC1-F2'],
+                              ['batch2-FC1-F1', 'batch2-FC1-F2']],
+                             dtypes.string),
+        constant_op.constant([['batch1-FC2-F1', 'batch1-FC2-F2'],
+                              ['batch2-FC2-F1', 'batch2-FC2-F2']],
+                             dtypes.string),
+    ])
+    expected_out = self._sparse_tensor([[
+        'batch1-FC1-F1_X_batch1-FC2-F1', 'batch1-FC1-F1_X_batch1-FC2-F2',
+        'batch1-FC1-F2_X_batch1-FC2-F1', 'batch1-FC1-F2_X_batch1-FC2-F2'
+    ], [
+        'batch2-FC1-F1_X_batch2-FC2-F1', 'batch2-FC1-F1_X_batch2-FC2-F2',
+        'batch2-FC1-F2_X_batch2-FC2-F1', 'batch2-FC1-F2_X_batch2-FC2-F2'
+    ]])
+    with self.test_session() as sess:
+      self._assert_sparse_tensor_equals(expected_out, sess.run(op))
+
+  def test_integer_mixed_string_sparse(self):
+    """Tests mixed type."""
+    op = sparse_ops._sparse_cross([
+        self._sparse_tensor([[11], [333, 55555]]),
+        self._sparse_tensor([['batch1-FC2-F1'],
+                             ['batch2-FC2-F1', 'batch2-FC2-F2']])
+    ])
+    expected_out = self._sparse_tensor([['11_X_batch1-FC2-F1'], [
+        '333_X_batch2-FC2-F1', '333_X_batch2-FC2-F2', '55555_X_batch2-FC2-F1',
+        '55555_X_batch2-FC2-F2'
+    ]])
+    with self.test_session() as sess:
+      self._assert_sparse_tensor_equals(expected_out, sess.run(op))
+
+  def test_integer_mixed_string_dense(self):
+    """Tests mixed dense inputs."""
+    op = sparse_ops._sparse_cross([
+        constant_op.constant([[11, 333], [55555, 999999]], dtypes.int64),
+        constant_op.constant([['batch1-FC2-F1', 'batch1-FC2-F2'],
+                              ['batch2-FC2-F1', 'batch2-FC2-F2']],
+                             dtypes.string),
+    ])
+    expected_out = self._sparse_tensor([[
+        '11_X_batch1-FC2-F1', '11_X_batch1-FC2-F2', '333_X_batch1-FC2-F1',
+        '333_X_batch1-FC2-F2'
+    ], [
+        '55555_X_batch2-FC2-F1', '55555_X_batch2-FC2-F2',
+        '999999_X_batch2-FC2-F1', '999999_X_batch2-FC2-F2'
+    ]])
+    with self.test_session() as sess:
+      self._assert_sparse_tensor_equals(expected_out, sess.run(op))
+
+  def test_sparse_cross_dense(self):
+    """Tests sparse and dense inputs."""
+    op = sparse_ops._sparse_cross([
+        self._sparse_tensor([['batch1-FC1-F1'],
+                             ['batch2-FC1-F1', 'batch2-FC1-F2']]),
+        constant_op.constant([['batch1-FC2-F1', 'batch1-FC2-F2'],
+                              ['batch2-FC2-F1', 'batch2-FC2-F2']],
+                             dtypes.string),
+    ])
+    expected_out = self._sparse_tensor(
+        [['batch1-FC1-F1_X_batch1-FC2-F1', 'batch1-FC1-F1_X_batch1-FC2-F2'], [
+            'batch2-FC1-F1_X_batch2-FC2-F1', 'batch2-FC1-F1_X_batch2-FC2-F2',
+            'batch2-FC1-F2_X_batch2-FC2-F1', 'batch2-FC1-F2_X_batch2-FC2-F2'
+        ]])
+    with self.test_session() as sess:
+      self._assert_sparse_tensor_equals(expected_out, sess.run(op))
+
+  def test_integer_sparse_input(self):
+    """Tests mixed type sparse and dense inputs."""
+    op = sparse_ops._sparse_cross([
+        self._sparse_tensor([[11], [333, 5555]]),
+        constant_op.constant([['batch1-FC2-F1', 'batch1-FC2-F2'],
+                              ['batch2-FC2-F1', 'batch2-FC2-F2']],
+                             dtypes.string),
+    ])
+    expected_out = self._sparse_tensor(
+        [['11_X_batch1-FC2-F1', '11_X_batch1-FC2-F2'], [
+            '333_X_batch2-FC2-F1', '333_X_batch2-FC2-F2',
+            '5555_X_batch2-FC2-F1', '5555_X_batch2-FC2-F2'
+        ]])
+    with self.test_session() as sess:
+      self._assert_sparse_tensor_equals(expected_out, sess.run(op))
+
+  def test_permutation_3x3x3(self):
+    """Tests 3x3x3 permutation."""
+    op = sparse_ops._sparse_cross([
+        self._sparse_tensor(
+            [['batch1-FC1-F1', 'batch1-FC1-F2', 'batch1-FC1-F3']]),
+        self._sparse_tensor(
+            [['batch1-FC2-F1', 'batch1-FC2-F2', 'batch1-FC2-F3']]),
+        self._sparse_tensor(
+            [['batch1-FC3-F1', 'batch1-FC3-F2', 'batch1-FC3-F3']])
+    ])
+    expected_out = self._sparse_tensor([[
+        'batch1-FC1-F1_X_batch1-FC2-F1_X_batch1-FC3-F1',
+        'batch1-FC1-F1_X_batch1-FC2-F1_X_batch1-FC3-F2',
+        'batch1-FC1-F1_X_batch1-FC2-F1_X_batch1-FC3-F3',
+        'batch1-FC1-F1_X_batch1-FC2-F2_X_batch1-FC3-F1',
+        'batch1-FC1-F1_X_batch1-FC2-F2_X_batch1-FC3-F2',
+        'batch1-FC1-F1_X_batch1-FC2-F2_X_batch1-FC3-F3',
+        'batch1-FC1-F1_X_batch1-FC2-F3_X_batch1-FC3-F1',
+        'batch1-FC1-F1_X_batch1-FC2-F3_X_batch1-FC3-F2',
+        'batch1-FC1-F1_X_batch1-FC2-F3_X_batch1-FC3-F3',
+        'batch1-FC1-F2_X_batch1-FC2-F1_X_batch1-FC3-F1',
+        'batch1-FC1-F2_X_batch1-FC2-F1_X_batch1-FC3-F2',
+        'batch1-FC1-F2_X_batch1-FC2-F1_X_batch1-FC3-F3',
+        'batch1-FC1-F2_X_batch1-FC2-F2_X_batch1-FC3-F1',
+        'batch1-FC1-F2_X_batch1-FC2-F2_X_batch1-FC3-F2',
+        'batch1-FC1-F2_X_batch1-FC2-F2_X_batch1-FC3-F3',
+        'batch1-FC1-F2_X_batch1-FC2-F3_X_batch1-FC3-F1',
+        'batch1-FC1-F2_X_batch1-FC2-F3_X_batch1-FC3-F2',
+        'batch1-FC1-F2_X_batch1-FC2-F3_X_batch1-FC3-F3',
+        'batch1-FC1-F3_X_batch1-FC2-F1_X_batch1-FC3-F1',
+        'batch1-FC1-F3_X_batch1-FC2-F1_X_batch1-FC3-F2',
+        'batch1-FC1-F3_X_batch1-FC2-F1_X_batch1-FC3-F3',
+        'batch1-FC1-F3_X_batch1-FC2-F2_X_batch1-FC3-F1',
+        'batch1-FC1-F3_X_batch1-FC2-F2_X_batch1-FC3-F2',
+        'batch1-FC1-F3_X_batch1-FC2-F2_X_batch1-FC3-F3',
+        'batch1-FC1-F3_X_batch1-FC2-F3_X_batch1-FC3-F1',
+        'batch1-FC1-F3_X_batch1-FC2-F3_X_batch1-FC3-F2',
+        'batch1-FC1-F3_X_batch1-FC2-F3_X_batch1-FC3-F3'
+    ]])
+    with self.test_session() as sess:
+      self._assert_sparse_tensor_equals(expected_out, sess.run(op))
+
+  def test_permutation_3x1x2(self):
+    """Tests 3x1x2 permutation."""
+    op = sparse_ops._sparse_cross([
+        self._sparse_tensor(
+            [['batch1-FC1-F1', 'batch1-FC1-F2', 'batch1-FC1-F3']]),
+        self._sparse_tensor([['batch1-FC2-F1']]),
+        self._sparse_tensor([['batch1-FC3-F1', 'batch1-FC3-F2']])
+    ])
+    expected_out = self._sparse_tensor([[
+        'batch1-FC1-F1_X_batch1-FC2-F1_X_batch1-FC3-F1',
+        'batch1-FC1-F1_X_batch1-FC2-F1_X_batch1-FC3-F2',
+        'batch1-FC1-F2_X_batch1-FC2-F1_X_batch1-FC3-F1',
+        'batch1-FC1-F2_X_batch1-FC2-F1_X_batch1-FC3-F2',
+        'batch1-FC1-F3_X_batch1-FC2-F1_X_batch1-FC3-F1',
+        'batch1-FC1-F3_X_batch1-FC2-F1_X_batch1-FC3-F2'
+    ]])
+    with self.test_session() as sess:
+      self._assert_sparse_tensor_equals(expected_out, sess.run(op))
+
+  def test_large_batch(self):
+    """Tests with large batch size to force multithreading."""
+    batch_size = 5000
+    col1 = []
+    col2 = []
+    col3 = []
+    for b in range(batch_size):
+      col1.append(
+          ['batch%d-FC1-F1' % b, 'batch%d-FC1-F2' % b, 'batch%d-FC1-F3' % b])
+      col2.append(['batch%d-FC2-F1' % b])
+      col3.append(['batch%d-FC3-F1' % b, 'batch%d-FC3-F2' % b])
+
+    op = sparse_ops._sparse_cross([
+        self._sparse_tensor(col1), self._sparse_tensor(col2),
+        self._sparse_tensor(col3)
+    ])
+
+    col_out = []
+    for b in range(batch_size):
+      col_out.append([
+          'batch%d-FC1-F1_X_batch%d-FC2-F1_X_batch%d-FC3-F1' % (b, b, b),
+          'batch%d-FC1-F1_X_batch%d-FC2-F1_X_batch%d-FC3-F2' % (b, b, b),
+          'batch%d-FC1-F2_X_batch%d-FC2-F1_X_batch%d-FC3-F1' % (b, b, b),
+          'batch%d-FC1-F2_X_batch%d-FC2-F1_X_batch%d-FC3-F2' % (b, b, b),
+          'batch%d-FC1-F3_X_batch%d-FC2-F1_X_batch%d-FC3-F1' % (b, b, b),
+          'batch%d-FC1-F3_X_batch%d-FC2-F1_X_batch%d-FC3-F2' % (b, b, b)
+      ])
+
+    expected_out = self._sparse_tensor(col_out)
+    with self.test_session() as sess:
+      self._assert_sparse_tensor_equals(expected_out, sess.run(op))
+
+  def test_one_column_empty(self):
+    """Tests when one column is empty.
+
+    The crossed tensor should be empty.
+    """
+    op = sparse_ops._sparse_cross([
+        self._sparse_tensor([['batch1-FC1-F1', 'batch1-FC1-F2']]),
+        self._sparse_tensor([], 1),
+        self._sparse_tensor([['batch1-FC3-F1', 'batch1-FC3-F2']])
+    ])
+    with self.test_session() as sess:
+      self._assert_sparse_tensor_empty(sess.run(op))
+
+  def test_some_columns_empty(self):
+    """Tests when more than one columns are empty.
+
+    Cross for the corresponding batch should be empty.
+    """
+    op = sparse_ops._sparse_cross([
+        self._sparse_tensor([['batch1-FC1-F1', 'batch1-FC1-F2']], 2),
+        self._sparse_tensor([['batch1-FC2-F1'], ['batch2-FC2-F1']], 2),
+        self._sparse_tensor([['batch1-FC3-F1', 'batch1-FC3-F2']], 2)
+    ])
+    expected_out = self._sparse_tensor([[
+        'batch1-FC1-F1_X_batch1-FC2-F1_X_batch1-FC3-F1',
+        'batch1-FC1-F1_X_batch1-FC2-F1_X_batch1-FC3-F2',
+        'batch1-FC1-F2_X_batch1-FC2-F1_X_batch1-FC3-F1',
+        'batch1-FC1-F2_X_batch1-FC2-F1_X_batch1-FC3-F2'
+    ]], 2)
+    with self.test_session() as sess:
+      self._assert_sparse_tensor_equals(expected_out, sess.run(op))
+
+  def test_all_columns_empty(self):
+    """Tests when all columns are empty.
+
+    The crossed tensor should be empty.
+    """
+    op = sparse_ops._sparse_cross([
+        self._sparse_tensor([]), self._sparse_tensor([]),
+        self._sparse_tensor([])
+    ])
+    with self.test_session() as sess:
+      self._assert_sparse_tensor_empty(sess.run(op))
+
+  def test_hashed_zero_bucket_no_hash_key(self):
+    op = sparse_ops._sparse_cross_hashed(
+        [
+            self._sparse_tensor([['batch1-FC1-F1']]),
+            self._sparse_tensor([['batch1-FC2-F1']]),
+            self._sparse_tensor([['batch1-FC3-F1']])
+        ])
+    # Check actual hashed output to prevent unintentional hashing changes.
+    expected_out = self._sparse_tensor([[1971693436396284976]])
+    with self.test_session() as sess:
+      self._assert_sparse_tensor_equals(expected_out, sess.run(op))
+
+  def test_hashed_zero_bucket(self):
+    op = sparse_ops._sparse_cross_hashed(
+        [
+            self._sparse_tensor([['batch1-FC1-F1']]),
+            self._sparse_tensor([['batch1-FC2-F1']]),
+            self._sparse_tensor([['batch1-FC3-F1']])
+        ],
+        hash_key=sparse_ops._DEFAULT_HASH_KEY + 1)
+    # Check actual hashed output to prevent unintentional hashing changes.
+    expected_out = self._sparse_tensor([[4847552627144134031]])
+    with self.test_session() as sess:
+      self._assert_sparse_tensor_equals(expected_out, sess.run(op))
+
+  # TODO(sibyl-Aix6ihai): Add benchmark to compare Hashed vs Non-hashed.
+  def test_hashed_no_hash_key(self):
+    op = sparse_ops._sparse_cross_hashed(
+        [
+            self._sparse_tensor([['batch1-FC1-F1']]),
+            self._sparse_tensor([['batch1-FC2-F1']]),
+            self._sparse_tensor([['batch1-FC3-F1']])
+        ],
+        num_buckets=100)
+    # Check actual hashed output to prevent unintentional hashing changes.
+    expected_out = self._sparse_tensor([[83]])
+    with self.test_session() as sess:
+      self._assert_sparse_tensor_equals(expected_out, sess.run(op))
+
+  def test_hashed_output(self):
+    op = sparse_ops._sparse_cross_hashed(
+        [
+            self._sparse_tensor([['batch1-FC1-F1']]),
+            self._sparse_tensor([['batch1-FC2-F1']]),
+            self._sparse_tensor([['batch1-FC3-F1']])
+        ],
+        num_buckets=100,
+        hash_key=sparse_ops._DEFAULT_HASH_KEY + 1)
+    # Check actual hashed output to prevent unintentional hashing changes.
+    expected_out = self._sparse_tensor([[31]])
+    with self.test_session() as sess:
+      self._assert_sparse_tensor_equals(expected_out, sess.run(op))
+
+  def test_hashed__has_no_collision(self):
+    """Tests that fingerprint concatenation has no collisions."""
+    # Although the last 10 bits of 359 and 1024+359 are identical.
+    # As a result, all the crosses shouldn't collide.
+    t1 = constant_op.constant([[359], [359 + 1024]])
+    t2 = constant_op.constant([list(range(10)), list(range(10))])
+    cross = sparse_ops._sparse_cross_hashed(
+        [t2, t1],
+        num_buckets=1024,
+        hash_key=sparse_ops._DEFAULT_HASH_KEY + 1)
+    cross_dense = sparse_ops.sparse_tensor_to_dense(cross)
+    with session.Session():
+      values = cross_dense.eval()
+      self.assertTrue(numpy.not_equal(values[0], values[1]).all())
+
+  def test_hashed_3x1x2(self):
+    """Tests 3x1x2 permutation with hashed output."""
+    op = sparse_ops._sparse_cross_hashed(
+        [
+            self._sparse_tensor(
+                [['batch1-FC1-F1', 'batch1-FC1-F2', 'batch1-FC1-F3']]),
+            self._sparse_tensor([['batch1-FC2-F1']]),
+            self._sparse_tensor([['batch1-FC3-F1', 'batch1-FC3-F2']])
+        ],
+        num_buckets=1000)
+    with self.test_session() as sess:
+      out = sess.run(op)
+      self.assertEqual(6, len(out.values))
+      self.assertAllEqual([[0, i] for i in range(6)], out.indices)
+      self.assertTrue(all(x < 1000 and x >= 0 for x in out.values))
+      all_values_are_different = len(out.values) == len(set(out.values))
+      self.assertTrue(all_values_are_different)
+
+  def _assert_sparse_tensor_empty(self, sp):
+    self.assertEquals(0, sp.indices.size)
+    self.assertEquals(0, sp.values.size)
+    # TODO(zakaria): check if we can ignore the first dim of the shape.
+    self.assertEquals(0, sp.dense_shape[1])
+
+  def _assert_sparse_tensor_equals(self, sp1, sp2):
+    self.assertAllEqual(sp1.indices.eval(), sp2.indices)
+    self.assertAllEqual(sp1.values.eval(), sp2.values)
+    self.assertAllEqual(sp1.dense_shape.eval(), sp2.dense_shape)
+
+  def _sparse_tensor(self, data, batch_size=-1):
+    """Generates a SparseTensor.
+
+    Args:
+      data: Should be a list of list of strings or int64. Each item of the outer
+          list represents a batch. Each item of the batch is a feature of a
+          specific feature column.
+      batch_size: optional batch size, especially for cases when data has no
+          entry for some batches.
+
+    Returns:
+     A SparseTensor.
+    """
+    indices = []
+    values = []
+    max_col_count = 0
+    for batch, batch_ix in zip(data, range(len(data))):
+      for column, column_ix in zip(batch, range(len(batch))):
+        indices.append([batch_ix, column_ix])
+        values.append(column)
+        max_col_count = max(max_col_count, column_ix + 1)
+    shape = [batch_size if batch_size != -1 else len(data), max_col_count]
+    value_type = (dtypes.string if not values or isinstance(values[0], str) else
+                  dtypes.int64)
+    return sparse_tensor.SparseTensor(
+        constant_op.constant(indices, dtypes.int64, [len(indices), 2]),
+        constant_op.constant(values, value_type, [len(indices)]),
+        constant_op.constant(shape, dtypes.int64))
+
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/python/kernel_tests/sparse_ops_test.py b/tensorflow/python/kernel_tests/sparse_ops_test.py
index 06d5cbaf2d0f88e63bbf2a693ec9afab63ba3399..bad11a29df0a63033fd169b91e5493319e9181c0 100644
--- a/tensorflow/python/kernel_tests/sparse_ops_test.py
+++ b/tensorflow/python/kernel_tests/sparse_ops_test.py
@@ -328,6 +328,12 @@ class SparseResetShapeTest(test_util.TensorFlowTestCase):
     return sparse_tensor.SparseTensorValue(self._IND_2_5_6, self._VAL_2_5_6,
                                            self._SHP_2_5_6)
 
+  def testStaticShapeInfoPreservedWhenNewShapeIsProvidedAndStatic(self):
+    sp_input = self._SparseTensor_2x5x6()
+    new_shape = np.array([3, 6, 7], dtype=np.int64)
+    sp_output = sparse_ops.sparse_reset_shape(sp_input, new_shape)
+    self.assertAllEqual([3, 6, 7], sp_output.get_shape())
+
   def testBasic(self):
     with self.test_session(use_gpu=False) as sess:
       sp_input = self._SparseTensor_2x5x6()
@@ -397,14 +403,21 @@ class SparseResetShapeTest(test_util.TensorFlowTestCase):
       with self.assertRaisesOpError("x == y did not hold element-wise"):
         sess.run(out, feed_dict={new_shape: np.array([3, 7], dtype=np.int64)})
 
-  def testInvalidDimensionSize(self):
+  def testInvalidDimensionSizeStatic(self):
+    sp_input = self._SparseTensor_2x5x6()
+    new_shape = np.array([3, 7, 5], dtype=np.int64)
+
+    with self.assertRaisesRegexp(ValueError, "should have dimension sizes"):
+      sparse_ops.sparse_reset_shape(sp_input, new_shape)
+
+  def testInvalidDimensionSizeDynamic(self):
     with self.test_session(use_gpu=False) as sess:
       sp_input = self._SparseTensor_2x5x6()
-      new_shape = np.array([3, 7, 5], dtype=np.int64)
+      new_shape = array_ops.placeholder(dtype=dtypes.int32)
       out = sparse_ops.sparse_reset_shape(sp_input, new_shape)
 
       with self.assertRaisesOpError("x <= y did not hold element-wise"):
-        sess.run(out)
+        sess.run(out, feed_dict={new_shape: [3, 7, 5]})
 
   def testInvalidDimensionSizeInputUnavailableInGraphConstruction(self):
     sp_input = array_ops.sparse_placeholder(dtype=dtypes.int32)
diff --git a/tensorflow/python/kernel_tests/sparse_reorder_op_test.py b/tensorflow/python/kernel_tests/sparse_reorder_op_test.py
index 5136cdadead4d6dfd6961f4c128acb5de18963b8..18335d665af833fb7d9fef0b517b2c4efc4a005e 100644
--- a/tensorflow/python/kernel_tests/sparse_reorder_op_test.py
+++ b/tensorflow/python/kernel_tests/sparse_reorder_op_test.py
@@ -48,6 +48,13 @@ class SparseReorderTest(test.TestCase):
     shape = np.array([5, 6]).astype(np.int64)
     return sparse_tensor.SparseTensorValue(ind, val, shape)
 
+  def testStaticShapeInfoPreserved(self):
+    sp_input = sparse_tensor.SparseTensor.from_value(
+        self._SparseTensorValue_5x6(np.arange(6)))
+    self.assertAllEqual((5, 6), sp_input.get_shape())
+    sp_output = sparse_ops.sparse_reorder(sp_input)
+    self.assertAllEqual((5, 6), sp_output.get_shape())
+
   def testAlreadyInOrder(self):
     with self.test_session(use_gpu=False) as sess:
       input_val = self._SparseTensorValue_5x6(np.arange(6))
diff --git a/tensorflow/python/kernel_tests/sparse_reshape_op_test.py b/tensorflow/python/kernel_tests/sparse_reshape_op_test.py
index 1bb05aa3b2aab1141e27ad486ad436b4c6bc2dd3..e87fa0c94c4cf3346c0127dd17b037cabb3cbb56 100644
--- a/tensorflow/python/kernel_tests/sparse_reshape_op_test.py
+++ b/tensorflow/python/kernel_tests/sparse_reshape_op_test.py
@@ -50,6 +50,13 @@ class SparseReshapeTest(test.TestCase):
     shape = np.array([2, 3, 4])
     return sparse_tensor.SparseTensorValue(ind, val, shape)
 
+  def testStaticShapeInfoPreserved(self):
+    sp_input = sparse_tensor.SparseTensor.from_value(
+        self._SparseTensorValue_5x6())
+    self.assertAllEqual((5, 6), sp_input.get_shape())
+    sp_output = sparse_ops.sparse_reshape(sp_input, shape=(1, 5, 2, 3))
+    self.assertAllEqual((1, 5, 2, 3), sp_output.get_shape())
+
   def testSameShape(self):
     with self.test_session(use_gpu=False) as sess:
       input_val = self._SparseTensorValue_5x6()
@@ -71,6 +78,18 @@ class SparseReshapeTest(test.TestCase):
       self.assertAllEqual(output_val.values, input_val.values)
       self.assertAllEqual(output_val.dense_shape, input_val.dense_shape)
 
+  def testWorksWellWithTfShape(self):
+    with self.test_session(use_gpu=False) as sess:
+      sp_input = self._SparseTensorPlaceholder()
+      input_val = self._SparseTensorValue_5x6()
+      shape = array_ops.shape(sp_input)  # tf.shape generates int32 output
+      sp_output = sparse_ops.sparse_reshape(sp_input, shape)
+
+      output_val = sess.run(sp_output, {sp_input: input_val})
+      self.assertAllEqual(output_val.indices, input_val.indices)
+      self.assertAllEqual(output_val.values, input_val.values)
+      self.assertAllEqual(output_val.dense_shape, input_val.dense_shape)
+
   def testFeedSameShapeWithInferredDim(self):
     with self.test_session(use_gpu=False) as sess:
       sp_input = self._SparseTensorPlaceholder()
@@ -180,6 +199,12 @@ class SparseReshapeTest(test.TestCase):
       with self.assertRaisesOpError("only one output shape size may be -1"):
         sess.run(sp_output, {sp_input: input_val})
 
+  def testProvideStaticallyMismatchedSizes(self):
+    input_val = self._SparseTensorValue_5x6()
+    sp_input = sparse_tensor.SparseTensor.from_value(input_val)
+    with self.assertRaisesRegexp(ValueError, "Cannot reshape"):
+      sparse_ops.sparse_reshape(sp_input, [4, 7])
+
   def testFeedMismatchedSizes(self):
     with self.test_session(use_gpu=False) as sess:
       sp_input = self._SparseTensorPlaceholder()
diff --git a/tensorflow/python/kernel_tests/sparse_tensor_dense_matmul_op_test.py b/tensorflow/python/kernel_tests/sparse_tensor_dense_matmul_op_test.py
index 809917518600411fc636be6267212cc977ca9327..a0bd178e247019470a907275cdf8d42d162be38e 100644
--- a/tensorflow/python/kernel_tests/sparse_tensor_dense_matmul_op_test.py
+++ b/tensorflow/python/kernel_tests/sparse_tensor_dense_matmul_op_test.py
@@ -29,6 +29,7 @@ from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import sparse_tensor
+from tensorflow.python.framework import tensor_shape
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import math_ops
@@ -161,6 +162,46 @@ class SparseTensorDenseMatMulTest(test.TestCase):
         sparse_ops.sparse_tensor_dense_matmul(
             sparse_t, dense_t, adjoint_a=True).eval()
 
+  def testInvalidIndicesForSparseTensorDenseMatmulOnGPU(self):
+    # Note: use_gpu=False because nice errors are only returned from CPU kerne
+    if not test.is_gpu_available():
+      return
+    with self.test_session(use_gpu=True):
+      indices = np.array([[1, 10]]).astype(np.int64)
+      values = np.array([10]).astype(np.float32)
+      shape = [3, 2]
+      sparse_t = sparse_tensor.SparseTensor(indices, values, shape)
+
+      # Test multiplying by both a small and large dense matrix, to hit
+      # both cases in the kernel.
+      dense_t = np.matrix([[1] * 5, [2] * 5], dtype=np.float32)
+      expected_t = np.array([[0] * 5, [np.nan] * 5, [0] * 5], dtype=np.float32)
+      self.assertAllClose(expected_t,
+                          sparse_ops.sparse_tensor_dense_matmul(
+                              sparse_t, dense_t).eval())
+      dense_t = np.matrix([[1] * 500, [2] * 500], dtype=np.float32)
+      expected_t = np.array(
+          [[0] * 500, [np.nan] * 500, [0] * 500], dtype=np.float32)
+      self.assertAllClose(expected_t,
+                          sparse_ops.sparse_tensor_dense_matmul(
+                              sparse_t, dense_t).eval())
+
+      # Repeat with adjoint_a, now the error is that the sparse index
+      # is OOO w.r.t. the output.  The GPU kernel can't do much here,
+      # so it just doesn't accumulate.
+
+      dense_t = np.matrix([[1] * 5, [2] * 5, [3] * 5], dtype=np.float32)
+      expected_t = np.array([[0] * 5, [0] * 5], dtype=np.float32)
+      self.assertAllClose(expected_t,
+                          sparse_ops.sparse_tensor_dense_matmul(
+                              sparse_t, dense_t, adjoint_a=True).eval())
+
+      dense_t = np.matrix([[1] * 500, [2] * 500, [3] * 500], dtype=np.float32)
+      expected_t = np.array([[0] * 500, [0] * 500], dtype=np.float32)
+      self.assertAllClose(expected_t,
+                          sparse_ops.sparse_tensor_dense_matmul(
+                              sparse_t, dense_t, adjoint_a=True).eval())
+
   # Tests setting one dimension to be a high value.
   def _testLarge(self, np_dtype):
     r1 = np.random.randint(6000, 20000)
@@ -175,9 +216,12 @@ class SparseTensorDenseMatMulTest(test.TestCase):
 
       y = _maybe_complex(np.random.randn(k, n).astype(np_dtype))
 
-      self._testMatmul(x, y)
+      self._testMatmul(x, y, adjoint_a=False, adjoint_b=False)
+      self._testMatmul(x.transpose(), y, adjoint_a=True, adjoint_b=False)
+      self._testMatmul(x, y.transpose(), adjoint_a=False, adjoint_b=True)
+      self._testMatmul(
+          x.transpose(), y.transpose(), adjoint_a=True, adjoint_b=True)
 
-  def testLarge(self):
     np.random.seed(127)  # Repeatable results
     self._testLarge(np.float32)
     self._testLarge(np.float64)
@@ -221,7 +265,9 @@ def _sparse_tensor_dense_vs_dense_matmul_benchmark_dense(x, y, adjoint_a,
         lambda t, _: t < iterations,
         body, (t0, v0),
         parallel_iterations=1,
-        back_prop=False)
+        back_prop=False,
+        shape_invariants=(tensor_shape.TensorShape(()),
+                          tensor_shape.TensorShape(None)))
     return [final]
 
   return _timeit
@@ -246,7 +292,9 @@ def _sparse_tensor_dense_vs_dense_matmul_benchmark_sparse(x_ind, x_val, x_shape,
         lambda t, _: t < iterations,
         body, (t0, v0),
         parallel_iterations=1,
-        back_prop=False)
+        back_prop=False,
+        shape_invariants=(tensor_shape.TensorShape(()),
+                          tensor_shape.TensorShape(None)))
     return [final]
 
   return _timeit
@@ -291,7 +339,7 @@ def sparse_tensor_dense_vs_dense_matmul_benchmark(thresh,
   if skip_dense:
     delta_dense = float("nan")
   else:
-    with session.Session("", config=config, graph=ops.Graph()) as sess:
+    with session.Session(config=config, graph=ops.Graph()) as sess:
       if not use_gpu:
         with ops.device("/cpu:0"):
           x_t = constant_op.constant(x)
@@ -299,12 +347,12 @@ def sparse_tensor_dense_vs_dense_matmul_benchmark(thresh,
           ops_fn = _sparse_tensor_dense_vs_dense_matmul_benchmark_dense(
               x_t, y_t, adjoint_a, adjoint_b)
       else:
-        x_t = constant_op.constant(x)
-        y_t = constant_op.constant(y)
-        ops_fn = _sparse_tensor_dense_vs_dense_matmul_benchmark_dense(x_t, y_t,
-                                                                      adjoint_a,
-                                                                      adjoint_b)
-      delta_dense = _timer(sess, ops_fn, 1000)
+        with ops.device("/gpu:0"):
+          x_t = constant_op.constant(x)
+          y_t = constant_op.constant(y)
+          ops_fn = _sparse_tensor_dense_vs_dense_matmul_benchmark_dense(
+              x_t, y_t, adjoint_a, adjoint_b)
+      delta_dense = _timer(sess, ops_fn, 200)
 
   # Using sparse_tensor_dense_matmul.
   with session.Session("", config=config, graph=ops.Graph()) as sess:
@@ -317,13 +365,14 @@ def sparse_tensor_dense_vs_dense_matmul_benchmark(thresh,
         ops_fn = _sparse_tensor_dense_vs_dense_matmul_benchmark_sparse(
             x_ind, x_val, x_shape, y_t, adjoint_a, adjoint_b)
     else:
-      x_ind = constant_op.constant(np.vstack(np.where(x)).astype(np.int64).T)
-      x_val = constant_op.constant(x[np.where(x)])
-      x_shape = constant_op.constant(np.array(x.shape).astype(np.int64))
-      y_t = constant_op.constant(y)
-      ops_fn = _sparse_tensor_dense_vs_dense_matmul_benchmark_sparse(
-          x_ind, x_val, x_shape, y_t, adjoint_a, adjoint_b)
-    delta_sparse = _timer(sess, ops_fn, 1000)
+      with ops.device("/gpu:0"):
+        x_ind = constant_op.constant(np.vstack(np.where(x)).astype(np.int64).T)
+        x_val = constant_op.constant(x[np.where(x)])
+        x_shape = constant_op.constant(np.array(x.shape).astype(np.int64))
+        y_t = constant_op.constant(y)
+        ops_fn = _sparse_tensor_dense_vs_dense_matmul_benchmark_sparse(
+            x_ind, x_val, x_shape, y_t, adjoint_a, adjoint_b)
+    delta_sparse = _timer(sess, ops_fn, 200)
 
   print("%g \t %d \t %s \t %d \t %d \t %g \t %g \t %g" %
         (1 - thresh, n, use_gpu, m, k, delta_dense, delta_sparse,
@@ -340,7 +389,7 @@ def main(_):
         "\t dt(sparse)/dt(dense)")
 
   for thresh in (0.99, 0.8, 0.5, 0.2):
-    for n in (1, 10, 25):
+    for n in (50, 100):
       for use_gpu in (True, False):
         for m in (100, 1000):
           for k in (100, 1000):
diff --git a/tensorflow/python/kernel_tests/split_op_test.py b/tensorflow/python/kernel_tests/split_op_test.py
index 38093ab6d63921228d48a754402af9824f5b02a4..3dcafd2496565b8c0f9c42829e12f051185fa345 100644
--- a/tensorflow/python/kernel_tests/split_op_test.py
+++ b/tensorflow/python/kernel_tests/split_op_test.py
@@ -22,6 +22,7 @@ import numpy as np
 
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import errors_impl
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import gradients_impl
@@ -40,6 +41,48 @@ class SplitOpTest(test.TestCase):
       data -= 1j * data
     return data
 
+  def testShapeInference(self):
+    model_input = array_ops.placeholder(dtypes.float32, shape=(1, 10))
+
+    # check that we fail during static shape inference if sizes are known
+    with self.assertRaises(ValueError):
+      # pylint: disable=expression-not-assigned
+      array_ops.split(model_input, [4], axis=1)[0]
+      # pylint: enable=expression-not-assigned
+
+    model_input = array_ops.placeholder(dtypes.float32)
+    inp = np.zeros((1, 10))
+    # check that we still fail at runtime if the shapes were unknown
+    with self.test_session(use_gpu=False) as sess:
+      with self.assertRaises(errors_impl.InvalidArgumentError):
+        sess.run(array_ops.split(model_input, [4]), {model_input: inp})
+
+    # test that we can pass a scalar Tensor as num_splits
+    with self.test_session(use_gpu=False) as sess:
+      result = sess.run(
+          array_ops.split(
+              array_ops.ones([4, 4]),
+              num_or_size_splits=array_ops.ones([2, 2]).get_shape()[1],
+              axis=0))
+
+    self.assertEqual(result[0].shape, (2, 4))
+    self.assertEqual(result[1].shape, (2, 4))
+
+    # test that none split dimensions remain, even if we don't know how
+    # the split_dim will be split, but we do know the axis
+    result = array_ops.split(
+        array_ops.ones([5, 2]), array_ops.constant([2, 1, 2]) * 1, axis=0)
+
+    self.assertEqual(result[0].shape[1], 2)
+    self.assertEqual(result[1].shape[1], 2)
+    self.assertEqual(result[2].shape[1], 2)
+
+    model_input2 = array_ops.placeholder(dtypes.float32, shape=[None, 2])
+    result = array_ops.split(model_input2, [2, 2], axis=0)[0]
+
+    with self.test_session(use_gpu=False) as sess:
+      sess.run(result, feed_dict={model_input2: np.ones([4, 2])})
+
   def testExplicitNum(self):
     size_splits = array_ops.placeholder(dtype=dtypes.int32, shape=[None])
 
diff --git a/tensorflow/python/kernel_tests/tensor_array_ops_test.py b/tensorflow/python/kernel_tests/tensor_array_ops_test.py
index 0fec42e1dba4fbe96fb2c2bde7767071adf6c1ab..41fe29e006f46b2d91f804e005936520041d8d72 100644
--- a/tensorflow/python/kernel_tests/tensor_array_ops_test.py
+++ b/tensorflow/python/kernel_tests/tensor_array_ops_test.py
@@ -1128,7 +1128,7 @@ class TensorArrayTest(test.TestCase):
           dtype=dtypes.float32, size=0, dynamic_size=False, infer_shape=True)
       self.assertEqual(0, ta.size().eval())
       # Don't actually perform the pack.  This stores the static shape.
-      ta.unstack(array_ops.zeros([0, 3, 5]))
+      ta.unstack(array_ops.zeros([0, 3, 5])).mark_used()
       packed = ta.stack()
       self.assertAllEqual([0, 3, 5], packed.eval().shape)
       # Concatenating zero tensors along their first dimension gives a
diff --git a/tensorflow/python/kernel_tests/tensor_priority_test.py b/tensorflow/python/kernel_tests/tensor_priority_test.py
index b6674c3aa5ee57ee9e1136764bb9b1c077a87783..574538a837a0a112e1a806ddea7a13fe44beacc2 100644
--- a/tensorflow/python/kernel_tests/tensor_priority_test.py
+++ b/tensorflow/python/kernel_tests/tensor_priority_test.py
@@ -19,58 +19,65 @@ from __future__ import print_function
 
 import numpy as np
 
-from tensorflow import Tensor
-from tensorflow import register_tensor_conversion_function
 from tensorflow.python.framework import ops
-from tensorflow.python.ops import math_ops
 from tensorflow.python.platform import test as test_lib
 
 
 class TensorPriorityTest(test_lib.TestCase):
 
   def testSupportedRhsWithoutDelegation(self):
+
     class NumpyArraySubclass(np.ndarray):
       pass
-    supported_rhs_without_delegation = (
-      3,
-      3.0,
-      [1.0, 2.0],
-      np.array([1.0, 2.0]),
-      NumpyArraySubclass(shape=(1,2), buffer=np.array([1.0, 2.0])),
-      ops.convert_to_tensor([[1.0, 2.0]]))
+
+    supported_rhs_without_delegation = (3, 3.0, [1.0, 2.0], np.array(
+        [1.0, 2.0]), NumpyArraySubclass(
+            shape=(1, 2), buffer=np.array([1.0, 2.0])),
+                                        ops.convert_to_tensor([[1.0, 2.0]]))
     for rhs in supported_rhs_without_delegation:
       tensor = ops.convert_to_tensor([[10.0, 20.0]])
       res = tensor + rhs
-      self.assertIsInstance(res, Tensor)
+      self.assertIsInstance(res, ops.Tensor)
 
   def testUnsupportedRhsWithoutDelegation(self):
+
     class WithoutReverseAdd(object):
       pass
+
     tensor = ops.convert_to_tensor([[10.0, 20.0]])
     rhs = WithoutReverseAdd()
     with self.assertRaisesWithPredicateMatch(
         TypeError, lambda e: "Expected float" in str(e)):
-      res = tensor + rhs
+      # pylint: disable=pointless-statement
+      tensor + rhs
 
   def testUnsupportedRhsWithDelegation(self):
+
     class WithReverseAdd(object):
+
       def __radd__(self, lhs):
         return "Works!"
+
     tensor = ops.convert_to_tensor([[10.0, 20.0]])
     rhs = WithReverseAdd()
     res = tensor + rhs
     self.assertEqual(res, "Works!")
 
   def testFullDelegationControlUsingRegistry(self):
+
     class NumpyArraySubclass(np.ndarray):
+
       def __radd__(self, lhs):
         return "Works!"
+
     def raise_to_delegate(value, dtype=None, name=None, as_ref=False):
+      del value, dtype, name, as_ref  # Unused.
       raise TypeError
-    register_tensor_conversion_function(NumpyArraySubclass, raise_to_delegate,
-                                        priority=0)
+
+    ops.register_tensor_conversion_function(
+        NumpyArraySubclass, raise_to_delegate, priority=0)
     tensor = ops.convert_to_tensor([[10.0, 20.0]])
-    rhs = NumpyArraySubclass(shape=(1,2), buffer=np.array([1.0, 2.0]))
+    rhs = NumpyArraySubclass(shape=(1, 2), buffer=np.array([1.0, 2.0]))
     res = tensor + rhs
     self.assertEqual(res, "Works!")
 
diff --git a/tensorflow/python/kernel_tests/variable_scope_test.py b/tensorflow/python/kernel_tests/variable_scope_test.py
index 69d1a6f60e1eed15f71d5884be720cceac3171b5..245dcc96db79864e1ee417c433e50fe91d596ed9 100644
--- a/tensorflow/python/kernel_tests/variable_scope_test.py
+++ b/tensorflow/python/kernel_tests/variable_scope_test.py
@@ -774,6 +774,11 @@ class VariableScopeTest(test.TestCase):
         self.assertEqual([v.name
                           for v in scope.global_variables()], ["foo/b:0"])
 
+  def testGetVariableWithRefDtype(self):
+    v = variable_scope.get_variable("v", shape=[3, 4], dtype=dtypes.float32)
+    # Ensure it is possible to do get_variable with a _ref dtype passed in.
+    _ = variable_scope.get_variable("w", shape=[5, 6], dtype=v.dtype)
+
 
 def axis0_into1_partitioner(shape=None, **unused_kwargs):
   part = [1] * len(shape)
diff --git a/tensorflow/python/layers/base.py b/tensorflow/python/layers/base.py
index ff9a777f191823ba928309402d6c79654e3c56bb..cfcd844800c71d339e6dbb668337bf6ec3e0670c 100644
--- a/tensorflow/python/layers/base.py
+++ b/tensorflow/python/layers/base.py
@@ -38,7 +38,7 @@ from tensorflow.python.util import nest
 from tensorflow.python.util import tf_inspect
 
 
-class _Layer(object):
+class Layer(object):
   """Base layer class.
 
   WARNING: Do not subclass this layer unless you know what you are doing:
@@ -80,23 +80,27 @@ class _Layer(object):
       if kwarg not in allowed_kwargs:
         raise TypeError('Keyword argument not understood:', kwarg)
 
-    self._trainable = trainable
-    self._built = False
-    self._trainable_variables = []
-    self._non_trainable_variables = []
+    self.trainable = trainable
+    self.built = False
+    self._trainable_weights = []
+    self._non_trainable_weights = []
     self._updates = []
     self._losses = []
     self._reuse = kwargs.get('_reuse')
     self._graph = ops.get_default_graph()
-    self.dtype = dtype
+    self._per_input_losses = {}
+    self._per_input_updates = {}
+    self.dtype = dtypes.as_dtype(dtype).name
 
-    # Determine base name (non-unique).
+    # Determine layer name (non-unique).
     if isinstance(name, vs.VariableScope):
       base_name = name.name
     else:
       base_name = name
+      self.name = name
     if not name:
       base_name = _to_snake_case(self.__class__.__name__)
+      self.name = _unique_layer_name(base_name)
     self._base_name = base_name
 
     # Determine variable scope.
@@ -106,45 +110,43 @@ class _Layer(object):
     else:
       self._scope = None
 
-    # Unique name is borrowed from scope to match variable names.
-    if self._scope is not None:
-      self._name = self._scope.name
-    else:
-      # No name available until we see a scope
-      self._name = None
-
-  def __setattr__(self, name, value):
-    if hasattr(self, name):
-      # Only allow private attributes to be set more than once, under the
-      # convention that private attributes should only be set from inside
-      # the class.
-      # All attributes meant to be set several times should be set to private.
-      if name[0] != '_':
-        raise AttributeError('Read-only property cannot be set: %s' % name)
-    super(_Layer, self).__setattr__(name, value)
+  @property
+  def scope_name(self):
+    if not self._scope:
+      raise ValueError('No name available for layer scope because the layer "' +
+                       self.name + '" has not been used yet. The scope name ' +
+                       ' is determined the first time the layer instance is ' +
+                       'called. You must therefore call the layer before ' +
+                       'querying `scope_name`.')
+    return self._scope.name
+
+  @property
+  def trainable_weights(self):
+    return self._trainable_weights if self.trainable else []
 
   @property
-  def name(self):
-    if self._name is None:
-      raise ValueError(
-          'No name available for layer because it has not been used yet.')
-    return self._name
+  def non_trainable_weights(self):
+    if self.trainable:
+      return self._non_trainable_weights
+    else:
+      return self._trainable_weights + self._non_trainable_weights
 
   @property
   def trainable_variables(self):
-    return self._trainable_variables if self.trainable else []
+    return self.trainable_weights
 
   @property
   def non_trainable_variables(self):
-    return self._non_trainable_variables if self.trainable else self.variables
+    return self.non_trainable_weights
 
   @property
-  def trainable_weights(self):
-    return self.trainable_variables
+  def weights(self):
+    """Returns the list of all layer variables/weights.
 
-  @property
-  def non_trainable_weights(self):
-    return self.non_trainable_variables
+    Returns:
+      A list of variables.
+    """
+    return self.trainable_weights + self.non_trainable_weights
 
   @property
   def variables(self):
@@ -153,37 +155,141 @@ class _Layer(object):
     Returns:
       A list of variables.
     """
-    return self._trainable_variables + self._non_trainable_variables
+    return self.weights
 
   @property
   def updates(self):
     return self._updates
 
+  def add_update(self, updates, inputs=None):
+    """Add update op(s), potentially dependent on layer inputs.
+
+    Weight updates (for instance, the updates of the moving mean and variance
+    in a BatchNormalization layer) may be dependent on the inputs passed
+    when calling a layer. Hence, when reusing a same layer on
+    different inputs `a` and `b`, some entries in `layer.updates` may be
+    dependent on `a` and some on `b`. This method automatically keeps track
+    of dependencies.
+
+    The `get_updates_for` method allows to retrieve the updates relevant to a
+    specific set of inputs.
+
+    Arguments:
+      updates: Update op, or list/tuple of update ops.
+      inputs: Optional input tensor(s) that the update(s) depend on. Must
+        match the `inputs` argument passed to the `__call__` method at the time
+        the updates are created. If `None` is passed, the updates are assumed
+        to be unconditional, and will apply across all dataflows of the layer.
+    """
+    updates = _to_list(updates)
+    if not updates:
+      return
+    self._updates += updates
+    if inputs is not None:
+      inputs = _to_list(inputs)
+    if not inputs:
+      inputs = None
+    if inputs is not None:
+      # We compute an ID that uniquely identifies the list of tensors.
+      # This ID is order-sensitive.
+      inputs_hash = _object_list_uid(inputs)
+    else:
+      inputs_hash = None
+    if inputs_hash not in self._per_input_updates:
+      self._per_input_updates[inputs_hash] = []
+    self._per_input_updates[inputs_hash] += updates
+
+  def get_updates_for(self, inputs):
+    """Retrieves updates relevant to a specific set of inputs.
+
+    Arguments:
+      inputs: Input tensor or list/tuple of input tensors.
+        Must match the `inputs` argument passed to the `__call__` method
+        at the time the updates were created.
+        If you pass `inputs=None`, unconditional updates are returned.
+
+    Returns:
+      List of update ops of the layer that depend on `inputs`.
+    """
+    if inputs is not None:
+      inputs = _to_list(inputs)
+    if not inputs:
+      inputs = None
+    if inputs is not None:
+      inputs_hash = _object_list_uid(inputs)
+    else:
+      inputs_hash = None
+    return self._per_input_updates.get(inputs_hash, [])
+
   @property
   def losses(self):
     return self._losses
 
-  @property
-  def built(self):
-    return self._built
+  def add_loss(self, losses, inputs=None):
+    """Add loss tensor(s), potentially dependent on layer inputs.
 
-  @property
-  def trainable(self):
-    return self._trainable
+    Some losses (for instance, activity regularization losses) may be dependent
+    on the inputs passed when calling a layer. Hence, when reusing a same layer
+    on different inputs `a` and `b`, some entries in `layer.losses` may be
+    dependent on `a` and some on `b`. This method automatically keeps track
+    of dependencies.
 
-  @property
-  def weights(self):
-    """Returns the list of all layer variables/weights.
+    The `get_losses_for` method allows to retrieve the losses relevant to a
+    specific set of inputs.
+
+    Arguments:
+      losses: Loss tensor, or list/tuple of tensors.
+      inputs: Optional input tensor(s) that the loss(es) depend on. Must
+        match the `inputs` argument passed to the `__call__` method at the time
+        the losses are created. If `None` is passed, the losses are assumed
+        to be unconditional, and will apply across all dataflows of the layer
+        (e.g. weight regularization losses).
+    """
+    losses = _to_list(losses)
+    if not losses:
+      return
+    self._losses += losses
+    if inputs is not None:
+      inputs = _to_list(inputs)
+    if not inputs:
+      inputs = None
+    if inputs is not None:
+      # We compute an ID that uniquely identifies the list of tensors.
+      # This ID is order-sensitive.
+      inputs_hash = _object_list_uid(inputs)
+    else:
+      inputs_hash = None
+    if inputs_hash not in self._per_input_losses:
+      self._per_input_losses[inputs_hash] = []
+    self._per_input_losses[inputs_hash] += losses
+
+  def get_losses_for(self, inputs):
+    """Retrieves losses relevant to a specific set of inputs.
+
+    Arguments:
+      inputs: Input tensor or list/tuple of input tensors.
+        Must match the `inputs` argument passed to the `__call__`
+        method at the time the losses were created.
+        If you pass `inputs=None`, unconditional losses are returned,
+        such as weight regularization losses.
 
     Returns:
-      A list of variables.
+      List of loss tensors of the layer that depend on `inputs`.
     """
-    return self.variables
+    if inputs is not None:
+      inputs = _to_list(inputs)
+    if not inputs:
+      inputs = None
+    if inputs is not None:
+      inputs_hash = _object_list_uid(inputs)
+    else:
+      inputs_hash = None
+    return self._per_input_losses.get(inputs_hash, [])
 
   def build(self, _):
     """Creates the variables of the layer.
     """
-    self._built = True
+    self.built = True
 
   def call(self, inputs, **kwargs):
     """The logic of the layer lives here.
@@ -217,10 +323,19 @@ class _Layer(object):
     """
     raise NotImplementedError
 
-  def _add_variable(self, name, shape, dtype=None,
-                    initializer=None, regularizer=None, trainable=True,
-                    variable_getter=vs.get_variable):
-    """Adds a new variable to the layer.
+  def _set_scope(self, scope=None):
+    if self._scope is None:
+      # If constructed with _scope=None, lazy setting of scope.
+      if self._reuse:
+        self._scope = next(vs.variable_scope(
+            scope if scope is not None else self._base_name).gen)
+      else:
+        self._scope = next(vs.variable_scope(
+            scope, default_name=self._base_name).gen)
+
+  def add_variable(self, name, shape, dtype=None,
+                   initializer=None, regularizer=None, trainable=True):
+    """Adds a new variable to the layer, or gets an existing one; returns it.
 
     Arguments:
       name: variable name.
@@ -231,7 +346,6 @@ class _Layer(object):
       trainable: whether the variable should be part of the layer's
         "trainable_variables" (e.g. variables, biases)
         or "non_trainable_variables" (e.g. BatchNorm mean, stddev).
-      variable_getter: The getter to use for TensorFlow variables.
 
     Returns:
       The created variable.
@@ -239,38 +353,43 @@ class _Layer(object):
     if dtype is None:
       dtype = self.dtype
     existing_variables = set(tf_variables.global_variables())
-    variable = variable_getter(name,
-                               shape=shape,
-                               initializer=initializer,
-                               dtype=dtype,
-                               trainable=trainable and self.trainable)
-    # TODO(sguada) fix name = variable.op.name
-    if variable in existing_variables:
-      return variable
-    if regularizer:
-      # To match the behavior of tf.get_variable(), we only
-      # apply regularization if the variable is newly created.
-      if isinstance(variable, tf_variables.PartitionedVariable):
-        for v in variable:
-          with ops.colocate_with(v.op):
-            with ops.name_scope(name + '/Regularizer'):
-              regularization = regularizer(v)
-          if regularization is not None:
-            self._losses.append(regularization)
-            _add_elements_to_collection(
-                regularization, ops.GraphKeys.REGULARIZATION_LOSSES)
-      else:
-        with ops.colocate_with(variable.op):
-          with ops.name_scope(name + '/Regularizer'):
-            regularization = regularizer(variable)
-        if regularization is not None:
-          self._losses.append(regularization)
-          _add_elements_to_collection(
-              regularization, ops.GraphKeys.REGULARIZATION_LOSSES)
+
+    self._set_scope(None)
+
+    with vs.variable_scope(self._scope,
+                           reuse=self.built or self._reuse) as scope:
+      with ops.name_scope(scope.original_name_scope):
+        variable = vs.get_variable(name,
+                                   shape=shape,
+                                   initializer=initializer,
+                                   dtype=dtypes.as_dtype(dtype),
+                                   trainable=trainable and self.trainable)
+        if variable in existing_variables:
+          return variable
+        if regularizer:
+          # To match the behavior of tf.get_variable(), we only
+          # apply regularization if the variable is newly created.
+          if isinstance(variable, tf_variables.PartitionedVariable):
+            for v in variable:
+              with ops.colocate_with(v.op):
+                with ops.name_scope(name + '/Regularizer'):
+                  regularization = regularizer(v)
+              if regularization is not None:
+                self.add_loss(regularization)
+                _add_elements_to_collection(
+                    regularization, ops.GraphKeys.REGULARIZATION_LOSSES)
+          else:
+            with ops.colocate_with(variable.op):
+              with ops.name_scope(name + '/Regularizer'):
+                regularization = regularizer(variable)
+            if regularization is not None:
+              self.add_loss(regularization)
+              _add_elements_to_collection(
+                  regularization, ops.GraphKeys.REGULARIZATION_LOSSES)
     if trainable:
-      self._trainable_variables.append(variable)
+      self._trainable_weights.append(variable)
     else:
-      self._non_trainable_variables.append(variable)
+      self._non_trainable_weights.append(variable)
     return variable
 
   def __call__(self, inputs, *args, **kwargs):
@@ -284,39 +403,17 @@ class _Layer(object):
     Returns:
       Output tensor(s).
     """
-    scope = kwargs.pop('scope', None)
-
-    # Define a custom getter to override tf.get_variable when creating layer
-    # variables. The current custom getter is nested by the variable scope.
-    def variable_getter(getter, name, shape, dtype=None, initializer=None,
-                        regularizer=None, trainable=True, **getter_kwargs):
-      return self._add_variable(
-          name, shape, initializer=initializer, regularizer=regularizer,
-          dtype=dtype, trainable=trainable,
-          variable_getter=functools.partial(getter, **getter_kwargs))
-
-    if not self._built and self._scope is None:
-      # If constructed with _scope=None, lazy setting of scope.
-      if self._reuse:
-        self._scope = next(vs.variable_scope(
-            scope if scope is not None else self._base_name).gen)
-      else:
-        self._scope = next(vs.variable_scope(
-            scope, default_name=self._base_name).gen)
-      self._name = self._scope.name
+    self._set_scope(kwargs.pop('scope', None))
 
-    # Build (if necessary) and call the layer, inside a variable
-    # scope.
-    with vs.variable_scope(self._scope,
-                           reuse=True if self._built else self._reuse,
-                           custom_getter=variable_getter) as scope:
-      # Ensure the Layer, if being reused, is working with inputs from
-      # the same graph as where it was created.
-      try:
-        ops._get_graph_from_inputs(nest.flatten(inputs), graph=self.graph)  # pylint: disable=protected-access
-      except ValueError as e:
-        raise ValueError("Inputs' and Layer's graphs are not the same: %s" % e)
+    # Ensure the Layer, if being reused, is working with inputs from
+    # the same graph as where it was created.
+    try:
+      ops._get_graph_from_inputs(nest.flatten(inputs), graph=self.graph)  # pylint: disable=protected-access
+    except ValueError as e:
+      raise ValueError('Input graph and Layer graph are not the same: %s' % e)
 
+    with vs.variable_scope(self._scope,
+                           reuse=self.built or self._reuse) as scope:
       with ops.name_scope(scope.original_name_scope):
         if not self.built:
           input_list = [
@@ -327,7 +424,6 @@ class _Layer(object):
             self.build(input_shapes[0])
           else:
             self.build(input_shapes)
-          self._built = True
         if 'scope' in tf_inspect.getargspec(self.call).args:
           kwargs['scope'] = scope
         outputs = self.call(inputs, *args, **kwargs)
@@ -340,12 +436,13 @@ class _Layer(object):
           for output in output_list:
             with ops.name_scope('ActivityRegularizer'):
               activity_regularization = self.activity_regularizer(output)
-            self._losses.append(activity_regularization)
+            self.add_loss(activity_regularization)
             _add_elements_to_collection(
                 activity_regularization, ops.GraphKeys.REGULARIZATION_LOSSES)
 
     # Update global default collections.
     _add_elements_to_collection(self.updates, ops.GraphKeys.UPDATE_OPS)
+    self.built = True
     return outputs
 
   @property
@@ -419,3 +516,39 @@ def _add_elements_to_collection(elements, collections):
     for element in elements:
       if element not in collection_set:
         collection.append(element)
+
+
+def _object_list_uid(object_list):
+  object_list = _to_list(object_list)
+  return ', '.join([str(abs(id(x))) for x in object_list])
+
+
+def _unique_layer_name(name):
+  """Makes a layer name (or arbitrary string) unique within a TensorFlow graph.
+
+  Arguments:
+    name: String name to make unique.
+
+  Returns:
+    Unique string name.
+
+  Example:
+
+  ```
+    >>> _unique_layer_name('dense')
+    dense_1
+    >>> _unique_layer_name('dense')
+    dense_2
+  ```
+  """
+  layer_name_uids_collection = ops.get_collection('LAYER_NAME_UIDS')
+  if not layer_name_uids_collection:
+    layer_name_uids = {}
+    ops.add_to_collection('LAYER_NAME_UIDS', layer_name_uids)
+  else:
+    layer_name_uids = layer_name_uids_collection[0]
+  if name not in layer_name_uids:
+    layer_name_uids[name] = 1
+  else:
+    layer_name_uids[name] += 1
+  return name + '_' + str(layer_name_uids[name])
diff --git a/tensorflow/python/layers/base_test.py b/tensorflow/python/layers/base_test.py
index 83ae1b6e83588f4b1466c33d7b85342b224de9cb..9e2457a4891c73aea42ccf7baaf614bcb7de57e5 100644
--- a/tensorflow/python/layers/base_test.py
+++ b/tensorflow/python/layers/base_test.py
@@ -32,26 +32,24 @@ from tensorflow.python.platform import test
 class BaseLayerTest(test.TestCase):
 
   def testLayerProperties(self):
-    layer = base_layers._Layer(name='my_layer')
+    layer = base_layers.Layer(name='my_layer')
     self.assertListEqual(layer.variables, [])
     self.assertListEqual(layer.trainable_variables, [])
     self.assertListEqual(layer.non_trainable_variables, [])
     self.assertListEqual(layer.updates, [])
     self.assertListEqual(layer.losses, [])
     self.assertEqual(layer.built, False)
-    with self.assertRaisesRegexp(ValueError, 'not been used yet'):
-      _ = layer.name
-    layer = base_layers._Layer(name='my_layer', trainable=False)
+    layer = base_layers.Layer(name='my_layer', trainable=False)
     self.assertEqual(layer.trainable, False)
 
   def testAddWeight(self):
     with self.test_session():
-      layer = base_layers._Layer(name='my_layer')
+      layer = base_layers.Layer(name='my_layer')
 
       # Test basic variable creation.
-      variable = layer._add_variable(
+      variable = layer.add_variable(
           'my_var', [2, 2], initializer=init_ops.zeros_initializer())
-      self.assertEqual(variable.name, 'my_var:0')
+      self.assertEqual(variable.name, 'my_layer/my_var:0')
       self.assertListEqual(layer.variables, [variable])
       self.assertListEqual(layer.trainable_variables, [variable])
       self.assertListEqual(layer.non_trainable_variables, [])
@@ -60,8 +58,8 @@ class BaseLayerTest(test.TestCase):
           ops.get_collection(ops.GraphKeys.TRAINABLE_VARIABLES))
 
       # Test non-trainable variable creation.
-      # layer._add_variable should work even outside `build` and `call`.
-      variable_2 = layer._add_variable(
+      # layer.add_variable should work even outside `build` and `call`.
+      variable_2 = layer.add_variable(
           'non_trainable_var', [2, 2],
           initializer=init_ops.zeros_initializer(),
           trainable=False)
@@ -73,7 +71,7 @@ class BaseLayerTest(test.TestCase):
 
       # Test with regularizer.
       regularizer = lambda x: math_ops.reduce_sum(x) * 1e-3
-      variable = layer._add_variable(
+      variable = layer.add_variable(
           'reg_var', [2, 2],
           initializer=init_ops.zeros_initializer(),
           regularizer=regularizer)
@@ -81,81 +79,70 @@ class BaseLayerTest(test.TestCase):
 
   def testGetVariable(self):
     with self.test_session():
-      # From inside `build` and `call` it should be possible to use
-      # either tf.get_variable
 
-      class MyLayer(base_layers._Layer):
+      class MyLayer(base_layers.Layer):
 
         def build(self, input_shape):
-          self.my_var = variable_scope.get_variable(
+          self.my_var = self.add_variable(
               'my_var', [2, 2], initializer=init_ops.zeros_initializer())
 
         def call(self, inputs):
-          variable_scope.get_variable(
-              'my_call_var', [2, 2], initializer=init_ops.zeros_initializer())
-          return inputs
+          return inputs * 2
 
       layer = MyLayer(name='my_layer')
       inputs = random_ops.random_uniform((5,), seed=1)
       layer.apply(inputs)
       layer.apply(inputs)
       self.assertListEqual([v.name for v in layer.variables],
-                           ['my_layer/my_var:0', 'my_layer/my_call_var:0'])
+                           ['my_layer/my_var:0'])
 
       # Creating a layer with no scope leads to lazy construction of
       # the scope at apply() time.  It uses scope "<current scope>/base_name"
       lazy_layer = MyLayer(_reuse=True)
       with variable_scope.variable_scope('new_scope'):
-        # This should attempt to reuse 'my_var' and 'my_call_var' in 'new_scope'
+        # This should attempt to reuse 'my_var' in 'new_scope'
         with self.assertRaisesRegexp(
             ValueError, r'new_scope/my_layer/my_var does not exist'):
           lazy_layer.apply(inputs)
         with variable_scope.variable_scope('my_layer'):
           variable_scope.get_variable('my_var', [2, 2])
-        with self.assertRaisesRegexp(
-            ValueError, r'new_scope/my_layer/my_call_var does not exist'):
-          lazy_layer.apply(inputs)
-        with variable_scope.variable_scope('my_layer'):
-          variable_scope.get_variable('my_call_var', [2, 2])
+
         # Smoke test: it runs.
         lazy_layer.apply(inputs)
         # The variables were created outside of the Layer, and
         # reuse=True, so the Layer does not own them and they are not
         # stored in its collection.
         self.assertListEqual(lazy_layer.variables, [])
-        self.assertEqual(lazy_layer.name, 'new_scope/my_layer')
+        self.assertEqual(lazy_layer._scope.name, 'new_scope/my_layer')
 
       # Creating a layer with no scope leads to lazy construction of
       # the scope at apply() time.  If 'scope' argument is passed to
       # apply(), it uses that scope when accessing variables.
       lazy_layer = MyLayer(_reuse=True)
       with variable_scope.variable_scope('new_scope') as new_scope:
-        # This should attempt to reuse 'my_var' and 'my_call_var' in 'new_scope'
+        # This should attempt to reuse 'my_var' in 'new_scope'
         with self.assertRaisesRegexp(
             ValueError, r'new_scope/my_var does not exist'):
           lazy_layer.apply(inputs, scope=new_scope)
         variable_scope.get_variable('my_var', [2, 2])
-        with self.assertRaisesRegexp(
-            ValueError, r'new_scope/my_call_var does not exist'):
-          lazy_layer.apply(inputs, scope=new_scope)
-        variable_scope.get_variable('my_call_var', [2, 2])
+
         # Smoke test: it runs.
         lazy_layer.apply(inputs, scope=new_scope)
         # The variables were created outside of the Layer, and
         # reuse=True, so the Layer does not own them and they are not
         # stored in its collection.
         self.assertListEqual(lazy_layer.variables, [])
-        self.assertEqual(lazy_layer.name, 'new_scope')
+        self.assertEqual(lazy_layer._scope.name, 'new_scope')
 
       with ops.Graph().as_default():
         inputs_ng = random_ops.random_uniform((5,), seed=1)
         with self.assertRaisesRegexp(ValueError,
-                                     r'graphs are not the same'):
+                                     r'graph are not the same'):
           layer.apply(inputs_ng)
 
   def testCall(self):
 
-    class MyLayer(base_layers._Layer):
+    class MyLayer(base_layers.Layer):
 
       def call(self, inputs):
         return math_ops.square(inputs)
@@ -166,9 +153,39 @@ class BaseLayerTest(test.TestCase):
     self.assertEqual(layer.built, True)
     self.assertEqual(outputs.op.name, 'my_layer/Square')
 
+  def testFirstCallCanCreateVariablesButSecondCanNotWhenBuildEmpty(self):
+
+    class MyLayer(base_layers.Layer):
+
+      def build(self, _):
+        # Do not mark the layer as built.
+        pass
+
+      def call(self, inputs):
+        self.my_var = self.add_variable('my_var', [2, 2])
+        if self.built:
+          # Skip creating on the first call; try to create after it's
+          # built.  This is expected to fail.
+          self.add_variable('this_will_break_on_second_call', [2, 2])
+        return inputs + math_ops.square(self.my_var)
+
+    layer = MyLayer(name='my_layer')
+    inputs = random_ops.random_uniform((2,), seed=1)
+    outputs = layer.apply(inputs)
+    self.assertEqual(layer.built, True)
+    self.assertEqual(outputs.op.name, 'my_layer/add')
+    self.assertListEqual(
+        [v.name for v in layer.variables], ['my_layer/my_var:0'])
+    with self.assertRaisesRegexp(ValueError,
+                                 'my_layer/this_will_break_on_second_call'):
+      layer.apply(inputs)
+    # The list of variables hasn't changed.
+    self.assertListEqual(
+        [v.name for v in layer.variables], ['my_layer/my_var:0'])
+
   def testDeepCopy(self):
 
-    class MyLayer(base_layers._Layer):
+    class MyLayer(base_layers.Layer):
 
       def call(self, inputs):
         return math_ops.square(inputs)
@@ -184,9 +201,9 @@ class BaseLayerTest(test.TestCase):
     self.assertEqual(layer_copy._scope.name, layer._scope.name)
     self.assertEqual(layer_copy._graph, layer._graph)
 
-  def testNaming(self):
+  def testScopeNaming(self):
 
-    class PrivateLayer(base_layers._Layer):
+    class PrivateLayer(base_layers.Layer):
 
       def call(self, inputs):
         return None
@@ -194,41 +211,42 @@ class BaseLayerTest(test.TestCase):
     inputs = random_ops.random_uniform((5,))
     default_layer = PrivateLayer()
     _ = default_layer.apply(inputs)
-    self.assertEqual(default_layer.name, 'private_layer')
+    self.assertEqual(default_layer._scope.name, 'private_layer')
     default_layer1 = PrivateLayer()
     default_layer1.apply(inputs)
-    self.assertEqual(default_layer1.name, 'private_layer_1')
+    self.assertEqual(default_layer1._scope.name, 'private_layer_1')
     my_layer = PrivateLayer(name='my_layer')
     my_layer.apply(inputs)
-    self.assertEqual(my_layer.name, 'my_layer')
+    self.assertEqual(my_layer._scope.name, 'my_layer')
     my_layer1 = PrivateLayer(name='my_layer')
     my_layer1.apply(inputs)
-    self.assertEqual(my_layer1.name, 'my_layer_1')
+    self.assertEqual(my_layer1._scope.name, 'my_layer_1')
     my_layer2 = PrivateLayer(name='my_layer')
     my_layer2.apply(inputs)
-    self.assertEqual(my_layer2.name, 'my_layer_2')
+    self.assertEqual(my_layer2._scope.name, 'my_layer_2')
     # Name scope shouldn't affect names.
     with ops.name_scope('some_name_scope'):
       default_layer2 = PrivateLayer()
       default_layer2.apply(inputs)
-      self.assertEqual(default_layer2.name, 'private_layer_2')
+      self.assertEqual(default_layer2._scope.name, 'private_layer_2')
       my_layer3 = PrivateLayer(name='my_layer')
       my_layer3.apply(inputs)
-      self.assertEqual(my_layer3.name, 'my_layer_3')
+      self.assertEqual(my_layer3._scope.name, 'my_layer_3')
       other_layer = PrivateLayer(name='other_layer')
       other_layer.apply(inputs)
-      self.assertEqual(other_layer.name, 'other_layer')
-    # Variable scope gets added to names.
+      self.assertEqual(other_layer._scope.name, 'other_layer')
+    # Variable scope gets added to scope names.
     with variable_scope.variable_scope('var_scope'):
       default_layer_scoped = PrivateLayer()
       default_layer_scoped.apply(inputs)
-      self.assertEqual(default_layer_scoped.name, 'var_scope/private_layer')
+      self.assertEqual(default_layer_scoped._scope.name,
+                       'var_scope/private_layer')
       my_layer_scoped = PrivateLayer(name='my_layer')
       my_layer_scoped.apply(inputs)
-      self.assertEqual(my_layer_scoped.name, 'var_scope/my_layer')
+      self.assertEqual(my_layer_scoped._scope.name, 'var_scope/my_layer')
       my_layer_scoped1 = PrivateLayer(name='my_layer')
       my_layer_scoped1.apply(inputs)
-      self.assertEqual(my_layer_scoped1.name, 'var_scope/my_layer_1')
+      self.assertEqual(my_layer_scoped1._scope.name, 'var_scope/my_layer_1')
 
 
 if __name__ == '__main__':
diff --git a/tensorflow/python/layers/convolutional.py b/tensorflow/python/layers/convolutional.py
index 04fec38b2111c5daf62658a57e0bbe05610249b4..707d30b0e0f2a1456b2d22eac0bc5a052deb6966 100644
--- a/tensorflow/python/layers/convolutional.py
+++ b/tensorflow/python/layers/convolutional.py
@@ -37,14 +37,14 @@ from tensorflow.python.layers import base
 from tensorflow.python.layers import utils
 
 
-class _Conv(base._Layer):  # pylint: disable=protected-access
+class _Conv(base.Layer):
   """Abstract nD convolution layer (private, used as implementation base).
 
   This layer creates a convolution kernel that is convolved
   (actually cross-correlated) with the layer input to produce a tensor of
-  outputs. If `use_bias` is True (and a `bias_initializer` is provided),
-  a bias vector is created and added to the outputs. Finally, if
-  `activation` is not `None`, it is applied to the outputs as well.
+  outputs. If `use_bias` is True, a bias vector is created and added to the
+  outputs. Finally, if `activation` is not `None`, it is applied to the outputs
+  as well.
 
   Arguments:
     rank: An integer, the rank of the convolution, e.g. "2" for 2D convolution.
@@ -70,8 +70,8 @@ class _Conv(base._Layer):  # pylint: disable=protected-access
       linear activation.
     use_bias: Boolean, whether the layer uses a bias.
     kernel_initializer: An initializer for the convolution kernel.
-    bias_initializer: An initializer for the bias vector. If None, no bias will
-      be applied.
+    bias_initializer: An initializer for the bias vector. Will be ignored if
+      `use_bias` is `False`.
     kernel_regularizer: Optional regularizer for the convolution kernel.
     bias_regularizer: Optional regularizer for the bias vector.
     activity_regularizer: Regularizer function for the output.
@@ -130,21 +130,22 @@ class _Conv(base._Layer):  # pylint: disable=protected-access
     input_dim = input_shape[channel_axis].value
     kernel_shape = self.kernel_size + (input_dim, self.filters)
 
-    self.kernel = vs.get_variable('kernel',
-                                  shape=kernel_shape,
-                                  initializer=self.kernel_initializer,
-                                  regularizer=self.kernel_regularizer,
-                                  trainable=True,
-                                  dtype=self.dtype)
+    self.kernel = self.add_variable(name='kernel',
+                                    shape=kernel_shape,
+                                    initializer=self.kernel_initializer,
+                                    regularizer=self.kernel_regularizer,
+                                    trainable=True,
+                                    dtype=self.dtype)
     if self.use_bias:
-      self.bias = vs.get_variable('bias',
-                                  shape=(self.filters,),
-                                  initializer=self.bias_initializer,
-                                  regularizer=self.bias_regularizer,
-                                  trainable=True,
-                                  dtype=self.dtype)
+      self.bias = self.add_variable(name='bias',
+                                    shape=(self.filters,),
+                                    initializer=self.bias_initializer,
+                                    regularizer=self.bias_regularizer,
+                                    trainable=True,
+                                    dtype=self.dtype)
     else:
       self.bias = None
+    self.built = True
 
   def call(self, inputs):
     outputs = nn.convolution(
@@ -192,9 +193,9 @@ class Conv1D(_Conv):
 
   This layer creates a convolution kernel that is convolved
   (actually cross-correlated) with the layer input to produce a tensor of
-  outputs. If `use_bias` is True (and a `bias_initializer` is provided),
-  a bias vector is created and added to the outputs. Finally, if
-  `activation` is not `None`, it is applied to the outputs as well.
+  outputs. If `use_bias` is True, a bias vector is created and added to the
+  outputs. Finally, if `activation` is not `None`, it is applied to the outputs
+  as well.
 
   Arguments:
     filters: Integer, the dimensionality of the output space (i.e. the number
@@ -219,8 +220,8 @@ class Conv1D(_Conv):
       linear activation.
     use_bias: Boolean, whether the layer uses a bias.
     kernel_initializer: An initializer for the convolution kernel.
-    bias_initializer: An initializer for the bias vector. If None, no bias will
-      be applied.
+    bias_initializer: An initializer for the bias vector. Will be ignored if
+      `use_bias` is `False`.
     kernel_regularizer: Optional regularizer for the convolution kernel.
     bias_regularizer: Optional regularizer for the bias vector.
     activity_regularizer: Regularizer function for the output.
@@ -285,9 +286,9 @@ def conv1d(inputs,
 
   This layer creates a convolution kernel that is convolved
   (actually cross-correlated) with the layer input to produce a tensor of
-  outputs. If `use_bias` is True (and a `bias_initializer` is provided),
-  a bias vector is created and added to the outputs. Finally, if
-  `activation` is not `None`, it is applied to the outputs as well.
+  outputs. If `use_bias` is True, a bias vector is created and added to the
+  outputs. Finally, if `activation` is not `None`, it is applied to the outputs
+  as well.
 
   Arguments:
     inputs: Tensor input.
@@ -313,8 +314,8 @@ def conv1d(inputs,
       linear activation.
     use_bias: Boolean, whether the layer uses a bias.
     kernel_initializer: An initializer for the convolution kernel.
-    bias_initializer: An initializer for the bias vector. If None, no bias will
-      be applied.
+    bias_initializer: An initializer for the bias vector. Will be ignored if
+      `use_bias` is `False`.
     kernel_regularizer: Optional regularizer for the convolution kernel.
     bias_regularizer: Optional regularizer for the bias vector.
     activity_regularizer: Regularizer function for the output.
@@ -353,9 +354,9 @@ class Conv2D(_Conv):
 
   This layer creates a convolution kernel that is convolved
   (actually cross-correlated) with the layer input to produce a tensor of
-  outputs. If `use_bias` is True (and a `bias_initializer` is provided),
-  a bias vector is created and added to the outputs. Finally, if
-  `activation` is not `None`, it is applied to the outputs as well.
+  outputs. If `use_bias` is True, a bias vector is created and added to the
+  outputs. Finally, if `activation` is not `None`, it is applied to the outputs
+  as well.
 
   Arguments:
     filters: Integer, the dimensionality of the output space (i.e. the number
@@ -387,8 +388,8 @@ class Conv2D(_Conv):
       linear activation.
     use_bias: Boolean, whether the layer uses a bias.
     kernel_initializer: An initializer for the convolution kernel.
-    bias_initializer: An initializer for the bias vector. If None, no bias will
-      be applied.
+    bias_initializer: An initializer for the bias vector. Will be ignored if
+      `use_bias` is `False`.
     kernel_regularizer: Optional regularizer for the convolution kernel.
     bias_regularizer: Optional regularizer for the bias vector.
     activity_regularizer: Regularizer function for the output.
@@ -453,9 +454,9 @@ def conv2d(inputs,
 
   This layer creates a convolution kernel that is convolved
   (actually cross-correlated) with the layer input to produce a tensor of
-  outputs. If `use_bias` is True (and a `bias_initializer` is provided),
-  a bias vector is created and added to the outputs. Finally, if
-  `activation` is not `None`, it is applied to the outputs as well.
+  outputs. If `use_bias` is True, a bias vector is created and added to the
+  outputs. Finally, if `activation` is not `None`, it is applied to the outputs
+  as well.
 
   Arguments:
     inputs: Tensor input.
@@ -488,8 +489,8 @@ def conv2d(inputs,
       linear activation.
     use_bias: Boolean, whether the layer uses a bias.
     kernel_initializer: An initializer for the convolution kernel.
-    bias_initializer: An initializer for the bias vector. If None, no bias will
-      be applied.
+    bias_initializer: An initializer for the bias vector. Will be ignored if
+      `use_bias` is `False`.
     kernel_regularizer: Optional regularizer for the convolution kernel.
     bias_regularizer: Optional regularizer for the bias vector.
     activity_regularizer: Regularizer function for the output.
@@ -528,9 +529,9 @@ class Conv3D(_Conv):
 
   This layer creates a convolution kernel that is convolved
   (actually cross-correlated) with the layer input to produce a tensor of
-  outputs. If `use_bias` is True (and a `bias_initializer` is provided),
-  a bias vector is created and added to the outputs. Finally, if
-  `activation` is not `None`, it is applied to the outputs as well.
+  outputs. If `use_bias` is True, a bias vector is created and added to the
+  outputs. Finally, if `activation` is not `None`, it is applied to the outputs
+  as well.
 
   Arguments:
     filters: Integer, the dimensionality of the output space (i.e. the number
@@ -563,8 +564,8 @@ class Conv3D(_Conv):
       linear activation.
     use_bias: Boolean, whether the layer uses a bias.
     kernel_initializer: An initializer for the convolution kernel.
-    bias_initializer: An initializer for the bias vector. If None, no bias will
-      be applied.
+    bias_initializer: An initializer for the bias vector. Will be ignored if
+      `use_bias` is `False`.
     kernel_regularizer: Optional regularizer for the convolution kernel.
     bias_regularizer: Optional regularizer for the bias vector.
     activity_regularizer: Regularizer function for the output.
@@ -629,9 +630,9 @@ def conv3d(inputs,
 
   This layer creates a convolution kernel that is convolved
   (actually cross-correlated) with the layer input to produce a tensor of
-  outputs. If `use_bias` is True (and a `bias_initializer` is provided),
-  a bias vector is created and added to the outputs. Finally, if
-  `activation` is not `None`, it is applied to the outputs as well.
+  outputs. If `use_bias` is True, a bias vector is created and added to the
+  outputs. Finally, if `activation` is not `None`, it is applied to the outputs
+  as well.
 
   Arguments:
     inputs: Tensor input.
@@ -665,8 +666,8 @@ def conv3d(inputs,
       linear activation.
     use_bias: Boolean, whether the layer uses a bias.
     kernel_initializer: An initializer for the convolution kernel.
-    bias_initializer: An initializer for the bias vector. If None, no bias will
-      be applied.
+    bias_initializer: An initializer for the bias vector. Will be ignored if
+      `use_bias` is `False`.
     kernel_regularizer: Optional regularizer for the convolution kernel.
     bias_regularizer: Optional regularizer for the bias vector.
     activity_regularizer: Regularizer function for the output.
@@ -703,11 +704,10 @@ def conv3d(inputs,
 class SeparableConv2D(Conv2D):
   """Depthwise separable 2D convolution.
 
-  This layer performs a depthwise convolution that acts separately on
-  channels, followed by a pointwise convolution that mixes channels.
-  If `use_bias` is True and a bias initializer is provided,
-  it adds a bias vector to the output.
-  It then optionally applies an activation function to produce the final output.
+  This layer performs a depthwise convolution that acts separately on channels,
+  followed by a pointwise convolution that mixes channels. If `use_bias` is
+  True, it adds a bias vector to the output. It then optionally applies an
+  activation function to produce the final output.
 
   Arguments:
     filters: Integer, the dimensionality of the output space (i.e. the number
@@ -741,8 +741,8 @@ class SeparableConv2D(Conv2D):
     use_bias: Boolean, whether the layer uses a bias.
     depthwise_initializer: An initializer for the depthwise convolution kernel.
     pointwise_initializer: An initializer for the pointwise convolution kernel.
-    bias_initializer: An initializer for the bias vector. If None, no bias will
-      be applied.
+    bias_initializer: An initializer for the bias vector. Will be ignored if
+      `use_bias` is `False`.
     depthwise_regularizer: Optional regularizer for the depthwise
       convolution kernel.
     pointwise_regularizer: Optional regularizer for the pointwise
@@ -814,29 +814,30 @@ class SeparableConv2D(Conv2D):
                               self.depth_multiplier * input_dim,
                               self.filters)
 
-    self.depthwise_kernel = vs.get_variable(
-        'depthwise_kernel',
+    self.depthwise_kernel = self.add_variable(
+        name='depthwise_kernel',
         shape=depthwise_kernel_shape,
         initializer=self.depthwise_initializer,
         regularizer=self.depthwise_regularizer,
         trainable=True,
         dtype=self.dtype)
-    self.pointwise_kernel = vs.get_variable(
-        'pointwise_kernel',
+    self.pointwise_kernel = self.add_variable(
+        name='pointwise_kernel',
         shape=pointwise_kernel_shape,
         initializer=self.pointwise_initializer,
         regularizer=self.pointwise_regularizer,
         trainable=True,
         dtype=self.dtype)
     if self.use_bias:
-      self.bias = vs.get_variable('bias',
-                                  shape=(self.filters,),
-                                  initializer=self.bias_initializer,
-                                  regularizer=self.bias_regularizer,
-                                  trainable=True,
-                                  dtype=self.dtype)
+      self.bias = self.add_variable(name='bias',
+                                    shape=(self.filters,),
+                                    initializer=self.bias_initializer,
+                                    regularizer=self.bias_regularizer,
+                                    trainable=True,
+                                    dtype=self.dtype)
     else:
       self.bias = None
+    self.built = True
 
   def call(self, inputs):
     if self.data_format == 'channels_first':
@@ -889,11 +890,10 @@ def separable_conv2d(inputs,
                      reuse=None):
   """Functional interface for the depthwise separable 2D convolution layer.
 
-  This layer performs a depthwise convolution that acts separately on
-  channels, followed by a pointwise convolution that mixes channels.
-  If `use_bias` is True and a bias initializer is provided,
-  it adds a bias vector to the output.
-  It then optionally applies an activation function to produce the final output.
+  This layer performs a depthwise convolution that acts separately on channels,
+  followed by a pointwise convolution that mixes channels. If `use_bias` is
+  True, it adds a bias vector to the output. It then optionally applies an
+  activation function to produce the final output.
 
   Arguments:
     inputs: Input tensor.
@@ -928,8 +928,8 @@ def separable_conv2d(inputs,
     use_bias: Boolean, whether the layer uses a bias.
     depthwise_initializer: An initializer for the depthwise convolution kernel.
     pointwise_initializer: An initializer for the pointwise convolution kernel.
-    bias_initializer: An initializer for the bias vector. If None, no bias will
-      be applied.
+    bias_initializer: An initializer for the bias vector. Will be ignored if
+      `use_bias` is `False`.
     depthwise_regularizer: Optional regularizer for the depthwise
       convolution kernel.
     pointwise_regularizer: Optional regularizer for the pointwise
@@ -972,12 +972,11 @@ def separable_conv2d(inputs,
 class Conv2DTranspose(Conv2D):
   """Transposed 2D convolution layer (sometimes called 2D Deconvolution).
 
-  The need for transposed convolutions generally arises
-  from the desire to use a transformation going in the opposite direction
-  of a normal convolution, i.e., from something that has the shape of the
-  output of some convolution to something that has the shape of its input
-  while maintaining a connectivity pattern that is compatible with
-  said convolution.
+  The need for transposed convolutions generally arises from the desire to use a
+  transformation going in the opposite direction of a normal convolution, i.e.,
+  from something that has the shape of the output of some convolution to
+  something that has the shape of its input while maintaining a connectivity
+  pattern that is compatible with said convolution.
 
   Arguments:
     filters: Integer, the dimensionality of the output space (i.e. the number
@@ -998,8 +997,8 @@ class Conv2DTranspose(Conv2D):
       linear activation.
     use_bias: Boolean, whether the layer uses a bias.
     kernel_initializer: An initializer for the convolution kernel.
-    bias_initializer: An initializer for the bias vector. If None, no bias will
-      be applied.
+    bias_initializer: An initializer for the bias vector. Will be ignored if
+      `use_bias` is `False`.
     kernel_regularizer: Optional regularizer for the convolution kernel.
     bias_regularizer: Optional regularizer for the bias vector.
     activity_regularizer: Regularizer function for the output.
@@ -1055,21 +1054,22 @@ class Conv2DTranspose(Conv2D):
     input_dim = input_shape[channel_axis]
     kernel_shape = self.kernel_size + (self.filters, input_dim)
 
-    self.kernel = vs.get_variable('kernel',
-                                  shape=kernel_shape,
-                                  initializer=self.kernel_initializer,
-                                  regularizer=self.kernel_regularizer,
-                                  trainable=True,
-                                  dtype=self.dtype)
+    self.kernel = self.add_variable(name='kernel',
+                                    shape=kernel_shape,
+                                    initializer=self.kernel_initializer,
+                                    regularizer=self.kernel_regularizer,
+                                    trainable=True,
+                                    dtype=self.dtype)
     if self.use_bias:
-      self.bias = vs.get_variable('bias',
-                                  shape=(self.filters,),
-                                  initializer=self.bias_initializer,
-                                  regularizer=self.bias_regularizer,
-                                  trainable=True,
-                                  dtype=self.dtype)
+      self.bias = self.add_variable(name='bias',
+                                    shape=(self.filters,),
+                                    initializer=self.bias_initializer,
+                                    regularizer=self.bias_regularizer,
+                                    trainable=True,
+                                    dtype=self.dtype)
     else:
       self.bias = None
+    self.built = True
 
   def call(self, inputs):
     inputs_shape = array_ops.shape(inputs)
@@ -1106,10 +1106,10 @@ class Conv2DTranspose(Conv2D):
     # Infer the static output shape:
     out_shape = inputs.get_shape().as_list()
     out_shape[c_axis] = self.filters
-    out_shape[h_axis] = utils.get_deconv_dim(
-        out_shape[h_axis], stride_h, kernel_h, self.padding)
-    out_shape[w_axis] = utils.get_deconv_dim(
-        out_shape[w_axis], stride_w, kernel_w, self.padding)
+    out_shape[h_axis] = utils.get_deconv_dim(out_shape[h_axis], stride_h,
+                                             kernel_h, self.padding)
+    out_shape[w_axis] = utils.get_deconv_dim(out_shape[w_axis], stride_w,
+                                             kernel_w, self.padding)
     outputs.set_shape(out_shape)
 
     if self.bias:
@@ -1141,12 +1141,11 @@ def conv2d_transpose(inputs,
                      reuse=None):
   """Functional interface for transposed 2D convolution layer.
 
-  The need for transposed convolutions generally arises
-  from the desire to use a transformation going in the opposite direction
-  of a normal convolution, i.e., from something that has the shape of the
-  output of some convolution to something that has the shape of its input
-  while maintaining a connectivity pattern that is compatible with
-  said convolution.
+  The need for transposed convolutions generally arises from the desire to use a
+  transformation going in the opposite direction of a normal convolution, i.e.,
+  from something that has the shape of the output of some convolution to
+  something that has the shape of its input while maintaining a connectivity
+  pattern that is compatible with said convolution.
 
   Arguments:
     inputs: Input tensor.
@@ -1168,8 +1167,8 @@ def conv2d_transpose(inputs,
       linear activation.
     use_bias: Boolean, whether the layer uses a bias.
     kernel_initializer: An initializer for the convolution kernel.
-    bias_initializer: An initializer for the bias vector. If `None`, then no
-      bias will be applied.
+    bias_initializer: An initializer for the bias vector. Will be ignored if
+      `use_bias` is `False`.
     kernel_regularizer: Optional regularizer for the convolution kernel.
     bias_regularizer: Optional regularizer for the bias vector.
     activity_regularizer: Regularizer function for the output.
@@ -1227,8 +1226,8 @@ class Conv3DTranspose(Conv3D):
       linear activation.
     use_bias: Boolean, whether the layer uses a bias.
     kernel_initializer: An initializer for the convolution kernel.
-    bias_initializer: An initializer for the bias vector. If `None`, then no
-      bias will be applied.
+    bias_initializer: An initializer for the bias vector. Will be ignored if
+      `use_bias` is `False`.
     kernel_regularizer: Optional regularizer for the convolution kernel.
     bias_regularizer: Optional regularizer for the bias vector.
     activity_regularizer: Regularizer function for the output.
@@ -1237,7 +1236,8 @@ class Conv3DTranspose(Conv3D):
     name: A string, the name of the layer.
   """
 
-  def __init__(self, filters,
+  def __init__(self,
+               filters,
                kernel_size,
                strides=(1, 1, 1),
                padding='valid',
@@ -1266,12 +1266,13 @@ class Conv3DTranspose(Conv3D):
         bias_regularizer=bias_regularizer,
         activity_regularizer=activity_regularizer,
         trainable=trainable,
-        name=name, **kwargs)
+        name=name,
+        **kwargs)
 
   def build(self, input_shape):
     if len(input_shape) != 5:
-      raise ValueError('Inputs should have rank 5, ' +
-                       'received input shape:', str(input_shape))
+      raise ValueError('Inputs should have rank 5, received input shape:',
+                       str(input_shape))
     if self.data_format == 'channels_first':
       channel_axis = 1
     else:
@@ -1282,19 +1283,21 @@ class Conv3DTranspose(Conv3D):
     input_dim = input_shape[channel_axis]
     kernel_shape = self.kernel_size + (self.filters, input_dim)
 
-    self.kernel = vs.get_variable('kernel',
-                                  shape=kernel_shape,
-                                  initializer=self.kernel_initializer,
-                                  regularizer=self.kernel_regularizer,
-                                  trainable=True,
-                                  dtype=self.dtype)
+    self.kernel = self.add_variable(
+        'kernel',
+        shape=kernel_shape,
+        initializer=self.kernel_initializer,
+        regularizer=self.kernel_regularizer,
+        trainable=True,
+        dtype=self.dtype)
     if self.use_bias:
-      self.bias = vs.get_variable('bias',
-                                  shape=(self.filters,),
-                                  initializer=self.bias_initializer,
-                                  regularizer=self.bias_regularizer,
-                                  trainable=True,
-                                  dtype=self.dtype)
+      self.bias = self.add_variable(
+          'bias',
+          shape=(self.filters,),
+          initializer=self.bias_initializer,
+          regularizer=self.bias_regularizer,
+          trainable=True,
+          dtype=self.dtype)
     else:
       self.bias = None
 
@@ -1339,26 +1342,26 @@ class Conv3DTranspose(Conv3D):
     # Infer the static output shape:
     out_shape = inputs.get_shape().as_list()
     out_shape[c_axis] = self.filters
-    out_shape[d_axis] = utils.get_deconv_dim(
-        out_shape[d_axis], stride_d, kernel_d, self.padding)
-    out_shape[h_axis] = utils.get_deconv_dim(
-        out_shape[h_axis], stride_h, kernel_h, self.padding)
-    out_shape[w_axis] = utils.get_deconv_dim(
-        out_shape[w_axis], stride_w, kernel_w, self.padding)
+    out_shape[d_axis] = utils.get_deconv_dim(out_shape[d_axis], stride_d,
+                                             kernel_d, self.padding)
+    out_shape[h_axis] = utils.get_deconv_dim(out_shape[h_axis], stride_h,
+                                             kernel_h, self.padding)
+    out_shape[w_axis] = utils.get_deconv_dim(out_shape[w_axis], stride_w,
+                                             kernel_w, self.padding)
     outputs.set_shape(out_shape)
 
     if self.bias:
       outputs_shape = outputs.shape.as_list()
       if self.data_format == 'channels_first':
-        outputs_4d = array_ops.reshape(outputs,
-                                       [outputs_shape[0], outputs_shape[1],
-                                        outputs_shape[2] * outputs_shape[3],
-                                        outputs_shape[4]])
+        outputs_4d = array_ops.reshape(outputs, [
+            outputs_shape[0], outputs_shape[1],
+            outputs_shape[2] * outputs_shape[3], outputs_shape[4]
+        ])
       else:
-        outputs_4d = array_ops.reshape(outputs,
-                                       [outputs_shape[0],
-                                        outputs_shape[1] * outputs_shape[2],
-                                        outputs_shape[3], outputs_shape[4]])
+        outputs_4d = array_ops.reshape(outputs, [
+            outputs_shape[0], outputs_shape[1] * outputs_shape[2],
+            outputs_shape[3], outputs_shape[4]
+        ])
       outputs_4d = nn.bias_add(
           outputs_4d,
           self.bias,
@@ -1408,8 +1411,8 @@ def conv3d_transpose(inputs,
       linear activation.
     use_bias: Boolean, whether the layer uses a bias.
     kernel_initializer: An initializer for the convolution kernel.
-    bias_initializer: An initializer for the bias vector. If None, no bias will
-      be applied.
+    bias_initializer: An initializer for the bias vector. Will be ignored if
+      `use_bias` is `False`.
     kernel_regularizer: Optional regularizer for the convolution kernel.
     bias_regularizer: Optional regularizer for the bias vector.
     activity_regularizer: Regularizer function for the output.
@@ -1456,4 +1459,3 @@ convolution3d = conv3d
 separable_convolution2d = separable_conv2d
 convolution2d_transpose = deconvolution2d = deconv2d = conv2d_transpose
 convolution3d_transpose = deconvolution3d = deconv3d = conv3d_transpose
-
diff --git a/tensorflow/python/layers/convolutional_test.py b/tensorflow/python/layers/convolutional_test.py
index 635cc24714c7e3794fe5ed5b79a57f0b8d542dd8..42a2d775349042f20c48e40385c440705d92912b 100644
--- a/tensorflow/python/layers/convolutional_test.py
+++ b/tensorflow/python/layers/convolutional_test.py
@@ -715,8 +715,8 @@ class Conv3DTransposeTest(test.TestCase):
     layer = conv_layers.Conv3DTranspose(
         32, volumes.get_shape()[1:4], padding='same')
     output = layer.apply(volumes)
-    self.assertListEqual(output.get_shape().as_list(), [5, depth, height,
-                                                        width, 32])
+    self.assertListEqual(output.get_shape().as_list(),
+                         [5, depth, height, width, 32])
 
   def testCreateConv3DTransposeWithStrides(self):
     depth, height, width = 4, 6, 8
@@ -729,8 +729,7 @@ class Conv3DTransposeTest(test.TestCase):
                          [5, depth * 2, height * 2, width * 2, 4])
 
     # Test strides integer.
-    layer = conv_layers.Conv3DTranspose(4, [3, 3, 3], strides=2,
-                                        padding='same')
+    layer = conv_layers.Conv3DTranspose(4, [3, 3, 3], strides=2, padding='same')
     output = layer.apply(volumes)
     self.assertListEqual(output.get_shape().as_list(),
                          [5, depth * 2, height * 2, width * 2, 4])
@@ -779,14 +778,14 @@ class Conv3DTransposeTest(test.TestCase):
     volumes = random_ops.random_uniform((5, depth, height, width, 32), seed=1)
     conv_layers.conv3d_transpose(volumes, 4, [3, 3, 3], name='deconv1')
     self.assertEqual(len(variables.trainable_variables()), 2)
-    conv_layers.conv3d_transpose(volumes, 4, [3, 3, 3], name='deconv1', reuse=True)
+    conv_layers.conv3d_transpose(
+        volumes, 4, [3, 3, 3], name='deconv1', reuse=True)
     self.assertEqual(len(variables.trainable_variables()), 2)
 
   def testFunctionalConv3DTransposeReuseFromScope(self):
     with variable_scope.variable_scope('scope'):
       depth, height, width = 5, 7, 9
-      volumes = random_ops.random_uniform((5, depth, height, width, 32),
-                                          seed=1)
+      volumes = random_ops.random_uniform((5, depth, height, width, 32), seed=1)
       conv_layers.conv3d_transpose(volumes, 4, [3, 3, 3], name='deconv1')
       self.assertEqual(len(variables.trainable_variables()), 2)
     with variable_scope.variable_scope('scope', reuse=True):
@@ -798,8 +797,8 @@ class Conv3DTransposeTest(test.TestCase):
       with variable_scope.variable_scope(
           'scope', initializer=init_ops.ones_initializer()):
         depth, height, width = 5, 7, 9
-        volumes = random_ops.random_uniform((5, depth, height, width, 32),
-                                            seed=1)
+        volumes = random_ops.random_uniform(
+            (5, depth, height, width, 32), seed=1)
         conv_layers.conv3d_transpose(volumes, 4, [3, 3, 3], name='deconv1')
         weights = variables.trainable_variables()
         # Check the names of weights in order.
diff --git a/tensorflow/python/layers/core.py b/tensorflow/python/layers/core.py
index b5846ae3d2f8d53e99de1e292800bd0863fe7695..1ec4e51e5ea7e20849670bb566e65e55e21be917 100644
--- a/tensorflow/python/layers/core.py
+++ b/tensorflow/python/layers/core.py
@@ -38,7 +38,7 @@ from tensorflow.python.layers import base
 from tensorflow.python.layers import utils
 
 
-class Dense(base._Layer):  # pylint: disable=protected-access
+class Dense(base.Layer):
   """Densely-connected layer class.
 
   This layer implements the operation:
@@ -115,21 +115,22 @@ class Dense(base._Layer):  # pylint: disable=protected-access
     # weight of the layer. If the layer is not trainable
     # (self.trainable = False), the variable will not be added to
     # tf.trainable_variables(), and self.trainable_weights will be empty.
-    self.kernel = vs.get_variable('kernel',
-                                  shape=[input_shape[-1].value, self.units],
-                                  initializer=self.kernel_initializer,
-                                  regularizer=self.kernel_regularizer,
-                                  dtype=self.dtype,
-                                  trainable=True)
+    self.kernel = self.add_variable('kernel',
+                                    shape=[input_shape[-1].value, self.units],
+                                    initializer=self.kernel_initializer,
+                                    regularizer=self.kernel_regularizer,
+                                    dtype=self.dtype,
+                                    trainable=True)
     if self.use_bias:
-      self.bias = vs.get_variable('bias',
-                                  shape=[self.units,],
-                                  initializer=self.bias_initializer,
-                                  regularizer=self.bias_regularizer,
-                                  dtype=self.dtype,
-                                  trainable=True)
+      self.bias = self.add_variable('bias',
+                                    shape=[self.units,],
+                                    initializer=self.bias_initializer,
+                                    regularizer=self.bias_regularizer,
+                                    dtype=self.dtype,
+                                    trainable=True)
     else:
       self.bias = None
+    self.built = True
 
   def call(self, inputs):
     inputs = ops.convert_to_tensor(inputs, dtype=self.dtype)
@@ -219,7 +220,7 @@ def dense(
   return layer.apply(inputs)
 
 
-class Dropout(base._Layer):  # pylint: disable=protected-access
+class Dropout(base.Layer):
   """Applies Dropout to the input.
 
   Dropout consists in randomly setting a fraction `rate` of input units to 0
diff --git a/tensorflow/python/layers/core_test.py b/tensorflow/python/layers/core_test.py
index df650535d4e21d9d48ba6ac3e1317efeb0bdd2a3..39399691590a7c59c7b58f2acf9433e3ee1aa18f 100644
--- a/tensorflow/python/layers/core_test.py
+++ b/tensorflow/python/layers/core_test.py
@@ -44,16 +44,14 @@ class DenseTest(test.TestCase):
     self.assertEqual(dense.bias_regularizer, None)
     self.assertEqual(dense.activity_regularizer, None)
     self.assertEqual(dense.use_bias, True)
-    with self.assertRaisesRegexp(ValueError, 'not been used yet'):
-      _ = dense.name
 
     # Test auto-naming
     dense = core_layers.Dense(2, activation=nn_ops.relu)
     dense.apply(np.random.randn(0, 2))
-    self.assertEqual(dense.name, 'dense')
+    self.assertEqual(dense.name, 'dense_1')
     dense = core_layers.Dense(2, activation=nn_ops.relu)
     dense.apply(np.random.randn(0, 2))
-    self.assertEqual(dense.name, 'dense_1')
+    self.assertEqual(dense.name, 'dense_2')
 
   def testCall(self):
     dense = core_layers.Dense(2, activation=nn_ops.relu, name='my_dense')
@@ -62,8 +60,6 @@ class DenseTest(test.TestCase):
     self.assertListEqual(dense.variables, [dense.kernel, dense.bias])
     self.assertListEqual(dense.trainable_variables, [dense.kernel, dense.bias])
     self.assertListEqual(dense.non_trainable_variables, [])
-    self.assertListEqual(dense._trainable_variables, [dense.kernel, dense.bias])
-    self.assertListEqual(dense._non_trainable_variables, [])
     self.assertEqual(
         len(ops.get_collection(ops.GraphKeys.TRAINABLE_VARIABLES)), 2)
     self.assertEqual(dense.kernel.name, 'my_dense/kernel:0')
@@ -89,8 +85,6 @@ class DenseTest(test.TestCase):
     self.assertListEqual(dense.non_trainable_variables,
                          [dense.kernel, dense.bias])
     self.assertListEqual(dense.trainable_variables, [])
-    self.assertListEqual(dense._trainable_variables, [dense.kernel, dense.bias])
-    self.assertListEqual(dense._non_trainable_variables, [])
     self.assertEqual(
         len(ops.get_collection(ops.GraphKeys.TRAINABLE_VARIABLES)), 0)
 
@@ -289,7 +283,7 @@ class DenseTest(test.TestCase):
 class DropoutTest(test.TestCase):
 
   def testDropoutProperties(self):
-    dp = core_layers.Dropout(0.5)
+    dp = core_layers.Dropout(0.5, name='dropout')
     self.assertEqual(dp.rate, 0.5)
     self.assertEqual(dp.noise_shape, None)
     dp.apply(np.ones(()))
diff --git a/tensorflow/python/layers/normalization.py b/tensorflow/python/layers/normalization.py
index 41846ae0cd7b584c0215978c0980b0302a9f93c8..bbb11dbfe1576604306188df4277aa49b38a62fe 100644
--- a/tensorflow/python/layers/normalization.py
+++ b/tensorflow/python/layers/normalization.py
@@ -41,7 +41,7 @@ from tensorflow.python.layers import base
 from tensorflow.python.layers import utils
 
 
-class BatchNormalization(base._Layer):  # pylint: disable=protected-access
+class BatchNormalization(base.Layer):
   """Batch Normalization layer from http://arxiv.org/abs/1502.03167.
 
   "Batch Normalization: Accelerating Deep Network Training by Reducing
@@ -143,33 +143,33 @@ class BatchNormalization(base._Layer):  # pylint: disable=protected-access
                        input_shape)
 
     if self.center:
-      self.beta = vs.get_variable('beta',
-                                  shape=(param_dim,),
-                                  initializer=self.beta_initializer,
-                                  regularizer=self.beta_regularizer,
-                                  trainable=True)
+      self.beta = self.add_variable(name='beta',
+                                    shape=(param_dim,),
+                                    initializer=self.beta_initializer,
+                                    regularizer=self.beta_regularizer,
+                                    trainable=True)
     else:
       self.beta = None
     if self.scale:
-      self.gamma = vs.get_variable('gamma',
-                                   shape=(param_dim,),
-                                   initializer=self.gamma_initializer,
-                                   regularizer=self.gamma_regularizer,
-                                   trainable=True)
+      self.gamma = self.add_variable(name='gamma',
+                                     shape=(param_dim,),
+                                     initializer=self.gamma_initializer,
+                                     regularizer=self.gamma_regularizer,
+                                     trainable=True)
     else:
       self.gamma = None
 
     # Disable variable partitioning when creating the moving mean and variance
-    partitioner = vs.get_variable_scope().partitioner
+    partitioner = self._scope.partitioner
     try:
-      vs.get_variable_scope().set_partitioner(None)
-      self.moving_mean = vs.get_variable(
-          'moving_mean',
+      self._scope.set_partitioner(None)
+      self.moving_mean = self.add_variable(
+          name='moving_mean',
           shape=(param_dim,),
           initializer=self.moving_mean_initializer,
           trainable=False)
-      self.moving_variance = vs.get_variable(
-          'moving_variance',
+      self.moving_variance = self.add_variable(
+          name='moving_variance',
           shape=(param_dim,),
           initializer=self.moving_variance_initializer,
           trainable=False)
@@ -182,10 +182,10 @@ class BatchNormalization(base._Layer):  # pylint: disable=protected-access
         # stack to be cleared. The nested ones use a `lambda` to set the desired
         # device and ignore any devices that may be set by the custom getter.
         def _renorm_variable(name, shape):
-          var = vs.get_variable(name,
-                                shape=shape,
-                                initializer=init_ops.zeros_initializer(),
-                                trainable=False)
+          var = self.add_variable(name=name,
+                                  shape=shape,
+                                  initializer=init_ops.zeros_initializer(),
+                                  trainable=False)
           return var
         with ops.device(None):
           with ops.device(lambda _: self.moving_mean.device):
@@ -200,7 +200,8 @@ class BatchNormalization(base._Layer):  # pylint: disable=protected-access
             self.renorm_stddev_weight = _renorm_variable(
                 'renorm_stddev_weight', ())
     finally:
-      vs.get_variable_scope().set_partitioner(partitioner)
+      self._scope.set_partitioner(partitioner)
+    self.built = True
 
   def _renorm_correction_and_moments(self, mean, variance, training):
     """Returns the correction and update values for renorm."""
@@ -313,18 +314,15 @@ class BatchNormalization(base._Layer):  # pylint: disable=protected-access
           self.moving_variance, new_variance, decay, zero_debias=False)
 
       if not self.updates:
-        # In the future this should be refactored into a self.add_update
-        # methods in order to allow for instance-based BN layer sharing
-        # across unrelated input streams (e.g. like in Keras).
-        self.updates.append(mean_update)
-        self.updates.append(variance_update)
+        self.add_update(mean_update)
+        self.add_update(variance_update)
 
     else:
       mean, variance = self.moving_mean, self.moving_variance
 
     def _broadcast(v):
       if needs_broadcasting and v is not None:
-        # In this case we must explictly broadcast all parameters.
+        # In this case we must explicitly broadcast all parameters.
         return array_ops.reshape(v, broadcast_shape)
       return v
 
@@ -367,12 +365,14 @@ def batch_normalization(inputs,
   Note: the operations which update the `moving_mean` and `moving_variance`
   variables will not be added as dependencies of your training operation and so
   must be run separately. For example:
+
   ```
   extra_update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
   sess.run([train_op, extra_update_ops], ...)
   ```
   Alternatively, add the operations as a dependency to your training operation
   manually, and then just run your training operation as normal:
+
   ```
   extra_update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
   with tf.control_dependencies(extra_update_ops):
@@ -402,7 +402,9 @@ def batch_normalization(inputs,
     training: Either a Python boolean, or a TensorFlow boolean scalar tensor
       (e.g. a placeholder). Whether to return the output in training mode
       (normalized with statistics of the current batch) or in inference mode
-      (normalized with moving statistics).
+      (normalized with moving statistics). **NOTE**: make sure to set this
+      parameter correctly, or else your training/inference will not work
+      properly.
     trainable: Boolean, if `True` also add variables to the graph collection
       `GraphKeys.TRAINABLE_VARIABLES` (see tf.Variable).
     name: String, the name of the layer.
diff --git a/tensorflow/python/layers/pooling.py b/tensorflow/python/layers/pooling.py
index 3e40423ad638f4e0cb1c231fdc9cfa71cbb16f72..a1dfab09de399421624ef1687bd09236aea7419b 100644
--- a/tensorflow/python/layers/pooling.py
+++ b/tensorflow/python/layers/pooling.py
@@ -36,7 +36,7 @@ from tensorflow.python.layers import base
 from tensorflow.python.layers import utils
 
 
-class _Pooling1D(base._Layer):  # pylint: disable=protected-access
+class _Pooling1D(base.Layer):
   """Pooling layer for arbitrary pooling functions, for 1D inputs.
 
   This class only exists for code reuse. It will never be an exposed API.
@@ -71,6 +71,7 @@ class _Pooling1D(base._Layer):  # pylint: disable=protected-access
     if len(input_shape) != 3:
       raise ValueError('Inputs should have rank 3. '
                        'Received input shape:', str(input_shape))
+    self.built = True
 
   def call(self, inputs):
     # There is no TF op for 1D pooling, hence we make the inputs 4D.
@@ -222,7 +223,7 @@ def max_pooling1d(inputs, pool_size, strides,
   return layer.apply(inputs)
 
 
-class _Pooling2D(base._Layer):  # pylint: disable=protected-access
+class _Pooling2D(base.Layer):
   """Pooling layer for arbitrary pooling functions, for 2D inputs (e.g. images).
 
   This class only exists for code reuse. It will never be an exposed API.
@@ -261,6 +262,7 @@ class _Pooling2D(base._Layer):  # pylint: disable=protected-access
     if len(input_shape) != 4:
       raise ValueError('Inputs should have rank 4. '
                        'Received input shape:', str(input_shape))
+    self.built = True
 
   def call(self, inputs):
     if self.data_format == 'channels_last':
@@ -407,7 +409,7 @@ def max_pooling2d(inputs,
   return layer.apply(inputs)
 
 
-class _Pooling3D(base._Layer):  # pylint: disable=protected-access
+class _Pooling3D(base.Layer):
   """Pooling layer for arbitrary pooling functions, for 3D inputs.
 
   This class only exists for code reuse. It will never be an exposed API.
@@ -448,6 +450,7 @@ class _Pooling3D(base._Layer):  # pylint: disable=protected-access
     if len(input_shape) != 5:
       raise ValueError('Inputs should have rank 5. '
                        'Received input shape:', str(input_shape))
+    self.built = True
 
   def call(self, inputs):
     pool_shape = (1,) + self.pool_size + (1,)
diff --git a/tensorflow/python/lib/core/py_func.cc b/tensorflow/python/lib/core/py_func.cc
index 3e53f1d69fcf76124ba19ddb765501569d4726e6..89e93a86a9af3f9455bb5294557f7c401100fe71 100644
--- a/tensorflow/python/lib/core/py_func.cc
+++ b/tensorflow/python/lib/core/py_func.cc
@@ -261,7 +261,7 @@ class NumpyTensorBuffer : public TensorBuffer {
     proto->set_requested_bytes(rb);
     proto->set_allocator_name(tensorflow::cpu_allocator()->Name());
   }
-  Tensor MakeTensor(DataType dtype, TensorShape shape) {
+  Tensor MakeTensor(DataType dtype, const TensorShape& shape) {
     CHECK_EQ(len_, shape.num_elements() * DataTypeSize(dtype));
     return Tensor(dtype, shape, this);
   }
diff --git a/tensorflow/python/lib/io/file_io.py b/tensorflow/python/lib/io/file_io.py
index 7a91bbd3e01b4b9a1901b1ff2acbdff8b364b513..c212d2071f216881b58c6a2a37626eaebd3be4ca 100644
--- a/tensorflow/python/lib/io/file_io.py
+++ b/tensorflow/python/lib/io/file_io.py
@@ -24,6 +24,8 @@ from __future__ import print_function
 import os
 import uuid
 
+import six
+
 from tensorflow.python import pywrap_tensorflow
 from tensorflow.python.framework import errors
 from tensorflow.python.util import compat
@@ -144,7 +146,7 @@ class FileIO(object):
     # This check exists so that we can convert back to having offset be a
     # positional argument.
     # TODO(jhseu): Make `offset` a positional argument after `position` is
-    # deprecated.
+    # deleted.
     if offset is None and position is None:
       raise TypeError("seek(): offset argument required")
     if offset is not None and position is not None:
@@ -304,23 +306,33 @@ def write_string_to_file(filename, file_content):
 
 
 def get_matching_files(filename):
-  """Returns a list of files that match the given pattern.
+  """Returns a list of files that match the given pattern(s).
 
   Args:
-    filename: string, the pattern
+    filename: string or iterable of strings. The glob pattern(s).
 
   Returns:
-    Returns a list of strings containing filenames that match the given pattern.
+    A list of strings containing filenames that match the given pattern(s).
 
   Raises:
     errors.OpError: If there are filesystem / directory listing errors.
   """
   with errors.raise_exception_on_not_ok_status() as status:
-    # Convert each element to string, since the return values of the
-    # vector of string should be interpreted as strings, not bytes.
-    return [compat.as_str_any(matching_filename)
-            for matching_filename in pywrap_tensorflow.GetMatchingFiles(
-                compat.as_bytes(filename), status)]
+    if isinstance(filename, six.string_types):
+      return [
+          # Convert the filenames to string from bytes.
+          compat.as_str_any(matching_filename)
+          for matching_filename in pywrap_tensorflow.GetMatchingFiles(
+              compat.as_bytes(filename), status)
+      ]
+    else:
+      return [
+          # Convert the filenames to string from bytes.
+          compat.as_str_any(matching_filename)
+          for single_filename in filename
+          for matching_filename in pywrap_tensorflow.GetMatchingFiles(
+              compat.as_bytes(single_filename), status)
+      ]
 
 
 def create_dir(dirname):
diff --git a/tensorflow/python/lib/io/file_io_test.py b/tensorflow/python/lib/io/file_io_test.py
index 9ad04508555b2f7ae953c34f7e4c09443ed7abc6..e60b93b84fbb09e984a19f3e5a4920ac8e29dd28 100644
--- a/tensorflow/python/lib/io/file_io_test.py
+++ b/tensorflow/python/lib/io/file_io_test.py
@@ -129,6 +129,12 @@ class FileIoTest(test.TestCase):
     self.assertItemsEqual(
         file_io.get_matching_files(os.path.join(dir_path, "file*.txt")),
         expected_match)
+    self.assertItemsEqual(file_io.get_matching_files(tuple()), [])
+    files_subset = [
+        os.path.join(dir_path, files[0]), os.path.join(dir_path, files[2])
+    ]
+    self.assertItemsEqual(
+        file_io.get_matching_files(files_subset), files_subset)
     file_io.delete_recursively(dir_path)
     self.assertFalse(file_io.file_exists(os.path.join(dir_path, "file3.txt")))
 
diff --git a/tensorflow/python/ops/array_ops.py b/tensorflow/python/ops/array_ops.py
index e03192508f718017abd8ba99b6b5f627c2fbaca9..d06eb77987149fb463d77cacc0322090d77d5d42 100644
--- a/tensorflow/python/ops/array_ops.py
+++ b/tensorflow/python/ops/array_ops.py
@@ -84,7 +84,6 @@ from __future__ import print_function
 
 import sys
 import numpy as np
-import six
 
 from tensorflow.python.framework import common_shapes
 from tensorflow.python.framework import constant_op
@@ -470,7 +469,10 @@ def _SliceHelper(tensor, slice_spec, var=None):
     else:
       begin.append(s)
       end.append(s + 1)
-      strides.append(1)
+      if isinstance(s, ops.Tensor):
+        strides.append(constant(1, s.dtype))
+      else:
+        strides.append(np.ones_like(s).dtype.type(1))
       shrink_axis_mask |= (1 << index)
     index += 1
 
@@ -557,7 +559,13 @@ def strided_slice(input_,
                   shrink_axis_mask=0,
                   var=None,
                   name=None):
-  """Extracts a strided slice from a tensor.
+  """Extracts a strided slice of a tensor (generalized python array indexing).
+
+  **Most users will want to use @{tf.Tensor.__getitem__} and
+  @{tf.Variable.__getitem__}.** That allows  NumPy style slicing syntax (i.e.
+  `tensor[..., 3:4:-1, tf.newaxis, 3]`).
+  This op is the low-level interface that are used to implement operators.
+  Those interfaces are much more friendly, and highly recommended.
 
   To a first order, this operation extracts a slice of size `end - begin`
   from a tensor `input`
@@ -665,8 +673,7 @@ def strided_slice(input_,
     if name is None:
       name = parent_name + "_assign"
 
-    return gen_array_ops.strided_slice_assign(
-        ref=var,
+    return var._strided_slice_assign(
         begin=begin,
         end=end,
         strides=strides,
@@ -1156,13 +1163,14 @@ def sparse_mask(a, mask_indices, name=None):
 def split(value, num_or_size_splits, axis=0, num=None, name="split"):
   """Splits a tensor into sub tensors.
 
-  If `num_or_size_splits` is a scalar, `num_split`, then splits `value` along
-  dimension `axis` into `num_split` smaller tensors.
+  If `num_or_size_splits` is an integer type, `num_split`, then splits `value`
+  along dimension `axis` into `num_split` smaller tensors.
   Requires that `num_split` evenly divides `value.shape[axis]`.
 
-  If `num_or_size_splits` is a tensor, `size_splits`, then splits `value` into
-  `len(size_splits)` pieces. The shape of the `i`-th piece has the same size as
-  the `value` except along dimension `axis` where the size is `size_splits[i]`.
+  If `num_or_size_splits` is not an integer type, it is presumed to be a Tensor
+  `size_splits`, then splits `value` into `len(size_splits)` pieces. The shape
+  of the `i`-th piece has the same size as the `value` except along dimension
+  `axis` where the size is `size_splits[i]`.
 
   For example:
 
@@ -1180,11 +1188,11 @@ def split(value, num_or_size_splits, axis=0, num=None, name="split"):
 
   Args:
     value: The `Tensor` to split.
-    num_or_size_splits: Either an integer indicating the number of splits along
-      split_dim or a 1-D Tensor containing the sizes of each output tensor
-      along split_dim. If an integer then it must evenly divide
-      `value.shape[axis]`; otherwise the sum of sizes along the split
-      dimension must match that of the `value`.
+    num_or_size_splits: Either a 0-D integer `Tensor` indicating the number of
+      splits along split_dim or a 1-D integer `Tensor` integer tensor containing
+      the sizes of each output tensor along split_dim. If a scalar then it must
+      evenly divide `value.shape[axis]`; otherwise the sum of sizes along the
+      split dimension must match that of the `value`.
     axis: A 0-D `int32` `Tensor`. The dimension along which to split.
       Must be in the range `[0, rank(value))`. Defaults to 0.
     num: Optional, used to specify the number of outputs when it cannot be
@@ -1200,11 +1208,11 @@ def split(value, num_or_size_splits, axis=0, num=None, name="split"):
   Raises:
     ValueError: If `num` is unspecified and cannot be inferred.
   """
-  if isinstance(num_or_size_splits, six.integer_types):
+  size_splits = ops.convert_to_tensor(num_or_size_splits)
+  if size_splits.get_shape().ndims == 0 and size_splits.dtype.is_integer:
     return gen_array_ops._split(
         split_dim=axis, num_split=num_or_size_splits, value=value, name=name)
   else:
-    size_splits = ops.convert_to_tensor(num_or_size_splits)
     if num is None:
       size_splits_shape = size_splits.get_shape()
       num = size_splits_shape.dims[0]
@@ -1681,21 +1689,21 @@ def meshgrid(*args, **kwargs):
   results in
 
   ```prettyprint
-    X = [[1, 1, 1],
-         [2, 2, 2],
-         [3, 3, 3]]
-    Y = [[4, 5, 6],
-         [4, 5, 6],
-         [4, 5, 6]]
+    X = [[1, 2, 3],
+         [1, 2, 3],
+         [1, 2, 3]]
+    Y = [[4, 4, 4],
+         [5, 5, 5],
+         [6, 6, 6]]
   ```
 
   Args:
-    *args: `Tensor`s with rank 1
-    indexing: Either 'xy' or 'ij' (optional, default: 'xy')
+    *args: `Tensor`s with rank 1.
+    indexing: Either 'xy' or 'ij' (optional, default: 'xy').
     name: A name for the operation (optional).
 
   Returns:
-    outputs: A list of N `Tensor`s with rank N
+    outputs: A list of N `Tensor`s with rank N.
   """
 
   indexing = kwargs.pop("indexing", "xy")
diff --git a/tensorflow/python/ops/candidate_sampling_ops.py b/tensorflow/python/ops/candidate_sampling_ops.py
index 285c199b10988af7de88d9eaccd5e927828a3468..3053a333bfcd38b4cc74bc509af3b2baffe5be43 100644
--- a/tensorflow/python/ops/candidate_sampling_ops.py
+++ b/tensorflow/python/ops/candidate_sampling_ops.py
@@ -53,7 +53,9 @@ def uniform_candidate_sampler(true_classes, num_true, num_sampled, unique,
     true_classes: A `Tensor` of type `int64` and shape `[batch_size,
       num_true]`. The target classes.
     num_true: An `int`.  The number of target classes per training example.
-    num_sampled: An `int`.  The number of classes to randomly sample per batch.
+    num_sampled: An `int`.  The number of classes to randomly sample. The
+      `sampled_candidates` return value will have shape `[num_sampled]`. If
+      `unique=True`, `num_sampled` must be less than or equal to `range_max`.
     unique: A `bool`. Determines whether all sampled classes in a batch are
       unique.
     range_max: An `int`. The number of possible classes.
@@ -61,8 +63,10 @@ def uniform_candidate_sampler(true_classes, num_true, num_sampled, unique,
     name: A name for the operation (optional).
 
   Returns:
-    sampled_candidates: A tensor of type `int64` and shape `[num_sampled]`.
-      The sampled classes.
+    sampled_candidates: A tensor of type `int64` and shape `[num_sampled]`.  The
+      sampled classes, either with possible duplicates (`unique=False`) or all
+      unique (`unique=True`). In either case, `sampled_candidates` is
+      independent of the true classes.
     true_expected_count: A tensor of type `float`.  Same shape as
       `true_classes`. The expected counts under the sampling distribution
       of each of `true_classes`.
@@ -111,7 +115,7 @@ def log_uniform_candidate_sampler(true_classes, num_true, num_sampled, unique,
     true_classes: A `Tensor` of type `int64` and shape `[batch_size,
       num_true]`. The target classes.
     num_true: An `int`.  The number of target classes per training example.
-    num_sampled: An `int`.  The number of classes to randomly sample per batch.
+    num_sampled: An `int`.  The number of classes to randomly sample.
     unique: A `bool`. Determines whether all sampled classes in a batch are
       unique.
     range_max: An `int`. The number of possible classes.
@@ -166,7 +170,7 @@ def learned_unigram_candidate_sampler(true_classes, num_true, num_sampled,
     true_classes: A `Tensor` of type `int64` and shape `[batch_size,
       num_true]`. The target classes.
     num_true: An `int`.  The number of target classes per training example.
-    num_sampled: An `int`.  The number of classes to randomly sample per batch.
+    num_sampled: An `int`.  The number of classes to randomly sample.
     unique: A `bool`. Determines whether all sampled classes in a batch are
       unique.
     range_max: An `int`. The number of possible classes.
@@ -230,7 +234,7 @@ def fixed_unigram_candidate_sampler(true_classes,
     true_classes: A `Tensor` of type `int64` and shape `[batch_size,
       num_true]`. The target classes.
     num_true: An `int`.  The number of target classes per training example.
-    num_sampled: An `int`.  The number of classes to randomly sample per batch.
+    num_sampled: An `int`.  The number of classes to randomly sample.
     unique: A `bool`. Determines whether all sampled classes in a batch are
       unique.
     range_max: An `int`. The number of possible classes.
diff --git a/tensorflow/python/ops/control_flow_grad.py b/tensorflow/python/ops/control_flow_grad.py
index 133528a1cd6e5388a2caf266899062d44d03be65..17c0aae4e24823fe2b3f59eb28c654bc9cf59bca 100644
--- a/tensorflow/python/ops/control_flow_grad.py
+++ b/tensorflow/python/ops/control_flow_grad.py
@@ -49,7 +49,7 @@ def _SwitchGrad(op, *grad):
       # This is the second time this Switch is visited. It comes from
       # the non-exit branch of the Switch, so update the second input
       # to the Merge.
-      # TODO: Perform shape inference with this new input.
+      # TODO(yuanbyu): Perform shape inference with this new input.
       if grad[1] is not None:
         # pylint: disable=protected-access
         control_flow_ops._AddNextAndBackEdge(merge_grad, grad[1])
@@ -162,11 +162,14 @@ def _ExitGrad(op, grad):
     dense_shape = grad.dense_shape
     if dense_shape is not None:
       grad_ctxt.AddName(dense_shape.name)
-  enter_fn = control_flow_ops._Enter  # pylint: disable=protected-access
   grad_ctxt.Enter()
-  result = enter_fn(grad, grad_ctxt.name, is_constant=False,
-                    parallel_iterations=grad_ctxt.parallel_iterations,
-                    name="b_exit")
+  # pylint: disable=protected-access
+  result = control_flow_ops._Enter(
+      grad, grad_ctxt.name, is_constant=False,
+      parallel_iterations=grad_ctxt.parallel_iterations,
+      name="b_exit")
+  # pylint: enable=protected-access
+  grad_ctxt.loop_enters.append(result)
   grad_ctxt.Exit()
   return result
 
diff --git a/tensorflow/python/ops/control_flow_ops.py b/tensorflow/python/ops/control_flow_ops.py
index 93416140ffc8572f33527948d3c878c6dd33d5b7..96ace6e79b4502d94df32ba92fb70fea53458e28 100644
--- a/tensorflow/python/ops/control_flow_ops.py
+++ b/tensorflow/python/ops/control_flow_ops.py
@@ -50,6 +50,8 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import collections
+
 import six
 from six.moves import xrange  # pylint: disable=redefined-builtin
 
@@ -71,7 +73,9 @@ from tensorflow.python.ops import tensor_array_ops
 from tensorflow.python.ops.gen_control_flow_ops import *
 # pylint: enable=wildcard-import
 from tensorflow.python.platform import tf_logging as logging
+from tensorflow.python.util import deprecation
 from tensorflow.python.util import nest
+from tensorflow.python.util import tf_should_use
 
 
 # We override the 'tuple' for a control flow op, so we keep python's
@@ -84,6 +88,7 @@ _basetuple = tuple
 
 # Assert and Print are special symbols in python, so we must
 # use an upper-case version of them.
+@tf_should_use.should_use_result
 def Assert(condition, data, summarize=None, name=None):
   """Asserts that the given condition is true.
 
@@ -273,7 +278,7 @@ def exit(data, name=None):
 def switch(data, pred, dtype=None, name=None):
   """Forwards `data` to an output determined by `pred`.
 
-  If `pred` is false, the `data` input is forwared to the first output.
+  If `pred` is false, the `data` input is forwarded to the first output.
   Otherwise, the data goes to the second output.
 
   This op handles `Tensor`s and `IndexedSlices`.
@@ -332,7 +337,7 @@ def _SwitchRefOrTensor(data, pred, name="Switch"):
     name: A name for this operation (optional).
 
   Returns:
-    `(output_false, output_false)`: If `pred` is true, data will be forwarded to
+    `(output_false, output_true)`: If `pred` is true, data will be forwarded to
     `output_true`, otherwise it goes to `output_false`.
 
   Raises:
@@ -1046,15 +1051,19 @@ class ControlFlowState(object):
     """
     loop_exits = []
     for _, grad_state in self._map.items():
+      # pylint: disable=protected-access
       for y in grad_state.forward_loop_exits:
-        # pylint: disable=protected-access
         if pending_count[y.op._id] == 0:
           grad_state.pending_exits_count -= 1
           if y.op._id not in to_ops_set:
             grad_state.unused_exits.append(y)
           if grad_state.pending_exits_count == 0:
             loop_exits.extend(grad_state.unused_exits)
-        # pylint: enable=protected-access
+      # Need to include Enters in backprop for higher-order gradients.
+      for y in grad_state.forward_context.loop_enters:
+        if pending_count[y.op._id] == 0:
+          pending_count[y.op._id] = 1
+      # pylint: enable=protected-access
     return loop_exits
 
   def EnterGradWhileContext(self, op, before):
@@ -1301,11 +1310,15 @@ def ZerosLikeOutsideLoop(op, index):
     return array_ops.zeros_like(val, optimize=False)
   else:
     op_ctxt = op._get_control_flow_context()
-    pred = op_ctxt.pred
-    branch = op_ctxt.branch
-    switch_val = switch(op.inputs[0], pred)[1 - branch]
-    zeros_shape = array_ops.shape_internal(switch_val, optimize=False)
-    return array_ops.zeros(zeros_shape, dtype=val.dtype)
+    if op_ctxt:
+      # We are in a cond context. Use a switch to create zeros only when needed.
+      pred = op_ctxt.pred
+      branch = op_ctxt.branch
+      switch_val = switch(op.inputs[0], pred)[1 - branch]
+      zeros_shape = array_ops.shape_internal(switch_val, optimize=False)
+      return array_ops.zeros(zeros_shape, dtype=val.dtype)
+    else:
+      return array_ops.zeros_like(val, optimize=False)
 
 
 class ControlFlowContext(object):
@@ -1452,6 +1465,14 @@ class ControlFlowContext(object):
     return internal_control_inputs
   # pylint: enable=protected-access
 
+  def AddInnerOp(self, op):
+    """Notifies a scope about an operator added to an inner scope."""
+    pass
+
+  def GetControlPivot(self):
+    """Returns the pivot node for this context, or None."""
+    return None
+
 
 class CondContext(ControlFlowContext):
   """The context for the conditional construct."""
@@ -1615,6 +1636,11 @@ class CondContext(ControlFlowContext):
           # pylint: enable=protected-access
       for x in op.outputs:
         self._values.add(x.name)
+      # pylint: disable=protected-access
+      if op.graph._is_function(op.type) or op.type == "SymbolicGradient":
+        op._add_control_input(self._pivot.op)
+      # pylint: enable=protected-access
+
     if self._outer_context or not IsLoopExit(op):
       op.graph.prevent_fetching(op)
 
@@ -1673,14 +1699,20 @@ def _UnpackIfSingleton(res):
     return res
 
 
-def cond(pred, fn1, fn2, strict=False, name=None):
-  """Return either `fn1()` or `fn2()` based on the boolean predicate `pred`.
+# pylint: disable=g-doc-args
+@deprecation.deprecated_args(
+    None,
+    "fn1/fn2 are deprecated in favor of the true_fn/false_fn arguments.",
+    "fn1", "fn2")
+def cond(pred, true_fn=None, false_fn=None, strict=False, name=None,
+         fn1=None, fn2=None):
+  """Return `true_fn()` if the predicate `pred` is true else `false_fn()`.
 
-  `fn1` and `fn2` both return lists of output tensors. `fn1` and `fn2` must have
-  the same non-zero number and type of outputs.
+  `true_fn` and `false_fn` both return lists of output tensors. `true_fn` and
+  `false_fn` must have the same non-zero number and type of outputs.
 
   Note that the conditional execution applies only to the operations defined in
-  `fn1` and `fn2`. Consider the following simple program:
+  `true_fn` and `false_fn`. Consider the following simple program:
 
   ```python
   z = tf.multiply(a, b)
@@ -1694,28 +1726,35 @@ def cond(pred, fn1, fn2, strict=False, name=None):
   Although this behavior is consistent with the dataflow model of TensorFlow,
   it has occasionally surprised some users who expected a lazier semantics.
 
+  Note that `cond` calls `true_fn` and `false_fn` *exactly once* (inside the
+  call to `cond`, and not at all during `Session.run()`). `cond`
+  stitches together the graph fragments created during the `true_fn` and
+  `false_fn` calls with some additional graph nodes to ensure that the right
+  branch gets executed depending on the value of `pred`.
+
   `tf.cond` supports nested structures as implemented in
-  `tensorflow.python.util.nest`. Both `fn1` and `fn2` must return the same
-  (possibly nested) value structure of lists, tuples, and/or named tuples.
+  `tensorflow.python.util.nest`. Both `true_fn` and `false_fn` must return the
+  same (possibly nested) value structure of lists, tuples, and/or named tuples.
   Singleton lists and tuples form the only exceptions to this: when returned by
-  `fn1` and/or `fn2`, they are implicitly unpacked to single values. This
-  behavior is disabled by passing `strict=True`.
+  `true_fn` and/or `false_fn`, they are implicitly unpacked to single values.
+  This behavior is disabled by passing `strict=True`.
 
   Args:
-    pred: A scalar determining whether to return the result of `fn1` or `fn2`.
-    fn1: The callable to be performed if pred is true.
-    fn2: The callable to be performed if pred is false.
+    pred: A scalar determining whether to return the result of `true_fn` or
+      `false_fn`.
+    true_fn: The callable to be performed if pred is true.
+    false_fn: The callable to be performed if pred is false.
     strict: A boolean that enables/disables 'strict' mode; see above.
     name: Optional name prefix for the returned tensors.
 
   Returns:
-    Tensors returned by the call to either `fn1` or `fn2`. If the callables
-    return a singleton list, the element is extracted from the list.
+    Tensors returned by the call to either `true_fn` or `false_fn`. If the
+    callables return a singleton list, the element is extracted from the list.
 
   Raises:
-    TypeError: if `fn1` or `fn2` is not callable.
-    ValueError: if `fn1` and `fn2` do not return the same number of tensors, or
-                return tensors of different types.
+    TypeError: if `true_fn` or `false_fn` is not callable.
+    ValueError: if `true_fn` and `false_fn` do not return the same number of
+      tensors, or return tensors of different types.
 
   Example:
 
@@ -1730,12 +1769,30 @@ def cond(pred, fn1, fn2, strict=False, name=None):
   ```
 
   """
-  with ops.name_scope(name, "cond", [pred]) as name:
-    if not callable(fn1):
-      raise TypeError("fn1 must be callable.")
-    if not callable(fn2):
-      raise TypeError("fn2 must be callable.")
+  # We needed to make true_fn/false_fn keyword arguments for
+  # backwards-compatibility. This check exists so that we can convert back to
+  # having them be positional arguments.
+  # TODO(josh11b): Make `true_fn` and `false_fn` positional arguments after
+  # `fn1` and `fn2` are deleted.
+  if fn1 is not None:
+    if true_fn is not None:
+      raise TypeError("cond(): true_fn and fn1 may not be set simultaneously.")
+    true_fn = fn1
+  elif true_fn is None:
+    raise TypeError("cond(): true_fn argument required")
+  if fn2 is not None:
+    if false_fn is not None:
+      raise TypeError("cond(): false_fn and fn2 may not be set simultaneously.")
+    false_fn = fn2
+  elif false_fn is None:
+    raise TypeError("cond(): false_fn argument required")
+
+  if not callable(true_fn):
+    raise TypeError("true_fn must be callable.")
+  if not callable(false_fn):
+    raise TypeError("false_fn must be callable.")
 
+  with ops.name_scope(name, "cond", [pred]) as name:
     # Add the Switch to the graph.
     if isinstance(pred, bool):
       raise TypeError("pred must not be a Python bool")
@@ -1750,18 +1807,18 @@ def cond(pred, fn1, fn2, strict=False, name=None):
     # Build the graph for the true branch in a new context.
     context_t = CondContext(pred, pivot_1, branch=1)
     context_t.Enter()
-    orig_res_t, res_t = context_t.BuildCondBranch(fn1)
+    orig_res_t, res_t = context_t.BuildCondBranch(true_fn)
     if orig_res_t is None:
-      raise ValueError("fn1 must have a return value.")
+      raise ValueError("true_fn must have a return value.")
     context_t.ExitResult(res_t)
     context_t.Exit()
 
     # Build the graph for the false branch in a new context.
     context_f = CondContext(pred, pivot_2, branch=0)
     context_f.Enter()
-    orig_res_f, res_f = context_f.BuildCondBranch(fn2)
+    orig_res_f, res_f = context_f.BuildCondBranch(false_fn)
     if orig_res_f is None:
-      raise ValueError("fn2 must have a return value.")
+      raise ValueError("false_fn must have a return value.")
     context_f.ExitResult(res_f)
     context_f.Exit()
 
@@ -1774,14 +1831,14 @@ def cond(pred, fn1, fn2, strict=False, name=None):
       nest.assert_same_structure(orig_res_t, orig_res_f)
     except TypeError as e:
       raise TypeError(
-          "Incompatible return types of fn1 and fn2: {}".format(e))
+          "Incompatible return types of true_fn and false_fn: {}".format(e))
     except ValueError as e:
       raise ValueError(
-          "Incompatible return values of fn1 and fn2: {}".format(e))
+          "Incompatible return values of true_fn and false_fn: {}".format(e))
 
     # Add the final merge to the graph.
     if not res_t:
-      raise ValueError("fn1 and fn2 must return at least one result.")
+      raise ValueError("true_fn and false_fn must return at least one result.")
 
     res_t_flat = nest.flatten(res_t)
     res_f_flat = nest.flatten(res_f)
@@ -1795,8 +1852,9 @@ def cond(pred, fn1, fn2, strict=False, name=None):
       val_x = x if isinstance(x, ops.Tensor) else x.values
       val_y = y if isinstance(y, ops.Tensor) else y.values
       if val_x.dtype.base_dtype != val_y.dtype.base_dtype:
-        raise ValueError("Outputs of fn1 and fn2 must have the same type: "
-                         "%s, %s" % (val_x.dtype.name, val_y.dtype.name))
+        raise ValueError(
+            "Outputs of true_fn and false_fn must have the same type: %s, %s" %
+            (val_x.dtype.name, val_y.dtype.name))
 
     merges = [merge(pair)[0] for pair in zip(res_f_flat, res_t_flat)]
     merges = _convert_flows_to_tensorarrays(nest.flatten(orig_res_t), merges)
@@ -1811,6 +1869,7 @@ def cond(pred, fn1, fn2, strict=False, name=None):
     if not strict:
       merges = _UnpackIfSingleton(merges)
     return merges
+# pylint: enable=g-doc-args
 
 
 def _resource_safe_shape(t):
@@ -1882,6 +1941,8 @@ class WhileContext(ControlFlowContext):
     self._pivot = None
     # The list of exit tensors for loop variables.
     self._loop_exits = []
+    # The list of enter tensors for loop variables.
+    self._loop_enters = []
 
   def _init_from_proto(self, context_def, import_scope=None):
     """Creates a new `WhileContext` from protocol buffer.
@@ -1911,6 +1972,10 @@ class WhileContext(ControlFlowContext):
     self._loop_exits = [g.as_graph_element(
         ops.prepend_name_scope(exit_name, import_scope))
                         for exit_name in context_def.loop_exit_names]
+    # The list of enter tensors for loop variables.
+    self._loop_enters = [g.as_graph_element(
+        ops.prepend_name_scope(enter_name, import_scope))
+                         for enter_name in context_def.loop_enter_names]
     super(WhileContext, self).__init__(values_def=context_def.values_def,
                                        import_scope=import_scope)
 
@@ -1938,6 +2003,11 @@ class WhileContext(ControlFlowContext):
     """The boolean tensor representing the loop termination condition."""
     return self._pivot
 
+  @property
+  def loop_enters(self):
+    """The list of enter tensors for loop variables."""
+    return self._loop_enters
+
   @property
   def loop_exits(self):
     """The list of exit tensors for loop variables."""
@@ -1971,10 +2041,12 @@ class WhileContext(ControlFlowContext):
           self._pivot_for_body.name, export_scope)
       context_def.pivot_name = ops.strip_name_scope(
           self._pivot.name, export_scope)
-      if self._loop_exits:
-        context_def.loop_exit_names.extend(
-            [ops.strip_name_scope(l.name, export_scope)
-             for l in self._loop_exits])
+      context_def.loop_exit_names.extend(
+          [ops.strip_name_scope(l.name, export_scope)
+           for l in self._loop_exits])
+      context_def.loop_enter_names.extend(
+          [ops.strip_name_scope(l.name, export_scope)
+           for l in self._loop_enters])
       context_def.values_def.MergeFrom(
           super(WhileContext, self)._to_proto(
               export_scope=export_scope))
@@ -2035,6 +2107,8 @@ class WhileContext(ControlFlowContext):
         enter = _Enter(result, self._name, is_constant=True,
                        parallel_iterations=self._parallel_iterations)
         enter.graph.prevent_feeding(enter)
+        if self._outer_context:
+          self._outer_context.AddInnerOp(enter.op)
       # Fix the control inputs and control flow context of these enter ops.
       self._FixControlInputsAndContext([enter])
 
@@ -2104,11 +2178,19 @@ class WhileContext(ControlFlowContext):
       for x in op.outputs:
         op.graph.prevent_feeding(x)
 
+    if self._outer_context:
+      self._outer_context.AddInnerOp(op)
+
   def _MaybeAddControlDependency(self, op):
     """Add a control input to the op if it only depends on loop invariants."""
     def _IsOpFree(op):
+      """Determines if `op` needs a control dependency."""
       if op.control_inputs:
         return False
+      # pylint: disable=protected-access
+      if op.graph._is_function(op.type) or op.type == "SymbolicGradient":
+        return True
+      # pylint: enable=protected-access
       for x in op.inputs:
         if not _IsLoopConstantEnter(x.op):
           return False
@@ -2149,6 +2231,8 @@ class WhileContext(ControlFlowContext):
     enter_n = _Enter(n, self._name, is_constant=False,
                      parallel_iterations=self._parallel_iterations,
                      name="f_count")
+    self.loop_enters.append(enter_n)
+
     merge_n = merge([enter_n, enter_n])[0]
     switch_n = switch(merge_n, self._pivot)
 
@@ -2189,6 +2273,8 @@ class WhileContext(ControlFlowContext):
     enter_count = _Enter(count, self._name, is_constant=False,
                          parallel_iterations=self._parallel_iterations,
                          name="b_count")
+    self.loop_enters.append(enter_count)
+
     merge_count = merge([enter_count, enter_count])[0]
     self._pivot_for_pred = merge_count
 
@@ -2276,6 +2362,8 @@ class WhileContext(ControlFlowContext):
     enter_acc = _Enter(acc, self._name, is_constant=False,
                        parallel_iterations=self._parallel_iterations,
                        name="b_acc")
+    self.loop_enters.append(enter_acc)
+
     merge_acc = merge([enter_acc, enter_acc], name="b_acc")[0]
     switch_acc_false, switch_acc_true = switch(merge_acc, self._pivot)
 
@@ -2283,10 +2371,10 @@ class WhileContext(ControlFlowContext):
     next_acc = _NextIteration(add_acc)
     merge_acc.op._update_input(1, next_acc)  # pylint: disable=protected-access
 
-    acc_result = exit(switch_acc_false, name="b_acc")
-    self.loop_exits.append(acc_result)
-    self.ExitResult([acc_result])
-    return acc_result
+    result_acc = exit(switch_acc_false, name="b_acc")
+    self.loop_exits.append(result_acc)
+    self.ExitResult([result_acc])
+    return result_acc
 
   def AddBackPropIndexedSlicesAccumulator(self, op, grad):
     """This is used for accumulating gradients that are IndexedSlices.
@@ -2343,6 +2431,8 @@ class WhileContext(ControlFlowContext):
     enter_acc = [_Enter(x, self._name, is_constant=False,
                         parallel_iterations=self._parallel_iterations,
                         name="b_acc") for x in init_acc]
+    self.loop_enters.extend(enter_acc)
+
     merge_acc = [merge([x, x], name="b_acc")[0] for x in enter_acc]
     switch_acc = [switch(x, self._pivot) for x in merge_acc]
 
@@ -2360,13 +2450,13 @@ class WhileContext(ControlFlowContext):
     for xm, xn in zip(merge_acc, next_acc):
       xm.op._update_input(1, xn)  # pylint: disable=protected-access
 
-    acc_exits = [exit(x[0], name="b_acc") for x in switch_acc]
-    self.loop_exits.extend(acc_exits)
+    exit_acc = [exit(x[0], name="b_acc") for x in switch_acc]
+    self.loop_exits.extend(exit_acc)
 
-    self.ExitResult(acc_exits)
+    self.ExitResult(exit_acc)
     return ops.IndexedSlices(
-        indices=acc_exits[0], values=acc_exits[1],
-        dense_shape=acc_exits[2] if shape_acc is not None else None)
+        indices=exit_acc[0], values=exit_acc[1],
+        dense_shape=exit_acc[2] if shape_acc is not None else None)
 
   def _InitializeValues(self, values):
     """Makes the values known to this context."""
@@ -2404,19 +2494,30 @@ class WhileContext(ControlFlowContext):
                     for x in real_vars]
       for x in enter_vars:
         x.graph.prevent_feeding(x)
+        if self._outer_context:
+          self._outer_context.AddInnerOp(x.op)
+
+    # Finds the closest enclosing non-None control pivot.
+    outer_context = self._outer_context
+    control_pivot = None
+    while outer_context is not None and control_pivot is None:
+      control_pivot = outer_context.GetControlPivot()
+      # pylint: disable=protected-access
+      outer_context = outer_context._outer_context
+      # pylint: enable=protected-access
 
-    if self._outer_context:
-      control_pivot = self._outer_context.GetControlPivot().op
+    if control_pivot is not None:
       for var in enter_vars:
         if _IsLoopConstantEnter(var.op.inputs[0].op):
           # pylint: disable=protected-access
-          var.op._add_control_input(control_pivot)
+          var.op._add_control_input(control_pivot.op)
           # pylint: enable=protected-access
     _SetShapeInvariants(real_vars, enter_vars, shape_invariants)
 
     # Fix the control inputs and control flow context of these enter ops.
     self._FixControlInputsAndContext(enter_vars)
     self._InitializeValues(enter_vars)
+    self._loop_enters = enter_vars
 
     merge_vars = [merge([x, x])[0] for x in enter_vars]
     self._pivot_for_pred = merge_vars[0]
@@ -2542,12 +2643,16 @@ def while_loop(cond, body, loop_vars, shape_invariants=None,
   `cond` and `body`. `cond` and `body` both take as many arguments as there are
   `loop_vars`.
 
-  While `cond` evaluates to true, `body` is executed.
-
   In addition to regular Tensors or IndexedSlices, the body may accept and
   return TensorArray objects.  The flows of the TensorArray objects will
   be appropriately forwarded between loops and during gradient calculations.
 
+  Note that `while_loop` calls `cond` and `body` *exactly once* (inside the
+  call to `while_loop`, and not at all during `Session.run()`). `while_loop`
+  stitches together the graph fragments created during the `cond` and `body`
+  calls with some additional graph nodes to make something the repeats
+  `body` until `cond` returns false.
+
   For correctness, `tf.while_loop()` strictly enforces shape invariants for
   the loop variables. A shape invariant is a (possibly partial) shape that
   is unchanged across the iterations of the loop. An error will be raised
@@ -2876,12 +2981,17 @@ def case(pred_fn_pairs, default, exclusive=False, strict=False, name="case"):
   operation returns the tensors generated by `default`.
 
   `tf.case` supports nested structures as implemented in
-  `tensorflow.python.util.nest`. Both `fn1` and `fn2` must return the same
+  `tensorflow.python.util.nest`. All of the callables must return the same
   (possibly nested) value structure of lists, tuples, and/or named tuples.
   Singleton lists and tuples form the only exceptions to this: when returned by
-  `fn1` and/or `fn2`, they are implicitly unpacked to single values. This
+  a callable, they are implicitly unpacked to single values. This
   behavior is disabled by passing `strict=True`.
 
+  If an unordered dictionary is used for `pred_fn_pairs`, the order of the
+  conditional tests is not guaranteed. However, the order is guaranteed to be
+  deterministic, so that variables created in conditional branches are created
+  in fixed order across runs.
+
   Example 1:
     Pseudocode:
     ```
@@ -2907,9 +3017,6 @@ def case(pred_fn_pairs, default, exclusive=False, strict=False, name="case"):
 
     Expressions:
     ```
-      x = tf.constant(0)
-      y = tf.constant(1)
-      z = tf.constant(2)
       def f1(): return tf.constant(17)
       def f2(): return tf.constant(23)
       def f3(): return tf.constant(-1)
@@ -2940,11 +3047,14 @@ def case(pred_fn_pairs, default, exclusive=False, strict=False, name="case"):
           or isinstance(pfp, dict)):
     raise TypeError("fns must be a list, tuple, or dict")
   if isinstance(pfp, dict):
-    pfp = pfp.items()
-    if not exclusive:
-      logging.warn("%s: Provided dictionary of predicate/fn pairs, but "
-                   "exclusive=False.  Order of conditional tests is "
-                   "not guaranteed.", name)
+    if isinstance(pfp, collections.OrderedDict):
+      pfp = pfp.items()
+    else:
+      pfp = sorted(pfp.items(), key=lambda item: item[0].name)
+      if not exclusive:
+        logging.warn("%s: An unordered dictionary of predicate/fn pairs was "
+                     "provided, but exclusive=False. The order of conditional "
+                     "tests is deterministic but not guaranteed.", name)
   for tup in pfp:
     if not isinstance(tup, _basetuple) or len(tup) != 2:
       raise TypeError("Each entry in pred_fn_pairs must be a 2-tuple")
diff --git a/tensorflow/python/ops/control_flow_ops_test.py b/tensorflow/python/ops/control_flow_ops_test.py
index 21a5afabe0b4032a2a4f95a6008ca42a9a86d87a..4e95783e5a81f01499bb3d164683d34de258b9b9 100644
--- a/tensorflow/python/ops/control_flow_ops_test.py
+++ b/tensorflow/python/ops/control_flow_ops_test.py
@@ -313,6 +313,79 @@ class SwitchTestCase(TensorFlowTestCase):
         self.assertEquals(o, 6)
         self.assertAllEqual(grad, [1] * 3)
 
+  def testGradientThroughSingleBranchOutsideOfContext(self):
+    with self.test_session():
+      x = constant_op.constant(2.)
+      s = constant_op.constant(True)
+      x_false, x_true = control_flow_ops.switch(x, s)
+      grad_x_true = gradients_impl.gradients(x_true, x)[0]
+      grad_x_false = gradients_impl.gradients(x_false, x)[0]
+      self.assertEquals(grad_x_true.eval(), 1.)
+      self.assertEquals(grad_x_false.eval(), 0.)
+
+
+class CondTest(TensorFlowTestCase):
+
+  def testCondTrue(self):
+    with self.test_session():
+      x = constant_op.constant(2)
+      y = constant_op.constant(5)
+      z = control_flow_ops.cond(
+          math_ops.less(x, y), lambda: math_ops.multiply(x, 17),
+          lambda: math_ops.add(y, 23))
+      self.assertEquals(z.eval(), 34)
+
+  def testCondFalse(self):
+    with self.test_session():
+      x = constant_op.constant(2)
+      y = constant_op.constant(1)
+      z = control_flow_ops.cond(
+          math_ops.less(x, y), lambda: math_ops.multiply(x, 17),
+          lambda: math_ops.add(y, 23))
+      self.assertEquals(z.eval(), 24)
+
+  def testCondTrueLegacy(self):
+    with self.test_session():
+      x = constant_op.constant(2)
+      y = constant_op.constant(5)
+      z = control_flow_ops.cond(
+          math_ops.less(x, y), fn1=lambda: math_ops.multiply(x, 17),
+          fn2=lambda: math_ops.add(y, 23))
+      self.assertEquals(z.eval(), 34)
+
+  def testCondFalseLegacy(self):
+    with self.test_session():
+      x = constant_op.constant(2)
+      y = constant_op.constant(1)
+      z = control_flow_ops.cond(
+          math_ops.less(x, y), fn1=lambda: math_ops.multiply(x, 17),
+          fn2=lambda: math_ops.add(y, 23))
+      self.assertEquals(z.eval(), 24)
+
+  def testCondMissingArg1(self):
+    with self.test_session():
+      x = constant_op.constant(1)
+      with self.assertRaises(TypeError):
+        control_flow_ops.cond(True, false_fn=lambda: x)
+
+  def testCondMissingArg2(self):
+    with self.test_session():
+      x = constant_op.constant(1)
+      with self.assertRaises(TypeError):
+        control_flow_ops.cond(True, lambda: x)
+
+  def testCondDuplicateArg1(self):
+    with self.test_session():
+      x = constant_op.constant(1)
+      with self.assertRaises(TypeError):
+        control_flow_ops.cond(True, lambda: x, lambda: x, fn1=lambda: x)
+
+  def testCondDuplicateArg2(self):
+    with self.test_session():
+      x = constant_op.constant(1)
+      with self.assertRaises(TypeError):
+        control_flow_ops.cond(True, lambda: x, lambda: x, fn2=lambda: x)
+
 
 class ContextTest(TensorFlowTestCase):
 
diff --git a/tensorflow/python/ops/ctc_ops.py b/tensorflow/python/ops/ctc_ops.py
index bd23a66e0c02433d8f4e3e12195e2fce731eb502..4ea4d9ed2dda15f032ffdb3ef91ff8dc66d82e00 100644
--- a/tensorflow/python/ops/ctc_ops.py
+++ b/tensorflow/python/ops/ctc_ops.py
@@ -30,16 +30,15 @@ from tensorflow.python.ops.nn_grad import _BroadcastMul
 # pylint: disable=protected-access, invalid-name
 def ctc_loss(labels, inputs, sequence_length,
              preprocess_collapse_repeated=False,
-             ctc_merge_repeated=True, time_major=True):
+             ctc_merge_repeated=True,
+             ignore_longer_outputs_than_inputs=False, time_major=True):
   """Computes the CTC (Connectionist Temporal Classification) Loss.
 
   This op implements the CTC loss as presented in the article:
 
-  A. Graves, S. Fernandez, F. Gomez, J. Schmidhuber.
+  [A. Graves, S. Fernandez, F. Gomez, J. Schmidhuber.
   Connectionist Temporal Classification: Labelling Unsegmented Sequence Data
-  with Recurrent Neural Networks. ICML 2006, Pittsburgh, USA, pp. 369-376.
-
-  http://www.cs.toronto.edu/~graves/icml_2006.pdf
+  with Recurrent Neural Networks. ICML 2006, Pittsburgh, USA, pp. 369-376.](http://www.cs.toronto.edu/~graves/icml_2006.pdf)
 
   Input requirements:
 
@@ -96,6 +95,11 @@ def ctc_loss(labels, inputs, sequence_length,
 
     Untested.  Very likely will not learn to output repeated classes.
 
+  The `ignore_longer_outputs_than_inputs` option allows to specify the behavior
+  of the CTCLoss when dealing with sequences that have longer outputs than
+  inputs. If true, the CTCLoss will simply return zero gradient for those
+  items, otherwise an InvalidArgument error is returned, stopping training.
+
   Args:
     labels: An `int32` `SparseTensor`.
       `labels.indices[i, :] == [b, t]` means `labels.values[i]` stores
@@ -113,6 +117,8 @@ def ctc_loss(labels, inputs, sequence_length,
     preprocess_collapse_repeated: Boolean.  Default: False.
       If True, repeated labels are collapsed prior to the CTC calculation.
     ctc_merge_repeated: Boolean.  Default: True.
+    ignore_longer_outputs_than_inputs: Boolean. Default: False.
+      If True, sequences with longer outputs than inputs will be ignored.
     time_major: The shape format of the `inputs` Tensors.
       If True, these `Tensors` must be shaped `[max_time, batch_size, num_classes]`.
       If False, these `Tensors` must be shaped `[batch_size, max_time, num_classes]`.
@@ -142,7 +148,8 @@ def ctc_loss(labels, inputs, sequence_length,
       labels.values,
       sequence_length,
       preprocess_collapse_repeated=preprocess_collapse_repeated,
-      ctc_merge_repeated=ctc_merge_repeated)
+      ctc_merge_repeated=ctc_merge_repeated,
+      ignore_longer_outputs_than_inputs=ignore_longer_outputs_than_inputs)
 
   return loss
 
@@ -207,7 +214,8 @@ def ctc_greedy_decoder(inputs, sequence_length, merge_repeated=True):
       `decoded.shape`: Shape vector, size `(2)`.
         The shape values are: `[batch_size, max_decoded_length]`
     neg_sum_logits: A `float` matrix `(batch_size x 1)` containing, for the
-        sequence found, the negative of the sum of the greatest logit at each timeframe.
+        sequence found, the negative of the sum of the greatest logit at each
+        timeframe.
   """
   outputs = gen_ctc_ops._ctc_greedy_decoder(
       inputs, sequence_length, merge_repeated=merge_repeated)
diff --git a/tensorflow/python/ops/data_flow_ops.py b/tensorflow/python/ops/data_flow_ops.py
index 6395451e2ae860ae1098b714306d315de9c010bf..9a208613add876898751ad18d31abd2cfe864dc5 100644
--- a/tensorflow/python/ops/data_flow_ops.py
+++ b/tensorflow/python/ops/data_flow_ops.py
@@ -21,7 +21,6 @@ from __future__ import print_function
 
 import collections
 import hashlib
-import re
 import threading
 
 import six
@@ -39,7 +38,6 @@ from tensorflow.python.ops import math_ops
 # pylint: disable=wildcard-import
 from tensorflow.python.ops.gen_data_flow_ops import *
 # pylint: enable=wildcard-import
-from tensorflow.python.util.deprecation import deprecated
 
 
 def _as_type_list(dtypes):
@@ -56,6 +54,7 @@ def _as_type_list(dtypes):
 def _as_shape_list(shapes, dtypes, unknown_dim_allowed=False,
                    unknown_rank_allowed=False):
   """Convert shapes to a list of tuples of int (or None)."""
+  del dtypes
   if unknown_dim_allowed:
     if (not isinstance(shapes, collections.Sequence)
         or not shapes
@@ -925,16 +924,18 @@ class Barrier(object):
     If barrier has no completed elements, this operation will block
     until there are 'num_elements' elements to take.
 
+    TODO(b/25743580): the semantics of `allow_small_batch` are experimental
+    and may be extended to other cases in the future.
+
+    TODO(ebrevdo): If a take_many(allow_small_batch=True) is blocking
+    already when the barrier is closed, it will block for ever. Fix this
+    by using asynchronous operations.
+
     Args:
       num_elements: The number of elements to take.
       allow_small_batch: If the barrier is closed, don't block if there are less
         completed elements than requested, but instead return all available
         completed elements.
-        TODO(b/25743580): the semantics of `allow_small_batch` are experimental
-        and may be extended to other cases in the future.
-        TODO(ebrevdo): If a take_many(allow_small_batch=True) is blocking
-        already when the barrier is closed, it will block for ever. Fix this
-        by using asynchronous operations.
       timeout: This specifies the number of milliseconds to block
         before returning with DEADLINE_EXCEEDED. (This option is not
         supported yet.)
@@ -1035,47 +1036,6 @@ class Barrier(object):
         self._barrier_ref, name=name)
 
 
-@deprecated("2017-03-02", "Use `tf.tables_initializer` instead.")
-def initialize_all_tables(name="init_all_tables"):
-  """Returns an Op that initializes all tables of the default graph.
-
-  Args:
-    name: Optional name for the initialization op.
-
-  Returns:
-    An Op that initializes all tables.  Note that if there are
-    not tables the returned Op is a NoOp.
-  """
-  return tables_initializer(name)
-
-
-def tables_initializer(name="init_all_tables"):
-  """Returns an Op that initializes all tables of the default graph.
-
-  Args:
-    name: Optional name for the initialization op.
-
-  Returns:
-    An Op that initializes all tables.  Note that if there are
-    not tables the returned Op is a NoOp.
-  """
-  initializers = ops.get_collection(ops.GraphKeys.TABLE_INITIALIZERS)
-  if initializers:
-    return control_flow_ops.group(*initializers, name=name)
-  return control_flow_ops.no_op(name=name)
-
-
-ops.NotDifferentiable("LookupTableFind")
-ops.NotDifferentiable("LookupTableInsert")
-ops.NotDifferentiable("LookupTableSize")
-ops.NotDifferentiable("HashTable")
-ops.NotDifferentiable("InitializeTable")
-ops.NotDifferentiable("InitializeTableFromTextFile")
-ops.NotDifferentiable("MutableDenseHashTable")
-ops.NotDifferentiable("MutableHashTable")
-ops.NotDifferentiable("MutableHashTableOfTensors")
-
-
 class ConditionalAccumulatorBase(object):
   """A conditional accumulator for aggregating gradients.
 
diff --git a/tensorflow/python/ops/distributions/BUILD b/tensorflow/python/ops/distributions/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..833239eb5fa7ad68b3cd2cc5d2346f060a1727ca
--- /dev/null
+++ b/tensorflow/python/ops/distributions/BUILD
@@ -0,0 +1,42 @@
+package(
+    default_visibility = [
+        "//tensorflow:internal",
+    ],
+    features = [
+        "-layering_check",
+        "-parse_headers",
+    ],
+)
+
+load("//tensorflow:tensorflow.bzl", "py_test")
+
+licenses(["notice"])  # Apache 2.0
+
+py_library(
+    name = "distributions",
+    srcs = glob(["*.py"]),
+    srcs_version = "PY2AND3",
+    deps = [
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:control_flow_ops",
+        "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:nn",
+        "//tensorflow/python:nn_ops",
+        "//tensorflow/python:platform",
+        "//tensorflow/python:special_math_ops",
+        "//tensorflow/python:util",
+    ],
+)
+
+filegroup(
+    name = "all_files",
+    srcs = glob(
+        ["**/*"],
+        exclude = [
+            "**/METADATA",
+            "**/OWNERS",
+        ],
+    ),
+    visibility = ["//tensorflow:__subpackages__"],
+)
diff --git a/tensorflow/python/ops/distributions/__init__.py b/tensorflow/python/ops/distributions/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..563b189990cfed5d6418c7cfca6c0fdf4226995f
--- /dev/null
+++ b/tensorflow/python/ops/distributions/__init__.py
@@ -0,0 +1,18 @@
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Core module for TensorFlow distribution objects and helpers."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
diff --git a/tensorflow/contrib/distributions/python/ops/bernoulli.py b/tensorflow/python/ops/distributions/bernoulli.py
similarity index 97%
rename from tensorflow/contrib/distributions/python/ops/bernoulli.py
rename to tensorflow/python/ops/distributions/bernoulli.py
index c491cb5d42a06f6726f8b13d0e509d47c61a53dd..3281b57e83e374ddae9ac9cb1d4ef0154c12f836 100644
--- a/tensorflow/contrib/distributions/python/ops/bernoulli.py
+++ b/tensorflow/python/ops/distributions/bernoulli.py
@@ -18,9 +18,6 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.contrib.distributions.python.ops import distribution
-from tensorflow.contrib.distributions.python.ops import distribution_util
-from tensorflow.contrib.distributions.python.ops import kullback_leibler
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_shape
@@ -30,6 +27,9 @@ from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import nn
 from tensorflow.python.ops import random_ops
+from tensorflow.python.ops.distributions import distribution
+from tensorflow.python.ops.distributions import kullback_leibler
+from tensorflow.python.ops.distributions import util as distribution_util
 
 
 class Bernoulli(distribution.Distribution):
diff --git a/tensorflow/contrib/distributions/python/ops/beta.py b/tensorflow/python/ops/distributions/beta.py
similarity index 98%
rename from tensorflow/contrib/distributions/python/ops/beta.py
rename to tensorflow/python/ops/distributions/beta.py
index 463808ea9af800515691e6bf7e7226b44a5ce68c..2b93478cdf9f9e80f4c2c19ad25cb270a8e7aa98 100644
--- a/tensorflow/contrib/distributions/python/ops/beta.py
+++ b/tensorflow/python/ops/distributions/beta.py
@@ -20,9 +20,6 @@ from __future__ import print_function
 
 import numpy as np
 
-from tensorflow.contrib.distributions.python.ops import distribution
-from tensorflow.contrib.distributions.python.ops import distribution_util
-from tensorflow.contrib.distributions.python.ops import kullback_leibler
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
@@ -33,6 +30,9 @@ from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import nn
 from tensorflow.python.ops import random_ops
+from tensorflow.python.ops.distributions import distribution
+from tensorflow.python.ops.distributions import kullback_leibler
+from tensorflow.python.ops.distributions import util as distribution_util
 
 
 __all__ = [
diff --git a/tensorflow/contrib/distributions/python/ops/bijectors/bijector.py b/tensorflow/python/ops/distributions/bijector.py
similarity index 92%
rename from tensorflow/contrib/distributions/python/ops/bijectors/bijector.py
rename to tensorflow/python/ops/distributions/bijector.py
index b0727cd8f36b8e954bc57897636181e47ead6e1e..70e9fdadd20e42b5618a23f4b03aa24decd267ba 100644
--- a/tensorflow/contrib/distributions/python/ops/bijectors/bijector.py
+++ b/tensorflow/python/ops/distributions/bijector.py
@@ -20,7 +20,7 @@ from __future__ import print_function
 
 # go/tf-wildcard-import
 # pylint: disable=wildcard-import
-from tensorflow.contrib.distributions.python.ops.bijectors.bijector_impl import *
+from tensorflow.python.ops.distributions.bijector_impl import *
 # pylint: enable=wildcard-import
 from tensorflow.python.util.all_util import remove_undocumented
 
diff --git a/tensorflow/contrib/distributions/python/ops/bijectors/bijector_impl.py b/tensorflow/python/ops/distributions/bijector_impl.py
similarity index 100%
rename from tensorflow/contrib/distributions/python/ops/bijectors/bijector_impl.py
rename to tensorflow/python/ops/distributions/bijector_impl.py
diff --git a/tensorflow/contrib/distributions/python/ops/bijectors/bijector_test_util.py b/tensorflow/python/ops/distributions/bijector_test_util.py
similarity index 98%
rename from tensorflow/contrib/distributions/python/ops/bijectors/bijector_test_util.py
rename to tensorflow/python/ops/distributions/bijector_test_util.py
index a0834423329da7bd512c3c825c888185f66af6bf..ff3535c62642d98bdd9b18808f45deae27d6d88d 100644
--- a/tensorflow/contrib/distributions/python/ops/bijectors/bijector_test_util.py
+++ b/tensorflow/python/ops/distributions/bijector_test_util.py
@@ -20,9 +20,9 @@ from __future__ import print_function
 
 import numpy as np
 
-from tensorflow.contrib.distributions.python.ops import uniform as uniform_lib
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import math_ops
+from tensorflow.python.ops.distributions import uniform as uniform_lib
 
 
 def assert_finite(array):
diff --git a/tensorflow/contrib/distributions/python/ops/categorical.py b/tensorflow/python/ops/distributions/categorical.py
similarity index 97%
rename from tensorflow/contrib/distributions/python/ops/categorical.py
rename to tensorflow/python/ops/distributions/categorical.py
index abdb94b3e9c18098e5a70a668e0ad3586a6cc6b7..1b74c2f0ca7b0fa4935401790f71c363e9748295 100644
--- a/tensorflow/contrib/distributions/python/ops/categorical.py
+++ b/tensorflow/python/ops/distributions/categorical.py
@@ -18,9 +18,6 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.contrib.distributions.python.ops import distribution
-from tensorflow.contrib.distributions.python.ops import distribution_util
-from tensorflow.contrib.distributions.python.ops import kullback_leibler
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
@@ -29,6 +26,9 @@ from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import nn_ops
 from tensorflow.python.ops import random_ops
+from tensorflow.python.ops.distributions import distribution
+from tensorflow.python.ops.distributions import kullback_leibler
+from tensorflow.python.ops.distributions import util as distribution_util
 
 
 class Categorical(distribution.Distribution):
diff --git a/tensorflow/contrib/distributions/python/ops/dirichlet.py b/tensorflow/python/ops/distributions/dirichlet.py
similarity index 98%
rename from tensorflow/contrib/distributions/python/ops/dirichlet.py
rename to tensorflow/python/ops/distributions/dirichlet.py
index c524f322b0d67f858c29b318839aacd96504f322..923696a553caae80592be65f7ffeecb3f9373bb0 100644
--- a/tensorflow/contrib/distributions/python/ops/dirichlet.py
+++ b/tensorflow/python/ops/distributions/dirichlet.py
@@ -20,8 +20,6 @@ from __future__ import print_function
 
 import numpy as np
 
-from tensorflow.contrib.distributions.python.ops import distribution
-from tensorflow.contrib.distributions.python.ops import distribution_util
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import check_ops
@@ -29,6 +27,8 @@ from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import random_ops
 from tensorflow.python.ops import special_math_ops
+from tensorflow.python.ops.distributions import distribution
+from tensorflow.python.ops.distributions import util as distribution_util
 
 
 __all__ = [
diff --git a/tensorflow/contrib/distributions/python/ops/dirichlet_multinomial.py b/tensorflow/python/ops/distributions/dirichlet_multinomial.py
similarity index 99%
rename from tensorflow/contrib/distributions/python/ops/dirichlet_multinomial.py
rename to tensorflow/python/ops/distributions/dirichlet_multinomial.py
index e647a4981ca6444c0c7e0404d181b1a2c4438229..662a7655584b8dc6aeed5251f98dd17fb24f3606 100644
--- a/tensorflow/contrib/distributions/python/ops/dirichlet_multinomial.py
+++ b/tensorflow/python/ops/distributions/dirichlet_multinomial.py
@@ -18,8 +18,6 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.contrib.distributions.python.ops import distribution
-from tensorflow.contrib.distributions.python.ops import distribution_util
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
@@ -28,6 +26,8 @@ from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import random_ops
 from tensorflow.python.ops import special_math_ops
+from tensorflow.python.ops.distributions import distribution
+from tensorflow.python.ops.distributions import util as distribution_util
 
 
 __all__ = [
diff --git a/tensorflow/contrib/distributions/python/ops/distribution.py b/tensorflow/python/ops/distributions/distribution.py
similarity index 99%
rename from tensorflow/contrib/distributions/python/ops/distribution.py
rename to tensorflow/python/ops/distributions/distribution.py
index 0b7ffbd792e36e0ba67b01ef636632eb1b1ee452..a0be433a616103fc9525c157494629044704ec02 100644
--- a/tensorflow/contrib/distributions/python/ops/distribution.py
+++ b/tensorflow/python/ops/distributions/distribution.py
@@ -25,13 +25,13 @@ import types
 import numpy as np
 import six
 
-from tensorflow.contrib.distributions.python.ops import distribution_util
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_shape
 from tensorflow.python.framework import tensor_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import math_ops
+from tensorflow.python.ops.distributions import util
 from tensorflow.python.util import tf_inspect
 
 
@@ -241,7 +241,7 @@ class Distribution(_BaseDistribution):
   docstrings for their method specializations. For example:
 
   ```python
-  @distribution_util.AppendDocstring("Some other details.")
+  @util.AppendDocstring("Some other details.")
   def _log_prob(self, value):
     ...
   ```
@@ -1033,10 +1033,9 @@ class Distribution(_BaseDistribution):
     if ndims is None:
       # Maybe expand_dims.
       ndims = array_ops.rank(x)
-      expanded_shape = distribution_util.pick_vector(
+      expanded_shape = util.pick_vector(
           math_ops.equal(ndims, 0),
-          np.array([1], dtype=np.int32),
-          array_ops.shape(x))
+          np.array([1], dtype=np.int32), array_ops.shape(x))
       x = array_ops.reshape(x, expanded_shape)
     elif ndims == 0:
       # Definitely expand_dims.
diff --git a/tensorflow/contrib/distributions/python/ops/exponential.py b/tensorflow/python/ops/distributions/exponential.py
similarity index 98%
rename from tensorflow/contrib/distributions/python/ops/exponential.py
rename to tensorflow/python/ops/distributions/exponential.py
index a293d1e0dc27ece2c9bd6c326674e2b2414b675a..281641b9156b9631199efc78ea1c2d30119dadb8 100644
--- a/tensorflow/contrib/distributions/python/ops/exponential.py
+++ b/tensorflow/python/ops/distributions/exponential.py
@@ -20,13 +20,13 @@ from __future__ import print_function
 
 import numpy as np
 
-from tensorflow.contrib.distributions.python.ops import gamma
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import nn
 from tensorflow.python.ops import random_ops
+from tensorflow.python.ops.distributions import gamma
 
 
 __all__ = [
diff --git a/tensorflow/contrib/distributions/python/ops/gamma.py b/tensorflow/python/ops/distributions/gamma.py
similarity index 98%
rename from tensorflow/contrib/distributions/python/ops/gamma.py
rename to tensorflow/python/ops/distributions/gamma.py
index f46e2116e107da5bd418507cde565242d16e8e6b..4ac2b9b4ef894fd9a603ff67bf9c8754f1e23b8e 100644
--- a/tensorflow/contrib/distributions/python/ops/gamma.py
+++ b/tensorflow/python/ops/distributions/gamma.py
@@ -20,9 +20,6 @@ from __future__ import print_function
 
 import numpy as np
 
-from tensorflow.contrib.distributions.python.ops import distribution
-from tensorflow.contrib.distributions.python.ops import distribution_util
-from tensorflow.contrib.distributions.python.ops import kullback_leibler
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
@@ -33,6 +30,9 @@ from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import nn
 from tensorflow.python.ops import random_ops
+from tensorflow.python.ops.distributions import distribution
+from tensorflow.python.ops.distributions import kullback_leibler
+from tensorflow.python.ops.distributions import util as distribution_util
 
 
 __all__ = [
diff --git a/tensorflow/contrib/distributions/python/ops/bijectors/identity_impl.py b/tensorflow/python/ops/distributions/identity_bijector.py
similarity index 95%
rename from tensorflow/contrib/distributions/python/ops/bijectors/identity_impl.py
rename to tensorflow/python/ops/distributions/identity_bijector.py
index 9438a5226cd83142b460a99ea7899a46fea73a16..f277eda8bbfb88f2344dfd620c573e0acd8d8078 100644
--- a/tensorflow/contrib/distributions/python/ops/bijectors/identity_impl.py
+++ b/tensorflow/python/ops/distributions/identity_bijector.py
@@ -18,8 +18,8 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.contrib.distributions.python.ops.bijectors import bijector
 from tensorflow.python.framework import constant_op
+from tensorflow.python.ops.distributions import bijector
 
 
 __all__ = [
diff --git a/tensorflow/contrib/distributions/python/ops/kullback_leibler.py b/tensorflow/python/ops/distributions/kullback_leibler.py
similarity index 82%
rename from tensorflow/contrib/distributions/python/ops/kullback_leibler.py
rename to tensorflow/python/ops/distributions/kullback_leibler.py
index 335fe7a5e2ada8e12f5f0e0bc7806154554df873..9770d82bd8398a9f6d88c4360b77a7a691e72e5a 100644
--- a/tensorflow/contrib/distributions/python/ops/kullback_leibler.py
+++ b/tensorflow/python/ops/distributions/kullback_leibler.py
@@ -44,11 +44,13 @@ def _registered_kl(type_a, type_b):
   return kl_fn
 
 
-def kl(dist_a, dist_b, allow_nan_stats=True, name=None):
-  """Get the KL-divergence KL(dist_a || dist_b).
+def kl_divergence(distribution_a, distribution_b,
+                  allow_nan_stats=True, name=None):
+  """Get the KL-divergence KL(distribution_a || distribution_b).
 
-  If there is no KL method registered specifically for `type(dist_a)` and
-  `type(dist_b)`, then the class hierarchies of these types are searched.
+  If there is no KL method registered specifically for `type(distribution_a)`
+  and `type(distribution_b)`, then the class hierarchies of these types are
+  searched.
 
   If one KL method is registered between any pairs of classes in these two
   parent hierarchies, it is used.
@@ -58,11 +60,11 @@ def kl(dist_a, dist_b, allow_nan_stats=True, name=None):
 
   If more than one such shortest path exists, the first method
   identified in the search is used (favoring a shorter MRO distance to
-  `type(dist_a)`).
+  `type(distribution_a)`).
 
   Args:
-    dist_a: The first distribution.
-    dist_b: The second distribution.
+    distribution_a: The first distribution.
+    distribution_b: The second distribution.
     allow_nan_stats: Python `bool`, default `True`. When `True`,
       statistics (e.g., mean, mode, variance) use the value "`NaN`" to
       indicate the result is undefined. When `False`, an exception is raised
@@ -70,20 +72,22 @@ def kl(dist_a, dist_b, allow_nan_stats=True, name=None):
     name: Python `str` name prefixed to Ops created by this class.
 
   Returns:
-    A Tensor with the batchwise KL-divergence between dist_a and dist_b.
+    A Tensor with the batchwise KL-divergence between `distribution_a`
+    and `distribution_b`.
 
   Raises:
     NotImplementedError: If no KL method is defined for distribution types
-      of dist_a and dist_b.
+      of `distribution_a` and `distribution_b`.
   """
-  kl_fn = _registered_kl(type(dist_a), type(dist_b))
+  kl_fn = _registered_kl(type(distribution_a), type(distribution_b))
   if kl_fn is None:
     raise NotImplementedError(
-        "No KL(dist_a || dist_b) registered for dist_a type %s and dist_b "
-        "type %s" % (type(dist_a).__name__, type(dist_b).__name__))
+        "No KL(distribution_a || distribution_b) registered for distribution_a "
+        "type %s and distribution_b type %s"
+        % (type(distribution_a).__name__, type(distribution_b).__name__))
 
   with ops.name_scope("KullbackLeibler"):
-    kl_t = kl_fn(dist_a, dist_b, name=name)
+    kl_t = kl_fn(distribution_a, distribution_b, name=name)
     if allow_nan_stats:
       return kl_t
 
@@ -96,7 +100,7 @@ def kl(dist_a, dist_b, allow_nan_stats=True, name=None):
                 math_ops.reduce_any(math_ops.is_nan(kl_t))),
             ["KL calculation between %s and %s returned NaN values "
              "(and was called with allow_nan_stats=False). Values:"
-             % (dist_a.name, dist_b.name), kl_t])]):
+             % (distribution_a.name, distribution_b.name), kl_t])]):
       return array_ops.identity(kl_t, name="checked_kl")
 
 
diff --git a/tensorflow/contrib/distributions/python/ops/laplace.py b/tensorflow/python/ops/distributions/laplace.py
similarity index 98%
rename from tensorflow/contrib/distributions/python/ops/laplace.py
rename to tensorflow/python/ops/distributions/laplace.py
index eff4f5f9b8906385c2b8635c97eeccb0b08e9e68..5c964ff78a53b6d2dec588b85abff2c5b1173c06 100644
--- a/tensorflow/contrib/distributions/python/ops/laplace.py
+++ b/tensorflow/python/ops/distributions/laplace.py
@@ -22,8 +22,6 @@ import math
 
 import numpy as np
 
-from tensorflow.contrib.distributions.python.ops import distribution
-from tensorflow.contrib.distributions.python.ops import special_math
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
@@ -33,6 +31,8 @@ from tensorflow.python.ops import check_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import nn
 from tensorflow.python.ops import random_ops
+from tensorflow.python.ops.distributions import distribution
+from tensorflow.python.ops.distributions import special_math
 
 
 __all__ = [
diff --git a/tensorflow/contrib/distributions/python/ops/multinomial.py b/tensorflow/python/ops/distributions/multinomial.py
similarity index 98%
rename from tensorflow/contrib/distributions/python/ops/multinomial.py
rename to tensorflow/python/ops/distributions/multinomial.py
index e5e24cc87f05f5e000402e0269bb043c76cacf44..a5bea7b4bad0e644cb7776446195f2734750ce7e 100644
--- a/tensorflow/contrib/distributions/python/ops/multinomial.py
+++ b/tensorflow/python/ops/distributions/multinomial.py
@@ -18,8 +18,6 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.contrib.distributions.python.ops import distribution
-from tensorflow.contrib.distributions.python.ops import distribution_util
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
@@ -27,6 +25,8 @@ from tensorflow.python.ops import check_ops
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import random_ops
+from tensorflow.python.ops.distributions import distribution
+from tensorflow.python.ops.distributions import util as distribution_util
 
 
 __all__ = [
diff --git a/tensorflow/contrib/distributions/python/ops/normal.py b/tensorflow/python/ops/distributions/normal.py
similarity index 95%
rename from tensorflow/contrib/distributions/python/ops/normal.py
rename to tensorflow/python/ops/distributions/normal.py
index a8bc918af96bd1e10a87967ade0e60aaa6a18f3d..0ef1c91df8c83146fdae086d6056b1d947bae128 100644
--- a/tensorflow/contrib/distributions/python/ops/normal.py
+++ b/tensorflow/python/ops/distributions/normal.py
@@ -20,9 +20,6 @@ from __future__ import print_function
 
 import math
 
-from tensorflow.contrib.distributions.python.ops import distribution
-from tensorflow.contrib.distributions.python.ops import kullback_leibler
-from tensorflow.contrib.distributions.python.ops import special_math
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
@@ -32,6 +29,9 @@ from tensorflow.python.ops import check_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import nn
 from tensorflow.python.ops import random_ops
+from tensorflow.python.ops.distributions import distribution
+from tensorflow.python.ops.distributions import kullback_leibler
+from tensorflow.python.ops.distributions import special_math
 
 
 __all__ = [
@@ -70,14 +70,14 @@ class Normal(distribution.Distribution):
 
   ```python
   # Define a single scalar Normal distribution.
-  dist = tf.contrib.distributions.Normal(loc=0., scale=3.)
+  dist = tf.distributions.Normal(loc=0., scale=3.)
 
   # Evaluate the cdf at 1, returning a scalar.
   dist.cdf(1.)
 
   # Define a batch of two scalar valued Normals.
   # The first has mean 1 and standard deviation 11, the second 2 and 22.
-  dist = tf.contrib.distributions.Normal(loc=[1, 2.], scale=[11, 22.])
+  dist = tf.distributions.Normal(loc=[1, 2.], scale=[11, 22.])
 
   # Evaluate the pdf of the first distribution on 0, and the second on 1.5,
   # returning a length two tensor.
@@ -92,7 +92,7 @@ class Normal(distribution.Distribution):
   ```python
   # Define a batch of two scalar valued Normals.
   # Both have mean 1, but different standard deviations.
-  dist = tf.contrib.distributions.Normal(loc=1., scale=[11, 22.])
+  dist = tf.distributions.Normal(loc=1., scale=[11, 22.])
 
   # Evaluate the pdf of both distributions on the same point, 3.0,
   # returning a length 2 tensor.
diff --git a/tensorflow/contrib/distributions/python/ops/special_math.py b/tensorflow/python/ops/distributions/special_math.py
similarity index 100%
rename from tensorflow/contrib/distributions/python/ops/special_math.py
rename to tensorflow/python/ops/distributions/special_math.py
diff --git a/tensorflow/contrib/distributions/python/ops/student_t.py b/tensorflow/python/ops/distributions/student_t.py
similarity index 96%
rename from tensorflow/contrib/distributions/python/ops/student_t.py
rename to tensorflow/python/ops/distributions/student_t.py
index 87f5ecd7ae76f1cdcb4fa3606f97dda9e07af423..073ac4286be170dcfd564f61f1026a85d95c772c 100644
--- a/tensorflow/contrib/distributions/python/ops/student_t.py
+++ b/tensorflow/python/ops/distributions/student_t.py
@@ -20,8 +20,6 @@ from __future__ import print_function
 
 import numpy as np
 
-from tensorflow.contrib.distributions.python.ops import distribution
-from tensorflow.contrib.distributions.python.ops import distribution_util
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
@@ -33,6 +31,8 @@ from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import nn
 from tensorflow.python.ops import random_ops
 from tensorflow.python.ops import special_math_ops
+from tensorflow.python.ops.distributions import distribution
+from tensorflow.python.ops.distributions import util as distribution_util
 
 
 __all__ = [
@@ -42,8 +42,10 @@ __all__ = [
 
 
 class StudentT(distribution.Distribution):
-  # pylint: disable=line-too-long
-  """Student's t-distribution with degree of freedom `df`, location `loc`, and `scale` parameters.
+  """Student's t-distribution.
+
+  This distribution has parameters: degree of freedom `df`, location `loc`,
+  and `scale`.
 
   #### Mathematical details
 
@@ -82,7 +84,7 @@ class StudentT(distribution.Distribution):
 
   ```python
   # Define a single scalar Student t distribution.
-  single_dist = tf.contrib.distributions.StudentT(df=3)
+  single_dist = tf.distributions.StudentT(df=3)
 
   # Evaluate the pdf at 1, returning a scalar Tensor.
   single_dist.prob(1.)
@@ -90,7 +92,7 @@ class StudentT(distribution.Distribution):
   # Define a batch of two scalar valued Student t's.
   # The first has degrees of freedom 2, mean 1, and scale 11.
   # The second 3, 2 and 22.
-  multi_dist = tf.contrib.distributions.StudentT(df=[2, 3],
+  multi_dist = tf.distributions.StudentT(df=[2, 3],
                                                  loc=[1, 2.],
                                                  scale=[11, 22.])
 
@@ -107,7 +109,7 @@ class StudentT(distribution.Distribution):
   ```python
   # Define a batch of two Student's t distributions.
   # Both have df 2 and mean 1, but different scales.
-  dist = tf.contrib.distributions.StudentT(df=2, loc=1, scale=[11, 22.])
+  dist = tf.distributions.StudentT(df=2, loc=1, scale=[11, 22.])
 
   # Evaluate the pdf of both distributions on the same point, 3.0,
   # returning a length 2 tensor.
diff --git a/tensorflow/contrib/distributions/python/ops/transformed_distribution.py b/tensorflow/python/ops/distributions/transformed_distribution.py
similarity index 98%
rename from tensorflow/contrib/distributions/python/ops/transformed_distribution.py
rename to tensorflow/python/ops/distributions/transformed_distribution.py
index 1403adbda21e8949253161f493bba7b463130c28..09b26a9fb73bca4415351fe0f8d717193c77a70f 100644
--- a/tensorflow/contrib/distributions/python/ops/transformed_distribution.py
+++ b/tensorflow/python/ops/distributions/transformed_distribution.py
@@ -19,11 +19,8 @@ from __future__ import print_function
 
 import numpy as np
 
-from tensorflow.contrib.distributions.python.ops import distribution as distribution_lib
-from tensorflow.contrib.distributions.python.ops import distribution_util
 # Bijectors must be directly imported because `remove_undocumented` prevents
 # individual file imports.
-from tensorflow.contrib.distributions.python.ops.bijectors.identity import Identity
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
@@ -33,6 +30,9 @@ from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import check_ops
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import math_ops
+from tensorflow.python.ops.distributions import distribution as distribution_lib
+from tensorflow.python.ops.distributions import identity_bijector
+from tensorflow.python.ops.distributions import util as distribution_util
 
 __all__ = [
     "TransformedDistribution",
@@ -265,7 +265,7 @@ class TransformedDistribution(distribution_lib.Distribution):
       self._empty = constant_op.constant([], dtype=dtypes.int32, name="empty")
 
       if bijector is None:
-        bijector = Identity(validate_args=validate_args)
+        bijector = identity_bijector.Identity(validate_args=validate_args)
 
       # We will keep track of a static and dynamic version of
       # self._is_{batch,event}_override. This way we can do more prior to graph
diff --git a/tensorflow/contrib/distributions/python/ops/uniform.py b/tensorflow/python/ops/distributions/uniform.py
similarity index 98%
rename from tensorflow/contrib/distributions/python/ops/uniform.py
rename to tensorflow/python/ops/distributions/uniform.py
index 81a4c8cdefeeb1ffcab96cd0af717fbfee700cad..9b555f87eae14fe30ff020f996778a4ad8f98ab9 100644
--- a/tensorflow/contrib/distributions/python/ops/uniform.py
+++ b/tensorflow/python/ops/distributions/uniform.py
@@ -20,7 +20,6 @@ from __future__ import print_function
 
 import math
 
-from tensorflow.contrib.distributions.python.ops import distribution
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
@@ -29,6 +28,7 @@ from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import check_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import random_ops
+from tensorflow.python.ops.distributions import distribution
 
 
 class Uniform(distribution.Distribution):
diff --git a/tensorflow/python/ops/distributions/util.py b/tensorflow/python/ops/distributions/util.py
new file mode 100644
index 0000000000000000000000000000000000000000..74a7b53a3cecf50e087034e53fe0fd5bc9c9af43
--- /dev/null
+++ b/tensorflow/python/ops/distributions/util.py
@@ -0,0 +1,693 @@
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Utilities for probability distributions."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import functools
+import hashlib
+import math
+import numpy as np
+
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import tensor_shape
+from tensorflow.python.framework import tensor_util
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import check_ops
+from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import nn
+
+
+def assert_close(
+    x, y, data=None, summarize=None, message=None, name="assert_close"):
+  """Assert that that x and y are within machine epsilon of each other.
+
+  Args:
+    x: Floating-point `Tensor`
+    y: Floating-point `Tensor`
+    data: The tensors to print out if the condition is `False`. Defaults to
+      error message and first few entries of `x` and `y`.
+    summarize: Print this many entries of each tensor.
+    message: A string to prefix to the default message.
+    name: A name for this operation (optional).
+
+  Returns:
+    Op raising `InvalidArgumentError` if |x - y| > machine epsilon.
+  """
+  message = message or ""
+  x = ops.convert_to_tensor(x, name="x")
+  y = ops.convert_to_tensor(y, name="y")
+
+  if data is None:
+    data = [
+        message,
+        "Condition x ~= y did not hold element-wise: x = ", x.name, x, "y = ",
+        y.name, y
+    ]
+
+  if x.dtype.is_integer:
+    return check_ops.assert_equal(
+        x, y, data=data, summarize=summarize, message=message, name=name)
+
+  with ops.name_scope(name, "assert_close", [x, y, data]):
+    tol = np.finfo(x.dtype.as_numpy_dtype).eps
+    condition = math_ops.reduce_all(math_ops.less_equal(math_ops.abs(x-y), tol))
+    return control_flow_ops.Assert(
+        condition, data, summarize=summarize)
+
+
+def assert_integer_form(
+    x, data=None, summarize=None, message=None, name="assert_integer_form"):
+  """Assert that x has integer components (or floats equal to integers).
+
+  Args:
+    x: Floating-point `Tensor`
+    data: The tensors to print out if the condition is `False`. Defaults to
+      error message and first few entries of `x` and `y`.
+    summarize: Print this many entries of each tensor.
+    message: A string to prefix to the default message.
+    name: A name for this operation (optional).
+
+  Returns:
+    Op raising `InvalidArgumentError` if round(x) != x.
+  """
+
+  message = message or "x has non-integer components"
+  x = ops.convert_to_tensor(x, name="x")
+  casted_x = math_ops.to_int64(x)
+  return check_ops.assert_equal(
+      x, math_ops.cast(math_ops.round(casted_x), x.dtype),
+      data=data, summarize=summarize, message=message, name=name)
+
+
+def assert_symmetric(matrix):
+  matrix_t = array_ops.matrix_transpose(matrix)
+  return control_flow_ops.with_dependencies(
+      [check_ops.assert_equal(matrix, matrix_t)], matrix)
+
+
+def embed_check_nonnegative_discrete(x, check_integer=True):
+  """Assert x is a non-negative tensor, and optionally of integers."""
+  assertions = [check_ops.assert_non_negative(
+      x, message="x must be non-negative.")]
+  if check_integer:
+    assertions += [assert_integer_form(
+        x, message="x cannot contain fractional components.")]
+  return control_flow_ops.with_dependencies(assertions, x)
+
+
+def same_dynamic_shape(a, b):
+  """Returns whether a and b have the same dynamic shape.
+
+  Args:
+    a: `Tensor`
+    b: `Tensor`
+
+  Returns:
+    `bool` `Tensor` representing if both tensors have the same shape.
+  """
+  a = ops.convert_to_tensor(a, name="a")
+  b = ops.convert_to_tensor(b, name="b")
+
+  # Here we can't just do math_ops.equal(a.shape, b.shape), since
+  # static shape inference may break the equality comparison between
+  # shape(a) and shape(b) in math_ops.equal.
+  def all_shapes_equal():
+    return math_ops.reduce_all(math_ops.equal(
+        array_ops.concat([array_ops.shape(a), array_ops.shape(b)], 0),
+        array_ops.concat([array_ops.shape(b), array_ops.shape(a)], 0)))
+
+  # One of the shapes isn't fully defined, so we need to use the dynamic
+  # shape.
+  return control_flow_ops.cond(
+      math_ops.equal(array_ops.rank(a), array_ops.rank(b)),
+      all_shapes_equal,
+      lambda: constant_op.constant(False))
+
+
+def get_logits_and_probs(logits=None,
+                         probs=None,
+                         multidimensional=False,
+                         validate_args=False,
+                         name="get_logits_and_probs"):
+  """Converts logit to probabilities (or vice-versa), and returns both.
+
+  Args:
+    logits: Floating-point `Tensor` representing log-odds.
+    probs: Floating-point `Tensor` representing probabilities.
+    multidimensional: Python `bool`, default `False`.
+      If `True`, represents whether the last dimension of `logits` or `probs`,
+      a `[N1, N2, ...  k]` dimensional tensor, representing the
+      logit or probability of `shape[-1]` classes.
+    validate_args: Python `bool`, default `False`. When `True`, either assert
+      `0 <= probs <= 1` (if not `multidimensional`) or that the last dimension
+      of `probs` sums to one.
+    name: A name for this operation (optional).
+
+  Returns:
+    logits, probs: Tuple of `Tensor`s. If `probs` has an entry that is `0` or
+      `1`, then the corresponding entry in the returned logit will be `-Inf` and
+      `Inf` respectively.
+
+  Raises:
+    ValueError: if neither `probs` nor `logits` were passed in, or both were.
+  """
+  with ops.name_scope(name, values=[probs, logits]):
+    if (probs is None) == (logits is None):
+      raise ValueError("Must pass probs or logits, but not both.")
+
+    if probs is None:
+      logits = ops.convert_to_tensor(logits, name="logits")
+      if multidimensional:
+        return logits, nn.softmax(logits, name="probs")
+      return logits, math_ops.sigmoid(logits, name="probs")
+
+    probs = ops.convert_to_tensor(probs, name="probs")
+    if validate_args:
+      with ops.name_scope("validate_probs"):
+        one = constant_op.constant(1., probs.dtype)
+        dependencies = [check_ops.assert_non_negative(probs)]
+        if multidimensional:
+          dependencies += [assert_close(math_ops.reduce_sum(probs, -1), one,
+                                        message="probs does not sum to 1.")]
+        else:
+          dependencies += [check_ops.assert_less_equal(
+              probs, one, message="probs has components greater than 1.")]
+        probs = control_flow_ops.with_dependencies(dependencies, probs)
+
+    with ops.name_scope("logits"):
+      if multidimensional:
+        # Here we don't compute the multidimensional case, in a manner
+        # consistent with respect to the unidimensional case. We do so
+        # following the TF convention. Typically, you might expect to see
+        # logits = log(probs) - log(probs[pivot]). A side-effect of
+        # being consistent with the TF approach is that the unidimensional case
+        # implicitly handles the second dimension but the multidimensional case
+        # explicitly keeps the pivot dimension.
+        return math_ops.log(probs), probs
+      return math_ops.log(probs) - math_ops.log1p(-1. * probs), probs
+
+
+def log_combinations(n, counts, name="log_combinations"):
+  """Multinomial coefficient.
+
+  Given `n` and `counts`, where `counts` has last dimension `k`, we compute
+  the multinomial coefficient as:
+
+  ```n! / sum_i n_i!```
+
+  where `i` runs over all `k` classes.
+
+  Args:
+    n: Floating-point `Tensor` broadcastable with `counts`. This represents `n`
+      outcomes.
+    counts: Floating-point `Tensor` broadcastable with `n`. This represents
+      counts in `k` classes, where `k` is the last dimension of the tensor.
+    name: A name for this operation (optional).
+
+  Returns:
+    `Tensor` representing the multinomial coefficient between `n` and `counts`.
+  """
+  # First a bit about the number of ways counts could have come in:
+  # E.g. if counts = [1, 2], then this is 3 choose 2.
+  # In general, this is (sum counts)! / sum(counts!)
+  # The sum should be along the last dimension of counts. This is the
+  # "distribution" dimension. Here n a priori represents the sum of counts.
+  with ops.name_scope(name, values=[n, counts]):
+    n = ops.convert_to_tensor(n, name="n")
+    counts = ops.convert_to_tensor(counts, name="counts")
+    total_permutations = math_ops.lgamma(n + 1)
+    counts_factorial = math_ops.lgamma(counts + 1)
+    redundant_permutations = math_ops.reduce_sum(counts_factorial, axis=[-1])
+    return total_permutations - redundant_permutations
+
+
+def matrix_diag_transform(matrix, transform=None, name=None):
+  """Transform diagonal of [batch-]matrix, leave rest of matrix unchanged.
+
+  Create a trainable covariance defined by a Cholesky factor:
+
+  ```python
+  # Transform network layer into 2 x 2 array.
+  matrix_values = tf.contrib.layers.fully_connected(activations, 4)
+  matrix = tf.reshape(matrix_values, (batch_size, 2, 2))
+
+  # Make the diagonal positive. If the upper triangle was zero, this would be a
+  # valid Cholesky factor.
+  chol = matrix_diag_transform(matrix, transform=tf.nn.softplus)
+
+  # OperatorPDCholesky ignores the upper triangle.
+  operator = OperatorPDCholesky(chol)
+  ```
+
+  Example of heteroskedastic 2-D linear regression.
+
+  ```python
+  # Get a trainable Cholesky factor.
+  matrix_values = tf.contrib.layers.fully_connected(activations, 4)
+  matrix = tf.reshape(matrix_values, (batch_size, 2, 2))
+  chol = matrix_diag_transform(matrix, transform=tf.nn.softplus)
+
+  # Get a trainable mean.
+  mu = tf.contrib.layers.fully_connected(activations, 2)
+
+  # This is a fully trainable multivariate normal!
+  dist = tf.contrib.distributions.MVNCholesky(mu, chol)
+
+  # Standard log loss. Minimizing this will "train" mu and chol, and then dist
+  # will be a distribution predicting labels as multivariate Gaussians.
+  loss = -1 * tf.reduce_mean(dist.log_prob(labels))
+  ```
+
+  Args:
+    matrix:  Rank `R` `Tensor`, `R >= 2`, where the last two dimensions are
+      equal.
+    transform:  Element-wise function mapping `Tensors` to `Tensors`. To
+      be applied to the diagonal of `matrix`. If `None`, `matrix` is returned
+      unchanged. Defaults to `None`.
+    name:  A name to give created ops.
+      Defaults to "matrix_diag_transform".
+
+  Returns:
+    A `Tensor` with same shape and `dtype` as `matrix`.
+  """
+  with ops.name_scope(name, "matrix_diag_transform", [matrix]):
+    matrix = ops.convert_to_tensor(matrix, name="matrix")
+    if transform is None:
+      return matrix
+    # Replace the diag with transformed diag.
+    diag = array_ops.matrix_diag_part(matrix)
+    transformed_diag = transform(diag)
+    transformed_mat = array_ops.matrix_set_diag(matrix, transformed_diag)
+
+  return transformed_mat
+
+
+def rotate_transpose(x, shift, name="rotate_transpose"):
+  """Circularly moves dims left or right.
+
+  Effectively identical to:
+
+  ```python
+  numpy.transpose(x, numpy.roll(numpy.arange(len(x.shape)), shift))
+  ```
+
+  When `validate_args=False` additional graph-runtime checks are
+  performed. These checks entail moving data from to GPU to CPU.
+
+  Example:
+
+    ```python
+    x = ...  # Tensor of shape [1, 2, 3, 4].
+    rotate_transpose(x, -1)  # result shape: [2, 3, 4, 1]
+    rotate_transpose(x, -2)  # result shape: [3, 4, 1, 2]
+    rotate_transpose(x,  1)  # result shape: [4, 1, 2, 3]
+    rotate_transpose(x,  2)  # result shape: [3, 4, 1, 2]
+    rotate_transpose(x, 7) == rotate_transpose(x, 3)
+    rotate_transpose(x, -7) == rotate_transpose(x, -3)
+    ```
+
+  Args:
+    x: `Tensor`.
+    shift: `Tensor`. Number of dimensions to transpose left (shift<0) or
+      transpose right (shift>0).
+    name: Python `str`. The name to give this op.
+
+  Returns:
+    rotated_x: Input `Tensor` with dimensions circularly rotated by shift.
+
+  Raises:
+    TypeError: if shift is not integer type.
+  """
+  with ops.name_scope(name, values=[x, shift]):
+    x = ops.convert_to_tensor(x, name="x")
+    shift = ops.convert_to_tensor(shift, name="shift")
+    # We do not assign back to preserve constant-ness.
+    check_ops.assert_integer(shift)
+    shift_value_static = tensor_util.constant_value(shift)
+    ndims = x.get_shape().ndims
+    if ndims is not None and shift_value_static is not None:
+      if ndims < 2: return x
+      shift_value_static = np.sign(shift_value_static) * (
+          abs(shift_value_static) % ndims)
+      if shift_value_static == 0: return x
+      perm = np.roll(np.arange(ndims), shift_value_static)
+      return array_ops.transpose(x, perm=perm)
+    else:
+      # Consider if we always had a positive shift, and some specified
+      # direction.
+      # When shifting left we want the new array:
+      #   last(x, n-shift) + first(x, shift)
+      # and if shifting right then we want:
+      #   last(x, shift) + first(x, n-shift)
+      # Observe that last(a) == slice(a, n) and first(a) == slice(0, a).
+      # Also, we can encode direction and shift as one: direction * shift.
+      # Combining these facts, we have:
+      #   a = cond(shift<0, -shift, n-shift)
+      #   last(x, n-a) + first(x, a) == x[a:n] + x[0:a]
+      # Finally, we transform shift by modulo length so it can be specified
+      # independently from the array upon which it operates (like python).
+      ndims = array_ops.rank(x)
+      shift = array_ops.where(math_ops.less(shift, 0),
+                              math_ops.mod(-shift, ndims),
+                              ndims - math_ops.mod(shift, ndims))
+      first = math_ops.range(0, shift)
+      last = math_ops.range(shift, ndims)
+      perm = array_ops.concat([last, first], 0)
+      return array_ops.transpose(x, perm=perm)
+
+
+def pick_vector(cond,
+                true_vector,
+                false_vector,
+                name="pick_vector"):
+  """Picks possibly different length row `Tensor`s based on condition.
+
+  Value `Tensor`s should have exactly one dimension.
+
+  If `cond` is a python Boolean or `tf.constant` then either `true_vector` or
+  `false_vector` is immediately returned. I.e., no graph nodes are created and
+  no validation happens.
+
+  Args:
+    cond: `Tensor`. Must have `dtype=tf.bool` and be scalar.
+    true_vector: `Tensor` of one dimension. Returned when cond is `True`.
+    false_vector: `Tensor` of one dimension. Returned when cond is `False`.
+    name: Python `str`. The name to give this op.
+
+  Example:
+
+  ```python
+  pick_vector(tf.less(0, 5), tf.range(10, 12), tf.range(15, 18))
+  # result is tensor: [10, 11].
+  pick_vector(tf.less(5, 0), tf.range(10, 12), tf.range(15, 18))
+  # result is tensor: [15, 16, 17].
+  ```
+
+  Returns:
+    true_or_false_vector: `Tensor`.
+
+  Raises:
+    TypeError: if `cond.dtype != tf.bool`
+    TypeError: if `cond` is not a constant and
+      `true_vector.dtype != false_vector.dtype`
+  """
+  with ops.name_scope(name, values=(cond, true_vector, false_vector)):
+    cond = ops.convert_to_tensor(cond, name="cond")
+    if cond.dtype != dtypes.bool:
+      raise TypeError("%s.dtype=%s which is not %s" %
+                      (cond.name, cond.dtype, dtypes.bool))
+    cond_value_static = tensor_util.constant_value(cond)
+    if cond_value_static is not None:
+      return true_vector if cond_value_static else false_vector
+    true_vector = ops.convert_to_tensor(true_vector, name="true_vector")
+    false_vector = ops.convert_to_tensor(false_vector, name="false_vector")
+    if true_vector.dtype != false_vector.dtype:
+      raise TypeError(
+          "%s.dtype=%s does not match %s.dtype=%s"
+          % (true_vector.name, true_vector.dtype,
+             false_vector.name, false_vector.dtype))
+    n = array_ops.shape(true_vector)[0]
+    return array_ops.slice(
+        array_ops.concat([true_vector, false_vector], 0),
+        [array_ops.where(cond, 0, n)], [array_ops.where(cond, n, -1)])
+
+
+def gen_new_seed(seed, salt):
+  """Generate a new seed, from the given seed and salt."""
+  if seed is None:
+    return None
+  string = (str(seed) + salt).encode("utf-8")
+  return int(hashlib.md5(string).hexdigest()[:8], 16) & 0x7FFFFFFF
+
+
+def fill_lower_triangular(x, validate_args=False, name="fill_lower_triangular"):
+  """Creates a (batch of) lower triangular matrix from a vector of inputs.
+
+  If `x.get_shape()` is `[b1, b2, ..., bK, d]` then the output shape is `[b1,
+  b2, ..., bK, n, n]` where `n` is such that `d = n(n+1)/2`, i.e.,
+  `n = int(0.5 * (math.sqrt(1. + 8. * d) - 1.))`.
+
+  Although the non-batch complexity is O(n**2), large constants and sub-optimal
+  vectorization means the complexity of this function is 5x slower than zeroing
+  out the upper triangular, i.e., `tf.matrix_band_part(X, -1, 0)`. This
+  function becomes competitive only when several matmul/cholesky/etc ops can be
+  ellided in constructing the input. Example: wiring a fully connected layer as
+  a covariance matrix; this function reduces the final layer by 2x and possibly
+  reduces the network arch complexity considerably. In most cases it is better
+  to simply build a full matrix and zero out the upper triangular elements,
+  e.g., `tril = tf.matrix_band_part(full, -1, 0)`, rather than directly
+  construct a lower triangular.
+
+  Example:
+
+  ```python
+  fill_lower_triangular([1, 2, 3, 4, 5, 6])
+  # Returns: [[1, 0, 0],
+  #           [2, 3, 0],
+  #           [4, 5, 6]]
+  ```
+
+  For comparison, a pure numpy version of this function can be found in
+  `distribution_util_test.py`, function `_fill_lower_triangular`.
+
+  Args:
+    x: `Tensor` representing lower triangular elements.
+    validate_args: Python `bool`, default `False`. Whether to ensure the shape
+      of `x` can be mapped to a lower triangular matrix (controls non-static
+      checks only).
+    name: Python `str`. The name to give this op.
+
+  Returns:
+    tril: `Tensor` with lower triangular elements filled from `x`.
+
+  Raises:
+    ValueError: if shape if `x` has static shape which cannot be mapped to a
+      lower triangular matrix.
+  """
+  # TODO(jvdillon): Replace this code with dedicated op when it exists.
+  with ops.name_scope(name, values=[x]):
+    x = ops.convert_to_tensor(x, name="x")
+    if (x.get_shape().ndims is not None and
+        x.get_shape()[-1].value is not None):
+      d = x.get_shape()[-1].value
+      # d = n(n+1)/2 implies n is:
+      n = int(0.5 * (math.sqrt(1. + 8. * d) - 1.))
+      d_inferred = n * (n + 1) /2
+      if d != d_inferred:
+        raise ValueError("Input cannot be mapped to a lower triangular; "
+                         "n*(n+1)/2 = %d != %d" % (d_inferred, d))
+      final_shape = x.get_shape()[:-1].concatenate(
+          tensor_shape.TensorShape([n, n]))
+    else:
+      d = math_ops.cast(array_ops.shape(x)[-1], dtype=dtypes.float32)
+      # d = n(n+1)/2 implies n is:
+      n = math_ops.cast(0.5 * (dtypes.sqrt(1. + 8. * d) - 1.),
+                        dtype=dtypes.int32)
+      if validate_args:
+        is_valid_input_shape = check_ops.assert_equal(
+            n * (n + 1) / 2, d,
+            message="Input cannot be mapped to a lower triangular.")
+        n = control_flow_ops.with_dependencies([is_valid_input_shape], n)
+      final_shape = x.get_shape()[:-1].concatenate(
+          tensor_shape.TensorShape([None, None]))
+
+    def tril_ids(n):
+      """Internal helper to create vector of linear indices into y."""
+      # Build the ids statically; chose 512 because it implies 1MiB.
+      if not tensor_util.is_tensor(n) and n <= 512:
+        ids = np.arange(n**2, dtype=np.int32)
+        rows = (ids / n).astype(np.int32)  # Implicit floor.
+        # We need to stop incrementing the index when we encounter
+        # upper-triangular elements. The idea here is to compute the
+        # lower-right number of zeros then by "symmetry" subtract this from the
+        # total number of zeros, n(n-1)/2.
+        # Then we note that: n(n-1)/2 - (n-r)*(n-r-1)/2 = r(2n-r-1)/2
+        offset = (rows * (2 * n - rows - 1) / 2).astype(np.int32)
+        # We could also zero out when (rows < cols) == (rows < ids-n*rows).
+        # mask = (ids <= (n + 1) * rows).astype(np.int32)
+      else:
+        ids = math_ops.range(n**2)
+        rows = math_ops.cast(ids / n, dtype=dtypes.int32)
+        offset = math_ops.cast(rows * (2 * n - rows - 1) / 2,
+                               dtype=dtypes.int32)
+      return ids - offset
+
+    # Special-case non-batch case.
+    if x.get_shape().ndims == 1:
+      y = array_ops.gather(x, array_ops.reshape(tril_ids(n), [n, n]))
+      y = array_ops.matrix_band_part(y, -1, 0)
+      y.set_shape(y.get_shape().merge_with(final_shape))
+      return y
+
+    # Make ids for each batch dim.
+    if (x.get_shape().ndims is not None and
+        x.get_shape()[:-1].is_fully_defined()):
+      batch_shape = np.asarray(x.get_shape()[:-1].as_list(), dtype=np.int32)
+      m = np.prod(batch_shape).astype(np.int32)
+    else:
+      batch_shape = array_ops.shape(x)[:-1]
+      m = array_ops.reduce_prod(array_ops.shape(x)[:-1])
+    batch_ids = math_ops.range(m)
+
+    # Assemble the tril_ids into batch,tril_id pairs.
+    idx = array_ops.stack([
+        array_ops.tile(array_ops.expand_dims(batch_ids, 1), [1, n * n]),
+        array_ops.tile(array_ops.expand_dims(tril_ids(n), 0), [m, 1])
+    ])
+    idx = array_ops.transpose(idx, [1, 2, 0])
+
+    # Gather up, reshape, and return.
+    y = array_ops.reshape(x, [-1, d])
+    y = array_ops.gather_nd(y, idx)
+    y = array_ops.reshape(y, array_ops.concat([batch_shape, [n, n]], 0))
+    y = array_ops.matrix_band_part(y, -1, 0)
+    y.set_shape(y.get_shape().merge_with(final_shape))
+    return y
+
+
+# TODO(jvdillon): Merge this test back into:
+# tensorflow/python/ops/softplus_op_test.py
+# once TF core is accepting new ops.
+def softplus_inverse(x, name=None):
+  """Computes the inverse softplus, i.e., x = softplus_inverse(softplus(x)).
+
+  Mathematically this op is equivalent to:
+
+  ```none
+  softplus_inverse = log(exp(x) - 1.)
+  ```
+
+  Args:
+    x: `Tensor`. Non-negative (not enforced), floating-point.
+    name: A name for the operation (optional).
+
+  Returns:
+    `Tensor`. Has the same type/shape as input `x`.
+  """
+  with ops.name_scope(name, "softplus_inverse", values=[x]):
+    x = ops.convert_to_tensor(x, name="x")
+    # We begin by deriving a more numerically stable softplus_inverse:
+    # x = softplus(y) = Log[1 + exp{y}], (which means x > 0).
+    # ==> exp{x} = 1 + exp{y}                                (1)
+    # ==> y = Log[exp{x} - 1]                                (2)
+    #       = Log[(exp{x} - 1) / exp{x}] + Log[exp{x}]
+    #       = Log[(1 - exp{-x}) / 1] + Log[exp{x}]
+    #       = Log[1 - exp{-x}] + x                           (3)
+    # (2) is the "obvious" inverse, but (3) is more stable than (2) for large x.
+    # For small x (e.g. x = 1e-10), (3) will become -inf since 1 - exp{-x} will
+    # be zero. To fix this, we use 1 - exp{-x} approx x for small x > 0.
+    #
+    # In addition to the numerically stable derivation above, we clamp
+    # small/large values to be congruent with the logic in:
+    # tensorflow/core/kernels/softplus_op.h
+    #
+    # Finally, we set the input to one whenever the input is too large or too
+    # small. This ensures that no unchosen codepath is +/- inf. This is
+    # necessary to ensure the gradient doesn't get NaNs. Recall that the
+    # gradient of `where` behaves like `pred*pred_true + (1-pred)*pred_false`
+    # thus an `inf` in an unselected path results in `0*inf=nan`. We are careful
+    # to overwrite `x` with ones only when we will never actually use this
+    # value. Note that we use ones and not zeros since `log(expm1(0.)) = -inf`.
+    threshold = np.log(np.finfo(x.dtype.as_numpy_dtype).eps) + 2.
+    is_too_small = math_ops.less(x, np.exp(threshold))
+    is_too_large = math_ops.greater(x, -threshold)
+    too_small_value = math_ops.log(x)
+    too_large_value = x
+    # This `where` will ultimately be a NOP because we won't select this
+    # codepath whenever we used the surrogate `ones_like`.
+    x = array_ops.where(math_ops.logical_or(is_too_small, is_too_large),
+                        array_ops.ones_like(x), x)
+    y = x + math_ops.log(-math_ops.expm1(-x))  # == log(expm1(x))
+    return array_ops.where(is_too_small, too_small_value,
+                           array_ops.where(is_too_large, too_large_value, y))
+
+
+# TODO(b/35290280): Add unit-tests.
+def dimension_size(x, axis):
+  """Returns the size of a specific dimension."""
+  # Since tf.gather isn't "constant-in, constant-out", we must first check the
+  # static shape or fallback to dynamic shape.
+  num_rows = (None if x.get_shape().ndims is None
+              else x.get_shape()[axis].value)
+  if num_rows is not None:
+    return num_rows
+  return array_ops.shape(x)[axis]
+
+
+class AppendDocstring(object):
+  """Helper class to promote private subclass docstring to public counterpart.
+
+  Example:
+
+  ```python
+  class TransformedDistribution(Distribution):
+    @distribution_util.AppendDocstring(
+      additional_note="A special note!",
+      kwargs_dict={"foo": "An extra arg."})
+    def _prob(self, y, foo=None):
+      pass
+  ```
+
+  In this case, the `AppendDocstring` decorator appends the `additional_note` to
+  the docstring of `prob` (not `_prob`) and adds a new `kwargs`
+  section with each dictionary item as a bullet-point.
+
+  For a more detailed example, see `TransformedDistribution`.
+  """
+
+  def __init__(self, additional_note="", kwargs_dict=None):
+    """Initializes the AppendDocstring object.
+
+    Args:
+      additional_note: Python string added as additional docstring to public
+        version of function.
+      kwargs_dict: Python string/string dictionary representing
+        specific kwargs expanded from the **kwargs input.
+
+    Raises:
+      ValueError: if kwargs_dict.key contains whitespace.
+      ValueError: if kwargs_dict.value contains newlines.
+    """
+    self._additional_note = additional_note
+    if kwargs_dict:
+      bullets = []
+      for key in sorted(kwargs_dict.keys()):
+        value = kwargs_dict[key]
+        if any(x.isspace() for x in key):
+          raise ValueError(
+              "Parameter name \"%s\" contains whitespace." % key)
+        value = value.lstrip()
+        if "\n" in value:
+          raise ValueError(
+              "Parameter description for \"%s\" contains newlines." % key)
+        bullets.append("*  `%s`: %s" % (key, value))
+      self._additional_note += ("\n\n##### `kwargs`:\n\n" +
+                                "\n".join(bullets))
+
+  def __call__(self, fn):
+    @functools.wraps(fn)
+    def _fn(*args, **kwargs):
+      return fn(*args, **kwargs)
+    if _fn.__doc__ is None:
+      _fn.__doc__ = self._additional_note
+    else:
+      _fn.__doc__ += "\n%s" % self._additional_note
+    return _fn
diff --git a/tensorflow/python/ops/embedding_ops.py b/tensorflow/python/ops/embedding_ops.py
index 168ca7fefcc508947b6d34f74b923f7c9f48daf7..315e7d4b43cc5cecdd744d72b0187a61d4913b47 100644
--- a/tensorflow/python/ops/embedding_ops.py
+++ b/tensorflow/python/ops/embedding_ops.py
@@ -26,6 +26,8 @@ from tensorflow.python.framework import ops
 from tensorflow.python.framework import sparse_tensor
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import clip_ops
+# Imports gradient definitions.
+from tensorflow.python.ops import data_flow_grad  # pylint: disable=unused-import
 from tensorflow.python.ops import data_flow_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import resource_variable_ops
diff --git a/tensorflow/python/ops/gradients_impl.py b/tensorflow/python/ops/gradients_impl.py
index 2adf8f05d898780964659ac18a4d444d83d23e55..bd8a5c86acc7501b30489c78b714293ee14763c1 100644
--- a/tensorflow/python/ops/gradients_impl.py
+++ b/tensorflow/python/ops/gradients_impl.py
@@ -273,28 +273,6 @@ def _VerifyGeneratedGradients(grads, op):
   if len(grads) != len(op.inputs):
     raise ValueError("Num gradients %d generated for op %s do not match num "
                      "inputs %d" % (len(grads), op.node_def, len(op.inputs)))
-    for i in xrange(len(grads)):
-      grad = grads[i]
-      inp = op.inputs[i]
-      if grad is None:
-        continue
-      if grad.dtype.is_floating:
-        if not inp.dtype.is_floating:
-          raise TypeError("Gradient type %s generated for real-valued op %s "
-                           "with type %s must be real" %
-                           (dtypes.as_dtype(grad.dtype).name, op.node_def,
-                            dtypes.as_dtype(inp.dtype).name))
-      elif grad.dtype.is_complex:
-        if not inp.dtype.is_complex:
-          raise TypeError("Gradient type %s generated for complex-valued op %s"
-                           " with type %s must be complex" %
-                           (dtypes.as_dtype(grad.dtype).name, op.node_def,
-                            dtypes.as_dtype(inp.dtype).name))
-      else:
-        raise TypeError("Gradient type %s generated for op %s "
-                         "with type %s must be either real or complex" %
-                         (dtypes.as_dtype(grad.dtype).name, op.node_def,
-                          dtypes.as_dtype(inp.dtype).name))
 
 
 def _StopOps(from_ops, pending_count):
diff --git a/tensorflow/python/ops/hidden_ops.txt b/tensorflow/python/ops/hidden_ops.txt
index f302477068bcb7f267cbab970aae712174e26f1e..57a5b982abfc8211b25808c14d2aefb2bb889b66 100644
--- a/tensorflow/python/ops/hidden_ops.txt
+++ b/tensorflow/python/ops/hidden_ops.txt
@@ -63,16 +63,27 @@ GetSessionHandle
 GetSessionHandleV2
 GetSessionTensor
 HashTable
+HashTableV2
 InitializeTable
+InitializeTableV2
 InitializeTableFromTextFile
+InitializeTableFromTextFileV2
 LookupTableExport
+LookupTableExportV2
 LookupTableFind
+LookupTableFindV2
 LookupTableImport
+LookupTableImportV2
 LookupTableInsert
+LookupTableInsertV2
 LookupTableSize
+LookupTableSizeV2
 MutableDenseHashTable
+MutableDenseHashTableV2
 MutableHashTable
+MutableHashTableV2
 MutableHashTableOfTensors
+MutableHashTableOfTensorsV2
 Mutex
 MutexAcquire
 MutexRelease
@@ -220,6 +231,7 @@ BatchFFT3D
 BatchIFFT
 BatchIFFT2D
 BatchIFFT3D
+Bucketize
 Complex
 ComplexAbs
 Conj
@@ -318,6 +330,7 @@ SerializeSparse
 SparseAdd
 SparseAddGrad
 SparseConcat
+SparseCross
 SparseSplit
 SparseSelectLastK
 SparseReorder
diff --git a/tensorflow/python/ops/image_ops.py b/tensorflow/python/ops/image_ops.py
index c29ae26f04ed07cb44ba821e3a3c5296faa2840d..2aad1e15191bd2dc3f75a2ae3cfa39992963cb6c 100644
--- a/tensorflow/python/ops/image_ops.py
+++ b/tensorflow/python/ops/image_ops.py
@@ -59,6 +59,7 @@ See the @{$python/image} guide.
 @@per_image_standardization
 @@draw_bounding_boxes
 @@non_max_suppression
+@@non_max_suppression_v2
 @@sample_distorted_bounding_box
 @@total_variation
 """
diff --git a/tensorflow/python/ops/image_ops_impl.py b/tensorflow/python/ops/image_ops_impl.py
index 78621d3b5707a527c5ed930e9e6369c7e6f8b38a..ae7999a71e19cc0cf34a72d5f277df072e7ea8ee 100644
--- a/tensorflow/python/ops/image_ops_impl.py
+++ b/tensorflow/python/ops/image_ops_impl.py
@@ -32,9 +32,9 @@ from tensorflow.python.ops import clip_ops
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import gen_image_ops
 from tensorflow.python.ops import gen_nn_ops
-from tensorflow.python.ops import string_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import random_ops
+from tensorflow.python.ops import string_ops
 from tensorflow.python.ops import variables
 
 
@@ -52,6 +52,7 @@ ops.NotDifferentiable('SampleDistortedBoundingBox')
 # latent bugs here.
 ops.NotDifferentiable('ExtractGlimpse')
 ops.NotDifferentiable('NonMaxSuppression')
+ops.NotDifferentiable('NonMaxSuppressionV2')
 
 
 def _assert(cond, ex_type, msg):
@@ -218,7 +219,8 @@ def random_flip_up_down(image, seed=None):
     ValueError: if the shape of `image` not supported.
   """
   image = ops.convert_to_tensor(image, name='image')
-  _Check3DImage(image, require_static=False)
+  image = control_flow_ops.with_dependencies(
+      _Check3DImage(image, require_static=False), image)
   uniform_random = random_ops.random_uniform([], 0, 1.0, seed=seed)
   mirror_cond = math_ops.less(uniform_random, .5)
   result = control_flow_ops.cond(mirror_cond,
@@ -246,7 +248,8 @@ def random_flip_left_right(image, seed=None):
     ValueError: if the shape of `image` not supported.
   """
   image = ops.convert_to_tensor(image, name='image')
-  _Check3DImage(image, require_static=False)
+  image = control_flow_ops.with_dependencies(
+      _Check3DImage(image, require_static=False), image)
   uniform_random = random_ops.random_uniform([], 0, 1.0, seed=seed)
   mirror_cond = math_ops.less(uniform_random, .5)
   result = control_flow_ops.cond(mirror_cond,
@@ -273,7 +276,8 @@ def flip_left_right(image):
     ValueError: if the shape of `image` not supported.
   """
   image = ops.convert_to_tensor(image, name='image')
-  _Check3DImage(image, require_static=False)
+  image = control_flow_ops.with_dependencies(
+      _Check3DImage(image, require_static=False), image)
   return fix_image_flip_shape(image, array_ops.reverse(image, [1]))
 
 
@@ -295,7 +299,8 @@ def flip_up_down(image):
     ValueError: if the shape of `image` not supported.
   """
   image = ops.convert_to_tensor(image, name='image')
-  _Check3DImage(image, require_static=False)
+  image = control_flow_ops.with_dependencies(
+      _Check3DImage(image, require_static=False), image)
   return fix_image_flip_shape(image, array_ops.reverse(image, [0]))
 
 
@@ -312,7 +317,8 @@ def rot90(image, k=1, name=None):
   """
   with ops.name_scope(name, 'rot90', [image, k]) as scope:
     image = ops.convert_to_tensor(image, name='image')
-    _Check3DImage(image, require_static=False)
+    image = control_flow_ops.with_dependencies(
+        _Check3DImage(image, require_static=False), image)
     k = ops.convert_to_tensor(k, dtype=dtypes.int32, name='k')
     k.get_shape().assert_has_rank(0)
     k = math_ops.mod(k, 4)
@@ -350,7 +356,8 @@ def transpose_image(image):
     ValueError: if the shape of `image` not supported.
   """
   image = ops.convert_to_tensor(image, name='image')
-  _Check3DImage(image, require_static=False)
+  image = control_flow_ops.with_dependencies(
+      _Check3DImage(image, require_static=False), image)
   return array_ops.transpose(image, [1, 0, 2], name='transpose_image')
 
 
@@ -379,12 +386,14 @@ def central_crop(image, central_fraction):
     3-D float Tensor
   """
   image = ops.convert_to_tensor(image, name='image')
-  _Check3DImage(image, require_static=False)
   if central_fraction <= 0.0 or central_fraction > 1.0:
     raise ValueError('central_fraction must be within (0, 1]')
   if central_fraction == 1.0:
     return image
 
+  image = control_flow_ops.with_dependencies(
+      _Check3DImage(image, require_static=False), image)
+
   img_shape = array_ops.shape(image)
   depth = image.get_shape()[2]
   fraction_offset = int(1 / ((1 - central_fraction) / 2.0))
@@ -435,9 +444,6 @@ def pad_to_bounding_box(image, offset_height, offset_width, target_height,
   """
   image = ops.convert_to_tensor(image, name='image')
 
-  assert_ops = []
-  assert_ops += _CheckAtLeast3DImage(image, require_static=False)
-
   is_batch = True
   image_shape = image.get_shape()
   if image_shape.ndims == 3:
@@ -450,6 +456,8 @@ def pad_to_bounding_box(image, offset_height, offset_width, target_height,
   elif image_shape.ndims != 4:
     raise ValueError('\'image\' must have either 3 or 4 dimensions.')
 
+  assert_ops = _CheckAtLeast3DImage(image, require_static=False)
+
   batch, height, width, depth = _ImageDimensions(image, rank=4)
 
   after_padding_width = target_width - offset_width - width
@@ -515,9 +523,6 @@ def crop_to_bounding_box(image, offset_height, offset_width, target_height,
   """
   image = ops.convert_to_tensor(image, name='image')
 
-  assert_ops = []
-  assert_ops += _CheckAtLeast3DImage(image, require_static=False)
-
   is_batch = True
   image_shape = image.get_shape()
   if image_shape.ndims == 3:
@@ -530,6 +535,8 @@ def crop_to_bounding_box(image, offset_height, offset_width, target_height,
   elif image_shape.ndims != 4:
     raise ValueError('\'image\' must have either 3 or 4 dimensions.')
 
+  assert_ops = _CheckAtLeast3DImage(image, require_static=False)
+
   batch, height, width, depth = _ImageDimensions(image, rank=4)
 
   assert_ops += _assert(offset_width >= 0, ValueError,
@@ -602,8 +609,7 @@ def resize_image_with_crop_or_pad(image, target_height, target_width):
   elif image_shape.ndims != 4:
     raise ValueError('\'image\' must have either 3 or 4 dimensions.')
 
-  assert_ops = []
-  assert_ops += _CheckAtLeast3DImage(image, require_static=False)
+  assert_ops = _CheckAtLeast3DImage(image, require_static=False)
   assert_ops += _assert(target_width > 0, ValueError,
                         'target_width must be > 0.')
   assert_ops += _assert(target_height > 0, ValueError,
@@ -614,7 +620,7 @@ def resize_image_with_crop_or_pad(image, target_height, target_width):
   # Make sure our checks come first, so that error messages are clearer.
   if _is_tensor(target_height):
     target_height = control_flow_ops.with_dependencies(
-      assert_ops, target_height)
+        assert_ops, target_height)
   if _is_tensor(target_width):
     target_width = control_flow_ops.with_dependencies(assert_ops, target_width)
 
@@ -693,9 +699,12 @@ def resize_images(images,
 
   `method` can be one of:
 
-  *   <b>`ResizeMethod.BILINEAR`</b>: [Bilinear interpolation.](https://en.wikipedia.org/wiki/Bilinear_interpolation)
-  *   <b>`ResizeMethod.NEAREST_NEIGHBOR`</b>: [Nearest neighbor interpolation.](https://en.wikipedia.org/wiki/Nearest-neighbor_interpolation)
-  *   <b>`ResizeMethod.BICUBIC`</b>: [Bicubic interpolation.](https://en.wikipedia.org/wiki/Bicubic_interpolation)
+  *   <b>`ResizeMethod.BILINEAR`</b>: [Bilinear interpolation.](
+    https://en.wikipedia.org/wiki/Bilinear_interpolation)
+  *   <b>`ResizeMethod.NEAREST_NEIGHBOR`</b>: [Nearest neighbor interpolation.](
+    https://en.wikipedia.org/wiki/Nearest-neighbor_interpolation)
+  *   <b>`ResizeMethod.BICUBIC`</b>: [Bicubic interpolation.](
+    https://en.wikipedia.org/wiki/Bicubic_interpolation)
   *   <b>`ResizeMethod.AREA`</b>: Area interpolation.
 
   Args:
@@ -800,7 +809,8 @@ def per_image_standardization(image):
     ValueError: if the shape of 'image' is incompatible with this function.
   """
   image = ops.convert_to_tensor(image, name='image')
-  _Check3DImage(image, require_static=False)
+  image = control_flow_ops.with_dependencies(
+      _Check3DImage(image, require_static=False), image)
   num_pixels = math_ops.reduce_prod(array_ops.shape(image))
 
   image = math_ops.cast(image, dtype=dtypes.float32)
@@ -955,6 +965,7 @@ def adjust_contrast(images, contrast_factor):
 
 def adjust_gamma(image, gamma=1, gain=1):
   """Performs Gamma Correction on the input image.
+
     Also known as Power Law Transform. This function transforms the
     input image pixelwise according to the equation Out = In**gamma
     after scaling each pixel to the range 0 to 1.
@@ -967,6 +978,9 @@ def adjust_gamma(image, gamma=1, gain=1):
   Returns:
     A Tensor. Gamma corrected output image.
 
+  Raises:
+    ValueError: If gamma is negative.
+
   Notes:
     For gamma greater than 1, the histogram will shift towards left and
     the output image will be darker than the input image.
@@ -977,16 +991,17 @@ def adjust_gamma(image, gamma=1, gain=1):
     [1] http://en.wikipedia.org/wiki/Gamma_correction
   """
 
-  with ops.op_scope([image, gamma, gain], None, 'adjust_gamma') as name:
+  with ops.op_scope([image, gamma, gain], None, 'adjust_gamma'):
     # Convert pixel value to DT_FLOAT for computing adjusted image
     img = ops.convert_to_tensor(image, name='img', dtype=dtypes.float32)
     # Keep image dtype for computing the scale of corresponding dtype
     image = ops.convert_to_tensor(image, name='image')
 
     if gamma < 0:
-      raise ValueError("Gamma should be a non-negative real number")
+      raise ValueError('Gamma should be a non-negative real number')
     # scale = max(dtype) - min(dtype)
-    scale = constant_op.constant(image.dtype.limits[1] - image.dtype.limits[0], dtype=dtypes.float32)
+    scale = constant_op.constant(image.dtype.limits[1] - image.dtype.limits[0],
+                                 dtype=dtypes.float32)
     # According to the definition of gamma correction
     adjusted_img = (img / scale) ** gamma * scale * gain
 
@@ -1299,6 +1314,7 @@ def adjust_saturation(image, saturation_factor, name=None):
 
 def decode_image(contents, channels=None, name=None):
   """Convenience function for `decode_gif`, `decode_jpeg`, and `decode_png`.
+
   Detects whether an image is a GIF, JPEG, or PNG, and performs the appropriate
   operation to convert the input bytes `string` into a `Tensor` of type `uint8`.
 
@@ -1318,37 +1334,57 @@ def decode_image(contents, channels=None, name=None):
     `Tensor` with type `uint8` with shape `[height, width, num_channels]` for
       JPEG and PNG images and shape `[num_frames, height, width, 3]` for GIF
       images.
+
+  Raises:
+    ValueError: On incorrect number of channels.
   """
-  with ops.name_scope(name, 'decode_image') as scope:
-    if channels not in (None, 0, 1, 3):
-      raise ValueError('channels must be in (None, 0, 1, 3)')
-    substr = string_ops.substr(contents, 0, 4)
+  with ops.name_scope(name, 'decode_image'):
+    if channels not in (None, 0, 1, 3, 4):
+      raise ValueError('channels must be in (None, 0, 1, 3, 4)')
+    substr = string_ops.substr(contents, 0, 3)
 
     def _gif():
+      """Decodes a GIF image."""
       # Create assert op to check that bytes are GIF decodable
-      is_gif = math_ops.equal(substr, b'\x47\x49\x46\x38', name='is_gif')
+      is_gif = math_ops.equal(substr, b'\x47\x49\x46', name='is_gif')
       decode_msg = 'Unable to decode bytes as JPEG, PNG, or GIF'
       assert_decode = control_flow_ops.Assert(is_gif, [decode_msg])
       # Create assert to make sure that channels is not set to 1
       # Already checked above that channels is in (None, 0, 1, 3)
+
       gif_channels = 0 if channels is None else channels
-      good_channels = math_ops.not_equal(gif_channels, 1, name='check_channels')
+      good_channels = math_ops.logical_and(
+          math_ops.not_equal(gif_channels, 1, name='check_gif_channels'),
+          math_ops.not_equal(gif_channels, 4, name='check_gif_channels')
+      )
       channels_msg = 'Channels must be in (None, 0, 3) when decoding GIF images'
       assert_channels = control_flow_ops.Assert(good_channels, [channels_msg])
       with ops.control_dependencies([assert_decode, assert_channels]):
         return gen_image_ops.decode_gif(contents)
 
     def _png():
+      """Decodes a PNG image."""
       return gen_image_ops.decode_png(contents, channels)
 
     def check_png():
-      is_png = math_ops.equal(substr, b'\211PNG', name='is_png')
+      """Checks if an image is PNG."""
+      is_png = math_ops.equal(substr, b'\211PN', name='is_png')
       return control_flow_ops.cond(is_png, _png, _gif, name='cond_png')
 
     def _jpeg():
-      return gen_image_ops.decode_jpeg(contents, channels)
+      """Decodes a jpeg image."""
+      jpeg_channels = 0 if channels is None else channels
+      good_channels = math_ops.not_equal(jpeg_channels, 4,
+                                         name='check_jpeg_channels')
+      channels_msg = ('Channels must be in (None, 0, 1, 3) when decoding JPEG '
+                      'images')
+      assert_channels = control_flow_ops.Assert(good_channels, [channels_msg])
+      with ops.control_dependencies([assert_channels]):
+        return gen_image_ops.decode_jpeg(contents, channels)
 
-    is_jpeg = math_ops.equal(substr, b'\xff\xd8\xff\xe0', name='is_jpeg')
+    # Decode normal JPEG images (start with \xff\xd8\xff\xe0)
+    # as well as JPEG images with EXIF data (start with \xff\xd8\xff\xe1).
+    is_jpeg = math_ops.equal(substr, b'\xff\xd8\xff', name='is_jpeg')
     return control_flow_ops.cond(is_jpeg, _jpeg, check_png, name='cond_jpeg')
 
 
@@ -1416,7 +1452,7 @@ def total_variation(images, name=None):
 
     # Calculate the total variation by taking the absolute value of the
     # pixel-differences and summing over the appropriate axis.
-    tot_var = math_ops.reduce_sum(math_ops.abs(pixel_dif1), axis=sum_axis) + \
-              math_ops.reduce_sum(math_ops.abs(pixel_dif2), axis=sum_axis)
+    tot_var = (math_ops.reduce_sum(math_ops.abs(pixel_dif1), axis=sum_axis) +
+               math_ops.reduce_sum(math_ops.abs(pixel_dif2), axis=sum_axis))
 
   return tot_var
diff --git a/tensorflow/python/ops/image_ops_test.py b/tensorflow/python/ops/image_ops_test.py
index 887140c726ae587beb2417fcfc1361b41b44d90d..492dbe6d135db99059064f870782b1b5bdb3c67e 100644
--- a/tensorflow/python/ops/image_ops_test.py
+++ b/tensorflow/python/ops/image_ops_test.py
@@ -19,6 +19,7 @@ from __future__ import division
 from __future__ import print_function
 
 import colorsys
+import functools
 import math
 import os
 import time
@@ -1175,12 +1176,7 @@ class CropToBoundingBoxTest(test_util.TensorFlowTestCase):
     offset_height, offset_width = [0, 0]
     target_height, target_width = [2, 2]
 
-    for x_shape in ([3, 5],):
-      self._assertRaises(x, x_shape, offset_height, offset_width, target_height,
-                         target_width,
-                         "'image' must be at least three-dimensional.")
-
-    for x_shape in ([1, 3, 5, 1, 1],):
+    for x_shape in ([3, 5], [1, 3, 5, 1, 1]):
       self._assertRaises(x, x_shape, offset_height, offset_width, target_height,
                          target_width,
                          "'image' must have either 3 or 4 dimensions.")
@@ -1426,12 +1422,7 @@ class PadToBoundingBoxTest(test_util.TensorFlowTestCase):
     offset_height, offset_width = [0, 0]
     target_height, target_width = [2, 2]
 
-    for x_shape in ([3, 5],):
-      self._assertRaises(x, x_shape, offset_height, offset_width, target_height,
-                         target_width,
-                         "'image' must be at least three-dimensional")
-
-    for x_shape in ([1, 3, 5, 1, 1],):
+    for x_shape in ([3, 5], [1, 3, 5, 1, 1]):
       self._assertRaises(x, x_shape, offset_height, offset_width, target_height,
                          target_width,
                          "'image' must have either 3 or 4 dimensions.")
@@ -1652,8 +1643,8 @@ class ResizeImagesTest(test_util.TensorFlowTestCase):
     self.assertEqual(y.get_shape().as_list(), [None] + post_shape)
 
   def shouldRunOnGPU(self, opt, nptype):
-    if opt == image_ops.ResizeMethod.NEAREST_NEIGHBOR \
-            and nptype in [np.float32, np.float64]:
+    if (opt == image_ops.ResizeMethod.NEAREST_NEIGHBOR and
+        nptype in [np.float32, np.float64]):
       return True
     else:
       return False
@@ -1676,15 +1667,13 @@ class ResizeImagesTest(test_util.TensorFlowTestCase):
       img_np = np.array(data, dtype=nptype).reshape(img_shape)
 
       for opt in self.OPTIONS:
-        if test.is_gpu_available() and self.shouldRunOnGPU(opt, nptype):
-          with self.test_session(use_gpu=True) as sess:
-            image = constant_op.constant(img_np, shape=img_shape)
-            y = image_ops.resize_images(image, [target_height, target_width],
-                                        opt)
-            yshape = array_ops.shape(y)
-            resized, newshape = sess.run([y, yshape])
-            self.assertAllEqual(img_shape, newshape)
-            self.assertAllClose(resized, img_np, atol=1e-5)
+        with self.test_session(use_gpu=True) as sess:
+          image = constant_op.constant(img_np, shape=img_shape)
+          y = image_ops.resize_images(image, [target_height, target_width], opt)
+          yshape = array_ops.shape(y)
+          resized, newshape = sess.run([y, yshape])
+          self.assertAllEqual(img_shape, newshape)
+          self.assertAllClose(resized, img_np, atol=1e-5)
 
       # Resizing with a single image must leave the shape unchanged also.
       with self.test_session(use_gpu=True):
@@ -1822,7 +1811,7 @@ class ResizeImagesTest(test_util.TensorFlowTestCase):
               resized = y.eval()
               self.assertAllClose(resized, expected, atol=1e-5)
 
-  def testResizeUp(self):
+  def testResizeUpAlignCornersFalse(self):
     img_shape = [1, 3, 2, 1]
     data = [64, 32,
             32, 64,
@@ -1857,16 +1846,63 @@ class ResizeImagesTest(test_util.TensorFlowTestCase):
           image_ops.ResizeMethod.BILINEAR,
           image_ops.ResizeMethod.NEAREST_NEIGHBOR,
           image_ops.ResizeMethod.AREA]:
-        if test.is_gpu_available() and self.shouldRunOnGPU(opt, nptype):
-          with self.test_session(use_gpu=True):
-            img_np = np.array(data, dtype=nptype).reshape(img_shape)
-            image = constant_op.constant(img_np, shape=img_shape)
-            y = image_ops.resize_images(
-                image, [target_height, target_width], opt)
-            resized = y.eval()
-            expected = np.array(expected_data[opt]).reshape(
-                [1, target_height, target_width, 1])
-            self.assertAllClose(resized, expected, atol=1e-05)
+        with self.test_session(use_gpu=True):
+          img_np = np.array(data, dtype=nptype).reshape(img_shape)
+          image = constant_op.constant(img_np, shape=img_shape)
+          y = image_ops.resize_images(
+              image, [target_height, target_width], opt, align_corners=False)
+          resized = y.eval()
+          expected = np.array(expected_data[opt]).reshape(
+              [1, target_height, target_width, 1])
+          self.assertAllClose(resized, expected, atol=1e-05)
+
+  def testResizeUpAlignCornersTrue(self):
+    img_shape = [1, 3, 2, 1]
+    data = [6, 3,
+            3, 6,
+            6, 9]
+    target_height = 5
+    target_width = 4
+    expected_data = {}
+    expected_data[image_ops.ResizeMethod.BILINEAR] = [
+        6.0, 5.0, 4.0, 3.0,
+        4.5, 4.5, 4.5, 4.5,
+        3.0, 4.0, 5.0, 6.0,
+        4.5, 5.5, 6.5, 7.5,
+        6.0, 7.0, 8.0, 9.0
+    ]
+    expected_data[image_ops.ResizeMethod.NEAREST_NEIGHBOR] = [
+        6.0, 6.0, 3.0, 3.0,
+        3.0, 3.0, 6.0, 6.0,
+        3.0, 3.0, 6.0, 6.0,
+        6.0, 6.0, 9.0, 9.0,
+        6.0, 6.0, 9.0, 9.0
+    ]
+    # TODO(b/37749740): Improve alignment of ResizeMethod.AREA when
+    # align_corners=True.
+    expected_data[image_ops.ResizeMethod.AREA] = [
+        6.0, 6.0, 6.0, 3.0,
+        6.0, 6.0, 6.0, 3.0,
+        3.0, 3.0, 3.0, 6.0,
+        3.0, 3.0, 3.0, 6.0,
+        6.0, 6.0, 6.0, 9.0
+    ]
+
+    for nptype in self.TYPES:
+      for opt in [
+          image_ops.ResizeMethod.BILINEAR,
+          image_ops.ResizeMethod.NEAREST_NEIGHBOR,
+          image_ops.ResizeMethod.AREA
+      ]:
+        with self.test_session(use_gpu=True):
+          img_np = np.array(data, dtype=nptype).reshape(img_shape)
+          image = constant_op.constant(img_np, shape=img_shape)
+          y = image_ops.resize_images(
+              image, [target_height, target_width], opt, align_corners=True)
+          resized = y.eval()
+          expected = np.array(expected_data[opt]).reshape(
+              [1, target_height, target_width, 1])
+          self.assertAllClose(resized, expected, atol=1e-05)
 
   def testResizeUpBicubic(self):
     img_shape = [1, 6, 6, 1]
@@ -2757,5 +2793,37 @@ class TotalVariationTest(test_util.TensorFlowTestCase):
     self._test(multi, tot_var * np.array([1.0, 1.1, 1.2]))
 
 
+class FormatTest(test_util.TensorFlowTestCase):
+
+  def testFormats(self):
+    prefix = "tensorflow/core/lib"
+    paths = ("png/testdata/lena_gray.png", "jpeg/testdata/jpeg_merge_test1.jpg",
+             "gif/testdata/lena.gif")
+    decoders = {
+        "jpeg": functools.partial(image_ops.decode_jpeg, channels=3),
+        "png": functools.partial(image_ops.decode_png, channels=3),
+        "gif": lambda s: array_ops.squeeze(image_ops.decode_gif(s), axis=0),
+    }
+    with self.test_session():
+      for path in paths:
+        contents = io_ops.read_file(os.path.join(prefix, path)).eval()
+        images = {}
+        for name, decode in decoders.items():
+          image = decode(contents).eval()
+          self.assertEqual(image.ndim, 3)
+          for prev_name, prev in images.items():
+            print("path %s, names %s %s, shapes %s %s" %
+                  (path, name, prev_name, image.shape, prev.shape))
+            self.assertAllEqual(image, prev)
+          images[name] = image
+
+  def testError(self):
+    path = "tensorflow/core/lib/gif/testdata/scan.gif"
+    with self.test_session():
+      for decode in image_ops.decode_jpeg, image_ops.decode_png:
+        with self.assertRaisesOpError(r"Got 12 frames"):
+          decode(io_ops.read_file(path)).eval()
+
+
 if __name__ == "__main__":
   googletest.main()
diff --git a/tensorflow/python/ops/init_ops.py b/tensorflow/python/ops/init_ops.py
index 4d922946ffc80fc13efef1c9ba29e065a0b41e31..1e2f999995756ad4b4c432ddfc31c39254818622 100644
--- a/tensorflow/python/ops/init_ops.py
+++ b/tensorflow/python/ops/init_ops.py
@@ -39,6 +39,7 @@ from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import linalg_ops
+from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import random_ops
 from tensorflow.python.ops import math_ops
 
diff --git a/tensorflow/python/ops/linalg_ops.py b/tensorflow/python/ops/linalg_ops.py
index e2fd25675ec50f1d244fe85241918764b6d62208..9b6420317da6c5e88a4276b0fd32b9326b58b0c7 100644
--- a/tensorflow/python/ops/linalg_ops.py
+++ b/tensorflow/python/ops/linalg_ops.py
@@ -257,7 +257,9 @@ def svd(tensor, full_matrices=False, compute_uv=True, name=None):
     name: string, optional name of the operation.
 
   Returns:
-    s: Singular values. Shape is `[..., P]`.
+    s: Singular values. Shape is `[..., P]`. The values are sorted in reverse
+      order of magnitude, so s[..., 0] is the largest value, s[..., 1] is the
+      second largest, etc.
     u: Left singular vectors. If `full_matrices` is `False` (default) then
       shape is `[..., M, P]`; if `full_matrices` is `True` then shape is
       `[..., M, M]`. Not returned if `compute_uv` is `False`.
diff --git a/tensorflow/python/ops/logging_ops.py b/tensorflow/python/ops/logging_ops.py
index 9c49ef78896740a0a66e995f759e200b7ecf6cde..08e3f83a0b21a8444ad3500c62fe624440edc255 100644
--- a/tensorflow/python/ops/logging_ops.py
+++ b/tensorflow/python/ops/logging_ops.py
@@ -86,7 +86,7 @@ def histogram_summary(tag, values, collections=None, name=None):
   This ops is deprecated. Please switch to tf.summary.histogram.
 
   For an explanation of why this op was deprecated, and information on how to
-  migrate, look ['here'](https://www.tensorflow.org/code/tensorflow/contrib/deprecated/__init__.py)
+  migrate, look ['here'](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/contrib/deprecated/__init__.py)
 
   The generated
   [`Summary`](https://www.tensorflow.org/code/tensorflow/core/framework/summary.proto)
@@ -124,7 +124,7 @@ def image_summary(tag, tensor, max_images=3, collections=None, name=None):
   """Outputs a `Summary` protocol buffer with images.
 
   For an explanation of why this op was deprecated, and information on how to
-  migrate, look ['here'](https://www.tensorflow.org/code/tensorflow/contrib/deprecated/__init__.py)
+  migrate, look ['here'](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/contrib/deprecated/__init__.py)
 
   The summary has up to `max_images` summary values containing images. The
   images are built from `tensor` which must be 4-D with shape `[batch_size,
@@ -190,7 +190,7 @@ def audio_summary(tag,
 
   This op is deprecated. Please switch to tf.summary.audio.
   For an explanation of why this op was deprecated, and information on how to
-  migrate, look ['here'](https://www.tensorflow.org/code/tensorflow/contrib/deprecated/__init__.py)
+  migrate, look ['here'](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/contrib/deprecated/__init__.py)
 
   The summary has up to `max_outputs` summary values containing audio. The
   audio is built from `tensor` which must be 3-D with shape `[batch_size,
@@ -326,7 +326,7 @@ def scalar_summary(tags, values, collections=None, name=None):
 
   This ops is deprecated. Please switch to tf.summary.scalar.
   For an explanation of why this op was deprecated, and information on how to
-  migrate, look ['here'](https://www.tensorflow.org/code/tensorflow/contrib/deprecated/__init__.py)
+  migrate, look ['here'](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/contrib/deprecated/__init__.py)
 
   The input `tags` and `values` must have the same shape.  The generated
   summary has a summary value for each tag-value pair in `tags` and `values`.
diff --git a/tensorflow/python/ops/lookup_ops.py b/tensorflow/python/ops/lookup_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..82277ebaccbf32eb4e8935c97110301ccfb00d7c
--- /dev/null
+++ b/tensorflow/python/ops/lookup_ops.py
@@ -0,0 +1,1215 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#==============================================================================
+"""Lookup operations."""
+# pylint: disable=g-bad-name
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import collections
+import functools
+
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import sparse_tensor
+from tensorflow.python.framework import tensor_shape
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import gen_lookup_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import string_ops
+# go/tf-wildcard-import
+# pylint: disable=wildcard-import
+from tensorflow.python.ops.gen_lookup_ops import *
+# pylint: enable=wildcard-import
+from tensorflow.python.util import compat
+from tensorflow.python.util.deprecation import deprecated
+
+
+# TODO(yleon): Remove this function.
+@deprecated("2017-03-02", "Use `tf.tables_initializer` instead.")
+def initialize_all_tables(name="init_all_tables"):
+  """Returns an Op that initializes all tables of the default graph.
+
+  Args:
+    name: Optional name for the initialization op.
+
+  Returns:
+    An Op that initializes all tables.  Note that if there are
+    not tables the returned Op is a NoOp.
+  """
+  return tables_initializer(name)
+
+
+def tables_initializer(name="init_all_tables"):
+  """Returns an Op that initializes all tables of the default graph.
+
+  Args:
+    name: Optional name for the initialization op.
+
+  Returns:
+    An Op that initializes all tables.  Note that if there are
+    not tables the returned Op is a NoOp.
+  """
+  initializers = ops.get_collection(ops.GraphKeys.TABLE_INITIALIZERS)
+  if initializers:
+    return control_flow_ops.group(*initializers, name=name)
+  return control_flow_ops.no_op(name=name)
+
+
+def _check_table_dtypes(table, key_dtype, value_dtype):
+  """Check that the given key_dtype and value_dtype matches the table dtypes.
+
+  Args:
+    table: The table to check types against to.
+    key_dtype: The key data type to check.
+    value_dtype: The value data type to check.
+
+  Raises:
+    TypeError: when 'key_dtype' or 'value_dtype' doesn't match the table data
+      types.
+  """
+  if key_dtype != table.key_dtype:
+    raise TypeError("Invalid key dtype, expected %s but got %s." %
+                    (table.key_dtype, key_dtype))
+  if value_dtype != table.value_dtype:
+    raise TypeError("Invalid value dtype, expected %s but got %s." %
+                    (table.value_dtype, value_dtype))
+
+
+class LookupInterface(object):
+  """Represent a lookup table that persists across different steps."""
+
+  def __init__(self, key_dtype, value_dtype, name):
+    """Construct a lookup table interface.
+
+    Args:
+      key_dtype: The table key type.
+      value_dtype: The table value type.
+      name: A name for the operation (optional).
+    """
+    self._key_dtype = dtypes.as_dtype(key_dtype)
+    self._value_dtype = dtypes.as_dtype(value_dtype)
+    self._name = name
+
+  @property
+  def key_dtype(self):
+    """The table key dtype."""
+    return self._key_dtype
+
+  @property
+  def value_dtype(self):
+    """The table value dtype."""
+    return self._value_dtype
+
+  @property
+  def name(self):
+    """The name of the table."""
+    return self._name
+
+  @property
+  def init(self):
+    """The table initialization op."""
+    raise NotImplementedError
+
+  def size(self, name=None):
+    """Compute the number of elements in this table."""
+    raise NotImplementedError
+
+  def lookup(self, keys, name=None):
+    """Looks up `keys` in a table, outputs the corresponding values."""
+    raise NotImplementedError
+
+
+class InitializableLookupTableBase(LookupInterface):
+  """Initializable lookup table interface.
+
+  An initializable lookup tables persist across different steps.
+  """
+
+  def __init__(self, table_ref, default_value, initializer):
+    """Construct a table object from a table reference.
+
+    If requires a table initializer object (subclass of `TableInitializerBase`).
+    It provides the table key and value types, as well as the op to initialize
+    the table. The caller is responsible to execute the initialization op.
+
+    Args:
+      table_ref: The table reference, i.e. the output of the lookup table ops.
+      default_value: The value to use if a key is missing in the table.
+      initializer: The table initializer to use.
+    """
+    super(InitializableLookupTableBase,
+          self).__init__(initializer.key_dtype, initializer.value_dtype,
+                         table_ref.op.name.split("/")[-1])
+    self._table_ref = table_ref
+    self._default_value = ops.convert_to_tensor(
+        default_value, dtype=self._value_dtype)
+    self._default_value.get_shape().merge_with(tensor_shape.scalar())
+    self._init = initializer.initialize(self)
+
+  @property
+  def table_ref(self):
+    """Get the underlying table reference."""
+    return self._table_ref
+
+  @property
+  def default_value(self):
+    """The default value of the table."""
+    return self._default_value
+
+  @property
+  def init(self):
+    """The table initialization op."""
+    return self._init
+
+  def size(self, name=None):
+    """Compute the number of elements in this table.
+
+    Args:
+      name: A name for the operation (optional).
+
+    Returns:
+      A scalar tensor containing the number of elements in this table.
+    """
+    with ops.name_scope(name, "%s_Size" % self._name,
+                        [self._table_ref]) as scope:
+      # pylint: disable=protected-access
+      return gen_lookup_ops._lookup_table_size_v2(self._table_ref, name=scope)
+      # pylint: enable=protected-access
+
+  def lookup(self, keys, name=None):
+    """Looks up `keys` in a table, outputs the corresponding values.
+
+    The `default_value` is used for keys not present in the table.
+
+    Args:
+      keys: Keys to look up. May be either a `SparseTensor` or dense `Tensor`.
+      name: A name for the operation (optional).
+
+    Returns:
+      A `SparseTensor` if keys are sparse, otherwise a dense `Tensor`.
+
+    Raises:
+      TypeError: when `keys` or `default_value` doesn't match the table data
+        types.
+    """
+    key_tensor = keys
+    if isinstance(keys, sparse_tensor.SparseTensor):
+      key_tensor = keys.values
+
+    if keys.dtype != self._key_dtype:
+      raise TypeError("Signature mismatch. Keys must be dtype %s, got %s." %
+                      (self._key_dtype, keys.dtype))
+
+    with ops.name_scope(name, "%s_Lookup" % self._name,
+                        (self._table_ref, key_tensor,
+                         self._default_value)) as scope:
+      # pylint: disable=protected-access
+      values = gen_lookup_ops._lookup_table_find_v2(
+          self._table_ref, key_tensor, self._default_value, name=scope)
+      # pylint: enable=protected-access
+
+    values.set_shape(key_tensor.get_shape())
+    if isinstance(keys, sparse_tensor.SparseTensor):
+      return sparse_tensor.SparseTensor(keys.indices, values, keys.dense_shape)
+    else:
+      return values
+
+
+class HashTable(InitializableLookupTableBase):
+  """A generic hash table implementation.
+
+  Example usage:
+
+  ```python
+  table = tf.contrib.lookup.HashTable(
+      tf.contrib.lookup.KeyValueTensorInitializer(keys, values), -1)
+  out = table.lookup(input_tensor).
+  table.init.run()
+  print out.eval()
+  ```
+  """
+
+  def __init__(self, initializer, default_value, shared_name=None, name=None):
+    """Creates a non-initialized `HashTable` object.
+
+    Creates a table, the type of its keys and values are specified by the
+    initializer.
+    Before using the table you will have to initialize it. After initialization
+    the table will be immutable.
+
+    Args:
+      initializer: The table initializer to use. See `HashTable` kernel for
+        supported key and value types.
+      default_value: The value to use if a key is missing in the table.
+      shared_name: If non-empty, this table will be shared under
+        the given name across multiple sessions.
+      name: A name for the operation (optional).
+
+    Returns:
+      A `HashTable` object.
+    """
+    with ops.name_scope(name, "hash_table", (initializer,
+                                             default_value)) as scope:
+      # pylint: disable=protected-access
+      table_ref = gen_lookup_ops._hash_table_v2(
+          shared_name=shared_name,
+          key_dtype=initializer.key_dtype,
+          value_dtype=initializer.value_dtype,
+          name=scope)
+      # pylint: enable=protected-access
+
+      super(HashTable, self).__init__(table_ref, default_value, initializer)
+
+
+class TableInitializerBase(object):
+  """Base class for lookup table initializers."""
+
+  def __init__(self, key_dtype, value_dtype):
+    """Construct a table initializer object.
+
+    Args:
+      key_dtype: Type of the table keys.
+      value_dtype: Type of the table values.
+    """
+    self._key_dtype = dtypes.as_dtype(key_dtype)
+    self._value_dtype = dtypes.as_dtype(value_dtype)
+
+  @property
+  def key_dtype(self):
+    """The expected table key dtype."""
+    return self._key_dtype
+
+  @property
+  def value_dtype(self):
+    """The expected table value dtype."""
+    return self._value_dtype
+
+  def initialize(self, table):
+    """Returns the table initialization op."""
+    raise NotImplementedError
+
+
+class KeyValueTensorInitializer(TableInitializerBase):
+  """Table initializers given `keys` and `values` tensors."""
+
+  def __init__(self, keys, values, key_dtype=None, value_dtype=None, name=None):
+    """Constructs a table initializer object based on keys and values tensors.
+
+    Args:
+      keys: The tensor for the keys.
+      values: The tensor for the values.
+      key_dtype: The `keys` data type. Used when `keys` is a python array.
+      value_dtype: The `values` data type. Used when `values` is a python array.
+      name: A name for the operation (optional).
+    """
+    with ops.name_scope(name, "key_value_init", [keys, values]) as scope:
+      self._keys = ops.convert_to_tensor(keys, dtype=key_dtype, name="keys")
+      self._values = ops.convert_to_tensor(
+          values, dtype=value_dtype, name="values")
+      self._name = scope
+
+    super(KeyValueTensorInitializer, self).__init__(self._keys.dtype,
+                                                    self._values.dtype)
+
+  def initialize(self, table):
+    """Initializes the given `table` with `keys` and `values` tensors.
+
+    Args:
+      table: The table to initialize.
+
+    Returns:
+      The operation that initializes the table.
+
+    Raises:
+      TypeError: when the keys and values data types do not match the table
+      key and value data types.
+    """
+    _check_table_dtypes(table, self._keys.dtype, self._values.dtype)
+    with ops.name_scope(
+        self._name, values=(table.table_ref, self._keys,
+                            self._values)) as scope:
+      # pylint: disable=protected-access
+      init_op = gen_lookup_ops._initialize_table_v2(
+          table.table_ref, self._keys, self._values, name=scope)
+      # pylint: enable=protected-access
+    ops.add_to_collection(ops.GraphKeys.TABLE_INITIALIZERS, init_op)
+    return init_op
+
+
+class TextFileIndex(object):
+  WHOLE_LINE = -2
+  LINE_NUMBER = -1
+
+
+class TextFileInitializer(TableInitializerBase):
+  """Table initializers from a text file.
+
+  This initializer assigns one entry in the table for each line in the file.
+
+  The key and value type of the table to initialize is given by `key_dtype` and
+  `value_dtype`.
+
+  The key and value content to get from each line is specified by
+  the `key_index` and `value_index`.
+
+  * `TextFileIndex.LINE_NUMBER` means use the line number starting from zero,
+    expects data type int64.
+  * `TextFileIndex.WHOLE_LINE` means use the whole line content, expects data
+    type string.
+  * A value `>=0` means use the index (starting at zero) of the split line based
+      on `delimiter`.
+
+  For example if we have a file with the following content:
+
+  ```
+  emerson 10
+  lake 20
+  palmer 30
+  ```
+
+  The following snippet initializes a table with the first column as keys and
+  second column as values:
+
+  * `emerson -> 10`
+  * `lake -> 20`
+  * `palmer -> 30`
+
+  ```python
+  table = tf.contrib.lookup.HashTable(tf.contrib.lookup.TextFileInitializer(
+      "test.txt", tf.string, 0, tf.int64, 1, delimiter=" "), -1)
+  ...
+  table.init.run()
+  ```
+
+  Similarly to initialize the whole line as keys and the line number as values.
+
+  * `emerson 10 -> 0`
+  * `lake 20 -> 1`
+  * `palmer 30 -> 2`
+
+  ```python
+  table = tf.contrib.lookup.HashTable(tf.contrib.lookup.TextFileInitializer(
+      "test.txt", tf.string, tf.contrib.lookup.TextFileIndex.WHOLE_LINE,
+      tf.int64, tf.contrib.lookup.TextFileIndex.LINE_NUMBER, delimiter=" "), -1)
+  ...
+  table.init.run()
+  ```
+  """
+
+  def __init__(self,
+               filename,
+               key_dtype,
+               key_index,
+               value_dtype,
+               value_index,
+               vocab_size=None,
+               delimiter="\t",
+               name=None):
+    """Constructs a table initializer object to populate from a text file.
+
+    It generates one key-value pair per line. The type of table key and
+    value are specified by `key_dtype` and `value_dtype`, respectively.
+    Similarly the content of the key and value are specified by the key_index
+    and value_index.
+
+    - TextFileIndex.LINE_NUMBER means use the line number starting from zero,
+      expects data type int64.
+    - TextFileIndex.WHOLE_LINE means use the whole line content, expects data
+      type string.
+    - A value >=0 means use the index (starting at zero) of the split line based
+      on `delimiter`.
+
+    Args:
+      filename: The filename of the text file to be used for initialization.
+        The path must be accessible from wherever the graph is initialized
+        (eg. trainer or eval workers). The filename may be a scalar `Tensor`.
+      key_dtype: The `key` data type.
+      key_index: the index that represents information of a line to get the
+        table 'key' values from.
+      value_dtype: The `value` data type.
+      value_index: the index that represents information of a line to get the
+        table 'value' values from.'
+      vocab_size: The number of elements in the file, if known.
+      delimiter: The delimiter to separate fields in a line.
+      name: A name for the operation (optional).
+
+    Raises:
+      ValueError: when the filename is empty, or when the table key and value
+      data types do not match the expected data types.
+    """
+    if not isinstance(filename, ops.Tensor) and not filename:
+      raise ValueError("Filename required for %s." % name)
+
+    key_dtype = dtypes.as_dtype(key_dtype)
+    value_dtype = dtypes.as_dtype(value_dtype)
+
+    if key_index < -2:
+      raise ValueError("Invalid key index %s." % (key_index))
+
+    if key_index == TextFileIndex.LINE_NUMBER and key_dtype != dtypes.int64:
+      raise ValueError("Signature mismatch. Keys must be dtype %s, got %s." %
+                       (dtypes.int64, key_dtype))
+    if ((key_index == TextFileIndex.WHOLE_LINE) and
+        (not key_dtype.is_integer) and (key_dtype != dtypes.string)):
+      raise ValueError(
+          "Signature mismatch. Keys must be integer or string, got %s." %
+          key_dtype)
+    if value_index < -2:
+      raise ValueError("Invalid value index %s." % (value_index))
+
+    if value_index == TextFileIndex.LINE_NUMBER and value_dtype != dtypes.int64:
+      raise ValueError("Signature mismatch. Values must be dtype %s, got %s." %
+                       (dtypes.int64, value_dtype))
+    if value_index == TextFileIndex.WHOLE_LINE and value_dtype != dtypes.string:
+      raise ValueError("Signature mismatch. Values must be dtype %s, got %s." %
+                       (dtypes.string, value_dtype))
+
+    if (vocab_size is not None) and (vocab_size <= 0):
+      raise ValueError("Invalid vocab_size %s." % vocab_size)
+
+    self._filename = filename
+    self._key_index = key_index
+    self._value_index = value_index
+    self._vocab_size = vocab_size
+    self._delimiter = delimiter
+    self._name = name
+
+    super(TextFileInitializer, self).__init__(key_dtype, value_dtype)
+
+  def initialize(self, table):
+    """Initializes the table from a text file.
+
+    Args:
+      table: The table to be initialized.
+
+    Returns:
+      The operation that initializes the table.
+
+    Raises:
+      TypeError: when the keys and values data types do not match the table
+      key and value data types.
+    """
+    _check_table_dtypes(table, self.key_dtype, self.value_dtype)
+    with ops.name_scope(self._name, "text_file_init",
+                        (table.table_ref,)) as scope:
+      filename = ops.convert_to_tensor(
+          self._filename, dtypes.string, name="asset_filepath")
+      # pylint: disable=protected-access
+      init_op = gen_lookup_ops._initialize_table_from_text_file_v2(
+          table.table_ref,
+          filename,
+          self._key_index,
+          self._value_index,
+          -1 if self._vocab_size is None else self._vocab_size,
+          self._delimiter,
+          name=scope)
+      # pylint: enable=protected-access
+    ops.add_to_collection(ops.GraphKeys.TABLE_INITIALIZERS, init_op)
+    ops.add_to_collection(ops.GraphKeys.ASSET_FILEPATHS, filename)
+    return init_op
+
+
+class TextFileStringTableInitializer(TextFileInitializer):
+  """Table initializer for `int64` IDs to string tables from a text file."""
+
+  def __init__(self,
+               filename,
+               key_column_index=TextFileIndex.LINE_NUMBER,
+               value_column_index=TextFileIndex.WHOLE_LINE,
+               vocab_size=None,
+               delimiter="\t",
+               name="text_file_string_table_init"):
+    """Constructs an initializer for an id-to-string table from a text file.
+
+    It populates a table that its key and value types are int64 and string,
+    respectively. It generates one key-value pair per line.
+    The content of the key and value are specified by `key_column_index`
+    and `value_column_index`.
+
+    - TextFileIndex.LINE_NUMBER means use the line number starting from zero,
+      expects data type int64.
+    - TextFileIndex.WHOLE_LINE means use the whole line content, expects data
+      type string.
+    - A value >=0 means use the index (starting at zero) of the split line based
+      on `delimiter`.
+
+    Args:
+      filename: The filename of the text file to be used for initialization.
+        The path must be accessible from wherever the graph is initialized
+        (eg. trainer or eval workers). The filename may be a scalar `Tensor`.
+      key_column_index: The column index from the text file to get the keys
+        from. The default is 0 that represents the whole line content.
+      value_column_index: The column index from the text file to get the
+        values from. The default is to use the line number, starting from zero.
+      vocab_size: The number of elements in the file, if known.
+      delimiter: The delimiter to separate fields in a line.
+      name: Optional name for the op.
+
+    Raises:
+      TypeError: when the filename is empty, or when the table key and value
+      data types do not match the expected data types.
+    """
+    super(TextFileStringTableInitializer, self).__init__(
+        filename,
+        dtypes.int64,
+        key_column_index,
+        dtypes.string,
+        value_column_index,
+        vocab_size=vocab_size,
+        delimiter=delimiter,
+        name=name)
+
+
+class TextFileIdTableInitializer(TextFileInitializer):
+  """Table initializer for string to `int64` IDs tables from a text file."""
+
+  def __init__(self,
+               filename,
+               key_column_index=TextFileIndex.WHOLE_LINE,
+               value_column_index=TextFileIndex.LINE_NUMBER,
+               vocab_size=None,
+               delimiter="\t",
+               name="text_file_id_table_init",
+               key_dtype=dtypes.string):
+    """Constructs an initializer for an string-to-id table from a text file.
+
+    It populates a table that its key and value types are string and int64,
+    respectively. It generates one key-value pair per line.
+    The content of the key and value are specified by the key_index
+    and value_index.
+
+    - TextFileIndex.LINE_NUMBER means use the line number starting from zero,
+      expects data type int64.
+    - TextFileIndex.WHOLE_LINE means use the whole line content, expects data
+      type string.
+    - A value >=0 means use the index (starting at zero) of the split line based
+      on `delimiter`.
+
+    Args:
+      filename: The filename of the text file to be used for initialization.
+        The path must be accessible from wherever the graph is initialized
+        (eg. trainer or eval workers). The filename may be a scalar `Tensor`.
+      key_column_index: The column index from the text file to get the `key`
+        values from. The default is to use the line number, starting from zero.
+      value_column_index: The column index from the text file ro get the `value`
+        values from. The default is 0 that represents the whole line content.
+      vocab_size: The number of elements in the file, if known.
+      delimiter: The delimiter to separate fields in a line.
+      name: Optional name for the op.
+      key_dtype: The `key` data type.
+
+    Raises:
+      TypeError: when the filename is empty, or when the table key and value
+      data types do not match the expected data types.
+    """
+    super(TextFileIdTableInitializer, self).__init__(
+        filename,
+        key_dtype,
+        key_column_index,
+        dtypes.int64,
+        value_column_index,
+        vocab_size=vocab_size,
+        delimiter=delimiter,
+        name=name)
+
+
+class HasherSpec(collections.namedtuple("HasherSpec", ["hasher", "key"])):
+  """A structure for the spec of the hashing function to use for hash buckets.
+
+  `hasher` is the name of the hashing function to use (eg. "fasthash",
+  "stronghash").
+  `key` is optional and specify the key to use for the hash function if
+  supported, currently only used by a strong hash.
+
+  Fields:
+    hasher: The hasher name to use.
+    key: The key to be used by the hashing function, if required.
+  """
+  __slots__ = ()
+
+
+FastHashSpec = HasherSpec("fasthash", None)  # pylint: disable=invalid-name
+
+
+class StrongHashSpec(HasherSpec):
+  """A structure to specify a key of the strong keyed hash spec.
+
+  The strong hash requires a `key`, which is a list of 2 unsigned integer
+  numbers. These should be non-zero; random numbers generated from random.org
+  would be a fine choice.
+
+  Fields:
+    key: The key to be used by the keyed hashing function.
+  """
+  __slots__ = ()
+
+  def __new__(cls, key):
+    if len(key) != 2:
+      raise ValueError("key must have size 2, got %s." % len(key))
+
+    if not isinstance(key[0], compat.integral_types) or not isinstance(
+        key[1], compat.integral_types):
+      raise TypeError("Invalid key %s. Must be unsigned integer values." % key)
+
+    return super(cls, StrongHashSpec).__new__(cls, "stronghash", key)
+
+
+def _as_string(tensor):
+  if dtypes.string == tensor.dtype.base_dtype:
+    return tensor
+  return string_ops.as_string(tensor)
+
+
+class IdTableWithHashBuckets(LookupInterface):
+  """String to Id table wrapper that assigns out-of-vocabulary keys to buckets.
+
+  For example, if an instance of `IdTableWithHashBuckets` is initialized with a
+  string-to-id table that maps:
+  - emerson -> 0
+  - lake -> 1
+  - palmer -> 2
+
+  The `IdTableWithHashBuckets` object will performs the following mapping:
+  - emerson -> 0
+  - lake -> 1
+  - palmer -> 2
+  - <other term> -> bucket id between 3 and 3 + num_oov_buckets, calculated by:
+    hash(<term>) % num_oov_buckets + vocab_size
+
+  If input_tensor is ["emerson", "lake", "palmer", "king", "crimson"],
+  the lookup result is [0, 1, 2, 4, 7]
+
+  If `table` is None, only out-of-vocabulary buckets are used.
+
+  Example usage:
+
+  ```python
+  num_oov_buckets = 3
+  input_tensor = tf.constant(["emerson", "lake", "palmer", "king", "crimnson"])
+  table = tf.IdTableWithHashBuckets(
+      tf.HashTable(tf.TextFileIdTableInitializer(filename), default_value),
+      num_oov_buckets)
+  out = table.lookup(input_tensor).
+  table.init.run()
+  print out.eval()
+  ```
+
+  The hash function used for generating out-of-vocabulary buckets ID is handled
+  by `hasher_spec`.
+  """
+
+  def __init__(self,
+               table,
+               num_oov_buckets,
+               hasher_spec=FastHashSpec,
+               name=None,
+               key_dtype=None):
+    """Construct a `IdTableWithHashBuckets` object.
+
+    Args:
+      table: Table that maps `tf.string` or `tf.int64` keys to `tf.int64` ids.
+      num_oov_buckets: Number of buckets to use for out-of-vocabulary keys.
+      hasher_spec: A `HasherSpec` to specify the hash function to use for
+        assignation of out-of-vocabulary buckets  (optional).
+      name: A name for the operation (optional).
+      key_dtype: Data type of keys passed to `lookup`. Defaults to
+        `table.key_dtype` if `table` is specified, otherwise `tf.string`.
+        Must be string or integer, and must be castable to `table.key_dtype`.
+
+    Raises:
+      ValueError: when `table` in None and `num_oov_buckets` is not positive.
+      TypeError: when `hasher_spec` is invalid.
+    """
+    # If a name ends with a '/' it is a "name scope", remove all trailing '/'
+    # characters to use as table name.
+    if name:
+      name = name.rstrip("/")
+    if table:
+      if key_dtype is None:
+        key_dtype = table.key_dtype
+      supported_table_key_dtypes = (dtypes.int64, dtypes.string)
+      if table.key_dtype not in supported_table_key_dtypes:
+        raise TypeError("Invalid key dtype, expected one of %s, but got %s." %
+                        (supported_table_key_dtypes, key_dtype))
+      if table.key_dtype.is_integer != key_dtype.is_integer:
+        raise TypeError("Invalid key dtype, expected %s but got %s." %
+                        ("integer" if key_dtype.is_integer else "non-integer",
+                         table.key_dtype))
+      if table.value_dtype != dtypes.int64:
+        raise TypeError("Invalid value dtype, expected %s but got %s." %
+                        (dtypes.int64, table.value_dtype))
+      self._table = table
+      name = name or self._table.name
+    else:
+      if num_oov_buckets <= 0:
+        raise ValueError("oov_buckets must be > 0 if no table is supplied.")
+      key_dtype = dtypes.string if key_dtype is None else key_dtype
+      self._table = None
+      name = name or "hash_bucket"
+    if (not key_dtype.is_integer) and (dtypes.string != key_dtype):
+      raise TypeError(
+          "Invalid key_dtype, expected integer or string, got %s." % key_dtype)
+    self._num_oov_buckets = num_oov_buckets
+
+    if not isinstance(hasher_spec, HasherSpec):
+      raise TypeError(
+          "hasher_spec must be of type HasherSpec, got %s" % hasher_spec)
+    self._hasher_spec = hasher_spec
+    super(IdTableWithHashBuckets, self).__init__(key_dtype, dtypes.int64,
+                                                 name.split("/")[-1])
+
+  @property
+  def init(self):
+    """The table initialization op."""
+    if self._table:
+      return self._table.init
+    with ops.name_scope(None, "init"):
+      return control_flow_ops.no_op()
+
+  def size(self, name=None):
+    """Compute the number of elements in this table."""
+    with ops.name_scope(name, "%s_Size" % self.name) as scope:
+      if self._table:
+        tsize = self._table.size(scope)
+      else:
+        tsize = ops.convert_to_tensor(0, dtype=dtypes.int64)
+      return tsize + self._num_oov_buckets
+
+  def _get_string_to_hash_bucket_fn(self, hasher_spec):
+    """Returns the string_to_hash_bucket op to use based on `hasher_spec`."""
+    if not isinstance(hasher_spec, HasherSpec):
+      raise TypeError("hasher_spec must be of type HasherSpec %s" % hasher_spec)
+    if hasher_spec.hasher == "fasthash":
+      return string_ops.string_to_hash_bucket_fast
+    if hasher_spec.hasher == "legacy":
+      return string_ops.string_to_hash_bucket
+    if hasher_spec.hasher == "stronghash":
+      return functools.partial(
+          string_ops.string_to_hash_bucket_strong, key=hasher_spec.key)
+    raise ValueError("Unknown hasher %s" % hasher_spec.hasher)
+
+  def lookup(self, keys, name=None):
+    """Looks up `keys` in the table, outputs the corresponding values.
+
+    It assigns out-of-vocabulary keys to buckets based in their hashes.
+
+    Args:
+      keys: Keys to look up. May be either a `SparseTensor` or dense `Tensor`.
+      name: Optional name for the op.
+
+    Returns:
+      A `SparseTensor` if keys are sparse, otherwise a dense `Tensor`.
+
+    Raises:
+      TypeError: when `keys` doesn't match the table key data type.
+    """
+    if keys.dtype != self._key_dtype:
+      raise TypeError("Signature mismatch. Keys must be dtype %s, got %s." %
+                      (self._key_dtype, keys.dtype))
+    values = keys
+    if isinstance(keys, sparse_tensor.SparseTensor):
+      values = keys.values
+    if self._table and (self._table.key_dtype.base_dtype == dtypes.int64):
+      values = math_ops.to_int64(values)
+
+    if self._num_oov_buckets == 0:
+      ids = self._table.lookup(values, name=name)
+    else:
+      # TODO(yleon): Consider moving this functionality to its own kernel.
+      with ops.name_scope(name, "%s_Lookup" % self.name) as scope:
+        str_to_hash_bucket = self._get_string_to_hash_bucket_fn(
+            self._hasher_spec)
+        buckets = str_to_hash_bucket(
+            _as_string(values),
+            num_buckets=self._num_oov_buckets,
+            name="hash_bucket")
+        if self._table:
+          ids = self._table.lookup(values)
+          buckets = math_ops.add(buckets, self._table.size())
+          is_id_non_default = math_ops.not_equal(ids, self._table.default_value)
+          ids = array_ops.where(is_id_non_default, ids, buckets, name=scope)
+        else:
+          ids = buckets
+    if isinstance(keys, sparse_tensor.SparseTensor):
+      return sparse_tensor.SparseTensor(keys.indices, ids, keys.dense_shape)
+    return ids
+
+
+def index_table_from_file(vocabulary_file=None,
+                          num_oov_buckets=0,
+                          vocab_size=None,
+                          default_value=-1,
+                          hasher_spec=FastHashSpec,
+                          key_dtype=dtypes.string,
+                          name=None):
+  """Returns a lookup table that converts a string tensor into int64 IDs.
+
+  This operation constructs a lookup table to convert tensor of strings into
+  int64 IDs. The mapping can be initialized from a vocabulary file specified in
+  `vocabulary_file`, where the whole line is the key and the zero-based line
+  number is the ID.
+
+  Any lookup of an out-of-vocabulary token will return a bucket ID based on its
+  hash if `num_oov_buckets` is greater than zero. Otherwise it is assigned the
+  `default_value`.
+  The bucket ID range is `[vocabulary size, vocabulary size + num_oov_buckets]`.
+
+  The underlying table must be initialized by calling
+  `tf.tables_initializer.run()` or `table.init.run()` once.
+
+  Sample Usages:
+
+  If we have a vocabulary file "test.txt" with the following content:
+
+  ```
+  emerson
+  lake
+  palmer
+  ```
+
+  ```python
+  features = tf.constant(["emerson", "lake", "and", "palmer"])
+  table = tf.contrib.lookup.index_table_from_file(
+      vocabulary_file="test.txt", num_oov_buckets=1)
+  ids = table.lookup(features)
+  ...
+  tf.tables_initializer().run()
+
+  ids.eval()  ==> [0, 1, 3, 2]  # where 3 is the out-of-vocabulary bucket
+  ```
+
+  Args:
+    vocabulary_file: The vocabulary filename.
+    num_oov_buckets: The number of out-of-vocabulary buckets.
+    vocab_size: Number of the elements in the vocabulary, if known.
+    default_value: The value to use for out-of-vocabulary feature values.
+      Defaults to -1.
+    hasher_spec: A `HasherSpec` to specify the hash function to use for
+      assignation of out-of-vocabulary buckets.
+    key_dtype: The `key` data type.
+    name: A name for this op (optional).
+
+  Returns:
+    The lookup table to map a `key_dtype` `Tensor` to index `int64` `Tensor`.
+
+  Raises:
+    ValueError: If `vocabulary_file` is not set.
+    ValueError: If `num_oov_buckets` is negative or `vocab_size` is not greater
+      than zero.
+  """
+  if not vocabulary_file:
+    raise ValueError("vocabulary_file must be specified.")
+  if num_oov_buckets < 0:
+    raise ValueError("num_oov_buckets must be greater or equal than 0, got %d."
+                     % num_oov_buckets)
+  if vocab_size is not None and vocab_size < 1:
+    raise ValueError("vocab_size must be greater than 0, got %d." % vocab_size)
+  if (not key_dtype.is_integer) and (dtypes.string != key_dtype.base_dtype):
+    raise TypeError("Only integer and string keys are supported.")
+
+  with ops.name_scope(name, "string_to_index") as feat_to_id_scope:
+    table = None
+    shared_name = ""
+    with ops.name_scope(None, "hash_table") as hash_table_scope:
+      if vocab_size:
+        # Keep the shared_name:
+        # <table_type>_<filename>_<vocab_size>_<key_index>_<value_index>
+        shared_name = "hash_table_%s_%d_%s_%s" % (vocabulary_file, vocab_size,
+                                                  TextFileIndex.WHOLE_LINE,
+                                                  TextFileIndex.LINE_NUMBER)
+      else:
+        # Keep the shared_name
+        # <table_type>_<filename>_<key_index>_<value_index>
+        shared_name = "hash_table_%s_%s_%s" % (vocabulary_file,
+                                               TextFileIndex.WHOLE_LINE,
+                                               TextFileIndex.LINE_NUMBER)
+      init = TextFileIdTableInitializer(
+          vocabulary_file,
+          vocab_size=vocab_size,
+          key_dtype=dtypes.int64 if key_dtype.is_integer else key_dtype,
+          name="table_init")
+
+      table = HashTable(
+          init, default_value, shared_name=shared_name, name=hash_table_scope)
+    if num_oov_buckets:
+      table = IdTableWithHashBuckets(
+          table,
+          num_oov_buckets=num_oov_buckets,
+          hasher_spec=hasher_spec,
+          name=feat_to_id_scope,
+          key_dtype=key_dtype)
+
+    return table
+
+
+def index_table_from_tensor(vocabulary_list,
+                            num_oov_buckets=0,
+                            default_value=-1,
+                            hasher_spec=FastHashSpec,
+                            dtype=dtypes.string,
+                            name=None):
+  """Returns a lookup table that converts a string tensor into int64 IDs.
+
+  This operation constructs a lookup table to convert tensor of strings into
+  int64 IDs. The mapping can be initialized from a string `vocabulary_list` 1-D
+  tensor where each element is a key and corresponding index within the tensor
+  is the value.
+
+  Any lookup of an out-of-vocabulary token will return a bucket ID based on its
+  hash if `num_oov_buckets` is greater than zero. Otherwise it is assigned the
+  `default_value`.
+  The bucket ID range is `[mapping size, mapping size + num_oov_buckets]`.
+
+  The underlying table must be initialized by calling
+  `tf.tables_initializer.run()` or `table.init.run()` once.
+
+  Elements in `mapping` cannot have duplicates, otherwise when executing the
+  table initializer op, it will throw a `FailedPreconditionError`.
+
+  Sample Usages:
+
+  ```python
+  vocabulary_list = t.constant(["emerson", "lake", "palmer")
+  table = tf.contrib.lookup.index_table_from_tensor(
+      vocabulary_list=vocabulary_list, num_oov_buckets=1, default_value=-1)
+  features = tf.constant(["emerson", "lake", "and", "palmer"])
+  ids = table.lookup(features)
+  ...
+  tf.tables_initializer().run()
+
+  ids.eval()  ==> [0, 1, 4, 2]
+  ```
+
+  Args:
+    vocabulary_list: A 1-D `Tensor` that specifies the mapping of keys to
+      indices. Thetype of this object must be castable to `dtype`.
+    num_oov_buckets: The number of out-of-vocabulary buckets.
+    default_value: The value to use for out-of-vocabulary feature values.
+      Defaults to -1.
+    hasher_spec: A `HasherSpec` to specify the hash function to use for
+      assignment of out-of-vocabulary buckets.
+    dtype: The type of values passed to `lookup`. Only string and integers are
+      supported.
+    name: A name for this op (optional).
+
+  Returns:
+    The lookup table to map an input `Tensor` to index `int64` `Tensor`.
+
+  Raises:
+    ValueError: If `mapping` is invalid.
+    ValueError: If `num_oov_buckets` is negative.
+  """
+  if vocabulary_list is None:
+    raise ValueError("vocabulary_list must be specified.")
+
+  if num_oov_buckets < 0:
+    raise ValueError("num_oov_buckets must be greater or equal than 0, got %d."
+                     % num_oov_buckets)
+
+  if (not dtype.is_integer) and (dtypes.string != dtype.base_dtype):
+    raise TypeError("Only integer and string keys are supported.")
+
+  with ops.name_scope(name, "string_to_index") as feat_to_id_scope:
+    keys = ops.convert_to_tensor(vocabulary_list)
+    if keys.dtype.is_integer != dtype.is_integer:
+      raise ValueError("Expected %s, got %s." %
+                       ("integer"
+                        if dtype.is_integer else "non-integer", keys.dtype))
+    if (not dtype.is_integer) and (keys.dtype.base_dtype != dtype):
+      raise ValueError("Expected %s, got %s." % (dtype, keys.dtype))
+    num_elements = array_ops.size(keys)
+    values = math_ops.to_int64(math_ops.range(num_elements))
+
+    shared_name = ""
+    with ops.name_scope(None, "hash_table") as hash_table_scope:
+      table_keys = math_ops.to_int64(keys) if keys.dtype.is_integer else keys
+      init = KeyValueTensorInitializer(
+          table_keys,
+          values,
+          table_keys.dtype.base_dtype,
+          dtypes.int64,
+          name="table_init")
+      table = HashTable(
+          init, default_value, shared_name=shared_name, name=hash_table_scope)
+    if num_oov_buckets:
+      table = IdTableWithHashBuckets(
+          table,
+          num_oov_buckets=num_oov_buckets,
+          hasher_spec=hasher_spec,
+          name=feat_to_id_scope,
+          key_dtype=dtype)
+
+    return table
+
+
+def index_to_string_table_from_file(vocabulary_file,
+                                    vocab_size=None,
+                                    default_value="UNK",
+                                    name=None):
+  """Returns a lookup table that maps a `Tensor` of indices into strings.
+
+  This operation constructs a lookup table to map int64 indices into string
+  values. The table is initialized from a vocabulary file specified in
+  `vocabulary_file`, where the whole line is the value and the
+  zero-based line number is the index.
+
+  Any input which does not have a corresponding index in the vocabulary file
+  (an out-of-vocabulary entry) is assigned the `default_value`
+
+  The underlying table must be initialized by calling
+  `tf.tables_initializer.run()` or `table.init.run()` once.
+
+  Sample Usages:
+
+  If we have a vocabulary file "test.txt" with the following content:
+
+  ```
+  emerson
+  lake
+  palmer
+  ```
+
+  ```python
+  indices = tf.constant([1, 5], tf.int64)
+  table = tf.contrib.lookup.index_to_string_table_from_file(
+      vocabulary_file="test.txt", default_value="UNKNOWN")
+  values = table.lookup(indices)
+  ...
+  tf.tables_initializer().run()
+
+  values.eval() ==> ["lake", "UNKNOWN"]
+  ```
+
+  Args:
+    vocabulary_file: The vocabulary filename.
+    vocab_size: Number of the elements in the vocabulary, if known.
+    default_value: The value to use for out-of-vocabulary indices.
+    name: A name for this op (optional).
+
+  Returns:
+    The lookup table to map a string values associated to a given index `int64`
+    `Tensors`.
+
+  Raises:
+    ValueError: when `vocabulary_file` is empty.
+    ValueError: when `vocab_size` is invalid.
+  """
+  if not vocabulary_file:
+    raise ValueError("vocabulary_file must be specified.")
+  if vocab_size is not None and vocab_size < 1:
+    raise ValueError("vocab_size must be greater than 0, got %d." % vocab_size)
+
+  with ops.name_scope(name, "index_to_string") as scope:
+    shared_name = ""
+    if vocab_size:
+      # Keep a shared_name
+      # <table_type>_<filename>_<vocab_size>_<key_index>_<value_index>
+      shared_name = "hash_table_%s_%d_%s_%s" % (vocabulary_file, vocab_size,
+                                                TextFileIndex.LINE_NUMBER,
+                                                TextFileIndex.WHOLE_LINE)
+    else:
+      # Keep a shared_name <table_type>_<filename>_<key_index>_<value_index>
+      shared_name = "hash_table_%s_%s_%s" % (vocabulary_file,
+                                             TextFileIndex.LINE_NUMBER,
+                                             TextFileIndex.WHOLE_LINE)
+    init = TextFileStringTableInitializer(
+        vocabulary_file, vocab_size=vocab_size, name="table_init")
+
+    # TODO(yleon): Use a more effienct structure.
+    return HashTable(init, default_value, shared_name=shared_name, name=scope)
+
+
+def index_to_string_table_from_tensor(vocabulary_list,
+                                      default_value="UNK",
+                                      name=None):
+  """Returns a lookup table that maps a `Tensor` of indices into strings.
+
+  This operation constructs a lookup table to map int64 indices into string
+  values. The mapping is initialized from a string `mapping` 1-D `Tensor` where
+  each element is a value and the corresponding index within the tensor is the
+  key.
+
+  Any input which does not have a corresponding index in 'mapping'
+  (an out-of-vocabulary entry) is assigned the `default_value`
+
+  The underlying table must be initialized by calling
+  `tf.tables_initializer.run()` or `table.init.run()` once.
+
+  Elements in `mapping` cannot have duplicates, otherwise when executing the
+  table initializer op, it will throw a `FailedPreconditionError`.
+
+  Sample Usages:
+
+  ```python
+  vocabulary_list = t.constant(["emerson", "lake", "palmer")
+  indices = tf.constant([1, 5], tf.int64)
+  table = tf.contrib.lookup.index_to_string_table_from_tensor(
+      vocabulary_list, default_value="UNKNOWN")
+  values = table.lookup(indices)
+  ...
+  tf.tables_initializer().run()
+
+  values.eval() ==> ["lake", "UNKNOWN"]
+  ```
+
+  Args:
+    vocabulary_list: A 1-D string `Tensor` that specifies the strings to map
+      from indices.
+    default_value: The value to use for out-of-vocabulary indices.
+    name: A name for this op (optional).
+
+  Returns:
+    The lookup table to map a string values associated to a given index `int64`
+    `Tensors`.
+
+  Raises:
+    ValueError: when `vocabulary_list` is not set.
+  """
+
+  if vocabulary_list is None:
+    raise ValueError("vocabulary_list must be specified.")
+
+  with ops.name_scope(name, "index_to_string") as scope:
+    vocabulary_list = ops.convert_to_tensor(vocabulary_list, dtypes.string)
+    num_elements = array_ops.size(vocabulary_list)
+    keys = math_ops.to_int64(math_ops.range(num_elements))
+
+    shared_name = ""
+    init = KeyValueTensorInitializer(
+        keys, vocabulary_list, dtypes.int64, dtypes.string, name="table_init")
+    # TODO(yleon): Use a more effienct structure.
+    return HashTable(init, default_value, shared_name=shared_name, name=scope)
+
+
+ops.NotDifferentiable("LookupTableFind")
+ops.NotDifferentiable("LookupTableFindV2")
+ops.NotDifferentiable("LookupTableInsert")
+ops.NotDifferentiable("LookupTableInsertV2")
+ops.NotDifferentiable("LookupTableSize")
+ops.NotDifferentiable("LookupTableSizeV2")
+ops.NotDifferentiable("HashTable")
+ops.NotDifferentiable("HashTableV2")
+ops.NotDifferentiable("InitializeTable")
+ops.NotDifferentiable("InitializeTableV2")
+ops.NotDifferentiable("InitializeTableFromTextFile")
+ops.NotDifferentiable("InitializeTableFromTextFileV2")
+ops.NotDifferentiable("MutableDenseHashTable")
+ops.NotDifferentiable("MutableDenseHashTableV2")
+ops.NotDifferentiable("MutableHashTable")
+ops.NotDifferentiable("MutableHashTableV2")
+ops.NotDifferentiable("MutableHashTableOfTensors")
+ops.NotDifferentiable("MutableHashTableOfTensorsV2")
diff --git a/tensorflow/python/ops/losses/util.py b/tensorflow/python/ops/losses/util.py
index 15d9d86f013a45bcb19fda5cda1de5bedcc1f8f8..3414df475f5744f11482add71b403fb7d86c2265 100644
--- a/tensorflow/python/ops/losses/util.py
+++ b/tensorflow/python/ops/losses/util.py
@@ -57,13 +57,13 @@ def get_losses(scope=None, loss_collection=ops.GraphKeys.LOSSES):
 
 
 def get_regularization_losses(scope=None):
-  """Gets the regularization losses.
+  """Gets the list of regularization losses.
 
   Args:
     scope: An optional scope for filtering the losses to return.
 
   Returns:
-    A list of loss variables.
+    A list of regularization losses as Tensors.
   """
   return ops.get_collection(ops.GraphKeys.REGULARIZATION_LOSSES, scope)
 
@@ -88,7 +88,11 @@ def get_regularization_loss(scope=None, name="total_regularization_loss"):
 def get_total_loss(add_regularization_losses=True, name="total_loss"):
   """Returns a tensor whose value represents the total loss.
 
-  Notice that the function adds the given losses to the regularization losses.
+  In particular, this adds any losses you have added with `tf.add_loss()` to
+  any regularization losses that have been added by regularization parameters
+  on layers constructors e.g. `tf.layers`. Be very sure to use this if you
+  are constructing a loss_op manually. Otherwise regularization arguments
+  on `tf.layers` methods will not function.
 
   Args:
     add_regularization_losses: A boolean indicating whether or not to use the
diff --git a/tensorflow/python/ops/math_ops.py b/tensorflow/python/ops/math_ops.py
index 86e45ac8608df896794d62090fb4204e9c1685fd..1555d19395fc79ce40b48193e53466c6f143e7d2 100644
--- a/tensorflow/python/ops/math_ops.py
+++ b/tensorflow/python/ops/math_ops.py
@@ -242,6 +242,12 @@ def abs(x, name=None):
 # pylint: enable=g-docstring-has-escape
 
 
+# pylint: disable=redefined-builtin
+def _bucketize(input, boundaries, name=None):
+  return gen_math_ops._bucketize(input=input, boundaries=boundaries, name=name)
+# pylint: enable=redefined-builtin
+
+
 class DivideDelegateWithName(object):
   """Use Python2/Python3 division delegation to implement divide for tensors."""
 
@@ -1076,8 +1082,6 @@ _OverrideBinaryOperatorHelper(_mul_dispatch, "mul")
 _OverrideBinaryOperatorHelper(_div_python2, "div")
 _OverrideBinaryOperatorHelper(_truediv_python3, "truediv")
 _OverrideBinaryOperatorHelper(floordiv, "floordiv")
-# TODO(aselle): Switch mod to floor_mod when ready
-# _OverrideBinaryOperatorHelper(gen_math_ops.floor_mod, "mod")
 _OverrideBinaryOperatorHelper(gen_math_ops._floor_mod, "mod")
 _OverrideBinaryOperatorHelper(pow, "pow")
 
@@ -1930,6 +1934,12 @@ def accumulate_n(inputs, shape=None, tensor_dtype=None, name=None):
   NOTE: This operation is not differentiable and cannot be used if inputs depend
   on trainable variables. Please use `tf.add_n` for such cases.
 
+  Aside from differentiability, `tf.accumulate_n` performs the same operation as
+  `tf.add_n`, but does not wait for all of its inputs to be ready before
+  beginning to sum. This can save memory if inputs are ready at different times,
+  since minimum temporary storage is proportional to the output size rather than
+  the inputs size.
+
   For example:
 
   ```python
@@ -2319,7 +2329,7 @@ def tensordot(a, b, axes, name=None):
     using `array_ops.transpose` and `array_ops.reshape`. The method takes a
     tensor and performs the correct transpose and reshape operation for a given
     set of indices. It returns the reshaped tensor as well as a list of indices
-    necesary to reshape the tensor again after matrix multiplication.
+    necessary to reshape the tensor again after matrix multiplication.
 
     Args:
       a: `Tensor`.
diff --git a/tensorflow/python/ops/math_ops_test.py b/tensorflow/python/ops/math_ops_test.py
index 7dbc8efe16aab6f7618b3d4ccc93fb019f9b15c9..120827d18b093f6e4dc877692ace4509406e38db 100644
--- a/tensorflow/python/ops/math_ops_test.py
+++ b/tensorflow/python/ops/math_ops_test.py
@@ -20,6 +20,7 @@ from __future__ import print_function
 import numpy as np
 
 from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import errors
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
@@ -28,6 +29,8 @@ from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import variables
 from tensorflow.python.platform import googletest
 
+ops._USE_C_API = True
+
 exp = np.exp
 log = np.log
 
@@ -53,7 +56,8 @@ class ReduceTest(test_util.TensorFlowTestCase):
   def testReduceInvalidAxis(self):
     x = np.array([[1, 2, 3], [4, 5, 6]], dtype=np.int32)
     axis = np.array([[0], [1]])
-    with self.assertRaisesRegexp(ValueError, "must be at most rank 1"):
+    with self.assertRaisesRegexp(errors.InvalidArgumentError,
+                                 "must be at most rank 1"):
       math_ops.reduce_sum(x, axis)
 
 
@@ -278,7 +282,8 @@ class AddNTest(test_util.TensorFlowTestCase):
     for _ in range(98):
       partials.append(math_ops.add_n([constant_op.constant(1)]))
     partials.append(
-        math_ops.add_n([constant_op.constant(1), constant_op.constant(1)]))
+        math_ops.add_n([constant_op.constant(1),
+                        constant_op.constant(1)]))
 
     res = math_ops.add_n(partials) + constant_op.constant(0)
     with self.test_session(use_gpu=True):
diff --git a/tensorflow/python/ops/metrics_impl.py b/tensorflow/python/ops/metrics_impl.py
index 4dc8e702ca32939f6a1a2de2a59382abcf63b23a..0d35f50894f5d4f860fd6a7966c4e0252c80ffe1 100644
--- a/tensorflow/python/ops/metrics_impl.py
+++ b/tensorflow/python/ops/metrics_impl.py
@@ -1735,7 +1735,7 @@ def _streaming_sparse_true_positive_at_k(labels,
     A tuple of `Variable` and update `Operation`.
 
   Raises:
-    ValueError: If `weights` is not `None` and has an incomptable shape.
+    ValueError: If `weights` is not `None` and has an incompatible shape.
   """
   with ops.name_scope(
       name, _at_k_name('true_positive', k, class_id=class_id),
@@ -1831,7 +1831,7 @@ def _streaming_sparse_false_negative_at_k(labels,
     A tuple of `Variable` and update `Operation`.
 
   Raises:
-    ValueError: If `weights` is not `None` and has an incomptable shape.
+    ValueError: If `weights` is not `None` and has an incompatible shape.
   """
   with ops.name_scope(
       name, _at_k_name('false_negative', k, class_id=class_id),
@@ -1924,7 +1924,74 @@ def recall_at_k(labels,
     labels = _maybe_expand_labels(labels, predictions)
 
     _, top_k_idx = nn.top_k(predictions, k)
-    top_k_idx = math_ops.to_int64(top_k_idx)
+    return _sparse_recall_at_top_k(
+        labels=labels,
+        predictions_idx=top_k_idx,
+        k=k,
+        class_id=class_id,
+        weights=weights,
+        metrics_collections=metrics_collections,
+        updates_collections=updates_collections,
+        name=scope)
+
+
+def _sparse_recall_at_top_k(labels,
+                            predictions_idx,
+                            k=None,
+                            class_id=None,
+                            weights=None,
+                            metrics_collections=None,
+                            updates_collections=None,
+                            name=None):
+  """Computes recall@k of top-k predictions with respect to sparse labels.
+
+  Differs from `recall_at_k` in that predictions must be in the form of top `k`
+  class indices, whereas `recall_at_k` expects logits. Refer to `recall_at_k`
+  for more details.
+
+  Args:
+    labels: `int64` `Tensor` or `SparseTensor` with shape
+      [D1, ... DN, num_labels] or [D1, ... DN], where the latter implies
+      num_labels=1. N >= 1 and num_labels is the number of target classes for
+      the associated prediction. Commonly, N=1 and `labels` has shape
+      [batch_size, num_labels]. [D1, ... DN] must match `predictions`. Values
+      should be in range [0, num_classes), where num_classes is the last
+      dimension of `predictions`. Values outside this range always count
+      towards `false_negative_at_<k>`.
+    predictions_idx: Integer `Tensor` with shape [D1, ... DN, k] where N >= 1.
+      Commonly, N=1 and predictions has shape [batch size, k]. The final
+      dimension contains the top `k` predicted class indices. [D1, ... DN] must
+      match `labels`.
+    k: Integer, k for @k metric.
+    class_id: Integer class ID for which we want binary metrics. This should be
+      in range [0, num_classes), where num_classes is the last dimension of
+      `predictions`. If class_id is outside this range, the method returns NAN.
+    weights: `Tensor` whose rank is either 0, or n-1, where n is the rank of
+      `labels`. If the latter, it must be broadcastable to `labels` (i.e., all
+      dimensions must be either `1`, or the same as the corresponding `labels`
+      dimension).
+    metrics_collections: An optional list of collections that values should
+      be added to.
+    updates_collections: An optional list of collections that updates should
+      be added to.
+    name: Name of new update operation, and namespace for other dependent ops.
+
+  Returns:
+    recall: Scalar `float64` `Tensor` with the value of `true_positives` divided
+      by the sum of `true_positives` and `false_negatives`.
+    update_op: `Operation` that increments `true_positives` and
+      `false_negatives` variables appropriately, and whose value matches
+      `recall`.
+
+  Raises:
+    ValueError: If `weights` is not `None` and its shape doesn't match
+    `predictions`, or if either `metrics_collections` or `updates_collections`
+    are not a list or tuple.
+  """
+  with ops.name_scope(name,
+                      _at_k_name('recall', k, class_id=class_id),
+                      (predictions_idx, labels, weights)) as scope:
+    top_k_idx = math_ops.to_int64(predictions_idx)
     tp, tp_update = _streaming_sparse_true_positive_at_k(
         predictions_idx=top_k_idx, labels=labels, k=k, class_id=class_id,
         weights=weights)
@@ -2586,7 +2653,7 @@ def _streaming_sparse_false_positive_at_k(labels,
     A tuple of `Variable` and update `Operation`.
 
   Raises:
-    ValueError: If `weights` is not `None` and has an incomptable shape.
+    ValueError: If `weights` is not `None` and has an incompatible shape.
   """
   with ops.name_scope(
       name, _at_k_name('false_positive', k, class_id=class_id),
diff --git a/tensorflow/python/ops/nn_grad.py b/tensorflow/python/ops/nn_grad.py
index b1f50fd3418d2ede63661de1bd1ad9448f451e85..028d82aa4da50a506d80cd24b9fd8e0c7fa584e1 100644
--- a/tensorflow/python/ops/nn_grad.py
+++ b/tensorflow/python/ops/nn_grad.py
@@ -349,6 +349,19 @@ def _SoftplusGrad(op, grad):
   return gen_nn_ops._softplus_grad(grad, op.inputs[0])
 
 
+@ops.RegisterGradient("SoftplusGrad")
+def _SoftplusGradGrad(op, grad):
+  # Let:
+  #   y = tf.nn.softplus(x)
+  #   dx = gen_nn_ops._softplus_grad(dy, x) = dy / (1 + exp(-x))
+  # This op computes (ddy, d2x) from op.inputs == [dy, x] and grad == ddx.
+  dy, x = op.inputs
+  with ops.control_dependencies([grad.op]):
+    ddy = gen_nn_ops._softplus_grad(grad, x)  # pylint: disable=protected-access
+    d2x = grad * dy / (math_ops.exp(-x) + 2.0 + math_ops.exp(x))
+    return (ddy, d2x)
+
+
 @ops.RegisterGradient("Softsign")
 def _SoftsignGrad(op, grad):
   return gen_nn_ops._softsign_grad(grad, op.inputs[0])
diff --git a/tensorflow/python/ops/nn_impl.py b/tensorflow/python/ops/nn_impl.py
index 7c17cf2cb614a2673219bc4e346c572563a5d98d..0a00e3d76508b9e910e755203337940d9b53a6ea 100644
--- a/tensorflow/python/ops/nn_impl.py
+++ b/tensorflow/python/ops/nn_impl.py
@@ -301,9 +301,8 @@ def zero_fraction(value, name=None):
   This is useful in summaries to measure and report sparsity.  For example,
 
   ```python
-      z = tf.Relu(...)
-      summ = tf.contrib.deprecated.scalar_summary('sparsity',
-      tf.nn.zero_fraction(z))
+      z = tf.nn.relu(...)
+      summ = tf.summary.scalar('sparsity', tf.nn.zero_fraction(z))
   ```
 
   Args:
diff --git a/tensorflow/python/ops/nn_ops.py b/tensorflow/python/ops/nn_ops.py
index 9b54b937540f577175c443fd293176f9f8e3fd43..b76f1f4be3e9884fa9b128a398fc60973647a025 100644
--- a/tensorflow/python/ops/nn_ops.py
+++ b/tensorflow/python/ops/nn_ops.py
@@ -840,6 +840,11 @@ def pool(input,  # pylint: disable=redefined-builtin
 def atrous_conv2d(value, filters, rate, padding, name=None):
   """Atrous convolution (a.k.a. convolution with holes or dilated convolution).
 
+  This function is a simpler wrapper around the more general
+  @{tf.nn.convolution}, and exists only for backwards compatibility. You can
+  use @{tf.nn.convolution} to perform 1-D, 2-D, or 3-D atrous convolution.
+
+
   Computes a 2-D atrous convolution, also known as convolution with holes or
   dilated convolution, given 4-D `value` and `filters` tensors. If the `rate`
   parameter is equal to one, it performs regular 2-D convolution. If the `rate`
@@ -959,93 +964,12 @@ def atrous_conv2d(value, filters, rate, padding, name=None):
     ValueError: If input/output depth does not match `filters`' shape, or if
       padding is other than `'VALID'` or `'SAME'`.
   """
-  with ops.name_scope(name, "atrous_conv2d", [value, filters]) as name:
-    value = ops.convert_to_tensor(value, name="value")
-    filters = ops.convert_to_tensor(filters, name="filters")
-    if not value.get_shape()[3].is_compatible_with(filters.get_shape()[2]):
-      raise ValueError(
-          "value's input channels does not match filters' input channels, "
-          "{} != {}".format(value.get_shape()[3], filters.get_shape()[2]))
-    if rate < 1:
-      raise ValueError("rate {} cannot be less than one".format(rate))
-
-    if rate == 1:
-      value = gen_nn_ops.conv2d(input=value,
-                                filter=filters,
-                                strides=[1, 1, 1, 1],
-                                padding=padding)
-      return value
-
-    # We have two padding contributions. The first is used for converting "SAME"
-    # to "VALID". The second is required so that the height and width of the
-    # zero-padded value tensor are multiples of rate.
-
-    # Padding required to reduce to "VALID" convolution
-    if padding == "SAME":
-      # Handle filters whose shape is unknown during graph creation.
-      if filters.get_shape().is_fully_defined():
-        filter_shape = filters.get_shape().as_list()
-      else:
-        filter_shape = array_ops.shape(filters)
-      filter_height, filter_width = filter_shape[0], filter_shape[1]
-
-      # Spatial dimensions of the filters and the upsampled filters in which we
-      # introduce (rate - 1) zeros between consecutive filter values.
-      filter_height_up = filter_height + (filter_height - 1) * (rate - 1)
-      filter_width_up = filter_width + (filter_width - 1) * (rate - 1)
-
-      pad_height = filter_height_up - 1
-      pad_width = filter_width_up - 1
-
-      # When pad_height (pad_width) is odd, we pad more to bottom (right),
-      # following the same convention as conv2d().
-      pad_top = pad_height // 2
-      pad_bottom = pad_height - pad_top
-      pad_left = pad_width // 2
-      pad_right = pad_width - pad_left
-    elif padding == "VALID":
-      pad_top = 0
-      pad_bottom = 0
-      pad_left = 0
-      pad_right = 0
-    else:
-      raise ValueError("Invalid padding")
-
-    # Handle input whose shape is unknown during graph creation.
-    if value.get_shape().is_fully_defined():
-      value_shape = value.get_shape().as_list()
-    else:
-      value_shape = array_ops.shape(value)
-
-    in_height = value_shape[1] + pad_top + pad_bottom
-    in_width = value_shape[2] + pad_left + pad_right
-
-    # More padding so that rate divides the height and width of the input.
-    pad_bottom_extra = (rate - in_height % rate) % rate
-    pad_right_extra = (rate - in_width % rate) % rate
-
-    # The paddings argument to space_to_batch includes both padding components.
-    space_to_batch_pad = [[pad_top, pad_bottom + pad_bottom_extra],
-                          [pad_left, pad_right + pad_right_extra]]
-
-    value = array_ops.space_to_batch(input=value,
-                                     paddings=space_to_batch_pad,
-                                     block_size=rate)
-
-    value = gen_nn_ops.conv2d(input=value,
-                              filter=filters,
-                              strides=[1, 1, 1, 1],
-                              padding="VALID",
-                              name=name)
-
-    # The crops argument to batch_to_space is just the extra padding component.
-    batch_to_space_crop = [[0, pad_bottom_extra], [0, pad_right_extra]]
-
-    value = array_ops.batch_to_space(input=value,
-                                     crops=batch_to_space_crop,
-                                     block_size=rate)
-
-    return value
+  return convolution(
+      input=value,
+      filter=filters,
+      padding=padding,
+      dilation_rate=np.broadcast_to(rate, (2,)),
+      name=name)
 
 
 def conv2d_transpose(value,
@@ -1311,8 +1235,8 @@ def conv3d_transpose(value,
     axis = 1 if data_format == "NCDHW" else 4
     if not value.get_shape()[axis].is_compatible_with(filter.get_shape()[4]):
       raise ValueError("input channels does not match filter's input channels, "
-                       "{} != {}".format(value.get_shape()[axis], filter.get_shape(
-                       )[4]))
+                       "{} != {}".format(value.get_shape()[axis],
+                                         filter.get_shape()[4]))
 
     output_shape_ = ops.convert_to_tensor(output_shape, name="output_shape")
     if not output_shape_.get_shape().is_compatible_with(tensor_shape.vector(5)):
@@ -1400,7 +1324,7 @@ def crelu(features, name=None):
   Concatenates a ReLU which selects only the positive part of the activation
   with a ReLU which selects only the *negative* part of the activation.
   Note that as a result this non-linearity doubles the depth of the activations.
-  Source: https://arxiv.org/abs/1603.05201
+  Source: [Understanding and Improving Convolutional Neural Networks via Concatenated Rectified Linear Units. W. Shang, et al.](https://arxiv.org/abs/1603.05201) 
 
   Args:
     features: A `Tensor` with type `float`, `double`, `int32`, `int64`, `uint8`,
@@ -1418,6 +1342,7 @@ def crelu(features, name=None):
 
 def relu6(features, name=None):
   """Computes Rectified Linear 6: `min(max(features, 0), 6)`.
+  Source: [Convolutional Deep Belief Networks on CIFAR-10. A. Krizhevsky](http://www.cs.utoronto.ca/~kriz/conv-cifar10-aug2010.pdf)
 
   Args:
     features: A `Tensor` with type `float`, `double`, `int32`, `int64`, `uint8`,
diff --git a/tensorflow/python/ops/parsing_ops.py b/tensorflow/python/ops/parsing_ops.py
index 796ea20eb764f0af867f5b35adb007fdcb988cbb..c2f9961731630173127ed5367789a31550ce1c0d 100644
--- a/tensorflow/python/ops/parsing_ops.py
+++ b/tensorflow/python/ops/parsing_ops.py
@@ -58,7 +58,7 @@ class SparseFeature(
         ["index_key", "value_key", "dtype", "size", "already_sorted"])):
   """Configuration for parsing a sparse input feature from an `Example`.
 
-  Note, preferrably use `VarLenFeature` (possibly in combination with a
+  Note, preferably use `VarLenFeature` (possibly in combination with a
   `SequenceExample`) in order to parse out `SparseTensor`s instead of
   `SparseFeature` due to its simplicity.
 
diff --git a/tensorflow/python/ops/random_ops.py b/tensorflow/python/ops/random_ops.py
index f06980783a50e2679afdd678043349528c9c23b9..15613289a0b0a96b4631c60afa9d196da408480d 100644
--- a/tensorflow/python/ops/random_ops.py
+++ b/tensorflow/python/ops/random_ops.py
@@ -300,7 +300,8 @@ def random_crop(value, size, seed=None, name=None):
     shape = array_ops.shape(value)
     check = control_flow_ops.Assert(
         math_ops.reduce_all(shape >= size),
-        ["Need value.shape >= size, got ", shape, size])
+        ["Need value.shape >= size, got ", shape, size],
+        summarize=1000)
     shape = control_flow_ops.with_dependencies([check], shape)
     limit = shape - size + 1
     offset = random_uniform(
diff --git a/tensorflow/python/ops/resource_variable_ops.py b/tensorflow/python/ops/resource_variable_ops.py
index 4b560e62b8c88b7c2fc1f1eafaa1b77192237206..842bb48d0f9a1aa31edb589971c2d307063e3fbe 100644
--- a/tensorflow/python/ops/resource_variable_ops.py
+++ b/tensorflow/python/ops/resource_variable_ops.py
@@ -25,6 +25,7 @@ from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_shape
 from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import gen_array_ops
 from tensorflow.python.ops import gen_resource_variable_ops
 from tensorflow.python.ops import variables
 # go/tf-wildcard-import
@@ -256,6 +257,11 @@ class ResourceVariable(object):
     """The device this variable is on."""
     return self._handle.device
 
+  @property
+  def graph(self):
+    """The `Graph` of this variable."""
+    return self._handle.graph
+
   @property
   def name(self):
     """The name of the handle for this variable."""
@@ -430,6 +436,31 @@ class ResourceVariable(object):
             ops.convert_to_tensor(value, dtype=self.dtype), name=name)]):
       return self.read_value()
 
+  def _strided_slice_assign(self,
+                            begin,
+                            end,
+                            strides,
+                            value,
+                            name,
+                            begin_mask,
+                            end_mask,
+                            ellipsis_mask,
+                            new_axis_mask,
+                            shrink_axis_mask):
+    with ops.control_dependencies([gen_array_ops.resource_strided_slice_assign(
+        ref=self.handle,
+        begin=begin,
+        end=end,
+        strides=strides,
+        value=value,
+        name=name,
+        begin_mask=begin_mask,
+        end_mask=end_mask,
+        ellipsis_mask=ellipsis_mask,
+        new_axis_mask=new_axis_mask,
+        shrink_axis_mask=shrink_axis_mask)]):
+      return self.value()
+
 
 # pylint: disable=unused-argument,protected-access
 def _dense_var_to_tensor(var, dtype=None, name=None, as_ref=False):
diff --git a/tensorflow/python/ops/resources.py b/tensorflow/python/ops/resources.py
index 41fb8a74a9e16984d115866a4e5123ed296e164a..57ba0084e846a612ba3deedb600f53c123545571 100644
--- a/tensorflow/python/ops/resources.py
+++ b/tensorflow/python/ops/resources.py
@@ -27,6 +27,7 @@ from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import math_ops
+from tensorflow.python.util import tf_should_use
 
 
 _Resource = collections.namedtuple("_Resource",
@@ -98,6 +99,7 @@ def report_uninitialized_resources(resource_list=None,
     return array_ops.boolean_mask(variable_names_tensor, variables_mask)
 
 
+@tf_should_use.should_use_result
 def initialize_resources(resource_list, name="init"):
   """Initializes the resources in the given list.
 
diff --git a/tensorflow/python/ops/rnn.py b/tensorflow/python/ops/rnn.py
index 1051478a7f7fc59aa6487f006f1e96f6067ae8f5..2aa288e36ac856fea4777169fbadaaeaf90f91c8 100644
--- a/tensorflow/python/ops/rnn.py
+++ b/tensorflow/python/ops/rnn.py
@@ -33,7 +33,8 @@ from tensorflow.python.util import nest
 
 
 # pylint: disable=protected-access
-_state_size_with_prefix = rnn_cell_impl._state_size_with_prefix
+_concat = rnn_cell_impl._concat
+_like_rnncell = rnn_cell_impl._like_rnncell
 # pylint: enable=protected-access
 
 
@@ -288,11 +289,10 @@ def bidirectional_dynamic_rnn(cell_fw, cell_bw, inputs, sequence_length=None,
                               swap_memory=False, time_major=False, scope=None):
   """Creates a dynamic version of bidirectional recurrent neural network.
 
-  Similar to the unidirectional case above (rnn) but takes input and builds
-  independent forward and backward RNNs. The input_size of forward and
-  backward cell must match. The initial state for both directions is zero by
-  default (but can be set optionally) and no intermediate states are ever
-  returned -- the network is fully unrolled for the given (passed in)
+  Takes input and builds independent forward and backward RNNs. The input_size
+  of forward and backward cell must match. The initial state for both directions
+  is zero by default (but can be set optionally) and no intermediate states are
+  ever returned -- the network is fully unrolled for the given (passed in)
   length(s) of the sequence(s) or completely unrolled if length(s) is not
   given.
 
@@ -362,12 +362,10 @@ def bidirectional_dynamic_rnn(cell_fw, cell_bw, inputs, sequence_length=None,
     TypeError: If `cell_fw` or `cell_bw` is not an instance of `RNNCell`.
   """
 
-  # pylint: disable=protected-access
-  if not isinstance(cell_fw, rnn_cell_impl._RNNCell):
+  if not _like_rnncell(cell_fw):
     raise TypeError("cell_fw must be an instance of RNNCell")
-  if not isinstance(cell_bw, rnn_cell_impl._RNNCell):
+  if not _like_rnncell(cell_bw):
     raise TypeError("cell_bw must be an instance of RNNCell")
-  # pylint: enable=protected-access
 
   with vs.variable_scope(scope or "bidirectional_rnn"):
     # Forward direction
@@ -419,12 +417,10 @@ def dynamic_rnn(cell, inputs, sequence_length=None, initial_state=None,
                 time_major=False, scope=None):
   """Creates a recurrent neural network specified by RNNCell `cell`.
 
-  This function is functionally identical to the function `rnn` above, but
-  performs fully dynamic unrolling of `inputs`.
+  Performs fully dynamic unrolling of `inputs`.
 
-  Unlike `rnn`, the input `inputs` is not a Python list of `Tensors`, one for
-  each frame.  Instead, `inputs` may be a single `Tensor` where
-  the maximum time is either the first or second dimension (see the parameter
+  `Inputs` may be a single `Tensor` where the maximum time is either the first
+  or second dimension (see the parameter
   `time_major`).  Alternatively, it may be a (possibly nested) tuple of
   Tensors, each of them having matching batch and time dimensions.
   The corresponding output is either a single `Tensor` having the same number
@@ -433,7 +429,7 @@ def dynamic_rnn(cell, inputs, sequence_length=None, initial_state=None,
 
   The parameter `sequence_length` is optional and is used to copy-through state
   and zero-out outputs when past a batch element's sequence length. So it's more
-  for correctness than performance, unlike in rnn().
+  for correctness than performance.
 
   Args:
     cell: An instance of RNNCell.
@@ -510,10 +506,8 @@ def dynamic_rnn(cell, inputs, sequence_length=None, initial_state=None,
     ValueError: If inputs is None or an empty list.
   """
 
-  # pylint: disable=protected-access
-  if not isinstance(cell, rnn_cell_impl._RNNCell):
+  if not _like_rnncell(cell):
     raise TypeError("cell must be an instance of RNNCell")
-  # pylint: enable=protected-access
 
   # By default, time_major==False and inputs are batch-major: shaped
   #   [batch, time, depth]
@@ -663,7 +657,7 @@ def _dynamic_rnn_loop(cell,
 
   # Prepare dynamic conditional copying of state & output
   def _create_zero_arrays(size):
-    size = _state_size_with_prefix(size, prefix=[batch_size])
+    size = _concat(batch_size, size)
     return array_ops.zeros(
         array_ops.stack(size), _infer_state_dtype(dtype, state))
 
@@ -749,8 +743,8 @@ def _dynamic_rnn_loop(cell,
 
   # Restore some shape information
   for output, output_size in zip(final_outputs, flat_output_size):
-    shape = _state_size_with_prefix(
-        output_size, prefix=[const_time_steps, const_batch_size])
+    shape = _concat(
+        [const_time_steps, const_batch_size], output_size, static=True)
     output.set_shape(shape)
 
   final_outputs = nest.pack_sequence_as(
@@ -924,10 +918,8 @@ def raw_rnn(cell, loop_fn,
       a `callable`.
   """
 
-  # pylint: disable=protected-access
-  if not isinstance(cell, rnn_cell_impl._RNNCell):
+  if not _like_rnncell(cell):
     raise TypeError("cell must be an instance of RNNCell")
-  # pylint: enable=protected-access
   if not callable(loop_fn):
     raise TypeError("loop_fn must be a callable")
 
@@ -984,9 +976,7 @@ def raw_rnn(cell, loop_fn,
     emit_ta = nest.pack_sequence_as(structure=emit_structure,
                                     flat_sequence=flat_emit_ta)
     flat_zero_emit = [
-        array_ops.zeros(
-            _state_size_with_prefix(size_i, prefix=[batch_size]),
-            dtype_i)
+        array_ops.zeros(_concat(batch_size, size_i), dtype_i)
         for size_i, dtype_i in zip(flat_emit_size, flat_emit_dtypes)]
     zero_emit = nest.pack_sequence_as(structure=emit_structure,
                                       flat_sequence=flat_zero_emit)
diff --git a/tensorflow/python/ops/rnn_cell_impl.py b/tensorflow/python/ops/rnn_cell_impl.py
index 32ebe0c2e84c5728f6c3ad49104c004b41aeaa15..9c0fb1db23dbd16ce5faffa68daede7d13accd59 100644
--- a/tensorflow/python/ops/rnn_cell_impl.py
+++ b/tensorflow/python/ops/rnn_cell_impl.py
@@ -24,62 +24,104 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_shape
+from tensorflow.python.framework import tensor_util
 from tensorflow.python.layers import base as base_layer
 from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import variable_scope as vs
+from tensorflow.python.ops import variables as tf_variables
 from tensorflow.python.util import nest
 
 
-def _state_size_with_prefix(state_size, prefix=None):
-  """Helper function that enables int or TensorShape shape specification.
+def _like_rnncell(cell):
+  """Checks that a given object is an RNNCell by using duck typing."""
+  conditions = [hasattr(cell, "output_size"), hasattr(cell, "state_size"),
+                hasattr(cell, "zero_state"), callable(cell)]
+  return all(conditions)
 
-  This function takes a size specification, which can be an integer or a
-  TensorShape, and converts it into a list of integers. One may specify any
-  additional dimensions that precede the final state size specification.
+
+def _concat(prefix, suffix, static=False):
+  """Concat that enables int, Tensor, or TensorShape values.
+
+  This function takes a size specification, which can be an integer, a
+  TensorShape, or a Tensor, and converts it into a concatenated Tensor
+  (if static = False) or a list of integers (if static = True).
 
   Args:
-    state_size: TensorShape or int that specifies the size of a tensor.
-    prefix: optional additional list of dimensions to prepend.
+    prefix: The prefix; usually the batch size (and/or time step size).
+      (TensorShape, int, or Tensor.)
+    suffix: TensorShape, int, or Tensor.
+    static: If `True`, return a python list with possibly unknown dimensions.
+      Otherwise return a `Tensor`.
 
   Returns:
-    result_state_size: list of dimensions the resulting tensor size.
+    shape: the concatenation of prefix and suffix.
+
+  Raises:
+    ValueError: if `suffix` is not a scalar or vector (or TensorShape).
+    ValueError: if prefix or suffix was `None` and asked for dynamic
+      Tensors out.
   """
-  result_state_size = tensor_shape.as_shape(state_size).as_list()
-  if prefix is not None:
-    if not isinstance(prefix, list):
-      raise TypeError("prefix of _state_size_with_prefix should be a list.")
-    result_state_size = prefix + result_state_size
-  return result_state_size
+  if isinstance(prefix, ops.Tensor):
+    p = prefix
+    p_static = tensor_util.constant_value(prefix)
+    if p.shape.ndims == 0:
+      p = array_ops.expand_dims(p, 0)
+    elif p.shape.ndims != 1:
+      raise ValueError("prefix tensor must be either a scalar or vector, "
+                       "but saw tensor: %s" % p)
+  else:
+    p = tensor_shape.as_shape(prefix)
+    p_static = p.as_list() if p.ndims is not None else None
+    p = (constant_op.constant(p.as_list(), dtype=dtypes.int32)
+         if p.is_fully_defined() else None)
+  if isinstance(suffix, ops.Tensor):
+    s = suffix
+    s_static = tensor_util.constant_value(suffix)
+    if s.shape.ndims == 0:
+      s = array_ops.expand_dims(s, 0)
+    elif s.shape.ndims != 1:
+      raise ValueError("suffix tensor must be either a scalar or vector, "
+                       "but saw tensor: %s" % s)
+  else:
+    s = tensor_shape.as_shape(suffix)
+    s_static = s.as_list() if s.ndims is not None else None
+    s = (constant_op.constant(s.as_list(), dtype=dtypes.int32)
+         if s.is_fully_defined() else None)
+
+  if static:
+    shape = tensor_shape.as_shape(p_static).concatenate(s_static)
+    shape = shape.as_list() if shape.ndims is not None else None
+  else:
+    if p is None or s is None:
+      raise ValueError("Provided a prefix or suffix of None: %s and %s"
+                       % (prefix, suffix))
+    shape = array_ops.concat((p, s), 0)
+  return shape
 
 
 def _zero_state_tensors(state_size, batch_size, dtype):
   """Create tensors of zeros based on state_size, batch_size, and dtype."""
-  if nest.is_sequence(state_size):
-    state_size_flat = nest.flatten(state_size)
-    zeros_flat = [
-        array_ops.zeros(
-            array_ops.stack(_state_size_with_prefix(
-                s, prefix=[batch_size])),
-            dtype=dtype) for s in state_size_flat
-    ]
-    for s, z in zip(state_size_flat, zeros_flat):
-      z.set_shape(_state_size_with_prefix(s, prefix=[None]))
-    zeros = nest.pack_sequence_as(structure=state_size,
-                                  flat_sequence=zeros_flat)
-  else:
-    zeros_size = _state_size_with_prefix(state_size, prefix=[batch_size])
-    zeros = array_ops.zeros(array_ops.stack(zeros_size), dtype=dtype)
-    zeros.set_shape(_state_size_with_prefix(state_size, prefix=[None]))
+  def get_state_shape(s):
+    """Combine s with batch_size to get a proper tensor shape."""
+    c = _concat(batch_size, s)
+    c_static = _concat(batch_size, s, static=True)
+    size = array_ops.zeros(c, dtype=dtype)
+    size.set_shape(c_static)
+    return size
+  return nest.map_structure(get_state_shape, state_size)
 
-  return zeros
 
-
-class _RNNCell(base_layer._Layer):  # pylint: disable=protected-access
+class _RNNCell(base_layer.Layer):
   """Abstract object representing an RNN cell.
 
-  Every `RNNCell` must have the properties below and implement `__call__` with
-  the following signature.
+  Every `RNNCell` must have the properties below and implement `call` with
+  the signature `(output, next_state) = call(input, state)`.  The optional
+  third input argument, `scope`, is allowed for backwards compatibility
+  purposes; but should be left off for new subclasses.
 
   This definition of cell differs from the definition used in the literature.
   In the literature, 'cell' refers to an object with a single scalar output.
@@ -90,8 +132,9 @@ class _RNNCell(base_layer._Layer):  # pylint: disable=protected-access
   This operation results in an output matrix with `self.output_size` columns.
   If `self.state_size` is an integer, this operation also results in a new
   state matrix with `self.state_size` columns.  If `self.state_size` is a
-  tuple of integers, then it results in a tuple of `len(state_size)` state
-  matrices, each with a column size corresponding to values in `state_size`.
+  (possibly nested tuple of) TensorShape object(s), then it should return a
+  matching structure of Tensors having shape `[batch_size].concatenate(s)`
+  for each `s` in `self.batch_size`.
   """
 
   def __call__(self, inputs, state, scope=None):
@@ -112,7 +155,25 @@ class _RNNCell(base_layer._Layer):  # pylint: disable=protected-access
       - New state: Either a single `2-D` tensor, or a tuple of tensors matching
         the arity and shapes of `state`.
     """
-    return super(_RNNCell, self).__call__(inputs, state, scope=scope)
+    if scope is not None:
+      with vs.variable_scope(scope,
+                             custom_getter=self._rnn_get_variable) as scope:
+        return super(_RNNCell, self).__call__(inputs, state, scope=scope)
+    else:
+      with vs.variable_scope(vs.get_variable_scope(),
+                             custom_getter=self._rnn_get_variable):
+        return super(_RNNCell, self).__call__(inputs, state)
+
+  def _rnn_get_variable(self, getter, *args, **kwargs):
+    variable = getter(*args, **kwargs)
+    trainable = (variable in tf_variables.trainable_variables() or
+                 (isinstance(variable, tf_variables.PartitionedVariable) and
+                  list(variable)[0] in tf_variables.trainable_variables()))
+    if trainable and variable not in self._trainable_weights:
+      self._trainable_weights.append(variable)
+    elif not trainable and variable not in self._non_trainable_weights:
+      self._non_trainable_weights.append(variable)
+    return variable
 
   @property
   def state_size(self):
@@ -128,6 +189,11 @@ class _RNNCell(base_layer._Layer):  # pylint: disable=protected-access
     """Integer or TensorShape: size of outputs produced by this cell."""
     raise NotImplementedError("Abstract method")
 
+  def build(self, _):
+    # This tells the parent Layer object that it's OK to call
+    # self.add_variable() inside the call() method.
+    pass
+
   def zero_state(self, batch_size, dtype):
     """Return zero-filled state tensor(s).
 
diff --git a/tensorflow/python/ops/sparse_ops.py b/tensorflow/python/ops/sparse_ops.py
index af7abf525114187f05152415a52fc3b6ba7bb81e..9286114277c4209b97f2b7684ac15d9ffd996240 100644
--- a/tensorflow/python/ops/sparse_ops.py
+++ b/tensorflow/python/ops/sparse_ops.py
@@ -51,6 +51,7 @@ import numpy as np
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import sparse_tensor
+from tensorflow.python.framework import tensor_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import check_ops
 from tensorflow.python.ops import control_flow_ops
@@ -240,6 +241,8 @@ def sparse_add(a, b, thresh=0):
   of arguments does not matter.  Use vanilla `tf.add()` for adding two dense
   `Tensor`s.
 
+  The shapes of the two operands must match: broadcasting is not supported.
+
   The indices of any input `SparseTensor` are assumed ordered in standard
   lexicographic order.  If this is not the case, before this step run
   `SparseReorder` to restore index ordering.
@@ -288,12 +291,21 @@ def sparse_add(a, b, thresh=0):
 
   if all(isinstance(inp, sparse_classes) for inp in [a, b]):
     a = _convert_to_sparse_tensor(a)
+    b = _convert_to_sparse_tensor(b)
     thresh = ops.convert_to_tensor(
         thresh, dtype=a.values.dtype.real_dtype, name="thresh")
     output_ind, output_val, output_shape = (gen_sparse_ops._sparse_add(
         a.indices, a.values, a.dense_shape,
         b.indices, b.values, b.dense_shape,
         thresh))
+
+    # Attempt to get output_shape statically.
+    a.get_shape().assert_is_compatible_with(b.get_shape())
+    static_shape = array_ops.broadcast_static_shape(
+        a.get_shape(), b.get_shape())
+    if static_shape.is_fully_defined():
+      output_shape = static_shape.as_list()
+
     return sparse_tensor.SparseTensor(output_ind, output_val, output_shape)
   else:
     # swap to make `a` the SparseTensor.
@@ -303,6 +315,126 @@ def sparse_add(a, b, thresh=0):
         a.indices, a.values, a.dense_shape, b)
 
 
+def _sparse_cross(inputs, name=None):
+  """Generates sparse cross from a list of sparse and dense tensors.
+
+  For example, if the inputs are
+  * inputs[0]: SparseTensor with shape = [2, 2]
+    [0, 0]: "a"
+    [1, 0]: "b"
+    [1, 1]: "c"
+  * inputs[1]: SparseTensor with shape = [2, 1]
+    [0, 0]: "d"
+    [1, 0]: "e"
+  * inputs[2]: Tensor [["f"], ["g"]]
+
+  then the output will be:
+    shape = [2, 2]
+    [0, 0]: "a_X_d_X_f"
+    [1, 0]: "b_X_e_X_g"
+    [1, 1]: "c_X_e_X_g"
+
+  Args:
+    inputs: An iterable of `Tensor` or `SparseTensor`.
+    name: Optional name for the op.
+
+  Returns:
+    A `SparseTensor` of type `string`.
+  """
+  return _sparse_cross_internal(inputs=inputs, hashed_output=False, name=name)
+
+
+def _sparse_cross_hashed(inputs, num_buckets=0, hash_key=None, name=None):
+  """Generates hashed sparse cross from a list of sparse and dense tensors.
+
+  For example, if the inputs are
+  * inputs[0]: SparseTensor with shape = [2, 2]
+    [0, 0]: "a"
+    [1, 0]: "b"
+    [1, 1]: "c"
+  * inputs[1]: SparseTensor with shape = [2, 1]
+    [0, 0]: "d"
+    [1, 0]: "e"
+  * inputs[2]: Tensor [["f"], ["g"]]
+
+  then the output will be:
+    shape = [2, 2]
+    [0, 0]: FingerprintCat64(
+                Fingerprint64("f"), FingerprintCat64(
+                    Fingerprint64("d"), Fingerprint64("a")))
+    [1, 0]: FingerprintCat64(
+                Fingerprint64("g"), FingerprintCat64(
+                    Fingerprint64("e"), Fingerprint64("b")))
+    [1, 1]: FingerprintCat64(
+                Fingerprint64("g"), FingerprintCat64(
+                    Fingerprint64("e"), Fingerprint64("c")))
+
+  Args:
+    inputs: An iterable of `Tensor` or `SparseTensor`.
+    num_buckets: An `int` that is `>= 0`.
+      output = hashed_value%num_buckets if num_buckets > 0 else hashed_value.
+    hash_key: Integer hash_key that will be used by the `FingerprintCat64`
+      function. If not given, will use a default key.
+    name: Optional name for the op.
+
+  Returns:
+    A `SparseTensor` of type `int64`.
+  """
+  return _sparse_cross_internal(
+      inputs=inputs,
+      hashed_output=True,
+      num_buckets=num_buckets,
+      hash_key=hash_key,
+      name=name)
+
+
+_DEFAULT_HASH_KEY = 0xDECAFCAFFE
+
+
+def _sparse_cross_internal(
+    inputs, hashed_output=False, num_buckets=0, hash_key=None, name=None):
+  """See gen_sparse_ops._sparse_cross."""
+  if not isinstance(inputs, list):
+    raise TypeError("Inputs must be a list")
+  if not all(isinstance(i, sparse_tensor.SparseTensor) or
+             isinstance(i, ops.Tensor) for i in inputs):
+    raise TypeError("All inputs must be SparseTensors")
+
+  sparse_inputs = [i for i in inputs
+                   if isinstance(i, sparse_tensor.SparseTensor)]
+  dense_inputs = [i for i in inputs
+                  if not isinstance(i, sparse_tensor.SparseTensor)]
+
+  indices = [sp_input.indices for sp_input in sparse_inputs]
+  values = [sp_input.values for sp_input in sparse_inputs]
+  shapes = [sp_input.dense_shape for sp_input in sparse_inputs]
+  out_type = dtypes.int64 if hashed_output else dtypes.string
+
+  internal_type = dtypes.string
+  for i in range(len(values)):
+    if values[i].dtype != dtypes.string:
+      values[i] = math_ops.to_int64(values[i])
+      internal_type = dtypes.int64
+  for i in range(len(dense_inputs)):
+    if dense_inputs[i].dtype != dtypes.string:
+      dense_inputs[i] = math_ops.to_int64(dense_inputs[i])
+      internal_type = dtypes.int64
+
+  indices_out, values_out, shape_out = gen_sparse_ops._sparse_cross(
+      indices=indices,
+      values=values,
+      shapes=shapes,
+      dense_inputs=dense_inputs,
+      hashed_output=hashed_output,
+      num_buckets=num_buckets,
+      hash_key=hash_key or _DEFAULT_HASH_KEY,
+      out_type=out_type,
+      internal_type=internal_type,
+      name=name)
+
+  return sparse_tensor.SparseTensor(indices_out, values_out, shape_out)
+
+
 def sparse_dense_cwise_add(sp_t, dense_t):
   """Adds up a SparseTensor and a dense Tensor, using these special rules:
 
@@ -368,8 +500,12 @@ def sparse_reorder(sp_input, name=None):
   reordered_ind, reordered_val = (gen_sparse_ops._sparse_reorder(
       sp_input.indices, sp_input.values, sp_input.dense_shape, name=name))
 
-  return sparse_tensor.SparseTensor(reordered_ind, reordered_val,
-                                    array_ops.identity(sp_input.dense_shape))
+  if sp_input.get_shape().is_fully_defined():
+    dense_shape = sp_input.get_shape().as_list()
+  else:
+    dense_shape = array_ops.identity(sp_input.dense_shape)
+
+  return sparse_tensor.SparseTensor(reordered_ind, reordered_val, dense_shape)
 
 
 def sparse_reshape(sp_input, shape, name=None):
@@ -416,13 +552,30 @@ def sparse_reshape(sp_input, shape, name=None):
 
   Raises:
     TypeError: If `sp_input` is not a `SparseTensor`.
+    ValueError:  If argument `shape` requests a `SparseTensor` with a different
+      number of elements than `sp_input`.
   """
   sp_input = _convert_to_sparse_tensor(sp_input)
+  shape = math_ops.cast(shape, dtype=dtypes.int64)
 
   with ops.name_scope(name, "SparseReshape", [sp_input]) as name:
     reshaped_ind, reshaped_shape = gen_sparse_ops._sparse_reshape(
         sp_input.indices, sp_input.dense_shape, shape, name=name)
 
+    reshaped_shape_const = tensor_util.constant_value(shape)
+    if (reshaped_shape_const is not None
+        and sp_input.get_shape().is_fully_defined()):
+      # Don't deal with inferred dimensions. That would add significant code.
+      if all(n >= 0 for n in reshaped_shape_const):
+        reshaped_size = np.prod(reshaped_shape_const)
+        in_shape_size = np.prod(sp_input.get_shape().as_list())
+        if reshaped_size != in_shape_size:
+          raise ValueError(
+              "Cannot reshape a tensor with %d elements to shape %s "
+              "(%d elements)."
+              % (in_shape_size, reshaped_shape_const, reshaped_size))
+        reshaped_shape = reshaped_shape_const
+
     return sparse_tensor.SparseTensor(
         reshaped_ind, array_ops.identity(sp_input.values),
         reshaped_shape)
@@ -986,6 +1139,8 @@ def sparse_reset_shape(sp_input, new_shape=None):
     TypeError: If `sp_input` is not a `SparseTensor`.
     ValueError: If `new_shape` represents a tensor with a different rank from
       that of `sp_input` (if shapes are known when graph is constructed).
+    ValueError:  If `new_shape` is determined during graph build to have
+      dimension sizes that are too small.
     OpError:
       - If `new_shape` has dimension sizes that are too small.
       - If shapes are not known during graph construction time, and during run
@@ -1009,14 +1164,27 @@ def sparse_reset_shape(sp_input, new_shape=None):
     # error before the sparse_tensor.SparseTensor catches it.
     output_shape_tensor.get_shape()[0].merge_with(in_shape.get_shape()[0])
 
-    # For cases where shape is not known during graph construction.
-    output_shape_tensor = control_flow_ops.with_dependencies(
-        [check_ops.assert_equal(
-            array_ops.shape(in_shape), array_ops.shape(output_shape_tensor))],
-        output_shape_tensor)
-    output_shape_tensor = control_flow_ops.with_dependencies(
-        [check_ops.assert_less_equal(in_shape, output_shape_tensor)],
+    output_shape_tensor_const = tensor_util.constant_value(
         output_shape_tensor)
+    # For cases where all shapes are known during graph construction
+    if (output_shape_tensor_const is not None
+        and sp_input.get_shape().is_fully_defined()):
+      in_shape_const = np.array(sp_input.get_shape().as_list())
+      if not np.all(in_shape_const <= output_shape_tensor_const):
+        raise ValueError(
+            "Requested new_shape should have dimension sizes >= sp_input.shape."
+            "  Found new_shape (%s), sp_input.shape (%s)."
+            % (in_shape_const, output_shape_tensor_const))
+      output_shape_tensor = output_shape_tensor_const
+    else:
+      # For cases where shape is not known during graph construction.
+      output_shape_tensor = control_flow_ops.with_dependencies(
+          [check_ops.assert_equal(
+              array_ops.shape(in_shape), array_ops.shape(output_shape_tensor))],
+          output_shape_tensor)
+      output_shape_tensor = control_flow_ops.with_dependencies(
+          [check_ops.assert_less_equal(in_shape, output_shape_tensor)],
+          output_shape_tensor)
 
   return sparse_tensor.SparseTensor(in_indices, in_values, output_shape_tensor)
 
@@ -1239,7 +1407,45 @@ def sparse_tensor_dense_matmul(sp_a,
     A should be sorted in order of increasing dimension 1 (i.e., "column major"
     order instead of "row major" order).
 
-  Deciding when to use sparse_tensor_dense_matmul vs. matmul(a_is_sparse=True):
+  Using `tf.nn.embedding_lookup_sparse` for sparse multiplication:
+
+  It's not obvious but you can consider `embedding_lookup_sparse` as another
+  sparse and dense multiplication. In some situations, you may prefer to use
+  `embedding_lookup_sparse` even though you're not dealing with embeddings.
+
+  There are two questions to ask in the decision process: Do you need gradients
+  computed as sparse too? Is your sparse data represented as two
+  `SparseTensor`s: ids and values? There is more explanation about data format
+  below. If you answer any of these questions as yes, consider using
+  `tf.nn.embedding_lookup_sparse`.
+
+  Following explains differences between the expected SparseTensors:
+  For example if dense form of your sparse data has shape `[3, 5]` and values:
+
+      [[  a      ]
+       [b       c]
+       [    d    ]]
+
+
+  `SparseTensor` format expected by `sparse_tensor_dense_matmul`:
+   `sp_a` (indices, values):
+
+      [0, 1]: a
+      [1, 0]: b
+      [1, 4]: c
+      [2, 2]: d
+
+  `SparseTensor` format expected by `embedding_lookup_sparse`:
+   `sp_ids`                 `sp_weights`
+
+      [0, 0]: 1                [0, 0]: a
+      [1, 0]: 0                [1, 0]: b
+      [1, 1]: 4                [1, 1]: c
+      [2, 0]: 2                [2, 0]: d
+
+
+  Deciding when to use `sparse_tensor_dense_matmul` vs.
+  `matmul`(a_is_sparse=True):
 
   There are a number of questions to ask in the decision process, including:
 
@@ -1255,10 +1461,10 @@ def sparse_tensor_dense_matmul(sp_a,
   of the product is small (e.g. matrix-vector multiplication), if
   `sp_a.dense_shape` takes on large values.
 
-  Below is a rough speed comparison between sparse_tensor_dense_matmul,
-  labelled 'sparse', and matmul(a_is_sparse=True), labelled 'dense'.  For purposes of
-  the comparison, the time spent converting from a SparseTensor to a dense
-  Tensor is not included, so it is overly conservative with respect to
+  Below is a rough speed comparison between `sparse_tensor_dense_matmul`,
+  labelled 'sparse', and `matmul`(a_is_sparse=True), labelled 'dense'.  For
+  purposes of the comparison, the time spent converting from a `SparseTensor` to
+  a dense `Tensor` is not included, so it is overly conservative with respect to
   the time ratio.
 
   Benchmark system:
@@ -1560,7 +1766,7 @@ def sparse_transpose(sp_input, perm=None, name=None):
   Raises:
     TypeError: If `sp_input` is not a `SparseTensor`.
   """
-  with ops.op_scope([sp_input], name, "SparseTranspose") as name:
+  with ops.name_scope(name, "SparseTranspose", [sp_input]) as name:
     if perm is None:
       rank = array_ops.rank(sp_input)
       perm = (rank - 1) - math_ops.range(0, rank, 1)
diff --git a/tensorflow/python/ops/special_math_ops.py b/tensorflow/python/ops/special_math_ops.py
index e24246464ec85ebd0ade33ba5bc81a401787535f..851fba0beba3df643844881a5b10766b82f4f730 100644
--- a/tensorflow/python/ops/special_math_ops.py
+++ b/tensorflow/python/ops/special_math_ops.py
@@ -35,19 +35,20 @@ from tensorflow.python.platform import tf_logging as logging
 
 # TODO(b/27419586) Change docstring for required dtype of x once int allowed
 def lbeta(x, name='lbeta'):
-  r"""Computes `ln(|Beta(x)|)`, reducing along the last dimension.
+  r"""Computes \\(ln(|Beta(x)|)\\), reducing along the last dimension.
 
   Given one-dimensional `z = [z_0,...,z_{K-1}]`, we define
 
-  ```Beta(z) = \prod_j Gamma(z_j) / Gamma(\sum_j z_j)```
+  $$Beta(z) = \prod_j Gamma(z_j) / Gamma(\sum_j z_j)$$
 
   And for `n + 1` dimensional `x` with shape `[N1, ..., Nn, K]`, we define
-  `lbeta(x)[i1, ..., in] = Log(|Beta(x[i1, ..., in, :])|)`.  In other words,
-  the last dimension is treated as the `z` vector.
+  $$lbeta(x)[i1, ..., in] = Log(|Beta(x[i1, ..., in, :])|)$$.
+
+  In other words, the last dimension is treated as the `z` vector.
 
   Note that if `z = [u, v]`, then
-  `Beta(z) = int_0^1 t^{u-1} (1 - t)^{v-1} dt`, which defines the traditional
-  bivariate beta function.
+  \\(Beta(z) = int_0^1 t^{u-1} (1 - t)^{v-1} dt\\), which defines the
+  traditional bivariate beta function.
 
   If the last dimension is empty, we follow the convention that the sum over
   the empty set is zero, and the product is one.
@@ -57,7 +58,7 @@ def lbeta(x, name='lbeta'):
     name: A name for the operation (optional).
 
   Returns:
-    The logarithm of `|Beta(x)|` reducing along the last dimension.
+    The logarithm of \\(|Beta(x)|\\) reducing along the last dimension.
   """
   # In the event that the last dimension has zero entries, we return -inf.
   # This is consistent with a convention that the sum over the empty set 0, and
diff --git a/tensorflow/python/ops/standard_ops.py b/tensorflow/python/ops/standard_ops.py
index 09e04d4247c7f169f954ee2cdaf6568f63881419..a6b14f6f6f35a497f20908868a9ef5f2dfeef48e 100644
--- a/tensorflow/python/ops/standard_ops.py
+++ b/tensorflow/python/ops/standard_ops.py
@@ -57,6 +57,8 @@ from tensorflow.python.ops.io_ops import *
 from tensorflow.python.ops.linalg_ops import *
 from tensorflow.python.ops.logging_ops import Print
 from tensorflow.python.ops.logging_ops import get_summary_op
+from tensorflow.python.ops.lookup_ops import initialize_all_tables
+from tensorflow.python.ops.lookup_ops import tables_initializer
 from tensorflow.python.ops.math_ops import *
 from tensorflow.python.ops.numerics import *
 from tensorflow.python.ops.parsing_ops import *
diff --git a/tensorflow/python/ops/state_ops.py b/tensorflow/python/ops/state_ops.py
index f46f56cbb712dec942a62637d413b08146820f00..63394d52145209c9d409d7b5f85fec58039c26bb 100644
--- a/tensorflow/python/ops/state_ops.py
+++ b/tensorflow/python/ops/state_ops.py
@@ -209,7 +209,7 @@ def assign_sub(ref, value, use_locking=None, name=None):
   if ref.dtype._is_ref_dtype:
     return gen_state_ops.assign_sub(
         ref, value, use_locking=use_locking, name=name)
-  return ref.assign_sub(value, name=name)
+  return ref.assign_sub(value)
 
 
 def assign_add(ref, value, use_locking=None, name=None):
@@ -237,7 +237,7 @@ def assign_add(ref, value, use_locking=None, name=None):
   if ref.dtype._is_ref_dtype:
     return gen_state_ops.assign_add(
         ref, value, use_locking=use_locking, name=name)
-  return ref.assign_add(value, name=name)
+  return ref.assign_add(value)
 
 
 def assign(ref, value, validate_shape=None, use_locking=None, name=None):
@@ -269,4 +269,4 @@ def assign(ref, value, validate_shape=None, use_locking=None, name=None):
     return gen_state_ops.assign(
         ref, value, use_locking=use_locking, name=name,
         validate_shape=validate_shape)
-  return ref.assign(value, name=name)
+  return ref.assign(value)
diff --git a/tensorflow/python/ops/tensor_array_ops.py b/tensorflow/python/ops/tensor_array_ops.py
index d1013c53ddfa2c936eaf780e5e135fb979edbdc1..b1c7d74a0cb98e626ea66bc54e6eb3678cc6fc19 100644
--- a/tensorflow/python/ops/tensor_array_ops.py
+++ b/tensorflow/python/ops/tensor_array_ops.py
@@ -28,6 +28,7 @@ from tensorflow.python.framework import tensor_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import gen_data_flow_ops
 from tensorflow.python.ops import math_ops
+from tensorflow.python.util import tf_should_use
 
 
 def _maybe_set_device(handle_op, value_t):
@@ -252,6 +253,7 @@ class TensorArray(object):
         value.set_shape(self._element_shape[0].dims)
       return value
 
+  @tf_should_use.should_use_result
   def write(self, index, value, name=None):
     """Write `value` into index `index` of the TensorArray.
 
@@ -358,6 +360,7 @@ class TensorArray(object):
         value.set_shape([None] + self._element_shape[0].dims[1:])
       return value
 
+  @tf_should_use.should_use_result
   def unstack(self, value, name=None):
     """Unstack the values of a `Tensor` in the TensorArray.
 
@@ -380,6 +383,7 @@ class TensorArray(object):
       return self.scatter(
           indices=math_ops.range(0, num_elements), value=value, name=name)
 
+  @tf_should_use.should_use_result
   def scatter(self, indices, value, name=None):
     """Scatter the values of a `Tensor` in specific indices of a `TensorArray`.
 
@@ -418,6 +422,7 @@ class TensorArray(object):
         ta._merge_element_shape(element_shape)
       return ta
 
+  @tf_should_use.should_use_result
   def split(self, value, lengths, name=None):
     """Split the values of a `Tensor` into the TensorArray.
 
@@ -466,6 +471,7 @@ class TensorArray(object):
       return gen_data_flow_ops._tensor_array_size_v3(
           handle=self._handle, flow_in=self.flow, name=name)
 
+  @tf_should_use.should_use_result
   def close(self, name=None):
     """Close the current TensorArray."""
     with ops.colocate_with(self._handle):
diff --git a/tensorflow/python/ops/variable_scope.py b/tensorflow/python/ops/variable_scope.py
index f81837b73aca56e7abd85ecf6ec02505fd4079a1..a29ddfa9f2f1af2f903c27c6a4acdbafa0ab641f 100644
--- a/tensorflow/python/ops/variable_scope.py
+++ b/tensorflow/python/ops/variable_scope.py
@@ -280,6 +280,17 @@ class _VariableStore(object):
       raise ValueError(
           "Passed a custom_getter which is not callable: %s" % custom_getter)
 
+    # If a *_ref type is passed in an error would be triggered further down the
+    # stack. We prevent this using base_dtype to get a non-ref version of the
+    # type, before doing anything else. When _ref types are removed in favour of
+    # resources, this line can be removed.
+    try:
+      dtype = dtype.base_dtype
+    except AttributeError:
+      # .base_dtype not existing means that we will try and use the raw dtype
+      # which was passed in - this might be a NumPy type which is valid.
+      pass
+
     # This is the main logic of get_variable.  However, custom_getter
     # may override this logic.  So we save it as a callable and pass
     # it to custom_getter.
@@ -1281,7 +1292,7 @@ def _pure_variable_scope(name_or_scope,
       well-defined semantics. Defaults to False (will later change to True).
 
   Yields:
-    A scope that can be to captured and reused.
+    A scope that can be captured and reused.
 
   Raises:
     ValueError: when trying to reuse within a create scope, or create within
@@ -1487,6 +1498,9 @@ def variable_scope(name_or_scope,
   Note that the `reuse` flag is inherited: if we open a reusing scope,
   then all its sub-scopes become reusing as well.
 
+  A note about name scoping: Setting `reuse` does not impact the naming of other
+  ops such as mult. See related discussion on [github#6189](https://github.com/tensorflow/tensorflow/issues/6189)
+
   Args:
     name_or_scope: `string` or `VariableScope`: the scope to open.
     default_name: The default name to use if the `name_or_scope` argument is
diff --git a/tensorflow/python/ops/variables.py b/tensorflow/python/ops/variables.py
index 8b508e45a40822cbd0ffc35bca343b86752a62b4..d5ffd9ac3330dc9ca20fd10ed61a4fb44e13fceb 100644
--- a/tensorflow/python/ops/variables.py
+++ b/tensorflow/python/ops/variables.py
@@ -24,9 +24,11 @@ from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_shape
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import gen_array_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import state_ops
 from tensorflow.python.util import compat
+from tensorflow.python.util import tf_should_use
 from tensorflow.python.util.deprecation import deprecated
 
 
@@ -568,6 +570,29 @@ class Variable(object):
         sparse_delta.values,
         use_locking=use_locking)
 
+  def _strided_slice_assign(self,
+                            begin,
+                            end,
+                            strides,
+                            value,
+                            name,
+                            begin_mask,
+                            end_mask,
+                            ellipsis_mask,
+                            new_axis_mask,
+                            shrink_axis_mask):
+    return gen_array_ops.strided_slice_assign(ref=self._ref(),
+                                              begin=begin,
+                                              end=end,
+                                              strides=strides,
+                                              value=value,
+                                              name=name,
+                                              begin_mask=begin_mask,
+                                              end_mask=end_mask,
+                                              ellipsis_mask=ellipsis_mask,
+                                              new_axis_mask=new_axis_mask,
+                                              shrink_axis_mask=shrink_axis_mask)
+
   def count_up_to(self, limit):
     """Increments this variable until it reaches `limit`.
 
@@ -1152,6 +1177,7 @@ def variables_initializer(var_list, name="init"):
   return control_flow_ops.no_op(name=name)
 
 
+@tf_should_use.should_use_result
 @deprecated("2017-03-02", "Use `tf.variables_initializer` instead.")
 def initialize_variables(var_list, name="init"):
   """See `tf.variables_initializer`."""
@@ -1169,6 +1195,7 @@ def global_variables_initializer():
   return variables_initializer(global_variables())
 
 
+@tf_should_use.should_use_result
 @deprecated("2017-03-02", "Use `tf.global_variables_initializer` instead.")
 def initialize_all_variables():
   """See `tf.global_variables_initializer`."""
@@ -1186,12 +1213,14 @@ def local_variables_initializer():
   return variables_initializer(local_variables())
 
 
+@tf_should_use.should_use_result
 @deprecated("2017-03-02", "Use `tf.local_variables_initializer` instead.")
 def initialize_local_variables():
   """See `tf.local_variables_initializer`."""
   return local_variables_initializer()
 
 
+@tf_should_use.should_use_result
 def is_variable_initialized(variable):
   """Tests if a variable has been initialized.
 
@@ -1205,6 +1234,7 @@ def is_variable_initialized(variable):
   return state_ops.is_variable_initialized(variable)
 
 
+@tf_should_use.should_use_result
 def assert_variables_initialized(var_list=None):
   """Returns an Op to check if variables are initialized.
 
@@ -1246,6 +1276,7 @@ def assert_variables_initialized(var_list=None):
       return array_ops.stack(ranks)
 
 
+@tf_should_use.should_use_result
 def report_uninitialized_variables(var_list=None,
                                    name="report_uninitialized_variables"):
   """Adds ops to list the names of uninitialized variables.
diff --git a/tensorflow/python/ops/weights_broadcast_ops.py b/tensorflow/python/ops/weights_broadcast_ops.py
index 257b9f1faa4c2c7bd492c882bc2bdcb66b9d22d0..35e93249c31b7446be387bf165284fe54fcaa8e0 100644
--- a/tensorflow/python/ops/weights_broadcast_ops.py
+++ b/tensorflow/python/ops/weights_broadcast_ops.py
@@ -97,9 +97,10 @@ def assert_broadcastable(weights, values):
         return control_flow_ops.no_op(name="static_scalar_check_success")
       if weights_rank_static != values_rank_static:
         raise ValueError(
-            "%s values.rank=%s. weights.rank=%s." % (
+            "%s values.rank=%s. weights.rank=%s."
+            " values.shape=%s. weights.shape=%s." % (
                 _ASSERT_BROADCASTABLE_ERROR_PREFIX, values_rank_static,
-                weights_rank_static))
+                weights_rank_static, values.shape, weights.shape))
       weights_shape_static = tensor_util.constant_value(weights_shape)
       values_shape_static = tensor_util.constant_value(values_shape)
       if weights_shape_static is not None and values_shape_static is not None:
diff --git a/tensorflow/python/saved_model/README.md b/tensorflow/python/saved_model/README.md
index 8422fb6404b556b79777139eb517f9b13d8f92cb..f19127ecd54af161a7bd20033d9e253ae0028c10 100644
--- a/tensorflow/python/saved_model/README.md
+++ b/tensorflow/python/saved_model/README.md
@@ -102,7 +102,7 @@ The typical usage of `builder` is as follows:
 ~~~python
 export_dir = ...
 ...
-builder = saved_model_builder.SavedModelBuilder(export_dir)
+builder = tf.saved_model_builder.SavedModelBuilder(export_dir)
 with tf.Session(graph=tf.Graph()) as sess:
   ...
   builder.add_meta_graph_and_variables(sess,
@@ -132,7 +132,7 @@ the specific meta graph def, will be restored into the supplied session.
 export_dir = ...
 ...
 with tf.Session(graph=tf.Graph()) as sess:
-  loader.load(sess, [tag_constants.TRAINING], export_dir)
+  tf.saved_model.loader.load(sess, [tag_constants.TRAINING], export_dir)
   ...
 ~~~
 
diff --git a/tensorflow/python/saved_model/builder_impl.py b/tensorflow/python/saved_model/builder_impl.py
index d075a04ca2a49d9b2692490eaf4729da10b0fe0d..6899cb10a985080ad79f4574c8a8b78b57ed2038 100644
--- a/tensorflow/python/saved_model/builder_impl.py
+++ b/tensorflow/python/saved_model/builder_impl.py
@@ -57,7 +57,7 @@ class SavedModelBuilder(object):
   Typical usage for the `SavedModelBuilder`:
   ```python
   ...
-  builder = saved_model.builder.SavedModelBuilder(export_dir)
+  builder = tf.saved_model.builder.SavedModelBuilder(export_dir)
 
   with tf.Session(graph=tf.Graph()) as sess:
     ...
@@ -96,53 +96,13 @@ class SavedModelBuilder(object):
     # weights.
     self._has_saved_variables = False
 
-  def _asset_path_from_tensor(self, path_tensor):
-    """Returns the filepath value stored in constant `path_tensor`.
-
-    Args:
-      path_tensor: Tensor of a file-path.
-
-    Returns:
-      The string value i.e. path of the tensor, if valid.
-
-    Raises:
-      TypeError if tensor does not match expected op type, dtype or value.
-    """
-    if not isinstance(path_tensor, ops.Tensor):
-      raise TypeError("Asset path tensor must be a Tensor.")
-    if path_tensor.op.type != "Const":
-      raise TypeError("Asset path tensor must be of type constant.")
-    if path_tensor.dtype != dtypes.string:
-      raise TypeError("Asset path tensor must be of dtype string.")
-    str_values = path_tensor.op.get_attr("value").string_val
-    if len(str_values) != 1:
-      raise TypeError("Asset path tensor must be a scalar.")
-    return str_values[0]
-
-  def _add_asset_to_collection(self, asset_filename, asset_tensor):
-    """Builds an asset proto and adds it to the asset collection of the graph.
-
-    Args:
-      asset_filename: The filename of the asset to be added.
-      asset_tensor: The asset tensor used to populate the tensor info of the
-          asset proto.
-    """
-    asset_proto = meta_graph_pb2.AssetFileDef()
-    asset_proto.filename = asset_filename
-    asset_proto.tensor_info.name = asset_tensor.name
-
-    asset_any_proto = Any()
-    asset_any_proto.Pack(asset_proto)
-    ops.add_to_collection(constants.ASSETS_KEY, asset_any_proto)
-
   def _save_and_write_assets(self, assets_collection_to_add=None):
     """Saves asset to the meta graph and writes asset files to disk.
 
     Args:
       assets_collection_to_add: The collection where the asset paths are setup.
     """
-    asset_source_filepath_list = self._maybe_save_assets(
-        assets_collection_to_add)
+    asset_source_filepath_list = _maybe_save_assets(assets_collection_to_add)
 
     # Return if there are no assets to write.
     if len(asset_source_filepath_list) is 0:
@@ -201,42 +161,6 @@ class SavedModelBuilder(object):
         raise TypeError("main_op needs to be an Operation: %r" % main_op)
       ops.add_to_collection(constants.MAIN_OP_KEY, main_op)
 
-  def _maybe_save_assets(self, assets_collection_to_add=None):
-    """Saves assets to the meta graph.
-
-    Args:
-      assets_collection_to_add: The collection where the asset paths are setup.
-
-    Returns:
-      The list of filepaths to the assets in the assets collection.
-
-    Raises:
-      ValueError: Indicating an invalid filepath tensor.
-    """
-    asset_source_filepath_list = []
-
-    if assets_collection_to_add is None:
-      tf_logging.info("No assets to save.")
-      return asset_source_filepath_list
-
-    # Iterate over the supplied asset collection, build the `AssetFile` proto
-    # and add them to the collection with key `constants.ASSETS_KEY`, in the
-    # graph.
-    for asset_tensor in assets_collection_to_add:
-      asset_source_filepath = self._asset_path_from_tensor(asset_tensor)
-      if not asset_source_filepath:
-        raise ValueError("Invalid asset filepath tensor %s" % asset_tensor)
-
-      asset_source_filename = os.path.basename(asset_source_filepath)
-
-      # Build `AssetFile` proto and add it to the asset collection in the graph.
-      self._add_asset_to_collection(asset_source_filename, asset_tensor)
-
-      asset_source_filepath_list.append(asset_source_filepath)
-
-    tf_logging.info("Assets added to graph.")
-    return asset_source_filepath_list
-
   def _tag_and_add_meta_graph(self, meta_graph_def, tags, signature_def_map):
     """Tags the meta graph def and adds it to the SavedModel.
 
@@ -475,3 +399,81 @@ class SavedModelBuilder(object):
     tf_logging.info("SavedModel written to: %s", path)
 
     return path
+
+
+def _maybe_save_assets(assets_collection_to_add=None):
+  """Saves assets to the meta graph.
+
+  Args:
+    assets_collection_to_add: The collection where the asset paths are setup.
+
+  Returns:
+    The list of filepaths to the assets in the assets collection.
+
+  Raises:
+    ValueError: Indicating an invalid filepath tensor.
+  """
+  asset_source_filepath_list = []
+
+  if assets_collection_to_add is None:
+    tf_logging.info("No assets to save.")
+    return asset_source_filepath_list
+
+  # Iterate over the supplied asset collection, build the `AssetFile` proto
+  # and add them to the collection with key `constants.ASSETS_KEY`, in the
+  # graph.
+  for asset_tensor in assets_collection_to_add:
+    asset_source_filepath = _asset_path_from_tensor(asset_tensor)
+    if not asset_source_filepath:
+      raise ValueError("Invalid asset filepath tensor %s" % asset_tensor)
+
+    asset_source_filename = os.path.basename(asset_source_filepath)
+
+    # Build `AssetFile` proto and add it to the asset collection in the graph.
+    _add_asset_to_collection(asset_source_filename, asset_tensor)
+
+    asset_source_filepath_list.append(asset_source_filepath)
+
+  tf_logging.info("Assets added to graph.")
+  return asset_source_filepath_list
+
+
+def _asset_path_from_tensor(path_tensor):
+  """Returns the filepath value stored in constant `path_tensor`.
+
+  Args:
+    path_tensor: Tensor of a file-path.
+
+  Returns:
+    The string value i.e. path of the tensor, if valid.
+
+  Raises:
+    TypeError if tensor does not match expected op type, dtype or value.
+  """
+  if not isinstance(path_tensor, ops.Tensor):
+    raise TypeError("Asset path tensor must be a Tensor.")
+  if path_tensor.op.type != "Const":
+    raise TypeError("Asset path tensor must be of type constant.")
+  if path_tensor.dtype != dtypes.string:
+    raise TypeError("Asset path tensor must be of dtype string.")
+  str_values = path_tensor.op.get_attr("value").string_val
+  if len(str_values) != 1:
+    raise TypeError("Asset path tensor must be a scalar.")
+  return str_values[0]
+
+
+def _add_asset_to_collection(asset_filename, asset_tensor):
+  """Builds an asset proto and adds it to the asset collection of the graph.
+
+  Args:
+    asset_filename: The filename of the asset to be added.
+    asset_tensor: The asset tensor used to populate the tensor info of the
+        asset proto.
+  """
+  asset_proto = meta_graph_pb2.AssetFileDef()
+  asset_proto.filename = asset_filename
+  asset_proto.tensor_info.name = asset_tensor.name
+
+  asset_any_proto = Any()
+  asset_any_proto.Pack(asset_proto)
+  ops.add_to_collection(constants.ASSETS_KEY, asset_any_proto)
diff --git a/tensorflow/python/saved_model/loader_impl.py b/tensorflow/python/saved_model/loader_impl.py
index a9d999dad3a55b45d58395f7be535d236deaecc9..32526521749d26c02e29f8bcda7b934faecfddfe 100644
--- a/tensorflow/python/saved_model/loader_impl.py
+++ b/tensorflow/python/saved_model/loader_impl.py
@@ -195,46 +195,47 @@ def load(sess, tags, export_dir, **saver_kwargs):
   Raises:
     RuntimeError: MetaGraphDef associated with the tags cannot be found.
   """
-  # Build the SavedModel protocol buffer and find the requested meta graph def.
-  saved_model = _parse_saved_model(export_dir)
-  found_match = False
-  for meta_graph_def in saved_model.meta_graphs:
-    if set(meta_graph_def.meta_info_def.tags) == set(tags):
-      meta_graph_def_to_load = meta_graph_def
-      found_match = True
-      break
-
-  if not found_match:
-    raise RuntimeError("MetaGraphDef associated with tags " + str(tags).strip(
-        "[]") + " could not be found in SavedModel")
-
-  # Build a saver by importing the meta graph def to load.
-  saver = tf_saver.import_meta_graph(meta_graph_def_to_load, **saver_kwargs)
-
-  if saver:
-    # Build the checkpoint path where the variables are located.
-    variables_path = os.path.join(
-        compat.as_bytes(export_dir),
-        compat.as_bytes(constants.VARIABLES_DIRECTORY),
-        compat.as_bytes(constants.VARIABLES_FILENAME))
-
-    # Restore the variables using the built saver in the provided session.
-    saver.restore(sess, variables_path)
-  else:
-    tf_logging.info("The specified SavedModel has no variables; no "
-                    "checkpoints were restored.")
-
-  # Get asset tensors, if any.
-  asset_tensors_dictionary = _get_asset_tensors(export_dir,
-                                                meta_graph_def_to_load)
-
-  main_op_tensor = _get_main_op_tensor(meta_graph_def_to_load)
-  if main_op_tensor is not None:
-    sess.run(fetches=[main_op_tensor], feed_dict=asset_tensors_dictionary)
-  else:
-    legacy_init_op_tensor = _get_legacy_init_op_tensor(meta_graph_def_to_load)
-    if legacy_init_op_tensor is not None:
-      sess.run(fetches=[legacy_init_op_tensor],
-               feed_dict=asset_tensors_dictionary)
-
-  return meta_graph_def_to_load
+  with sess.graph.as_default():
+    # Build the SavedModel protocol buffer and find requested meta graph def.
+    saved_model = _parse_saved_model(export_dir)
+    found_match = False
+    for meta_graph_def in saved_model.meta_graphs:
+      if set(meta_graph_def.meta_info_def.tags) == set(tags):
+        meta_graph_def_to_load = meta_graph_def
+        found_match = True
+        break
+
+    if not found_match:
+      raise RuntimeError("MetaGraphDef associated with tags " + str(tags).strip(
+          "[]") + " could not be found in SavedModel")
+
+    # Build a saver by importing the meta graph def to load.
+    saver = tf_saver.import_meta_graph(meta_graph_def_to_load, **saver_kwargs)
+
+    if saver:
+      # Build the checkpoint path where the variables are located.
+      variables_path = os.path.join(
+          compat.as_bytes(export_dir),
+          compat.as_bytes(constants.VARIABLES_DIRECTORY),
+          compat.as_bytes(constants.VARIABLES_FILENAME))
+
+      # Restore the variables using the built saver in the provided session.
+      saver.restore(sess, variables_path)
+    else:
+      tf_logging.info("The specified SavedModel has no variables; no "
+                      "checkpoints were restored.")
+
+    # Get asset tensors, if any.
+    asset_tensors_dictionary = _get_asset_tensors(export_dir,
+                                                  meta_graph_def_to_load)
+
+    main_op_tensor = _get_main_op_tensor(meta_graph_def_to_load)
+    if main_op_tensor is not None:
+      sess.run(fetches=[main_op_tensor], feed_dict=asset_tensors_dictionary)
+    else:
+      legacy_init_op_tensor = _get_legacy_init_op_tensor(meta_graph_def_to_load)
+      if legacy_init_op_tensor is not None:
+        sess.run(
+            fetches=[legacy_init_op_tensor], feed_dict=asset_tensors_dictionary)
+
+    return meta_graph_def_to_load
diff --git a/tensorflow/python/saved_model/main_op_impl.py b/tensorflow/python/saved_model/main_op_impl.py
index 66cf9d4d8af53b2d22f14d54ab054bcfa49df967..355fd57bf1d2166f58a5fdc95d04695ea05b56b3 100644
--- a/tensorflow/python/saved_model/main_op_impl.py
+++ b/tensorflow/python/saved_model/main_op_impl.py
@@ -20,7 +20,7 @@ from __future__ import print_function
 
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import control_flow_ops
-from tensorflow.python.ops import data_flow_ops as tf_data_flow_ops
+from tensorflow.python.ops import lookup_ops
 from tensorflow.python.ops import variables
 
 
@@ -35,7 +35,7 @@ def main_op():
   """
   init = variables.global_variables_initializer()
   init_local = variables.local_variables_initializer()
-  init_tables = tf_data_flow_ops.tables_initializer()
+  init_tables = lookup_ops.tables_initializer()
   return control_flow_ops.group(init, init_local, init_tables)
 
 
diff --git a/tensorflow/python/saved_model/saved_model_test.py b/tensorflow/python/saved_model/saved_model_test.py
index a81f74417529eb84763656ae5407bbcfe4a9e077..fcd6bc39547066617be14b8f9e70127dd7fdadab 100644
--- a/tensorflow/python/saved_model/saved_model_test.py
+++ b/tensorflow/python/saved_model/saved_model_test.py
@@ -151,6 +151,27 @@ class SavedModelTest(test.TestCase):
                                    constants.SAVED_MODEL_FILENAME_PBTXT):
         loader.load(sess, ["foo"], export_dir)
 
+  def testVerifySessionGraphUsage(self):
+    export_dir = os.path.join(test.get_temp_dir(),
+                              "test_verify_session_graph_usage")
+    builder = saved_model_builder.SavedModelBuilder(export_dir)
+
+    with self.test_session(graph=ops.Graph()) as sess:
+      self._init_and_validate_variable(sess, "v", 42)
+      builder.add_meta_graph_and_variables(sess, [tag_constants.TRAINING])
+
+    # Save the SavedModel to disk.
+    builder.save()
+
+    # Build a session and supply it to the load operation.
+    sess = session.Session(graph=ops.Graph())
+    loader.load(sess, [tag_constants.TRAINING], export_dir)
+
+    # Check the variable within the scope of the session and its graph.
+    with sess:
+      self.assertEqual(
+          42, ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)[0].eval())
+
   def testSequence(self):
     export_dir = os.path.join(test.get_temp_dir(), "test_sequence")
     builder = saved_model_builder.SavedModelBuilder(export_dir)
diff --git a/tensorflow/python/summary/summary.py b/tensorflow/python/summary/summary.py
index d130588fa2978156a9e4b5aacc5a3aa91fad7bd1..efcc59465a6a23c8e2a0008a6507e9a6d4a46aa3 100644
--- a/tensorflow/python/summary/summary.py
+++ b/tensorflow/python/summary/summary.py
@@ -190,6 +190,11 @@ def histogram(name, values, collections=None):
   # pylint: disable=line-too-long
   """Outputs a `Summary` protocol buffer with a histogram.
 
+  Adding a histogram summary makes it possible to visualize your data's
+  distribution in TensorBoard. You can see a detailed explanation of the
+  TensorBoard histogram dashboard
+  [here](https://www.tensorflow.org/get_started/tensorboard_histograms).
+
   The generated
   [`Summary`](https://www.tensorflow.org/code/tensorflow/core/framework/summary.proto)
   has one summary value containing a histogram for `values`.
diff --git a/tensorflow/python/tools/freeze_graph.py b/tensorflow/python/tools/freeze_graph.py
index 38e3e17a88424735e19e6adf5c7e1e0a1bec3ca3..bd046a7fd099c71518e694c7a44c62616c960178 100644
--- a/tensorflow/python/tools/freeze_graph.py
+++ b/tensorflow/python/tools/freeze_graph.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Converts checkpoint variables into Const ops in a standalone GraphDef file.
+r"""Converts checkpoint variables into Const ops in a standalone GraphDef file.
 
 This script is designed to take a GraphDef proto, a SaverDef proto, and a set of
 variable values stored in a checkpoint file, and output a GraphDef with all of
@@ -55,29 +55,20 @@ from tensorflow.python.training import saver as saver_lib
 FLAGS = None
 
 
-def freeze_graph(input_graph,
-                 input_saver,
-                 input_binary,
-                 input_checkpoint,
-                 output_node_names,
-                 restore_op_name,
-                 filename_tensor_name,
-                 output_graph,
-                 clear_devices,
-                 initializer_nodes,
-                 variable_names_blacklist=""):
+def freeze_graph_with_def_protos(
+    input_graph_def,
+    input_saver_def,
+    input_checkpoint,
+    output_node_names,
+    restore_op_name,
+    filename_tensor_name,
+    output_graph,
+    clear_devices,
+    initializer_nodes,
+    variable_names_blacklist=""):
   """Converts all variables in a graph and checkpoint into constants."""
-
   del restore_op_name, filename_tensor_name  # Unused by updated loading code.
 
-  if not gfile.Exists(input_graph):
-    print("Input graph file '" + input_graph + "' does not exist!")
-    return -1
-
-  if input_saver and not gfile.Exists(input_saver):
-    print("Input saver file '" + input_saver + "' does not exist!")
-    return -1
-
   # 'input_checkpoint' may be a prefix if we're using Saver V2 format
   if not saver_lib.checkpoint_exists(input_checkpoint):
     print("Input checkpoint '" + input_checkpoint + "' doesn't exist!")
@@ -87,13 +78,6 @@ def freeze_graph(input_graph,
     print("You need to supply the name of a node to --output_node_names.")
     return -1
 
-  input_graph_def = graph_pb2.GraphDef()
-  mode = "rb" if input_binary else "r"
-  with gfile.FastGFile(input_graph, mode) as f:
-    if input_binary:
-      input_graph_def.ParseFromString(f.read())
-    else:
-      text_format.Merge(f.read(), input_graph_def)
   # Remove all the explicit device specifications for this node. This helps to
   # make the graph more portable.
   if clear_devices:
@@ -103,15 +87,9 @@ def freeze_graph(input_graph,
   _ = importer.import_graph_def(input_graph_def, name="")
 
   with session.Session() as sess:
-    if input_saver:
-      with gfile.FastGFile(input_saver, mode) as f:
-        saver_def = saver_pb2.SaverDef()
-        if input_binary:
-          saver_def.ParseFromString(f.read())
-        else:
-          text_format.Merge(f.read(), saver_def)
-        saver = saver_lib.Saver(saver_def=saver_def)
-        saver.restore(sess, input_checkpoint)
+    if input_saver_def:
+      saver = saver_lib.Saver(saver_def=input_saver_def)
+      saver.restore(sess, input_checkpoint)
     else:
       var_list = {}
       reader = pywrap_tensorflow.NewCheckpointReader(input_checkpoint)
@@ -142,6 +120,65 @@ def freeze_graph(input_graph,
   print("%d ops in the final graph." % len(output_graph_def.node))
 
 
+def _parse_input_graph_proto(input_graph, input_binary):
+  """Parser input tensorflow graph into GraphDef proto."""
+  if not gfile.Exists(input_graph):
+    print("Input graph file '" + input_graph + "' does not exist!")
+    return -1
+  input_graph_def = graph_pb2.GraphDef()
+  mode = "rb" if input_binary else "r"
+  with gfile.FastGFile(input_graph, mode) as f:
+    if input_binary:
+      input_graph_def.ParseFromString(f.read())
+    else:
+      text_format.Merge(f.read(), input_graph_def)
+  return input_graph_def
+
+
+def _parse_input_saver_proto(input_saver, input_binary):
+  """Parser input tensorflow Saver into SaverDef proto."""
+  if not gfile.Exists(input_saver):
+    print("Input saver file '" + input_saver + "' does not exist!")
+    return -1
+  mode = "rb" if input_binary else "r"
+  with gfile.FastGFile(input_saver, mode) as f:
+    saver_def = saver_pb2.SaverDef()
+    if input_binary:
+      saver_def.ParseFromString(f.read())
+    else:
+      text_format.Merge(f.read(), saver_def)
+  return saver_def
+
+
+def freeze_graph(input_graph,
+                 input_saver,
+                 input_binary,
+                 input_checkpoint,
+                 output_node_names,
+                 restore_op_name,
+                 filename_tensor_name,
+                 output_graph,
+                 clear_devices,
+                 initializer_nodes,
+                 variable_names_blacklist=""):
+  """Converts all variables in a graph and checkpoint into constants."""
+  input_graph_def = _parse_input_graph_proto(input_graph, input_binary)
+  input_saver_def = None
+  if input_saver:
+    input_saver_def = _parse_input_saver_proto(input_saver, input_binary)
+  freeze_graph_with_def_protos(
+      input_graph_def,
+      input_saver_def,
+      input_checkpoint,
+      output_node_names,
+      restore_op_name,
+      filename_tensor_name,
+      output_graph,
+      clear_devices,
+      initializer_nodes,
+      variable_names_blacklist)
+
+
 def main(unused_args):
   freeze_graph(FLAGS.input_graph, FLAGS.input_saver, FLAGS.input_binary,
                FLAGS.input_checkpoint, FLAGS.output_node_names,
diff --git a/tensorflow/python/tools/import_pb_to_tensorboard.py b/tensorflow/python/tools/import_pb_to_tensorboard.py
new file mode 100644
index 0000000000000000000000000000000000000000..caeb04a24bf9795d322f5703893f67b90fec6cab
--- /dev/null
+++ b/tensorflow/python/tools/import_pb_to_tensorboard.py
@@ -0,0 +1,50 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ================================
+"""Imports a protobuf model as a graph in Tensorboard."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.core.framework import graph_pb2
+from tensorflow.python.client import session
+from tensorflow.python.framework import importer
+from tensorflow.python.framework import ops
+from tensorflow.python.platform import gfile
+from tensorflow.python.summary import summary
+
+
+def import_to_tensorboard(model_dir, log_dir):
+  """View an imported protobuf model (`.pb` file) as a graph in Tensorboard.
+
+  Args:
+    model_dir: The location of the protobuf (`pb`) model to visualize
+    log_dir: The location for the Tensorboard log to begin visualisation from.
+
+  Usage:
+    Call this function with your model location and desired log directory.
+    Launch Tensorboard by pointing it to the log directory.
+    View your imported `.pb` model as a graph.
+  """
+  with session.Session(graph=ops.Graph()) as sess:
+    with gfile.FastGFile(model_dir, "rb") as f:
+      graph_def = graph_pb2.GraphDef()
+      graph_def.ParseFromString(f.read())
+      importer.import_graph_def(graph_def)
+
+    pb_visual_writer = summary.FileWriter(log_dir)
+    pb_visual_writer.add_graph(sess.graph)
+    print("Model Imported. Visualize by running: "
+          "> tensorboard --logdir={}".format(log_dir))
diff --git a/tensorflow/python/tools/saved_model_cli.py b/tensorflow/python/tools/saved_model_cli.py
index 1c55c5bf5311b3ffaa81ad6f910682b516fd5f57..e1be3055052aeeb1355bbb71d6232ab8d60cc974 100644
--- a/tensorflow/python/tools/saved_model_cli.py
+++ b/tensorflow/python/tools/saved_model_cli.py
@@ -14,66 +14,8 @@
 # ==============================================================================
 """Command-line interface to inspect and execute a graph in a SavedModel.
 
-If TensorFlow is installed on your system through pip, the 'saved_model_cli'
-binary can be invoked directly from command line.
-
-At a high level, SavedModel CLI allows users to both inspect and execute
-computations on a MetaGraphDef in a SavedModel. These are done through `show`
-and `run` commands. Following is the usage of the two commands. SavedModel
-CLI will also display these information with -h option.
-
-'show' command usage: saved_model_cli show [-h] --dir DIR [--tag_set TAG_SET]
-                          [--signature_def SIGNATURE_DEF_KEY]
-Examples:
-To show all available tag-sets in the SavedModel:
-  $saved_model_cli show --dir /tmp/saved_model
-
-To show all available SignatureDef keys in a MetaGraphDef specified by its
-tag-set:
-  $saved_model_cli show --dir /tmp/saved_model --tag_set serve
-For a MetaGraphDef with multiple tags in the tag-set, all tags must be passed
-in, separated by ',':
-  $saved_model_cli show --dir /tmp/saved_model --tag_set serve,gpu
-
-To show all inputs and outputs TensorInfo for a specific SignatureDef specified
-by the SignatureDef key in a MetaGraphDef:
-  $saved_model_cli show --dir /tmp/saved_model --tag_set serve
-  --signature_def serving_default
-Example output:
-  The given SavedModel SignatureDef contains the following input(s):
-  inputs['input0'] tensor_info:
-      dtype: DT_FLOAT
-      shape: (-1, 1)
-  inputs['input1'] tensor_info:
-      dtype: DT_FLOAT
-      shape: (-1, 1)
-  The given SavedModel SignatureDef contains the following output(s):
-  outputs['output'] tensor_info:
-      dtype: DT_FLOAT
-      shape: (-1, 1)
-  Method name is: tensorflow/serving/regress
-
-To show all available information in the SavedModel:
-  $saved_model_cli show --dir /tmp/saved_model --all
-
-'run' command usage: saved_model_cli run [-h] --dir DIR --tag_set TAG_SET
-                         --signature_def SIGNATURE_DEF_KEY --inputs INPUTS
-                         [--outdir OUTDIR] [--overwrite]
-Examples:
-To run input tensors from files through a MetaGraphDef and save the output
-tensors to files:
-  $saved_model_cli run --dir /tmp/saved_model --tag_set serve
-  --signature_def serving_default --inputs x:0=/tmp/124.npz,x2=/tmp/123.npy
-  --outdir /tmp/out
-
-To observe the intermediate Tensor values in the runtime graph, use the
---tf_debug flag, e.g.:
-  $saved_model_cli run --dir /tmp/saved_model --tag_set serve
-  --signature_def serving_default --inputs x:0=/tmp/124.npz,x2=/tmp/123.npy
-  --outdir /tmp/out --tf_debug
-
-To build this tool from source, run:
-  $bazel build tensorflow/python/tools:saved_model_cli
+For detailed usages and examples, please refer to:
+https://www.tensorflow.org/programmers_guide/saved_model_cli
 
 """
 
@@ -367,7 +309,7 @@ def run_saved_model_with_feed_dict(saved_model_dir, tag_set, signature_def_key,
                                             output_full_path))
 
 
-def preprocess_input_arg_string(inputs_str):
+def preprocess_inputs_arg_string(inputs_str):
   """Parses input arg into dictionary that maps input to file/variable tuple.
 
   Parses input string in the format of, for example,
@@ -375,74 +317,94 @@ def preprocess_input_arg_string(inputs_str):
   dictionary looks like
   {'input_key1': (filename1, variable_name1),
    'input_key2': (file2, None)}
-  , which maps input keys to a tuple of file name and varaible name(None if
+  , which maps input keys to a tuple of file name and variable name(None if
   empty).
 
   Args:
-    inputs_str: A string that specified where to load inputs. Each input is
-        separated by comma.
-        * If the command line arg for inputs is quoted and contains
-            whitespace(s), all whitespaces will be ignored.
+    inputs_str: A string that specified where to load inputs. Inputs are
+    separated by semicolons.
         * For each input key:
-            'input=filename<[variable_name]>'
-        * The "[variable_name]" key is optional. Will be set to None if not
-            specified.
+            '<input_key>=<filename>' or
+            '<input_key>=<filename>[<variable_name>]'
+        * The optional 'variable_name' key will be set to None if not specified.
 
   Returns:
-    A dictionary that maps input keys to a tuple of file name and varaible name.
+    A dictionary that maps input keys to a tuple of file name and variable name.
 
   Raises:
-    RuntimeError: An error when the given input is in a bad format.
+    RuntimeError: An error when the given input string is in a bad format.
   """
   input_dict = {}
-  inputs_raw = inputs_str.split(',')
+  inputs_raw = inputs_str.split(';')
   for input_raw in filter(bool, inputs_raw):  # skip empty strings
-    # Remove quotes and whitespaces
-    input_raw = input_raw.replace('"', '').replace('\'', '').replace(' ', '')
-
     # Format of input=filename[variable_name]'
-    match = re.match(r'^([\w\-]+)=([\w\-.\/]+)\[([\w\-]+)\]$', input_raw)
+    match = re.match(r'([^=]+)=([^\[\]]+)\[([^\[\]]+)\]$', input_raw)
+
     if match:
-      input_dict[match.group(1)] = (match.group(2), match.group(3))
+      input_dict[match.group(1)] = match.group(2), match.group(3)
     else:
       # Format of input=filename'
-      match = re.match(r'^([\w\-]+)=([\w\-.\/]+)$', input_raw)
+      match = re.match(r'([^=]+)=([^\[\]]+)$', input_raw)
       if match:
-        input_dict[match.group(1)] = (match.group(2), None)
+        input_dict[match.group(1)] = match.group(2), None
       else:
         raise RuntimeError(
-            'Input \"%s\" format is incorrect. Please follow \"--inputs '
-            'input_key=file_name[variable_name]\" or input_key=file_name' %
-            input_raw)
+            '--inputs "%s" format is incorrect. Please follow'
+            '"<input_key>=<filename>", or'
+            '"<input_key>=<filename>[<variable_name>]"' % input_raw)
 
   return input_dict
 
 
-def load_inputs_from_input_arg_string(inputs_str):
-  """Parses input arg string and load inputs into a dictionary.
+def preprocess_input_exprs_arg_string(input_exprs_str):
+  """Parses input arg into dictionary that maps input key to python expression.
 
-  Parses input string in the format of, for example,
-  "input1=filename1[variable_name1],input2=filename2" into a
-  dictionary looks like
-  {'input1:0': ndarray_saved_as_variable_name1_in_filename1 ,
-   'input2:0': ndarray_saved_in_filename2}
-  , which maps input keys to a numpy ndarray loaded from file. See Args section
-  for more details on inputs format.
+  Parses input string in the format of 'input_key=<python expression>' into a
+  dictionary that maps each input_key to its python expression.
+
+  Args:
+    input_exprs_str: A string that specifies python expression for input keys.
+    Each input is separated by semicolon. For each input key:
+        'input_key=<python expression>'
+
+  Returns:
+    A dictionary that maps input keys to python expressions.
+
+  Raises:
+    RuntimeError: An error when the given input string is in a bad format.
+  """
+  input_dict = {}
+
+  for input_raw in filter(bool, input_exprs_str.split(';')):
+    if '=' not in input_exprs_str:
+      raise RuntimeError('--input_exprs "%s" format is incorrect. Please follow'
+                         '"<input_key>=<python expression>"' % input_exprs_str)
+    input_key, expr = input_raw.split('=')
+    input_dict[input_key] = expr
+
+  return input_dict
+
+
+def load_inputs_from_input_arg_string(inputs_str, input_exprs_str):
+  """Parses input arg strings and create inputs feed_dict.
+
+  Parses '--inputs' string for inputs to be loaded from file, and parses
+  '--input_exprs' string for inputs to be evaluated from python expression.
 
   Args:
     inputs_str: A string that specified where to load inputs. Each input is
-        separated by comma.
-        * If the command line arg for inputs is quoted and contains
-            whitespace(s), all whitespaces will be ignored.
+        separated by semicolon.
         * For each input key:
-            'input=filename[variable_name]'
+            '<input_key>=<filename>' or
+            '<input_key>=<filename>[<variable_name>]'
+        * The optional 'variable_name' key will be set to None if not specified.
         * File specified by 'filename' will be loaded using numpy.load. Inputs
             can be loaded from only .npy, .npz or pickle files.
         * The "[variable_name]" key is optional depending on the input file type
             as descripted in more details below.
         When loading from a npy file, which always contains a numpy ndarray, the
         content will be directly assigned to the specified input tensor. If a
-        varaible_name is specified, it will be ignored and a warning will be
+        variable_name is specified, it will be ignored and a warning will be
         issued.
         When loading from a npz zip file, user can specify which variable within
         the zip file to load for the input tensor inside the square brackets. If
@@ -453,10 +415,12 @@ def load_inputs_from_input_arg_string(inputs_str):
         to the specified input tensor, else SavedModel CLI will assume a
         dictionary is stored in the pickle file and the value corresponding to
         the variable_name will be used.
+    input_exprs_str: A string that specified python expressions for inputs.
+        * In the format of: '<input_key>=<python expression>'.
+        * numpy module is available as np.
 
   Returns:
-    A dictionary that maps input tensor keys to a numpy ndarray loaded from
-    file.
+    A dictionary that maps input tensor keys to numpy ndarrays.
 
   Raises:
     RuntimeError: An error when a key is specified, but the input file contains
@@ -466,13 +430,14 @@ def load_inputs_from_input_arg_string(inputs_str):
   """
   tensor_key_feed_dict = {}
 
-  for input_tensor_key, (
-      filename,
-      variable_name) in preprocess_input_arg_string(inputs_str).items():
+  inputs = preprocess_inputs_arg_string(inputs_str)
+  input_exprs = preprocess_input_exprs_arg_string(input_exprs_str)
+
+  for input_tensor_key, (filename, variable_name) in inputs.items():
+    data = np.load(filename)
+
     # When a variable_name key is specified for the input file
     if variable_name:
-      data = np.load(filename)
-
       # if file contains a single ndarray, ignore the input name
       if isinstance(data, np.ndarray):
         warnings.warn(
@@ -488,7 +453,6 @@ def load_inputs_from_input_arg_string(inputs_str):
               (filename, variable_name))
     # When no key is specified for the input file.
     else:
-      data = np.load(filename)
       # Check if npz file only contains a single numpy ndarray.
       if isinstance(data, np.lib.npyio.NpzFile):
         variable_name_list = data.files
@@ -500,6 +464,16 @@ def load_inputs_from_input_arg_string(inputs_str):
       else:
         tensor_key_feed_dict[input_tensor_key] = data
 
+  # When input is a python expression:
+  for input_tensor_key, py_expr in input_exprs.items():
+    if input_tensor_key in tensor_key_feed_dict:
+      warnings.warn(
+          'input_key %s has been specified with both --inputs and --input_exprs'
+          ' options. Value in --input_exprs will be used.' % input_tensor_key)
+
+    # ast.literal_eval does not work with numpy expressions
+    tensor_key_feed_dict[input_tensor_key] = eval(py_expr)  # pylint: disable=eval-used
+
   return tensor_key_feed_dict
 
 
@@ -515,7 +489,7 @@ def show(args):
   else:
     # If no tag is specified, display all tag_set, if no signaure_def key is
     # specified, display all SignatureDef keys, else show input output tensor
-    # infomation corresponding to the given SignatureDef key
+    # information corresponding to the given SignatureDef key
     if args.tag_set is None:
       _show_tag_sets(args.dir)
     else:
@@ -530,8 +504,16 @@ def run(args):
 
   Args:
     args: A namespace parsed from command line.
+
+  Raises:
+    AttributeError: An error when neither --inputs nor --input_exprs is passed
+    to run command.
   """
-  tensor_key_feed_dict = load_inputs_from_input_arg_string(args.inputs)
+  if not args.inputs and not args.input_exprs:
+    raise AttributeError(
+        'At least one of --inputs and --input_exprs must be required')
+  tensor_key_feed_dict = load_inputs_from_input_arg_string(
+      args.inputs, args.input_exprs)
   run_saved_model_with_feed_dict(args.dir, args.tag_set, args.signature_def,
                                  tensor_key_feed_dict, args.outdir,
                                  args.overwrite, tf_debug=args.tf_debug)
@@ -559,7 +541,7 @@ def create_parser():
       'MetaGraphDef specified by its tag-set:\n'
       '$saved_model_cli show --dir /tmp/saved_model --tag_set serve\n'
       'For a MetaGraphDef with multiple tags in the tag-set, all tags must be '
-      'passed in, separated by \',\':\n'
+      'passed in, separated by \';\':\n'
       '$saved_model_cli show --dir /tmp/saved_model --tag_set serve,gpu\n\n'
       'To show all inputs and outputs TensorInfo for a specific'
       ' SignatureDef specified by the SignatureDef key in a'
@@ -580,7 +562,7 @@ def create_parser():
   parser_show.add_argument(
       '--all',
       action='store_true',
-      help='if set, will output all infomation in given SavedModel')
+      help='if set, will output all information in given SavedModel')
   parser_show.add_argument(
       '--tag_set',
       type=str,
@@ -601,8 +583,9 @@ def create_parser():
              '$saved_model_cli show --dir /tmp/saved_model --tag_set serve'
              '--signature_def serving_default '
              '--inputs input1_key=/tmp/124.npz[x],input2_key=/tmp/123.npy'
-             '--outdir=/out\n\n'
-             'For more information about input file format, please see:\n')
+             '--input_exprs \'input3_key=np.ones(2)\' --outdir=/out\n\n'
+             'For more information about input file format, please see:\n'
+             'https://www.tensorflow.org/programmers_guide/saved_model_cli\n')
   parser_run = subparsers.add_parser(
       'run', description=run_msg, formatter_class=argparse.RawTextHelpFormatter)
   parser_run.add_argument(
@@ -621,10 +604,15 @@ def create_parser():
       required=True,
       metavar='SIGNATURE_DEF_KEY',
       help='key of SignatureDef to run')
-  msg = ('inputs in the format of \'input_key=filename[variable_name]\', '
-         'separated by \',\'. Inputs can only be loaded from .npy, .npz or '
-         'pickle files. Please use input keys instead of input names.')
-  parser_run.add_argument('--inputs', type=str, required=True, help=msg)
+  msg = ('Loading inputs from files, in the format of \'<input_key>=<filename>,'
+         ' or \'<input_key>=<filename>[<variable_name>]\', separated by \';\'.'
+         ' The file format can only be from .npy, .npz or pickle.')
+  parser_run.add_argument('--inputs', type=str, default='', help=msg)
+  msg = ('Specifying inputs by python expressions, in the format of'
+         ' "<input_key>=\'<python expression>\'", separated by \';\'. '
+         'numpy module is available as \'np\'. '
+         'Will override duplicate input_keys from --inputs option.')
+  parser_run.add_argument('--input_exprs', type=str, default='', help=msg)
   parser_run.add_argument(
       '--outdir',
       type=str,
diff --git a/tensorflow/python/tools/saved_model_cli_test.py b/tensorflow/python/tools/saved_model_cli_test.py
index a321ada2dd531843ad71a51117797caa9e7e796c..8f79c888ebd3c82affde5d17ff0c5db2232a6c46 100644
--- a/tensorflow/python/tools/saved_model_cli_test.py
+++ b/tensorflow/python/tools/saved_model_cli_test.py
@@ -201,28 +201,37 @@ Method name is: tensorflow/serving/predict"""
     self.assertEqual(err.getvalue().strip(), '')
 
   def testInputPreProcessFormats(self):
-    input_str = 'input1=/path/file.txt[ab3], input2=file2,,'
-    input_dict = saved_model_cli.preprocess_input_arg_string(input_str)
+    input_str = 'input1=/path/file.txt[ab3];input2=file2'
+    input_expr_str = 'input3=np.zeros([2,2]);input4=[4,5]'
+    input_dict = saved_model_cli.preprocess_inputs_arg_string(input_str)
+    input_expr_dict = saved_model_cli.preprocess_input_exprs_arg_string(
+        input_expr_str)
     self.assertTrue(input_dict['input1'] == ('/path/file.txt', 'ab3'))
     self.assertTrue(input_dict['input2'] == ('file2', None))
-
-  def testInputPreProcessQuoteAndWhitespace(self):
-    input_str = '\' input1 = file[v_1]\', input2=file ["sd"] '
-    input_dict = saved_model_cli.preprocess_input_arg_string(input_str)
-    self.assertTrue(input_dict['input1'] == ('file', 'v_1'))
-    self.assertTrue(input_dict['input2'] == ('file', 'sd'))
+    self.assertTrue(input_expr_dict['input3'] == 'np.zeros([2,2])')
+    self.assertTrue(input_expr_dict['input4'] == '[4,5]')
     self.assertTrue(len(input_dict) == 2)
+    self.assertTrue(len(input_expr_dict) == 2)
+
+  def testInputPreProcessFileNames(self):
+    input_str = (r'inputx=C:\Program Files\data.npz[v:0];'
+                 r'input:0=c:\PROGRA~1\data.npy')
+    input_dict = saved_model_cli.preprocess_inputs_arg_string(input_str)
+    print(input_dict)
+    self.assertTrue(input_dict['inputx'] == (r'C:\Program Files\data.npz',
+                                             'v:0'))
+    self.assertTrue(input_dict['input:0'] == (r'c:\PROGRA~1\data.npy', None))
 
   def testInputPreProcessErrorBadFormat(self):
     input_str = 'inputx=file[[v1]v2'
     with self.assertRaises(RuntimeError):
-      saved_model_cli.preprocess_input_arg_string(input_str)
+      saved_model_cli.preprocess_inputs_arg_string(input_str)
     input_str = 'inputx:file'
     with self.assertRaises(RuntimeError):
-      saved_model_cli.preprocess_input_arg_string(input_str)
-    input_str = 'inputx=file(v_1)'
+      saved_model_cli.preprocess_inputs_arg_string(input_str)
+    input_str = 'inputx:np.zeros((5))'
     with self.assertRaises(RuntimeError):
-      saved_model_cli.preprocess_input_arg_string(input_str)
+      saved_model_cli.preprocess_input_exprs_arg_string(input_str)
 
   def testInputParserNPY(self):
     x0 = np.array([[1], [2]])
@@ -231,8 +240,8 @@ Method name is: tensorflow/serving/predict"""
     input1_path = os.path.join(test.get_temp_dir(), 'input1.npy')
     np.save(input0_path, x0)
     np.save(input1_path, x1)
-    input_str = 'x0=' + input0_path + '[x0],x1=' + input1_path
-    feed_dict = saved_model_cli.load_inputs_from_input_arg_string(input_str)
+    input_str = 'x0=' + input0_path + '[x0];x1=' + input1_path
+    feed_dict = saved_model_cli.load_inputs_from_input_arg_string(input_str, '')
     self.assertTrue(np.all(feed_dict['x0'] == x0))
     self.assertTrue(np.all(feed_dict['x1'] == x1))
 
@@ -240,8 +249,8 @@ Method name is: tensorflow/serving/predict"""
     x0 = np.array([[1], [2]])
     input_path = os.path.join(test.get_temp_dir(), 'input.npz')
     np.savez(input_path, a=x0)
-    input_str = 'x=' + input_path + '[a],y=' + input_path
-    feed_dict = saved_model_cli.load_inputs_from_input_arg_string(input_str)
+    input_str = 'x=' + input_path + '[a];y=' + input_path
+    feed_dict = saved_model_cli.load_inputs_from_input_arg_string(input_str, '')
     self.assertTrue(np.all(feed_dict['x'] == x0))
     self.assertTrue(np.all(feed_dict['y'] == x0))
 
@@ -258,25 +267,50 @@ Method name is: tensorflow/serving/predict"""
       pickle.dump(pkl1, f)
     with open(input_path2, 'wb') as f:
       pickle.dump(pkl2, f)
-    input_str = 'x=' + input_path0 + '[b],y=' + input_path1 + '[c],'
+    input_str = 'x=' + input_path0 + '[b];y=' + input_path1 + '[c];'
     input_str += 'z=' + input_path2
-    feed_dict = saved_model_cli.load_inputs_from_input_arg_string(input_str)
+    feed_dict = saved_model_cli.load_inputs_from_input_arg_string(input_str, '')
     self.assertTrue(np.all(feed_dict['x'] == pkl0['b']))
     self.assertTrue(np.all(feed_dict['y'] == pkl1))
     self.assertTrue(np.all(feed_dict['z'] == pkl2))
 
-  def testInputParserQuoteAndWhitespace(self):
+  def testInputParserPythonExpression(self):
+    x1 = np.ones([2, 10])
+    x2 = np.array([[1], [2], [3]])
+    x3 = np.mgrid[0:5, 0:5]
+    x4 = [[3], [4]]
+    input_expr_str = ('x1=np.ones([2,10]);x2=np.array([[1],[2],[3]]);'
+                      'x3=np.mgrid[0:5,0:5];x4=[[3],[4]]')
+    feed_dict = saved_model_cli.load_inputs_from_input_arg_string(
+        '', input_expr_str)
+    self.assertTrue(np.all(feed_dict['x1'] == x1))
+    self.assertTrue(np.all(feed_dict['x2'] == x2))
+    self.assertTrue(np.all(feed_dict['x3'] == x3))
+    self.assertTrue(np.all(feed_dict['x4'] == x4))
+
+  def testInputParserBoth(self):
     x0 = np.array([[1], [2]])
-    x1 = np.array(range(6)).reshape(2, 3)
-    input0_path = os.path.join(test.get_temp_dir(), 'input0.npy')
-    input1_path = os.path.join(test.get_temp_dir(), 'input1.npy')
-    np.save(input0_path, x0)
-    np.save(input1_path, x1)
-    input_str = '"x0=' + input0_path + '[x0] , x1 = ' + input1_path + '"'
-    feed_dict = saved_model_cli.load_inputs_from_input_arg_string(input_str)
+    input_path = os.path.join(test.get_temp_dir(), 'input.npz')
+    np.savez(input_path, a=x0)
+    x1 = np.ones([2, 10])
+    input_str = 'x0=' + input_path + '[a]'
+    input_expr_str = 'x1=np.ones([2,10])'
+    feed_dict = saved_model_cli.load_inputs_from_input_arg_string(
+        input_str, input_expr_str)
     self.assertTrue(np.all(feed_dict['x0'] == x0))
     self.assertTrue(np.all(feed_dict['x1'] == x1))
 
+  def testInputParserBothDuplicate(self):
+    x0 = np.array([[1], [2]])
+    input_path = os.path.join(test.get_temp_dir(), 'input.npz')
+    np.savez(input_path, a=x0)
+    x1 = np.ones([2, 10])
+    input_str = 'x0=' + input_path + '[a]'
+    input_expr_str = 'x0=np.ones([2,10])'
+    feed_dict = saved_model_cli.load_inputs_from_input_arg_string(
+        input_str, input_expr_str)
+    self.assertTrue(np.all(feed_dict['x0'] == x1))
+
   def testInputParserErrorNoName(self):
     x0 = np.array([[1], [2]])
     x1 = np.array(range(5))
@@ -284,7 +318,7 @@ Method name is: tensorflow/serving/predict"""
     np.savez(input_path, a=x0, b=x1)
     input_str = 'x=' + input_path
     with self.assertRaises(RuntimeError):
-      saved_model_cli.load_inputs_from_input_arg_string(input_str)
+      saved_model_cli.load_inputs_from_input_arg_string(input_str, '')
 
   def testInputParserErrorWrongName(self):
     x0 = np.array([[1], [2]])
@@ -293,7 +327,7 @@ Method name is: tensorflow/serving/predict"""
     np.savez(input_path, a=x0, b=x1)
     input_str = 'x=' + input_path + '[c]'
     with self.assertRaises(RuntimeError):
-      saved_model_cli.load_inputs_from_input_arg_string(input_str)
+      saved_model_cli.load_inputs_from_input_arg_string(input_str, '')
 
   def testRunCommandExistingOutdir(self):
     self.parser = saved_model_cli.create_parser()
@@ -375,6 +409,16 @@ Method name is: tensorflow/serving/predict"""
     with self.assertRaises(RuntimeError):
       saved_model_cli.run(args)
 
+  def testRunCommandInputNotGivenError(self):
+    self.parser = saved_model_cli.create_parser()
+    base_path = test.test_src_dir_path(SAVED_MODEL_PATH)
+    args = self.parser.parse_args([
+        'run', '--dir', base_path, '--tag_set', 'serve', '--signature_def',
+        'serving_default'
+    ])
+    with self.assertRaises(AttributeError):
+      saved_model_cli.run(args)
+
   def testRunCommandWithDebuggerEnabled(self):
     self.parser = saved_model_cli.create_parser()
     base_path = test.test_src_dir_path(SAVED_MODEL_PATH)
diff --git a/tensorflow/python/tools/strip_unused_lib.py b/tensorflow/python/tools/strip_unused_lib.py
index 8f9e20ab8e7f1ca564c4cbac1ad069e4d4439fa2..b1d195607604b406f68b28824564afc642cc43ad 100644
--- a/tensorflow/python/tools/strip_unused_lib.py
+++ b/tensorflow/python/tools/strip_unused_lib.py
@@ -41,14 +41,26 @@ def strip_unused(input_graph_def, input_node_names, output_node_names,
         a list that specifies one value per input node name.
 
   Returns:
-    A GraphDef with all unnecessary ops removed.
+    A `GraphDef` with all unnecessary ops removed.
+
+  Raises:
+    ValueError: If any element in `input_node_names` refers to a tensor instead
+      of an operation.
+    KeyError: If any element in `input_node_names` is not found in the graph.
   """
+  for name in input_node_names:
+    if ":" in name:
+      raise ValueError("Name '%s' appears to refer to a Tensor, "
+                       "not a Operation." % name)
+
   # Here we replace the nodes we're going to override as inputs with
   # placeholders so that any unused nodes that are inputs to them are
   # automatically stripped out by extract_sub_graph().
+  not_found = {name for name in input_node_names}
   inputs_replaced_graph_def = graph_pb2.GraphDef()
   for node in input_graph_def.node:
     if node.name in input_node_names:
+      not_found.remove(node.name)
       placeholder_node = node_def_pb2.NodeDef()
       placeholder_node.op = "Placeholder"
       placeholder_node.name = node.name
@@ -67,6 +79,9 @@ def strip_unused(input_graph_def, input_node_names, output_node_names,
     else:
       inputs_replaced_graph_def.node.extend([copy.deepcopy(node)])
 
+  if not_found:
+    raise KeyError("The following input nodes were not found: %s\n" % not_found)
+
   output_graph_def = graph_util.extract_sub_graph(inputs_replaced_graph_def,
                                                   output_node_names)
   return output_graph_def
diff --git a/tensorflow/python/tools/strip_unused_test.py b/tensorflow/python/tools/strip_unused_test.py
index d492a0e8229802b0cc67855ec9c2f7321b49a1b7..7cf0c3e3ed9b5748b263913566150eff8acf857a 100644
--- a/tensorflow/python/tools/strip_unused_test.py
+++ b/tensorflow/python/tools/strip_unused_test.py
@@ -58,16 +58,25 @@ class StripUnusedTest(test_util.TensorFlowTestCase):
     # routine.
     input_graph_path = os.path.join(self.get_temp_dir(), input_graph_name)
     input_binary = False
-    input_node_names = "wanted_input_node"
     output_binary = True
     output_node_names = "output_node"
     output_graph_path = os.path.join(self.get_temp_dir(), output_graph_name)
 
-    strip_unused_lib.strip_unused_from_files(input_graph_path, input_binary,
-                                             output_graph_path, output_binary,
-                                             input_node_names,
-                                             output_node_names,
-                                             dtypes.float32.as_datatype_enum)
+    def strip(input_node_names):
+      strip_unused_lib.strip_unused_from_files(input_graph_path, input_binary,
+                                               output_graph_path, output_binary,
+                                               input_node_names,
+                                               output_node_names,
+                                               dtypes.float32.as_datatype_enum)
+
+    with self.assertRaises(KeyError):
+      strip("does_not_exist")
+
+    with self.assertRaises(ValueError):
+      strip("wanted_input_node:0")
+
+    input_node_names = "wanted_input_node"
+    strip(input_node_names)
 
     # Now we make sure the variable is now a constant, and that the graph still
     # produces the expected result.
diff --git a/tensorflow/python/training/checkpoint_utils.py b/tensorflow/python/training/checkpoint_utils.py
index 35d1a069d8a7c17b5a11eb2ce422c12870b81cfd..d52cf9a4367dd7728245cbe4fe35b47dd5c0dd25 100644
--- a/tensorflow/python/training/checkpoint_utils.py
+++ b/tensorflow/python/training/checkpoint_utils.py
@@ -59,14 +59,14 @@ def load_checkpoint(ckpt_dir_or_file):
 
 
 def load_variable(ckpt_dir_or_file, name):
-  """Returns a tensor with the contents of the given variable in the checkpoint.
+  """Returns the tensor value of the given variable in the checkpoint.
 
   Args:
     ckpt_dir_or_file: Directory with checkpoints file or path to checkpoint.
-    name: Name of the tensor to return.
+    name: Name of the variable to return.
 
   Returns:
-    `Tensor` object.
+    A numpy `ndarray` with a copy of the value of this variable.
   """
   # TODO(b/29227106): Fix this in the right place and remove this.
   if name.endswith(":0"):
@@ -210,9 +210,8 @@ def init_from_checkpoint(ckpt_dir_or_file, assignment_map):
       else:
         var_name = ",".join([v.name for v in var])
       _set_variable_or_list_initializer(var, ckpt_file, tensor_name_in_ckpt)
-      logging.info("Initialize variable %s from checkpoint %s with %s" % (
-          var_name, ckpt_dir_or_file, tensor_name_in_ckpt
-      ))
+      logging.info("Initialize variable %s from checkpoint %s with %s",
+                   var_name, ckpt_dir_or_file, tensor_name_in_ckpt)
     else:
       scopes = ""
       # TODO(vihanjain): Support list of 'current_var_or_name' here.
@@ -250,9 +249,8 @@ def init_from_checkpoint(ckpt_dir_or_file, assignment_map):
         if var is None:
           var = _collect_partitioned_variable(var_name, store_vars)
         _set_variable_or_list_initializer(var, ckpt_file, full_tensor_name)
-        logging.info("Initialize variable %s from checkpoint %s with %s" % (
-            var_name, ckpt_dir_or_file, full_tensor_name
-        ))
+        logging.info("Initialize variable %s from checkpoint %s with %s",
+                     var_name, ckpt_dir_or_file, full_tensor_name)
 
 
 def _get_checkpoint_filename(ckpt_dir_or_file):
diff --git a/tensorflow/python/training/coordinator.py b/tensorflow/python/training/coordinator.py
index fea2f8240ee4e918ec5961713575f57468811b22..d234df71c1050346ad59e0a536d2b9d7645563e0 100644
--- a/tensorflow/python/training/coordinator.py
+++ b/tensorflow/python/training/coordinator.py
@@ -366,7 +366,7 @@ class Coordinator(object):
     # If any thread is still alive, wait for the grace period to expire.
     # By the time this check is executed, threads may still be shutting down,
     # so we add a sleep of increasing duration to give them a chance to shut
-    # down without loosing too many cycles.
+    # down without losing too many cycles.
     # The sleep duration is limited to the remaining grace duration.
     stop_wait_secs = 0.001
     while any(t.is_alive() for t in threads) and stop_grace_period_secs >= 0.0:
diff --git a/tensorflow/python/training/device_setter.py b/tensorflow/python/training/device_setter.py
index 85ee10379adab8f0091b73e313759c2ac6594dfa..02155a98d7d6e38cca6df77b52d66828181e593f 100644
--- a/tensorflow/python/training/device_setter.py
+++ b/tensorflow/python/training/device_setter.py
@@ -94,31 +94,31 @@ class _ReplicaDeviceChooser(object):
     Returns:
       The device to use for the `Operation`.
     """
+    # If we don't return early here, either merge_devices is True, or op.device
+    # is empty (in which case merging is a no-op). So we can always merge below.
     if not self._merge_devices and op.device:
       return op.device
+
     current_device = pydev.DeviceSpec.from_string(op.device or "")
-    spec = pydev.DeviceSpec()
-    if self._ps_tasks and self._ps_device:
-      node_def = op if isinstance(op, node_def_pb2.NodeDef) else op.node_def
-      if node_def.op in self._ps_ops:
-        device_string = "%s/task:%d" % (
-            self._ps_device, self._ps_strategy(op))
-        if self._merge_devices:
-          spec = pydev.DeviceSpec.from_string(device_string)
-          spec.merge_from(current_device)
-          return spec.to_string()
-        else:
-          return device_string
-    if self._worker_device:
-      if not self._merge_devices:
-        return self._worker_device
-      spec = pydev.DeviceSpec.from_string(self._worker_device)
-
-    if not self._merge_devices:
-      return ""
-
-    spec.merge_from(current_device)
-    return spec.to_string()
+
+    # The ps_device will be used for specified ops (ps_ops) whenever it is
+    # present and ps_tasks is non-zero. However, its task number will only be
+    # set (using ps_strategy) if there is a job field in ps_device that won't be
+    # changed by the job field (if present) in current_device.
+    node_def = op if isinstance(op, node_def_pb2.NodeDef) else op.node_def
+    if self._ps_tasks and self._ps_device and node_def.op in self._ps_ops:
+      ps_device = pydev.DeviceSpec.from_string(self._ps_device)
+
+      current_job, ps_job = current_device.job, ps_device.job
+      if ps_job and (not current_job or current_job == ps_job):
+        ps_device.task = self._ps_strategy(op)
+
+      ps_device.merge_from(current_device)
+      return ps_device.to_string()
+
+    worker_device = pydev.DeviceSpec.from_string(self._worker_device or "")
+    worker_device.merge_from(current_device)
+    return worker_device.to_string()
 
 
 def replica_device_setter(ps_tasks=0, ps_device="/job:ps",
@@ -186,7 +186,7 @@ def replica_device_setter(ps_tasks=0, ps_device="/job:ps",
       cluster_spec = cluster.as_dict()
     else:
       cluster_spec = server_lib.ClusterSpec(cluster).as_dict()
-    # Get ps_job_name from ps_device by striping "/job:".
+    # Get ps_job_name from ps_device by stripping "/job:".
     ps_job_name = pydev.DeviceSpec.from_string(ps_device).job
     if ps_job_name not in cluster_spec or cluster_spec[ps_job_name] is None:
       return None
diff --git a/tensorflow/python/training/device_setter_test.py b/tensorflow/python/training/device_setter_test.py
index bc29e0d21c50674851beadc4041f26204f7083e0..85b75502ab0943013f12a34002e72b71d187bf68 100644
--- a/tensorflow/python/training/device_setter_test.py
+++ b/tensorflow/python/training/device_setter_test.py
@@ -65,6 +65,50 @@ class DeviceSetterTest(test.TestCase):
       self.assertDeviceEqual("/job:ps/task:1", w.initializer.device)
       self.assertDeviceEqual("/job:worker", a.device)
 
+  def testPS2TasksPinVariableToJob(self):
+    with ops.device(
+        device_setter.replica_device_setter(cluster=self._cluster_spec)):
+      v = variables.Variable([1, 2])
+      with ops.device("/job:moon"):
+        w = variables.Variable([2, 1])
+        with ops.device("/job:ps"):  # Explicit PS job will get task set.
+          x = variables.Variable([0, 1])
+      a = v + w + x
+      self.assertDeviceEqual("/job:ps/task:0", v.device)
+      self.assertDeviceEqual("/job:ps/task:0", v.initializer.device)
+      self.assertDeviceEqual("/job:moon", w.device)
+      self.assertDeviceEqual("/job:moon", w.initializer.device)
+      self.assertDeviceEqual("/job:ps/task:1", x.device)
+      self.assertDeviceEqual("/job:ps/task:1", x.initializer.device)
+      self.assertDeviceEqual("/job:worker", a.device)
+
+  def testPS2TasksUseCpuForPS(self):
+    with ops.device(
+        device_setter.replica_device_setter(ps_tasks=1, ps_device="/cpu:0")):
+      v = variables.Variable([1, 2])
+      with ops.device("/job:moon"):
+        w = variables.Variable([2, 1])
+      a = v + w
+      self.assertDeviceEqual("/cpu:0", v.device)
+      self.assertDeviceEqual("/cpu:0", v.initializer.device)
+      self.assertDeviceEqual("/job:moon/cpu:0", w.device)
+      self.assertDeviceEqual("/job:moon/cpu:0", w.initializer.device)
+      self.assertDeviceEqual("/job:worker", a.device)
+
+  def testPS2TasksNoMerging(self):
+    with ops.device(
+        device_setter.replica_device_setter(
+            cluster=self._cluster_spec, merge_devices=False)):
+      v = variables.Variable([1, 2])
+      with ops.device("/job:ps"):  # Won't assign task when merge_devices=False.
+        w = variables.Variable([2, 1])
+      a = v + w
+      self.assertDeviceEqual("/job:ps/task:0", v.device)
+      self.assertDeviceEqual("/job:ps/task:0", v.initializer.device)
+      self.assertDeviceEqual("/job:ps", w.device)
+      self.assertDeviceEqual("/job:ps", w.initializer.device)
+      self.assertDeviceEqual("/job:worker", a.device)
+
   def testPS2TasksWithClusterSpecDict(self):
     with ops.device(
         device_setter.replica_device_setter(cluster=self._cluster_spec.as_dict(
diff --git a/tensorflow/python/training/input.py b/tensorflow/python/training/input.py
index 9a2880a1104a6c1bc830efb47a1ec95134a0760c..e9fe9215ae4e3cd61751b3844e9b911d0ca61521 100644
--- a/tensorflow/python/training/input.py
+++ b/tensorflow/python/training/input.py
@@ -197,7 +197,10 @@ def string_input_producer(string_tensor,
     seed: An integer (optional). Seed used if shuffle == True.
     capacity: An integer. Sets the queue capacity.
     shared_name: (optional). If set, this queue will be shared under the given
-      name across multiple sessions.
+      name across multiple sessions. All sessions open to the device which has
+      this queue will be able to access it via the shared_name. Using this in
+      a distributed setting means each name will only be seen by one of the
+      sessions which has access to this operation.
     name: A name for the operations (optional).
     cancel_op: Cancel op for the queue (optional).
 
@@ -876,13 +879,11 @@ def batch(tensors, batch_size, num_threads=1, capacity=32,
   `get_shape` method will have a first `Dimension` value of `None`, and
   operations that depend on fixed batch_size would fail.
 
-  Note: if `num_epochs` is not `None`, this function creates local counter
-  `epochs`. Use `local_variables_initializer()` to initialize local variables.
-
   Args:
     tensors: The list or dictionary of tensors to enqueue.
     batch_size: The new batch size pulled from the queue.
-    num_threads: The number of threads enqueuing `tensors`.
+    num_threads: The number of threads enqueuing `tensors`.  The batching will
+      be nondeterministic if `num_threads > 1`.
     capacity: An integer. The maximum number of elements in the queue.
     enqueue_many: Whether each tensor in `tensors` is a single example.
     shapes: (Optional) The shapes for each example.  Defaults to the
@@ -934,7 +935,8 @@ def maybe_batch(tensors, keep_input, batch_size, num_threads=1, capacity=32,
       corresponding value in `keep_input` is `True`. This tensor essentially
       acts as a filtering mechanism.
     batch_size: The new batch size pulled from the queue.
-    num_threads: The number of threads enqueuing `tensors`.
+    num_threads: The number of threads enqueuing `tensors`.  The batching will
+      be nondeterministic if `num_threads > 1`.
     capacity: An integer. The maximum number of elements in the queue.
     enqueue_many: Whether each tensor in `tensors` is a single example.
     shapes: (Optional) The shapes for each example.  Defaults to the
@@ -978,6 +980,9 @@ def batch_join(tensors_list, batch_size, capacity=32, enqueue_many=False,
   dictionaries of tensors.  Each element in the list is treated similarly
   to the `tensors` argument of `tf.train.batch()`.
 
+  WARNING: This function is nondeterministic, since it starts a separate thread
+  for each tensor.
+
   Enqueues a different list of tensors in different threads.
   Implemented using a queue -- a `QueueRunner` for the queue
   is added to the current `Graph`'s `QUEUE_RUNNER` collection.
@@ -1173,9 +1178,6 @@ def shuffle_batch(tensors, batch_size, capacity, min_after_dequeue,
   `get_shape` method will have a first `Dimension` value of `None`, and
   operations that depend on fixed batch_size would fail.
 
-  Note: if `num_epochs` is not `None`, this function creates local counter
-  `epochs`. Use `local_variables_initializer()` to initialize local variables.
-
   Args:
     tensors: The list or dictionary of tensors to enqueue.
     batch_size: The new batch size pulled from the queue.
diff --git a/tensorflow/python/training/momentum.py b/tensorflow/python/training/momentum.py
index e42f3b639cac526a375472452bbf544688cfa016..ffd7c12c427aefc531cd785351993cea05a512e1 100644
--- a/tensorflow/python/training/momentum.py
+++ b/tensorflow/python/training/momentum.py
@@ -28,8 +28,11 @@ class MomentumOptimizer(optimizer.Optimizer):
   """Optimizer that implements the Momentum algorithm.
 
   Computes (if `use_nesterov = False`):
-    accumulation = momentum * accumulation + gradient
-    variable -= learning_rate * accumulation
+  
+  ```
+  accumulation = momentum * accumulation + gradient
+  variable -= learning_rate * accumulation
+  ```
 
   Note that in the dense version of this algorithm, `accumulation` is updated
   and applied regardless of a gradient's value, whereas the sparse version (when
diff --git a/tensorflow/python/training/monitored_session.py b/tensorflow/python/training/monitored_session.py
index 6d6128d2079ab7b0ba93baad5c27e9c480b2df95..7f737399ab49beeabfdff02974a53f78f6e804c9 100644
--- a/tensorflow/python/training/monitored_session.py
+++ b/tensorflow/python/training/monitored_session.py
@@ -26,7 +26,7 @@ from tensorflow.python.framework import errors
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
-from tensorflow.python.ops import data_flow_ops
+from tensorflow.python.ops import lookup_ops
 from tensorflow.python.ops import resources
 from tensorflow.python.ops import variables
 from tensorflow.python.platform import tf_logging as logging
@@ -102,7 +102,8 @@ class Scaffold(object):
                ready_for_local_init_op=None,
                local_init_op=None,
                summary_op=None,
-               saver=None):
+               saver=None,
+               copy_from_scaffold=None):
     """Create a scaffold.
 
     Args:
@@ -125,10 +126,26 @@ class Scaffold(object):
         string tensor containing a serialized `Summary` proto.
       saver: Optional `tf.train.Saver` object to use to save and restore
         variables.
+      copy_from_scaffold: Optional scaffold object to copy fields from. Its
+        fields will be overwritten by the provided fields in this function.
     """
+    if copy_from_scaffold:
+      if not isinstance(copy_from_scaffold, Scaffold):
+        raise TypeError('copy_from_scaffold is not a Scaffold instance.')
+      init_op = init_op or copy_from_scaffold.init_op
+      init_feed_dict = init_feed_dict or copy_from_scaffold.init_feed_dict
+      # Use the original init_fn provided by the user to init the new Scaffold.
+      init_fn = init_fn or copy_from_scaffold._user_init_fn  # pylint: disable=protected-access
+      ready_op = ready_op or copy_from_scaffold.ready_op
+      ready_for_local_init_op = ready_for_local_init_op or (
+          copy_from_scaffold.ready_for_local_init_op)
+      local_init_op = local_init_op or copy_from_scaffold.local_init_op
+      summary_op = summary_op or copy_from_scaffold.summary_op
+      saver = saver or copy_from_scaffold.saver
 
     # NOTE(touts): modifying the init function to be passed the scaffold is a
     # hack to make it easy to find the saver.  Is there a better way?
+    self._user_init_fn = init_fn
     if init_fn:
       self._init_fn = lambda sess: init_fn(self, sess)
     else:
@@ -238,7 +255,7 @@ class Scaffold(object):
   @staticmethod
   def _default_local_init_op():
     return control_flow_ops.group(variables.local_variables_initializer(),
-                                  data_flow_ops.tables_initializer())
+                                  lookup_ops.tables_initializer())
 
 
 def MonitoredTrainingSession(master='',  # pylint: disable=invalid-name
@@ -422,7 +439,9 @@ class WorkerSessionCreator(SessionCreator):
   def create_session(self):
     self._scaffold.finalize()
     return self._get_session_manager().wait_for_session(
-        self._master, config=self._config)
+        self._master, config=self._config,
+        max_wait_secs=30 * 60  # Wait up to 30 mins for the session to be ready.
+    )
 
 
 class _MonitoredSession(object):
@@ -557,7 +576,7 @@ class MonitoredSession(_MonitoredSession):
 
   ```python
   saver_hook = CheckpointSaverHook(...)
-  summary_hook = SummaryHook(...)
+  summary_hook = SummarySaverHook(...)
   with MonitoredSession(session_creator=ChiefSessionCreator(...),
                         hooks=[saver_hook, summary_hook]) as sess:
     while not sess.should_stop():
@@ -646,7 +665,7 @@ class SingularMonitoredSession(_MonitoredSession):
   Example usage:
   ```python
   saver_hook = CheckpointSaverHook(...)
-  summary_hook = SummaryHook(...)
+  summary_hook = SummarySaverHook(...)
   with SingularMonitoredSession(hooks=[saver_hook, summary_hook]) as sess:
     while not sess.should_stop():
       sess.run(train_op)
diff --git a/tensorflow/python/training/monitored_session_test.py b/tensorflow/python/training/monitored_session_test.py
index 41f8fb34869fa520d4b033c933002e044ddea08f..85a5ceeb08f4c658d5bede90a11d01127c531f91 100644
--- a/tensorflow/python/training/monitored_session_test.py
+++ b/tensorflow/python/training/monitored_session_test.py
@@ -147,6 +147,68 @@ class ScaffoldTest(test.TestCase):
                                    'Graph is finalized and cannot be modified'):
         constant_op.constant([0])
 
+  def test_new_scaffold_from_default_scaffold(self):
+    scaffold1 = monitored_session.Scaffold()
+    with ops.Graph().as_default():
+      variables.Variable([1])
+      saver = saver_lib.Saver()
+      scaffold2 = monitored_session.Scaffold(
+          init_op=2,
+          init_feed_dict=3,
+          init_fn=lambda scaffold, sess: 4,
+          ready_op=5,
+          ready_for_local_init_op=6,
+          local_init_op=7,
+          saver=saver,
+          copy_from_scaffold=scaffold1)
+
+      scaffold2.finalize()
+      self.assertEqual(2, scaffold2.init_op)
+      self.assertEqual(3, scaffold2.init_feed_dict)
+      self.assertTrue(callable(scaffold2.init_fn))
+      self.assertEqual(5, scaffold2.ready_op)
+      self.assertEqual(6, scaffold2.ready_for_local_init_op)
+      self.assertEqual(7, scaffold2.local_init_op)
+      self.assertEqual(saver, scaffold2.saver)
+
+  def test_new_scaffold_from_existing_scaffold(self):
+    with ops.Graph().as_default():
+      variables.Variable([1])
+      saver = saver_lib.Saver()
+      scaffold1 = monitored_session.Scaffold(
+          init_op=2,
+          init_feed_dict=3,
+          init_fn=lambda scaffold, sess: 4,
+          ready_op=5,
+          ready_for_local_init_op=6,
+          local_init_op=7,
+          saver=saver)
+
+      scaffold2 = monitored_session.Scaffold(
+          init_op=4,
+          init_feed_dict=6,
+          init_fn=lambda scaffold, sess: 8,
+          ready_op=10,
+          ready_for_local_init_op=12,
+          local_init_op=14,
+          saver=saver,
+          copy_from_scaffold=scaffold1)
+
+      scaffold2.finalize()
+      self.assertEqual(4, scaffold2.init_op)
+      self.assertEqual(6, scaffold2.init_feed_dict)
+      self.assertTrue(callable(scaffold2.init_fn))
+      self.assertEqual(10, scaffold2.ready_op)
+      self.assertEqual(12, scaffold2.ready_for_local_init_op)
+      self.assertEqual(14, scaffold2.local_init_op)
+      self.assertEqual(saver, scaffold2.saver)
+
+  def test_copy_from_scaffold_is_scaffold(self):
+    with ops.Graph().as_default():
+      with self.assertRaisesRegexp(
+          TypeError, 'copy_from_scaffold is not a Scaffold instance'):
+        monitored_session.Scaffold(copy_from_scaffold=1)
+
 
 def _test_dir(temp_dir, test_name):
   """Create an empty dir to use for tests.
diff --git a/tensorflow/python/training/quantize_training.i b/tensorflow/python/training/quantize_training.i
index 59cc895084acfeaf01c3ec3bf18a3721be7131ae..40c60769731d3f7255647a07141d86b1c2594b01 100644
--- a/tensorflow/python/training/quantize_training.i
+++ b/tensorflow/python/training/quantize_training.i
@@ -24,8 +24,10 @@ static PyObject* DoQuantizeTrainingOnGraphDefHelper(
     int num_bits,
     TF_Status* out_status) {
   string result;
+  // TODO(suharshs): Make the QuantizeAndDequantizeV2 configurable.
   tensorflow::Status status =
-      tensorflow::DoQuantizeTrainingOnSerializedGraphDef(input_graph, num_bits, &result);
+      tensorflow::DoQuantizeTrainingOnSerializedGraphDef(input_graph, num_bits,
+      "QuantizeAndDequantizeV2", &result);
   if (!status.ok()) {
     Set_TF_Status_from_Status(out_status, status);
     Py_RETURN_NONE;
diff --git a/tensorflow/python/training/saver.py b/tensorflow/python/training/saver.py
index 43b61742467ed2a72e95c1c6f7f72a70469aa591..e9cc76aa9079f1ff23b3b3da32ff9abf054079a7 100644
--- a/tensorflow/python/training/saver.py
+++ b/tensorflow/python/training/saver.py
@@ -935,11 +935,11 @@ def get_checkpoint_state(checkpoint_dir, latest_filename=None):
           ckpt.all_model_checkpoint_paths[i] = os.path.join(checkpoint_dir, p)
   except errors.OpError as e:
     # It's ok if the file cannot be read
-    logging.warning(str(e))
+    logging.warning("%s: %s", type(e).__name__, e)
     logging.warning("%s: Checkpoint ignored", coord_checkpoint_filename)
     return None
   except text_format.ParseError as e:
-    logging.warning(str(e))
+    logging.warning("%s: %s", type(e).__name__, e)
     logging.warning("%s: Checkpoint ignored", coord_checkpoint_filename)
     return None
   finally:
@@ -1461,28 +1461,31 @@ class Saver(object):
             "'latest_filename' collides with 'save_path': '%s' and '%s'" %
             (latest_filename, save_path))
 
-    if not gfile.IsDirectory(os.path.dirname(save_path)):
-      raise ValueError(
-          "Parent directory of {} doesn't exist, can't save.".format(save_path))
-
-    save_path = os.path.dirname(save_path)
     if not isinstance(sess, session.SessionInterface):
       raise TypeError("'sess' must be a Session; %s" % sess)
 
+    save_path_parent = os.path.dirname(save_path)
     if not self._is_empty:
-      model_checkpoint_path = sess.run(
-          self.saver_def.save_tensor_name,
-          {self.saver_def.filename_tensor_name: checkpoint_file})
-      model_checkpoint_path = compat.as_str(model_checkpoint_path)
-      if write_state:
-        self._MaybeDeleteOldCheckpoints(
-            model_checkpoint_path, meta_graph_suffix=meta_graph_suffix)
-        _update_checkpoint_state(
-            save_dir=save_path,
-            model_checkpoint_path=model_checkpoint_path,
-            all_model_checkpoint_paths=self.last_checkpoints,
-            latest_filename=latest_filename,
-            save_relative_paths=self._save_relative_paths)
+      try:
+        model_checkpoint_path = sess.run(
+            self.saver_def.save_tensor_name,
+            {self.saver_def.filename_tensor_name: checkpoint_file})
+        model_checkpoint_path = compat.as_str(model_checkpoint_path)
+        if write_state:
+          self._MaybeDeleteOldCheckpoints(
+              model_checkpoint_path, meta_graph_suffix=meta_graph_suffix)
+          _update_checkpoint_state(
+              save_dir=save_path_parent,
+              model_checkpoint_path=model_checkpoint_path,
+              all_model_checkpoint_paths=self.last_checkpoints,
+              latest_filename=latest_filename,
+              save_relative_paths=self._save_relative_paths)
+      except (errors.FailedPreconditionError, errors.NotFoundError) as exc:
+        if not gfile.IsDirectory(save_path_parent):
+          exc = ValueError(
+              "Parent directory of {} doesn't exist, can't save.".format(
+                  save_path))
+        raise exc
 
     if write_meta_graph:
       meta_graph_filename = self._MetaGraphFilename(
diff --git a/tensorflow/python/training/saver_test.py b/tensorflow/python/training/saver_test.py
index ec370afee112563cc5130bb2cdb295cf81714d36..5d1f434a5623cc7dd7569c12d1747c51b4678184 100644
--- a/tensorflow/python/training/saver_test.py
+++ b/tensorflow/python/training/saver_test.py
@@ -555,32 +555,46 @@ class SaverTest(test.TestCase):
     self.testSaveWithGlobalStep(pad_step_number=True)
 
   def testSaveToNonexistingPath(self):
+    file_io.write_string_to_file(
+        os.path.join(self.get_temp_dir(), "actually_a_file"), "")
+    paths = [
+        os.path.join(self.get_temp_dir(), "nonexisting_dir/path"),
+        os.path.join(self.get_temp_dir(), "other_nonexisting_dir/path1/path2"),
+        os.path.join(self.get_temp_dir(), "actually_a_file/path"),
+    ]
+
+    for save_path in paths:
+      # Build a graph with 2 parameter nodes, and Save and
+      # Restore nodes for them.
+      v0 = variables.Variable(10.0, name="v0")
+      v1 = variables.Variable(20.0, name="v1")
+      save = saver_module.Saver({"v0": v0, "v1": v1}, restore_sequentially=True)
+      init_all_op = variables.global_variables_initializer()
 
-    save_path = os.path.join(self.get_temp_dir(), "nonexisting_dir/path")
-
-    # Build a graph with 2 parameter nodes, and Save and
-    # Restore nodes for them.
-    v0 = variables.Variable(10.0, name="v0")
-    v1 = variables.Variable(20.0, name="v1")
-    save = saver_module.Saver({"v0": v0, "v1": v1}, restore_sequentially=True)
-    init_all_op = variables.global_variables_initializer()
-
-    with self.test_session() as sess:
-      # Initialize all variables
-      sess.run(init_all_op)
+      # In the case where the parent directory doesn't exist, whether or not the
+      # save succeeds or fails is implementation dependent.  Therefore we allow
+      # both cases.
+      try:
+        with self.test_session() as sess:
+          # Initialize all variables
+          sess.run(init_all_op)
 
-      # Check that the parameter nodes have been initialized.
-      self.assertEqual(10.0, v0.eval())
-      self.assertEqual(20.0, v1.eval())
+          # Check that the parameter nodes have been initialized.
+          self.assertEqual(10.0, v0.eval())
+          self.assertEqual(20.0, v1.eval())
 
-      error_msg_template = "Parent directory of {} doesn't exist, can't save."
+          # Save the graph.
+          save.save(sess, save_path)
 
-      # Assert saving fails when parent dir of save path doesn't exist
-      with self.assertRaisesWithPredicateMatch(
-          ValueError,
-          lambda e: error_msg_template.format(save_path) in str(e)
-      ):
-        save.save(sess, save_path)
+        with self.test_session() as sess:
+          # Restore the saved values in the parameter nodes.
+          save.restore(sess, save_path)
+          # Check that the parameter nodes have been restored.
+          self.assertEqual(10.0, v0.eval())
+          self.assertEqual(20.0, v1.eval())
+      except ValueError as exc:
+        error_msg_template = "Parent directory of {} doesn't exist, can't save."
+        self.assertEqual(error_msg_template.format(save_path), str(exc))
 
   def testSaveToURI(self):
     # ParseURI functions don't work on Windows yet.
@@ -1845,8 +1859,8 @@ class MetaGraphTest(test.TestCase):
     with session.Session(graph=ops_lib.Graph()) as sess:
       saver_module.import_meta_graph(
           meta_graph_def, clear_devices=False, import_scope="new_model")
-      with self.assertRaisesRegexp(errors_impl.InvalidArgumentError,
-                                   "Cannot assign a device to node"):
+      # Device refers to GPU, which is not available here.
+      with self.assertRaises(errors_impl.InvalidArgumentError):
         sess.run(variables.global_variables_initializer())
 
     with session.Session(graph=ops_lib.Graph()) as sess:
@@ -2073,6 +2087,18 @@ class ScopedGraphTest(test.TestCase):
         biases3 = variables.Variable(array_ops.zeros([10]), name="biases")
         logits = math_ops.matmul(hidden2, weights3) + biases3
         ops_lib.add_to_collection("logits", logits)
+
+        # Adds user_defined proto in three formats: string, bytes and Any.
+        # Any proto should just pass through.
+        queue_runner = queue_runner_pb2.QueueRunnerDef(queue_name="test_queue")
+        ops_lib.add_to_collection("user_defined_string_collection",
+                                  str(queue_runner))
+        ops_lib.add_to_collection("user_defined_bytes_collection",
+                                  queue_runner.SerializeToString())
+        any_buf = Any()
+        any_buf.Pack(queue_runner)
+        ops_lib.add_to_collection("user_defined_any_collection", any_buf)
+
       _, var_list = meta_graph.export_scoped_meta_graph(
           filename=os.path.join(test_dir, exported_filename),
           graph=ops_lib.get_default_graph(),
diff --git a/tensorflow/python/training/saver_test_utils.py b/tensorflow/python/training/saver_test_utils.py
index 5f31e2aa539d25ba4fa4a76f4441f8b6f7e11e62..6a73565f82bf373836adca87a4af17ebe2641f8b 100644
--- a/tensorflow/python/training/saver_test_utils.py
+++ b/tensorflow/python/training/saver_test_utils.py
@@ -20,7 +20,7 @@ from __future__ import print_function
 
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops as ops_lib
-from tensorflow.python.ops import gen_data_flow_ops
+from tensorflow.python.ops import gen_lookup_ops
 from tensorflow.python.training import saver as saver_module
 
 
@@ -34,7 +34,7 @@ class CheckpointedOp(object):
   # pylint: disable=protected-access
   def __init__(self, name, table_ref=None):
     if table_ref is None:
-      self.table_ref = gen_data_flow_ops._mutable_hash_table(
+      self.table_ref = gen_lookup_ops._mutable_hash_table(
           key_dtype=dtypes.string, value_dtype=dtypes.float32, name=name)
     else:
       self.table_ref = table_ref
@@ -52,10 +52,10 @@ class CheckpointedOp(object):
     return self._saveable
 
   def insert(self, keys, values):
-    return gen_data_flow_ops._lookup_table_insert(self.table_ref, keys, values)
+    return gen_lookup_ops._lookup_table_insert(self.table_ref, keys, values)
 
   def lookup(self, keys, default):
-    return gen_data_flow_ops._lookup_table_find(self.table_ref, keys, default)
+    return gen_lookup_ops._lookup_table_find(self.table_ref, keys, default)
 
   def keys(self):
     return self._export()[0]
@@ -64,8 +64,8 @@ class CheckpointedOp(object):
     return self._export()[1]
 
   def _export(self):
-    return gen_data_flow_ops._lookup_table_export(self.table_ref, dtypes.string,
-                                                  dtypes.float32)
+    return gen_lookup_ops._lookup_table_export(self.table_ref, dtypes.string,
+                                               dtypes.float32)
 
   class CustomSaveable(saver_module.BaseSaverBuilder.SaveableObject):
     """A custom saveable for CheckpointedOp."""
@@ -81,6 +81,6 @@ class CheckpointedOp(object):
       super(CheckpointedOp.CustomSaveable, self).__init__(table, specs, name)
 
     def restore(self, restore_tensors, shapes):
-      return gen_data_flow_ops._lookup_table_import(
+      return gen_lookup_ops._lookup_table_import(
           self.op.table_ref, restore_tensors[0], restore_tensors[1])
   # pylint: enable=protected-access
diff --git a/tensorflow/python/training/server_lib.py b/tensorflow/python/training/server_lib.py
index d2ccf37d8856dcc9d2399b67f9c0799539c956d5..2091eca0b9c6f0af4a043a4639b6fb72b90cef56 100644
--- a/tensorflow/python/training/server_lib.py
+++ b/tensorflow/python/training/server_lib.py
@@ -18,6 +18,7 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+from tensorflow.core.protobuf import cluster_pb2
 from tensorflow.core.protobuf import tensorflow_server_pb2
 from tensorflow.python import pywrap_tensorflow
 from tensorflow.python.framework import errors
@@ -276,14 +277,14 @@ class ClusterSpec(object):
                           "from integers to strings." % job_name)
         self._cluster_spec[job_name] = job_tasks
       self._make_cluster_def()
-    elif isinstance(cluster, tensorflow_server_pb2.ClusterDef):
+    elif isinstance(cluster, cluster_pb2.ClusterDef):
       self._cluster_def = cluster
       self._cluster_spec = {}
       for job_def in self._cluster_def.job:
         self._cluster_spec[job_def.name] = {
             i: t for i, t in job_def.tasks.items()}
     elif isinstance(cluster, ClusterSpec):
-      self._cluster_def = tensorflow_server_pb2.ClusterDef()
+      self._cluster_def = cluster_pb2.ClusterDef()
       self._cluster_def.MergeFrom(cluster.as_cluster_def())
       self._cluster_spec = {}
       for job_def in self._cluster_def.job:
@@ -440,7 +441,7 @@ class ClusterSpec(object):
       TypeError: If `cluster_spec` is not a dictionary mapping strings to lists
         of strings.
     """
-    self._cluster_def = tensorflow_server_pb2.ClusterDef()
+    self._cluster_def = cluster_pb2.ClusterDef()
 
     # NOTE(mrry): Sort by job_name to produce deterministic protobufs.
     for job_name, tasks in sorted(self._cluster_spec.items()):
diff --git a/tensorflow/python/training/session_manager.py b/tensorflow/python/training/session_manager.py
index 6bcc6e25c363d60e0f338ef71807da0758ec477e..a13b6dd976a835d14c03ed90f40b172e0bcbfd07 100644
--- a/tensorflow/python/training/session_manager.py
+++ b/tensorflow/python/training/session_manager.py
@@ -27,6 +27,23 @@ from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.training import saver as saver_mod
 
 
+def _maybe_name(obj):
+  """Returns object name if it has one, or a message otherwise.
+
+  This is useful for names that apper in error messages.
+  Args:
+    obj: Object to get the name of.
+  Returns:
+    name, "None", or a "no name" message.
+  """
+  if obj is None:
+    return "None"
+  elif hasattr(obj, "name"):
+    return obj.name
+  else:
+    return "<no name for %s>" % type(obj)
+
+
 class SessionManager(object):
   """Training helper that restores from checkpoint and creates session.
 
@@ -267,8 +284,8 @@ class SessionManager(object):
     if not local_init_success:
       raise RuntimeError(
           "Init operations did not make model ready for local_init.  "
-          "Init op: %s, init fn: %s, error: %s" % ("None" if init_op is None
-                                                   else init_op.name, init_fn,
+          "Init op: %s, init fn: %s, error: %s" % (_maybe_name(init_op),
+                                                   init_fn,
                                                    msg))
 
     is_ready, msg = self._model_ready(sess)
@@ -276,8 +293,7 @@ class SessionManager(object):
       raise RuntimeError(
           "Init operations did not make model ready.  "
           "Init op: %s, init fn: %s, local_init_op: %s, error: %s" %
-          (None if init_op is None else init_op.name, init_fn,
-           self._local_init_op, msg))
+          (_maybe_name(init_op), init_fn, self._local_init_op, msg))
     return sess
 
   def recover_session(self,
diff --git a/tensorflow/python/training/session_manager_test.py b/tensorflow/python/training/session_manager_test.py
index 246e95110a6f081e70187e60acabe7117b8afd2f..4dc1d5abb71f9d7b8d63da016876bcec84edd9eb 100644
--- a/tensorflow/python/training/session_manager_test.py
+++ b/tensorflow/python/training/session_manager_test.py
@@ -497,6 +497,23 @@ class SessionManagerTest(test.TestCase):
                                    "Init operations did not make model ready"):
         sm2.prepare_session("", init_op=v.initializer)
 
+  def testPrepareSessionDidNotInitLocalVariableList(self):
+    with ops.Graph().as_default():
+      v = variables.Variable(1, name="v")
+      w = variables.Variable(
+          v,
+          trainable=False,
+          collections=[ops.GraphKeys.LOCAL_VARIABLES],
+          name="w")
+      with self.test_session():
+        self.assertEqual(False, variables.is_variable_initialized(v).eval())
+        self.assertEqual(False, variables.is_variable_initialized(w).eval())
+      sm2 = session_manager.SessionManager(
+          ready_op=variables.report_uninitialized_variables())
+      with self.assertRaisesRegexp(RuntimeError,
+                                   "Init operations did not make model ready"):
+        sm2.prepare_session("", init_op=[v.initializer])
+
   def testPrepareSessionWithReadyNotReadyForLocal(self):
     with ops.Graph().as_default():
       v = variables.Variable(1, name="v")
diff --git a/tensorflow/python/training/supervisor.py b/tensorflow/python/training/supervisor.py
index 93e64b4ab0bd9b4d6e87887b3aafad59c0cdcdc8..230ed1db6874da6bbb106f687da616cda1f896f9 100644
--- a/tensorflow/python/training/supervisor.py
+++ b/tensorflow/python/training/supervisor.py
@@ -27,7 +27,7 @@ from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import meta_graph
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import control_flow_ops
-from tensorflow.python.ops import data_flow_ops
+from tensorflow.python.ops import lookup_ops
 from tensorflow.python.ops import variables
 from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.summary import summary as _summary
@@ -426,8 +426,10 @@ class Supervisor(object):
       local_init_op = self._get_first_op_from_collection(
           ops.GraphKeys.LOCAL_INIT_OP)
       if local_init_op is None:
-        op_list = [variables.local_variables_initializer(),
-                   data_flow_ops.tables_initializer()]
+        op_list = [
+            variables.local_variables_initializer(),
+            lookup_ops.tables_initializer()
+        ]
         if op_list:
           local_init_op = control_flow_ops.group(*op_list)
           ops.add_to_collection(ops.GraphKeys.LOCAL_INIT_OP, local_init_op)
@@ -994,7 +996,7 @@ class SVSummaryThread(coordinator.LooperThread):
       summary_strs = self._sess.run(self._sv.summary_op)
       global_step = None
     if self._sv.summary_writer:
-      logging.info("Recording summary at step %d.", global_step)
+      logging.info("Recording summary at step %s.", global_step)
       self._sv.summary_writer.add_summary(summary_strs, global_step)
 
 
diff --git a/tensorflow/python/training/training.py b/tensorflow/python/training/training.py
index bdf3d9c017523072392ed238c17918b4e33e390e..f4ac3c9758712182d2aee26a1a53c83e92e97b63 100644
--- a/tensorflow/python/training/training.py
+++ b/tensorflow/python/training/training.py
@@ -186,8 +186,8 @@ from tensorflow.python.training.learning_rate_decay import *
 # pylint: enable=wildcard-import
 
 # Distributed computing support.
-from tensorflow.core.protobuf.tensorflow_server_pb2 import ClusterDef
-from tensorflow.core.protobuf.tensorflow_server_pb2 import JobDef
+from tensorflow.core.protobuf.cluster_pb2 import ClusterDef
+from tensorflow.core.protobuf.cluster_pb2 import JobDef
 from tensorflow.core.protobuf.tensorflow_server_pb2 import ServerDef
 from tensorflow.python.training.server_lib import ClusterSpec
 from tensorflow.python.training.server_lib import Server
@@ -196,32 +196,32 @@ from tensorflow.python.training.server_lib import Server
 _allowed_symbols = [
     # TODO(cwhipkey): review these and move to contrib or expose through
     # documentation.
-    "generate_checkpoint_state_proto",   # Used internally by saver.
+    "generate_checkpoint_state_proto",  # Used internally by saver.
     "checkpoint_exists",  # Only used in test?
     "get_checkpoint_mtimes",  # Only used in test?
 
     # Legacy: remove.
     "do_quantize_training_on_graphdef",  # At least use grah_def, not graphdef.
-                                         # No uses within tensorflow.
+    # No uses within tensorflow.
     "queue_runner",  # Use tf.train.start_queue_runner etc directly.
-                     # This is also imported internally.
+    # This is also imported internally.
 
     # TODO(drpng): document these. The reference in howtos/distributed does
     # not link.
     "SyncReplicasOptimizer",
     # Protobufs:
-    "BytesList",          # from example_pb2.
+    "BytesList",  # from example_pb2.
     "ClusterDef",
-    "Example",            # from example_pb2
-    "Feature",            # from example_pb2
-    "Features",           # from example_pb2
-    "FeatureList",        # from example_pb2
-    "FeatureLists",       # from example_pb2
-    "FloatList",          # from example_pb2.
-    "Int64List",          # from example_pb2.
+    "Example",  # from example_pb2
+    "Feature",  # from example_pb2
+    "Features",  # from example_pb2
+    "FeatureList",  # from example_pb2
+    "FeatureLists",  # from example_pb2
+    "FloatList",  # from example_pb2.
+    "Int64List",  # from example_pb2.
     "JobDef",
-    "SaverDef",           # From saver_pb2.
-    "SequenceExample",    # from example_pb2.
+    "SaverDef",  # From saver_pb2.
+    "SequenceExample",  # from example_pb2.
     "ServerDef",
 ]
 # Include extra modules for docstrings because:
diff --git a/tensorflow/python/util/deprecation.py b/tensorflow/python/util/deprecation.py
index 73fc3e2408791304c73c323cf500967e6fab98c9..1e1599afb4bd9a3e8f6184748178d613ed34cc22 100644
--- a/tensorflow/python/util/deprecation.py
+++ b/tensorflow/python/util/deprecation.py
@@ -182,7 +182,7 @@ def deprecated_args(date, instructions, *deprecated_arg_names_or_tuples):
     return d
 
   def _get_deprecated_positional_arguments(names_to_ok_vals, arg_spec):
-    """Builds a dictionary from deprecated arguments to thier spec.
+    """Builds a dictionary from deprecated arguments to their spec.
 
     Returned dict is keyed by argument name.
     Each value is a DeprecatedArgSpec with the following fields:
diff --git a/tensorflow/python/util/example_parser_configuration.py b/tensorflow/python/util/example_parser_configuration.py
index 8843016a978cb1084b7e851b67d2f1dc2a190619..a3750851769a31466eebba5cfd5e665f4cbc4f9c 100644
--- a/tensorflow/python/util/example_parser_configuration.py
+++ b/tensorflow/python/util/example_parser_configuration.py
@@ -101,7 +101,7 @@ def extract_example_parser_configuration(parse_example_op, sess):
     fixed_config.shape.CopyFrom(
         tensor_shape.TensorShape(dense_shapes[i]).as_proto())
 
-    fixed_config.dtype = dense_types[i]
+    fixed_config.dtype = int(dense_types[i])
     # Get the output tensor name.
     fixed_config.values_output_tensor_name = parse_example_op.outputs[
         dense_values_start + i].name
@@ -111,7 +111,7 @@ def extract_example_parser_configuration(parse_example_op, sess):
     key = fetched[sparse_keys_start + i]
     feature_config = config.feature_map[key]
     var_len_feature = feature_config.var_len_feature
-    var_len_feature.dtype = sparse_types[i]
+    var_len_feature.dtype = int(sparse_types[i])
     var_len_feature.indices_output_tensor_name = parse_example_op.outputs[
         sparse_indices_start + i].name
     var_len_feature.values_output_tensor_name = parse_example_op.outputs[
diff --git a/tensorflow/python/util/tf_should_use.py b/tensorflow/python/util/tf_should_use.py
index a6a1ad48920192af22e5f1914556b9636832da5d..88df3351e66711632dcf74bd8875d6d89fabf908 100644
--- a/tensorflow/python/util/tf_should_use.py
+++ b/tensorflow/python/util/tf_should_use.py
@@ -22,6 +22,7 @@ import traceback
 import types
 
 from tensorflow.python.platform import tf_logging
+from tensorflow.python.util import tf_decorator
 
 
 def _add_should_use_warning(x, fatal_error=False):
@@ -36,6 +37,13 @@ def _add_should_use_warning(x, fatal_error=False):
     An instance of `TFShouldUseWarningWrapper` which subclasses `type(x)`
     and is a very shallow wrapper for `x` which logs access into `x`.
   """
+  if x is None:  # special corner case where x is None
+    return x
+  has_been_used = getattr(x, '_tf_object_has_been_used', None)
+  if has_been_used is not None:
+    x._tf_object_has_been_used = has_been_used  # pylint: disable=protected-access
+    return x
+
   def override_method(method):
     def fn(self, *args, **kwargs):
       self._tf_object_has_been_used = True  # pylint: disable=protected-access
@@ -67,18 +75,27 @@ def _add_should_use_warning(x, fatal_error=False):
         else:
           logger = tf_logging.error
         logger(
-            'Object was never used: %s.\nIt was originally created here:\n%s'
-            % (self, self._tf_object_creation_stack))
+            '==================================\n'
+            'Object was never used (type %s):\n%s\nIf you want to mark it as '
+            'used call its "mark_used()" method.\nIt was originally created '
+            'here:\n%s\n'
+            '==================================' %
+            (type(x), x, self._tf_object_creation_stack))
 
       if hasattr(super(TFShouldUseWarningWrapper, self), '__del__'):
         return super(TFShouldUseWarningWrapper, self).__del__()
+
+    def mark_used(self, *args, **kwargs):
+      self._tf_object_has_been_used = True
+      if hasattr(super(TFShouldUseWarningWrapper, self), 'mark_used'):
+        return super(TFShouldUseWarningWrapper, self).mark_used(*args, **kwargs)
     # pylint: enable=super-on-old-class
 
   for name in dir(TFShouldUseWarningWrapper):
     method = getattr(TFShouldUseWarningWrapper, name)
     if not isinstance(method, types.FunctionType):
       continue
-    if name in ('__init__', '__getattribute__', '__del__'):
+    if name in ('__init__', '__getattribute__', '__del__', 'mark_used'):
       continue
     setattr(TFShouldUseWarningWrapper, name,
             functools.wraps(method)(override_method(method)))
@@ -114,7 +131,13 @@ def should_use_result(fn):
   """
   def wrapped(*args, **kwargs):
     return _add_should_use_warning(fn(*args, **kwargs))
-  return functools.wraps(fn)(wrapped)
+  return tf_decorator.make_decorator(
+      fn, wrapped, 'should_use_result',
+      ((fn.__doc__ or '') +
+       ('\n\n  '
+        '**NOTE** The output of this function should be used.  If it is not, '
+        'a warning will be logged.  To mark the output as used, '
+        'call its .mark_used() method.')))
 
 
 def must_use_result_or_fatal(fn):
@@ -142,4 +165,10 @@ def must_use_result_or_fatal(fn):
   """
   def wrapped(*args, **kwargs):
     return _add_should_use_warning(fn(*args, **kwargs), fatal_error=True)
-  return functools.wraps(fn)(wrapped)
+  return tf_decorator.make_decorator(
+      fn, wrapped, 'must_use_result_or_fatal',
+      ((fn.__doc__ or '') +
+       ('\n\n  '
+        '**NOTE** The output of this function must be used.  If it is not, '
+        'a fatal error will be raised.  To mark the output as used, '
+        'call its .mark_used() method.')))
diff --git a/tensorflow/python/util/tf_should_use_test.py b/tensorflow/python/util/tf_should_use_test.py
index 09130eed3a51201e9e2290003a4b5e71d88a50f9..71d48e3dde308c9af59b8dce6a06c4c7d587e24a 100644
--- a/tensorflow/python/util/tf_should_use_test.py
+++ b/tensorflow/python/util/tf_should_use_test.py
@@ -52,7 +52,7 @@ class TfShouldUseTest(test.TestCase):
         h = tf_should_use._add_should_use_warning(c)
         del h
       in_this_function()
-    self.assertIn('Object was never used:', '\n'.join(captured))
+    self.assertIn('Object was never used', '\n'.join(captured))
     self.assertIn('blah:0', '\n'.join(captured))
     self.assertIn('in_this_function', '\n'.join(captured))
 
@@ -63,7 +63,7 @@ class TfShouldUseTest(test.TestCase):
       h = tf_should_use._add_should_use_warning(c)
       fn(h)
       del h
-    self.assertNotIn('Object was never used:', '\n'.join(captured))
+    self.assertNotIn('Object was never used', '\n'.join(captured))
     self.assertNotIn('blah:0', '\n'.join(captured))
 
   def testAddShouldUseWarningWhenUsedWithAdd(self):
@@ -83,7 +83,7 @@ class TfShouldUseTest(test.TestCase):
     captured = []
     with reroute_error(captured):
       return_const(0.0)
-    self.assertIn('Object was never used:', '\n'.join(captured))
+    self.assertIn('Object was never used', '\n'.join(captured))
     self.assertIn('blah:0', '\n'.join(captured))
     self.assertIn('return_const', '\n'.join(captured))
 
@@ -99,7 +99,7 @@ class TfShouldUseTest(test.TestCase):
         # unused op as being "used".
         v = constant_op.constant(1.0, name='meh')
         v.eval()
-    self.assertIn('Object was never used:', '\n'.join(captured))
+    self.assertIn('Object was never used', '\n'.join(captured))
     self.assertIn('blah:0', '\n'.join(captured))
     self.assertIn('return_const', '\n'.join(captured))
 
diff --git a/tensorflow/stream_executor/cuda/cuda_driver.cc b/tensorflow/stream_executor/cuda/cuda_driver.cc
index e441321fc86da830a9f5212d9a6a89763d140344..76778dbeececdd476ce6dce1814c8d2845bfbfc8 100644
--- a/tensorflow/stream_executor/cuda/cuda_driver.cc
+++ b/tensorflow/stream_executor/cuda/cuda_driver.cc
@@ -15,10 +15,11 @@ limitations under the License.
 
 #include "tensorflow/stream_executor/cuda/cuda_driver.h"
 
-#include <map>
 #include <stdint.h>
 #include <stdlib.h>
+#include <map>
 #include <set>
+#include <utility>
 
 #include "tensorflow/stream_executor/cuda/cuda_diagnostics.h"
 #include "tensorflow/stream_executor/lib/casts.h"
@@ -227,7 +228,7 @@ string ToString(CUresult result) {
 // created by StreamExecutor (to ensure that the CUDA runtime didn't create a
 // context behind our backs).
 CUcontext CurrentContext() {
-  CUcontext current  = CUDADriver::CurrentContextOrDie();
+  CUcontext current = CUDADriver::CurrentContextOrDie();
   if (current != nullptr && !CreatedContexts::Has(current)) {
     LOG(FATAL) << "current context was not created by the StreamExecutor "
                   "cuda_driver API: "
@@ -453,7 +454,8 @@ static port::Status InternalInit() {
   return true;
 }
 
-bool DeviceOptionsToContextFlags(DeviceOptions device_options, int *flags) {
+bool DeviceOptionsToContextFlags(const DeviceOptions &device_options,
+                                 int *flags) {
   static_assert(DeviceOptions::kMask == 0xf,
                 "needs update for new device options");
 
@@ -480,27 +482,56 @@ bool DeviceOptionsToContextFlags(DeviceOptions device_options, int *flags) {
     CUdevice device, DeviceOptions device_options, CudaContext** context) {
   *context = nullptr;
 
-  CUcontext former_context = CurrentContext();
-  if (former_context != nullptr) {
-    LOG(WARNING) << "creating context when one is currently active; existing: "
-                 << former_context;
-  }
-
   int flags = 0;
   if (!DeviceOptionsToContextFlags(device_options, &flags)) {
     LOG(WARNING) << "could not convert all device options into context flags";
   }
 
   CUresult res;
+  CUcontext former_context;
   CUcontext new_context;
   {
     // TODO(leary) Need to see if NVIDIA can expunge the leakiness in their
     // context creation: see http://b/13248943
 
 #if CUDA_VERSION >= 7000
-    res = cuDevicePrimaryCtxSetFlags(device, flags);
+    {
+      unsigned int former_primary_context_flags;
+      int former_primary_context_is_active;
+      CHECK_EQ(CUDA_SUCCESS,
+               cuDevicePrimaryCtxGetState(device, &former_primary_context_flags,
+                                          &former_primary_context_is_active));
+      if (former_primary_context_flags != flags) {
+        if (former_primary_context_is_active) {
+          LOG(ERROR)
+              << "The primary context is active and has a different flag set ("
+              << former_primary_context_flags << ") than the desired flag set ("
+              << flags << ").";
+        } else {
+          CHECK_EQ(CUDA_SUCCESS, cuDevicePrimaryCtxSetFlags(device, flags));
+        }
+      }
+    }
+
+    former_context = CUDADriver::CurrentContextOrDie();
     res = cuDevicePrimaryCtxRetain(&new_context, device);
+    if (former_context != nullptr) {
+      if (former_context == new_context) {
+        VLOG(2) << "The primary context " << former_context
+                << " exists before initializing the StreamExecutor.";
+      } else {
+        LOG(WARNING) << "A non-primary context " << former_context
+                     << " exists before initializing the StreamExecutor. We "
+                        "haven't verified StreamExecutor works with that.";
+      }
+    }
 #else
+    former_context = CurrentContext();
+    if (former_context != nullptr) {
+      LOG(WARNING)
+          << "creating context when one is currently active; existing: "
+          << former_context;
+    }
     res = cuCtxCreate(&new_context, flags, device);
 #endif
   }
diff --git a/tensorflow/stream_executor/cuda/cuda_gpu_executor.cc b/tensorflow/stream_executor/cuda/cuda_gpu_executor.cc
index 1bb90afd63e2eee3a51b057cebeff2c3cb2eac8f..c1e72bb56550285a20c4731e3eb41a1fd0623a22 100644
--- a/tensorflow/stream_executor/cuda/cuda_gpu_executor.cc
+++ b/tensorflow/stream_executor/cuda/cuda_gpu_executor.cc
@@ -67,14 +67,6 @@ limitations under the License.
 extern bool FLAGS_check_gpu_leaks;
 bool FLAGS_prefer_cubin_to_ptx = true;
 
-namespace perftools {
-namespace gputools {
-namespace rng {
-class RngSupport;
-}  // namespace rng
-}  // namespace gputools
-}  // namespace perftools
-
 namespace perftools {
 namespace gputools {
 namespace cuda {
diff --git a/tensorflow/stream_executor/cuda/cuda_gpu_executor.h b/tensorflow/stream_executor/cuda/cuda_gpu_executor.h
index 9d386b5ed9ef891751b59c560f3fa1696166d77e..6c5b9dca90b8be632d084aff46657132807b8ea5 100644
--- a/tensorflow/stream_executor/cuda/cuda_gpu_executor.h
+++ b/tensorflow/stream_executor/cuda/cuda_gpu_executor.h
@@ -35,17 +35,6 @@ limitations under the License.
 #include "tensorflow/stream_executor/platform/thread_annotations.h"
 #include "tensorflow/stream_executor/stream_executor_internal.h"
 
-namespace perftools {
-namespace gputools {
-namespace blas {
-class BlasSupport;
-}
-namespace internal {
-class RngSupport;
-}  // namespace internal
-}  // namespace gputools
-}  // namespace perftools
-
 namespace perftools {
 namespace gputools {
 namespace cuda {
diff --git a/tensorflow/stream_executor/lib/statusor.h b/tensorflow/stream_executor/lib/statusor.h
index 06278d515254cc7fadced3da94a8eb4ae9829245..bb423e390aa7ab32b3b388ae747a0e5d7856484a 100644
--- a/tensorflow/stream_executor/lib/statusor.h
+++ b/tensorflow/stream_executor/lib/statusor.h
@@ -135,7 +135,7 @@ class StatusOr {
   // operators, to support move-only types and avoid unnecessary copying.
   StatusOr(T&& value);  // NOLINT
 
-  // Move conversion operator to avoid unecessary copy.
+  // Move conversion operator to avoid unnecessary copy.
   // T must be assignable from U.
   // Not marked with explicit so the implicit conversion can happen.
   template <typename U>
diff --git a/tensorflow/stream_executor/stream_executor_pimpl.cc b/tensorflow/stream_executor/stream_executor_pimpl.cc
index 42fcd5867cae4f8306174afb32f439c3cebe13bf..fe5da12639fdb73b18e9b5526b00e101dd509e25 100644
--- a/tensorflow/stream_executor/stream_executor_pimpl.cc
+++ b/tensorflow/stream_executor/stream_executor_pimpl.cc
@@ -20,6 +20,7 @@ limitations under the License.
 #include "tensorflow/stream_executor/stream_executor_pimpl.h"
 
 #include <atomic>
+#include <utility>
 
 #include "tensorflow/stream_executor/blas.h"
 #include "tensorflow/stream_executor/fft.h"
@@ -204,7 +205,7 @@ StreamExecutor::~StreamExecutor() {
 port::Status StreamExecutor::Init(int device_ordinal,
                                   DeviceOptions device_options) {
   device_ordinal_ = device_ordinal;
-  return implementation_->Init(device_ordinal, device_options);
+  return implementation_->Init(device_ordinal, std::move(device_options));
 }
 
 port::Status StreamExecutor::Init() {
@@ -619,7 +620,7 @@ bool StreamExecutor::Memset32(Stream *stream, DeviceMemoryBase *location,
 
 bool StreamExecutor::HostCallback(Stream *stream,
                                   std::function<void()> callback) {
-  return implementation_->HostCallback(stream, callback);
+  return implementation_->HostCallback(stream, std::move(callback));
 }
 
 port::Status StreamExecutor::AllocateEvent(Event *event) {
@@ -689,7 +690,7 @@ bool StreamExecutor::DeviceMemoryUsage(int64 *free, int64 *total) const {
 }
 
 void StreamExecutor::EnqueueOnBackgroundThread(std::function<void()> task) {
-  background_threads_->Schedule(task);
+  background_threads_->Schedule(std::move(task));
 }
 
 void StreamExecutor::CreateAllocRecord(void *opaque, uint64 bytes) {
diff --git a/tensorflow/tensorboard/README.md b/tensorflow/tensorboard/README.md
index c9e997044c7d0858ed1434c927a290b550ccfdfe..20be8593cb3c96c732a377c0ad8a259e71514f1d 100644
--- a/tensorflow/tensorboard/README.md
+++ b/tensorflow/tensorboard/README.md
@@ -303,19 +303,14 @@ events. This behavior may be disabled with the flag
 
 ### How can I export data from TensorBoard?
 
-If you'd like to export data to visualize elsewhere (e.g. iPython Notebook),
-that's possible too. You can directly depend on the underlying classes that
-TensorBoard uses for loading data: `python/summary/event_accumulator.py` (for
-loading data from a single run) or `python/summary/event_multiplexer.py` (for
-loading data from multiple runs, and keeping it organized). These classes load
-groups of event files, discard data that was "orphaned" by TensorFlow crashes,
-and organize the data by tag.
-
-As another option, there is a script
-(`tensorboard/scripts/serialize_tensorboard.py`) which will load a logdir just
-like TensorBoard does, but write all of the data out to disk as json instead of
-starting a server. This script is setup to make "fake TensorBoard backends" for
-testing, so it is a bit rough around the edges.
+The Scalar Dashboard supports exporting data; you can click the "enable
+download links" option in the left-hand bar. Then, each plot will provide
+download links for the data it contains.
+
+If you need access to the full dataset, you can read the event files that
+TensorBoard consumes by using the [`summary_iterator`](https://github.com/tensorflow/tensorflow/blob/e7f333b5f8b3c53b21d149d8d14c0cebbde431aa/tensorflow/python/summary/summary_iterator.py#L313)
+method.
+
 
 ### Can I overlap multiple plots?
 
diff --git a/tensorflow/tensorboard/backend/application_test.py b/tensorflow/tensorboard/backend/application_test.py
index a5181401fa213b4d52f459d1d9632455a6e49e78..4ea627def7ce5e43e40e2294ec35cccce1ccb280 100644
--- a/tensorflow/tensorboard/backend/application_test.py
+++ b/tensorflow/tensorboard/backend/application_test.py
@@ -227,6 +227,19 @@ class TensorboardServerTest(test.TestCase):
       response.read()
       connection.close()
 
+  def testScalars(self):
+    """Test the format of /data/scalars."""
+    data = self._getJson('/data/scalars?run=run1&tag=simple_values')
+    self.assertEqual(len(data), self._SCALAR_COUNT)
+
+  def testScalarsCsv(self):
+    """Test the csv format of /data/scalars."""
+    data = self._get(
+        '/data/scalars?run=run1&tag=simple_values&format=csv').read()
+    line_count = data.count('\n')
+    self.assertEqual(line_count,
+                     self._SCALAR_COUNT + 1)  # include 1 more line for header
+
   def testHistograms(self):
     """Test the format of /data/histograms."""
     self.assertEqual(
diff --git a/tensorflow/tensorboard/backend/event_processing/event_accumulator_test.py b/tensorflow/tensorboard/backend/event_processing/event_accumulator_test.py
index 3734e470b694671047b59b5317c011b55c0d6245..f7d424cb912fcacb32899b97ecbfb2676307e8ef 100644
--- a/tensorflow/tensorboard/backend/event_processing/event_accumulator_test.py
+++ b/tensorflow/tensorboard/backend/event_processing/event_accumulator_test.py
@@ -225,6 +225,7 @@ class MockingEventAccumulatorTest(EventAccumulatorTest):
     self.assertTagsEqual(x.Tags(), {})
 
   def testTags(self):
+    """Tags should be found in EventAccumulator after adding some events."""
     gen = _EventGenerator(self)
     gen.AddScalar('s1')
     gen.AddScalar('s2')
@@ -245,6 +246,7 @@ class MockingEventAccumulatorTest(EventAccumulatorTest):
     })
 
   def testReload(self):
+    """EventAccumulator contains suitable tags after calling Reload."""
     gen = _EventGenerator(self)
     acc = ea.EventAccumulator(gen)
     acc.Reload()
@@ -267,6 +269,7 @@ class MockingEventAccumulatorTest(EventAccumulatorTest):
     })
 
   def testScalars(self):
+    """Tests whether EventAccumulator contains scalars after adding them."""
     gen = _EventGenerator(self)
     acc = ea.EventAccumulator(gen)
     s1 = ea.ScalarEvent(wall_time=1, step=10, value=32)
@@ -293,6 +296,7 @@ class MockingEventAccumulatorTest(EventAccumulatorTest):
       self.assertEqual(expected_value, gotten_event.value[i])
 
   def testHealthPills(self):
+    """HealthPills should be properly inserted into EventAccumulator."""
     gen = _EventGenerator(self)
     acc = ea.EventAccumulator(gen)
     gen.AddHealthPill(13371337, 41, 'Add', 0, range(1, 13))
@@ -328,6 +332,7 @@ class MockingEventAccumulatorTest(EventAccumulatorTest):
     self.assertItemsEqual(['Add', 'MatMul'], acc.GetOpsWithHealthPills())
 
   def testHistograms(self):
+    """Tests whether histograms are inserted into EventAccumulator."""
     gen = _EventGenerator(self)
     acc = ea.EventAccumulator(gen)
 
@@ -377,6 +382,7 @@ class MockingEventAccumulatorTest(EventAccumulatorTest):
     self.assertEqual(acc.Histograms('hst2'), [hst2])
 
   def testCompressedHistograms(self):
+    """Tests compressed histograms inserted into EventAccumulator."""
     gen = _EventGenerator(self)
     acc = ea.EventAccumulator(gen, compression_bps=(0, 2500, 5000, 7500, 10000))
 
@@ -428,6 +434,7 @@ class MockingEventAccumulatorTest(EventAccumulatorTest):
     self.assertEqual(acc.CompressedHistograms('hst2'), [expected_cmphst2])
 
   def testCompressedHistogramsWithEmptyHistogram(self):
+    """Tests that empty histograms compressed properly in EventAccumulator."""
     gen = _EventGenerator(self)
     acc = ea.EventAccumulator(gen, compression_bps=(0, 2500, 5000, 7500, 10000))
 
@@ -481,6 +488,7 @@ class MockingEventAccumulatorTest(EventAccumulatorTest):
     self.assertAlmostEqual(vals[8].value, 1.0)
 
   def testImages(self):
+    """Tests 2 images inserted/accessed in EventAccumulator."""
     gen = _EventGenerator(self)
     acc = ea.EventAccumulator(gen)
     im1 = ea.ImageEvent(
@@ -514,6 +522,7 @@ class MockingEventAccumulatorTest(EventAccumulatorTest):
     self.assertEqual(acc.Images('im2'), [im2])
 
   def testAudio(self):
+    """Tests 2 audio events inserted/accessed in EventAccumulator."""
     gen = _EventGenerator(self)
     acc = ea.EventAccumulator(gen)
     snd1 = ea.AudioEvent(
@@ -551,6 +560,7 @@ class MockingEventAccumulatorTest(EventAccumulatorTest):
     self.assertEqual(acc.Audio('snd2'), [snd2])
 
   def testKeyError(self):
+    """KeyError should be raised when accessing non-existing keys."""
     gen = _EventGenerator(self)
     acc = ea.EventAccumulator(gen)
     acc.Reload()
@@ -574,7 +584,7 @@ class MockingEventAccumulatorTest(EventAccumulatorTest):
       acc.Audio('hst1')
 
   def testNonValueEvents(self):
-    """Tests that non-value events in the generator don't cause early exits."""
+    """Non-value events in the generator don't cause early exits."""
     gen = _EventGenerator(self)
     acc = ea.EventAccumulator(gen)
     gen.AddScalar('s1', wall_time=1, step=10, value=20)
diff --git a/tensorflow/tensorboard/backend/event_processing/event_multiplexer_test.py b/tensorflow/tensorboard/backend/event_processing/event_multiplexer_test.py
index ded1856d7e3718535f991a65e416f61a03397ad8..a97f39e87f85f04123f5f2ddddab7af84b960a52 100644
--- a/tensorflow/tensorboard/backend/event_processing/event_multiplexer_test.py
+++ b/tensorflow/tensorboard/backend/event_processing/event_multiplexer_test.py
@@ -124,16 +124,19 @@ class EventMultiplexerTest(test_util.TensorFlowTestCase):
     self.stubs.CleanUp()
 
   def testEmptyLoader(self):
+    """Tests empty EventMultiplexer creation."""
     x = event_multiplexer.EventMultiplexer()
     self.assertEqual(x.Runs(), {})
 
   def testRunNamesRespected(self):
+    """Tests two EventAccumulators inserted/accessed in EventMultiplexer."""
     x = event_multiplexer.EventMultiplexer({'run1': 'path1', 'run2': 'path2'})
     self.assertItemsEqual(sorted(x.Runs().keys()), ['run1', 'run2'])
     self.assertEqual(x._GetAccumulator('run1')._path, 'path1')
     self.assertEqual(x._GetAccumulator('run2')._path, 'path2')
 
   def testReload(self):
+    """EventAccumulators should Reload after EventMultiplexer call it."""
     x = event_multiplexer.EventMultiplexer({'run1': 'path1', 'run2': 'path2'})
     self.assertFalse(x._GetAccumulator('run1').reload_called)
     self.assertFalse(x._GetAccumulator('run2').reload_called)
@@ -142,6 +145,7 @@ class EventMultiplexerTest(test_util.TensorFlowTestCase):
     self.assertTrue(x._GetAccumulator('run2').reload_called)
 
   def testScalars(self):
+    """Tests Scalars function returns suitable values."""
     x = event_multiplexer.EventMultiplexer({'run1': 'path1', 'run2': 'path2'})
 
     run1_actual = x.Scalars('run1', 'sv1')
@@ -150,6 +154,7 @@ class EventMultiplexerTest(test_util.TensorFlowTestCase):
     self.assertEqual(run1_expected, run1_actual)
 
   def testHealthPills(self):
+    """Tests HealthPills() returns events associated with run1/Add."""
     self.stubs.Set(event_accumulator, 'EventAccumulator',
                    functools.partial(
                        _GetFakeAccumulator,
@@ -172,11 +177,13 @@ class EventMultiplexerTest(test_util.TensorFlowTestCase):
     self.assertItemsEqual(['Add'], x.GetOpsWithHealthPills('run1'))
 
   def testExceptions(self):
+    """KeyError should be raised when accessing non-existing keys."""
     x = event_multiplexer.EventMultiplexer({'run1': 'path1', 'run2': 'path2'})
     with self.assertRaises(KeyError):
       x.Scalars('sv1', 'xxx')
 
   def testInitialization(self):
+    """Tests EventMultiplexer is created properly with its params."""
     x = event_multiplexer.EventMultiplexer()
     self.assertEqual(x.Runs(), {})
     x = event_multiplexer.EventMultiplexer({'run1': 'path1', 'run2': 'path2'})
@@ -185,6 +192,14 @@ class EventMultiplexerTest(test_util.TensorFlowTestCase):
     self.assertEqual(x._GetAccumulator('run2')._path, 'path2')
 
   def testAddRunsFromDirectory(self):
+    """Tests AddRunsFromDirectory function.
+
+    Tests the following scenarios:
+    - When the directory does not exist.
+    - When the directory is empty.
+    - When the directory has empty subdirectory.
+    - Contains proper EventAccumulators after adding events.
+    """
     x = event_multiplexer.EventMultiplexer()
     tmpdir = self.get_temp_dir()
     join = os.path.join
diff --git a/tensorflow/tensorboard/backend/event_processing/plugin_asset_util.py b/tensorflow/tensorboard/backend/event_processing/plugin_asset_util.py
index b9b951fd127875041a6a671ec81c918d83cc7321..34bfd34195fc9185e27dc1a524c99c2773c068da 100644
--- a/tensorflow/tensorboard/backend/event_processing/plugin_asset_util.py
+++ b/tensorflow/tensorboard/backend/event_processing/plugin_asset_util.py
@@ -30,6 +30,11 @@ def _IsDirectory(parent, item):
   return gfile.IsDirectory(os.path.join(parent, item))
 
 
+def PluginDirectory(logdir, plugin_name):
+  """Returns the plugin directory for plugin_name."""
+  return os.path.join(logdir, _PLUGINS_DIR, plugin_name)
+
+
 def ListPlugins(logdir):
   """List all the plugins that have registered assets in logdir.
 
@@ -61,7 +66,7 @@ def ListAssets(logdir, plugin_name):
     not exist (either because the logdir doesn't exist, or because the plugin
     didn't register) an empty list is returned.
   """
-  plugin_dir = os.path.join(logdir, _PLUGINS_DIR, plugin_name)
+  plugin_dir = PluginDirectory(logdir, plugin_name)
   if not gfile.IsDirectory(plugin_dir):
     return []
   entries = gfile.ListDirectory(plugin_dir)
@@ -83,7 +88,7 @@ def RetrieveAsset(logdir, plugin_name, asset_name):
     KeyError: if the asset does not exist.
   """
 
-  asset_path = os.path.join(logdir, _PLUGINS_DIR, plugin_name, asset_name)
+  asset_path = os.path.join(PluginDirectory(logdir, plugin_name), asset_name)
   try:
     with gfile.Open(asset_path, "r") as f:
       return f.read()
diff --git a/tensorflow/tensorboard/backend/event_processing/plugin_asset_util_test.py b/tensorflow/tensorboard/backend/event_processing/plugin_asset_util_test.py
index e74a0642d6efecfc31e92ca1ca0c473ba1bae6aa..cfc6857777c9d48ff98e87a4503f7736034b71e4 100644
--- a/tensorflow/tensorboard/backend/event_processing/plugin_asset_util_test.py
+++ b/tensorflow/tensorboard/backend/event_processing/plugin_asset_util_test.py
@@ -50,6 +50,11 @@ class PluginGamma(GenericContentPlugin):
 
 class PluginAssetUtilitiesTest(test.TestCase):
 
+  def testGetPluginDirectory(self):
+    self.assertEqual(
+        os.path.join("logdir", "plugins", "x"),
+        plugin_asset_util.PluginDirectory("logdir", "x"))
+
   def testNonExistentDirectory(self):
     tempdir = self.get_temp_dir()
     fake_dir = os.path.join(tempdir, "nonexistent_dir")
diff --git a/tensorflow/tensorboard/components/tf_audio_dashboard/BUILD b/tensorflow/tensorboard/components/tf_audio_dashboard/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..9172ebb22ae99815c084462f05a1ff5aacb95ef0
--- /dev/null
+++ b/tensorflow/tensorboard/components/tf_audio_dashboard/BUILD
@@ -0,0 +1,63 @@
+package(default_visibility = ["//tensorflow:internal"])
+
+load("@io_bazel_rules_closure//closure:defs.bzl", "web_library")
+load("//tensorflow/tensorboard:defs.bzl", "tensorboard_ts_library")
+load("//tensorflow/tensorboard:defs.bzl", "tensorboard_webcomponent_library")
+
+licenses(["notice"])  # Apache 2.0
+
+web_library(
+    name = "tf_audio_dashboard",
+    srcs = [
+        "tf-audio-dashboard.html",
+        "tf-audio-grid.html",
+        "tf-audio-loader.html",
+    ],
+    path = "/tf-audio-dashboard",
+    deps = [
+        "//tensorflow/tensorboard/components/tf_backend",
+        "//tensorflow/tensorboard/components/tf_dashboard_common",
+        "//tensorflow/tensorboard/components/tf_imports:lodash",
+        "@org_polymer",
+        "@org_polymer_paper_icon_button",
+        "@org_polymer_paper_slider",
+        "@org_polymer_paper_spinner",
+        "@org_polymer_paper_styles",
+    ],
+)
+
+filegroup(
+    name = "all_files",
+    srcs = glob(["**"]),
+    tags = ["notsan"],
+)
+
+################################################################################
+# MARKED FOR DELETION
+
+tensorboard_webcomponent_library(
+    name = "legacy",
+    srcs = [
+        "tf-audio-dashboard.html",
+        "tf-audio-grid.html",
+        "tf-audio-loader.html",
+    ],
+    destdir = "tf-audio-dashboard",
+    deps = [
+        "//tensorflow/tensorboard/components:tf_imports",
+        "//tensorflow/tensorboard/components/tf_backend:legacy",
+        "//tensorflow/tensorboard/components/tf_dashboard_common:legacy",
+        "//third_party/javascript/polymer/v1/paper-icon-button:lib",
+        "//third_party/javascript/polymer/v1/paper-styles:lib",
+        "//third_party/javascript/polymer/v1/polymer:lib",
+    ],
+)
+
+# This is needed: components/BUILD seeks a legacy_ts rule in this package.
+tensorboard_ts_library(
+    name = "legacy_ts",
+    srcs = [],
+    deps_mgmt = "off",
+    runtime = "nodejs",
+    deps = ["//tensorflow/tensorboard/components:common_deps"],
+)
diff --git a/tensorflow/tensorboard/components/tf_audio_dashboard/demo/BUILD b/tensorflow/tensorboard/components/tf_audio_dashboard/demo/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..394d7a8fa53a8e3b27d2a2d4f20703a3ee1981ae
--- /dev/null
+++ b/tensorflow/tensorboard/components/tf_audio_dashboard/demo/BUILD
@@ -0,0 +1,26 @@
+package(default_visibility = ["//tensorflow:internal"])
+
+load("@io_bazel_rules_closure//closure:defs.bzl", "web_library")
+
+licenses(["notice"])  # Apache 2.0
+
+# bazel run //third_party/tensorflow/tensorboard/components/tf_audio_dashboard/demo
+web_library(
+    name = "demo",
+    srcs = ["index.html"],
+    path = "/tf-audio-dashboard/demo",
+    deps = [
+        "//tensorflow/tensorboard/components/tf_audio_dashboard",
+        "//tensorflow/tensorboard/components/tf_audio_dashboard/demo/data",
+        "//tensorflow/tensorboard/components/tf_imports:d3",
+        "@org_polymer_iron_demo_helpers",
+        "@org_polymer_paper_styles",
+        "@org_polymer_webcomponentsjs",
+    ],
+)
+
+filegroup(
+    name = "all_files",
+    srcs = glob(["**"]),
+    tags = ["notsan"],
+)
diff --git a/tensorflow/tensorboard/components/tf_audio_dashboard/demo/data/BUILD b/tensorflow/tensorboard/components/tf_audio_dashboard/demo/data/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..4c2c2ee9e526edddc2eaa4af00f0fe90ad6b1380
--- /dev/null
+++ b/tensorflow/tensorboard/components/tf_audio_dashboard/demo/data/BUILD
@@ -0,0 +1,17 @@
+package(default_visibility = ["//tensorflow:internal"])
+
+load("@io_bazel_rules_closure//closure:defs.bzl", "web_library")
+
+licenses(["notice"])  # Apache 2.0
+
+web_library(
+    name = "data",
+    srcs = glob(["*"]),
+    path = "/tf-audio-dashboard/demo/data",
+)
+
+filegroup(
+    name = "all_files",
+    srcs = glob(["**"]),
+    tags = ["notsan"],
+)
diff --git a/tensorflow/tensorboard/components/tf_audio_dashboard/tf-audio-loader.html b/tensorflow/tensorboard/components/tf_audio_dashboard/tf-audio-loader.html
index ed3b5efa07e0a27d1078d4f35aba9b0445a1daaa..71539537d0e55efcc6c1e07ed76f79ec5699ecf4 100644
--- a/tensorflow/tensorboard/components/tf_audio_dashboard/tf-audio-loader.html
+++ b/tensorflow/tensorboard/components/tf_audio_dashboard/tf-audio-loader.html
@@ -107,6 +107,8 @@ future for loading older clips.
     </template>
   </template>
   <script>
+    "use strict";
+
     Polymer({
       is: "tf-audio-loader",
       properties: {
diff --git a/tensorflow/tensorboard/components/tf_audio_dashboard_d3v4/BUILD b/tensorflow/tensorboard/components/tf_audio_dashboard_d3v4/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..c73207475461f06843392b6e3545434013c564a7
--- /dev/null
+++ b/tensorflow/tensorboard/components/tf_audio_dashboard_d3v4/BUILD
@@ -0,0 +1,79 @@
+package(default_visibility = ["//tensorflow:internal"])
+
+load("@io_bazel_rules_closure//closure:defs.bzl", "web_library")
+load("//tensorflow/tensorboard:defs.bzl", "tensorboard_typescript_genrule")
+
+licenses(["notice"])  # Apache 2.0
+
+web_library(
+    name = "tf_audio_dashboard_d3v4",
+    srcs = [
+        "tf-audio-dashboard.html",
+        "tf-audio-grid.html",
+        "tf-audio-loader.html",
+    ],
+    path = "/tf-audio-dashboard",
+    deps = [
+        "//tensorflow/tensorboard/components/tf_backend_d3v4",
+        "//tensorflow/tensorboard/components/tf_dashboard_common_d3v4",
+        "//tensorflow/tensorboard/components/tf_imports_d3v4:d3",
+        "//tensorflow/tensorboard/components/tf_imports_d3v4:lodash",
+        "@org_polymer",
+        "@org_polymer_paper_icon_button",
+        "@org_polymer_paper_slider",
+        "@org_polymer_paper_spinner",
+        "@org_polymer_paper_styles",
+    ],
+)
+
+web_library(
+    name = "demo",
+    srcs = ["index.html"] + glob(["data/**"]),
+    path = "/tf-audio-dashboard/demo",
+    deps = [
+        ":tf_audio_dashboard_d3v4",
+        "//tensorflow/tensorboard/components/tf_imports_d3v4:d3",
+        "@org_polymer_iron_demo_helpers",
+        "@org_polymer_paper_styles",
+        "@org_polymer_webcomponentsjs",
+    ],
+)
+
+web_library(
+    name = "test",
+    testonly = 1,
+    srcs = [
+        "audioDashboardTests.js",
+        "tests.html",
+    ] + glob(["data/**"]),
+    path = "/tf-audio-dashboard/test",
+    deps = [
+        ":tf_audio_dashboard_d3v4",
+        "//tensorflow/tensorboard/components/tf_backend_d3v4",
+        "//tensorflow/tensorboard/components/tf_imports_d3v4:d3",
+        "@org_npmjs_registry_web_component_tester",
+        "@org_polymer",
+        "@org_polymer_webcomponentsjs",
+    ],
+)
+
+tensorboard_typescript_genrule(
+    name = "ts",
+    testonly = 1,
+    srcs = ["audioDashboardTests.ts"],
+    typings = [
+        "@org_definitelytyped//:chai.d.ts",
+        "@org_definitelytyped//:mocha.d.ts",
+        "@org_definitelytyped//:sinon.d.ts",
+        "@org_definitelytyped//:polymer.d.ts",
+        "@org_definitelytyped//:webcomponents.js.d.ts",
+        "//tensorflow/tensorboard/components/tf_backend_d3v4:bundle.d.ts",
+        "//tensorflow/tensorboard/components/tf_imports_d3v4:d3.d.ts",
+    ],
+)
+
+filegroup(
+    name = "all_files",
+    srcs = glob(["**"]),
+    tags = ["notsan"],
+)
diff --git a/tensorflow/tensorboard/components/tf_audio_dashboard_d3v4/audioDashboardTests.ts b/tensorflow/tensorboard/components/tf_audio_dashboard_d3v4/audioDashboardTests.ts
new file mode 100644
index 0000000000000000000000000000000000000000..f2bf68eb8de5fa9bf66e96e4d710f959aaaecaff
--- /dev/null
+++ b/tensorflow/tensorboard/components/tf_audio_dashboard_d3v4/audioDashboardTests.ts
@@ -0,0 +1,47 @@
+/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the 'License');
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an 'AS IS' BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// TODO(dandelion): Fix me.
+declare function fixture(id: string): any;
+declare function stub(x, y: any): void;
+
+describe(
+    'audio dashboard tests', function() {
+      var audioDash;
+      var reloadCount = 0;
+      beforeEach(function() {
+        audioDash = fixture('testElementFixture');
+        var router = TF.Backend.router('data', true);
+        var backend = new TF.Backend.Backend(router);
+        audioDash.backend = backend;
+        stub('tf-audio-loader', {
+          reload: function() { reloadCount++; },
+        });
+      });
+
+      it('calling reload on dashboard reloads the audio-loaders',
+         function(done) {
+           audioDash.backendReload().then(() => {
+             reloadCount = 0;
+             var loaders = [].slice.call(
+                 audioDash.getElementsByTagName('tf-audio-loader'));
+             audioDash.frontendReload();
+             setTimeout(function() {
+               chai.assert.isAbove(reloadCount, 3);
+               done();
+             });
+           });
+         });
+    });
diff --git a/tensorflow/tensorboard/components/tf_audio_dashboard_d3v4/data/audio_run_run1_tag_au1_2Faudio_2F0.json b/tensorflow/tensorboard/components/tf_audio_dashboard_d3v4/data/audio_run_run1_tag_au1_2Faudio_2F0.json
new file mode 100644
index 0000000000000000000000000000000000000000..7dfe32c7112c61bcacf896de2d906bc06a9c952f
--- /dev/null
+++ b/tensorflow/tensorboard/components/tf_audio_dashboard_d3v4/data/audio_run_run1_tag_au1_2Faudio_2F0.json
@@ -0,0 +1 @@
+[{"query": "index=0&tag=au1%2Faudio%2F0&run=run1", "step": 0, "wall_time": 1461795049.203407, "content_type": "audio/wav"}]
\ No newline at end of file
diff --git a/tensorflow/tensorboard/components/tf_audio_dashboard_d3v4/data/audio_run_run2_tag_au2_2Faudio_2F0.json b/tensorflow/tensorboard/components/tf_audio_dashboard_d3v4/data/audio_run_run2_tag_au2_2Faudio_2F0.json
new file mode 100644
index 0000000000000000000000000000000000000000..d29f7931a91a23d9933cc952e48e171f328f76d3
--- /dev/null
+++ b/tensorflow/tensorboard/components/tf_audio_dashboard_d3v4/data/audio_run_run2_tag_au2_2Faudio_2F0.json
@@ -0,0 +1,20 @@
+[
+  {
+    "query":"index=0&tag=au2%2Faudio%2F0&run=run2",
+    "step":0,
+    "wall_time":1461795049.212815,
+    "content_type":"audio/wav"
+  },
+  {
+    "query":"index=1&tag=au2%2Faudio%2F0&run=run2",
+    "step":42,
+    "wall_time":1461895689.243345,
+    "content_type":"audio/wav"
+  },
+  {
+    "query":"index=2&tag=au2%2Faudio%2F0&run=run2",
+    "step":4242,
+    "wall_time":1461954231.123456,
+    "content_type":"audio/wav"
+  }
+]
diff --git a/tensorflow/tensorboard/components/tf_audio_dashboard_d3v4/data/individualAudio_index_0_tag_au1_2Faudio_2F0_run_run1.wav b/tensorflow/tensorboard/components/tf_audio_dashboard_d3v4/data/individualAudio_index_0_tag_au1_2Faudio_2F0_run_run1.wav
new file mode 100644
index 0000000000000000000000000000000000000000..f1d24adc0cef5a734e07e8899b9abf8ae26fa228
Binary files /dev/null and b/tensorflow/tensorboard/components/tf_audio_dashboard_d3v4/data/individualAudio_index_0_tag_au1_2Faudio_2F0_run_run1.wav differ
diff --git a/tensorflow/tensorboard/components/tf_audio_dashboard_d3v4/data/individualAudio_index_0_tag_au2_2Faudio_2F0_run_run2.wav b/tensorflow/tensorboard/components/tf_audio_dashboard_d3v4/data/individualAudio_index_0_tag_au2_2Faudio_2F0_run_run2.wav
new file mode 100644
index 0000000000000000000000000000000000000000..006c84338f7313a225830f121bcd95f457de1708
Binary files /dev/null and b/tensorflow/tensorboard/components/tf_audio_dashboard_d3v4/data/individualAudio_index_0_tag_au2_2Faudio_2F0_run_run2.wav differ
diff --git a/tensorflow/tensorboard/components/tf_audio_dashboard_d3v4/data/individualAudio_index_1_tag_au2_2Faudio_2F0_run_run2.wav b/tensorflow/tensorboard/components/tf_audio_dashboard_d3v4/data/individualAudio_index_1_tag_au2_2Faudio_2F0_run_run2.wav
new file mode 100644
index 0000000000000000000000000000000000000000..f27904691f793eb5348b82c4a557991e44c9ab90
Binary files /dev/null and b/tensorflow/tensorboard/components/tf_audio_dashboard_d3v4/data/individualAudio_index_1_tag_au2_2Faudio_2F0_run_run2.wav differ
diff --git a/tensorflow/tensorboard/components/tf_audio_dashboard_d3v4/data/individualAudio_index_2_tag_au2_2Faudio_2F0_run_run2.wav b/tensorflow/tensorboard/components/tf_audio_dashboard_d3v4/data/individualAudio_index_2_tag_au2_2Faudio_2F0_run_run2.wav
new file mode 100644
index 0000000000000000000000000000000000000000..bbd3ff8bcbb60f786f522b4fbdcf92bff4955106
Binary files /dev/null and b/tensorflow/tensorboard/components/tf_audio_dashboard_d3v4/data/individualAudio_index_2_tag_au2_2Faudio_2F0_run_run2.wav differ
diff --git a/tensorflow/tensorboard/components/tf_audio_dashboard_d3v4/data/runs.json b/tensorflow/tensorboard/components/tf_audio_dashboard_d3v4/data/runs.json
new file mode 100644
index 0000000000000000000000000000000000000000..811a873684aa57f34d84a8c2e373afeb9436e4e7
--- /dev/null
+++ b/tensorflow/tensorboard/components/tf_audio_dashboard_d3v4/data/runs.json
@@ -0,0 +1,10 @@
+{
+  "run1":
+    {
+     "audio": ["au1/audio/0"]
+    },
+  "run2":
+    {
+      "audio": ["au2/audio/0"]
+    }
+}
diff --git a/tensorflow/tensorboard/components/tf_audio_dashboard_d3v4/index.html b/tensorflow/tensorboard/components/tf_audio_dashboard_d3v4/index.html
new file mode 100644
index 0000000000000000000000000000000000000000..8e0587084df2fef61a23848810b7efc246958894
--- /dev/null
+++ b/tensorflow/tensorboard/components/tf_audio_dashboard_d3v4/index.html
@@ -0,0 +1,61 @@
+<!doctype html>
+<!--
+@license
+Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+-->
+
+<script src="../../webcomponentsjs/webcomponents-lite.min.js"></script>
+<link rel="import" href="../../iron-demo-helpers/demo-snippet.html">
+<link rel="import" href="../../paper-styles/typography.html">
+<link rel="import" href="../tf-audio-dashboard.html">
+
+<title>Audio Dashboard Demo</title>
+<style>
+  #container {
+    height: 300px;
+    width: 100%;
+  }
+
+  html, body {
+    margin: 0;
+    padding: 0;
+    font-family: "RobotoDraft","Roboto",sans-serif;
+  }
+
+</style>
+<demo-snippet>
+  <template>
+    <dom-module id="audio-dash-demo">
+      <template>
+        <tf-audio-dashboard id="demo" backend="[[backend]]"></tf-audio-dashboard>
+      </template>
+      <script>
+        Polymer({
+          is: "audio-dash-demo",
+          properties: {
+            backend: {
+              type: Object,
+              value: function() {
+                var router = new TF.Backend.router("data", true);
+                return new TF.Backend.Backend(router);
+              },
+            },
+          },
+        });
+      </script>
+    </dom-module>
+    <audio-dash-demo id="container"></audio-dash-demo>
+  </template>
+</demo-snippet>
diff --git a/tensorflow/tensorboard/components/tf_audio_dashboard_d3v4/tests.html b/tensorflow/tensorboard/components/tf_audio_dashboard_d3v4/tests.html
new file mode 100644
index 0000000000000000000000000000000000000000..891e8bf0c29f5cca7a4654b49dde81997c6d27d5
--- /dev/null
+++ b/tensorflow/tensorboard/components/tf_audio_dashboard_d3v4/tests.html
@@ -0,0 +1,38 @@
+<!doctype html>
+<!--
+@license
+Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+-->
+
+<script src="../../webcomponentsjs/webcomponents-lite.min.js"></script>
+<script src="../../web-component-tester/browser.js"></script>
+<link rel="import" href="../../tf-imports/d3.html">
+<link rel="import" href="../tf-audio-dashboard.html">
+<style>
+  html, body {
+    margin: 0;
+    padding: 0;
+    height: 100%;
+    font-family: "RobotoDraft","Roboto",sans-serif;
+  }
+</style>
+
+<test-fixture id="testElementFixture">
+  <template>
+    <tf-audio-dashboard></tf-audio-dashboard>
+  </template>
+</test-fixture>
+
+<script src="audioDashboardTests.js"></script>
diff --git a/tensorflow/tensorboard/components/tf_audio_dashboard_d3v4/tf-audio-dashboard.html b/tensorflow/tensorboard/components/tf_audio_dashboard_d3v4/tf-audio-dashboard.html
new file mode 100644
index 0000000000000000000000000000000000000000..0353c51628d934b4490e2657670286d4e00beb55
--- /dev/null
+++ b/tensorflow/tensorboard/components/tf_audio_dashboard_d3v4/tf-audio-dashboard.html
@@ -0,0 +1,87 @@
+<!--
+@license
+Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+-->
+
+<link rel="import" href="../polymer/polymer.html">
+<link rel="import" href="../paper-icon-button/paper-icon-button.html">
+<link rel="import" href="tf-audio-grid.html">
+<link rel="import" href="../tf-dashboard-common/tf-dashboard.html">
+<link rel="import" href="../tf-backend/tf-backend.html">
+
+<!--
+tf-audio-dashboard displays a dashboard that loads audio from a TensorFlow run.
+
+@element tf-audio-dashboard
+@demo demo/index.html
+-->
+<dom-module id="tf-audio-dashboard">
+  <template>
+    <div class="center">
+      <tf-no-data-warning
+        data-type="audio"
+        show-warning="[[dataNotFound]]"
+      ></tf-no-data-warning>
+      <tf-audio-grid
+        id="audioGrid"
+        run-to-audio="[[run2tag]]"
+        audio-generator="[[dataProvider]]"
+        tags="[[tags]]"
+        runs="[[runs]]"
+      ></tf-audio-grid>
+    </div>
+
+    <style>
+      .center {
+        height: 100%;
+        width: 100%;
+        -webkit-box-sizing: border-box;
+        -moz-box-sizing: border-box;
+        box-sizing: border-box;
+      }
+      :host {
+        height: 100%;
+        display: block;
+      }
+
+    </style>
+  </template>
+  <script>
+    TF.Dashboard.TfAudioDashboard = Polymer({
+      is: "tf-audio-dashboard",
+      factoryImpl: function(backend) {
+        this.backend = backend;
+      },
+      properties: {
+        dataType: {value: "audio"},
+      },
+      behaviors: [
+        TF.Dashboard.DashboardBehavior("audio"),
+        TF.Dashboard.ReloadBehavior("tf-audio-loader"),
+        TF.Backend.BackendBehavior
+      ],
+      attached: function() {
+        this.async(function() {
+          this.fire("rendered");
+        });
+      },
+      _hasAudio: function(runToAudioChange) {
+        return _.values(runToAudioChange.base).some(function(arr) {
+          return arr.length > 0;
+        });
+      },
+    });
+  </script>
+</dom-module>
diff --git a/tensorflow/tensorboard/components/tf_audio_dashboard_d3v4/tf-audio-grid.html b/tensorflow/tensorboard/components/tf_audio_dashboard_d3v4/tf-audio-grid.html
new file mode 100644
index 0000000000000000000000000000000000000000..c71d8bdd4bf918ad3877d6ecae8394d131f007f8
--- /dev/null
+++ b/tensorflow/tensorboard/components/tf_audio_dashboard_d3v4/tf-audio-grid.html
@@ -0,0 +1,183 @@
+<!--
+@license
+Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+-->
+
+<link rel="import" href="../polymer/polymer.html">
+<link rel="import" href="../paper-styles/paper-styles.html">
+<link rel="import" href="tf-audio-loader.html">
+<link rel="import" href="../tf-imports/lodash.html">
+<link rel="import" href="../tf-dashboard-common/scrollbar-style.html">
+
+<!--
+tf-audio-grid creates a grid for examining audio data. The columns correspond
+to runs and the rows correspond to tags. Each cell is an audio clip.
+
+Structurally, it makes extensive use of flexbox for layout: it has a top-level
+columnar flexbox that contains the topRow (run names) and then a
+bottomContainer. The bottomContainer is another columnar flexbox which contains
+repeated audio-rows. Each audio-row is a row flexbox which contains a tag name
+cell, and then audio cells.
+
+In the future, we should improve on the layout by making the tag names and run names have fixed positions
+within the audio-grid, so that when you scroll you always have context (e.g. row and column names in a spreadsheet).
+For now, it just scrolls.
+
+The audio grid provides internal scroll bars (with styling) so that it can be dropped into
+a dashboard in a predictable fashion, even though the internal audio grid may be enormous.
+
+Room for future improvement:
+
+- Make it obvious when an audio didn't load due to the audio not existing.
+- Find some way to collapse sparse audio grids into denser ones (when sparsity
+is high)
+- Fix column/row names
+- Include hook for loading past audio (by step/timestamp? or index?)
+
+@element tf-audio-grid
+-->
+<dom-module id="tf-audio-grid">
+  <template>
+    <style include="scrollbar-style"></style>
+    <div id="fullContainer" class="container scrollbar">
+      <div id="topRow" class="container">
+        <div class="noshrink" id="paddingCell"></div>
+        <template is="dom-if" if="[[_tagsExist(tags)]]">
+          <template
+            is="dom-repeat"
+            items="[[runs]]"
+            as="run"
+          >
+            <div class="run-name-cell noshrink">
+              <span>[[run]]</span>
+            </div>
+          </template>
+        </template>
+      </div>
+      <div id="bottomContainer" class="container">
+        <template
+          is="dom-repeat"
+          items="[[tags]]"
+          as="tag"
+        >
+          <div class="audio-row container noshrink">
+            <div class="tag-name-cell noshrink">
+              <span class="tag-name">[[tag]]</span>
+            </div>
+            <template
+              is="dom-repeat"
+              items="[[runs]]"
+              as="run"
+            >
+              <div class="audio-cell noshrink">
+                <template is="dom-if" if="[[_exists(run, tag, runToAudio.*)]]">
+                  <tf-audio-loader
+                    id="loader"
+                    run="[[run]]"
+                    tag="[[tag]]"
+                    audio-generator="[[audioGenerator]]"
+                  >
+                  </tf-audio-loader>
+                </template>
+              </div>
+            </template>
+          </div>
+        </template>
+      </div>
+    </div>
+    <style>
+      :host {
+        display: block;
+        height: 100%;
+        --audio-cell-min-height: 105px;
+      }
+      .container {
+        display: flex;
+        flex-wrap: nowrap;
+      }
+      #fullContainer {
+        width: 100%;
+        height: 100%;
+        flex-direction: column;
+        padding-top: 20px;
+        overflow: auto;
+        -webkit-box-sizing: border-box;
+        -moz-box-sizing: border-box;
+        box-sizing: border-box;
+      }
+      #topRow {
+        flex-direction: row;
+      }
+      #bottomContainer {
+        flex-direction: column;
+        height: 100%;
+        width: 100%;
+      }
+      .audio-row {
+        flex-direction: row;
+        padding-top: 5px;
+      }
+      .audio-cell {
+        background: #FAFAFA;
+        width: 300px;
+        min-height: var(--audio-cell-min-height);
+        border: 1px solid black;
+        margin-right: 3px;
+        padding: 10px;
+        box-sizing: border-box;
+      }
+      .tag-name-cell {
+        width: 300px;
+        height: var(--audio-cell-min-height);
+        display:flex;
+        flex-direction: column;
+        justify-content: center;
+      }
+      .tag-name {
+        word-wrap: break-word;
+        text-align: center;
+        white-space: nowrap;
+      }
+      .run-name-cell {
+        width: 300px;
+        text-align: center;
+        margin-right: 5px;
+      }
+      .noshrink {
+        flex-shrink: 0;
+      }
+      #paddingCell {
+        width: 300px;
+      }
+    </style>
+  </template>
+  <script>
+    Polymer({
+      is: "tf-audio-grid",
+      properties: {
+        runToAudio: Object,
+        tags: Array,
+        runs: Array,
+        audioGenerator: Function,
+      },
+      _tagsExist: function(tags) {
+        return tags && tags.length > 0;
+      },
+      _exists: function (run, tag) {
+        return this.runToAudio[run].indexOf(tag) !== -1;
+      },
+    });
+  </script>
+</dom-module>
diff --git a/tensorflow/tensorboard/components/tf_audio_dashboard_d3v4/tf-audio-loader.html b/tensorflow/tensorboard/components/tf_audio_dashboard_d3v4/tf-audio-loader.html
new file mode 100644
index 0000000000000000000000000000000000000000..71539537d0e55efcc6c1e07ed76f79ec5699ecf4
--- /dev/null
+++ b/tensorflow/tensorboard/components/tf_audio_dashboard_d3v4/tf-audio-loader.html
@@ -0,0 +1,237 @@
+<!--
+@license
+Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+-->
+
+<link rel="import" href="../polymer/polymer.html">
+<link rel="import" href="../paper-slider/paper-slider.html">
+<link rel="import" href="../paper-spinner/paper-spinner-lite.html">
+<link rel="import" href="../tf-dashboard-common/tensorboard-color.html">
+<link rel="import" href="../tf-imports/lodash.html">
+
+<!--
+tf-audio-loader loads an individual audio clip from the TensorBoard backend.
+
+Right now it always loads the most recent audio clip. We should add support in the
+future for loading older clips.
+
+@element tf-audio-loader
+-->
+<dom-module id="tf-audio-loader">
+  <style>
+    :host {
+      display: block;
+      --step-slider-knob-color: #424242;
+    }
+
+    img {
+      width: 100%;
+      height: 100%;
+      image-rendering: pixelated;
+    }
+
+    .step-description {
+      font-size: 12px;
+    }
+
+    .step-value {
+      font-weight: bold;
+    }
+
+    #audio-loading-spinner {
+      width: 14px;
+      height: 14px;
+      vertical-align: text-bottom;
+      --paper-spinner-color: var(--tb-orange-strong)
+    }
+
+    #steps {
+      height: 15px;
+      margin: 0 0 0 -15px;
+      /* 31 comes from adding a padding of 15px from both sides of the paper-slider, subtracting
+       * 1px so that the slider width aligns with the image (the last slider marker takes up 1px),
+       * and adding 2px to account for a border of 1px on both sides of the image. 30 - 1 + 2. */
+      width: calc(100% + 31px);
+      --paper-slider-active-color: var(--step-slider-knob-color);
+      --paper-slider-knob-color: var(--step-slider-knob-color);
+      --paper-slider-pin-color: var(--step-slider-knob-color);
+      --paper-slider-knob-start-color: var(--step-slider-knob-color);
+      --paper-slider-knob-start-border-color: var(--step-slider-knob-color);
+      --paper-slider-pin-start-color: var(--step-slider-knob-color);
+    }
+
+    #individual-audio-container audio {
+      margin: 5px 0 0 -10px;
+      width: calc(100% + 20px);
+    }
+  </style>
+  <template>
+    <template is="dom-if" if="[[_metadatas]]">
+      <template is="dom-if" if="[[_hasAtLeastOneStep(_metadatas)]]">
+        <div class="step-description">
+          step
+          <span class="step-value">
+            [[_stepValue]]
+          </span><br>
+          <template is="dom-if" if="[[_stepWallTime]]">
+            [[_stepWallTime]]
+          </template>
+          <paper-spinner-lite active
+                              id="audio-loading-spinner"
+                              hidden$=[[!_isAudioLoading]]></paper-spinner-lite>
+        </div>
+      </template>
+      <template is="dom-if" if="[[_maxStepIndex]]">
+        <paper-slider
+            id="steps"
+            immediate-value="{{_stepIndex}}"
+            max="[[_maxStepIndex]]"
+            max-markers="[[_maxStepIndex]]"
+            snaps
+            step="1"
+            value="{{_stepIndex}}"></paper-slider>
+      </template>
+      <div id="individual-audio-container"></div>
+    </template>
+  </template>
+  <script>
+    "use strict";
+
+    Polymer({
+      is: "tf-audio-loader",
+      properties: {
+        run: String,
+        tag: String,
+        audioGenerator: Function,
+        // todo: document.
+        _metadatas: Array,
+        _stepIndex: Number,
+        _stepValue: {
+          type: Number,
+          computed: "_computeStepValue(_metadatas, _stepIndex)",
+          value: 0,
+        },
+        _stepWallTime: {
+          type: Number,
+          computed: "_computeStepWallTime(_metadatas, _stepIndex)",
+          value: 0,
+        },
+        _maxStepIndex: {
+          type: Number,
+          computed: "_computeMaxStepIndex(_metadatas)",
+          value: 0,
+        },
+        _isAudioLoading: Boolean,
+        // Used to identify stale requests for audio.
+        _audioRequestId: {
+          type: Number,
+          value: 1
+        },
+      },
+      observers: [
+        "_updateAudio(_metadatas, _stepIndex)",
+      ],
+      reload: function() {
+        this.audioGenerator(this.tag, this.run).then(function(metadatas) {
+          // Set the list of available metadata.
+          this.set("_metadatas", metadatas);
+
+          // Set the index to be the last one.
+          this.set("_stepIndex", this._maxStepIndex);
+        }.bind(this));
+      },
+      ready: function() {
+        // Need to test so that it will not error if it is constructed w/o
+        // all properties (so that it's possible to use stub to mock it out)
+        if (this.run != null && this.tag != null && this.audioGenerator != null) {
+          this.reload();
+        }
+      },
+      _updateAudio: function(metadatas, stepIndex) {
+        if (!metadatas || stepIndex >= metadatas.length) {
+          // No audio to show. The audio section should be hidden.
+          return;
+        }
+
+        // Load new audio.
+        const requestId = ++this._audioRequestId;
+        this.set("_isAudioLoading", true);
+
+        // Create a new audio element. Only replace the previous one once the new audio loads.
+        let audioElement = document.createElement("audio");
+        audioElement.setAttribute("controls", true);
+        audioElement.setAttribute("loop", "loop");
+        let canPlayHandler = function() {
+          if (requestId !== this._audioRequestId) {
+            // This request is no longer relevant.
+            return;
+          }
+
+          // Remove this event listener: "canplay" apparently fires in Chrome every time playing
+          // begins again on loop. So, if we create a new audio element every time that happens, we
+          // don't actually loop.
+          audioElement.removeEventListener("canplay", canPlayHandler);
+
+          let individualAudioContainer = this.$$("#individual-audio-container");
+          individualAudioContainer.innerHTML = "";
+          Polymer.dom(individualAudioContainer).appendChild(audioElement);
+          this.set("_isAudioLoading", false);
+        }.bind(this);
+        audioElement.addEventListener("canplay", canPlayHandler);
+        audioElement.addEventListener("error", function() {
+          if (requestId !== this._audioRequestId) {
+            // This request is no longer relevant.
+            return;
+          }
+
+          // The audio could not be loaded.
+          this.$$("#individual-audio-container").innerHTML = "";
+          this.set("_isAudioLoading", false);
+        }.bind(this));
+
+        // Initiate the request for new audio.
+        var sourceElement = document.createElement("source");
+        let metadata = metadatas[stepIndex];
+        sourceElement.setAttribute("src", metadata.url);
+        sourceElement.setAttribute("type", metadata.content_type);
+        audioElement.appendChild(sourceElement);
+      },
+      _computeStepValue: function(metadatas, stepIndex) {
+        if (!metadatas || stepIndex >= metadatas.length) {
+          // No audio to show. The audio section should be hidden.
+          return 0;
+        }
+        return metadatas[stepIndex].step;
+      },
+      _computeStepWallTime: function(metadatas, stepIndex) {
+        if (!metadatas || stepIndex >= metadatas.length) {
+          // No audio to show. The audio section should be hidden.
+          return 0;
+        }
+        return metadatas[stepIndex].wall_time.toString();
+      },
+      _computeMaxStepIndex: function(metadatas) {
+        if (!metadatas || metadatas.length === 0) {
+          // No audio to show. The audio section should be hidden.
+          return 0;
+        }
+        return metadatas.length - 1;
+      },
+      _hasAtLeastOneStep: function(metadatas) {
+        return metadatas && metadatas.length > 0;
+      },
+    });
+  </script>
+</dom-module>
diff --git a/tensorflow/tensorboard/components/tf_backend/BUILD b/tensorflow/tensorboard/components/tf_backend/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..d9f3a035d863bc1ad5d6c716cbea574b931a51c9
--- /dev/null
+++ b/tensorflow/tensorboard/components/tf_backend/BUILD
@@ -0,0 +1,81 @@
+package(default_visibility = ["//tensorflow:internal"])
+
+load("@io_bazel_rules_closure//closure:defs.bzl", "web_library")
+load("//tensorflow/tensorboard:defs.bzl", "tensorboard_ts_library")
+load("//tensorflow/tensorboard:defs.bzl", "tensorboard_typescript_genrule")
+load("//tensorflow/tensorboard:defs.bzl", "tensorboard_webcomponent_library")
+
+licenses(["notice"])  # Apache 2.0
+
+# TODO(dandelion): Add webfiles support for the test code.
+
+web_library(
+    name = "tf_backend",
+    srcs = [
+        "tf-backend.html",
+        ":ts",
+    ],
+    path = "/tf-backend",
+    deps = [
+        "//tensorflow/tensorboard/components/tf_imports:d3",
+        "//tensorflow/tensorboard/components/tf_imports:lodash",
+        "//tensorflow/tensorboard/components/vz_sorting",
+        "@org_polymer",
+    ],
+)
+
+tensorboard_typescript_genrule(
+    name = "ts",
+    srcs = [
+        "backend.ts",
+        "behavior.ts",
+        "requestManager.ts",
+        "router.ts",
+        "urlPathHelpers.ts",
+    ],
+    typings = [
+        "@org_definitelytyped//:d3.d.ts",
+        "@org_definitelytyped//:lodash.d.ts",
+        "//tensorflow/tensorboard/components/vz_sorting:ts_typings",
+    ],
+)
+
+filegroup(
+    name = "all_files",
+    srcs = glob(["**"]),
+    tags = ["notsan"],
+)
+
+################################################################################
+# MARKED FOR DELETION
+
+tensorboard_webcomponent_library(
+    name = "legacy",
+    srcs = [
+        "tf-backend.html",
+        ":legacy_ts",
+    ],
+    visibility = ["//visibility:public"],
+    destdir = "tf-backend",
+    deps = [
+        "//tensorflow/tensorboard/components:tf_imports",
+        "//third_party/javascript/polymer/v1/polymer:lib",
+    ],
+)
+
+tensorboard_ts_library(
+    name = "legacy_ts",
+    srcs = [
+        "backend.ts",
+        "behavior.ts",
+        "requestManager.ts",
+        "router.ts",
+        "urlPathHelpers.ts",
+    ],
+    deps_mgmt = "off",
+    runtime = "nodejs",
+    deps = [
+        "//tensorflow/tensorboard/components:common_deps",
+        "//tensorflow/tensorboard/components/vz_sorting:legacy_ts",
+    ],
+)
diff --git a/tensorflow/tensorboard/components/tf_backend/backend.ts b/tensorflow/tensorboard/components/tf_backend/backend.ts
index 54d89a6bbb1a3d5cb430163700f3aa3e9e720e07..befa6991ddd2820ce8b2970519f2bb8847856476 100644
--- a/tensorflow/tensorboard/components/tf_backend/backend.ts
+++ b/tensorflow/tensorboard/components/tf_backend/backend.ts
@@ -226,7 +226,12 @@ module TF.Backend {
      */
     public healthPills(nodeNames: string[], step?: number):
         Promise<HealthPillsResponse> {
-      let postData = {'node_names': JSON.stringify(nodeNames)};
+      const postData = {
+        'node_names': JSON.stringify(nodeNames),
+
+        // Events files with debugger data fall under this special run.
+        'run': '__debugger_data__',
+      };
       if (step !== undefined) {
         // The user requested health pills for a specific step. This request
         // might be slow since the backend reads events sequentially from disk.
diff --git a/tensorflow/tensorboard/components/tf_backend_d3v4/BUILD b/tensorflow/tensorboard/components/tf_backend_d3v4/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..019283816feadc837c608ffcb29ac368afcfd5cc
--- /dev/null
+++ b/tensorflow/tensorboard/components/tf_backend_d3v4/BUILD
@@ -0,0 +1,57 @@
+package(default_visibility = ["//tensorflow:internal"])
+
+load("@io_bazel_rules_closure//closure:defs.bzl", "web_library")
+load("//tensorflow/tensorboard:hacks.bzl", "tensorboard_typescript_bundle")
+load("//tensorflow/tensorboard:defs.bzl", "tensorboard_typescript_genrule")
+
+licenses(["notice"])  # Apache 2.0
+
+web_library(
+    name = "tf_backend_d3v4",
+    srcs = [
+        "bundle.js",
+        "tf-backend.html",
+    ],
+    path = "/tf-backend",
+    deps = [
+        "//tensorflow/tensorboard/components/tf_imports_d3v4:d3",
+        "//tensorflow/tensorboard/components/tf_imports_d3v4:lodash",
+        "//tensorflow/tensorboard/components/tf_imports_d3v4:plottable",
+        "//tensorflow/tensorboard/components/vz_sorting_d3v4",
+        "@org_polymer",
+    ],
+)
+
+tensorboard_typescript_genrule(
+    name = "ts",
+    srcs = ["bundle.ts"],
+    typings = [
+        "@org_definitelytyped//:lodash.d.ts",
+        "@org_definitelytyped//:polymer.d.ts",
+        "@org_definitelytyped//:webcomponents.js.d.ts",
+        "//tensorflow/tensorboard/components/tf_imports_d3v4:d3.d.ts",
+        "//tensorflow/tensorboard/components/tf_imports_d3v4:plottable.d.ts",
+        "//tensorflow/tensorboard/components/vz_sorting_d3v4:bundle.d.ts",
+    ],
+)
+
+tensorboard_typescript_bundle(
+    name = "bundle",
+    out = "bundle.ts",
+    namespace_srcs = {"TF.Backend": [
+        "requestManager.ts",
+        "backend.ts",
+        "behavior.ts",
+        "urlPathHelpers.ts",
+        "router.ts",
+    ]},
+    namespace_symbol_aliases = {"TF.Backend": {
+        "compareTagNames": "VZ.Sorting.compareTagNames",
+    }},
+)
+
+filegroup(
+    name = "all_files",
+    srcs = glob(["**"]),
+    tags = ["notsan"],
+)
diff --git a/tensorflow/tensorboard/components/tf_backend_d3v4/backend.ts b/tensorflow/tensorboard/components/tf_backend_d3v4/backend.ts
index 2f3b561d874f5de83c75e26fc313940103eccce7..2e1282394bfebe6e8f80fb53777dda52076d8f1d 100644
--- a/tensorflow/tensorboard/components/tf_backend_d3v4/backend.ts
+++ b/tensorflow/tensorboard/components/tf_backend_d3v4/backend.ts
@@ -236,7 +236,12 @@ export class Backend {
    */
   public healthPills(nodeNames: string[], step?: number):
       Promise<HealthPillsResponse> {
-    const postData = {'node_names': JSON.stringify(nodeNames)};
+    const postData = {
+      'node_names': JSON.stringify(nodeNames),
+
+      // Events files with debugger data fall under this special run.
+      'run': '__debugger_data__',
+    };
     if (step !== undefined) {
       // The user requested health pills for a specific step. This request
       // might be slow since the backend reads events sequentially from disk.
diff --git a/tensorflow/tensorboard/components/tf_backend_d3v4/test/BUILD b/tensorflow/tensorboard/components/tf_backend_d3v4/test/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..00b2be45eb4026ae54cc777dc43f3c1dea4c5361
--- /dev/null
+++ b/tensorflow/tensorboard/components/tf_backend_d3v4/test/BUILD
@@ -0,0 +1,56 @@
+package(
+    default_testonly = True,
+    default_visibility = ["//tensorflow:internal"],
+)
+
+load("@io_bazel_rules_closure//closure:defs.bzl", "web_library")
+load("//tensorflow/tensorboard:hacks.bzl", "tensorboard_typescript_bundle")
+load("//tensorflow/tensorboard:defs.bzl", "tensorboard_typescript_genrule")
+
+licenses(["notice"])  # Apache 2.0
+
+web_library(
+    name = "test",
+    srcs = [
+        "bundle.js",
+        "tests.html",
+    ] + glob(["data/**"]),
+    path = "/tf-backend/test",
+    deps = [
+        "//tensorflow/tensorboard/components/tf_backend_d3v4",
+        "@org_npmjs_registry_web_component_tester",
+        "@org_polymer",
+        "@org_polymer_webcomponentsjs",
+    ],
+)
+
+tensorboard_typescript_genrule(
+    name = "ts",
+    srcs = ["bundle.ts"],
+    typings = [
+        "@org_definitelytyped//:chai.d.ts",
+        "@org_definitelytyped//:mocha.d.ts",
+        "@org_definitelytyped//:polymer.d.ts",
+        "@org_definitelytyped//:webcomponents.js.d.ts",
+        "//tensorflow/tensorboard/components/tf_backend_d3v4:bundle.d.ts",
+        "//tensorflow/tensorboard/components/tf_imports_d3v4:d3.d.ts",
+        "//tensorflow/tensorboard/components/tf_imports_d3v4:plottable.d.ts",
+    ],
+)
+
+tensorboard_typescript_bundle(
+    name = "bundle",
+    out = "bundle.ts",
+    namespace_srcs = {"TF.Backend": [
+        "backendTests.ts",
+        "behaviorTests.ts",
+        "requestManagerTests.ts",
+    ]},
+)
+
+filegroup(
+    name = "all_files",
+    testonly = 0,
+    srcs = glob(["**"]),
+    tags = ["notsan"],
+)
diff --git a/tensorflow/tensorboard/components/tf_backend_d3v4/backendTests.ts b/tensorflow/tensorboard/components/tf_backend_d3v4/test/backendTests.ts
similarity index 97%
rename from tensorflow/tensorboard/components/tf_backend_d3v4/backendTests.ts
rename to tensorflow/tensorboard/components/tf_backend_d3v4/test/backendTests.ts
index 180c1f529873170a7160e7f1ac0f584b9c70b987..648d175621ea337a559b7e22a0595086d9b6b50c 100644
--- a/tensorflow/tensorboard/components/tf_backend_d3v4/backendTests.ts
+++ b/tensorflow/tensorboard/components/tf_backend_d3v4/test/backendTests.ts
@@ -12,10 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-import {Backend, convertBins, filterTags, getRuns, getTags, RunToTag, TYPES} from './backend';
-import {RequestManager} from './requestManager';
-import {Router, router} from './router';
-import {BAD_CHARACTERS, demoify, queryEncoder} from './urlPathHelpers';
+import {Backend, convertBins, filterTags, getRuns, getTags, RunToTag, TYPES} from '../backend';
+import {RequestManager} from '../requestManager';
+import {Router, router} from '../router';
+import {BAD_CHARACTERS, demoify, queryEncoder} from '../urlPathHelpers';
 
 describe('urlPathHelpers', () => {
   it('demoify works as expected', () => {
diff --git a/tensorflow/tensorboard/components/tf_backend_d3v4/behaviorTests.ts b/tensorflow/tensorboard/components/tf_backend_d3v4/test/behaviorTests.ts
similarity index 97%
rename from tensorflow/tensorboard/components/tf_backend_d3v4/behaviorTests.ts
rename to tensorflow/tensorboard/components/tf_backend_d3v4/test/behaviorTests.ts
index 6d55aca5f8529d3ab4e6169a0173e26290259b5a..4a74fe01c1b57deb911df878e9cba96d91ac0283 100644
--- a/tensorflow/tensorboard/components/tf_backend_d3v4/behaviorTests.ts
+++ b/tensorflow/tensorboard/components/tf_backend_d3v4/test/behaviorTests.ts
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-import {Backend, getRuns, getTags, RunToTag} from './backend'
-import {BackendBehavior} from './behavior'
+import {Backend, getRuns, getTags, RunToTag} from '../backend'
+import {BackendBehavior} from '../behavior'
 
 declare function fixture(id: string): void;
 
@@ -154,6 +154,7 @@ describe('data-behavior', function() {
     });
   });
 
+  // TODO(dandelion): Fix this test.
   it('reload calls frontendReload', function(done) {
     testElement.frontendReload = function() {
       done();
diff --git a/tensorflow/tensorboard/components/tf_backend_d3v4/data/audio_run_run1_tag_audio1.json b/tensorflow/tensorboard/components/tf_backend_d3v4/test/data/audio_run_run1_tag_audio1.json
similarity index 100%
rename from tensorflow/tensorboard/components/tf_backend_d3v4/data/audio_run_run1_tag_audio1.json
rename to tensorflow/tensorboard/components/tf_backend_d3v4/test/data/audio_run_run1_tag_audio1.json
diff --git a/tensorflow/tensorboard/components/tf_backend_d3v4/data/compressedHistograms_run_run1_tag_histo1.json b/tensorflow/tensorboard/components/tf_backend_d3v4/test/data/compressedHistograms_run_run1_tag_histo1.json
similarity index 100%
rename from tensorflow/tensorboard/components/tf_backend_d3v4/data/compressedHistograms_run_run1_tag_histo1.json
rename to tensorflow/tensorboard/components/tf_backend_d3v4/test/data/compressedHistograms_run_run1_tag_histo1.json
diff --git a/tensorflow/tensorboard/components/tf_backend_d3v4/data/example.json b/tensorflow/tensorboard/components/tf_backend_d3v4/test/data/example.json
similarity index 100%
rename from tensorflow/tensorboard/components/tf_backend_d3v4/data/example.json
rename to tensorflow/tensorboard/components/tf_backend_d3v4/test/data/example.json
diff --git a/tensorflow/tensorboard/components/tf_backend_d3v4/data/histograms_run_run1_tag_histo1.json b/tensorflow/tensorboard/components/tf_backend_d3v4/test/data/histograms_run_run1_tag_histo1.json
similarity index 100%
rename from tensorflow/tensorboard/components/tf_backend_d3v4/data/histograms_run_run1_tag_histo1.json
rename to tensorflow/tensorboard/components/tf_backend_d3v4/test/data/histograms_run_run1_tag_histo1.json
diff --git a/tensorflow/tensorboard/components/tf_backend_d3v4/data/images_run_run1_tag_im1.json b/tensorflow/tensorboard/components/tf_backend_d3v4/test/data/images_run_run1_tag_im1.json
similarity index 100%
rename from tensorflow/tensorboard/components/tf_backend_d3v4/data/images_run_run1_tag_im1.json
rename to tensorflow/tensorboard/components/tf_backend_d3v4/test/data/images_run_run1_tag_im1.json
diff --git a/tensorflow/tensorboard/components/tf_backend_d3v4/data/individualImage_index_0_tag_im1_run_run1.png b/tensorflow/tensorboard/components/tf_backend_d3v4/test/data/individualImage_index_0_tag_im1_run_run1.png
similarity index 100%
rename from tensorflow/tensorboard/components/tf_backend_d3v4/data/individualImage_index_0_tag_im1_run_run1.png
rename to tensorflow/tensorboard/components/tf_backend_d3v4/test/data/individualImage_index_0_tag_im1_run_run1.png
diff --git a/tensorflow/tensorboard/components/tf_backend_d3v4/data/run_metadata_run_step99_tag_train.pbtxt b/tensorflow/tensorboard/components/tf_backend_d3v4/test/data/run_metadata_run_step99_tag_train.pbtxt
similarity index 100%
rename from tensorflow/tensorboard/components/tf_backend_d3v4/data/run_metadata_run_step99_tag_train.pbtxt
rename to tensorflow/tensorboard/components/tf_backend_d3v4/test/data/run_metadata_run_step99_tag_train.pbtxt
diff --git a/tensorflow/tensorboard/components/tf_backend_d3v4/data/runs.json b/tensorflow/tensorboard/components/tf_backend_d3v4/test/data/runs.json
similarity index 100%
rename from tensorflow/tensorboard/components/tf_backend_d3v4/data/runs.json
rename to tensorflow/tensorboard/components/tf_backend_d3v4/test/data/runs.json
diff --git a/tensorflow/tensorboard/components/tf_backend_d3v4/data/scalars.json b/tensorflow/tensorboard/components/tf_backend_d3v4/test/data/scalars.json
similarity index 100%
rename from tensorflow/tensorboard/components/tf_backend_d3v4/data/scalars.json
rename to tensorflow/tensorboard/components/tf_backend_d3v4/test/data/scalars.json
diff --git a/tensorflow/tensorboard/components/tf_backend_d3v4/data/scalars_run_run1_tag_cross_entropy__281_29.json b/tensorflow/tensorboard/components/tf_backend_d3v4/test/data/scalars_run_run1_tag_cross_entropy__281_29.json
similarity index 100%
rename from tensorflow/tensorboard/components/tf_backend_d3v4/data/scalars_run_run1_tag_cross_entropy__281_29.json
rename to tensorflow/tensorboard/components/tf_backend_d3v4/test/data/scalars_run_run1_tag_cross_entropy__281_29.json
diff --git a/tensorflow/tensorboard/components/tf_backend_d3v4/requestManagerTests.ts b/tensorflow/tensorboard/components/tf_backend_d3v4/test/requestManagerTests.ts
similarity index 98%
rename from tensorflow/tensorboard/components/tf_backend_d3v4/requestManagerTests.ts
rename to tensorflow/tensorboard/components/tf_backend_d3v4/test/requestManagerTests.ts
index 8bd46744f5a10c8fbfb44b90c818971e07088f6f..23a4e8f6111b115875ec6d38a69d1f454acff7d3 100644
--- a/tensorflow/tensorboard/components/tf_backend_d3v4/requestManagerTests.ts
+++ b/tensorflow/tensorboard/components/tf_backend_d3v4/test/requestManagerTests.ts
@@ -13,13 +13,15 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-import {RequestManager, RequestNetworkError} from './requestManager';
+import {RequestManager, RequestNetworkError} from '../requestManager';
+
 interface MockRequest {
   resolve: Function;
   reject: Function;
   id: number;
   url: string;
 }
+
 class MockedRequestManager extends RequestManager {
   private resolvers: Function[];
   private rejectors: Function[];
@@ -112,7 +114,6 @@ describe('backend', () => {
             done(new Error('the promise should have rejected'));
           },
           (reject: RequestNetworkError) => {
-            chai.assert.instanceOf(reject, RequestNetworkError);
             chai.assert.include(reject.message, '404');
             chai.assert.include(reject.message, badUrl);
             chai.assert.equal(reject.req.status, 404);
diff --git a/tensorflow/tensorboard/components/tf_backend_d3v4/tests.html b/tensorflow/tensorboard/components/tf_backend_d3v4/test/tests.html
similarity index 64%
rename from tensorflow/tensorboard/components/tf_backend_d3v4/tests.html
rename to tensorflow/tensorboard/components/tf_backend_d3v4/test/tests.html
index 7f51861d25a365fcf7e365897c427f931004f1c5..cdc17c2607e9b31ea530d49c75bd24e316b568ad 100644
--- a/tensorflow/tensorboard/components/tf_backend_d3v4/tests.html
+++ b/tensorflow/tensorboard/components/tf_backend_d3v4/test/tests.html
@@ -22,25 +22,14 @@ limitations under the License.
   <script src="../../webcomponentsjs/webcomponents-lite.min.js"></script>
   <script src="../../web-component-tester/browser.js"></script>
   <link rel="import" href="../../polymer/polymer.html">
-  <link rel="import" href="../../tf-imports/d3.html">
-  <link rel="import" href="../../vz-sorting/vz-sorting.html">
+  <link rel="import" href="../tf-backend.html">
 </head>
 <body>
   <test-fixture id="testElementFixture">
     <template>
-      <test-element
-        id="test"
-      ></test-element>
+      <test-element id="test"></test-element>
     </template>
   </test-fixture>
-    <script src="../requestManager.js"></script>
-    <script src="../urlPathHelpers.js"></script>
-    <script src="../router.js"></script>
-    <script src="../backend.js"></script>
-    <script src="../behavior.js"></script>
-
-    <script src="requestManagerTest.js"></script>
-    <script src="backendTests.js"></script>
-    <script src="behaviorTests.js"></script>
+  <script src="bundle.js"></script>
 </body>
 </html>
diff --git a/tensorflow/tensorboard/components/tf_backend_d3v4/tf-backend.html b/tensorflow/tensorboard/components/tf_backend_d3v4/tf-backend.html
new file mode 100644
index 0000000000000000000000000000000000000000..5bf266336285719965a7456fc6f894d62820c940
--- /dev/null
+++ b/tensorflow/tensorboard/components/tf_backend_d3v4/tf-backend.html
@@ -0,0 +1,23 @@
+<!--
+@license
+Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+-->
+
+<link rel="import" href="../polymer/polymer.html">
+<link rel="import" href="../tf-imports/lodash.html">
+<link rel="import" href="../tf-imports/d3.html">
+<link rel="import" href="../vz-sorting/vz-sorting.html">
+
+<script src="bundle.js"></script>
diff --git a/tensorflow/tensorboard/components/tf_color_scale/BUILD b/tensorflow/tensorboard/components/tf_color_scale/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..3e64fccd2e6093a620f1373c88f7443b57d36a3c
--- /dev/null
+++ b/tensorflow/tensorboard/components/tf_color_scale/BUILD
@@ -0,0 +1,65 @@
+package(default_visibility = ["//tensorflow:internal"])
+
+load("@io_bazel_rules_closure//closure:defs.bzl", "web_library")
+load("//tensorflow/tensorboard:defs.bzl", "tensorboard_ts_library")
+load("//tensorflow/tensorboard:defs.bzl", "tensorboard_typescript_genrule")
+load("//tensorflow/tensorboard:defs.bzl", "tensorboard_webcomponent_library")
+
+licenses(["notice"])  # Apache 2.0
+
+# TODO(dandelion): Add webfiles support for the test code.
+
+web_library(
+    name = "tf_color_scale",
+    srcs = [
+        "tf-color-scale.html",
+        ":ts",
+    ],
+    path = "/tf-color-scale",
+    deps = [
+        "//tensorflow/tensorboard/components/tf_imports:d3",
+        "@org_polymer",
+    ],
+)
+
+tensorboard_typescript_genrule(
+    name = "ts",
+    srcs = [
+        "colorScale.ts",
+        "palettes.ts",
+    ],
+    typings = ["@org_definitelytyped//:d3.d.ts"],
+)
+
+filegroup(
+    name = "all_files",
+    srcs = glob(["**"]),
+    tags = ["notsan"],
+)
+
+################################################################################
+# MARKED FOR DELETION
+
+tensorboard_webcomponent_library(
+    name = "legacy",
+    srcs = [
+        "tf-color-scale.html",
+        ":legacy_ts",
+    ],
+    destdir = "tf-color-scale",
+    deps = [
+        "//tensorflow/tensorboard/components:tf_imports",
+        "//third_party/javascript/polymer/v1/polymer:lib",
+    ],
+)
+
+tensorboard_ts_library(
+    name = "legacy_ts",
+    srcs = [
+        "colorScale.ts",
+        "palettes.ts",
+    ],
+    deps_mgmt = "off",
+    runtime = "nodejs",
+    deps = ["//tensorflow/tensorboard/components:common_deps"],
+)
diff --git a/tensorflow/tensorboard/components/tf_color_scale/demo/BUILD b/tensorflow/tensorboard/components/tf_color_scale/demo/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..f0a7fa2235751462aef9d6e33391dbb8347abf29
--- /dev/null
+++ b/tensorflow/tensorboard/components/tf_color_scale/demo/BUILD
@@ -0,0 +1,26 @@
+package(default_visibility = ["//tensorflow:internal"])
+
+load("@io_bazel_rules_closure//closure:defs.bzl", "web_library")
+
+licenses(["notice"])  # Apache 2.0
+
+# bazel run //third_party/tensorflow/tensorboard/components/tf_color_scale/demo
+web_library(
+    name = "demo",
+    srcs = ["index.html"],
+    path = "/tf-color-scale/demo",
+    deps = [
+        "//tensorflow/tensorboard/components/tf_color_scale",
+        "//tensorflow/tensorboard/components/tf_imports:d3",
+        "@org_polymer_iron_demo_helpers",
+        "@org_polymer_paper_button",
+        "@org_polymer_paper_styles",
+        "@org_polymer_webcomponentsjs",
+    ],
+)
+
+filegroup(
+    name = "all_files",
+    srcs = glob(["**"]),
+    tags = ["notsan"],
+)
diff --git a/tensorflow/tensorboard/components/tf_color_scale_d3v4/BUILD b/tensorflow/tensorboard/components/tf_color_scale_d3v4/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..5feea12fb47803ea30adbd4d32f880a4d035f176
--- /dev/null
+++ b/tensorflow/tensorboard/components/tf_color_scale_d3v4/BUILD
@@ -0,0 +1,59 @@
+package(default_visibility = ["//tensorflow:internal"])
+
+load("@io_bazel_rules_closure//closure:defs.bzl", "web_library")
+load("//tensorflow/tensorboard:hacks.bzl", "tensorboard_typescript_bundle")
+load("//tensorflow/tensorboard:defs.bzl", "tensorboard_typescript_genrule")
+
+licenses(["notice"])  # Apache 2.0
+
+web_library(
+    name = "tf_color_scale_d3v4",
+    srcs = [
+        "bundle.js",
+        "tf-color-scale.html",
+    ],
+    path = "/tf-color-scale",
+    deps = [
+        "//tensorflow/tensorboard/components/tf_imports_d3v4:d3",
+        "@org_polymer",
+    ],
+)
+
+web_library(
+    name = "demo",
+    srcs = ["index.html"],
+    path = "/tf-color-scale",
+    deps = [
+        ":tf_color_scale_d3v4",
+        "//tensorflow/tensorboard/components/tf_imports_d3v4:d3",
+        "@org_polymer_iron_demo_helpers",
+        "@org_polymer_paper_button",
+        "@org_polymer_paper_styles",
+        "@org_polymer_webcomponentsjs",
+    ],
+)
+
+tensorboard_typescript_genrule(
+    name = "ts",
+    srcs = ["bundle.ts"],
+    typings = [
+        "@org_definitelytyped//:polymer.d.ts",
+        "@org_definitelytyped//:webcomponents.js.d.ts",
+        "//tensorflow/tensorboard/components/tf_imports_d3v4:d3.d.ts",
+    ],
+)
+
+tensorboard_typescript_bundle(
+    name = "bundle",
+    out = "bundle.ts",
+    namespace_srcs = {"TF": [
+        "palettes.ts",
+        "colorScale.ts",
+    ]},
+)
+
+filegroup(
+    name = "all_files",
+    srcs = glob(["**"]),
+    tags = ["notsan"],
+)
diff --git a/tensorflow/tensorboard/components/tf_color_scale_d3v4/colorScale.ts b/tensorflow/tensorboard/components/tf_color_scale_d3v4/colorScale.ts
index d10e75344588d1fae04d7e5aaa8bb0587de32c97..ff90d46aa249d240250854b0ec631834286ee651 100644
--- a/tensorflow/tensorboard/components/tf_color_scale_d3v4/colorScale.ts
+++ b/tensorflow/tensorboard/components/tf_color_scale_d3v4/colorScale.ts
@@ -63,8 +63,6 @@ export class ColorScale {
   }
 }
 
-
-
 Polymer({
   is: 'tf-color-scale',
   properties: {
diff --git a/tensorflow/tensorboard/components/tf_color_scale_d3v4/demo.html b/tensorflow/tensorboard/components/tf_color_scale_d3v4/index.html
similarity index 90%
rename from tensorflow/tensorboard/components/tf_color_scale_d3v4/demo.html
rename to tensorflow/tensorboard/components/tf_color_scale_d3v4/index.html
index 341ce1845296c23caa97a26d5200907a87e803fb..81dfab098c6d86dfc6b666aa26d0d39f4ad3ae8e 100644
--- a/tensorflow/tensorboard/components/tf_color_scale_d3v4/demo.html
+++ b/tensorflow/tensorboard/components/tf_color_scale_d3v4/index.html
@@ -19,11 +19,12 @@ limitations under the License.
 <meta charset="utf-8">
 <meta name="viewport" content="width=device-width, initial-scale=1.0">
 <title>tf-color-scale demo</title>
+<script src="../../webcomponentsjs/webcomponents-lite.min.js"></script>
+<link rel="import" href="../iron-demo-helpers/demo-snippet.html">
+<link rel="import" href="../paper-button/paper-button.html">
+<link rel="import" href="../paper-styles/typography.html">
+<link rel="import" href="../tf-imports/d3.html">
 <link rel="import" href="tf-color-scale.html">
-<link rel="import" href="iron-demo-helpers/demo-snippet.html">
-<link rel="import" href="paper-styles/typography.html">
-<link rel="import" href="paper-button/paper-button.html">
-<script src="bundle.js"></script>
 
 <style> body {font-family: "Roboto";}</style>
 <demo-snippet>
diff --git a/tensorflow/tensorboard/components/tf_color_scale_d3v4/test/BUILD b/tensorflow/tensorboard/components/tf_color_scale_d3v4/test/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..ac22ab8218a0fa675aa7cf79a625af8e0292e87c
--- /dev/null
+++ b/tensorflow/tensorboard/components/tf_color_scale_d3v4/test/BUILD
@@ -0,0 +1,48 @@
+package(
+    default_testonly = True,
+    default_visibility = ["//tensorflow:internal"],
+)
+
+load("@io_bazel_rules_closure//closure:defs.bzl", "web_library")
+load("//tensorflow/tensorboard:hacks.bzl", "tensorboard_typescript_bundle")
+load("//tensorflow/tensorboard:defs.bzl", "tensorboard_typescript_genrule")
+
+licenses(["notice"])  # Apache 2.0
+
+web_library(
+    name = "test",
+    srcs = [
+        "bundle.js",
+        "tests.html",
+    ],
+    path = "/tf-color-scale/test",
+    deps = [
+        "//tensorflow/tensorboard/components/tf_color_scale_d3v4",
+        "@org_npmjs_registry_web_component_tester",
+        "@org_polymer",
+        "@org_polymer_webcomponentsjs",
+    ],
+)
+
+tensorboard_typescript_genrule(
+    name = "ts",
+    srcs = ["bundle.ts"],
+    typings = [
+        "@org_definitelytyped//:chai.d.ts",
+        "@org_definitelytyped//:mocha.d.ts",
+        "//tensorflow/tensorboard/components/tf_color_scale_d3v4:bundle.d.ts",
+    ],
+)
+
+tensorboard_typescript_bundle(
+    name = "bundle",
+    out = "bundle.ts",
+    namespace_srcs = {"TF": ["colorScaleTests.ts"]},
+)
+
+filegroup(
+    name = "all_files",
+    testonly = 0,
+    srcs = glob(["**"]),
+    tags = ["notsan"],
+)
diff --git a/tensorflow/tensorboard/components/tf_color_scale_d3v4/colorScaleTests.ts b/tensorflow/tensorboard/components/tf_color_scale_d3v4/test/colorScaleTests.ts
similarity index 97%
rename from tensorflow/tensorboard/components/tf_color_scale_d3v4/colorScaleTests.ts
rename to tensorflow/tensorboard/components/tf_color_scale_d3v4/test/colorScaleTests.ts
index aa9c601385ba896c9109866dbf1016559749d16a..78824a772c3e6b68a4d1fa2f63b821b202bba0c8 100644
--- a/tensorflow/tensorboard/components/tf_color_scale_d3v4/colorScaleTests.ts
+++ b/tensorflow/tensorboard/components/tf_color_scale_d3v4/test/colorScaleTests.ts
@@ -15,7 +15,7 @@ limitations under the License.
 
 let assert = chai.assert;
 
-import {ColorScale} from './colorScale'
+import {ColorScale} from '../colorScale'
 
 describe('ColorScale', function() {
   let ccs: ColorScale;
diff --git a/tensorflow/tensorboard/components/tf_color_scale_d3v4/test/tests.html b/tensorflow/tensorboard/components/tf_color_scale_d3v4/test/tests.html
new file mode 100644
index 0000000000000000000000000000000000000000..eccc32cdec5547e1e54c9eb28fd9605ba629323c
--- /dev/null
+++ b/tensorflow/tensorboard/components/tf_color_scale_d3v4/test/tests.html
@@ -0,0 +1,24 @@
+<!doctype html>
+<!--
+@license
+Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+-->
+
+<meta charset="utf-8">
+<script src="../../web-component-tester/browser.js"></script>
+<script src="../../webcomponentsjs/webcomponents-lite.min.js"></script>
+<link rel="import" href="../tf-color-scale.html">
+<body>
+<script src="bundle.js"></script>
diff --git a/tensorflow/tensorboard/components/tf_color_scale_d3v4/tf-color-scale.html b/tensorflow/tensorboard/components/tf_color_scale_d3v4/tf-color-scale.html
index 7d2cb8bafd39c2fa51d32cd99708f80a99a664fe..3dedfaf1a1c10ca12f9119992d23fc7f67b44546 100644
--- a/tensorflow/tensorboard/components/tf_color_scale_d3v4/tf-color-scale.html
+++ b/tensorflow/tensorboard/components/tf_color_scale_d3v4/tf-color-scale.html
@@ -15,7 +15,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 -->
 
-<link rel="import" href="polymer/polymer.html">
+<link rel="import" href="../polymer/polymer.html">
+<link rel="import" href="../tf-imports/d3.html">
 
 <!--
 tf-color-scale is a plumbing component that takes in an array of runs, and produces
@@ -25,7 +26,5 @@ a set of colors.
 @element tf-color-scale
 -->
 <dom-module id="tf-color-scale">
-  <script>
-
-  </script>
+  <script src="bundle.js"></script>
 </dom-module>
diff --git a/tensorflow/tensorboard/components/tf_dashboard_common/BUILD b/tensorflow/tensorboard/components/tf_dashboard_common/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..321e4a821e406a2144068dc270acd6e3dec7d4dc
--- /dev/null
+++ b/tensorflow/tensorboard/components/tf_dashboard_common/BUILD
@@ -0,0 +1,103 @@
+package(default_visibility = ["//tensorflow:internal"])
+
+load("@io_bazel_rules_closure//closure:defs.bzl", "web_library")
+load("//tensorflow/tensorboard:defs.bzl", "tensorboard_ts_library")
+load("//tensorflow/tensorboard:defs.bzl", "tensorboard_typescript_genrule")
+load("//tensorflow/tensorboard:defs.bzl", "tensorboard_webcomponent_library")
+
+licenses(["notice"])  # Apache 2.0
+
+web_library(
+    name = "tf_dashboard_common",
+    srcs = glob(["*.html"]) + [
+        ":ts",
+    ],
+    path = "/tf-dashboard-common",
+    deps = [
+        "//tensorflow/tensorboard/components/tf_imports:lodash",
+        "//tensorflow/tensorboard/components/tf_imports:plottable",
+        "//tensorflow/tensorboard/components/tf_storage",
+        "//tensorflow/tensorboard/components/vz_sorting",
+        "@org_polymer",
+        "@org_polymer_iron_ajax",
+        "@org_polymer_iron_collapse",
+        "@org_polymer_iron_icons",
+        "@org_polymer_paper_button",
+        "@org_polymer_paper_checkbox",
+        "@org_polymer_paper_dialog",
+        "@org_polymer_paper_dropdown_menu",
+        "@org_polymer_paper_icon_button",
+        "@org_polymer_paper_input",
+        "@org_polymer_paper_item",
+        "@org_polymer_paper_menu",
+        "@org_polymer_paper_slider",
+        "@org_polymer_paper_spinner",
+        "@org_polymer_paper_styles",
+        "@org_polymer_paper_toggle_button",
+    ],
+)
+
+tensorboard_typescript_genrule(
+    name = "ts",
+    srcs = [
+        "categorizer.ts",
+        "dashboard-behavior.ts",
+        "reload-behavior.ts",
+    ],
+    typings = [
+        "@org_definitelytyped//:d3.d.ts",
+        "@org_definitelytyped//:lodash.d.ts",
+        "//tensorflow/tensorboard/components/vz_sorting:ts_typings",
+    ],
+)
+
+filegroup(
+    name = "all_files",
+    srcs = glob(["**"]),
+    tags = ["notsan"],
+)
+
+################################################################################
+# MARKED FOR DELETION
+
+tensorboard_webcomponent_library(
+    name = "legacy",
+    srcs = glob(["*.html"]) + [":legacy_ts"],
+    destdir = "tf-dashboard-common",
+    deps = [
+        "//tensorflow/tensorboard/components:tf_imports",
+        "//tensorflow/tensorboard/components/tf_storage:legacy",
+        "//tensorflow/tensorboard/components/vz_sorting:legacy",
+        "//third_party/javascript/polymer/v1/iron-ajax:lib",
+        "//third_party/javascript/polymer/v1/iron-collapse:lib",
+        "//third_party/javascript/polymer/v1/iron-icons:lib",
+        "//third_party/javascript/polymer/v1/paper-button:lib",
+        "//third_party/javascript/polymer/v1/paper-checkbox:lib",
+        "//third_party/javascript/polymer/v1/paper-dialog:lib",
+        "//third_party/javascript/polymer/v1/paper-dropdown-menu:lib",
+        "//third_party/javascript/polymer/v1/paper-icon-button:lib",
+        "//third_party/javascript/polymer/v1/paper-input:lib",
+        "//third_party/javascript/polymer/v1/paper-item:lib",
+        "//third_party/javascript/polymer/v1/paper-menu:lib",
+        "//third_party/javascript/polymer/v1/paper-slider:lib",
+        "//third_party/javascript/polymer/v1/paper-spinner:lib",
+        "//third_party/javascript/polymer/v1/paper-styles:lib",
+        "//third_party/javascript/polymer/v1/paper-toggle-button:lib",
+        "//third_party/javascript/polymer/v1/polymer:lib",
+    ],
+)
+
+tensorboard_ts_library(
+    name = "legacy_ts",
+    srcs = [
+        "categorizer.ts",
+        "dashboard-behavior.ts",
+        "reload-behavior.ts",
+    ],
+    deps_mgmt = "off",
+    runtime = "nodejs",
+    deps = [
+        "//tensorflow/tensorboard/components:common_deps",
+        "//tensorflow/tensorboard/components/vz_sorting:legacy_ts",
+    ],
+)
diff --git a/tensorflow/tensorboard/components/tf_dashboard_common/categorizer.ts b/tensorflow/tensorboard/components/tf_dashboard_common/categorizer.ts
index 42e7cbcff4033843a30f32b11ad5f292e7c7187e..4c06462a981c09170472cfb5d02e23382ae4268a 100644
--- a/tensorflow/tensorboard/components/tf_dashboard_common/categorizer.ts
+++ b/tensorflow/tensorboard/components/tf_dashboard_common/categorizer.ts
@@ -72,24 +72,31 @@ module Categorizer {
       if (tags.length === 0) {
         return [];
       }
-      let sortedTags = tags.slice().sort(VZ.Sorting.compareTagNames);
-      let categories: Category[] = [];
-      let currentCategory = {
-        name: extractor(sortedTags[0]),
-        tags: [],
-      };
-      sortedTags.forEach((t: string) => {
-        let topLevel = extractor(t);
-        if (currentCategory.name !== topLevel) {
-          categories.push(currentCategory);
-          currentCategory = {
+
+      // Maps between top-level name and category. We use the mapping to avoid
+      // duplicating categories per run.
+      const categoryMapping: {[key: string]: Category} = {};
+
+      tags.forEach((t: string) => {
+        const topLevel = extractor(t);
+        if (!categoryMapping[topLevel]) {
+          const newCategory = {
             name: topLevel,
             tags: [],
           };
+          categoryMapping[topLevel] = newCategory;
         }
-        currentCategory.tags.push(t);
+
+        categoryMapping[topLevel].tags.push(t);
+      });
+
+      // Sort categories into alphabetical order.
+      const categories =
+          _.map(_.keys(categoryMapping).sort(), key => categoryMapping[key]);
+      _.forEach(categories, (category) => {
+        // Sort the tags within each category.
+        category.tags.sort(VZ.Sorting.compareTagNames);
       });
-      categories.push(currentCategory);
       return categories;
     };
   }
diff --git a/tensorflow/tensorboard/components/tf_dashboard_common/demo/BUILD b/tensorflow/tensorboard/components/tf_dashboard_common/demo/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..f89b31b00228c64438ca446639278aa6eb84029c
--- /dev/null
+++ b/tensorflow/tensorboard/components/tf_dashboard_common/demo/BUILD
@@ -0,0 +1,31 @@
+package(default_visibility = ["//tensorflow:internal"])
+
+load("@io_bazel_rules_closure//closure:defs.bzl", "web_library")
+
+licenses(["notice"])  # Apache 2.0
+
+# bazel run //third_party/tensorflow/tensorboard/components/tf_dashboard_common/demo
+web_library(
+    name = "demo",
+    srcs = [
+        "tf-categorizer-demo.html",
+        "tf-collapsable-pane-demo.html",
+        "tf-multi-checkbox-demo.html",
+        "tf-regex-group-demo.html",
+    ],
+    path = "/tf-dashboard-common/demo",
+    deps = [
+        "//tensorflow/tensorboard/components/tf_color_scale",
+        "//tensorflow/tensorboard/components/tf_dashboard_common",
+        "//tensorflow/tensorboard/components/tf_imports:d3",
+        "@org_polymer_iron_flex_layout",
+        "@org_polymer_paper_styles",
+        "@org_polymer_webcomponentsjs",
+    ],
+)
+
+filegroup(
+    name = "all_files",
+    srcs = glob(["**"]),
+    tags = ["notsan"],
+)
diff --git a/tensorflow/tensorboard/components/tf_dashboard_common/test/categorizerTest.ts b/tensorflow/tensorboard/components/tf_dashboard_common/test/categorizerTest.ts
index ea149fda47a57fd5b891df22c5ff1eb15298a6b5..4e52b60f37f088b228ab98869abbcc02f460e11d 100644
--- a/tensorflow/tensorboard/components/tf_dashboard_common/test/categorizerTest.ts
+++ b/tensorflow/tensorboard/components/tf_dashboard_common/test/categorizerTest.ts
@@ -62,6 +62,18 @@ module Categorizer {
         assert.deepEqual(
             topLevelNamespaceCategorizer(['a']), [{name: 'a', tags: ['a']}]);
       });
+
+      it('only create 1 category per run', () => {
+        // TensorBoard separates runs from tags using the / and _ characters
+        // *only* during sorting. The categorizer should group all tags under
+        // their correct categories - and create only 1 category per run.
+        const tags = ['foo/bar', 'foo_in_between_run/baz', 'foo/quux'];
+        const expected = [
+          {name: 'foo', tags: ['foo/bar', 'foo/quux']},
+          {name: 'foo_in_between_run', tags: ['foo_in_between_run/baz']},
+        ];
+        assert.deepEqual(topLevelNamespaceCategorizer(tags), expected);
+      });
     });
 
     describe('customCategorizer', () => {
diff --git a/tensorflow/tensorboard/components/tf_dashboard_common/tf-chart-scaffold.html b/tensorflow/tensorboard/components/tf_dashboard_common/tf-chart-scaffold.html
index e2530d597165b8481e6d9b38d03d3e14b5597920..83b141cb98a9a67f480682cf0ddf901fb1fac07a 100644
--- a/tensorflow/tensorboard/components/tf_dashboard_common/tf-chart-scaffold.html
+++ b/tensorflow/tensorboard/components/tf_dashboard_common/tf-chart-scaffold.html
@@ -57,6 +57,8 @@ plugin is requred to implement two functions:
     </style>
   </template>
   <script>
+    "use strict";
+
     Polymer({
       is: "tf-chart-scaffold",
       properties: {
diff --git a/tensorflow/tensorboard/components/tf_dashboard_common_d3v4/BUILD b/tensorflow/tensorboard/components/tf_dashboard_common_d3v4/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..564028ba3c6492218785725ac7cdbe1b580fc95b
--- /dev/null
+++ b/tensorflow/tensorboard/components/tf_dashboard_common_d3v4/BUILD
@@ -0,0 +1,150 @@
+package(default_visibility = ["//tensorflow:internal"])
+
+load("@io_bazel_rules_closure//closure:defs.bzl", "web_library")
+load("//tensorflow/tensorboard:hacks.bzl", "tensorboard_typescript_bundle")
+load("//tensorflow/tensorboard:defs.bzl", "tensorboard_typescript_genrule")
+
+licenses(["notice"])  # Apache 2.0
+
+web_library(
+    name = "tf_dashboard_common_d3v4",
+    srcs = [
+        "dashboard-style.html",
+        "run-color-style.html",
+        "scrollbar-style.html",
+        "tensorboard-color.html",
+        "tf-categorizer.html",
+        "tf-categorizer-bundle.js",
+        "tf-chart-scaffold.html",
+        "tf-collapsable-pane.html",
+        "tf-dashboard.html",
+        "tf-dashboard.js",
+        "tf-dashboard-layout.html",
+        "tf-downloader.html",
+        "tf-multi-checkbox.html",
+        "tf-multi-checkbox-bundle.js",
+        "tf-no-data-warning.html",
+        "tf-option-selector.html",
+        "tf-panes-helper.html",
+        "tf-regex-group.html",
+        "tf-regex-group-bundle.js",
+        "tf-run-selector.html",
+        "tf-sidebar-helper.html",
+    ],
+    path = "/tf-dashboard-common",
+    deps = [
+        "//tensorflow/tensorboard/components/tf_imports_d3v4:d3",
+        "//tensorflow/tensorboard/components/tf_imports_d3v4:lodash",
+        "//tensorflow/tensorboard/components/tf_storage_d3v4",
+        "//tensorflow/tensorboard/components/vz_sorting_d3v4",
+        "@org_polymer",
+        "@org_polymer_iron_ajax",
+        "@org_polymer_iron_collapse",
+        "@org_polymer_iron_icons",
+        "@org_polymer_paper_button",
+        "@org_polymer_paper_checkbox",
+        "@org_polymer_paper_dialog",
+        "@org_polymer_paper_dropdown_menu",
+        "@org_polymer_paper_icon_button",
+        "@org_polymer_paper_input",
+        "@org_polymer_paper_item",
+        "@org_polymer_paper_menu",
+        "@org_polymer_paper_slider",
+        "@org_polymer_paper_spinner",
+        "@org_polymer_paper_styles",
+        "@org_polymer_paper_toggle_button",
+    ],
+)
+
+web_library(
+    name = "demo",
+    srcs = [
+        "tf-categorizer-demo.html",
+        "tf-collapsable-pane-demo.html",
+        "tf-multi-checkbox-demo.html",
+        "tf-regex-group-demo.html",
+    ],
+    path = "/tf-dashboard-common",
+    deps = [
+        ":tf_dashboard_common_d3v4",
+        "//tensorflow/tensorboard/components/tf_color_scale_d3v4",
+        "@org_polymer_iron_demo_helpers",
+        "@org_polymer_paper_styles",
+    ],
+)
+
+tensorboard_typescript_bundle(
+    name = "tf_categorizer_bundle",
+    out = "tf-categorizer-bundle.ts",
+    namespace_srcs = {"TF.Dashboard.Categorizer": ["tf-categorizer.ts"]},
+    namespace_symbol_aliases = {"TF.Dashboard.Categorizer": {"compareTagNames": "VZ.Sorting.compareTagNames"}},
+)
+
+tensorboard_typescript_genrule(
+    name = "tf_categorizer_ts",
+    srcs = ["tf-categorizer-bundle.ts"],
+    typings = [
+        "@org_definitelytyped//:lodash.d.ts",
+        "@org_definitelytyped//:polymer.d.ts",
+        "@org_definitelytyped//:webcomponents.js.d.ts",
+        "//tensorflow/tensorboard/components/tf_imports_d3v4:d3.d.ts",
+        "//tensorflow/tensorboard/components/vz_sorting_d3v4:bundle.d.ts",
+    ],
+)
+
+tensorboard_typescript_bundle(
+    name = "tf_regex_group_bundle",
+    out = "tf-regex-group-bundle.ts",
+    namespace_srcs = {"TF.Dashboard.RegexGroup": ["tf-regex-group.ts"]},
+    namespace_symbol_aliases = {"TF.Dashboard.RegexGroup": {"storage": "TF.URIStorage"}},
+)
+
+tensorboard_typescript_genrule(
+    name = "tf_regex_group_ts",
+    srcs = ["tf-regex-group-bundle.ts"],
+    typings = [
+        "@org_definitelytyped//:polymer.d.ts",
+        "@org_definitelytyped//:webcomponents.js.d.ts",
+        "//tensorflow/tensorboard/components/tf_storage_d3v4:bundle.d.ts",
+    ],
+)
+
+tensorboard_typescript_bundle(
+    name = "tf_multi_checkbox_bundle",
+    out = "tf-multi-checkbox-bundle.ts",
+    namespace_srcs = {"TF.Dashboard.MultiCheckbox": ["tf-multi-checkbox.ts"]},
+    namespace_symbol_aliases = {"TF.Dashboard.MultiCheckbox": {"storage": "TF.URIStorage"}},
+)
+
+tensorboard_typescript_genrule(
+    name = "tf_multi_checkbox_ts",
+    srcs = ["tf-multi-checkbox-bundle.ts"],
+    typings = [
+        "@org_definitelytyped//:lodash.d.ts",
+        "@org_definitelytyped//:polymer.d.ts",
+        "@org_definitelytyped//:webcomponents.js.d.ts",
+        "//tensorflow/tensorboard/components/tf_storage_d3v4:bundle.d.ts",
+    ],
+)
+
+tensorboard_typescript_bundle(
+    name = "tf_dashboard_bundle",
+    out = "tf-dashboard.ts",
+    namespace_srcs = {
+        "TF.Dashboard": [
+            "dashboard-behavior.ts",
+            "reload-behavior.ts",
+        ],
+    },
+)
+
+tensorboard_typescript_genrule(
+    name = "tf_dashboard_ts",
+    srcs = ["tf-dashboard.ts"],
+)
+
+filegroup(
+    name = "all_files",
+    srcs = glob(["**"]),
+    tags = ["notsan"],
+)
diff --git a/tensorflow/tensorboard/components/tf_dashboard_common_d3v4/dashboard-behavior.ts b/tensorflow/tensorboard/components/tf_dashboard_common_d3v4/dashboard-behavior.ts
new file mode 100644
index 0000000000000000000000000000000000000000..3e40da14528dffb8abf9529eebb745ecdd575489
--- /dev/null
+++ b/tensorflow/tensorboard/components/tf_dashboard_common_d3v4/dashboard-behavior.ts
@@ -0,0 +1,38 @@
+/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+/**
+ * A behavior that TensorBoard dashboards must implement. This behavior serves
+ * the purpose of an interface.
+ */
+export function DashboardBehavior(dashboardName) {
+  return {
+    properties: {
+      name: {
+        type: String,
+        value: dashboardName,
+        readOnly: true,
+      },
+    },
+    // This method is called when the dashboard reloads, either when the
+    // dashboard is first visited, periodically reloaded, or manually reloaded
+    // via the user clicking the button. Note that dashboard custom elements
+    // that use TF.Dashboard.ReloadBehavior already implement a reload method.
+    reload() {
+      throw Error(
+          'The ' + dashboardName + ' dashboard does not implement reload.');
+    },
+  };
+}
diff --git a/tensorflow/tensorboard/components/tf_dashboard_common_d3v4/dashboard-style.html b/tensorflow/tensorboard/components/tf_dashboard_common_d3v4/dashboard-style.html
new file mode 100644
index 0000000000000000000000000000000000000000..6629e5bfc2284770da8559145c88e451ae063a77
--- /dev/null
+++ b/tensorflow/tensorboard/components/tf_dashboard_common_d3v4/dashboard-style.html
@@ -0,0 +1,53 @@
+<!--
+@license
+Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+-->
+
+<link rel="import" href="../paper-styles/paper-styles.html">
+<link rel="import" href="tensorboard-color.html">
+
+<dom-module id="dashboard-style">
+  <template>
+    <style>
+      .sidebar {
+        display: flex;
+        flex-direction: column;
+        height: 100%;
+        margin-right: 20px;
+      }
+
+      .sidebar-section {
+        border-top: solid 1px rgba(0, 0, 0, 0.12);
+        padding: 15px 0px 15px 30px;
+      }
+
+      .sidebar-section:first-child {
+        border: none;
+      }
+
+      .sidebar-section:last-child {
+        flex-grow: 1;
+        display: flex;
+      }
+
+      paper-checkbox {
+        --paper-checkbox-checked-color: var(--tb-ui-dark-accent);
+        --paper-checkbox-unchecked-color: var(--tb-ui-dark-accent);
+        font-size: 14px;
+        margin-top: 5px;
+      }
+    </style>
+  </template>
+</dom-module>
diff --git a/tensorflow/tensorboard/components/tf_dashboard_common_d3v4/reload-behavior.ts b/tensorflow/tensorboard/components/tf_dashboard_common_d3v4/reload-behavior.ts
new file mode 100644
index 0000000000000000000000000000000000000000..8b5ca120d609e26dea8dec57eede05ff39c518d0
--- /dev/null
+++ b/tensorflow/tensorboard/components/tf_dashboard_common_d3v4/reload-behavior.ts
@@ -0,0 +1,39 @@
+/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+/**
+ * ReloadBehavior: A simple behavior for dashboards where the
+ * frontendReload() function should find every child element with a
+ * given tag name (e.g. "tf-line-chart" or "tf-image-loader")
+ * and call a `reload` method on that child.
+ * May later extend it so it has more sophisticated logic, e.g. reloading
+ * only tags that are in view.
+ */
+export function ReloadBehavior(tagName) {
+  return {
+    properties: {
+      reloadTag: {
+        type: String,
+        value: tagName,
+      },
+    },
+    frontendReload: function() {
+      var elements = this.getElementsByTagName(this.reloadTag);
+      Array.prototype.forEach.call(elements, function(x) {
+        x.reload();
+      });
+    },
+  };
+}
diff --git a/tensorflow/tensorboard/components/tf_dashboard_common_d3v4/run-color-style.html b/tensorflow/tensorboard/components/tf_dashboard_common_d3v4/run-color-style.html
new file mode 100644
index 0000000000000000000000000000000000000000..b15861694f57c1d801fe6d2c4cf3e5cb2410a611
--- /dev/null
+++ b/tensorflow/tensorboard/components/tf_dashboard_common_d3v4/run-color-style.html
@@ -0,0 +1,79 @@
+<!--
+@license
+Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+-->
+
+<link rel="import" href="../paper-styles/paper-styles.html">
+
+<dom-module id="run-color-style">
+  <template>
+    <style>
+    [color-class="light-blue"] paper-checkbox {
+      --paper-checkbox-checked-color: var(--paper-light-blue-500);
+      --paper-checkbox-checked-ink-color: var(--paper-light-blue-500);
+      --paper-checkbox-unchecked-color: var(--paper-light-blue-900);
+      --paper-checkbox-unchecked-ink-color: var(--paper-light-blue-900);
+    }
+    [color-class="red"] paper-checkbox {
+      --paper-checkbox-checked-color: var(--paper-red-500);
+      --paper-checkbox-checked-ink-color: var(--paper-red-500);
+      --paper-checkbox-unchecked-color: var(--paper-red-900);
+      --paper-checkbox-unchecked-ink-color: var(--paper-red-900);
+    }
+    [color-class="green"] paper-checkbox {
+      --paper-checkbox-checked-color: var(--paper-green-500);
+      --paper-checkbox-checked-ink-color: var(--paper-green-500);
+      --paper-checkbox-unchecked-color: var(--paper-green-900);
+      --paper-checkbox-unchecked-ink-color: var(--paper-green-900);
+    }
+    [color-class="purple"] paper-checkbox {
+      --paper-checkbox-checked-color: var(--paper-purple-500);
+      --paper-checkbox-checked-ink-color: var(--paper-purple-500);
+      --paper-checkbox-unchecked-color: var(--paper-purple-900);
+      --paper-checkbox-unchecked-ink-color: var(--paper-purple-900);
+    }
+    [color-class="teal"] paper-checkbox {
+      --paper-checkbox-checked-color: var(--paper-teal-500);
+      --paper-checkbox-checked-ink-color: var(--paper-teal-500);
+      --paper-checkbox-unchecked-color: var(--paper-teal-900);
+      --paper-checkbox-unchecked-ink-color: var(--paper-teal-900);
+    }
+    [color-class="pink"] paper-checkbox {
+      --paper-checkbox-checked-color: var(--paper-pink-500);
+      --paper-checkbox-checked-ink-color: var(--paper-pink-500);
+      --paper-checkbox-unchecked-color: var(--paper-pink-900);
+      --paper-checkbox-unchecked-ink-color: var(--paper-pink-900);
+    }
+    [color-class="orange"] paper-checkbox {
+      --paper-checkbox-checked-color: var(--paper-orange-500);
+      --paper-checkbox-checked-ink-color: var(--paper-orange-500);
+      --paper-checkbox-unchecked-color: var(--paper-orange-900);
+      --paper-checkbox-unchecked-ink-color: var(--paper-orange-900);
+    }
+    [color-class="brown"] paper-checkbox {
+      --paper-checkbox-checked-color: var(--paper-brown-500);
+      --paper-checkbox-checked-ink-color: var(--paper-brown-500);
+      --paper-checkbox-unchecked-color: var(--paper-brown-900);
+      --paper-checkbox-unchecked-ink-color: var(--paper-brown-900);
+    }
+    [color-class="indigo"] paper-checkbox {
+      --paper-checkbox-checked-color: var(--paper-indigo-500);
+      --paper-checkbox-checked-ink-color: var(--paper-indigo-500);
+      --paper-checkbox-unchecked-color: var(--paper-indigo-900);
+      --paper-checkbox-unchecked-ink-color: var(--paper-indigo-900);
+    }
+    </style>
+  </template>
+</dom-module>
diff --git a/tensorflow/tensorboard/components/tf_dashboard_common_d3v4/scrollbar-style.html b/tensorflow/tensorboard/components/tf_dashboard_common_d3v4/scrollbar-style.html
new file mode 100644
index 0000000000000000000000000000000000000000..bfd61f66191df29521ecb3958f3bc9cccd57821e
--- /dev/null
+++ b/tensorflow/tensorboard/components/tf_dashboard_common_d3v4/scrollbar-style.html
@@ -0,0 +1,46 @@
+<!--
+@license
+Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+-->
+
+<link rel="import" href="../polymer/polymer.html">
+<link rel="import" href="../paper-styles/paper-styles.html">
+
+<dom-module id="scrollbar-style">
+  <template>
+    <style>
+      .scrollbar::-webkit-scrollbar-track
+      {
+        visibility: hidden;
+      }
+
+      .scrollbar::-webkit-scrollbar
+      {
+        width: 10px;
+      }
+
+      .scrollbar::-webkit-scrollbar-thumb
+      {
+        border-radius: 10px;
+        -webkit-box-shadow: inset 0 0 2px rgba(0,0,0,.3);
+        background-color: var(--paper-grey-500);
+        color: var(--paper-grey-900);
+      }
+      .scrollbar {
+        box-sizing: border-box;
+      }
+    </style>
+  </template>
+</dom-module>
diff --git a/tensorflow/tensorboard/components/tf_dashboard_common_d3v4/tensorboard-color.html b/tensorflow/tensorboard/components/tf_dashboard_common_d3v4/tensorboard-color.html
new file mode 100644
index 0000000000000000000000000000000000000000..7f9ca6461485ad9b6356b05fac48544b4a995dfb
--- /dev/null
+++ b/tensorflow/tensorboard/components/tf_dashboard_common_d3v4/tensorboard-color.html
@@ -0,0 +1,32 @@
+<!--
+@license
+Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+-->
+
+<link rel="import" href="../polymer/polymer.html">
+
+<style is="custom-style">
+
+  :root {
+    --tb-orange-weak: #ffa726;
+    --tb-orange-strong: #f57c00;
+    --tb-grey-darker: #e2e2e2;
+    --tb-grey-lighter: #f3f3f3;
+    --tb-ui-dark-accent: #757575;
+    --tb-ui-light-accent: #e0e0e0;
+    --tb-graph-faded: #e0d4b3;
+  }
+
+</style>
diff --git a/tensorflow/tensorboard/components/tf_dashboard_common_d3v4/test/BUILD b/tensorflow/tensorboard/components/tf_dashboard_common_d3v4/test/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..fc9912d54a4d64e206294a1caec8d9037132e739
--- /dev/null
+++ b/tensorflow/tensorboard/components/tf_dashboard_common_d3v4/test/BUILD
@@ -0,0 +1,49 @@
+package(
+    default_testonly = True,
+    default_visibility = ["//tensorflow:internal"],
+)
+
+load("@io_bazel_rules_closure//closure:defs.bzl", "web_library")
+load("//tensorflow/tensorboard:hacks.bzl", "tensorboard_typescript_bundle")
+load("//tensorflow/tensorboard:defs.bzl", "tensorboard_typescript_genrule")
+
+licenses(["notice"])  # Apache 2.0
+
+web_library(
+    name = "test",
+    srcs = [
+        "bundle.js",
+        "tests.html",
+    ],
+    path = "/tf-dashboard-common/test",
+    deps = [
+        "//tensorflow/tensorboard/components/tf_dashboard_common_d3v4",
+        "@org_npmjs_registry_web_component_tester",
+        "@org_polymer",
+        "@org_polymer_webcomponentsjs",
+    ],
+)
+
+tensorboard_typescript_genrule(
+    name = "ts",
+    srcs = ["bundle.ts"],
+    typings = [
+        "@org_definitelytyped//:chai.d.ts",
+        "@org_definitelytyped//:mocha.d.ts",
+        "//tensorflow/tensorboard/components/tf_dashboard_common_d3v4:tf-categorizer-bundle.d.ts",
+    ],
+)
+
+tensorboard_typescript_bundle(
+    name = "bundle",
+    out = "bundle.ts",
+    namespace_srcs = {"TF.Dashboard": ["tf-categorizer-tests.ts"]},
+    namespace_symbol_aliases = {"TF.Dashboard": {"cat": "TF.Dashboard.Categorizer"}},
+)
+
+filegroup(
+    name = "all_files",
+    testonly = 0,
+    srcs = glob(["**"]),
+    tags = ["notsan"],
+)
diff --git a/tensorflow/tensorboard/components/tf_dashboard_common_d3v4/test/tests.html b/tensorflow/tensorboard/components/tf_dashboard_common_d3v4/test/tests.html
new file mode 100644
index 0000000000000000000000000000000000000000..cd33cee47427fd4f7cce1deeb5932937aa810b8c
--- /dev/null
+++ b/tensorflow/tensorboard/components/tf_dashboard_common_d3v4/test/tests.html
@@ -0,0 +1,24 @@
+<!doctype html>
+<!--
+@license
+Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+-->
+
+<meta charset="utf-8">
+<script src="../../webcomponentsjs/webcomponents-lite.min.js"></script>
+<script src="../../web-component-tester/browser.js"></script>
+<link rel="import" href="../tf-categorizer.html">
+<body>
+<script src="bundle.js"></script>
diff --git a/tensorflow/tensorboard/components/tf_dashboard_common_d3v4/test/tf-categorizer-tests.ts b/tensorflow/tensorboard/components/tf_dashboard_common_d3v4/test/tf-categorizer-tests.ts
new file mode 100644
index 0000000000000000000000000000000000000000..a786f39b4fb6f6c9560916e8ab863af8503780b9
--- /dev/null
+++ b/tensorflow/tensorboard/components/tf_dashboard_common_d3v4/test/tf-categorizer-tests.ts
@@ -0,0 +1,144 @@
+/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the 'License');
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an 'AS IS' BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+import * as cat from '../tf-categorizer';
+
+let assert = chai.assert;
+
+describe('categorizer', () => {
+  describe('topLevelNamespaceCategorizer', () => {
+    it('returns empty array on empty tags', () => {
+      assert.lengthOf(cat.topLevelNamespaceCategorizer([]), 0);
+    });
+
+    it('handles a simple case', () => {
+      let simple = [
+        'foo1/bar', 'foo1/zod', 'foo2/bar', 'foo2/zod', 'gosh/lod/mar',
+        'gosh/lod/ned'
+      ];
+      let expected = [
+        {name: 'foo1', tags: ['foo1/bar', 'foo1/zod']},
+        {name: 'foo2', tags: ['foo2/bar', 'foo2/zod']},
+        {name: 'gosh', tags: ['gosh/lod/mar', 'gosh/lod/ned']},
+      ];
+      assert.deepEqual(cat.topLevelNamespaceCategorizer(simple), expected);
+    });
+
+    it('orders the categories', () => {
+      let test = ['e', 'f', 'g', 'a', 'b', 'c'];
+      let expected = [
+        {name: 'a', tags: ['a']},
+        {name: 'b', tags: ['b']},
+        {name: 'c', tags: ['c']},
+        {name: 'e', tags: ['e']},
+        {name: 'f', tags: ['f']},
+        {name: 'g', tags: ['g']},
+      ];
+      assert.deepEqual(cat.topLevelNamespaceCategorizer(test), expected);
+    });
+
+    it('handles cases where category names overlap node names', () => {
+      let test = ['a', 'a/a', 'a/b', 'a/c', 'b', 'b/a'];
+      const actual = cat.topLevelNamespaceCategorizer(test);
+      let expected = [
+        {name: 'a', tags: ['a', 'a/a', 'a/b', 'a/c']},
+        {name: 'b', tags: ['b', 'b/a']},
+      ];
+      assert.deepEqual(actual, expected);
+    });
+
+    it('handles singleton case', () => {
+      assert.deepEqual(
+          cat.topLevelNamespaceCategorizer(['a']), [{name: 'a', tags: ['a']}]);
+    });
+  });
+
+  describe('customCategorizer', () => {
+    function noFallbackCategorizer(tags: string[]): cat.Category[] {
+      return [];
+    }
+
+    function testCategorizer(
+        defs: string[], fallback: cat.Categorizer,
+        tags: string[]): cat.Category[] {
+      const catDefs = defs.map(cat.defineCategory);
+      return cat._categorizer(catDefs, fallback)(tags);
+    }
+
+    it('categorizes by regular expression', () => {
+      let defs = ['foo..', 'bar..'];
+      let tags = ['fooab', 'fooxa', 'barts', 'barms'];
+      const actual = testCategorizer(defs, noFallbackCategorizer, tags);
+      let expected = [
+        {name: 'foo..', tags: ['fooab', 'fooxa']},
+        {name: 'bar..', tags: ['barms', 'barts']},
+      ];
+      assert.deepEqual(actual, expected);
+    });
+
+    it('matches non-exclusively', () => {
+      let tags = ['abc', 'bar', 'zod'];
+      const actual =
+          testCategorizer(['...', 'bar'], noFallbackCategorizer, tags);
+      let expected = [
+        {name: '...', tags: ['abc', 'bar', 'zod']},
+        {name: 'bar', tags: ['bar']},
+      ];
+      assert.deepEqual(actual, expected);
+    });
+
+    it('creates categories for unmatched rules', () => {
+      const actual =
+          testCategorizer(['a', 'b', 'c'], noFallbackCategorizer, []);
+      let expected = [
+        {name: 'a', tags: []},
+        {name: 'b', tags: []},
+        {name: 'c', tags: []},
+      ];
+      assert.deepEqual(actual, expected);
+    });
+
+    it('category regexs work with special characters', () => {
+      let defs = ['^\\w+$', '^\\d+$', '^\\/..$'];
+      let tags = ['foo', '3243', '/xa'];
+      const actual = testCategorizer(defs, noFallbackCategorizer, tags);
+      let expected = [
+        {name: '^\\w+$', tags: ['3243', 'foo']},
+        {name: '^\\d+$', tags: ['3243']},
+        {name: '^\\/..$', tags: ['/xa']},
+      ];
+      assert.deepEqual(actual, expected);
+    });
+
+    it('category tags are sorted', () => {
+      let tags = ['a', 'z', 'c', 'd', 'e', 'x', 'f', 'y', 'g'];
+      let sorted = tags.slice().sort();
+      let expected = [{name: '.*', tags: sorted}];
+      const actual = testCategorizer(['.*'], noFallbackCategorizer, tags);
+      assert.deepEqual(actual, expected);
+    });
+
+    it('if nonexclusive: all tags passed to fallback', () => {
+      let passedToDefault = null;
+      function defaultCategorizer(tags: string[]): cat.Category[] {
+        passedToDefault = tags;
+        return [];
+      }
+      let tags = ['foo', 'bar', 'foo123'];
+      testCategorizer(['foo'], defaultCategorizer, tags);
+      assert.deepEqual(passedToDefault, tags);
+    });
+  });
+});
diff --git a/tensorflow/tensorboard/components/tf_dashboard_common_d3v4/tf-categorizer-demo.html b/tensorflow/tensorboard/components/tf_dashboard_common_d3v4/tf-categorizer-demo.html
new file mode 100644
index 0000000000000000000000000000000000000000..23babaaecc4d2fe1b31fa0e930a608a41c307f90
--- /dev/null
+++ b/tensorflow/tensorboard/components/tf_dashboard_common_d3v4/tf-categorizer-demo.html
@@ -0,0 +1,106 @@
+<!doctype html>
+<!--
+@license
+Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+-->
+
+<html>
+ <head>
+  <link rel="import" href="tf-categorizer.html">
+ </head>
+ <body>
+  <style>
+  </style>
+  <dom-module id="x-demo">
+    <style>
+      .container {
+        width: 255px;
+        padding: 10px;
+        border: 1px solid var(--paper-indigo-900);
+        border-radius: 5px;
+        position: fixed;
+      }
+      :host {
+        margin: 0px;
+      }
+
+      .categories {
+        font-family: "RobotoDraft",Helvetica;
+        margin-left: 300px;
+        width: 500px;
+        border: 1px solid var(--paper-indigo-500);
+        border-radius: 5px;
+      }
+
+      .category {
+        background-color: var(--paper-indigo-50);
+        margin: 20px;
+        padding: 20px;
+        border-radius: 5px;
+      }
+
+      .cat-name {
+        font-size: 20px;
+      }
+
+      .tag {
+        border-radius: 5px;
+        padding: 5px;
+        margin: 5px;
+        background-color: var(--paper-indigo-900);
+        color: white;
+      }
+    </style>
+    <template>
+      <div class="container">
+        <tf-categorizer categories="{{categories}}" tags="[[tags]]" id="demo"></tf-categorizer>
+      </div>
+      <div class="categories">
+        <template is="dom-repeat" items="[[categories]]">
+          <div class="category">
+            <p class="cat-name">Category: <span>[[item.name]]</span></p>
+            <div class="tags-container layout horizontal wrap">
+              <template is="dom-repeat" items="[[item.tags]]">
+                <span class="tag layout vertical center-center">[[item]]</span>
+              </template>
+            </div>
+          </div>
+        </template>
+      </div>
+    </template>
+    <script>
+
+    function tagsGenerator() {
+      var tags = ["special1", "special2", "special3", "special4", "special5"];
+      ["l1", "l2", "l3", "l4", "l5"].forEach(function(l) {
+        ["foo", "bar", "baz", "boink", "zod", "specialx"].forEach(function(x) {
+          tags.push(l + "/" + x);
+        });
+      });
+      return tags;
+    }
+
+    Polymer({
+      is: "x-demo",
+      properties: {
+        tags: { type: Array, value: tagsGenerator },
+      },
+    });
+    </script>
+  </dom-module>
+
+  <x-demo id="demo"></x-demo>
+ </body>
+</html>
diff --git a/tensorflow/tensorboard/components/tf_dashboard_common_d3v4/tf-categorizer.html b/tensorflow/tensorboard/components/tf_dashboard_common_d3v4/tf-categorizer.html
new file mode 100644
index 0000000000000000000000000000000000000000..6388ab5e7d4ed490514ef180d2ad8b98494ab618
--- /dev/null
+++ b/tensorflow/tensorboard/components/tf_dashboard_common_d3v4/tf-categorizer.html
@@ -0,0 +1,63 @@
+<!--
+@license
+Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+-->
+
+<link rel="import" href="../polymer/polymer.html">
+<link rel="import" href="../paper-toggle-button/paper-toggle-button.html">
+<link rel="import" href="../tf-imports/d3.html">
+<link rel="import" href="../tf-imports/lodash.html">
+<link rel="import" href="../vz-sorting/vz-sorting.html">
+<link rel="import" href="tf-regex-group.html">
+<link rel="import" href="tensorboard-color.html">
+
+<!--
+`tf-categorizer` turns an array of tags into an array of categories
+
+The transformation from tags to categories is controlled by the user, through
+interacting with the categorizer widget.
+
+(See type signatures in categorizer.ts)
+
+Example:
+  <tf-categorizer tags="[[tags]]" categories="{{categories}}"></tf-categorizer>
+
+Public Properties:
+`tags` - Array of strings that are the tags to categorize. Should be one-way bound downward.
+`categories` - Array of Categorizer.Category objects that are generated by the Categorizer.
+  Are readOnly and notify: True. Expected to be one-way bound upward.
+
+The categorizer provides inputs for adding regular expression rules and toggling whether
+categories are exclusive.
+-->
+<dom-module id="tf-categorizer">
+  <template>
+    <div class="inputs">
+      <tf-regex-group id="regexGroup" regexes="{{regexes}}"></tf-regex-group>
+    </div>
+    <style>
+      :host {
+        display: block;
+        padding-bottom: 5px;
+      }
+      paper-checkbox {
+        --paper-checkbox-checked-color: var(--paper-grey-600);
+        --paper-checkbox-unchecked-color: var(--paper-grey-600);
+        font-size: 14px;
+      }
+    </style>
+  </template>
+  <script src="tf-categorizer-bundle.js"></script>
+</dom-module>
diff --git a/tensorflow/tensorboard/components/tf_dashboard_common_d3v4/tf-categorizer.ts b/tensorflow/tensorboard/components/tf_dashboard_common_d3v4/tf-categorizer.ts
new file mode 100644
index 0000000000000000000000000000000000000000..5d3307809724c1be3084c7454eb04d1c7bf215aa
--- /dev/null
+++ b/tensorflow/tensorboard/components/tf_dashboard_common_d3v4/tf-categorizer.ts
@@ -0,0 +1,192 @@
+/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the 'License');
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an 'AS IS' BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+import * as d3 from 'd3';  // from //third_party/javascript/typings/d3_v4
+import * as _ from 'lodash';
+
+import {compareTagNames} from '../vz-sorting/sorting';
+
+/**
+ * This module contains methods that allow sorting tags into 'categories'.
+ * A category contains a name and a list of tags.
+ * The sorting strategy is defined by a 'CustomCategorization', which contains
+ * 'categoryDefinitions' which are regex rules used to construct a category.
+ * E.g. the regex rule 'xent' will create a category called 'xent' that
+ * contains values whose tags match the regex.
+ *
+ * After custom categories are evaluated, the tags are sorted by a hardcoded
+ * fallback categorizer, which may, for example, group tags into categories
+ * based on their top namespace.
+ */
+
+export interface Category {
+  // Categories that data is sorted into
+  name: string;
+  tags: string[];
+}
+
+export interface CustomCategorization {
+  // Defines a categorization strategy
+  categoryDefinitions: string[];
+  fallbackCategorizer: string;
+  /* {'TopLevelNamespaceCategorizer',
+      'LegacyUnderscoreCategorizer'} */
+}
+
+export interface Categorizer {
+  // Function that generates categories
+  (tags: string[]): Category[];
+}
+
+/* Canonical TensorFlow ops are namespaced using forward slashes.
+ * This fallback categorizer categorizes by the top-level namespace.
+ */
+export var topLevelNamespaceCategorizer: Categorizer = splitCategorizer(/\//);
+
+export function fallbackCategorizer(s: string): Categorizer {
+  switch (s) {
+    case 'TopLevelNamespaceCategorizer':
+      return topLevelNamespaceCategorizer;
+    default:
+      throw new Error('Unrecognized categorization strategy: ' + s);
+  }
+}
+
+/* An 'extractor' is a function that takes a tag name, and 'extracts' a
+ * category name.
+ * This function takes an extractor, and produces a categorizer.
+ * Currently, it is just used for the fallbackCategorizer, but we may want to
+ * refactor the general categorization logic to use the concept of extractors.
+ */
+function extractorToCategorizer(extractor: (s: string) => string): Categorizer {
+  return (tags: string[]): Category[] => {
+    if (tags.length === 0) {
+      return [];
+    }
+
+    // Maps between top-level name and category. We use the mapping to avoid
+    // duplicating categories per run.
+    const categoryMapping: {[key: string]: Category} = {};
+
+    tags.forEach((t: string) => {
+      const topLevel = extractor(t);
+      if (!categoryMapping[topLevel]) {
+        const newCategory = {
+          name: topLevel,
+          tags: [],
+        };
+        categoryMapping[topLevel] = newCategory;
+      }
+
+      categoryMapping[topLevel].tags.push(t);
+    });
+
+    // Sort categories into alphabetical order.
+    const categories =
+        _.map(_.keys(categoryMapping).sort(), key => categoryMapping[key]);
+    _.forEach(categories, (category) => {
+      // Sort the tags within each category.
+      category.tags.sort(compareTagNames);
+    });
+    return categories;
+  };
+}
+
+function splitCategorizer(r: RegExp): Categorizer {
+  let extractor = (t: string) => {
+    return t.split(r)[0];
+  };
+  return extractorToCategorizer(extractor);
+}
+
+export interface CategoryDefinition {
+  name: string;
+  matches: (t: string) => boolean;
+}
+
+export function defineCategory(ruledef: string): CategoryDefinition {
+  let r = new RegExp(ruledef);
+  let f = function(tag: string): boolean {
+    return r.test(tag);
+  };
+  return {name: ruledef, matches: f};
+}
+
+export function _categorizer(
+    rules: CategoryDefinition[], fallback: Categorizer) {
+  return function(tags: string[]): Category[] {
+    let remaining: d3.Set = d3.set(tags);
+    let userSpecified = rules.map((def: CategoryDefinition) => {
+      let tags: string[] = [];
+      remaining.each((t: string) => {
+        if (def.matches(t)) {
+          tags.push(t);
+        }
+      });
+      let cat = {name: def.name, tags: tags.sort(compareTagNames)};
+      return cat;
+    });
+    let defaultCategories = fallback(remaining.values());
+    return userSpecified.concat(defaultCategories);
+  };
+}
+
+export function categorizer(s: CustomCategorization): Categorizer {
+  let rules = s.categoryDefinitions.map(defineCategory);
+  let fallback = fallbackCategorizer(s.fallbackCategorizer);
+  return _categorizer(rules, fallback);
+};
+
+Polymer({
+  is: 'tf-categorizer',
+  properties: {
+    regexes: {type: Array},
+    tags: {type: Array},
+    categoriesAreExclusive: {type: Boolean, value: true},
+    fallbackCategorizer: {
+      type: String,
+      value: 'TopLevelNamespaceCategorizer',
+    },
+    categorizer: {
+      type: Object,
+      computed:
+          'computeCategorization(regexes.*, categoriesAreExclusive, fallbackCategorizer)',
+    },
+    categories: {
+      type: Array,
+      value: function() {
+        return [];
+      },
+      notify: true,
+      readOnly: true
+    },
+  },
+  observers: ['recategorize(tags.*, categorizer)'],
+  computeCategorization: function(
+      regexes, categoriesAreExclusive, fallbackCategorizer) {
+    var categorizationStrategy = {
+      categoryDefinitions: regexes.base,
+      categoriesAreExclusive: categoriesAreExclusive,
+      fallbackCategorizer: fallbackCategorizer,
+    };
+    return categorizer(categorizationStrategy);
+  },
+  recategorize: function() {
+    this.debounce('tf-categorizer-recategorize', function() {
+      var categories = this.categorizer(this.tags);
+      this._setCategories(categories);
+    })
+  },
+});
diff --git a/tensorflow/tensorboard/components/tf_dashboard_common_d3v4/tf-chart-scaffold.html b/tensorflow/tensorboard/components/tf_dashboard_common_d3v4/tf-chart-scaffold.html
new file mode 100644
index 0000000000000000000000000000000000000000..9cacb7f5c894fd0c73d56dac5858d883543f981c
--- /dev/null
+++ b/tensorflow/tensorboard/components/tf_dashboard_common_d3v4/tf-chart-scaffold.html
@@ -0,0 +1,152 @@
+<!--
+@license
+Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+-->
+
+<link rel="import" href="../polymer/polymer.html">
+
+<!--
+tf-chart-scaffold is responsible for providing data from TensorBoard to charts.
+It has the following settable properties:
+tag: (required, string) - the name of the tag to load for this chart
+visibleSeries: (required, string[]) - the names of the series the chart should
+    display.
+dataProvider: (required, VZ.ChartHelpers.DataFn) - function that takes (tag,
+    run) and returns a promise containing an array of VZ.ChartHelpers.Datum,
+    compatible with TF.Backend.Datum.
+
+It exposes the following methods:
+chart() - Returns the underlying chart element.
+reload() - Reloads the data and sends it to the underlying chart.
+
+This element should have a compatible chart plugin element as it's content. The
+plugin is requred to implement two functions:
+- setVisibleSeries(names: string[]): a function that receives an array of series
+    names as the first parameter, responsible for changing the series currently
+    being displayed to only the series in this array.
+- setSeriesData(name: string, data: VZ.ChartHelpers.Datum[]): sets the data of
+    the series with the given name to the data given in the second parameter.
+-->
+<dom-module id="tf-chart-scaffold">
+  <template>
+    <content></content>
+    <style>
+      :host {
+        -webkit-user-select: none;
+        -moz-user-select: none;
+        display: flex;
+        flex-direction: column;
+        flex-grow: 1;
+        flex-shrink: 1;
+        position: relative;
+      }
+    </style>
+  </template>
+  <script>
+    "use strict";
+
+    Polymer({
+      is: "tf-chart-scaffold",
+      properties: {
+        tag: String,
+        dataProvider: Function,
+        visibleSeries: Array,
+        _attached: {
+          type: Boolean,
+          value: false
+        },
+
+        // Storing the update ID of the previous request for data enables us to determine if a
+        // data response is outdated. We rely on an increasing ID instead of timestamp because
+        // successive updates often fire within the same millisecond.
+        _dataUpdateIdOfLastRequest: Number,
+        _nextAvailableDataUpdateId: {
+          type: Number,
+          value: 1,
+        },
+      },
+      observers: [
+        "reload(tag, dataProvider)",
+        "_changeSeries(visibleSeries.*)"
+      ],
+      ready: function() {
+        this.fire('ready');
+      },
+      attached: function() {
+        this._attached = true;
+        this._changeSeries();
+      },
+      detached: function() {
+        this._attached = false;
+      },
+      reload: function() {
+        if (!this._attached) {
+          return;
+        }
+        else if (!this.dataProvider) {
+          throw new Error('tf-chart-scaffold requires a dataProvider.');
+        }
+        else if (!this.tag) {
+          throw new Error('tf-chart-scaffold requires a tag.');
+        }
+
+        // TODO(chizeng): At this point, notify effective children that the previous data has been
+        // invalidated. For instance, the image dashboard may want to clear its images. Today, the
+        // chart scaffold only informs children when the new image URLs response finishes loading.
+
+        const dataUpdateId = this._nextAvailableDataUpdateId++;
+        this._dataUpdateIdOfLastRequest = dataUpdateId;
+
+        this.visibleSeries.forEach(function(name) {
+          this.dataProvider(this.tag, name).then(function(data) {
+            if (dataUpdateId != this._dataUpdateIdOfLastRequest) {
+              // This response is outdated. Ignore it.
+              // TODO(chizeng): Explore canceling an outdated request before we even receive its
+              // response. This involves creating hooks into the request manager and might introduce
+              // some complexity that may not be worth it; Tensorboard frankly does not seem
+              // bottlenecked by the network (It is often run in fast corp networks or locally.).
+              return;
+            }
+            this.chart().setSeriesData(name, data);
+          }.bind(this));
+        }.bind(this));
+      },
+      _changeSeries: function() {
+        if (!this._attached) {
+           return;
+        }
+        else if (!this.visibleSeries) {
+          throw new Error('tf-chart-scaffold requires a visibleSeries.');
+        }
+
+        this.chart().setVisibleSeries(this.visibleSeries);
+        this.reload();
+      },
+      chart: function() {
+        var children = this.getEffectiveChildren();
+        if (!children.length) {
+          throw new Error('tf-chart-scaffold has no children');
+        }
+
+        var child = children[0];
+        if (!child.setVisibleSeries || !child.setSeriesData) {
+          throw new Error("tf-chart-scaffold's content doesn't implement the " +
+              "required interface");
+        }
+        return child;
+      }
+    });
+  </script>
+</dom-module>
diff --git a/tensorflow/tensorboard/components/tf_dashboard_common_d3v4/tf-collapsable-pane-demo.html b/tensorflow/tensorboard/components/tf_dashboard_common_d3v4/tf-collapsable-pane-demo.html
new file mode 100644
index 0000000000000000000000000000000000000000..6c8bdb92ee79ddb0bd018d1eeb13ee9af40f2b9a
--- /dev/null
+++ b/tensorflow/tensorboard/components/tf_dashboard_common_d3v4/tf-collapsable-pane-demo.html
@@ -0,0 +1,34 @@
+<!doctype html>
+<!--
+@license
+Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+-->
+
+<html>
+ <head>
+   <link rel="import" href="tf-collapsable-pane.html">
+   
+ </head>
+ <body>
+  <style>
+  </style>
+  <tf-collapsable-pane name="foo">
+    <h1>This is content inside the pane.</h1>
+  </tf-collapsable-pane>
+ </body>
+ <script>
+
+ </script>
+</html>
diff --git a/tensorflow/tensorboard/components/tf_dashboard_common_d3v4/tf-collapsable-pane.html b/tensorflow/tensorboard/components/tf_dashboard_common_d3v4/tf-collapsable-pane.html
new file mode 100644
index 0000000000000000000000000000000000000000..e82540127fa5c765cde178dcc1d17014854990d2
--- /dev/null
+++ b/tensorflow/tensorboard/components/tf_dashboard_common_d3v4/tf-collapsable-pane.html
@@ -0,0 +1,109 @@
+<!--
+@license
+Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+-->
+
+<link rel="import" href="../polymer/polymer.html">
+<link rel="import" href="../iron-collapse/iron-collapse.html">
+
+<dom-module id="tf-collapsable-pane">
+  <template>
+    <button
+      class="heading"
+      on-tap="togglePane"
+      open-button$="[[opened]]"
+    >
+    <span class="name">[[name]]</span>
+    <span class="count">
+      <span>[[count]]</span>
+    </span>
+  </button>
+    <iron-collapse opened="[[opened]]">
+      <div class="content">
+        <template is="dom-if" if="[[opened]]" restamp="[[restamp]]">
+          <content></content>
+        </template>
+      </div>
+    </iron-collapse>
+    <style>
+      :host {
+        display: block;
+        margin: 0 5px 1px 10px;
+      }
+
+      :host:first-of-type {
+        margin-top: 20px;
+      }
+      
+      :host:last-of-type {
+        margin-bottom: 20px;
+      }
+
+      .heading {
+        background-color: white;
+        border: none;
+        cursor: pointer;
+        width: 100%;
+        font-size: 15px;
+        line-height: 1;
+        box-shadow: 0 1px 5px rgba(0,0,0,0.2);
+        padding: 10px 15px;
+      }
+
+      .content {
+        padding: 15px;
+        border: 1px solid #dedede;
+        border-top: none;
+        border-bottom-left-radius: 2px;
+        border-bottom-right-radius: 2px;
+        background: white;
+      }
+
+      [open-button] {
+        border-bottom-left-radius: 0px !important;
+        border-bottom-right-radius: 0px !important;
+      }
+
+      .name {
+        float: left;
+      }
+
+      .count {
+        float: right;
+        margin-right: 5px;
+        font-size: 12px;
+        color: var(--paper-grey-500);
+      }
+    </style>
+  </template>
+  <script>
+    Polymer({
+      is: "tf-collapsable-pane",
+      properties: {
+        opened: {type: Boolean, value: false},
+        restamp: {type: Boolean, value: true},
+        name: {type: String, observer: "hide"},
+        count: {type: Number},
+      },
+      hide: function() {
+        this.opened = false;
+      },
+      togglePane: function() {
+        this.opened = !this.opened;
+      }
+    });
+  </script>
+
+</dom-module>
diff --git a/tensorflow/tensorboard/components/tf_dashboard_common_d3v4/tf-dashboard-layout.html b/tensorflow/tensorboard/components/tf_dashboard_common_d3v4/tf-dashboard-layout.html
new file mode 100644
index 0000000000000000000000000000000000000000..e0e8a2b52c38965b78e254cf1c6c0bf4b5c0d4b3
--- /dev/null
+++ b/tensorflow/tensorboard/components/tf_dashboard_common_d3v4/tf-dashboard-layout.html
@@ -0,0 +1,67 @@
+<!--
+@license
+Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+-->
+
+<link rel="import" href="../polymer/polymer.html">
+<link rel="import" href="scrollbar-style.html">
+<link rel="import" href="tensorboard-color.html">
+
+<!--
+Generic layout for a dashboard.
+-->
+<dom-module id="tf-dashboard-layout">
+  <template>
+    <div id="sidebar">
+      <content select=".sidebar"></content>
+    </div>
+
+    <div id="center" class="scrollbar">
+      <content select=".center"></content>
+    </div>
+    <style include="scrollbar-style"></style>
+    <style>
+      #sidebar {
+        width: inherit;
+        height: 100%;
+        overflow: ellipsis;
+        flex-grow: 0;
+        flex-shrink: 0;
+      }
+
+      #center {
+        height: 100%;
+        overflow-y: auto;
+        flex-grow: 1;
+        flex-shrink: 1;
+      }
+
+      .tf-graph-dashboard #center {
+        background: white;
+      }
+
+      :host {
+        display: flex;
+        flex-direction: row;
+        height: 100%;
+      }
+    </style>
+  </template>
+  <script>
+    Polymer({
+      is: "tf-dashboard-layout",
+    });
+  </script>
+</dom-module>
diff --git a/tensorflow/tensorboard/components/tf_dashboard_common_d3v4/tf-dashboard.html b/tensorflow/tensorboard/components/tf_dashboard_common_d3v4/tf-dashboard.html
new file mode 100644
index 0000000000000000000000000000000000000000..475c2cef3bd6c358d15adb09ccdc7790af539fc9
--- /dev/null
+++ b/tensorflow/tensorboard/components/tf_dashboard_common_d3v4/tf-dashboard.html
@@ -0,0 +1,25 @@
+<!--
+@license
+Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+-->
+
+<link rel="import" href="../polymer/polymer.html">
+<link rel="import" href="tf-dashboard-layout.html">
+<link rel="import" href="tensorboard-color.html">
+<link rel="import" href="dashboard-style.html">
+<link rel="import" href="tf-downloader.html">
+<link rel="import" href="tf-no-data-warning.html">
+
+<script src="tf-dashboard.js"></script>
diff --git a/tensorflow/tensorboard/components/tf_dashboard_common_d3v4/tf-downloader.html b/tensorflow/tensorboard/components/tf_dashboard_common_d3v4/tf-downloader.html
new file mode 100644
index 0000000000000000000000000000000000000000..719142595984e2e529c2b569098efbe5258e6906
--- /dev/null
+++ b/tensorflow/tensorboard/components/tf_dashboard_common_d3v4/tf-downloader.html
@@ -0,0 +1,99 @@
+<!--
+@license
+Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+-->
+
+<link rel="import" href="../polymer/polymer.html">
+<link rel="import" href="../paper-dropdown-menu/paper-dropdown-menu.html">
+<link rel="import" href="../paper-menu/paper-menu.html">
+<link rel="import" href="../paper-item/paper-item.html">
+
+<dom-module id="tf-downloader">
+  <template>
+    <paper-dropdown-menu
+      no-label-float="true"
+      label="run to download"
+      selected-item-label="{{_run}}"
+    >
+      <paper-menu class="dropdown-content">
+        <template is="dom-repeat" items="[[runs]]">
+          <paper-item no-label-float=true>[[item]]</paper-item>
+        </template>
+      </paper-menu>
+    </paper-dropdown-menu>
+    <div class="center">
+      <span>
+        <a
+          download="[[_csvName(_run)]]"
+          href="[[_csvUrl(_run, urlFn)]]"
+          >CSV</a>
+        <a
+          download="[[_jsonName(_run)]]"
+          href="[[_jsonUrl(_run, urlFn)]]"
+          >JSON</a>
+      </span>
+    </div>
+    <style>
+      :host {
+        display: flex;
+        height: 32px;
+      }
+      .center {
+        display: flex;
+        align-self: center;
+      }
+      paper-dropdown-menu {
+        width: 100px;
+        --paper-input-container-label: {
+          font-size: 10px;
+        }
+        --paper-input-container-input: {
+          font-size: 10px;
+        }
+      }
+      a {
+        font-size: 10px;
+        border-radius: 3px;
+        border: 1px solid #EEE;
+      }
+      paper-input {
+        font-size: 22px;
+      }
+    </style>
+  </template>
+  <script>
+    Polymer({
+      is: "tf-downloader",
+      properties: {
+        _run: String,
+        runs: Array,
+        tag: String,
+        urlFn: Function,
+      },
+      _csvUrl: function(_run, urlFn) {
+        return urlFn(this.tag, _run) + "&format=csv";
+      },
+      _jsonUrl: function(_run, urlFn) {
+        return urlFn(this.tag, _run);
+      },
+      _csvName: function(_run) {
+        return "run_" + _run + ",tag_" + this.tag + ".csv";
+      },
+      _jsonName: function(_run) {
+        return "run-" + _run + "-tag-" + this.tag + ".json";
+      },
+    });
+  </script>
+</dom-module>
diff --git a/tensorflow/tensorboard/components/tf_dashboard_common_d3v4/tf-multi-checkbox-demo.html b/tensorflow/tensorboard/components/tf_dashboard_common_d3v4/tf-multi-checkbox-demo.html
new file mode 100644
index 0000000000000000000000000000000000000000..d0f5aa6f27d7cf5351c5c50fc3be693ce1bd39d4
--- /dev/null
+++ b/tensorflow/tensorboard/components/tf_dashboard_common_d3v4/tf-multi-checkbox-demo.html
@@ -0,0 +1,176 @@
+<!doctype html>
+<!--
+@license
+Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+-->
+
+<html>
+<head>
+<link rel="import" href="../tf-color-scale/tf-color-scale.html">
+<link rel="import" href="tf-multi-checkbox.html">
+
+</head>
+<body>
+<script>
+var seed = 1;
+function random() {
+  var x = Math.sin(seed++) * 10000;
+  return x - Math.floor(x);
+}
+</script>
+<style>
+</style>
+
+<dom-module id="mc-demo">
+  <template>
+    <tf-multi-checkbox
+      id="multiCheckbox"
+      names="[[names]]"
+      tooltips="[[_tooltips]]"
+      class-scale="[[classScale]]"
+      highlights="[[highlights]]"
+    ></tf-multi-checkbox>
+    <tf-color-scale
+      id="colorScale"
+      runs="[[names]]"
+      out-class-scale="{{classScale}}"
+    ></tf-color-scale>
+  <style>
+  </style>
+  </template>
+  <script>
+
+  function randomTooltip() {
+    var s = "";
+    while (random() < 0.8) {
+      s += String(10*random())[0];
+    }
+    return s;
+  }
+  Polymer({
+    is: "mc-demo",
+    properties: {
+      names: Array,
+      tooltips: Object,
+      autoGenerateTooltips: {value: true},
+      _tooltips: Object,
+      classScale: Function,
+      highlights: Array,
+    },
+    observers: [
+      'autogenerate(names, autoGenerateTooltips)',
+      'randomHighlights(names)'
+    ],
+    autogenerate: function(names, autoGenerateTooltips) {
+      if (autoGenerateTooltips) {
+        var tooltips = {};
+        names.forEach(function(n) {
+        if (random() > 0.5) {
+          tooltips[n] = randomTooltip();
+        }
+      });
+      this._tooltips = tooltips;
+      }
+    },
+    randomHighlights: function(names) {
+      var h = [];
+      names.forEach(function(n) {
+        if (random() > 0.6) {
+          h.push(n);
+        }
+      });
+      this.highlights = h;
+    }
+  });
+  </script>
+</dom-module>
+
+<dom-module id="x-demo">
+<style>
+.small {
+  width: 200px;
+  height: 500px;
+}
+.large {
+  width: 500px;
+  height: 900px;
+}
+html,body {
+  height: 100%;
+}
+mc-demo {
+  padding: 5px;
+  border: 1px solid var(--paper-red-500);
+  display: inline-block;
+}
+</style>
+<template>
+  <div class="demo-block">
+    <mc-demo id="demo1" class="small" names="[[long_names]]"></mc-demo>
+    <mc-demo class="small" names="[[many_names]]"></mc-demo>
+    <mc-demo class="small" names="[[many_long_names]]"></mc-demo>
+  </div>
+
+  <div class="demo-block">
+    <mc-demo class="large" names="[[long_names]]"></mc-demo>
+    <mc-demo class="large" names="[[many_names]]"></mc-demo>
+    <mc-demo class="large" names="[[many_long_names]]"></mc-demo>
+  </div>
+
+</template>
+<script>
+
+function long_names() {
+  return [
+    "foo_bar very long name with spaces",
+    "the quick brown fox jumped over the lazy dog",
+    "supercalifragilisticexpialodcious/bar/foo/zod/longer/longer",
+  ];
+}
+
+function many_names() {
+  var out = [];
+  for (var i=0; i<20; i++) {
+    out.push("foo_bar-" + i);
+    out.push("bar_zod_bing-" + i);
+    out.push("lol-" + i);
+  }
+  return out;
+}
+
+function many_long_names() {
+  var out = [];
+  for (var i=0; i<20; i++) {
+    out.push("foo_bar very very very long some spaces though-" + i);
+    out.push("bar_zod_bing_bas_womp_wub_wub_dub_wub_wub-" + i);
+    out.push("rightly_to_be_great_is_not_to_stir_without_great_argument_but_greatly_to_find_quarrel_in_a_straw_when_honors_at_the_stake-" + i);
+  }
+  return out;
+}
+
+Polymer({
+  is: "x-demo",
+  properties: {
+  long_names: {type: Array, value: long_names},
+  many_names: {type: Array, value: many_names},
+  many_long_names: {type: Array, value: many_long_names},
+},
+});
+</script>
+</dom-module>
+
+<x-demo id="demo"></x-demo>
+</body>
+</html>
diff --git a/tensorflow/tensorboard/components/tf_dashboard_common_d3v4/tf-multi-checkbox.html b/tensorflow/tensorboard/components/tf_dashboard_common_d3v4/tf-multi-checkbox.html
new file mode 100644
index 0000000000000000000000000000000000000000..8a56616f820f19c15a0097051abfaad929332d65
--- /dev/null
+++ b/tensorflow/tensorboard/components/tf_dashboard_common_d3v4/tf-multi-checkbox.html
@@ -0,0 +1,160 @@
+<!--
+@license
+Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+-->
+
+<link rel="import" href="../polymer/polymer.html">
+<link rel="import" href="../iron-icons/iron-icons.html">
+<link rel="import" href="../paper-checkbox/paper-checkbox.html">
+<link rel="import" href="../paper-icon-button/paper-icon-button.html">
+<link rel="import" href="../paper-input/paper-input.html">
+<link rel="import" href="../tf-storage/tf-storage.html">
+<link rel="import" href="../tf-imports/lodash.html">
+<link rel="import" href="scrollbar-style.html">
+<link rel="import" href="run-color-style.html">
+
+<!--
+tf-multi-checkbox creates a list of checkboxes that can be used to toggle on or off
+a large number of values. Each checkbox displays a name, and may also have an
+associated tooltip value. Checkboxes can be highlighted, hidden, and re-ordered.
+
+tf-multi-checkbox assumes that the names may be very long compared to the width
+of the checkbox, and the number of names may also be very large, and works to
+handle these situations gracefully.
+-->
+<dom-module id="tf-multi-checkbox">
+  <style include="scrollbar-style"></style>
+  <style include="run-color-style"></style>
+
+  <template>
+      <paper-input
+        id="runs-regex"
+        no-label-float
+        label="Write a regex to filter runs"
+        value="[[regexInput]]"
+        on-bind-value-changed="_debouncedRegexChange"
+      ></paper-input>
+    <div id="outer-container" class="scrollbar">
+      <template
+        is="dom-repeat"
+        items="[[namesMatchingRegex]]"
+      >
+        <div
+          class="run-row"
+        >
+          <div class="icon-container checkbox-container vertical-align-container">
+            <paper-checkbox
+              class="checkbox vertical-align-center"
+              name="[[item]]"
+              checked$="[[_isChecked(item, runSelectionState.*)]]"
+              on-change="_checkboxChange"
+            ></paper-checkbox>
+
+          </div>
+          <div class="icon-container isolator-container vertical-align-container">
+            <paper-icon-button
+              icon="radio-button-unchecked"
+              class="isolator vertical-align-center"
+              on-tap="_isolateRun"
+              name="[[item]]"
+            ></paper-icon-button>
+          </div>
+          <div class="item-label-container">
+            <span>[[item]]</span>
+          </div>
+        </div>
+      </template>
+    </div>
+  <style>
+    paper-input {
+      --paper-input-container-focus-color: var(--tb-orange-strong);
+      --paper-input-container-input: {
+        font-size: 14px;
+      };
+      --paper-input-container-label: {
+        font-size: 14px;
+      };
+    }
+    :host {
+      display: flex;
+      flex-direction: column;
+      height: 100%;
+    }
+    #outer-container {
+      overflow-y: auto;
+      overflow-x: hidden;
+      width: 100%;
+      height: 0; /* Quirk to make firefox add scrolling instead of expand div */
+      flex-grow: 1;
+      flex-shrink: 1;
+      word-wrap: break-word;
+    }
+    .run-row {
+      padding-top: 5px;
+      padding-bottom: 5px;
+      display: flex;
+      flex-direction: row;
+      font-size: 13px;
+    }
+    .icon-container {
+      flex-grow: 0;
+      flex-shrink: 0;
+      padding-left: 2px;
+    }
+    .checkbox {
+      padding-left: 2px;
+      width: 18px;
+      height: 18px;
+    }
+    .isolator {
+      width: 18px;
+      height: 18px;
+      padding: 0px;
+    }
+    .isolator-container {
+      padding-left: 6px;
+      padding-right: 3px;
+    }
+    .checkbox-container {
+      padding-left: 2px;
+    }
+    .item-label-container {
+      padding-left: 5px;
+      flex-grow: 1;
+      flex-shrink: 1;
+      width: 0px; /* hack to get the flex-grow to work properly */
+    }
+    .tooltip-value-container {
+      display: flex;
+      justify-content: center;
+      flex-grow: 0;
+      flex-shrink: 0;
+      text-align:right;
+      padding-left: 2px;
+    }
+    .vertical-align-container {
+      display: flex;
+      justify-content: center;
+    }
+    .vertical-align-container .vertical-align-center {
+      align-self: center;
+    }
+    .vertical-align-container .vertical-align-top {
+      align-self: start;
+    }
+  </style>
+  </template>
+  <script src="tf-multi-checkbox-bundle.js"></script>
+</dom-module>
diff --git a/tensorflow/tensorboard/components/tf_dashboard_common_d3v4/tf-multi-checkbox.ts b/tensorflow/tensorboard/components/tf_dashboard_common_d3v4/tf-multi-checkbox.ts
new file mode 100644
index 0000000000000000000000000000000000000000..44a14a21cfeb2ae75804d9803035b76fa8e29d68
--- /dev/null
+++ b/tensorflow/tensorboard/components/tf_dashboard_common_d3v4/tf-multi-checkbox.ts
@@ -0,0 +1,206 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the 'License');
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an 'AS IS' BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+import * as _ from 'lodash';
+import * as storage from '../tf-storage/storage';
+
+Polymer({
+  is: 'tf-multi-checkbox',
+  properties: {
+    names: {
+      type: Array,
+      value: function() {
+        return [];
+      },
+    },  // All the runs in consideration
+    regexInput: {
+      type: String,
+      value: storage.getStringInitializer('regexInput', ''),
+      observer: '_regexInputObserver',
+    },  // Regex for filtering the runs
+    regex: {type: Object, computed: '_makeRegex(regexInput)'},
+    namesMatchingRegex: {
+      type: Array,
+      computed: 'computeNamesMatchingRegex(names.*, regex)'
+    },  // Runs that match the regex
+    runSelectionState: {
+      // if a run is explicitly enabled, True, if explicitly disabled, False.
+      // if undefined, default value (enable for first k runs, disable after).
+      type: Object,
+      value: storage.getObjectInitializer('runSelectionState', {}),
+      observer: '_storeRunToIsCheckedMapping',
+    },
+    // (Allows state to persist across regex filtering)
+    outSelected: {
+      type: Array,
+      notify: true,
+      computed: 'computeOutSelected(namesMatchingRegex.*, runSelectionState.*)'
+    },
+    colorScale: {
+      type: Object,
+      observer: 'synchronizeColors',
+    },  // map from run name to css class
+    maxRunsToEnableByDefault: {
+      // When TB first loads, if it has k or fewer runs, they are all enabled
+      // by default. If there are more, then they are all disabled.
+      type: Number,
+      value: 40,
+    },
+    _debouncedRegexChange: {
+      type: Object,
+      // Updating the regex can be slow, because it involves updating styles
+      // on a large number of Polymer paper-checkboxes. We don't want to do
+      // this while the user is typing, as it may make a bad, laggy UI.
+      // So we debounce the updates that come from user typing.
+      value: function() {
+        const _this = this;
+        var debounced = _.debounce(function(r) {
+          _this.regexInput = r;
+        }, 150, {leading: false});
+        return function() {
+          var r = this.$$('#runs-regex').value;
+          if (r == '') {
+            // If the user cleared the field, they may be done typing, so
+            // update more quickly.
+            this.async(function() {
+              _this.regexInput = r;
+            }, 30);
+          } else {
+            debounced(r);
+          };
+        };
+      },
+    },
+  },
+  listeners: {
+    'dom-change': 'synchronizeColors',
+  },
+  observers: [
+    '_setIsolatorIcon(runSelectionState, names)',
+  ],
+  _storeRunToIsCheckedMapping:
+      storage.getObjectObserver('runSelectionState', {}),
+  _makeRegex: function(regex) {
+    try {
+      return new RegExp(regex)
+    } catch (e) {
+      return null;
+    }
+  },
+  _setIsolatorIcon: function() {
+    var runMap = this.runSelectionState;
+    var numChecked = _.filter(_.values(runMap)).length;
+    var buttons =
+        Array.prototype.slice.call(this.querySelectorAll('.isolator'));
+
+    buttons.forEach(function(b) {
+      if (numChecked === 1 && runMap[b.name]) {
+        b.icon = 'radio-button-checked';
+      } else {
+        b.icon = 'radio-button-unchecked';
+      }
+    });
+  },
+  computeNamesMatchingRegex: function(__, ___) {
+    var regex = this.regex;
+    return this.names.filter(function(n) {
+      return regex == null || regex.test(n);
+    });
+  },
+  computeOutSelected: function(__, ___) {
+    var runSelectionState = this.runSelectionState;
+    var num = this.maxRunsToEnableByDefault;
+    var allEnabled = this.namesMatchingRegex.length <= num;
+    return this.namesMatchingRegex.filter(function(n, i) {
+      return runSelectionState[n] == null ? allEnabled : runSelectionState[n];
+    });
+  },
+  synchronizeColors: function(e) {
+    if (!this.colorScale) return;
+
+    this._setIsolatorIcon();
+
+    var checkboxes =
+        Array.prototype.slice.call(this.querySelectorAll('paper-checkbox'));
+    var scale = this.colorScale;
+    checkboxes.forEach(function(p) {
+      var color = scale.scale(p.name);
+      p.customStyle['--paper-checkbox-checked-color'] = color;
+      p.customStyle['--paper-checkbox-checked-ink-color'] = color;
+      p.customStyle['--paper-checkbox-unchecked-color'] = color;
+      p.customStyle['--paper-checkbox-unchecked-ink-color'] = color;
+    });
+    var buttons =
+        Array.prototype.slice.call(this.querySelectorAll('.isolator'));
+    buttons.forEach(function(p) {
+      var color = scale.scale(p.name);
+      p.style['color'] = color;
+    });
+    // The updateStyles call fails silently if the browser doesn't have focus,
+    // e.g. if TensorBoard was opened into a new tab that isn't visible.
+    // So we wait for requestAnimationFrame.
+    var _this = this;
+    window.requestAnimationFrame(function() {
+      _this.updateStyles();
+    });
+  },
+  _isolateRun: function(e) {
+    // If user clicks on the label for one run, enable it and disable all other
+    // runs.
+
+    var name = (Polymer.dom(e) as any).localTarget.name;
+    var selectionState = {};
+    this.names.forEach(function(n) {
+      selectionState[n] = n == name;
+    });
+    this.runSelectionState = selectionState;
+  },
+  _checkboxChange: function(e) {
+    var target = (Polymer.dom(e) as any).localTarget;
+    this.runSelectionState[target.name] = target.checked;
+    // n.b. notifyPath won't work because run names may have periods.
+    this.runSelectionState = _.clone(this.runSelectionState);
+  },
+  _isChecked: function(item, outSelectedChange) {
+    return this.outSelected.indexOf(item) != -1;
+  },
+  _regexInputObserver: storage.getStringObserver('regexInput', ''),
+  toggleAll: function() {
+    var _this = this;
+    var anyToggledOn = this.namesMatchingRegex.some(function(n) {
+      return _this.runSelectionState[n]
+    });
+
+
+    var runSelectionStateIsDefault =
+        Object.keys(this.runSelectionState).length == 0;
+
+    var defaultOff =
+        this.namesMatchingRegex.length > this.maxRunsToEnableByDefault;
+    // We have runs toggled either if some were explicitly toggled on, or if
+    // we are in the default state, and there are few enough that we default
+    // to toggling on.
+    anyToggledOn = anyToggledOn || runSelectionStateIsDefault && !defaultOff;
+
+    // If any are toggled on, we turn everything off. Or, if none are toggled
+    // on, we turn everything on.
+
+    var newRunsDisabled = {};
+    this.names.forEach(function(n) {
+      newRunsDisabled[n] = !anyToggledOn;
+    });
+    this.runSelectionState = newRunsDisabled;
+  },
+});
diff --git a/tensorflow/tensorboard/components/tf_dashboard_common_d3v4/tf-no-data-warning.html b/tensorflow/tensorboard/components/tf_dashboard_common_d3v4/tf-no-data-warning.html
new file mode 100644
index 0000000000000000000000000000000000000000..c90efac1d6b58debc6a39ae4ffafaeb3fb093da1
--- /dev/null
+++ b/tensorflow/tensorboard/components/tf_dashboard_common_d3v4/tf-no-data-warning.html
@@ -0,0 +1,129 @@
+<!--
+@license
+Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+-->
+
+<link rel="import" href="../polymer/polymer.html">
+
+<!--
+Display a warning when there is no data found.
+-->
+<dom-module id="tf-no-data-warning">
+  <template>
+    <template is="dom-if" if="[[showWarning]]">
+      <div class="warning">
+        <template is="dom-if" if="[[_isGraph(dataType)]]">
+          <h3>
+            No graph definition files were found.
+          </h3>
+          <p>
+            To store a graph, create a
+            <code>tf.summary.FileWriter</code>
+            and pass the graph either via the constructor, or by calling its
+            <code>add_graph()</code> method.
+            You may want to check out the
+            <a href="https://www.tensorflow.org/get_started/graph_viz">
+              graph visualizer tutorial
+            </a>.
+          </p>
+        </template>
+        <template is="dom-if" if="[[_isProjector(dataType)]]">
+          <h3>
+            No checkpoint was found.
+          </h3>
+          <p>
+            Probable causes:
+            <ul>
+              <li>
+                No checkpoint has been saved yet. Please refresh the page periodically.
+              </li>
+              <li>
+                You are not saving any checkpoint. To save your model,
+                create a
+                <a href="https://www.tensorflow.org/api_docs/python/tf/train/Saver">
+                  <code>tf.train.Saver</code>
+                </a>
+                and save your model periodically
+                by calling <code>saver.save(session, LOG_DIR/model.ckpt, step)</code>.
+              </li>
+            </ul>
+          </p>
+        </template>
+        <template is="dom-if" if="[[_isOther(dataType)]]">
+          <h3>
+            No <span>[[dataType]]</span> data was found.
+          </h3>
+          <p>
+            Probable causes:
+            <ul>
+              <li>
+                You haven't written any <span>[[dataType]]</span> data
+                to your event files.
+              </li>
+              <li>
+                TensorBoard can't find your event files.
+              </li>
+            </ul>
+          </p>
+        </template>
+        <p>
+          If you're new to using TensorBoard, and want to find out how to add
+          data and set up your event files, check out the
+          <a href="https://github.com/tensorflow/tensorflow/blob/master/tensorflow/tensorboard/README.md">
+            README
+          </a>
+          and perhaps the
+          <a href="https://www.tensorflow.org/get_started/summaries_and_tensorboard">
+            TensorBoard tutorial
+          </a>.
+        </p>
+
+        <p>
+          If you think TensorBoard is configured properly, please see the
+          <a href="https://github.com/tensorflow/tensorflow/blob/master/tensorflow/tensorboard/README.md#my-tensorboard-isnt-showing-any-data-whats-wrong">
+            section of the README devoted to missing data problems
+          </a>
+          and consider filing an issue on GitHub.
+        </p>
+
+      </div>
+    </template>
+    <style>
+      .warning {
+        max-width: 540px;
+        margin: 80px auto 0 auto;
+      }
+    </style>
+  </template>
+
+  <script>
+    Polymer({
+      is: "tf-no-data-warning",
+      properties: {
+        dataType: String,
+        showWarning: Boolean
+      },
+      _isGraph: function(dataType) {
+        return dataType === "graph";
+      },
+      _isProjector: function(dataType) {
+        return dataType === "projector";
+      },
+      _isOther: function(dataType) {
+        return !this._isGraph(dataType) && !this._isProjector(dataType);
+      }
+    });
+  </script>
+</dom-module>
diff --git a/tensorflow/tensorboard/components/tf_dashboard_common_d3v4/tf-option-selector.html b/tensorflow/tensorboard/components/tf_dashboard_common_d3v4/tf-option-selector.html
new file mode 100644
index 0000000000000000000000000000000000000000..547a558ad0b5da9305d88d2d678302be1f928f8b
--- /dev/null
+++ b/tensorflow/tensorboard/components/tf_dashboard_common_d3v4/tf-option-selector.html
@@ -0,0 +1,94 @@
+<!--
+@license
+Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+-->
+
+<link rel="import" href="../polymer/polymer.html">
+<link rel="import" href="tensorboard-color.html">
+
+<!--
+tf-option-selector is a simple component that has buttons as content and
+provides a "selectedId" property that is one of the IDs of the buttons inside it.
+-->
+<dom-module id="tf-option-selector">
+  <template>
+    <div id="wrap">
+      <h3>[[name]]</h3>
+      <div class="content-wrapper"><content></content></div>
+    </div>
+    <style>
+      .content-wrapper ::content > * {
+        width: 30%;
+        font-size: 13px;
+        background: none;
+        margin-top: 10px;
+        color: var(--tb-ui-dark-accent);
+      }
+
+      .content-wrapper ::content :first-of-type {
+        margin-left: 0;
+      }
+
+      .content-wrapper ::content .selected {
+        background-color: var(--tb-ui-dark-accent);
+        color: white!important;
+      }
+
+      h3 {
+        color: var(--paper-grey-800);
+        margin: 0;
+        font-weight: normal;
+        font-size: 14px;
+        margin-bottom: 5px;
+        display: block;
+        pointer-events: none;
+      }
+    </style>
+  </template>
+  <script>
+    Polymer({
+      is: "tf-option-selector",
+      properties: {
+        name: String,
+        selectedId: {
+          type: String,
+          notify: true,
+          observer: '_selectedIdChanged'
+        }
+      },
+      attached: function() {
+        this.async(function() {
+          this.getEffectiveChildren().forEach(function(node) {
+            this.listen(node, 'tap', '_selectTarget');
+          }.bind(this));
+        });
+      },
+      _selectTarget: function(e) {
+        this.selectedId = e.currentTarget.id;
+      },
+      _selectedIdChanged: function() {
+        var selected = this.queryEffectiveChildren('#' + this.selectedId);
+        if (!selected) {
+          return;
+        }
+
+        this.getEffectiveChildren().forEach(function(node) {
+          node.classList.remove("selected");
+        });
+        selected.classList.add("selected");
+      }
+    });
+  </script>
+</dom-module>
diff --git a/tensorflow/tensorboard/components/tf_dashboard_common_d3v4/tf-panes-helper.html b/tensorflow/tensorboard/components/tf_dashboard_common_d3v4/tf-panes-helper.html
new file mode 100644
index 0000000000000000000000000000000000000000..155259d3294bd1caf5cc59f91c56f304d12091a0
--- /dev/null
+++ b/tensorflow/tensorboard/components/tf_dashboard_common_d3v4/tf-panes-helper.html
@@ -0,0 +1,352 @@
+<!--
+@license
+Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+-->
+
+<link rel="import" href="../polymer/polymer.html">
+<link rel="import" href="tf-collapsable-pane.html">
+<link rel="import" href="tf-no-data-warning.html">
+<link rel="import" href="tf-chart-scaffold.html">
+
+<!--
+tf-panes-helper is a component that renders the contents of TensorBoard pages.
+It renders a tf-collapsable-pane for each category. Inside each category, the
+provided content template is rendered repeatedly for each tag within that
+category.
+
+This helper also incorporates an expand button and data download utility for
+each card.
+
+To use it, just specify a template inside tf-panes-helper that contains the
+code that will be replicated for each tag.
+
+<tf-panes-helper
+  categories="[[categories]]"
+  data-type="type"
+  data-provider="[[provider]]"
+  run2tag="[[run2tag]]"
+  selected-runs="[[selectedRuns]]"
+  >
+  <template>
+    <Code instantiated for each card>
+  </template>
+</tf-panes-helper>
+
+If you want for the template to be replicated for each tag and run, not only for
+each tag, you can set the repeatForRuns property to true.
+
+You can also set the showDownloadLinks property, which will show a menu with
+options to download JSON and CSV data. For this, you must also set the
+downloadLinkUrlFunction property to an appropriate value.
+
+@element tf-panes-helper
+-->
+<dom-module id="tf-panes-helper">
+  <template>
+    <content></content> <!-- User template will be put here -->
+    <tf-no-data-warning
+      data-type="[[dataType]]"
+      show-warning="[[dataNotFound]]"
+      ></tf-no-data-warning>
+
+    <template is="dom-repeat" items="[[categories]]" as="category">
+      <tf-collapsable-pane
+        name="[[category.name]]"
+        count="[[_count(category.tags, selectedRuns.*)]]"
+        >
+        <div class="layout horizontal wrap">
+          <template is="dom-repeat" items="[[_categoryCards(category, selectedRuns.*, run2tag.*)]]">
+              <div class="card">
+                <div class="card-title-container" style="border-color: [[_titleBorderColor(item.run)]]">
+                  <div class="card-title" inner-h-t-m-l="[[_break(item.tag)]]"></div>
+                  <template is="dom-if" if="[[repeatForRuns]]">
+                    <div class="card-subtitle" title="[[item.run]]">[[item.run]]</div>
+                  </template>
+                </div>
+                <div class="card-content">
+                  <tf-chart-scaffold
+                    tag="[[item.tag]]"
+                    data-provider="[[dataProvider]]"
+                    visible-series="[[item.runs]]"
+                    on-ready="_instantiateTemplate"
+                    >
+                    <!-- Instantiated template will be put here -->
+                  </tf-chart-scaffold>
+                </div>
+                <div class="card-bottom-row">
+                  <paper-icon-button
+                    class="expand-button"
+                    icon="fullscreen"
+                    on-tap="_toggleExpanded"
+                    ></paper-icon-button>
+                  <template is="dom-if" if="[[showDownloadLinks]]">
+                    <tf-downloader
+                      runs="[[item.runs]]"
+                      tag="[[item.tag]]"
+                      url-fn="[[downloadLinkUrlFunction]]"
+                      >
+                    </tf-downloader>
+                  </template>
+                </div>
+              </div>
+          </template>
+        </div>
+      </tf-collapsable-pane>
+    </template>
+
+    <style>
+      .card {
+        height: var(--card-height, 200px);
+        width: var(--card-width, 300px);
+        display: flex;
+        flex-direction: column;
+        margin: 5px;
+        padding: var(--card-padding, 0 30px 35px 0);
+        -webkit-user-select: none;
+        -moz-user-select: none;
+        position: relative;
+      }
+
+      .card-expanded {
+        height: var(--card-expanded-height, 400px);
+        width: var(--card-expanded-width, 100%);
+      }
+
+      .card-title, .card-subtitle {
+        flex-grow: 0;
+        flex-shrink: 0;
+        font-size: 14px;
+        text-overflow: ellipsis;
+        overflow: hidden;
+      }
+
+      .card-subtitle {
+        font-size: 12px;
+      }
+
+      .card-content {
+        flex-grow: 1;
+        flex-shrink: 1;
+        display: flex;
+        margin-top: 10px;
+      }
+
+      .card-bottom-row {
+        position: absolute;
+        left: 0px;
+        bottom: 0px;
+        width: 100%;
+        display: flex;
+        flex-direction: row;
+        justify-content: space-between;
+        pointer-events: none;
+      }
+
+      .card-title-container {
+        border-left: 4px solid;
+        padding-left: 5px;
+      }
+
+      .expand-button {
+        color: #2196F3;
+        width: 32px;
+        height: 32px;
+        padding: 4px;
+        border-radius: 100%;
+        pointer-events: auto;
+        display: var(--show-expand-button, block);
+      }
+
+      .card-expanded .expand-button {
+        background: var(--tb-ui-light-accent);
+      }
+
+      tf-downloader {
+        margin-right: 30px;
+        pointer-events: auto;
+      }
+
+    </style>
+  </template>
+  <script>
+    Polymer({
+      is: "tf-panes-helper",
+      properties: {
+        /**
+         * Categories that separate the template instances. Each category will
+         * be given its own collapsible pane. The category must be an array of
+         * objects, each with a 'name' property and a 'tags' array of strings.
+         */
+        categories: Array,
+
+        /**
+         * Input of the colors that are used for the user's runs.
+         */
+        colorScale: Object,
+
+        /**
+         * The name of the data type that is used by this dashboard. This will
+         * be used to display what is missing when there is no data available.
+         */
+        dataType: String,
+
+        /**
+         * The function that requests and returns a promise with the data of the
+         * required type for the templates from the backend.
+         */
+        dataProvider: Object,
+
+        /**
+         * If false, instantiates one template for each tag and calls
+         * setVisibleSeries on the first element of the template with all valid
+         * runs the tag has. If true, instantiates one template for each run of
+         * each tag, and calls setVisibleSeries of the first element of the
+         * instantiated template with just the one run.
+         */
+        repeatForRuns: {
+          type: Boolean,
+          value: false
+        },
+
+        /**
+         * Map from runs to the valid tags that have them.
+         */
+        run2tag: Object,
+
+        /**
+         * Array with the runs that are selected by the user (i.e. valid to be
+         * displayed).
+         */
+        selectedRuns: Array,
+
+        /**
+         * If true, shows a menu with download links for the template data.
+         * If this is set to true, urlFn must also be provided.
+         */
+        showDownloadLinks: Boolean,
+
+        /**
+         * Function that returns the route to get data to download. Must be
+         * provided if showDownloadLinks is enabled.
+         */
+        downloadLinkUrlFunction: Function,
+        _contentTemplate: {
+          type: Object,
+          value: null
+        },
+        _stampedTemplates: {
+          type: Array,
+          value: function() { return [] }
+        }
+      },
+      behaviors: [
+        Polymer.Templatizer,
+      ],
+
+      /**
+       * Initializes the Polymer.Templatizer behavior with the template supplied
+       * by the user. With this, all calls to this.stamp() will produce an
+       * instance of the user template.
+       */
+      _initTemplatizer: function() {
+        if (!this._contentTemplate) {
+          // First template is used as the content.
+          this._contentTemplate = Polymer.dom(this).querySelector('template');
+          this.templatize(this._contentTemplate);
+        }
+      },
+
+      /**
+       * Called every time a tf-chart-scaffold is ready, stamps the user
+       * template inside the scaffold element (before it is attached) and
+       * stores the stamped template in an array to use for data binding
+       * (forwardParentProp/Path).
+       */
+      _instantiateTemplate: function(e) {
+        var scaffold = e.target;
+        this._initTemplatizer();
+        var instance = this.stamp();
+        this._stampedTemplates.push(instance);
+        Polymer.dom(scaffold).appendChild(instance.root);
+      },
+      _toggleExpanded: function(e) {
+        var currentTarget = Polymer.dom(e.currentTarget);
+        var card = currentTarget.node.closest('.card');
+        var scaffold = card.querySelector('tf-chart-scaffold');
+        card.classList.toggle('card-expanded');
+        scaffold.chart().redraw();
+      },
+      _count: function(tags) {
+        if (!this.repeatForRuns) {
+          return tags.length;
+        }
+
+        var targetTags = d3.set(tags);
+        var count = 0;
+        this.selectedRuns.forEach(function(r) {
+          this.run2tag[r].forEach(function(t) {
+            if (targetTags.has(t)) {
+              count++;
+            }
+          });
+        }.bind(this));
+        return count;
+      },
+      _categoryCards: function(category) {
+        var cards = [];
+        category.tags.forEach(function(tag) {
+          var runs = this.selectedRuns.filter(function(r) {
+            return this.run2tag[r] && this.run2tag[r].indexOf(tag) !== -1;
+          }.bind(this));
+
+          if (this.repeatForRuns) {
+            runs.forEach(function(run) {
+              cards.push({tag: tag, run: run, runs: [run]});
+            });
+          } else {
+            cards.push({tag: tag, runs: runs});
+          }
+        }.bind(this));
+
+        return cards;
+      },
+      _titleBorderColor: function(run) {
+        return this.repeatForRuns ? this.colorScale.scale(run) : 'white';
+      },
+
+      /*
+       * Polymer data binding forwarding functions. Check the
+       * Polymer.Templatizer documentation for more information.
+       */
+
+      _forwardParentProp: function(property, value) {
+        this._stampedTemplates.forEach(function(instance) {
+          instance[property] = value;
+        });
+      },
+      _forwardParentPath: function(path, value) {
+        this._stampedTemplates.forEach(function(instance) {
+          instance.notifyPath(path, value, true);
+        });
+      },
+      // TODO(renatoutsch): implement the instance forwarding for two-way data
+      // binding.
+      // Add breaks to input so it will wrap nicely
+      _break: function(ipt) {
+        return ipt.replace(/([\/_-])/g, "$1<wbr>")
+      },
+    });
+  </script>
+</dom-module>
diff --git a/tensorflow/tensorboard/components/tf_dashboard_common_d3v4/tf-regex-group-demo.html b/tensorflow/tensorboard/components/tf_dashboard_common_d3v4/tf-regex-group-demo.html
new file mode 100644
index 0000000000000000000000000000000000000000..3565fec17912437897ec6b3ec509d48fed10645a
--- /dev/null
+++ b/tensorflow/tensorboard/components/tf_dashboard_common_d3v4/tf-regex-group-demo.html
@@ -0,0 +1,45 @@
+<!doctype html>
+<!--
+@license
+Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+-->
+
+<html>
+ <head>
+   <link rel="import" href="tf-regex-group.html">
+ </head>
+ <body>
+  <style>
+  .container {
+    width: 255px;
+    padding: 10px;
+    border: 1px solid #3f51b5;
+    border-radius: 5px;
+  }
+  :host {
+    margin: 0px;
+  }
+  </style>
+  <template id="page-template" is="dom-bind">
+    <div class="container">
+      <tf-regex-group regexes="{{regexes}}" id="demo"></tf-regex-group>
+    </div>
+    <p> Regexes:</p>
+    <template is="dom-repeat" items="[[regexes]]">
+      <p>"<span>[[item]]</span>"</p>
+    </template>
+  </template>
+ </body>
+</html>
diff --git a/tensorflow/tensorboard/components/tf_dashboard_common_d3v4/tf-regex-group.html b/tensorflow/tensorboard/components/tf_dashboard_common_d3v4/tf-regex-group.html
new file mode 100644
index 0000000000000000000000000000000000000000..e68b306ee33b5e57a1125c7cd9d1b687ae16202e
--- /dev/null
+++ b/tensorflow/tensorboard/components/tf_dashboard_common_d3v4/tf-regex-group.html
@@ -0,0 +1,99 @@
+<!--
+@license
+Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+-->
+
+<link rel="import" href="../polymer/polymer.html">
+<link rel="import" href="../paper-icon-button/paper-icon-button.html">
+<link rel="import" href="../iron-icons/iron-icons.html">
+<link rel="import" href="../paper-toggle-button/paper-toggle-button.html">
+<link rel="import" href="../paper-input/paper-input.html">
+<link rel="import" href="../tf-storage/tf-storage.html">
+
+<!--
+`tf-regex-group` provides an input component for a group of regular expressions.
+
+Example:
+  <tf-regex-group regexes="{{regexes}}"></tf-regex-group>
+
+It contains a series of regular expression input fields. From this, it computes
+`regexes', an array in which every element is either a string representing a
+valid, nonempty regular expression, or the value `null`
+
+Public Properties:
+`regexes` a readonly, notifying array of strings, where each string is a regex
+
+It maintains an invariant that the final regex should always be an empty string,
+so the user can easily add more regular expressions. It does this by adding
+a new empty regex when the final one is nonempty.
+
+Pressing "enter" moves focus to the next regex (or just blurs if there are no
+more regexes).
+-->
+<dom-module id="tf-regex-group">
+  <template>
+    <div class="regex-list">
+      <template is="dom-repeat" items="{{rawRegexes}}">
+        <div class="regex-line">
+          <paper-input
+            id="text-input"
+            class="regex-input"
+            label="Write a regex to create a tag group"
+            no-label-float
+            value="{{item.regex}}"
+            invalid="[[!item.valid]]"
+            on-keyup="moveFocus"
+          ></paper-input>
+          <paper-icon-button
+            icon="close"
+            class="delete-button"
+            aria-label="Delete Regex"
+            tabindex="0"
+            on-tap="deleteRegex"
+          ></paper-icon-button>
+        </div>
+        <style>
+          .regex-input {
+            width: 250px;
+            display: inline-block;
+            margin-left: -3px;
+          }
+
+          .delete-button {
+            color: var(--paper-grey-700);
+            width: 40px;
+            height: 40px;
+            margin-right: -10px;
+          }
+
+          .regex-list {
+            margin-bottom: 10px;
+          }
+
+          paper-input {
+            --paper-input-container-focus-color: var(--tb-orange-strong);
+            --paper-input-container-input: {
+              font-size: 14px;
+            };
+            --paper-input-container-label: {
+              font-size: 14px;
+            };
+          }
+        </style>
+      </template>
+    </div>
+  </template>
+  <script src="tf-regex-group-bundle.js"></script>
+</dom-module>
diff --git a/tensorflow/tensorboard/components/tf_dashboard_common_d3v4/tf-regex-group.ts b/tensorflow/tensorboard/components/tf_dashboard_common_d3v4/tf-regex-group.ts
new file mode 100644
index 0000000000000000000000000000000000000000..92a0eb6a0b9d0738369ff89356e3c49336e2fb27
--- /dev/null
+++ b/tensorflow/tensorboard/components/tf_dashboard_common_d3v4/tf-regex-group.ts
@@ -0,0 +1,86 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+import * as storage from '../tf-storage/storage';
+
+Polymer({
+  is: 'tf-regex-group',
+  properties: {
+    rawRegexes: {
+      type: Array,
+      value: storage.getObjectInitializer(
+          'rawRegexes', [{regex: '', valid: true}]),
+    },
+    regexes:
+        {type: Array, computed: 'usableRegexes(rawRegexes.*)', notify: true},
+  },
+  observers: [
+    'addNewRegexIfNeeded(rawRegexes.*)',
+    'checkValidity(rawRegexes.*)',
+    '_uriStoreRegexes(rawRegexes.*)',
+  ],
+  _uriStoreRegexes:
+      storage.getObjectObserver('rawRegexes', [{regex: '', valid: true}]),
+  checkValidity: function(x) {
+    var match = x.path.match(/rawRegexes\.(\d+)\.regex/);
+    if (match) {
+      var idx = match[1];
+      this.set('rawRegexes.' + idx + '.valid', this.isValid(x.value));
+    }
+  },
+  isValid: function(s) {
+    try {
+      new RegExp(s);
+      return true;
+    } catch (e) {
+      return false;
+    }
+  },
+  usableRegexes: function(regexes) {
+    var isValid = this.isValid;
+    return regexes.base
+        .filter(function(r) {
+          // Checking validity here (rather than using the data property)
+          // is necessary because otherwise we might send invalid regexes due
+          // to the fact that this function can call before the observer does
+          return r.regex !== '' && isValid(r.regex);
+        })
+        .map(function(r) {
+          return r.regex;
+        });
+  },
+  addNewRegexIfNeeded: function() {
+    var last = this.rawRegexes[this.rawRegexes.length - 1];
+    if (last.regex !== '') {
+      this.push('rawRegexes', {regex: '', valid: true});
+    }
+  },
+  deleteRegex: function(e) {
+    if (this.rawRegexes.length > 1) {
+      this.splice('rawRegexes', e.model.index, 1);
+    }
+  },
+  moveFocus: function(e) {
+    if (e.keyCode === 13) {
+      var idx = e.model.index;
+      var inputs = Polymer.dom(this.root).querySelectorAll('.regex-input');
+      if (idx < this.rawRegexes.length - 1) {
+        (inputs[idx + 1] as any).$.input.focus();
+      } else {
+        (document.activeElement as HTMLElement).blur();
+      }
+    }
+  }
+});
diff --git a/tensorflow/tensorboard/components/tf_dashboard_common_d3v4/tf-run-selector.html b/tensorflow/tensorboard/components/tf_dashboard_common_d3v4/tf-run-selector.html
new file mode 100644
index 0000000000000000000000000000000000000000..e3d8a91fd0c2e64650ebbac0fcb6448ffadc9f52
--- /dev/null
+++ b/tensorflow/tensorboard/components/tf_dashboard_common_d3v4/tf-run-selector.html
@@ -0,0 +1,188 @@
+<!--
+@license
+Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+-->
+
+<link rel="import" href="../polymer/polymer.html">
+<link rel="import" href="../paper-button/paper-button.html">
+<link rel="import" href="../paper-dialog/paper-dialog.html">
+<link rel="import" href="tf-multi-checkbox.html">
+<link rel="import" href="scrollbar-style.html">
+
+<!--
+tf-run-selector creates a set of checkboxes to display which runs are selected.
+It also displays tooltips.
+
+Properties in:
+- runs: Array of strings representing the runs that may be selected
+- colorScale: a TF.ColorScale mapping run names to colors
+
+Properties out:
+- outSelected: The array of run names that are currently checked by the user.
+
+-->
+<dom-module id="tf-run-selector">
+  <template>
+    <paper-dialog with-backdrop id="logdir-dialog">
+      <h2>logdir</h2>
+      <div inner-h-t-m-l="{{_breakString(logdir)}}"></div>
+    </paper-dialog>
+    <div id="top-text">
+      <h3 id="tooltip-help" class="tooltip-container">
+        Runs
+      </h3>
+    </div>
+    <tf-multi-checkbox
+      id="multiCheckbox"
+      names="[[runs]]"
+      out-selected="{{outSelected}}"
+      color-scale="[[colorScale]]"
+    ></tf-multi-checkbox>
+    <paper-button
+      class="x-button"
+      id="toggle-all"
+      on-tap="_toggleAll"
+    >
+    Toggle All Runs
+    </paper-button>
+    <template
+      is="dom-if"
+      if="[[logdir]]">
+      <div id="logdir">
+        <span id="clipped-logdir" inner-h-t-m-l="[[_clippedLogdir]]"></span><!--
+          We use HTML comments to remove spaces before the ellipsis.
+        --><template
+                     is="dom-if"
+                     if="[[_shouldShowExpandLogdirButton(logdir, _logdirClipLength)]]"><!--
+          --><a href="" on-click="_openLogdirDialog">…</a>
+        </template>
+      </div>
+    </template>
+    <style>
+      :host {
+        display: flex;
+        flex-direction: column;
+        padding-bottom: 10px;
+        box-sizing: border-box;
+      }
+      #top-text {
+        width: 100%;
+        flex-grow: 0;
+        flex-shrink: 0;
+        padding-right: 16px;
+        box-sizing: border-box;
+        color: var(--paper-grey-800);
+      }
+      tf-multi-checkbox {
+        display: flex;
+        flex-grow: 1;
+        flex-shrink: 1;
+      }
+      .x-button {
+        font-size: 13px;
+        background-color: var(--tb-ui-light-accent);
+        color: var(--tb-ui-dark-accent);
+      }
+      #tooltip-help {
+        color: var(--paper-grey-800);
+        margin: 0;
+        font-weight: normal;
+        font-size: 14px;
+        margin-bottom: 5px;
+      }
+      paper-button {
+        margin-left: 0;
+      }
+      #logdir {
+        color: var(--tb-ui-dark-accent);
+        font-size: 13px;
+        margin: 5px 0 0 0;
+        max-width: 288px;
+      }
+    </style>
+  </template>
+  <script>
+  Polymer({
+    is: "tf-run-selector",
+    properties: {
+      backend: Object,
+      outSelected: {type: Array, notify: true},
+      // runs: an array of strings, representing the run names that may be chosen
+      runs: Array,
+      colorScale: Object, // TF.ColorScale
+      logdir: {
+        type: String,
+        notify: true,
+      },
+      // This is the potentially clipped portion of the logdir we show at the bottom of the sidebar.
+      _clippedLogdir: {
+        type: String,
+      },
+      _logdirClipLength: {
+        type: Number,
+        value: 250,
+        readOnly: true,
+      },
+    },
+    observers: [
+      "_onBackendUpdate(backend)",
+      "_logdirSet(logdir)",
+    ],
+    _toggleAll: function() {
+      this.$.multiCheckbox.toggleAll();
+    },
+    // Break the string at natural points, including commas, equals, and slashes
+    _breakString: function(originalString) {
+      return originalString.replace(/([\/=\-_,])/g, "$1<wbr>");
+    },
+    _onBackendUpdate: function(backend) {
+      if (backend === undefined) {
+        return;
+      }
+
+      // When the backend is set, the selector can request the logdir.
+      backend.logdir().then(logdirObject => {
+        this.set('logdir', logdirObject.logdir);
+      }).catch(e => {
+        // Fetching the logdir failed. Prevent the exception from logging to
+        // console. The console already logs a 404 network event.
+      });
+    },
+    _logdirSet: function(logdir) {
+      if (logdir === undefined) {
+        // The logdir has not been set yet.
+        return;
+      }
+
+      var lineBrokenText;
+      if (logdir.length > this._logdirClipLength) {
+        // Clip the logdir to avoid blocking the runs selector. Let the user view a more full
+        // version of the logdir.
+        lineBrokenText = this._breakString(logdir.substring(0, this._logdirClipLength));
+      } else {
+        lineBrokenText = this._breakString(logdir);
+      }
+      this.set('_clippedLogdir', lineBrokenText);
+    },
+    _openLogdirDialog: function(event) {
+      event.preventDefault();
+      this.$$('#logdir-dialog').open();
+    },
+    _shouldShowExpandLogdirButton(logdir, _logdirClipLength) {
+      return logdir && logdir.length > _logdirClipLength;
+    },
+  });
+  </script>
+</dom-module>
diff --git a/tensorflow/tensorboard/components/tf_dashboard_common_d3v4/tf-sidebar-helper.html b/tensorflow/tensorboard/components/tf_dashboard_common_d3v4/tf-sidebar-helper.html
new file mode 100644
index 0000000000000000000000000000000000000000..5eb8537040ccef6e8fa76f31c80b85dea795dfdd
--- /dev/null
+++ b/tensorflow/tensorboard/components/tf_dashboard_common_d3v4/tf-sidebar-helper.html
@@ -0,0 +1,165 @@
+<!--
+@license
+Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+-->
+
+<link rel="import" href="../polymer/polymer.html">
+<link rel="import" href="../tf-imports/lodash.html">
+<link rel="import" href="tf-categorizer.html">
+<link rel="import" href="tf-run-selector.html">
+
+<!--
+tf-sidebar-helper is a component that renders a sidebar for configuration
+components, like the tf-categorizer and the tf-run-selector. The component can
+also be extended with more options useful to the dashboards.
+
+To use it, create the tf-sidebar-helper with the required properties. To extend
+it with extra configuration components, add them to the element's component:
+
+<tf-sidebar-helper
+  backend: "[[backend]]",
+  categories: "{{outputCategories}}",
+  colorScale: "[[colorScale]]",
+  run2tag: "[[run2tag]]",
+  runs: "[[runs]]",
+  selectedRuns: "{{outSelectedRuns}}",
+  >
+  <div class="extend-first-section">
+    <my options>
+  </div>
+  <div class="sidebar-section">
+    <my options>
+  </div>
+  ...
+</tf-sidebar-helper>
+
+Elements inside the .extend-first-section div will be put on the first section
+of the sidebar, while the rest of the divs will be put after it and before the
+tf-run-selector.
+
+@element tf-sidebar-helper
+-->
+<dom-module id="tf-sidebar-helper">
+  <template>
+    <div class="sidebar-section">
+      <tf-categorizer
+        id="categorizer"
+        tags="[[tags]]"
+        categories="{{categories}}"
+        ></tf-categorizer>
+      <content select=".extend-first-section"></content>
+    </div>
+    <content></content>
+    <div class="sidebar-section">
+      <tf-run-selector
+        id="runSelector"
+        backend="[[backend]]"
+        runs="[[runs]]"
+        color-scale="[[colorScale]]"
+        out-selected="{{selectedRuns}}"
+        ></tf-run-selector>
+    </div>
+    <style include="dashboard-style"></style>
+    <style>
+      :host {
+        display: flex;
+        flex-direction: column;
+        height: 100%;
+      }
+
+      #categorizer {
+        flex-shrink: 0;
+      }
+
+      #runSelector {
+        flex-shrink: 1;
+        flex-grow: 1;
+      }
+
+      .sidebar-section {
+        border-top: solid 1px rgba(0, 0, 0, 0.12);
+        padding: 20px 0px 20px 30px;
+      }
+
+      .sidebar-section:first-child {
+        border: none;
+      }
+
+      .sidebar-section:last-child {
+        flex-grow: 1;
+        display: flex;
+      }
+
+      paper-checkbox {
+        --paper-checkbox-checked-color: var(--tb-ui-dark-accent);
+        --paper-checkbox-unchecked-color: var(--tb-ui-dark-accent);
+        font-size: 14px;
+      }
+    </style>
+  </template>
+  <script>
+    Polymer({
+      is: "tf-sidebar-helper",
+      properties: {
+        /**
+         * The backend object used to issue requests.
+         */
+        backend: Object,
+
+        /**
+         * This is an output of the categories that the user selected to
+         * separate the different tags. Each category here should be given its
+         * own collapsible pane.
+         */
+        categories: {
+          type: Array,
+          notify: true,
+        },
+
+        /**
+         * Input of the colors that are used for the user's runs.
+         */
+        colorScale: Object,
+
+        /**
+         * Map from runs to the valid tags that have them.
+         */
+        run2tag: Object,
+
+        /**
+         * Input of all valid runs that can be selected by the user.
+         */
+        runs: Array,
+
+        /**
+         * Outputs an array with the runs that are selected by the user (i.e.
+         * valid to be displayed).
+         */
+        selectedRuns: {
+          type: Array,
+          notify: true,
+        },
+
+        tags: {
+          type: Array,
+          computed: "_getTags(run2tag.*)"
+        },
+      },
+      _getTags: function() {
+        return _.union.apply(null, _.values(this.run2tag));
+      },
+    })
+  </script>
+</dom-module>
diff --git a/tensorflow/tensorboard/components/tf_distribution_dashboard/BUILD b/tensorflow/tensorboard/components/tf_distribution_dashboard/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..bff6ed6d7d84ceb141410282f19934cdf86ac2f7
--- /dev/null
+++ b/tensorflow/tensorboard/components/tf_distribution_dashboard/BUILD
@@ -0,0 +1,63 @@
+package(default_visibility = ["//tensorflow:internal"])
+
+load("@io_bazel_rules_closure//closure:defs.bzl", "web_library")
+load("//tensorflow/tensorboard:defs.bzl", "tensorboard_ts_library")
+load("//tensorflow/tensorboard:defs.bzl", "tensorboard_webcomponent_library")
+
+licenses(["notice"])  # Apache 2.0
+
+web_library(
+    name = "tf_distribution_dashboard",
+    srcs = [
+        "tf-distribution-dashboard.html",
+    ],
+    path = "/tf-distribution-dashboard",
+    deps = [
+        "//tensorflow/tensorboard/components/tf_backend",
+        "//tensorflow/tensorboard/components/tf_color_scale",
+        "//tensorflow/tensorboard/components/tf_dashboard_common",
+        "//tensorflow/tensorboard/components/tf_imports:lodash",
+        "//tensorflow/tensorboard/components/vz_distribution_chart",
+        "@org_polymer",
+        "@org_polymer_iron_collapse",
+        "@org_polymer_paper_icon_button",
+        "@org_polymer_paper_styles",
+    ],
+)
+
+filegroup(
+    name = "all_files",
+    srcs = glob(["**"]),
+    tags = ["notsan"],
+)
+
+################################################################################
+# MARKED FOR DELETION
+
+tensorboard_webcomponent_library(
+    name = "legacy",
+    srcs = [
+        "tf-distribution-dashboard.html",
+        ":legacy_ts",
+    ],
+    destdir = "tf-distribution-dashboard",
+    deps = [
+        "//tensorflow/tensorboard/components:tf_imports",
+        "//tensorflow/tensorboard/components/tf_backend:legacy",
+        "//tensorflow/tensorboard/components/tf_dashboard_common:legacy",
+        "//tensorflow/tensorboard/components/vz_distribution_chart:legacy",
+        "//third_party/javascript/polymer/v1/iron-collapse:lib",
+        "//third_party/javascript/polymer/v1/paper-icon-button:lib",
+        "//third_party/javascript/polymer/v1/paper-styles:lib",
+        "//third_party/javascript/polymer/v1/polymer:lib",
+    ],
+)
+
+# This is needed: components/BUILD seeks a legacy_ts rule in this package.
+tensorboard_ts_library(
+    name = "legacy_ts",
+    srcs = [],
+    deps_mgmt = "off",
+    runtime = "nodejs",
+    deps = ["//tensorflow/tensorboard/components:common_deps"],
+)
diff --git a/tensorflow/tensorboard/components/tf_distribution_dashboard/demo/BUILD b/tensorflow/tensorboard/components/tf_distribution_dashboard/demo/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..e512d1a5e2fc9e305ee3913ace0e6b1f7b7a20ac
--- /dev/null
+++ b/tensorflow/tensorboard/components/tf_distribution_dashboard/demo/BUILD
@@ -0,0 +1,26 @@
+package(default_visibility = ["//tensorflow:internal"])
+
+load("@io_bazel_rules_closure//closure:defs.bzl", "web_library")
+
+licenses(["notice"])  # Apache 2.0
+
+# bazel run //third_party/tensorflow/tensorboard/components/tf_distribution_dashboard/demo
+web_library(
+    name = "demo",
+    srcs = ["index.html"],
+    path = "/tf-distribution-dashboard/demo",
+    deps = [
+        "//tensorflow/tensorboard/components/tf_distribution_dashboard",
+        "//tensorflow/tensorboard/components/tf_distribution_dashboard/demo/data",
+        "//tensorflow/tensorboard/components/tf_imports:d3",
+        "@org_polymer_iron_demo_helpers",
+        "@org_polymer_paper_styles",
+        "@org_polymer_webcomponentsjs",
+    ],
+)
+
+filegroup(
+    name = "all_files",
+    srcs = glob(["**"]),
+    tags = ["notsan"],
+)
diff --git a/tensorflow/tensorboard/components/tf_distribution_dashboard/demo/data/BUILD b/tensorflow/tensorboard/components/tf_distribution_dashboard/demo/data/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..a4055f34e334a1165579e2be6854fd0fe8dcb0ac
--- /dev/null
+++ b/tensorflow/tensorboard/components/tf_distribution_dashboard/demo/data/BUILD
@@ -0,0 +1,17 @@
+package(default_visibility = ["//tensorflow:internal"])
+
+load("@io_bazel_rules_closure//closure:defs.bzl", "web_library")
+
+licenses(["notice"])  # Apache 2.0
+
+web_library(
+    name = "data",
+    srcs = glob(["*"]),
+    path = "/tf-distribution-dashboard/demo/data",
+)
+
+filegroup(
+    name = "all_files",
+    srcs = glob(["**"]),
+    tags = ["notsan"],
+)
diff --git a/tensorflow/tensorboard/components/tf_distribution_dashboard_d3v4/BUILD b/tensorflow/tensorboard/components/tf_distribution_dashboard_d3v4/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..ace02adfba51f3397e52f5d4826f74129ffc9fce
--- /dev/null
+++ b/tensorflow/tensorboard/components/tf_distribution_dashboard_d3v4/BUILD
@@ -0,0 +1,41 @@
+package(default_visibility = ["//tensorflow:internal"])
+
+load("@io_bazel_rules_closure//closure:defs.bzl", "web_library")
+
+licenses(["notice"])  # Apache 2.0
+
+web_library(
+    name = "tf_distribution_dashboard_d3v4",
+    srcs = ["tf-distribution-dashboard.html"],
+    path = "/tf-distribution-dashboard",
+    deps = [
+        "//tensorflow/tensorboard/components/tf_backend_d3v4",
+        "//tensorflow/tensorboard/components/tf_color_scale_d3v4",
+        "//tensorflow/tensorboard/components/tf_dashboard_common_d3v4",
+        "//tensorflow/tensorboard/components/tf_imports_d3v4:lodash",
+        "//tensorflow/tensorboard/components/vz_distribution_chart_d3v4",
+        "@org_polymer",
+        "@org_polymer_iron_collapse",
+        "@org_polymer_paper_icon_button",
+        "@org_polymer_paper_styles",
+    ],
+)
+
+web_library(
+    name = "demo",
+    srcs = ["index.html"] + glob(["data/**"]),
+    path = "/tf-distribution-dashboard",
+    deps = [
+        ":tf_distribution_dashboard_d3v4",
+        "//tensorflow/tensorboard/components/tf_imports_d3v4:d3",
+        "@org_polymer_iron_demo_helpers",
+        "@org_polymer_paper_styles",
+        "@org_polymer_webcomponentsjs",
+    ],
+)
+
+filegroup(
+    name = "all_files",
+    srcs = glob(["**"]),
+    tags = ["notsan"],
+)
diff --git a/tensorflow/tensorboard/components/tf_distribution_dashboard_d3v4/data/compressedHistograms_run_run1_tag_histo1.json b/tensorflow/tensorboard/components/tf_distribution_dashboard_d3v4/data/compressedHistograms_run_run1_tag_histo1.json
new file mode 100644
index 0000000000000000000000000000000000000000..a6765285b14c1c12692b5d9346b71a46e1b7d515
--- /dev/null
+++ b/tensorflow/tensorboard/components/tf_distribution_dashboard_d3v4/data/compressedHistograms_run_run1_tag_histo1.json
@@ -0,0 +1,212 @@
+[
+    [
+        0.0,
+        0,
+        [
+            [
+                0,
+                -2.3150592308536755
+            ],
+            [
+                668,
+                -2.0967547155036605
+            ],
+            [
+                1587,
+                -1.4326244423655616
+            ],
+            [
+                3085,
+                -0.8871306575801902
+            ],
+            [
+                5000,
+                -0.09312398815580714
+            ],
+            [
+                6915,
+                0.2584093405812282
+            ],
+            [
+                8413,
+                0.8895470642005087
+            ],
+            [
+                9332,
+                1.3198979614453679
+            ],
+            [
+                10000,
+                1.6793308878855118
+            ]
+        ]
+    ],
+    [
+        100.0,
+        10,
+        [
+            [
+                0,
+                -1.3417572789138936
+            ],
+            [
+                668,
+                -1.183563374619141
+            ],
+            [
+                1587,
+                -0.48920418783271574
+            ],
+            [
+                3085,
+                0.29326906896076954
+            ],
+            [
+                5000,
+                0.56953784145381
+            ],
+            [
+                6915,
+                0.8684655583499333
+            ],
+            [
+                8413,
+                1.4133127368907181
+            ],
+            [
+                9332,
+                1.906140650457873
+            ],
+            [
+                10000,
+                2.135771998171255
+            ]
+        ]
+    ],
+    [
+        200.0,
+        20,
+        [
+            [
+                0,
+                -1.5066917525035333
+            ],
+            [
+                668,
+                -1.3910909571770793
+            ],
+            [
+                1587,
+                -0.902737218885874
+            ],
+            [
+                3085,
+                -0.3807791904765027
+            ],
+            [
+                5000,
+                0.38900200905253046
+            ],
+            [
+                6915,
+                0.8209734209339482
+            ],
+            [
+                8413,
+                1.302385856695965
+            ],
+            [
+                9332,
+                1.9324626053521639
+            ],
+            [
+                10000,
+                2.957505317875451
+            ]
+        ]
+    ],
+    [
+        300.0,
+        30,
+        [
+            [
+                0,
+                -0.5430457051469562
+            ],
+            [
+                668,
+                -0.4626161834245273
+            ],
+            [
+                1587,
+                0.21573949543027715
+            ],
+            [
+                3085,
+                0.37353741100174215
+            ],
+            [
+                5000,
+                0.6891407881591103
+            ],
+            [
+                6915,
+                1.0927156232630852
+            ],
+            [
+                8413,
+                1.2745337159550916
+            ],
+            [
+                9332,
+                1.4321116832891605
+            ],
+            [
+                10000,
+                2.1913774993059034
+            ]
+        ]
+    ],
+    [
+        400.0,
+        40,
+        [
+            [
+                0,
+                -0.3584790755077172
+            ],
+            [
+                668,
+                -0.33301611509753215
+            ],
+            [
+                1587,
+                -0.1089466072951948
+            ],
+            [
+                3085,
+                0.5792199847585249
+            ],
+            [
+                5000,
+                1.220854943811942
+            ],
+            [
+                6915,
+                1.759829438421432
+            ],
+            [
+                8413,
+                2.3072559906741614
+            ],
+            [
+                9332,
+                2.753036118353921
+            ],
+            [
+                10000,
+                3.0267252195784047
+            ]
+        ]
+    ]
+]
diff --git a/tensorflow/tensorboard/components/tf_distribution_dashboard_d3v4/data/compressedHistograms_run_run2_tag_histo1.json b/tensorflow/tensorboard/components/tf_distribution_dashboard_d3v4/data/compressedHistograms_run_run2_tag_histo1.json
new file mode 100644
index 0000000000000000000000000000000000000000..9e8a55b3f20739bb81cafd8314721c16fda09378
--- /dev/null
+++ b/tensorflow/tensorboard/components/tf_distribution_dashboard_d3v4/data/compressedHistograms_run_run2_tag_histo1.json
@@ -0,0 +1,212 @@
+[
+    [
+        0.0,
+        0,
+        [
+            [
+                0,
+                -3.6801669545044846
+            ],
+            [
+                668,
+                -3.192188140974744
+            ],
+            [
+                1587,
+                -2.3414678549368806
+            ],
+            [
+                3085,
+                -0.9632173471995873
+            ],
+            [
+                5000,
+                -0.3214892636797772
+            ],
+            [
+                6915,
+                0.11870794142185205
+            ],
+            [
+                8413,
+                0.8895470642005087
+            ],
+            [
+                9332,
+                1.183563374619141
+            ],
+            [
+                10000,
+                2.665663810418372
+            ]
+        ]
+    ],
+    [
+        100.0,
+        10,
+        [
+            [
+                0,
+                -3.564793583751807
+            ],
+            [
+                668,
+                -3.376844436865802
+            ],
+            [
+                1587,
+                -1.0366615731293798
+            ],
+            [
+                3085,
+                -0.27318696312672563
+            ],
+            [
+                5000,
+                0.9718642422053263
+            ],
+            [
+                6915,
+                2.5765662807928194
+            ],
+            [
+                8413,
+                3.1415385101545126
+            ],
+            [
+                9332,
+                4.085981768607621
+            ],
+            [
+                10000,
+                4.623079406808927
+            ]
+        ]
+    ],
+    [
+        200.0,
+        20,
+        [
+            [
+                0,
+                -2.235172510433281
+            ],
+            [
+                668,
+                -2.004569042815611
+            ],
+            [
+                1587,
+                -1.2015432383370985
+            ],
+            [
+                3085,
+                0.11835464933202625
+            ],
+            [
+                5000,
+                0.56953784145381
+            ],
+            [
+                6915,
+                1.202844810963146
+            ],
+            [
+                8413,
+                2.689066032283515
+            ],
+            [
+                9332,
+                2.8494015726499944
+            ],
+            [
+                10000,
+                3.481377676013788
+            ]
+        ]
+    ],
+    [
+        300.0,
+        30,
+        [
+            [
+                0,
+                -3.360113978269659
+            ],
+            [
+                668,
+                -2.8293185004961043
+            ],
+            [
+                1587,
+                -1.5992540502266783
+            ],
+            [
+                3085,
+                0.14393860259807117
+            ],
+            [
+                5000,
+                1.47723448201245
+            ],
+            [
+                6915,
+                1.9510057389110733
+            ],
+            [
+                8413,
+                2.833176104473626
+            ],
+            [
+                9332,
+                4.142405216576347
+            ],
+            [
+                10000,
+                4.706937777668589
+            ]
+        ]
+    ],
+    [
+        400.0,
+        40,
+        [
+            [
+                0,
+                -2.599286228987632
+            ],
+            [
+                668,
+                -2.240365897443259
+            ],
+            [
+                1587,
+                -1.5992540502266783
+            ],
+            [
+                3085,
+                -0.9101893288861387
+            ],
+            [
+                5000,
+                0.7580548669750213
+            ],
+            [
+                6915,
+                1.6009864433919474
+            ],
+            [
+                8413,
+                2.3504002974280036
+            ],
+            [
+                9332,
+                2.7907805263353733
+            ],
+            [
+                10000,
+                3.5098048900144323
+            ]
+        ]
+    ]
+]
diff --git a/tensorflow/tensorboard/components/tf_distribution_dashboard_d3v4/data/compressedHistograms_run_run2_tag_histo2.json b/tensorflow/tensorboard/components/tf_distribution_dashboard_d3v4/data/compressedHistograms_run_run2_tag_histo2.json
new file mode 100644
index 0000000000000000000000000000000000000000..7c8836f6246306cbf162d4c1299d3eff075185b6
--- /dev/null
+++ b/tensorflow/tensorboard/components/tf_distribution_dashboard_d3v4/data/compressedHistograms_run_run2_tag_histo2.json
@@ -0,0 +1,212 @@
+[
+    [
+        0.0,
+        0,
+        [
+            [
+                0,
+                -1.9291158122759586
+            ],
+            [
+                668,
+                -1.5970765333488954
+            ],
+            [
+                1587,
+                -1.0923120348519078
+            ],
+            [
+                3085,
+                -0.6688082872192093
+            ],
+            [
+                5000,
+                0.09312398815580714
+            ],
+            [
+                6915,
+                0.44532789251701854
+            ],
+            [
+                8413,
+                0.8238009655877649
+            ],
+            [
+                9332,
+                1.0357232383581656
+            ],
+            [
+                10000,
+                1.2741043689144438
+            ]
+        ]
+    ],
+    [
+        100.0,
+        10,
+        [
+            [
+                0,
+                -0.7780725642449806
+            ],
+            [
+                668,
+                -0.7138496178727424
+            ],
+            [
+                1587,
+                -0.5448932415735014
+            ],
+            [
+                3085,
+                -0.24370397454796228
+            ],
+            [
+                5000,
+                0.42790220995778355
+            ],
+            [
+                6915,
+                0.6191730643365096
+            ],
+            [
+                8413,
+                0.752059342118037
+            ],
+            [
+                9332,
+                1.0451472255274825
+            ],
+            [
+                10000,
+                2.5559479569222825
+            ]
+        ]
+    ],
+    [
+        200.0,
+        20,
+        [
+            [
+                0,
+                -1.3876904425996377
+            ],
+            [
+                668,
+                -1.1464188862638496
+            ],
+            [
+                1587,
+                -0.4049955219067526
+            ],
+            [
+                3085,
+                0.04721394862139682
+            ],
+            [
+                5000,
+                0.56953784145381
+            ],
+            [
+                6915,
+                1.3221859041483333
+            ],
+            [
+                8413,
+                1.6188495656305735
+            ],
+            [
+                9332,
+                1.7613953069723651
+            ],
+            [
+                10000,
+                2.3257482385477384
+            ]
+        ]
+    ],
+    [
+        300.0,
+        30,
+        [
+            [
+                0,
+                -1.600772629982185
+            ],
+            [
+                668,
+                -1.1548516185367033
+            ],
+            [
+                1587,
+                -0.260387173785447
+            ],
+            [
+                3085,
+                0.17416570914366614
+            ],
+            [
+                5000,
+                0.47069243095356195
+            ],
+            [
+                6915,
+                1.1559276581637614
+            ],
+            [
+                8413,
+                2.0474031182051404
+            ],
+            [
+                9332,
+                2.18821711651116
+            ],
+            [
+                10000,
+                2.2393193406467518
+            ]
+        ]
+    ],
+    [
+        400.0,
+        40,
+        [
+            [
+                0,
+                -0.8286852465281818
+            ],
+            [
+                668,
+                -0.7815041529866706
+            ],
+            [
+                1587,
+                -0.3334896444053469
+            ],
+            [
+                3085,
+                0.21085213041026643
+            ],
+            [
+                5000,
+                0.5177616740489182
+            ],
+            [
+                6915,
+                1.077122434649409
+            ],
+            [
+                8413,
+                1.5898009703967424
+            ],
+            [
+                9332,
+                1.8859097291499742
+            ],
+            [
+                10000,
+                2.0954239138728523
+            ]
+        ]
+    ]
+]
diff --git a/tensorflow/tensorboard/components/tf_distribution_dashboard_d3v4/data/logdir b/tensorflow/tensorboard/components/tf_distribution_dashboard_d3v4/data/logdir
new file mode 100644
index 0000000000000000000000000000000000000000..b6362b45d777266d6204b23884222a080f789f71
--- /dev/null
+++ b/tensorflow/tensorboard/components/tf_distribution_dashboard_d3v4/data/logdir
@@ -0,0 +1 @@
+{"logdir": "/foo/some/fake/logdir"}
\ No newline at end of file
diff --git a/tensorflow/tensorboard/components/tf_distribution_dashboard_d3v4/data/runs.json b/tensorflow/tensorboard/components/tf_distribution_dashboard_d3v4/data/runs.json
new file mode 100644
index 0000000000000000000000000000000000000000..739262a9fb62edcdd4d8010410a7713629a0d383
--- /dev/null
+++ b/tensorflow/tensorboard/components/tf_distribution_dashboard_d3v4/data/runs.json
@@ -0,0 +1,4 @@
+{
+  "run1": {"compressedHistograms": ["histo1"]},
+  "run2": {"compressedHistograms": ["histo2", "histo1"]}
+}
diff --git a/tensorflow/tensorboard/components/tf_distribution_dashboard_d3v4/index.html b/tensorflow/tensorboard/components/tf_distribution_dashboard_d3v4/index.html
new file mode 100644
index 0000000000000000000000000000000000000000..5e825f13f5c87f1cc8331575236eaf4deab1a2d8
--- /dev/null
+++ b/tensorflow/tensorboard/components/tf_distribution_dashboard_d3v4/index.html
@@ -0,0 +1,62 @@
+<!doctype html>
+<!--
+@license
+Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+-->
+
+<script src="../webcomponentsjs/webcomponents-lite.min.js"></script>
+<link rel="import" href="../iron-demo-helpers/demo-snippet.html">
+<link rel="import" href="../paper-styles/typography.html">
+<link rel="import" href="tf-distribution-dashboard.html">
+
+<title>Distribution Dashboard Demo</title>
+<style>
+  #container {
+    height: 800px;
+    width: 100%;
+    display: block;
+  }
+
+  html, body {
+    margin: 0;
+    padding: 0;
+    font-family: "RobotoDraft","Roboto",sans-serif;
+  }
+
+</style>
+<demo-snippet>
+  <template>
+    <dom-module id="distribution-dash-demo">
+      <template>
+        <tf-distribution-dashboard id="demo" backend="[[backend]]"></tf-distribution-dashboard>
+      </template>
+      <script>
+        Polymer({
+          is: "distribution-dash-demo",
+          properties: {
+            backend: {
+              type: Object,
+              value: function() {
+                var router = new TF.Backend.router("data", true);
+                return new TF.Backend.Backend(router);
+              },
+            },
+          },
+        });
+      </script>
+    </dom-module>
+    <distribution-dash-demo id="container"></distribution-dash-demo>
+  </template>
+</demo-snippet>
diff --git a/tensorflow/tensorboard/components/tf_distribution_dashboard_d3v4/tf-distribution-dashboard.html b/tensorflow/tensorboard/components/tf_distribution_dashboard_d3v4/tf-distribution-dashboard.html
new file mode 100644
index 0000000000000000000000000000000000000000..063bd8d0993fef9d4121389cddcf0b314516cf29
--- /dev/null
+++ b/tensorflow/tensorboard/components/tf_distribution_dashboard_d3v4/tf-distribution-dashboard.html
@@ -0,0 +1,124 @@
+<!--
+@license
+Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+-->
+
+<link rel="import" href="../polymer/polymer.html">
+<link rel="import" href="../tf-backend/tf-backend.html">
+<link rel="import" href="../tf-color-scale/tf-color-scale.html">
+<link rel="import" href="../tf-dashboard-common/tf-dashboard.html">
+<link rel="import" href="../tf-dashboard-common/tf-option-selector.html">
+<link rel="import" href="../tf-dashboard-common/tf-panes-helper.html">
+<link rel="import" href="../tf-dashboard-common/tf-sidebar-helper.html">
+<link rel="import" href="../tf-imports/lodash.html">
+<link rel="import" href="../vz-distribution-chart/vz-distribution-chart.html">
+<link rel="import" href="../iron-collapse/iron-collapse.html">
+<link rel="import" href="../paper-icon-button/paper-icon-button.html">
+
+<!--
+tf-distribution-dashboard is a complete frontend that loads runs from a backend,
+and creates chart panes that display data for those runs.
+
+It provides a x type selector and the normal tf-sidebar-helper options, by
+which the user can customize how data is organized and displayed.
+
+Each chart has a button that can toggle whether it is "expanded"; expanded
+charts are larger.
+
+Organizationally, the #plumbing div contains components that have no concrete
+manifestation and just effect data bindings or data loading. The .sidebar div
+contains shared controls provided by tf-sidebar-helper. The .center div
+contains vz-distribution-charts embedded inside tf-panes-helper's.
+-->
+<dom-module id="tf-distribution-dashboard">
+  <template>
+    <div id="plumbing">
+      <tf-color-scale
+        id="colorScale"
+        runs="[[runs]]"
+        out-color-scale="{{_colorScale}}"
+      ></tf-color-scale>
+    </div>
+
+    <tf-dashboard-layout>
+      <div class="sidebar">
+        <tf-sidebar-helper
+          backend="[[backend]]"
+          categories="{{_categories}}"
+          color-scale="[[_colorScale]]"
+          run2tag="[[run2tag]]"
+          runs="[[runs]]"
+          selected-runs="{{_selectedRuns}}"
+          >
+        <div class="sidebar-section">
+          <tf-option-selector
+            id="xTypeSelector"
+            name="Horizontal Axis"
+            selected-id="{{_xType}}"
+            >
+            <paper-button id="step">step</paper-button>
+            <paper-button id="relative">relative</paper-button>
+            <paper-button id="wall_time">wall</paper-button>
+          </tf-option-selector>
+        </div>
+        </tf-sidebar-helper>
+      </div>
+
+      <div class="center">
+        <tf-panes-helper
+          categories="[[_categories]]"
+          color-scale="[[_colorScale]]"
+          data-type="[[dataType]]"
+          data-provider="[[dataProvider]]"
+          data-not-found="[[dataNotFound]]"
+          run2tag="[[run2tag]]"
+          selected-runs="[[_selectedRuns]]"
+          repeat-for-runs
+          >
+          <template>
+            <vz-distribution-chart
+              x-type="[[_xType]]"
+              color-scale="[[_colorScale]]"
+              ></vz-distribution-chart>
+          </template>
+        </tf-panes-helper>
+      </div>
+    </tf-dashboard-layout>
+
+    <style include="dashboard-style"></style>
+  </template>
+
+  <script>
+    TF.Dashboard.TfDistributionDashboard = Polymer({
+      is: "tf-distribution-dashboard",
+      factoryImpl: function(backend) {
+        this.backend = backend;
+      },
+      behaviors: [
+        TF.Dashboard.DashboardBehavior("distributions"),
+        TF.Dashboard.ReloadBehavior("tf-chart-scaffold"),
+        TF.Backend.BackendBehavior,
+      ],
+      properties: {
+        backend: Object,
+        _xType: {
+          type: String,
+          value: "step"
+        },
+        dataType: {value: "compressedHistogram"},
+      },
+    });
+  </script>
+</dom-module>
diff --git a/tensorflow/tensorboard/components/tf_globals/BUILD b/tensorflow/tensorboard/components/tf_globals/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..7e81163e8010d55a4717d030196f055859bc4b2a
--- /dev/null
+++ b/tensorflow/tensorboard/components/tf_globals/BUILD
@@ -0,0 +1,49 @@
+package(default_visibility = ["//tensorflow:internal"])
+
+load("@io_bazel_rules_closure//closure:defs.bzl", "web_library")
+load("//tensorflow/tensorboard:defs.bzl", "tensorboard_ts_library")
+load("//tensorflow/tensorboard:defs.bzl", "tensorboard_typescript_genrule")
+load("//tensorflow/tensorboard:defs.bzl", "tensorboard_webcomponent_library")
+
+licenses(["notice"])  # Apache 2.0
+
+# TODO(dandelion): Add webfiles support for the test code.
+
+web_library(
+    name = "tf_globals",
+    srcs = [
+        "tf-globals.html",
+        ":ts",
+    ],
+    path = "/tf-globals",
+)
+
+tensorboard_typescript_genrule(
+    name = "ts",
+    srcs = ["globals.ts"],
+)
+
+filegroup(
+    name = "all_files",
+    srcs = glob(["**"]),
+    tags = ["notsan"],
+)
+
+################################################################################
+# MARKED FOR DELETION
+
+tensorboard_webcomponent_library(
+    name = "legacy",
+    srcs = [
+        "tf-globals.html",
+        ":legacy_ts",
+    ],
+    destdir = "tf-globals",
+)
+
+tensorboard_ts_library(
+    name = "legacy_ts",
+    srcs = ["globals.ts"],
+    deps_mgmt = "off",
+    runtime = "nodejs",
+)
diff --git a/tensorflow/tensorboard/components/tf_globals_d3v4/BUILD b/tensorflow/tensorboard/components/tf_globals_d3v4/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..699cd3239b76a7790d1f929c68fd40ede181d6a7
--- /dev/null
+++ b/tensorflow/tensorboard/components/tf_globals_d3v4/BUILD
@@ -0,0 +1,33 @@
+package(default_visibility = ["//tensorflow:internal"])
+
+load("@io_bazel_rules_closure//closure:defs.bzl", "web_library")
+load("//tensorflow/tensorboard:hacks.bzl", "tensorboard_typescript_bundle")
+load("//tensorflow/tensorboard:defs.bzl", "tensorboard_typescript_genrule")
+
+licenses(["notice"])  # Apache 2.0
+
+web_library(
+    name = "tf_globals_d3v4",
+    srcs = [
+        "bundle.js",
+        "tf-globals.html",
+    ],
+    path = "/tf-globals",
+)
+
+tensorboard_typescript_genrule(
+    name = "ts",
+    srcs = ["bundle.ts"],
+)
+
+tensorboard_typescript_bundle(
+    name = "bundle",
+    out = "bundle.ts",
+    namespace_srcs = {"TF.Globals": ["globals.ts"]},
+)
+
+filegroup(
+    name = "all_files",
+    srcs = glob(["**"]),
+    tags = ["notsan"],
+)
diff --git a/tensorflow/go/session.cpp b/tensorflow/tensorboard/components/tf_globals_d3v4/tf-globals.html
similarity index 65%
rename from tensorflow/go/session.cpp
rename to tensorflow/tensorboard/components/tf_globals_d3v4/tf-globals.html
index efa225505b8fc84ddda06177991b74aa0c74a348..b0fd74d4f20b680e2d55b3de4ed51a1d35a39882 100644
--- a/tensorflow/go/session.cpp
+++ b/tensorflow/tensorboard/components/tf_globals_d3v4/tf-globals.html
@@ -1,5 +1,6 @@
-/*
-Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+<!--
+@license
+Copyright 2016 The TensorFlow Authors. All Rights Reserved.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -12,15 +13,7 @@ distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
-*/
+-->
 
-// TODO(ashankar): Remove this file when TensorFlow 1.1 is released.
-// See lib.go for details.
+<script src="bundle.js"></script>
 
-extern "C" {
-extern void tfDeletePRunHandle(const char* h);
-}
-
-void tfDeletePRunHandle(const char* h) {
-  delete[] h;
-}
diff --git a/tensorflow/tensorboard/components/tf_graph/BUILD b/tensorflow/tensorboard/components/tf_graph/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..f950d1a5324de9f32c137a23c74f272a0ace3511
--- /dev/null
+++ b/tensorflow/tensorboard/components/tf_graph/BUILD
@@ -0,0 +1,60 @@
+package(default_visibility = ["//tensorflow:internal"])
+
+load("@io_bazel_rules_closure//closure:defs.bzl", "web_library")
+load("//tensorflow/tensorboard:defs.bzl", "tensorboard_ts_library")
+load("//tensorflow/tensorboard:defs.bzl", "tensorboard_webcomponent_library")
+
+licenses(["notice"])  # Apache 2.0
+
+web_library(
+    name = "tf_graph",
+    srcs = [
+        "tf-graph.html",
+        "tf-graph-minimap.html",
+        "tf-graph-scene.html",
+    ],
+    path = "/tf-graph",
+    deps = [
+        "//tensorflow/tensorboard/components/tf_dashboard_common",
+        "//tensorflow/tensorboard/components/tf_graph_common",
+        "@org_polymer",
+        "@org_polymer_iron_flex_layout",
+        "@org_polymer_iron_icons",
+        "@org_polymer_paper_button",
+        "@org_polymer_paper_dropdown_menu",
+        "@org_polymer_paper_input",
+        "@org_polymer_paper_menu",
+        "@org_polymer_paper_radio_group",
+        "@org_polymer_paper_toggle_button",
+        "@org_polymer_paper_tooltip",
+    ],
+)
+
+filegroup(
+    name = "all_files",
+    srcs = glob(["**"]),
+    tags = ["notsan"],
+)
+
+################################################################################
+# MARKED FOR DELETION
+
+tensorboard_webcomponent_library(
+    name = "legacy",
+    srcs = [
+        "tf-graph.html",
+        "tf-graph-minimap.html",
+        "tf-graph-scene.html",
+    ],
+    destdir = "tf-graph",
+    deps = [
+        "//tensorflow/tensorboard/components/tf_graph_common:legacy",
+    ],
+)
+
+# This is needed despite how this component lacks TypeScript files because
+# components/BUILD seeks a legacy_ts rule in this package.
+tensorboard_ts_library(
+    name = "legacy_ts",
+    srcs = [],
+)
diff --git a/tensorflow/tensorboard/components/tf_graph/demo/BUILD b/tensorflow/tensorboard/components/tf_graph/demo/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..524d0ff7679a40b470502b028fca3b76c761108f
--- /dev/null
+++ b/tensorflow/tensorboard/components/tf_graph/demo/BUILD
@@ -0,0 +1,26 @@
+package(default_visibility = ["//tensorflow:internal"])
+
+load("@io_bazel_rules_closure//closure:defs.bzl", "web_library")
+
+licenses(["notice"])  # Apache 2.0
+
+# bazel run //third_party/tensorflow/tensorboard/components/tf_graph/demo
+web_library(
+    name = "demo",
+    srcs = ["index.html"] + glob(["data/**"]),
+    path = "/tf-graph/demo",
+    deps = [
+        "//tensorflow/tensorboard/components/tf_graph",
+        "//tensorflow/tensorboard/components/tf_graph_common",
+        "//tensorflow/tensorboard/components/tf_graph_loader",
+        "@org_polymer_iron_demo_helpers",
+        "@org_polymer_paper_styles",
+        "@org_polymer_webcomponentsjs",
+    ],
+)
+
+filegroup(
+    name = "all_files",
+    srcs = glob(["**"]),
+    tags = ["notsan"],
+)
diff --git a/tensorflow/tensorboard/components/tf_graph/demo/data/graph.pbtxt b/tensorflow/tensorboard/components/tf_graph/demo/data/graph.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..30b206453469801d31b46856c29cdda78164f18f
--- /dev/null
+++ b/tensorflow/tensorboard/components/tf_graph/demo/data/graph.pbtxt
@@ -0,0 +1,4606 @@
+node {
+  name: "GradientDescent/learning_rate"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_3"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: 0.1
+      }
+    }
+  }
+}
+node {
+  name: "gradients/add_grad/Shape_1"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 100
+      }
+    }
+  }
+}
+node {
+  name: "gradients/add_grad/Shape"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 2
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\310\000\000\000d\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "gradients/add_grad/BroadcastGradientArgs"
+  op: "BroadcastGradientArgs"
+  input: "gradients/add_grad/Shape"
+  input: "gradients/add_grad/Shape_1"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+        }
+        shape {
+          dim {
+            size: -1
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "gradients/add_1_grad/Shape_1"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 10
+      }
+    }
+  }
+}
+node {
+  name: "gradients/add_1_grad/Shape"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 2
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\310\000\000\000\n\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "gradients/add_1_grad/BroadcastGradientArgs"
+  op: "BroadcastGradientArgs"
+  input: "gradients/add_1_grad/Shape"
+  input: "gradients/add_1_grad/Shape_1"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+        }
+        shape {
+          dim {
+            size: -1
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "gradients/Reshape_1_grad/Shape"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 2
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\310\000\000\000\n\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "gradients/SoftmaxCrossEntropyWithLogits_grad/ExpandDims/dim"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: -1
+      }
+    }
+  }
+}
+node {
+  name: "gradients/Reshape_3_grad/Shape"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 200
+      }
+    }
+  }
+}
+node {
+  name: "gradients/Mean_grad/Maximum/y"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "gradients/Mean_grad/Const_1"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "gradients/Mean_grad/Const"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "gradients/Mean_grad/Shape_1"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "gradients/Mean_grad/Prod_1"
+  op: "Prod"
+  input: "gradients/Mean_grad/Shape_1"
+  input: "gradients/Mean_grad/Const_1"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+  attr {
+    key: "keep_dims"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "gradients/Mean_grad/Maximum"
+  op: "Maximum"
+  input: "gradients/Mean_grad/Prod_1"
+  input: "gradients/Mean_grad/Maximum/y"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+}
+node {
+  name: "gradients/Mean_grad/Shape"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 200
+      }
+    }
+  }
+}
+node {
+  name: "gradients/Mean_grad/Prod"
+  op: "Prod"
+  input: "gradients/Mean_grad/Shape"
+  input: "gradients/Mean_grad/Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+  attr {
+    key: "keep_dims"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "gradients/Mean_grad/floordiv"
+  op: "FloorDiv"
+  input: "gradients/Mean_grad/Prod"
+  input: "gradients/Mean_grad/Maximum"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+}
+node {
+  name: "gradients/Mean_grad/Cast"
+  op: "Cast"
+  input: "gradients/Mean_grad/floordiv"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "DstT"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "SrcT"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+}
+node {
+  name: "gradients/Mean_grad/Tile/multiples"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 200
+      }
+    }
+  }
+}
+node {
+  name: "gradients/Mean_grad/Reshape/shape"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "gradients/Const"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "gradients/Shape"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "gradients/Fill"
+  op: "Fill"
+  input: "gradients/Shape"
+  input: "gradients/Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+}
+node {
+  name: "gradients/Mean_grad/Reshape"
+  op: "Reshape"
+  input: "gradients/Fill"
+  input: "gradients/Mean_grad/Reshape/shape"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "gradients/Mean_grad/Tile"
+  op: "Tile"
+  input: "gradients/Mean_grad/Reshape"
+  input: "gradients/Mean_grad/Tile/multiples"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tmultiples"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "gradients/Mean_grad/truediv"
+  op: "RealDiv"
+  input: "gradients/Mean_grad/Tile"
+  input: "gradients/Mean_grad/Cast"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "gradients/Reshape_3_grad/Reshape"
+  op: "Reshape"
+  input: "gradients/Mean_grad/truediv"
+  input: "gradients/Reshape_3_grad/Shape"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "gradients/SoftmaxCrossEntropyWithLogits_grad/ExpandDims"
+  op: "ExpandDims"
+  input: "gradients/Reshape_3_grad/Reshape"
+  input: "gradients/SoftmaxCrossEntropyWithLogits_grad/ExpandDims/dim"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "Const"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "Slice_2/begin"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "Sub_2/y"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "concat_1/axis"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "concat_1/values_0"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: -1
+      }
+    }
+  }
+}
+node {
+  name: "Slice_1/size"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "Sub_1/y"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "Shape_2"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 2
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\310\000\000\000\n\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "Rank_2"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 2
+      }
+    }
+  }
+}
+node {
+  name: "Sub_1"
+  op: "Sub"
+  input: "Rank_2"
+  input: "Sub_1/y"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+}
+node {
+  name: "Slice_1/begin"
+  op: "Pack"
+  input: "Sub_1"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "N"
+    value {
+      i: 1
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "Slice_1"
+  op: "Slice"
+  input: "Shape_2"
+  input: "Slice_1/begin"
+  input: "Slice_1/size"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "concat_1"
+  op: "ConcatV2"
+  input: "concat_1/values_0"
+  input: "Slice_1"
+  input: "concat_1/axis"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 2
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "concat/axis"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "concat/values_0"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: -1
+      }
+    }
+  }
+}
+node {
+  name: "Slice/size"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "Sub/y"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "Shape_1"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 2
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\310\000\000\000\n\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "Rank_1"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 2
+      }
+    }
+  }
+}
+node {
+  name: "Sub"
+  op: "Sub"
+  input: "Rank_1"
+  input: "Sub/y"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+}
+node {
+  name: "Slice/begin"
+  op: "Pack"
+  input: "Sub"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "N"
+    value {
+      i: 1
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "Slice"
+  op: "Slice"
+  input: "Shape_1"
+  input: "Slice/begin"
+  input: "Slice/size"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "concat"
+  op: "ConcatV2"
+  input: "concat/values_0"
+  input: "Slice"
+  input: "concat/axis"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 2
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "Shape"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 2
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\310\000\000\000\n\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "Rank"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 2
+      }
+    }
+  }
+}
+node {
+  name: "Sub_2"
+  op: "Sub"
+  input: "Rank"
+  input: "Sub_2/y"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+}
+node {
+  name: "Slice_2/size"
+  op: "Pack"
+  input: "Sub_2"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "N"
+    value {
+      i: 1
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "Slice_2"
+  op: "Slice"
+  input: "Shape"
+  input: "Slice_2/begin"
+  input: "Slice_2/size"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "logits_biases"
+  op: "VariableV2"
+  device: "/job:localhost/replica:0/task:0/cpu:0"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@logits_biases"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 10
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "container"
+    value {
+      s: ""
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "shape"
+    value {
+      shape {
+        dim {
+          size: 10
+        }
+      }
+    }
+  }
+  attr {
+    key: "shared_name"
+    value {
+      s: ""
+    }
+  }
+}
+node {
+  name: "logits_biases/read"
+  op: "Identity"
+  input: "logits_biases"
+  device: "/job:localhost/replica:0/task:0/cpu:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@logits_biases"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 10
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "logits_weights"
+  op: "VariableV2"
+  device: "/job:localhost/replica:0/task:0/cpu:0"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@logits_weights"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 100
+          }
+          dim {
+            size: 10
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "container"
+    value {
+      s: ""
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "shape"
+    value {
+      shape {
+        dim {
+          size: 100
+        }
+        dim {
+          size: 10
+        }
+      }
+    }
+  }
+  attr {
+    key: "shared_name"
+    value {
+      s: ""
+    }
+  }
+}
+node {
+  name: "logits_weights/read"
+  op: "Identity"
+  input: "logits_weights"
+  device: "/job:localhost/replica:0/task:0/cpu:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@logits_weights"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 100
+          }
+          dim {
+            size: 10
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "hidden_biases"
+  op: "VariableV2"
+  device: "/job:localhost/replica:0/task:0/cpu:0"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@hidden_biases"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 100
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "container"
+    value {
+      s: ""
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "shape"
+    value {
+      shape {
+        dim {
+          size: 100
+        }
+      }
+    }
+  }
+  attr {
+    key: "shared_name"
+    value {
+      s: ""
+    }
+  }
+}
+node {
+  name: "hidden_biases/read"
+  op: "Identity"
+  input: "hidden_biases"
+  device: "/job:localhost/replica:0/task:0/cpu:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@hidden_biases"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 100
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "hidden_weights"
+  op: "VariableV2"
+  device: "/job:localhost/replica:0/task:0/cpu:0"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@hidden_weights"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 784
+          }
+          dim {
+            size: 100
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "container"
+    value {
+      s: ""
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "shape"
+    value {
+      shape {
+        dim {
+          size: 784
+        }
+        dim {
+          size: 100
+        }
+      }
+    }
+  }
+  attr {
+    key: "shared_name"
+    value {
+      s: ""
+    }
+  }
+}
+node {
+  name: "hidden_weights/read"
+  op: "Identity"
+  input: "hidden_weights"
+  device: "/job:localhost/replica:0/task:0/cpu:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@hidden_weights"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 784
+          }
+          dim {
+            size: 100
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "Reshape/shape"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/cpu:0"
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 2
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\310\000\000\000\377\377\377\377"
+      }
+    }
+  }
+}
+node {
+  name: "mnist_dataset_train_2/one_hot/depth"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/cpu:0"
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 10
+      }
+    }
+  }
+}
+node {
+  name: "mnist_dataset_train_2/one_hot/off_value"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/cpu:0"
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "mnist_dataset_train_2/one_hot/on_value"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/cpu:0"
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "mnist_dataset_train_2/random_shuffle_queue_DequeueMany/n"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/cpu:0"
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 200
+      }
+    }
+  }
+}
+node {
+  name: "mnist_dataset_train_1/random_shuffle_queue"
+  op: "RandomShuffleQueueV2"
+  device: "/job:localhost/replica:0/task:0/cpu:0"
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+  attr {
+    key: "capacity"
+    value {
+      i: 20000
+    }
+  }
+  attr {
+    key: "component_types"
+    value {
+      list {
+        type: DT_FLOAT
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    key: "container"
+    value {
+      s: ""
+    }
+  }
+  attr {
+    key: "min_after_dequeue"
+    value {
+      i: 4000
+    }
+  }
+  attr {
+    key: "seed"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "seed2"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 28
+          }
+          dim {
+            size: 28
+          }
+          dim {
+            size: 1
+          }
+        }
+        shape {
+        }
+      }
+    }
+  }
+  attr {
+    key: "shared_name"
+    value {
+      s: ""
+    }
+  }
+}
+node {
+  name: "mnist_dataset_train_2/random_shuffle_queue_DequeueMany"
+  op: "QueueDequeueManyV2"
+  input: "mnist_dataset_train_1/random_shuffle_queue"
+  input: "mnist_dataset_train_2/random_shuffle_queue_DequeueMany/n"
+  device: "/job:localhost/replica:0/task:0/cpu:0"
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          unknown_rank: true
+        }
+        shape {
+          unknown_rank: true
+        }
+      }
+    }
+  }
+  attr {
+    key: "component_types"
+    value {
+      list {
+        type: DT_FLOAT
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    key: "timeout_ms"
+    value {
+      i: -1
+    }
+  }
+}
+node {
+  name: "Reshape"
+  op: "Reshape"
+  input: "mnist_dataset_train_2/random_shuffle_queue_DequeueMany"
+  input: "Reshape/shape"
+  device: "/job:localhost/replica:0/task:0/cpu:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+          dim {
+            size: -1
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "MatMul"
+  op: "MatMul"
+  input: "Reshape"
+  input: "hidden_weights/read"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+          dim {
+            size: 100
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "transpose_a"
+    value {
+      b: false
+    }
+  }
+  attr {
+    key: "transpose_b"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "add"
+  op: "Add"
+  input: "MatMul"
+  input: "hidden_biases/read"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+          dim {
+            size: 100
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "Relu"
+  op: "Relu"
+  input: "add"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+          dim {
+            size: 100
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "MatMul_1"
+  op: "MatMul"
+  input: "Relu"
+  input: "logits_weights/read"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+          dim {
+            size: 10
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "transpose_a"
+    value {
+      b: false
+    }
+  }
+  attr {
+    key: "transpose_b"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "add_1"
+  op: "Add"
+  input: "MatMul_1"
+  input: "logits_biases/read"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+          dim {
+            size: 10
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "Reshape_1"
+  op: "Reshape"
+  input: "add_1"
+  input: "concat"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+          dim {
+            size: 10
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "mnist_dataset_train_2/one_hot"
+  op: "OneHot"
+  input: "mnist_dataset_train_2/random_shuffle_queue_DequeueMany:1"
+  input: "mnist_dataset_train_2/one_hot/depth"
+  input: "mnist_dataset_train_2/one_hot/on_value"
+  input: "mnist_dataset_train_2/one_hot/off_value"
+  device: "/job:localhost/replica:0/task:0/cpu:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "TI"
+    value {
+      type: DT_INT64
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          unknown_rank: true
+        }
+      }
+    }
+  }
+  attr {
+    key: "axis"
+    value {
+      i: -1
+    }
+  }
+}
+node {
+  name: "Reshape_2"
+  op: "Reshape"
+  input: "mnist_dataset_train_2/one_hot"
+  input: "concat_1"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 10
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "SoftmaxCrossEntropyWithLogits"
+  op: "SoftmaxCrossEntropyWithLogits"
+  input: "Reshape_1"
+  input: "Reshape_2"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+        }
+        shape {
+          dim {
+            size: 200
+          }
+          dim {
+            size: 10
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "gradients/SoftmaxCrossEntropyWithLogits_grad/PreventGradient"
+  op: "PreventGradient"
+  input: "SoftmaxCrossEntropyWithLogits:1"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+          dim {
+            size: 10
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "message"
+    value {
+      s: "Currently there is no way to take the second derivative of softmax_cross_entropy_with_logits due to the fused  implementation\'s interaction with tf.gradients()"
+    }
+  }
+}
+node {
+  name: "gradients/SoftmaxCrossEntropyWithLogits_grad/mul"
+  op: "Mul"
+  input: "gradients/SoftmaxCrossEntropyWithLogits_grad/ExpandDims"
+  input: "gradients/SoftmaxCrossEntropyWithLogits_grad/PreventGradient"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+          dim {
+            size: 10
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "gradients/Reshape_1_grad/Reshape"
+  op: "Reshape"
+  input: "gradients/SoftmaxCrossEntropyWithLogits_grad/mul"
+  input: "gradients/Reshape_1_grad/Shape"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+          dim {
+            size: 10
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "gradients/add_1_grad/Sum_1"
+  op: "Sum"
+  input: "gradients/Reshape_1_grad/Reshape"
+  input: "gradients/add_1_grad/BroadcastGradientArgs:1"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 10
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "keep_dims"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "gradients/add_1_grad/Reshape_1"
+  op: "Reshape"
+  input: "gradients/add_1_grad/Sum_1"
+  input: "gradients/add_1_grad/Shape_1"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 10
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "gradients/add_1_grad/Sum"
+  op: "Sum"
+  input: "gradients/Reshape_1_grad/Reshape"
+  input: "gradients/add_1_grad/BroadcastGradientArgs"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+          dim {
+            size: 10
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "keep_dims"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "gradients/add_1_grad/Reshape"
+  op: "Reshape"
+  input: "gradients/add_1_grad/Sum"
+  input: "gradients/add_1_grad/Shape"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+          dim {
+            size: 10
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "gradients/add_1_grad/tuple/group_deps"
+  op: "NoOp"
+  input: "^gradients/add_1_grad/Reshape"
+  input: "^gradients/add_1_grad/Reshape_1"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+      }
+    }
+  }
+}
+node {
+  name: "gradients/add_1_grad/tuple/control_dependency_1"
+  op: "Identity"
+  input: "gradients/add_1_grad/Reshape_1"
+  input: "^gradients/add_1_grad/tuple/group_deps"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@gradients/add_1_grad/Reshape_1"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 10
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "GradientDescent/update_logits_biases/ApplyGradientDescent"
+  op: "ApplyGradientDescent"
+  input: "logits_biases"
+  input: "GradientDescent/learning_rate"
+  input: "gradients/add_1_grad/tuple/control_dependency_1"
+  device: "/job:localhost/replica:0/task:0/cpu:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@logits_biases"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 10
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "use_locking"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "gradients/add_1_grad/tuple/control_dependency"
+  op: "Identity"
+  input: "gradients/add_1_grad/Reshape"
+  input: "^gradients/add_1_grad/tuple/group_deps"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@gradients/add_1_grad/Reshape"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+          dim {
+            size: 10
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "gradients/MatMul_1_grad/MatMul_1"
+  op: "MatMul"
+  input: "Relu"
+  input: "gradients/add_1_grad/tuple/control_dependency"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 100
+          }
+          dim {
+            size: 10
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "transpose_a"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "transpose_b"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "gradients/MatMul_1_grad/MatMul"
+  op: "MatMul"
+  input: "gradients/add_1_grad/tuple/control_dependency"
+  input: "logits_weights/read"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+          dim {
+            size: 100
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "transpose_a"
+    value {
+      b: false
+    }
+  }
+  attr {
+    key: "transpose_b"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "gradients/MatMul_1_grad/tuple/group_deps"
+  op: "NoOp"
+  input: "^gradients/MatMul_1_grad/MatMul"
+  input: "^gradients/MatMul_1_grad/MatMul_1"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+      }
+    }
+  }
+}
+node {
+  name: "gradients/MatMul_1_grad/tuple/control_dependency_1"
+  op: "Identity"
+  input: "gradients/MatMul_1_grad/MatMul_1"
+  input: "^gradients/MatMul_1_grad/tuple/group_deps"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@gradients/MatMul_1_grad/MatMul_1"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 100
+          }
+          dim {
+            size: 10
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "GradientDescent/update_logits_weights/ApplyGradientDescent"
+  op: "ApplyGradientDescent"
+  input: "logits_weights"
+  input: "GradientDescent/learning_rate"
+  input: "gradients/MatMul_1_grad/tuple/control_dependency_1"
+  device: "/job:localhost/replica:0/task:0/cpu:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@logits_weights"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 100
+          }
+          dim {
+            size: 10
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "use_locking"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "gradients/MatMul_1_grad/tuple/control_dependency"
+  op: "Identity"
+  input: "gradients/MatMul_1_grad/MatMul"
+  input: "^gradients/MatMul_1_grad/tuple/group_deps"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@gradients/MatMul_1_grad/MatMul"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+          dim {
+            size: 100
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "gradients/Relu_grad/ReluGrad"
+  op: "ReluGrad"
+  input: "gradients/MatMul_1_grad/tuple/control_dependency"
+  input: "Relu"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+          dim {
+            size: 100
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "gradients/add_grad/Sum_1"
+  op: "Sum"
+  input: "gradients/Relu_grad/ReluGrad"
+  input: "gradients/add_grad/BroadcastGradientArgs:1"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 100
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "keep_dims"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "gradients/add_grad/Reshape_1"
+  op: "Reshape"
+  input: "gradients/add_grad/Sum_1"
+  input: "gradients/add_grad/Shape_1"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 100
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "gradients/add_grad/Sum"
+  op: "Sum"
+  input: "gradients/Relu_grad/ReluGrad"
+  input: "gradients/add_grad/BroadcastGradientArgs"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+          dim {
+            size: 100
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "keep_dims"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "gradients/add_grad/Reshape"
+  op: "Reshape"
+  input: "gradients/add_grad/Sum"
+  input: "gradients/add_grad/Shape"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+          dim {
+            size: 100
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "gradients/add_grad/tuple/group_deps"
+  op: "NoOp"
+  input: "^gradients/add_grad/Reshape"
+  input: "^gradients/add_grad/Reshape_1"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+      }
+    }
+  }
+}
+node {
+  name: "gradients/add_grad/tuple/control_dependency_1"
+  op: "Identity"
+  input: "gradients/add_grad/Reshape_1"
+  input: "^gradients/add_grad/tuple/group_deps"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@gradients/add_grad/Reshape_1"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 100
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "GradientDescent/update_hidden_biases/ApplyGradientDescent"
+  op: "ApplyGradientDescent"
+  input: "hidden_biases"
+  input: "GradientDescent/learning_rate"
+  input: "gradients/add_grad/tuple/control_dependency_1"
+  device: "/job:localhost/replica:0/task:0/cpu:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@hidden_biases"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 100
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "use_locking"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "gradients/add_grad/tuple/control_dependency"
+  op: "Identity"
+  input: "gradients/add_grad/Reshape"
+  input: "^gradients/add_grad/tuple/group_deps"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@gradients/add_grad/Reshape"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+          dim {
+            size: 100
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "gradients/MatMul_grad/MatMul_1"
+  op: "MatMul"
+  input: "Reshape"
+  input: "gradients/add_grad/tuple/control_dependency"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 100
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "transpose_a"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "transpose_b"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "gradients/MatMul_grad/MatMul"
+  op: "MatMul"
+  input: "gradients/add_grad/tuple/control_dependency"
+  input: "hidden_weights/read"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+          dim {
+            size: 784
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "transpose_a"
+    value {
+      b: false
+    }
+  }
+  attr {
+    key: "transpose_b"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "gradients/MatMul_grad/tuple/group_deps"
+  op: "NoOp"
+  input: "^gradients/MatMul_grad/MatMul"
+  input: "^gradients/MatMul_grad/MatMul_1"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+      }
+    }
+  }
+}
+node {
+  name: "gradients/MatMul_grad/tuple/control_dependency_1"
+  op: "Identity"
+  input: "gradients/MatMul_grad/MatMul_1"
+  input: "^gradients/MatMul_grad/tuple/group_deps"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@gradients/MatMul_grad/MatMul_1"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 100
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "GradientDescent/update_hidden_weights/ApplyGradientDescent"
+  op: "ApplyGradientDescent"
+  input: "hidden_weights"
+  input: "GradientDescent/learning_rate"
+  input: "gradients/MatMul_grad/tuple/control_dependency_1"
+  device: "/job:localhost/replica:0/task:0/cpu:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@hidden_weights"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 784
+          }
+          dim {
+            size: 100
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "use_locking"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "GradientDescent"
+  op: "NoOp"
+  input: "^GradientDescent/update_hidden_weights/ApplyGradientDescent"
+  input: "^GradientDescent/update_hidden_biases/ApplyGradientDescent"
+  input: "^GradientDescent/update_logits_weights/ApplyGradientDescent"
+  input: "^GradientDescent/update_logits_biases/ApplyGradientDescent"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_2"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+      }
+    }
+  }
+}
+node {
+  name: "Reshape_3"
+  op: "Reshape"
+  input: "SoftmaxCrossEntropyWithLogits"
+  input: "Slice_2"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "Mean"
+  op: "Mean"
+  input: "Reshape_3"
+  input: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+  attr {
+    key: "keep_dims"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "_send_Mean_0"
+  op: "_Send"
+  input: "Mean"
+  device: "/job:localhost/replica:0/task:0/cpu:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "client_terminated"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "recv_device"
+    value {
+      s: "/job:localhost/replica:0/task:0/cpu:0"
+    }
+  }
+  attr {
+    key: "send_device"
+    value {
+      s: "/job:localhost/replica:0/task:0/cpu:0"
+    }
+  }
+  attr {
+    key: "send_device_incarnation"
+    value {
+      i: -5924635994370253548
+    }
+  }
+  attr {
+    key: "tensor_name"
+    value {
+      s: "Mean:0"
+    }
+  }
+}
+library {
+}
+versions {
+  producer: 21
+}
diff --git a/tensorflow/tensorboard/components/tf_graph/demo/demo_datasets.json b/tensorflow/tensorboard/components/tf_graph/demo/demo_datasets.json
deleted file mode 100644
index f5ca9aada79aee9929facd68b4737ce58de35378..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/components/tf_graph/demo/demo_datasets.json
+++ /dev/null
@@ -1,123 +0,0 @@
-[
-  {
-    "name": "Mnist Eval",
-    "path": "mnist_eval.pbtxt"
-  },
-  {
-    "name": "Mnist with summaries (+stats)",
-    "path": "mnist_with_summaries.pbtxt",
-    "runMetadata": [
-      {
-        "tag": "step100",
-        "path": "mnist_with_summaries_step100.pbtxt"
-      },
-      {
-        "tag": "step1000",
-        "path": "mnist_with_summaries_step1000.pbtxt"
-      }
-    ]
-  },
-  {
-    "name": "Mnist Train (with shapes)",
-    "path": "mnist_train_shapes.pbtxt"
-  },
-  {
-    "name": "Inception Train (huge)",
-    "path": "inception_train.pbtxt"
-  },
-  {
-    "name": "Inception Train Eval",
-    "path": "inception_train_eval.pbtxt"
-  },
-  {
-    "name": "Inception Test",
-    "path": "inception_test_eval.pbtxt"
-  },
-  {
-    "name": "PTB Word LSTM Train",
-    "path": "ptb_word_lstm_train.pbtxt"
-  },
-  {
-    "name": "PTB Word LSTM Train Eval",
-    "path": "ptb_word_lstm_train_eval.pbtxt"
-  },
-  {
-    "name": "PTB Word LSTM Test",
-    "path": "ptb_word_lstm_test_eval.pbtxt"
-  },
-  {
-    "name": "Cifar10 Train (+stats)",
-    "path": "cifar10_train.pbtxt",
-    "runMetadata": [
-      {
-        "tag": "step0",
-        "path": "cifar10_train_step0.pbtxt"
-      },
-      {
-        "tag": "step100",
-        "path": "cifar10_train_step100.pbtxt"
-      },
-      {
-        "tag": "step200",
-        "path": "cifar10_train_step200.pbtxt"
-      },
-      {
-        "tag": "step300",
-        "path": "cifar10_train_step300.pbtxt"
-      }
-    ]
-  },
-  {
-    "name": "Cifar10 Multi-GPU Train",
-    "path": "cifar10_multi_gpu_train.pbtxt"
-  },
-  {
-    "name": "Cifar10 Eval (+stats)",
-    "path": "cifar10_eval.pbtxt",
-    "runMetadata": [
-      {
-        "tag": "step0",
-        "path": "cifar10_eval_step0.pbtxt"
-      },
-      {
-        "tag": "step10",
-        "path": "cifar10_eval_step10.pbtxt"
-      },
-      {
-        "tag": "step20",
-        "path": "cifar10_eval_step20.pbtxt"
-      }
-    ]
-  },
-  {
-    "name": "Fatcat LSTM",
-    "path": "fatcat_lstm.pbtxt"
-  },
-  {
-    "name": "Legacy Inception Renamed",
-    "path": "legacy_inception_renamed.pbtxt"
-  },
-  {
-    "name": "Wolfe (Broken)",
-    "path": "wolfe1.pbtxt"
-  },
-  {
-    "name": "Wolfe (Fixed)",
-    "path": "wolfe2.pbtxt"
-  },
-  {
-    "id": "alex",
-    "name": "AlexNet",
-    "path": "alexnet.pbtxt"
-  },
-  {
-    "id": "alexprivate",
-    "name": "AlexNet Private",
-    "path": "alexnet.pbtxt",
-    "private": true
-  },
-  {
-    "name": "TestError404",
-    "path": "nofile"
-  }
-]
diff --git a/tensorflow/tensorboard/components/tf_graph/demo/index.html b/tensorflow/tensorboard/components/tf_graph/demo/index.html
index c89490f44d429b6fb907a84ff1c6b7228f585ed4..52e2f0b9340950ed5f873cba17c8bbf2aee62e6a 100644
--- a/tensorflow/tensorboard/components/tf_graph/demo/index.html
+++ b/tensorflow/tensorboard/components/tf_graph/demo/index.html
@@ -15,32 +15,78 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 -->
+<script src="../../webcomponentsjs/webcomponents-lite.min.js"></script>
+<link rel="import" href="../tf-graph.html">
+<link rel="import" href="../../tf-graph-common/tf-graph-common.html">
+<link rel="import" href="../../tf-graph-loader/tf-graph-loader.html">
+<link rel="import" href="../../iron-demo-helpers/demo-snippet.html">
+<title>TF Graph Demo</title>
+<style>
+  #demo-container {
+    border: 2px solid #808080;
+    width: 1000px;
+    height: 600px;
+  }
+</style>
+<demo-snippet>
+  <template>
+    <dom-module id="tf-graph-demo">
+      <template>
+        <!-- We first use the graph loader to load and parse a pbtxt file into a graph object. -->
+        <tf-graph-loader
+            id="loader"
+            datasets="[[_datasets]]"
+            selected-dataset="[[_selectedDataset]]"
+            out-graph="{{_graph}}">
+        </tf-graph-loader>
 
-<html>
-  <head>
-    <meta charset="utf-8">
-    <meta name="viewport" content="width=device-width, minimum-scale=1.0, initial-scale=1.0, user-scalable=yes">
-    <title>tf-graph Demo</title>
-    <!-- Libraries that should be imported in TensorBoard when the Graph visualizer ports to TensorBoard -->
-    <script src="../../webcomponentsjs/webcomponents-lite.min.js"></script>
-    <script src="../../es6-promise/promise.min.js"></script>
-    <link rel="import" href="tf-graph-demo.html">
-    <style>
-      html {
-        width: 100%;
-        height: 100%;
-      }
+        <!-- We color ops in the graph by XLA cluster. -->
+        <tf-graph id="graph" color-by="xla_cluster"></tf-graph>
+      </template>
+      <script>
+        "use strict";
 
-      body {
-        margin: 0;
-        padding: 0;
-        width: 100%;
-        height: 100%;
-      }
-    </style>
-  </head>
+        Polymer({
+          is: "tf-graph-demo",
+          properties: {
+            // We tell the graph loader to load a specific pbtxt file.
+            _datasets: {
+              type: Array,
+              value: [{
+                "name": "Graph with XLA Clusters Specified",
+                "path": "data/graph.pbtxt"
+              }],
+            },
+            _selectedDataset: {
+              type: Number,
+              value: 0,
+            },
 
-  <body unresolved>
-    <tf-graph-demo></tf-graph-demo>
-  </body>
-</html>
+            // This property will be updated by the graph loader.
+            _graph: {
+              type: Object,
+            },
+          },
+          observers: [
+            '_graphUpdated(_graph)',
+          ],
+          _graphUpdated: function(slimGraph) {
+            const tracker = tf.graph.util.getTracker(this.$.loader);
+            const hierarchyTracker = tf.graph.util.getSubtaskTracker(
+                tracker, 100, 'Namespace hierarchy');
+            const hierarchyOptions = {};
+            tf.graph.hierarchy.build(slimGraph, hierarchyOptions, hierarchyTracker).then(
+                function(graphHierarchy) {
+              // We have parsed and built the graph object from a pbtxt file. Render the graph.
+              this.$.graph.set('basicGraph', slimGraph);
+              this.$.graph.set('graphHierarchy', graphHierarchy);
+            }.bind(this));
+          },
+        });
+      </script>
+    </dom-module>
+    <div id='demo-container'>
+      <tf-graph-demo></tf-graph-demo>
+    </div>
+  </template>
+</demo-snippet>
diff --git a/tensorflow/tensorboard/components/tf_graph/demo/tf-graph-demo.html b/tensorflow/tensorboard/components/tf_graph/demo/tf-graph-demo.html
deleted file mode 100644
index d5fd41dfebeb61471183ab3de8a6cb239d86983b..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/components/tf_graph/demo/tf-graph-demo.html
+++ /dev/null
@@ -1,202 +0,0 @@
-<!--
-@license
-Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
--->
-
-<link rel="import" href="../../polymer/polymer.html">
-<link rel="import" href="../../tf-graph-board/tf-graph-board.html">
-<link rel="import" href="../../tf-graph-loader/tf-graph-loader.html">
-<link rel="import" href="../../tf-graph/tf-graph-controls.html">
-
-<!--
-Element for tf-graph demo page
-
-Example:
-
-<tf-graph-demo></tf-graph-demo>
--->
-<dom-module id="tf-graph-demo">
-<template>
-<style>
-
-:host /deep/ {
-  font-family: 'Roboto', sans-serif;
-}
-
-.main {
-  position: absolute;
-  right: 0;
-  left: 250px;
-  height: 100%;
-}
-
-.side {
-  position: absolute;
-  left: 0;
-  width: 250px;
-  height: 100%;
-  border: 1px solid black;
-  box-sizing: border-box;
-}
-
-.all {
-  position: relative;
-  width: 100%;
-  height: 100%
-}
-
-</style>
-<div class="all">
-  <div class="side">
-    <!-- The observatory header component is injected in during vulcanization
-         and an instance of it is initialized and filled here when the demo
-         app initializes. -->
-    <div id="observatory-header"></div>
-    <tf-graph-controls
-        devices-for-stats="{{_devicesForStats}}"
-        color-by-params="[[colorByParams]]"
-        stats="[[stats]]"
-        color-by="{{colorBy}}"
-        datasets="[[datasets]]"
-        render-hierarchy="[[_renderHierarchy]]"
-        selected-dataset="{{selectedDataset}}"
-        selected-file="{{selectedFile}}"
-        selected-metadata-tag="{{selectedMetadataTag}}"
-        show-session-runs-dropdown="[[showSessionRunsDropdown]]"
-        show-upload-button="[[showUploadButton]]"
-    ></tf-graph-controls>
-    <tf-graph-loader id="loader"
-        datasets="[[datasets]]"
-        selected-dataset="[[selectedDataset]]"
-        selected-metadata-tag="[[selectedMetadataTag]]"
-        selected-file="[[selectedFile]]"
-        out-graph-hierarchy="{{graphHierarchy}}"
-        out-graph="{{graph}}"
-        out-stats="{{stats}}"
-        progress="{{_progress}}"
-        out-hierarchy-params="{{_hierarchyParams}}"
-    ></tf-graph-loader>
-  </div>
-  <div class="main">
-    <tf-graph-board id="graphboard"
-        color-by="[[colorBy]]"
-        color-by-params="{{colorByParams}}"
-        devices-for-stats="[[_devicesForStats]]"
-        graph-hierarchy="[[graphHierarchy]]"
-        graph="[[graph]]"
-        hierarchy-params="[[_hierarchyParams]]"
-        progress="[[_progress]]"
-        render-hierarchy="{{_renderHierarchy}}"
-        stats="[[stats]]"
-    ></tf-graph-board>
-  </div>
-</div>
-</template>
-</dom-module>
-
-<script>
-(function(){
-
-Polymer({
-  is: 'tf-graph-demo',
-  properties: {
-    datasets: {
-      type: Object
-    },
-    selectedDataset: {
-      type: Number,
-      value: 0,
-      observer: '_selectedDatasetChanged'
-    },
-    _renderHierarchy: Object,
-    _progress: Object,
-    showSessionRunsDropdown: {
-      type: Boolean,
-      value: true
-    },
-    showUploadButton: {
-      type: Boolean,
-      value: true
-    }
-  },
-  created: function() {
-    let queryParams = tf.graph.util.getQueryParams(location.search);
-    let selectedDataset = 0;
-
-    if (typeof DEMO_DATASETS === 'undefined') {
-      DEMO_DATASETS = 'demo_datasets.json';
-    }
-
-    d3.json(DEMO_DATASETS, function (error, datasets) {
-        let publicDatasets = [];
-
-        if (error) {
-          console.log('Error loading demo datasets:');
-          console.log(error);
-          return;
-        }
-
-        if(typeof DEMO_DIR_PREFIX === 'undefined') {
-          DEMO_DIR_PREFIX = 'tf_model_zoo/';
-        }
-        _.each(datasets, function(dataset, index) {
-          if (queryParams['graphid'] && dataset.id == queryParams['graphid']) {
-            selectedDataset = index;
-          } else if (dataset['private']) {
-            return;
-          }
-
-          dataset.path = this._normalizePath(dataset.path);
-          if (dataset.runMetadata != null) {
-            _.each(dataset.runMetadata, function(metadata) {
-              metadata.path = this._normalizePath(metadata.path);
-            }, this);
-          }
-          publicDatasets.push(dataset);
-        }, this);
-        this.set('datasets', publicDatasets);
-        if (selectedDataset != 0) {
-          this.set('selectedDataset', selectedDataset);
-        }
-    }.bind(this));
-  },
-  ready: function() {
-    if (typeof IS_OBSERVATORY !== 'undefined' && IS_OBSERVATORY) {
-      // Create the header and add it to the DOM. This component is injected in
-      // during vulcanization.
-      document.getElementById('observatory-header').appendChild(
-          document.createElement('tf-graph-observatory-header'));
-
-      this.set('showSessionRunsDropdown', false);
-      this.set('showUploadButton', false);
-    }
-  },
-  _normalizePath: function(path) {
-    return this.resolveUrl(DEMO_DIR_PREFIX + path);
-  },
-  _selectedDatasetChanged: function() {
-    if (this.datasets) {
-      let dataset = this.datasets[this.selectedDataset];
-      let queryParams = '';
-      if (dataset['id']) {
-        queryParams = '?graphid=' + dataset['id'];
-      }
-      window.history.replaceState(
-          null, null, location.pathname + queryParams);
-    }
-  }
-});
-})();
-</script>
diff --git a/tensorflow/tensorboard/components/tf_graph_app/BUILD b/tensorflow/tensorboard/components/tf_graph_app/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..499bd621c81daceab72df4f4fb8e712973936a56
--- /dev/null
+++ b/tensorflow/tensorboard/components/tf_graph_app/BUILD
@@ -0,0 +1,59 @@
+package(default_visibility = ["//tensorflow:internal"])
+
+load("@io_bazel_rules_closure//closure:defs.bzl", "web_library")
+load("//tensorflow/tensorboard:defs.bzl", "tensorboard_ts_library")
+load("//tensorflow/tensorboard:defs.bzl", "tensorboard_webcomponent_library")
+
+licenses(["notice"])  # Apache 2.0
+
+web_library(
+    name = "tf_graph_app",
+    srcs = [
+        "index.html",
+        "tf-graph-app.html",
+    ],
+    path = "/tf-graph-app",
+    deps = [
+        "//tensorflow/tensorboard/components/tf_graph_board",
+        "//tensorflow/tensorboard/components/tf_graph_controls",
+        "//tensorflow/tensorboard/components/tf_graph_loader",
+        "@org_polymer",
+        "@org_polymer_iron_component_page",
+        "@org_polymer_webcomponentsjs",
+    ],
+)
+
+filegroup(
+    name = "all_files",
+    srcs = glob(["**"]),
+    tags = ["notsan"],
+)
+
+################################################################################
+# MARKED FOR DELETION
+
+tensorboard_webcomponent_library(
+    name = "legacy",
+    srcs = [
+        "index.html",
+        "tf-graph-app.html",
+    ],
+    destdir = "tf-graph-app",
+    deps = [
+        "//tensorflow/tensorboard/components/tf_graph_board:legacy",
+        "//tensorflow/tensorboard/components/tf_graph_controls:legacy",
+        "//tensorflow/tensorboard/components/tf_graph_loader:legacy",
+        "//third_party/javascript/polymer/v1/iron-list:lib",
+        "//third_party/javascript/polymer/v1/paper-radio-group:lib",
+        "//third_party/javascript/polymer/v1/paper-tooltip:lib",
+    ],
+)
+
+# This is needed despite how this component lacks TypeScript files because
+# components/BUILD seeks a legacy_ts rule in this package.
+tensorboard_ts_library(
+    name = "legacy_ts",
+    srcs = [],
+    deps_mgmt = "off",
+    runtime = "nodejs",
+)
diff --git a/tensorflow/tensorboard/components/tf_graph_app/demo/BUILD b/tensorflow/tensorboard/components/tf_graph_app/demo/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..147cb0947c4293edba7f3e2c942f6793dc892418
--- /dev/null
+++ b/tensorflow/tensorboard/components/tf_graph_app/demo/BUILD
@@ -0,0 +1,23 @@
+package(default_visibility = ["//tensorflow:internal"])
+
+load("@io_bazel_rules_closure//closure:defs.bzl", "web_library")
+
+licenses(["notice"])  # Apache 2.0
+
+# bazel run //third_party/tensorflow/tensorboard/components/tf_graph_app/demo
+web_library(
+    name = "demo",
+    srcs = ["index.html"] + glob(["data/**"]),
+    path = "/tf-graph-app/demo",
+    deps = [
+        "//tensorflow/tensorboard/components/tf_graph_app",
+        "@org_polymer_iron_demo_helpers",
+        "@org_polymer_paper_styles",
+    ],
+)
+
+filegroup(
+    name = "all_files",
+    srcs = glob(["**"]),
+    tags = ["notsan"],
+)
diff --git a/tensorflow/tensorboard/components/tf_graph_app/demo/graph.pbtxt b/tensorflow/tensorboard/components/tf_graph_app/demo/data/graph.pbtxt
similarity index 100%
rename from tensorflow/tensorboard/components/tf_graph_app/demo/graph.pbtxt
rename to tensorflow/tensorboard/components/tf_graph_app/demo/data/graph.pbtxt
diff --git a/tensorflow/tensorboard/components/tf_graph_app/demo/index.html b/tensorflow/tensorboard/components/tf_graph_app/demo/index.html
index 0897cdd08bd440a18e7a4a7f39e4a53036a5e958..f71feea390a958b447e046e815cb36ec2152a1aa 100644
--- a/tensorflow/tensorboard/components/tf_graph_app/demo/index.html
+++ b/tensorflow/tensorboard/components/tf_graph_app/demo/index.html
@@ -15,31 +15,20 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 -->
-
-<html>
-<head>
-  <meta charset="utf-8">
-  <meta name="viewport" content="width=device-width, initial-scale=1.0">
-  <link rel="import" href="../tf-graph-app.html">
-    <link rel="import" href="../../iron-demo-helpers/demo-snippet.html">
-  <style>
-  body {
-    margin: 0;
+<meta charset="utf-8">
+<meta name="viewport" content="width=device-width, initial-scale=1.0">
+<link rel="import" href="../tf-graph-app.html">
+<link rel="import" href="../../iron-demo-helpers/demo-snippet.html">
+<style>
+  /** Make the graph app tall enough so the bottom legend does not overlap with the top. */
+  tf-graph-app, .container.tf-graph-app {
+    display: block;
+    height: 700px;
   }
-  </style>
-</head>
-<body>
-  <h3>Answer to the Ultimate Question of Life, the Universe, and Everything</h3>
-  <demo-snippet>
-    <template>
-      <tf-graph-app id="tfgraph"></tf-graph-app>
-      <script>
-        let g = document.querySelector("#tfgraph");
-        fetch("graph.pbtxt", {credentials: "include"}).then(r => r.text()).then(pbtxt => {
-          g.pbtxt = pbtxt;
-        });
-      </script>
-    </template>
-  </demo-snippet>
-</body>
-</html>
+</style>
+<h3>Answer to the Ultimate Question of Life, the Universe, and Everything</h3>
+<demo-snippet>
+  <template>
+    <tf-graph-app pbtxt-file-location="data/graph.pbtxt"></tf-graph-app>
+  </template>
+</demo-snippet>
diff --git a/tensorflow/tensorboard/components/tf_graph_app/tf-graph-app.html b/tensorflow/tensorboard/components/tf_graph_app/tf-graph-app.html
index 07308d38e41c3ec2d4cd7b6243809f200de56951..915b54a06a9efe5e2bcbd60edcd2021df3304ce3 100644
--- a/tensorflow/tensorboard/components/tf_graph_app/tf-graph-app.html
+++ b/tensorflow/tensorboard/components/tf_graph_app/tf-graph-app.html
@@ -18,7 +18,7 @@ limitations under the License.
 <link rel="import" href="../polymer/polymer.html">
 <link rel="import" href="../tf-graph-board/tf-graph-board.html">
 <link rel="import" href="../tf-graph-loader/tf-graph-loader.html">
-<link rel="import" href="../tf-graph/tf-graph-controls.html">
+<link rel="import" href="../tf-graph-controls/tf-graph-controls.html">
 
 <!--
 Stand alone element of tf-graph for embedding.
@@ -111,17 +111,42 @@ Polymer({
   is: 'tf-graph-app',
   properties: {
     stats: Object,
+
+    // To use tf-graph-app, specify one of these 2 properties. Provide either
+    // 1. The path to a pbtxt file to load (pbtxtFileLocation). This option nicely makes the
+    //    progress bar include the time it takes to load the file across the network. The path could
+    //    be either a relative path or an absolute URL (of a resource that supports CORS).
+    // 2. The raw contents of a pbtxt file (pbtxt).
+    // Do not set both of these 2 properties.
+    pbtxtFileLocation: {
+      type: String,
+      observer: '_updateGraph',
+    },
     pbtxt: {
       type: String,
       observer: '_updateGraph',
     },
+
     _renderHierarchy: Object,
-    _progress: Object
+    _progress: Object,
   },
   _updateGraph: function() {
-    var blob = new Blob([this.pbtxt]);
-    this.$.loader._parseAndConstructHierarchicalGraph(null, blob);
-  }
+    if (this.pbtxtFileLocation) {
+      // Fetch a pbtxt file. The fetching will be part of the loading sequence.
+      this.$.loader.datasets = [{
+        // Just name the dataset based on the file location.
+        "name": this.pbtxtFileLocation,
+        "path": this.pbtxtFileLocation,
+      }];
+      this.$.loader.set('selectedDataset', 0);
+    } else if (this.pbtxt) {
+      // Render the provided pbtxt.
+      var blob = new Blob([this.pbtxt]);
+
+      // TODO(chizeng): Find out why we call a private method here and do away with the call.
+      this.$.loader._parseAndConstructHierarchicalGraph(null, blob);
+    }
+  },
 });
 })();
 </script>
diff --git a/tensorflow/tensorboard/components/tf_graph_app_d3v4/BUILD b/tensorflow/tensorboard/components/tf_graph_app_d3v4/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..6da62438931f58e7f31097547c1536f2ffde676d
--- /dev/null
+++ b/tensorflow/tensorboard/components/tf_graph_app_d3v4/BUILD
@@ -0,0 +1,59 @@
+package(default_visibility = ["//tensorflow:internal"])
+
+load("@io_bazel_rules_closure//closure:defs.bzl", "web_library")
+load("//tensorflow/tensorboard:defs.bzl", "tensorboard_ts_library")
+load("//tensorflow/tensorboard:defs.bzl", "tensorboard_webcomponent_library")
+
+licenses(["notice"])  # Apache 2.0
+
+web_library(
+    name = "tf_graph_app_d3v4",
+    srcs = [
+        "index.html",
+        "tf-graph-app.html",
+    ],
+    path = "/tf-graph-app",
+    deps = [
+        "//tensorflow/tensorboard/components/tf_graph_board_d3v4",
+        "//tensorflow/tensorboard/components/tf_graph_controls_d3v4",
+        "//tensorflow/tensorboard/components/tf_graph_loader_d3v4",
+        "@org_polymer",
+        "@org_polymer_iron_component_page",
+        "@org_polymer_webcomponentsjs",
+    ],
+)
+
+filegroup(
+    name = "all_files",
+    srcs = glob(["**"]),
+    tags = ["notsan"],
+)
+
+################################################################################
+# MARKED FOR DELETION
+
+tensorboard_webcomponent_library(
+    name = "legacy",
+    srcs = [
+        "index.html",
+        "tf-graph-app.html",
+    ],
+    destdir = "tf-graph-app",
+    deps = [
+        "//tensorflow/tensorboard/components/tf_graph_board:legacy",
+        "//tensorflow/tensorboard/components/tf_graph_controls:legacy",
+        "//tensorflow/tensorboard/components/tf_graph_loader:legacy",
+        "//third_party/javascript/polymer/v1/iron-list:lib",
+        "//third_party/javascript/polymer/v1/paper-radio-group:lib",
+        "//third_party/javascript/polymer/v1/paper-tooltip:lib",
+    ],
+)
+
+# This is needed despite how this component lacks TypeScript files because
+# components/BUILD seeks a legacy_ts rule in this package.
+tensorboard_ts_library(
+    name = "legacy_ts",
+    srcs = [],
+    deps_mgmt = "off",
+    runtime = "nodejs",
+)
diff --git a/tensorflow/tensorboard/components/tf_graph_app_d3v4/demo/BUILD b/tensorflow/tensorboard/components/tf_graph_app_d3v4/demo/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..147cb0947c4293edba7f3e2c942f6793dc892418
--- /dev/null
+++ b/tensorflow/tensorboard/components/tf_graph_app_d3v4/demo/BUILD
@@ -0,0 +1,23 @@
+package(default_visibility = ["//tensorflow:internal"])
+
+load("@io_bazel_rules_closure//closure:defs.bzl", "web_library")
+
+licenses(["notice"])  # Apache 2.0
+
+# bazel run //third_party/tensorflow/tensorboard/components/tf_graph_app/demo
+web_library(
+    name = "demo",
+    srcs = ["index.html"] + glob(["data/**"]),
+    path = "/tf-graph-app/demo",
+    deps = [
+        "//tensorflow/tensorboard/components/tf_graph_app",
+        "@org_polymer_iron_demo_helpers",
+        "@org_polymer_paper_styles",
+    ],
+)
+
+filegroup(
+    name = "all_files",
+    srcs = glob(["**"]),
+    tags = ["notsan"],
+)
diff --git a/tensorflow/tensorboard/components/tf_graph_app_d3v4/demo/data/graph.pbtxt b/tensorflow/tensorboard/components/tf_graph_app_d3v4/demo/data/graph.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..8b95b258df4806dcf84e3b4c1c14cd0434df8910
--- /dev/null
+++ b/tensorflow/tensorboard/components/tf_graph_app_d3v4/demo/data/graph.pbtxt
@@ -0,0 +1,90 @@
+node {
+  name: "life"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 2
+      }
+    }
+  }
+}
+node {
+  name: "universe"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 40
+      }
+    }
+  }
+}
+node {
+  name: "everything"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "Add"
+  op: "Add"
+  input: "life"
+  input: "universe"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "answer"
+  op: "Add"
+  input: "Add"
+  input: "everything"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+versions {
+  producer: 10
+}
diff --git a/tensorflow/tensorboard/components/tf_graph_app_d3v4/demo/index.html b/tensorflow/tensorboard/components/tf_graph_app_d3v4/demo/index.html
new file mode 100644
index 0000000000000000000000000000000000000000..f71feea390a958b447e046e815cb36ec2152a1aa
--- /dev/null
+++ b/tensorflow/tensorboard/components/tf_graph_app_d3v4/demo/index.html
@@ -0,0 +1,34 @@
+<!doctype html>
+<!--
+@license
+Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+-->
+<meta charset="utf-8">
+<meta name="viewport" content="width=device-width, initial-scale=1.0">
+<link rel="import" href="../tf-graph-app.html">
+<link rel="import" href="../../iron-demo-helpers/demo-snippet.html">
+<style>
+  /** Make the graph app tall enough so the bottom legend does not overlap with the top. */
+  tf-graph-app, .container.tf-graph-app {
+    display: block;
+    height: 700px;
+  }
+</style>
+<h3>Answer to the Ultimate Question of Life, the Universe, and Everything</h3>
+<demo-snippet>
+  <template>
+    <tf-graph-app pbtxt-file-location="data/graph.pbtxt"></tf-graph-app>
+  </template>
+</demo-snippet>
diff --git a/tensorflow/tensorboard/components/tf_graph_app_d3v4/index.html b/tensorflow/tensorboard/components/tf_graph_app_d3v4/index.html
new file mode 100644
index 0000000000000000000000000000000000000000..c80fbf4f632696ba48c424599b7a84eeb77ecead
--- /dev/null
+++ b/tensorflow/tensorboard/components/tf_graph_app_d3v4/index.html
@@ -0,0 +1,30 @@
+<!doctype html>
+<!--
+@license
+Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+-->
+
+<html>
+  <head>
+    <title>vz-vega</title>
+    <meta charset="utf-8">
+    <meta name="viewport" content="width=device-width, initial-scale=1.0">
+    <script src="../webcomponentsjs/webcomponents-lite.js"></script>
+    <link rel="import" href="../iron-component-page/iron-component-page.html">
+  </head>
+  <body>
+    <iron-component-page src="tf-graph-app.html"></iron-component-page>
+  </body>
+</html>
diff --git a/tensorflow/tensorboard/components/tf_graph_app_d3v4/tf-graph-app.html b/tensorflow/tensorboard/components/tf_graph_app_d3v4/tf-graph-app.html
new file mode 100644
index 0000000000000000000000000000000000000000..915b54a06a9efe5e2bcbd60edcd2021df3304ce3
--- /dev/null
+++ b/tensorflow/tensorboard/components/tf_graph_app_d3v4/tf-graph-app.html
@@ -0,0 +1,152 @@
+<!--
+@license
+Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+-->
+
+<link rel="import" href="../polymer/polymer.html">
+<link rel="import" href="../tf-graph-board/tf-graph-board.html">
+<link rel="import" href="../tf-graph-loader/tf-graph-loader.html">
+<link rel="import" href="../tf-graph-controls/tf-graph-controls.html">
+
+<!--
+Stand alone element of tf-graph for embedding.
+
+The pbtxt format is the stringified version of the graphdef.
+
+    <tf-graph-app pbtxt="[[pbtxt]]"></tf-graph-app>
+
+    import tensorflow as tf
+    life = tf.constant(2, name='life')
+    universe = tf.constant(40, name='universe')
+    everything = tf.constant(0, name='everything')
+    lifeuniverse = tf.add(life, universe)
+    answer = tf.add(lifeuniverse, everything, name='answer')
+    open("graph.pbtxt", "w").write(str(tf.get_default_graph().as_graph_def()))
+
+@demo demo/index.html
+-->
+<dom-module id="tf-graph-app">
+<template>
+<style>
+
+:host /deep/ {
+  font-family: 'Roboto', sans-serif;
+}
+
+.main {
+  position: absolute;
+  right: 0;
+  left: 250px;
+  height: 100%;
+}
+
+.side {
+  position: absolute;
+  left: 0;
+  width: 250px;
+  height: 100%;
+  border: 1px solid black;
+  box-sizing: border-box;
+}
+
+.all {
+  position: relative;
+  width: 100%;
+  height: 100%
+}
+
+.container {
+  height: 650px;
+}
+
+</style>
+<div class="container">
+  <div class="all">
+    <div class="side">
+      <tf-graph-controls
+          color-by-params="[[colorByParams]]"
+          stats="[[stats]]"
+          color-by="{{colorBy}}"
+          render-hierarchy="[[_renderHierarchy]]"
+      ></tf-graph-controls>
+      <tf-graph-loader id="loader"
+          out-graph-hierarchy="{{graphHierarchy}}"
+          out-graph="{{graph}}"
+          out-stats="{{stats}}"
+          progress="{{_progress}}"
+      ></tf-graph-loader>
+    </div>
+    <div class="main">
+      <tf-graph-board id="graphboard"
+          graph-hierarchy="[[graphHierarchy]]"
+          graph="[[graph]]"
+          stats="[[stats]]"
+          progress="[[_progress]]"
+          color-by="[[colorBy]]"
+          color-by-params="{{colorByParams}}"
+          render-hierarchy="{{_renderHierarchy}}"
+      ></tf-graph-board>
+    </div>
+  </div>
+</div>
+</template>
+</dom-module>
+
+<script>
+(function(){
+
+Polymer({
+  is: 'tf-graph-app',
+  properties: {
+    stats: Object,
+
+    // To use tf-graph-app, specify one of these 2 properties. Provide either
+    // 1. The path to a pbtxt file to load (pbtxtFileLocation). This option nicely makes the
+    //    progress bar include the time it takes to load the file across the network. The path could
+    //    be either a relative path or an absolute URL (of a resource that supports CORS).
+    // 2. The raw contents of a pbtxt file (pbtxt).
+    // Do not set both of these 2 properties.
+    pbtxtFileLocation: {
+      type: String,
+      observer: '_updateGraph',
+    },
+    pbtxt: {
+      type: String,
+      observer: '_updateGraph',
+    },
+
+    _renderHierarchy: Object,
+    _progress: Object,
+  },
+  _updateGraph: function() {
+    if (this.pbtxtFileLocation) {
+      // Fetch a pbtxt file. The fetching will be part of the loading sequence.
+      this.$.loader.datasets = [{
+        // Just name the dataset based on the file location.
+        "name": this.pbtxtFileLocation,
+        "path": this.pbtxtFileLocation,
+      }];
+      this.$.loader.set('selectedDataset', 0);
+    } else if (this.pbtxt) {
+      // Render the provided pbtxt.
+      var blob = new Blob([this.pbtxt]);
+
+      // TODO(chizeng): Find out why we call a private method here and do away with the call.
+      this.$.loader._parseAndConstructHierarchicalGraph(null, blob);
+    }
+  },
+});
+})();
+</script>
diff --git a/tensorflow/tensorboard/components/tf_graph_board/BUILD b/tensorflow/tensorboard/components/tf_graph_board/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..4358e601f46815a63b67adfe57505b562e74ca9c
--- /dev/null
+++ b/tensorflow/tensorboard/components/tf_graph_board/BUILD
@@ -0,0 +1,51 @@
+package(default_visibility = ["//tensorflow:internal"])
+
+load("@io_bazel_rules_closure//closure:defs.bzl", "web_library")
+load("//tensorflow/tensorboard:defs.bzl", "tensorboard_ts_library")
+load("//tensorflow/tensorboard:defs.bzl", "tensorboard_webcomponent_library")
+
+licenses(["notice"])  # Apache 2.0
+
+web_library(
+    name = "tf_graph_board",
+    srcs = [
+        "tf-graph-board.html",
+    ],
+    path = "/tf-graph-board",
+    deps = [
+        "//tensorflow/tensorboard/components/tf_graph",
+        "//tensorflow/tensorboard/components/tf_graph_common",
+        "//tensorflow/tensorboard/components/tf_graph_info",
+        "@org_polymer",
+        "@org_polymer_paper_progress",
+    ],
+)
+
+filegroup(
+    name = "all_files",
+    srcs = glob(["**"]),
+    tags = ["notsan"],
+)
+
+################################################################################
+# MARKED FOR DELETION
+
+tensorboard_webcomponent_library(
+    name = "legacy",
+    srcs = [
+        "tf-graph-board.html",
+    ],
+    destdir = "tf-graph-board",
+    deps = [
+        "//tensorflow/tensorboard/components/tf_graph:legacy",
+        "//tensorflow/tensorboard/components/tf_graph_common:legacy",
+        "//tensorflow/tensorboard/components/tf_graph_info:legacy",
+    ],
+)
+
+# This is needed despite how this component lacks TypeScript files because
+# components/BUILD seeks a legacy_ts rule in this package.
+tensorboard_ts_library(
+    name = "legacy_ts",
+    srcs = [],
+)
diff --git a/tensorflow/tensorboard/components/tf_graph_board/demo/BUILD b/tensorflow/tensorboard/components/tf_graph_board/demo/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..2d668769e62076260cbb391f6ecea99dc186878d
--- /dev/null
+++ b/tensorflow/tensorboard/components/tf_graph_board/demo/BUILD
@@ -0,0 +1,26 @@
+package(default_visibility = ["//tensorflow:internal"])
+
+load("@io_bazel_rules_closure//closure:defs.bzl", "web_library")
+
+licenses(["notice"])  # Apache 2.0
+
+# bazel run //third_party/tensorflow/tensorboard/components/tf_graph_board/demo
+web_library(
+    name = "demo",
+    srcs = ["index.html"] + glob(["data/**"]),
+    path = "/tf-graph-board/demo",
+    deps = [
+        "//tensorflow/tensorboard/components/tf_graph_board",
+        "//tensorflow/tensorboard/components/tf_graph_common",
+        "//tensorflow/tensorboard/components/tf_graph_loader",
+        "@org_polymer_iron_demo_helpers",
+        "@org_polymer_paper_styles",
+        "@org_polymer_webcomponentsjs",
+    ],
+)
+
+filegroup(
+    name = "all_files",
+    srcs = glob(["**"]),
+    tags = ["notsan"],
+)
diff --git a/tensorflow/tensorboard/components/tf_graph_board/demo/data/graph.pbtxt b/tensorflow/tensorboard/components/tf_graph_board/demo/data/graph.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..30b206453469801d31b46856c29cdda78164f18f
--- /dev/null
+++ b/tensorflow/tensorboard/components/tf_graph_board/demo/data/graph.pbtxt
@@ -0,0 +1,4606 @@
+node {
+  name: "GradientDescent/learning_rate"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_3"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: 0.1
+      }
+    }
+  }
+}
+node {
+  name: "gradients/add_grad/Shape_1"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 100
+      }
+    }
+  }
+}
+node {
+  name: "gradients/add_grad/Shape"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 2
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\310\000\000\000d\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "gradients/add_grad/BroadcastGradientArgs"
+  op: "BroadcastGradientArgs"
+  input: "gradients/add_grad/Shape"
+  input: "gradients/add_grad/Shape_1"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+        }
+        shape {
+          dim {
+            size: -1
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "gradients/add_1_grad/Shape_1"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 10
+      }
+    }
+  }
+}
+node {
+  name: "gradients/add_1_grad/Shape"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 2
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\310\000\000\000\n\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "gradients/add_1_grad/BroadcastGradientArgs"
+  op: "BroadcastGradientArgs"
+  input: "gradients/add_1_grad/Shape"
+  input: "gradients/add_1_grad/Shape_1"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+        }
+        shape {
+          dim {
+            size: -1
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "gradients/Reshape_1_grad/Shape"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 2
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\310\000\000\000\n\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "gradients/SoftmaxCrossEntropyWithLogits_grad/ExpandDims/dim"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: -1
+      }
+    }
+  }
+}
+node {
+  name: "gradients/Reshape_3_grad/Shape"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 200
+      }
+    }
+  }
+}
+node {
+  name: "gradients/Mean_grad/Maximum/y"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "gradients/Mean_grad/Const_1"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "gradients/Mean_grad/Const"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "gradients/Mean_grad/Shape_1"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "gradients/Mean_grad/Prod_1"
+  op: "Prod"
+  input: "gradients/Mean_grad/Shape_1"
+  input: "gradients/Mean_grad/Const_1"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+  attr {
+    key: "keep_dims"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "gradients/Mean_grad/Maximum"
+  op: "Maximum"
+  input: "gradients/Mean_grad/Prod_1"
+  input: "gradients/Mean_grad/Maximum/y"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+}
+node {
+  name: "gradients/Mean_grad/Shape"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 200
+      }
+    }
+  }
+}
+node {
+  name: "gradients/Mean_grad/Prod"
+  op: "Prod"
+  input: "gradients/Mean_grad/Shape"
+  input: "gradients/Mean_grad/Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+  attr {
+    key: "keep_dims"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "gradients/Mean_grad/floordiv"
+  op: "FloorDiv"
+  input: "gradients/Mean_grad/Prod"
+  input: "gradients/Mean_grad/Maximum"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+}
+node {
+  name: "gradients/Mean_grad/Cast"
+  op: "Cast"
+  input: "gradients/Mean_grad/floordiv"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "DstT"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "SrcT"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+}
+node {
+  name: "gradients/Mean_grad/Tile/multiples"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 200
+      }
+    }
+  }
+}
+node {
+  name: "gradients/Mean_grad/Reshape/shape"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "gradients/Const"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "gradients/Shape"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "gradients/Fill"
+  op: "Fill"
+  input: "gradients/Shape"
+  input: "gradients/Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+}
+node {
+  name: "gradients/Mean_grad/Reshape"
+  op: "Reshape"
+  input: "gradients/Fill"
+  input: "gradients/Mean_grad/Reshape/shape"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "gradients/Mean_grad/Tile"
+  op: "Tile"
+  input: "gradients/Mean_grad/Reshape"
+  input: "gradients/Mean_grad/Tile/multiples"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tmultiples"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "gradients/Mean_grad/truediv"
+  op: "RealDiv"
+  input: "gradients/Mean_grad/Tile"
+  input: "gradients/Mean_grad/Cast"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "gradients/Reshape_3_grad/Reshape"
+  op: "Reshape"
+  input: "gradients/Mean_grad/truediv"
+  input: "gradients/Reshape_3_grad/Shape"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "gradients/SoftmaxCrossEntropyWithLogits_grad/ExpandDims"
+  op: "ExpandDims"
+  input: "gradients/Reshape_3_grad/Reshape"
+  input: "gradients/SoftmaxCrossEntropyWithLogits_grad/ExpandDims/dim"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "Const"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "Slice_2/begin"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "Sub_2/y"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "concat_1/axis"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "concat_1/values_0"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: -1
+      }
+    }
+  }
+}
+node {
+  name: "Slice_1/size"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "Sub_1/y"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "Shape_2"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 2
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\310\000\000\000\n\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "Rank_2"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 2
+      }
+    }
+  }
+}
+node {
+  name: "Sub_1"
+  op: "Sub"
+  input: "Rank_2"
+  input: "Sub_1/y"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+}
+node {
+  name: "Slice_1/begin"
+  op: "Pack"
+  input: "Sub_1"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "N"
+    value {
+      i: 1
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "Slice_1"
+  op: "Slice"
+  input: "Shape_2"
+  input: "Slice_1/begin"
+  input: "Slice_1/size"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "concat_1"
+  op: "ConcatV2"
+  input: "concat_1/values_0"
+  input: "Slice_1"
+  input: "concat_1/axis"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 2
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "concat/axis"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "concat/values_0"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: -1
+      }
+    }
+  }
+}
+node {
+  name: "Slice/size"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "Sub/y"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "Shape_1"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 2
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\310\000\000\000\n\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "Rank_1"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 2
+      }
+    }
+  }
+}
+node {
+  name: "Sub"
+  op: "Sub"
+  input: "Rank_1"
+  input: "Sub/y"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+}
+node {
+  name: "Slice/begin"
+  op: "Pack"
+  input: "Sub"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "N"
+    value {
+      i: 1
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "Slice"
+  op: "Slice"
+  input: "Shape_1"
+  input: "Slice/begin"
+  input: "Slice/size"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "concat"
+  op: "ConcatV2"
+  input: "concat/values_0"
+  input: "Slice"
+  input: "concat/axis"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 2
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "Shape"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 2
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\310\000\000\000\n\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "Rank"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 2
+      }
+    }
+  }
+}
+node {
+  name: "Sub_2"
+  op: "Sub"
+  input: "Rank"
+  input: "Sub_2/y"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+}
+node {
+  name: "Slice_2/size"
+  op: "Pack"
+  input: "Sub_2"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "N"
+    value {
+      i: 1
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "Slice_2"
+  op: "Slice"
+  input: "Shape"
+  input: "Slice_2/begin"
+  input: "Slice_2/size"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "logits_biases"
+  op: "VariableV2"
+  device: "/job:localhost/replica:0/task:0/cpu:0"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@logits_biases"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 10
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "container"
+    value {
+      s: ""
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "shape"
+    value {
+      shape {
+        dim {
+          size: 10
+        }
+      }
+    }
+  }
+  attr {
+    key: "shared_name"
+    value {
+      s: ""
+    }
+  }
+}
+node {
+  name: "logits_biases/read"
+  op: "Identity"
+  input: "logits_biases"
+  device: "/job:localhost/replica:0/task:0/cpu:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@logits_biases"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 10
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "logits_weights"
+  op: "VariableV2"
+  device: "/job:localhost/replica:0/task:0/cpu:0"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@logits_weights"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 100
+          }
+          dim {
+            size: 10
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "container"
+    value {
+      s: ""
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "shape"
+    value {
+      shape {
+        dim {
+          size: 100
+        }
+        dim {
+          size: 10
+        }
+      }
+    }
+  }
+  attr {
+    key: "shared_name"
+    value {
+      s: ""
+    }
+  }
+}
+node {
+  name: "logits_weights/read"
+  op: "Identity"
+  input: "logits_weights"
+  device: "/job:localhost/replica:0/task:0/cpu:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@logits_weights"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 100
+          }
+          dim {
+            size: 10
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "hidden_biases"
+  op: "VariableV2"
+  device: "/job:localhost/replica:0/task:0/cpu:0"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@hidden_biases"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 100
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "container"
+    value {
+      s: ""
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "shape"
+    value {
+      shape {
+        dim {
+          size: 100
+        }
+      }
+    }
+  }
+  attr {
+    key: "shared_name"
+    value {
+      s: ""
+    }
+  }
+}
+node {
+  name: "hidden_biases/read"
+  op: "Identity"
+  input: "hidden_biases"
+  device: "/job:localhost/replica:0/task:0/cpu:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@hidden_biases"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 100
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "hidden_weights"
+  op: "VariableV2"
+  device: "/job:localhost/replica:0/task:0/cpu:0"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@hidden_weights"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 784
+          }
+          dim {
+            size: 100
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "container"
+    value {
+      s: ""
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "shape"
+    value {
+      shape {
+        dim {
+          size: 784
+        }
+        dim {
+          size: 100
+        }
+      }
+    }
+  }
+  attr {
+    key: "shared_name"
+    value {
+      s: ""
+    }
+  }
+}
+node {
+  name: "hidden_weights/read"
+  op: "Identity"
+  input: "hidden_weights"
+  device: "/job:localhost/replica:0/task:0/cpu:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@hidden_weights"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 784
+          }
+          dim {
+            size: 100
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "Reshape/shape"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/cpu:0"
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 2
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\310\000\000\000\377\377\377\377"
+      }
+    }
+  }
+}
+node {
+  name: "mnist_dataset_train_2/one_hot/depth"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/cpu:0"
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 10
+      }
+    }
+  }
+}
+node {
+  name: "mnist_dataset_train_2/one_hot/off_value"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/cpu:0"
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "mnist_dataset_train_2/one_hot/on_value"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/cpu:0"
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "mnist_dataset_train_2/random_shuffle_queue_DequeueMany/n"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/cpu:0"
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 200
+      }
+    }
+  }
+}
+node {
+  name: "mnist_dataset_train_1/random_shuffle_queue"
+  op: "RandomShuffleQueueV2"
+  device: "/job:localhost/replica:0/task:0/cpu:0"
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+  attr {
+    key: "capacity"
+    value {
+      i: 20000
+    }
+  }
+  attr {
+    key: "component_types"
+    value {
+      list {
+        type: DT_FLOAT
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    key: "container"
+    value {
+      s: ""
+    }
+  }
+  attr {
+    key: "min_after_dequeue"
+    value {
+      i: 4000
+    }
+  }
+  attr {
+    key: "seed"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "seed2"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 28
+          }
+          dim {
+            size: 28
+          }
+          dim {
+            size: 1
+          }
+        }
+        shape {
+        }
+      }
+    }
+  }
+  attr {
+    key: "shared_name"
+    value {
+      s: ""
+    }
+  }
+}
+node {
+  name: "mnist_dataset_train_2/random_shuffle_queue_DequeueMany"
+  op: "QueueDequeueManyV2"
+  input: "mnist_dataset_train_1/random_shuffle_queue"
+  input: "mnist_dataset_train_2/random_shuffle_queue_DequeueMany/n"
+  device: "/job:localhost/replica:0/task:0/cpu:0"
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          unknown_rank: true
+        }
+        shape {
+          unknown_rank: true
+        }
+      }
+    }
+  }
+  attr {
+    key: "component_types"
+    value {
+      list {
+        type: DT_FLOAT
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    key: "timeout_ms"
+    value {
+      i: -1
+    }
+  }
+}
+node {
+  name: "Reshape"
+  op: "Reshape"
+  input: "mnist_dataset_train_2/random_shuffle_queue_DequeueMany"
+  input: "Reshape/shape"
+  device: "/job:localhost/replica:0/task:0/cpu:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+          dim {
+            size: -1
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "MatMul"
+  op: "MatMul"
+  input: "Reshape"
+  input: "hidden_weights/read"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+          dim {
+            size: 100
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "transpose_a"
+    value {
+      b: false
+    }
+  }
+  attr {
+    key: "transpose_b"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "add"
+  op: "Add"
+  input: "MatMul"
+  input: "hidden_biases/read"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+          dim {
+            size: 100
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "Relu"
+  op: "Relu"
+  input: "add"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+          dim {
+            size: 100
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "MatMul_1"
+  op: "MatMul"
+  input: "Relu"
+  input: "logits_weights/read"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+          dim {
+            size: 10
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "transpose_a"
+    value {
+      b: false
+    }
+  }
+  attr {
+    key: "transpose_b"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "add_1"
+  op: "Add"
+  input: "MatMul_1"
+  input: "logits_biases/read"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+          dim {
+            size: 10
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "Reshape_1"
+  op: "Reshape"
+  input: "add_1"
+  input: "concat"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+          dim {
+            size: 10
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "mnist_dataset_train_2/one_hot"
+  op: "OneHot"
+  input: "mnist_dataset_train_2/random_shuffle_queue_DequeueMany:1"
+  input: "mnist_dataset_train_2/one_hot/depth"
+  input: "mnist_dataset_train_2/one_hot/on_value"
+  input: "mnist_dataset_train_2/one_hot/off_value"
+  device: "/job:localhost/replica:0/task:0/cpu:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "TI"
+    value {
+      type: DT_INT64
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          unknown_rank: true
+        }
+      }
+    }
+  }
+  attr {
+    key: "axis"
+    value {
+      i: -1
+    }
+  }
+}
+node {
+  name: "Reshape_2"
+  op: "Reshape"
+  input: "mnist_dataset_train_2/one_hot"
+  input: "concat_1"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 10
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "SoftmaxCrossEntropyWithLogits"
+  op: "SoftmaxCrossEntropyWithLogits"
+  input: "Reshape_1"
+  input: "Reshape_2"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+        }
+        shape {
+          dim {
+            size: 200
+          }
+          dim {
+            size: 10
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "gradients/SoftmaxCrossEntropyWithLogits_grad/PreventGradient"
+  op: "PreventGradient"
+  input: "SoftmaxCrossEntropyWithLogits:1"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+          dim {
+            size: 10
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "message"
+    value {
+      s: "Currently there is no way to take the second derivative of softmax_cross_entropy_with_logits due to the fused  implementation\'s interaction with tf.gradients()"
+    }
+  }
+}
+node {
+  name: "gradients/SoftmaxCrossEntropyWithLogits_grad/mul"
+  op: "Mul"
+  input: "gradients/SoftmaxCrossEntropyWithLogits_grad/ExpandDims"
+  input: "gradients/SoftmaxCrossEntropyWithLogits_grad/PreventGradient"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+          dim {
+            size: 10
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "gradients/Reshape_1_grad/Reshape"
+  op: "Reshape"
+  input: "gradients/SoftmaxCrossEntropyWithLogits_grad/mul"
+  input: "gradients/Reshape_1_grad/Shape"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+          dim {
+            size: 10
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "gradients/add_1_grad/Sum_1"
+  op: "Sum"
+  input: "gradients/Reshape_1_grad/Reshape"
+  input: "gradients/add_1_grad/BroadcastGradientArgs:1"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 10
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "keep_dims"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "gradients/add_1_grad/Reshape_1"
+  op: "Reshape"
+  input: "gradients/add_1_grad/Sum_1"
+  input: "gradients/add_1_grad/Shape_1"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 10
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "gradients/add_1_grad/Sum"
+  op: "Sum"
+  input: "gradients/Reshape_1_grad/Reshape"
+  input: "gradients/add_1_grad/BroadcastGradientArgs"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+          dim {
+            size: 10
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "keep_dims"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "gradients/add_1_grad/Reshape"
+  op: "Reshape"
+  input: "gradients/add_1_grad/Sum"
+  input: "gradients/add_1_grad/Shape"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+          dim {
+            size: 10
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "gradients/add_1_grad/tuple/group_deps"
+  op: "NoOp"
+  input: "^gradients/add_1_grad/Reshape"
+  input: "^gradients/add_1_grad/Reshape_1"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+      }
+    }
+  }
+}
+node {
+  name: "gradients/add_1_grad/tuple/control_dependency_1"
+  op: "Identity"
+  input: "gradients/add_1_grad/Reshape_1"
+  input: "^gradients/add_1_grad/tuple/group_deps"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@gradients/add_1_grad/Reshape_1"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 10
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "GradientDescent/update_logits_biases/ApplyGradientDescent"
+  op: "ApplyGradientDescent"
+  input: "logits_biases"
+  input: "GradientDescent/learning_rate"
+  input: "gradients/add_1_grad/tuple/control_dependency_1"
+  device: "/job:localhost/replica:0/task:0/cpu:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@logits_biases"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 10
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "use_locking"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "gradients/add_1_grad/tuple/control_dependency"
+  op: "Identity"
+  input: "gradients/add_1_grad/Reshape"
+  input: "^gradients/add_1_grad/tuple/group_deps"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@gradients/add_1_grad/Reshape"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+          dim {
+            size: 10
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "gradients/MatMul_1_grad/MatMul_1"
+  op: "MatMul"
+  input: "Relu"
+  input: "gradients/add_1_grad/tuple/control_dependency"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 100
+          }
+          dim {
+            size: 10
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "transpose_a"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "transpose_b"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "gradients/MatMul_1_grad/MatMul"
+  op: "MatMul"
+  input: "gradients/add_1_grad/tuple/control_dependency"
+  input: "logits_weights/read"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+          dim {
+            size: 100
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "transpose_a"
+    value {
+      b: false
+    }
+  }
+  attr {
+    key: "transpose_b"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "gradients/MatMul_1_grad/tuple/group_deps"
+  op: "NoOp"
+  input: "^gradients/MatMul_1_grad/MatMul"
+  input: "^gradients/MatMul_1_grad/MatMul_1"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+      }
+    }
+  }
+}
+node {
+  name: "gradients/MatMul_1_grad/tuple/control_dependency_1"
+  op: "Identity"
+  input: "gradients/MatMul_1_grad/MatMul_1"
+  input: "^gradients/MatMul_1_grad/tuple/group_deps"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@gradients/MatMul_1_grad/MatMul_1"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 100
+          }
+          dim {
+            size: 10
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "GradientDescent/update_logits_weights/ApplyGradientDescent"
+  op: "ApplyGradientDescent"
+  input: "logits_weights"
+  input: "GradientDescent/learning_rate"
+  input: "gradients/MatMul_1_grad/tuple/control_dependency_1"
+  device: "/job:localhost/replica:0/task:0/cpu:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@logits_weights"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 100
+          }
+          dim {
+            size: 10
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "use_locking"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "gradients/MatMul_1_grad/tuple/control_dependency"
+  op: "Identity"
+  input: "gradients/MatMul_1_grad/MatMul"
+  input: "^gradients/MatMul_1_grad/tuple/group_deps"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@gradients/MatMul_1_grad/MatMul"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+          dim {
+            size: 100
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "gradients/Relu_grad/ReluGrad"
+  op: "ReluGrad"
+  input: "gradients/MatMul_1_grad/tuple/control_dependency"
+  input: "Relu"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+          dim {
+            size: 100
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "gradients/add_grad/Sum_1"
+  op: "Sum"
+  input: "gradients/Relu_grad/ReluGrad"
+  input: "gradients/add_grad/BroadcastGradientArgs:1"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 100
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "keep_dims"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "gradients/add_grad/Reshape_1"
+  op: "Reshape"
+  input: "gradients/add_grad/Sum_1"
+  input: "gradients/add_grad/Shape_1"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 100
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "gradients/add_grad/Sum"
+  op: "Sum"
+  input: "gradients/Relu_grad/ReluGrad"
+  input: "gradients/add_grad/BroadcastGradientArgs"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+          dim {
+            size: 100
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "keep_dims"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "gradients/add_grad/Reshape"
+  op: "Reshape"
+  input: "gradients/add_grad/Sum"
+  input: "gradients/add_grad/Shape"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+          dim {
+            size: 100
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "gradients/add_grad/tuple/group_deps"
+  op: "NoOp"
+  input: "^gradients/add_grad/Reshape"
+  input: "^gradients/add_grad/Reshape_1"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+      }
+    }
+  }
+}
+node {
+  name: "gradients/add_grad/tuple/control_dependency_1"
+  op: "Identity"
+  input: "gradients/add_grad/Reshape_1"
+  input: "^gradients/add_grad/tuple/group_deps"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@gradients/add_grad/Reshape_1"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 100
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "GradientDescent/update_hidden_biases/ApplyGradientDescent"
+  op: "ApplyGradientDescent"
+  input: "hidden_biases"
+  input: "GradientDescent/learning_rate"
+  input: "gradients/add_grad/tuple/control_dependency_1"
+  device: "/job:localhost/replica:0/task:0/cpu:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@hidden_biases"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 100
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "use_locking"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "gradients/add_grad/tuple/control_dependency"
+  op: "Identity"
+  input: "gradients/add_grad/Reshape"
+  input: "^gradients/add_grad/tuple/group_deps"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@gradients/add_grad/Reshape"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+          dim {
+            size: 100
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "gradients/MatMul_grad/MatMul_1"
+  op: "MatMul"
+  input: "Reshape"
+  input: "gradients/add_grad/tuple/control_dependency"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 100
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "transpose_a"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "transpose_b"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "gradients/MatMul_grad/MatMul"
+  op: "MatMul"
+  input: "gradients/add_grad/tuple/control_dependency"
+  input: "hidden_weights/read"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+          dim {
+            size: 784
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "transpose_a"
+    value {
+      b: false
+    }
+  }
+  attr {
+    key: "transpose_b"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "gradients/MatMul_grad/tuple/group_deps"
+  op: "NoOp"
+  input: "^gradients/MatMul_grad/MatMul"
+  input: "^gradients/MatMul_grad/MatMul_1"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+      }
+    }
+  }
+}
+node {
+  name: "gradients/MatMul_grad/tuple/control_dependency_1"
+  op: "Identity"
+  input: "gradients/MatMul_grad/MatMul_1"
+  input: "^gradients/MatMul_grad/tuple/group_deps"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@gradients/MatMul_grad/MatMul_1"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 100
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "GradientDescent/update_hidden_weights/ApplyGradientDescent"
+  op: "ApplyGradientDescent"
+  input: "hidden_weights"
+  input: "GradientDescent/learning_rate"
+  input: "gradients/MatMul_grad/tuple/control_dependency_1"
+  device: "/job:localhost/replica:0/task:0/cpu:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@hidden_weights"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 784
+          }
+          dim {
+            size: 100
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "use_locking"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "GradientDescent"
+  op: "NoOp"
+  input: "^GradientDescent/update_hidden_weights/ApplyGradientDescent"
+  input: "^GradientDescent/update_hidden_biases/ApplyGradientDescent"
+  input: "^GradientDescent/update_logits_weights/ApplyGradientDescent"
+  input: "^GradientDescent/update_logits_biases/ApplyGradientDescent"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_2"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+      }
+    }
+  }
+}
+node {
+  name: "Reshape_3"
+  op: "Reshape"
+  input: "SoftmaxCrossEntropyWithLogits"
+  input: "Slice_2"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "Mean"
+  op: "Mean"
+  input: "Reshape_3"
+  input: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+  attr {
+    key: "keep_dims"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "_send_Mean_0"
+  op: "_Send"
+  input: "Mean"
+  device: "/job:localhost/replica:0/task:0/cpu:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "client_terminated"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "recv_device"
+    value {
+      s: "/job:localhost/replica:0/task:0/cpu:0"
+    }
+  }
+  attr {
+    key: "send_device"
+    value {
+      s: "/job:localhost/replica:0/task:0/cpu:0"
+    }
+  }
+  attr {
+    key: "send_device_incarnation"
+    value {
+      i: -5924635994370253548
+    }
+  }
+  attr {
+    key: "tensor_name"
+    value {
+      s: "Mean:0"
+    }
+  }
+}
+library {
+}
+versions {
+  producer: 21
+}
diff --git a/tensorflow/tensorboard/components/tf_graph_board/demo/index.html b/tensorflow/tensorboard/components/tf_graph_board/demo/index.html
new file mode 100644
index 0000000000000000000000000000000000000000..2563e1595e9648fafea8d3632ece3af7732bf642
--- /dev/null
+++ b/tensorflow/tensorboard/components/tf_graph_board/demo/index.html
@@ -0,0 +1,98 @@
+<!doctype html>
+<!--
+@license
+Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+-->
+<script src="../../webcomponentsjs/webcomponents-lite.min.js"></script>
+<link rel="import" href="../tf-graph-board.html">
+<link rel="import" href="../../tf-graph-common/tf-graph-common.html">
+<link rel="import" href="../../tf-graph-loader/tf-graph-loader.html">
+<link rel="import" href="../../iron-demo-helpers/demo-snippet.html">
+<title>TF Graph Board Demo</title>
+<style>
+  #demo-container {
+    border: 2px solid #808080;
+    width: 1000px;
+    height: 600px;
+  }
+
+  /** Make the graph take up the entire height of the demo container. */
+  tf-graph-board-demo, #board, #board > div {
+    display: block;
+    height: 100%;
+  }
+</style>
+<demo-snippet>
+  <template>
+    <dom-module id="tf-graph-board-demo">
+      <template>
+        <!-- We first use the graph loader to load and parse a pbtxt file into a graph object. -->
+        <tf-graph-loader
+            id="loader"
+            datasets="[[_datasets]]"
+            selected-dataset="[[_selectedDataset]]"
+            out-graph="{{_graph}}">
+        </tf-graph-loader>
+
+        <!-- We color ops in the graph by XLA cluster. -->
+        <tf-graph-board id="board" color-by="xla_cluster"></tf-graph-board>
+      </template>
+      <script>
+        "use strict";
+
+        Polymer({
+          is: "tf-graph-board-demo",
+          properties: {
+            // We tell the graph loader to load a specific pbtxt file.
+            _datasets: {
+              type: Array,
+              value: [{
+                "name": "Graph with XLA Clusters Specified",
+                "path": "data/graph.pbtxt"
+              }],
+            },
+            _selectedDataset: {
+              type: Number,
+              value: 0,
+            },
+
+            // This property will be updated by the graph loader.
+            _graph: {
+              type: Object,
+            },
+          },
+          observers: [
+            '_graphUpdated(_graph)',
+          ],
+          _graphUpdated: function(slimGraph) {
+            const tracker = tf.graph.util.getTracker(this.$.loader);
+            const hierarchyTracker = tf.graph.util.getSubtaskTracker(
+                tracker, 100, 'Namespace hierarchy');
+            const hierarchyOptions = {};
+            tf.graph.hierarchy.build(slimGraph, hierarchyOptions, hierarchyTracker).then(
+                function(graphHierarchy) {
+              // We have parsed and built the graph object from a pbtxt file. Render the graph.
+              this.$.board.set('graph', slimGraph);
+              this.$.board.set('graphHierarchy', graphHierarchy);
+            }.bind(this));
+          },
+        });
+      </script>
+    </dom-module>
+    <div id='demo-container'>
+      <tf-graph-board-demo></tf-graph-board-demo>
+    </div>
+  </template>
+</demo-snippet>
diff --git a/tensorflow/tensorboard/components/tf_graph_board/tf-graph-board.html b/tensorflow/tensorboard/components/tf_graph_board/tf-graph-board.html
index 5909172fbe5ebf98cc0cb215a3548c3507c2e447..0ee694e1e6638f7ed8808f5d11a5c92d9ae6673f 100644
--- a/tensorflow/tensorboard/components/tf_graph_board/tf-graph-board.html
+++ b/tensorflow/tensorboard/components/tf_graph_board/tf-graph-board.html
@@ -17,6 +17,7 @@ limitations under the License.
 
 <link rel="import" href="../polymer/polymer.html">
 <link rel="import" href="../tf-graph/tf-graph.html">
+<link rel="import" href="../tf-graph-common/tf-graph-common.html">
 <link rel="import" href="../tf-graph-info/tf-graph-info.html">
 <link rel="import" href="../paper-progress/paper-progress.html">
 
diff --git a/tensorflow/tensorboard/components/tf_graph_board_d3v4/BUILD b/tensorflow/tensorboard/components/tf_graph_board_d3v4/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..7203a9333b30592be26f93ba8f6f399ec257d381
--- /dev/null
+++ b/tensorflow/tensorboard/components/tf_graph_board_d3v4/BUILD
@@ -0,0 +1,28 @@
+package(default_visibility = ["//tensorflow:internal"])
+
+load("@io_bazel_rules_closure//closure:defs.bzl", "web_library")
+load("//tensorflow/tensorboard:defs.bzl", "tensorboard_ts_library")
+load("//tensorflow/tensorboard:defs.bzl", "tensorboard_webcomponent_library")
+
+licenses(["notice"])  # Apache 2.0
+
+web_library(
+    name = "tf_graph_board_d3v4",
+    srcs = [
+        "tf-graph-board.html",
+    ],
+    path = "/tf-graph-board",
+    deps = [
+        "//tensorflow/tensorboard/components/tf_graph_common_d3v4",
+        "//tensorflow/tensorboard/components/tf_graph_d3v4",
+        "//tensorflow/tensorboard/components/tf_graph_info_d3v4",
+        "@org_polymer",
+        "@org_polymer_paper_progress",
+    ],
+)
+
+filegroup(
+    name = "all_files",
+    srcs = glob(["**"]),
+    tags = ["notsan"],
+)
diff --git a/tensorflow/tensorboard/components/tf_graph_board_d3v4/demo/BUILD b/tensorflow/tensorboard/components/tf_graph_board_d3v4/demo/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..2d668769e62076260cbb391f6ecea99dc186878d
--- /dev/null
+++ b/tensorflow/tensorboard/components/tf_graph_board_d3v4/demo/BUILD
@@ -0,0 +1,26 @@
+package(default_visibility = ["//tensorflow:internal"])
+
+load("@io_bazel_rules_closure//closure:defs.bzl", "web_library")
+
+licenses(["notice"])  # Apache 2.0
+
+# bazel run //third_party/tensorflow/tensorboard/components/tf_graph_board/demo
+web_library(
+    name = "demo",
+    srcs = ["index.html"] + glob(["data/**"]),
+    path = "/tf-graph-board/demo",
+    deps = [
+        "//tensorflow/tensorboard/components/tf_graph_board",
+        "//tensorflow/tensorboard/components/tf_graph_common",
+        "//tensorflow/tensorboard/components/tf_graph_loader",
+        "@org_polymer_iron_demo_helpers",
+        "@org_polymer_paper_styles",
+        "@org_polymer_webcomponentsjs",
+    ],
+)
+
+filegroup(
+    name = "all_files",
+    srcs = glob(["**"]),
+    tags = ["notsan"],
+)
diff --git a/tensorflow/tensorboard/components/tf_graph_board_d3v4/demo/data/graph.pbtxt b/tensorflow/tensorboard/components/tf_graph_board_d3v4/demo/data/graph.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..30b206453469801d31b46856c29cdda78164f18f
--- /dev/null
+++ b/tensorflow/tensorboard/components/tf_graph_board_d3v4/demo/data/graph.pbtxt
@@ -0,0 +1,4606 @@
+node {
+  name: "GradientDescent/learning_rate"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_3"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: 0.1
+      }
+    }
+  }
+}
+node {
+  name: "gradients/add_grad/Shape_1"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 100
+      }
+    }
+  }
+}
+node {
+  name: "gradients/add_grad/Shape"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 2
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\310\000\000\000d\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "gradients/add_grad/BroadcastGradientArgs"
+  op: "BroadcastGradientArgs"
+  input: "gradients/add_grad/Shape"
+  input: "gradients/add_grad/Shape_1"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+        }
+        shape {
+          dim {
+            size: -1
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "gradients/add_1_grad/Shape_1"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 10
+      }
+    }
+  }
+}
+node {
+  name: "gradients/add_1_grad/Shape"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 2
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\310\000\000\000\n\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "gradients/add_1_grad/BroadcastGradientArgs"
+  op: "BroadcastGradientArgs"
+  input: "gradients/add_1_grad/Shape"
+  input: "gradients/add_1_grad/Shape_1"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+        }
+        shape {
+          dim {
+            size: -1
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "gradients/Reshape_1_grad/Shape"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 2
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\310\000\000\000\n\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "gradients/SoftmaxCrossEntropyWithLogits_grad/ExpandDims/dim"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: -1
+      }
+    }
+  }
+}
+node {
+  name: "gradients/Reshape_3_grad/Shape"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 200
+      }
+    }
+  }
+}
+node {
+  name: "gradients/Mean_grad/Maximum/y"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "gradients/Mean_grad/Const_1"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "gradients/Mean_grad/Const"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "gradients/Mean_grad/Shape_1"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "gradients/Mean_grad/Prod_1"
+  op: "Prod"
+  input: "gradients/Mean_grad/Shape_1"
+  input: "gradients/Mean_grad/Const_1"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+  attr {
+    key: "keep_dims"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "gradients/Mean_grad/Maximum"
+  op: "Maximum"
+  input: "gradients/Mean_grad/Prod_1"
+  input: "gradients/Mean_grad/Maximum/y"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+}
+node {
+  name: "gradients/Mean_grad/Shape"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 200
+      }
+    }
+  }
+}
+node {
+  name: "gradients/Mean_grad/Prod"
+  op: "Prod"
+  input: "gradients/Mean_grad/Shape"
+  input: "gradients/Mean_grad/Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+  attr {
+    key: "keep_dims"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "gradients/Mean_grad/floordiv"
+  op: "FloorDiv"
+  input: "gradients/Mean_grad/Prod"
+  input: "gradients/Mean_grad/Maximum"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+}
+node {
+  name: "gradients/Mean_grad/Cast"
+  op: "Cast"
+  input: "gradients/Mean_grad/floordiv"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "DstT"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "SrcT"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+}
+node {
+  name: "gradients/Mean_grad/Tile/multiples"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 200
+      }
+    }
+  }
+}
+node {
+  name: "gradients/Mean_grad/Reshape/shape"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "gradients/Const"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "gradients/Shape"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "gradients/Fill"
+  op: "Fill"
+  input: "gradients/Shape"
+  input: "gradients/Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+}
+node {
+  name: "gradients/Mean_grad/Reshape"
+  op: "Reshape"
+  input: "gradients/Fill"
+  input: "gradients/Mean_grad/Reshape/shape"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "gradients/Mean_grad/Tile"
+  op: "Tile"
+  input: "gradients/Mean_grad/Reshape"
+  input: "gradients/Mean_grad/Tile/multiples"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tmultiples"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "gradients/Mean_grad/truediv"
+  op: "RealDiv"
+  input: "gradients/Mean_grad/Tile"
+  input: "gradients/Mean_grad/Cast"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "gradients/Reshape_3_grad/Reshape"
+  op: "Reshape"
+  input: "gradients/Mean_grad/truediv"
+  input: "gradients/Reshape_3_grad/Shape"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "gradients/SoftmaxCrossEntropyWithLogits_grad/ExpandDims"
+  op: "ExpandDims"
+  input: "gradients/Reshape_3_grad/Reshape"
+  input: "gradients/SoftmaxCrossEntropyWithLogits_grad/ExpandDims/dim"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "Const"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "Slice_2/begin"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "Sub_2/y"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "concat_1/axis"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "concat_1/values_0"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: -1
+      }
+    }
+  }
+}
+node {
+  name: "Slice_1/size"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "Sub_1/y"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "Shape_2"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 2
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\310\000\000\000\n\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "Rank_2"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 2
+      }
+    }
+  }
+}
+node {
+  name: "Sub_1"
+  op: "Sub"
+  input: "Rank_2"
+  input: "Sub_1/y"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+}
+node {
+  name: "Slice_1/begin"
+  op: "Pack"
+  input: "Sub_1"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "N"
+    value {
+      i: 1
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "Slice_1"
+  op: "Slice"
+  input: "Shape_2"
+  input: "Slice_1/begin"
+  input: "Slice_1/size"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "concat_1"
+  op: "ConcatV2"
+  input: "concat_1/values_0"
+  input: "Slice_1"
+  input: "concat_1/axis"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 2
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "concat/axis"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "concat/values_0"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: -1
+      }
+    }
+  }
+}
+node {
+  name: "Slice/size"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "Sub/y"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "Shape_1"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 2
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\310\000\000\000\n\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "Rank_1"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 2
+      }
+    }
+  }
+}
+node {
+  name: "Sub"
+  op: "Sub"
+  input: "Rank_1"
+  input: "Sub/y"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+}
+node {
+  name: "Slice/begin"
+  op: "Pack"
+  input: "Sub"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "N"
+    value {
+      i: 1
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "Slice"
+  op: "Slice"
+  input: "Shape_1"
+  input: "Slice/begin"
+  input: "Slice/size"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "concat"
+  op: "ConcatV2"
+  input: "concat/values_0"
+  input: "Slice"
+  input: "concat/axis"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 2
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "Shape"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 2
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\310\000\000\000\n\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "Rank"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 2
+      }
+    }
+  }
+}
+node {
+  name: "Sub_2"
+  op: "Sub"
+  input: "Rank"
+  input: "Sub_2/y"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+}
+node {
+  name: "Slice_2/size"
+  op: "Pack"
+  input: "Sub_2"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "N"
+    value {
+      i: 1
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "Slice_2"
+  op: "Slice"
+  input: "Shape"
+  input: "Slice_2/begin"
+  input: "Slice_2/size"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "logits_biases"
+  op: "VariableV2"
+  device: "/job:localhost/replica:0/task:0/cpu:0"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@logits_biases"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 10
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "container"
+    value {
+      s: ""
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "shape"
+    value {
+      shape {
+        dim {
+          size: 10
+        }
+      }
+    }
+  }
+  attr {
+    key: "shared_name"
+    value {
+      s: ""
+    }
+  }
+}
+node {
+  name: "logits_biases/read"
+  op: "Identity"
+  input: "logits_biases"
+  device: "/job:localhost/replica:0/task:0/cpu:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@logits_biases"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 10
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "logits_weights"
+  op: "VariableV2"
+  device: "/job:localhost/replica:0/task:0/cpu:0"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@logits_weights"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 100
+          }
+          dim {
+            size: 10
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "container"
+    value {
+      s: ""
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "shape"
+    value {
+      shape {
+        dim {
+          size: 100
+        }
+        dim {
+          size: 10
+        }
+      }
+    }
+  }
+  attr {
+    key: "shared_name"
+    value {
+      s: ""
+    }
+  }
+}
+node {
+  name: "logits_weights/read"
+  op: "Identity"
+  input: "logits_weights"
+  device: "/job:localhost/replica:0/task:0/cpu:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@logits_weights"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 100
+          }
+          dim {
+            size: 10
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "hidden_biases"
+  op: "VariableV2"
+  device: "/job:localhost/replica:0/task:0/cpu:0"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@hidden_biases"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 100
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "container"
+    value {
+      s: ""
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "shape"
+    value {
+      shape {
+        dim {
+          size: 100
+        }
+      }
+    }
+  }
+  attr {
+    key: "shared_name"
+    value {
+      s: ""
+    }
+  }
+}
+node {
+  name: "hidden_biases/read"
+  op: "Identity"
+  input: "hidden_biases"
+  device: "/job:localhost/replica:0/task:0/cpu:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@hidden_biases"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 100
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "hidden_weights"
+  op: "VariableV2"
+  device: "/job:localhost/replica:0/task:0/cpu:0"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@hidden_weights"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 784
+          }
+          dim {
+            size: 100
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "container"
+    value {
+      s: ""
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "shape"
+    value {
+      shape {
+        dim {
+          size: 784
+        }
+        dim {
+          size: 100
+        }
+      }
+    }
+  }
+  attr {
+    key: "shared_name"
+    value {
+      s: ""
+    }
+  }
+}
+node {
+  name: "hidden_weights/read"
+  op: "Identity"
+  input: "hidden_weights"
+  device: "/job:localhost/replica:0/task:0/cpu:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@hidden_weights"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 784
+          }
+          dim {
+            size: 100
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "Reshape/shape"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/cpu:0"
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 2
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\310\000\000\000\377\377\377\377"
+      }
+    }
+  }
+}
+node {
+  name: "mnist_dataset_train_2/one_hot/depth"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/cpu:0"
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 10
+      }
+    }
+  }
+}
+node {
+  name: "mnist_dataset_train_2/one_hot/off_value"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/cpu:0"
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "mnist_dataset_train_2/one_hot/on_value"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/cpu:0"
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "mnist_dataset_train_2/random_shuffle_queue_DequeueMany/n"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/cpu:0"
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 200
+      }
+    }
+  }
+}
+node {
+  name: "mnist_dataset_train_1/random_shuffle_queue"
+  op: "RandomShuffleQueueV2"
+  device: "/job:localhost/replica:0/task:0/cpu:0"
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+  attr {
+    key: "capacity"
+    value {
+      i: 20000
+    }
+  }
+  attr {
+    key: "component_types"
+    value {
+      list {
+        type: DT_FLOAT
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    key: "container"
+    value {
+      s: ""
+    }
+  }
+  attr {
+    key: "min_after_dequeue"
+    value {
+      i: 4000
+    }
+  }
+  attr {
+    key: "seed"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "seed2"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 28
+          }
+          dim {
+            size: 28
+          }
+          dim {
+            size: 1
+          }
+        }
+        shape {
+        }
+      }
+    }
+  }
+  attr {
+    key: "shared_name"
+    value {
+      s: ""
+    }
+  }
+}
+node {
+  name: "mnist_dataset_train_2/random_shuffle_queue_DequeueMany"
+  op: "QueueDequeueManyV2"
+  input: "mnist_dataset_train_1/random_shuffle_queue"
+  input: "mnist_dataset_train_2/random_shuffle_queue_DequeueMany/n"
+  device: "/job:localhost/replica:0/task:0/cpu:0"
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          unknown_rank: true
+        }
+        shape {
+          unknown_rank: true
+        }
+      }
+    }
+  }
+  attr {
+    key: "component_types"
+    value {
+      list {
+        type: DT_FLOAT
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    key: "timeout_ms"
+    value {
+      i: -1
+    }
+  }
+}
+node {
+  name: "Reshape"
+  op: "Reshape"
+  input: "mnist_dataset_train_2/random_shuffle_queue_DequeueMany"
+  input: "Reshape/shape"
+  device: "/job:localhost/replica:0/task:0/cpu:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+          dim {
+            size: -1
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "MatMul"
+  op: "MatMul"
+  input: "Reshape"
+  input: "hidden_weights/read"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+          dim {
+            size: 100
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "transpose_a"
+    value {
+      b: false
+    }
+  }
+  attr {
+    key: "transpose_b"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "add"
+  op: "Add"
+  input: "MatMul"
+  input: "hidden_biases/read"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+          dim {
+            size: 100
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "Relu"
+  op: "Relu"
+  input: "add"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+          dim {
+            size: 100
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "MatMul_1"
+  op: "MatMul"
+  input: "Relu"
+  input: "logits_weights/read"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+          dim {
+            size: 10
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "transpose_a"
+    value {
+      b: false
+    }
+  }
+  attr {
+    key: "transpose_b"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "add_1"
+  op: "Add"
+  input: "MatMul_1"
+  input: "logits_biases/read"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+          dim {
+            size: 10
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "Reshape_1"
+  op: "Reshape"
+  input: "add_1"
+  input: "concat"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+          dim {
+            size: 10
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "mnist_dataset_train_2/one_hot"
+  op: "OneHot"
+  input: "mnist_dataset_train_2/random_shuffle_queue_DequeueMany:1"
+  input: "mnist_dataset_train_2/one_hot/depth"
+  input: "mnist_dataset_train_2/one_hot/on_value"
+  input: "mnist_dataset_train_2/one_hot/off_value"
+  device: "/job:localhost/replica:0/task:0/cpu:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "TI"
+    value {
+      type: DT_INT64
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          unknown_rank: true
+        }
+      }
+    }
+  }
+  attr {
+    key: "axis"
+    value {
+      i: -1
+    }
+  }
+}
+node {
+  name: "Reshape_2"
+  op: "Reshape"
+  input: "mnist_dataset_train_2/one_hot"
+  input: "concat_1"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 10
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "SoftmaxCrossEntropyWithLogits"
+  op: "SoftmaxCrossEntropyWithLogits"
+  input: "Reshape_1"
+  input: "Reshape_2"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+        }
+        shape {
+          dim {
+            size: 200
+          }
+          dim {
+            size: 10
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "gradients/SoftmaxCrossEntropyWithLogits_grad/PreventGradient"
+  op: "PreventGradient"
+  input: "SoftmaxCrossEntropyWithLogits:1"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+          dim {
+            size: 10
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "message"
+    value {
+      s: "Currently there is no way to take the second derivative of softmax_cross_entropy_with_logits due to the fused  implementation\'s interaction with tf.gradients()"
+    }
+  }
+}
+node {
+  name: "gradients/SoftmaxCrossEntropyWithLogits_grad/mul"
+  op: "Mul"
+  input: "gradients/SoftmaxCrossEntropyWithLogits_grad/ExpandDims"
+  input: "gradients/SoftmaxCrossEntropyWithLogits_grad/PreventGradient"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+          dim {
+            size: 10
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "gradients/Reshape_1_grad/Reshape"
+  op: "Reshape"
+  input: "gradients/SoftmaxCrossEntropyWithLogits_grad/mul"
+  input: "gradients/Reshape_1_grad/Shape"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+          dim {
+            size: 10
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "gradients/add_1_grad/Sum_1"
+  op: "Sum"
+  input: "gradients/Reshape_1_grad/Reshape"
+  input: "gradients/add_1_grad/BroadcastGradientArgs:1"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 10
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "keep_dims"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "gradients/add_1_grad/Reshape_1"
+  op: "Reshape"
+  input: "gradients/add_1_grad/Sum_1"
+  input: "gradients/add_1_grad/Shape_1"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 10
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "gradients/add_1_grad/Sum"
+  op: "Sum"
+  input: "gradients/Reshape_1_grad/Reshape"
+  input: "gradients/add_1_grad/BroadcastGradientArgs"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+          dim {
+            size: 10
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "keep_dims"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "gradients/add_1_grad/Reshape"
+  op: "Reshape"
+  input: "gradients/add_1_grad/Sum"
+  input: "gradients/add_1_grad/Shape"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+          dim {
+            size: 10
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "gradients/add_1_grad/tuple/group_deps"
+  op: "NoOp"
+  input: "^gradients/add_1_grad/Reshape"
+  input: "^gradients/add_1_grad/Reshape_1"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+      }
+    }
+  }
+}
+node {
+  name: "gradients/add_1_grad/tuple/control_dependency_1"
+  op: "Identity"
+  input: "gradients/add_1_grad/Reshape_1"
+  input: "^gradients/add_1_grad/tuple/group_deps"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@gradients/add_1_grad/Reshape_1"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 10
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "GradientDescent/update_logits_biases/ApplyGradientDescent"
+  op: "ApplyGradientDescent"
+  input: "logits_biases"
+  input: "GradientDescent/learning_rate"
+  input: "gradients/add_1_grad/tuple/control_dependency_1"
+  device: "/job:localhost/replica:0/task:0/cpu:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@logits_biases"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 10
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "use_locking"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "gradients/add_1_grad/tuple/control_dependency"
+  op: "Identity"
+  input: "gradients/add_1_grad/Reshape"
+  input: "^gradients/add_1_grad/tuple/group_deps"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@gradients/add_1_grad/Reshape"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+          dim {
+            size: 10
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "gradients/MatMul_1_grad/MatMul_1"
+  op: "MatMul"
+  input: "Relu"
+  input: "gradients/add_1_grad/tuple/control_dependency"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 100
+          }
+          dim {
+            size: 10
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "transpose_a"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "transpose_b"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "gradients/MatMul_1_grad/MatMul"
+  op: "MatMul"
+  input: "gradients/add_1_grad/tuple/control_dependency"
+  input: "logits_weights/read"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+          dim {
+            size: 100
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "transpose_a"
+    value {
+      b: false
+    }
+  }
+  attr {
+    key: "transpose_b"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "gradients/MatMul_1_grad/tuple/group_deps"
+  op: "NoOp"
+  input: "^gradients/MatMul_1_grad/MatMul"
+  input: "^gradients/MatMul_1_grad/MatMul_1"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+      }
+    }
+  }
+}
+node {
+  name: "gradients/MatMul_1_grad/tuple/control_dependency_1"
+  op: "Identity"
+  input: "gradients/MatMul_1_grad/MatMul_1"
+  input: "^gradients/MatMul_1_grad/tuple/group_deps"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@gradients/MatMul_1_grad/MatMul_1"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 100
+          }
+          dim {
+            size: 10
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "GradientDescent/update_logits_weights/ApplyGradientDescent"
+  op: "ApplyGradientDescent"
+  input: "logits_weights"
+  input: "GradientDescent/learning_rate"
+  input: "gradients/MatMul_1_grad/tuple/control_dependency_1"
+  device: "/job:localhost/replica:0/task:0/cpu:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@logits_weights"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 100
+          }
+          dim {
+            size: 10
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "use_locking"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "gradients/MatMul_1_grad/tuple/control_dependency"
+  op: "Identity"
+  input: "gradients/MatMul_1_grad/MatMul"
+  input: "^gradients/MatMul_1_grad/tuple/group_deps"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@gradients/MatMul_1_grad/MatMul"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+          dim {
+            size: 100
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "gradients/Relu_grad/ReluGrad"
+  op: "ReluGrad"
+  input: "gradients/MatMul_1_grad/tuple/control_dependency"
+  input: "Relu"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+          dim {
+            size: 100
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "gradients/add_grad/Sum_1"
+  op: "Sum"
+  input: "gradients/Relu_grad/ReluGrad"
+  input: "gradients/add_grad/BroadcastGradientArgs:1"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 100
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "keep_dims"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "gradients/add_grad/Reshape_1"
+  op: "Reshape"
+  input: "gradients/add_grad/Sum_1"
+  input: "gradients/add_grad/Shape_1"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 100
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "gradients/add_grad/Sum"
+  op: "Sum"
+  input: "gradients/Relu_grad/ReluGrad"
+  input: "gradients/add_grad/BroadcastGradientArgs"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+          dim {
+            size: 100
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "keep_dims"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "gradients/add_grad/Reshape"
+  op: "Reshape"
+  input: "gradients/add_grad/Sum"
+  input: "gradients/add_grad/Shape"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+          dim {
+            size: 100
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "gradients/add_grad/tuple/group_deps"
+  op: "NoOp"
+  input: "^gradients/add_grad/Reshape"
+  input: "^gradients/add_grad/Reshape_1"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+      }
+    }
+  }
+}
+node {
+  name: "gradients/add_grad/tuple/control_dependency_1"
+  op: "Identity"
+  input: "gradients/add_grad/Reshape_1"
+  input: "^gradients/add_grad/tuple/group_deps"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@gradients/add_grad/Reshape_1"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 100
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "GradientDescent/update_hidden_biases/ApplyGradientDescent"
+  op: "ApplyGradientDescent"
+  input: "hidden_biases"
+  input: "GradientDescent/learning_rate"
+  input: "gradients/add_grad/tuple/control_dependency_1"
+  device: "/job:localhost/replica:0/task:0/cpu:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@hidden_biases"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 100
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "use_locking"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "gradients/add_grad/tuple/control_dependency"
+  op: "Identity"
+  input: "gradients/add_grad/Reshape"
+  input: "^gradients/add_grad/tuple/group_deps"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@gradients/add_grad/Reshape"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+          dim {
+            size: 100
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "gradients/MatMul_grad/MatMul_1"
+  op: "MatMul"
+  input: "Reshape"
+  input: "gradients/add_grad/tuple/control_dependency"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 100
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "transpose_a"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "transpose_b"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "gradients/MatMul_grad/MatMul"
+  op: "MatMul"
+  input: "gradients/add_grad/tuple/control_dependency"
+  input: "hidden_weights/read"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+          dim {
+            size: 784
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "transpose_a"
+    value {
+      b: false
+    }
+  }
+  attr {
+    key: "transpose_b"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "gradients/MatMul_grad/tuple/group_deps"
+  op: "NoOp"
+  input: "^gradients/MatMul_grad/MatMul"
+  input: "^gradients/MatMul_grad/MatMul_1"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+      }
+    }
+  }
+}
+node {
+  name: "gradients/MatMul_grad/tuple/control_dependency_1"
+  op: "Identity"
+  input: "gradients/MatMul_grad/MatMul_1"
+  input: "^gradients/MatMul_grad/tuple/group_deps"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@gradients/MatMul_grad/MatMul_1"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 100
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "GradientDescent/update_hidden_weights/ApplyGradientDescent"
+  op: "ApplyGradientDescent"
+  input: "hidden_weights"
+  input: "GradientDescent/learning_rate"
+  input: "gradients/MatMul_grad/tuple/control_dependency_1"
+  device: "/job:localhost/replica:0/task:0/cpu:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@hidden_weights"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 784
+          }
+          dim {
+            size: 100
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "use_locking"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "GradientDescent"
+  op: "NoOp"
+  input: "^GradientDescent/update_hidden_weights/ApplyGradientDescent"
+  input: "^GradientDescent/update_hidden_biases/ApplyGradientDescent"
+  input: "^GradientDescent/update_logits_weights/ApplyGradientDescent"
+  input: "^GradientDescent/update_logits_biases/ApplyGradientDescent"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_2"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+      }
+    }
+  }
+}
+node {
+  name: "Reshape_3"
+  op: "Reshape"
+  input: "SoftmaxCrossEntropyWithLogits"
+  input: "Slice_2"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "Mean"
+  op: "Mean"
+  input: "Reshape_3"
+  input: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+  attr {
+    key: "keep_dims"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "_send_Mean_0"
+  op: "_Send"
+  input: "Mean"
+  device: "/job:localhost/replica:0/task:0/cpu:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "client_terminated"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "recv_device"
+    value {
+      s: "/job:localhost/replica:0/task:0/cpu:0"
+    }
+  }
+  attr {
+    key: "send_device"
+    value {
+      s: "/job:localhost/replica:0/task:0/cpu:0"
+    }
+  }
+  attr {
+    key: "send_device_incarnation"
+    value {
+      i: -5924635994370253548
+    }
+  }
+  attr {
+    key: "tensor_name"
+    value {
+      s: "Mean:0"
+    }
+  }
+}
+library {
+}
+versions {
+  producer: 21
+}
diff --git a/tensorflow/tensorboard/components/tf_graph_board_d3v4/demo/index.html b/tensorflow/tensorboard/components/tf_graph_board_d3v4/demo/index.html
new file mode 100644
index 0000000000000000000000000000000000000000..2563e1595e9648fafea8d3632ece3af7732bf642
--- /dev/null
+++ b/tensorflow/tensorboard/components/tf_graph_board_d3v4/demo/index.html
@@ -0,0 +1,98 @@
+<!doctype html>
+<!--
+@license
+Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+-->
+<script src="../../webcomponentsjs/webcomponents-lite.min.js"></script>
+<link rel="import" href="../tf-graph-board.html">
+<link rel="import" href="../../tf-graph-common/tf-graph-common.html">
+<link rel="import" href="../../tf-graph-loader/tf-graph-loader.html">
+<link rel="import" href="../../iron-demo-helpers/demo-snippet.html">
+<title>TF Graph Board Demo</title>
+<style>
+  #demo-container {
+    border: 2px solid #808080;
+    width: 1000px;
+    height: 600px;
+  }
+
+  /** Make the graph take up the entire height of the demo container. */
+  tf-graph-board-demo, #board, #board > div {
+    display: block;
+    height: 100%;
+  }
+</style>
+<demo-snippet>
+  <template>
+    <dom-module id="tf-graph-board-demo">
+      <template>
+        <!-- We first use the graph loader to load and parse a pbtxt file into a graph object. -->
+        <tf-graph-loader
+            id="loader"
+            datasets="[[_datasets]]"
+            selected-dataset="[[_selectedDataset]]"
+            out-graph="{{_graph}}">
+        </tf-graph-loader>
+
+        <!-- We color ops in the graph by XLA cluster. -->
+        <tf-graph-board id="board" color-by="xla_cluster"></tf-graph-board>
+      </template>
+      <script>
+        "use strict";
+
+        Polymer({
+          is: "tf-graph-board-demo",
+          properties: {
+            // We tell the graph loader to load a specific pbtxt file.
+            _datasets: {
+              type: Array,
+              value: [{
+                "name": "Graph with XLA Clusters Specified",
+                "path": "data/graph.pbtxt"
+              }],
+            },
+            _selectedDataset: {
+              type: Number,
+              value: 0,
+            },
+
+            // This property will be updated by the graph loader.
+            _graph: {
+              type: Object,
+            },
+          },
+          observers: [
+            '_graphUpdated(_graph)',
+          ],
+          _graphUpdated: function(slimGraph) {
+            const tracker = tf.graph.util.getTracker(this.$.loader);
+            const hierarchyTracker = tf.graph.util.getSubtaskTracker(
+                tracker, 100, 'Namespace hierarchy');
+            const hierarchyOptions = {};
+            tf.graph.hierarchy.build(slimGraph, hierarchyOptions, hierarchyTracker).then(
+                function(graphHierarchy) {
+              // We have parsed and built the graph object from a pbtxt file. Render the graph.
+              this.$.board.set('graph', slimGraph);
+              this.$.board.set('graphHierarchy', graphHierarchy);
+            }.bind(this));
+          },
+        });
+      </script>
+    </dom-module>
+    <div id='demo-container'>
+      <tf-graph-board-demo></tf-graph-board-demo>
+    </div>
+  </template>
+</demo-snippet>
diff --git a/tensorflow/tensorboard/components/tf_graph_board_d3v4/tf-graph-board.html b/tensorflow/tensorboard/components/tf_graph_board_d3v4/tf-graph-board.html
new file mode 100644
index 0000000000000000000000000000000000000000..0ee694e1e6638f7ed8808f5d11a5c92d9ae6673f
--- /dev/null
+++ b/tensorflow/tensorboard/components/tf_graph_board_d3v4/tf-graph-board.html
@@ -0,0 +1,255 @@
+<!--
+@license
+Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+-->
+
+<link rel="import" href="../polymer/polymer.html">
+<link rel="import" href="../tf-graph/tf-graph.html">
+<link rel="import" href="../tf-graph-common/tf-graph-common.html">
+<link rel="import" href="../tf-graph-info/tf-graph-info.html">
+<link rel="import" href="../paper-progress/paper-progress.html">
+
+<!--
+Element for putting tf-graph and tf-graph-info side by side.
+
+Example
+
+  <tf-graph-board graph=[[graph]]></tf-graph-board>
+
+-->
+<dom-module id="tf-graph-board">
+<template>
+<style>
+::host {
+  display: block;
+}
+
+/deep/ .close {
+  position: absolute;
+  cursor: pointer;
+  left: 15px;
+  bottom: 15px;
+}
+
+.container {
+  width: 100%;
+  height: 100%;
+  opacity: 1;
+}
+
+.container.loading {
+  cursor: progress;
+  opacity: 0.1;
+}
+
+.container.loading.error {
+  cursor: auto;
+}
+
+#info {
+  position: absolute;
+  right: 5px;
+  top: 5px;
+  padding: 0px;
+  max-width: 380px;
+  min-width: 320px;
+  background-color: rgba(255,255,255,0.9);
+  @apply(--shadow-elevation-2dp);
+}
+
+#main {
+  width: 100%;
+  height: 100%;
+}
+
+#progress-bar {
+  display: flex;
+  flex-direction: column;
+  align-items: center;
+  justify-content: center;
+  width: 100%;
+  position: absolute;
+  top: 40px;
+  left: 0;
+  font-size: 13px;
+}
+
+#progress-msg {
+  width: 400px;
+  margin-bottom: 5px;
+}
+
+paper-progress {
+  width: 400px;
+  --paper-progress-height: 6px;
+  --paper-progress-active-color: #f3913e;
+}
+
+.context-menu {
+  position: absolute;
+  display: none;
+  background-color: #e2e2e2;
+  border-radius: 2px;
+  font-size: 14px;
+  min-width: 150px;
+  border: 1px solid #d4d4d4;
+}
+
+/deep/ .context-menu ul {
+  list-style-type: none;
+  margin: 0;
+  padding: 0;
+  cursor: default;
+}
+
+/deep/ .context-menu ul li {
+  padding: 4px 16px;
+}
+
+/deep/ .context-menu ul li:hover {
+  background-color: #f3913e;
+  color: white;
+}
+</style>
+<template is="dom-if" if="[[_isNotComplete(progress)]]">
+  <div id="progress-bar">
+    <div id="progress-msg">[[progress.msg]]</div>
+    <paper-progress value="[[progress.value]]"></paper-progress>
+  </div>
+</template>
+<div class$="[[_getContainerClass(progress)]]">
+  <div id="main">
+    <tf-graph id="graph"
+              graph-hierarchy="{{graphHierarchy}}"
+              basic-graph="[[graph]]"
+              hierarchy-params="[[hierarchyParams]]"
+              render-hierarchy="{{renderHierarchy}}"
+              devices-for-stats="[[devicesForStats]]"
+              stats="[[stats]]"
+              selected-node="{{_selectedNode}}"
+              highlighted-node="{{_highlightedNode}}"
+              color-by="[[colorBy]]"
+              color-by-params="{{colorByParams}}"
+              progress="{{progress}}"
+              node-names-to-health-pills="[[nodeNamesToHealthPills]]"
+              health-pill-step-index="[[healthPillStepIndex]]"
+    ></tf-graph>
+  </div>
+  <div id="info">
+    <tf-graph-info id="graph-info"
+              title="selected"
+              graph-hierarchy="[[graphHierarchy]]"
+              render-hierarchy="[[renderHierarchy]]"
+              graph="[[graph]]"
+              selected-node="{{_selectedNode}}"
+              selected-node-include="{{_selectedNodeInclude}}"
+              highlighted-node="{{_highlightedNode}}"
+              color-by="[[colorBy]]"
+              color-by-params="[[colorByParams]]"
+              debugger-data-enabled="[[debuggerDataEnabled]]"
+              are-health-pills-loading="[[areHealthPillsLoading]]"
+              node-names-to-health-pills="[[nodeNamesToHealthPills]]"
+              all-steps-mode-enabled="{{allStepsModeEnabled}}"
+              specific-health-pill-step="{{specificHealthPillStep}}"
+              health-pill-step-index="{{healthPillStepIndex}}"
+    ></tf-graph-info>
+  </div>
+  <div class="context-menu"></div>
+</div>
+</template>
+</dom-module>
+
+<script>
+Polymer({
+  is: 'tf-graph-board',
+  properties: {
+    // Public API.
+    graphHierarchy: Object,
+    graph: Object,
+    stats: Object,
+    /**
+     * @type {value: number, msg: string}
+     *
+     * A number between 0 and 100 denoting the % of progress
+     * for the progress bar and the displayed message.
+     */
+    progress: Object,
+    colorBy: String,
+    colorByParams: {
+      type: Object,
+      notify: true
+    },
+    renderHierarchy: {
+      type: Object,
+      notify: true
+    },
+    // Whether debugger data is enabled for this instance of Tensorboard.
+    debuggerDataEnabled: Boolean,
+    // Whether health pills are currently being loaded.
+    areHealthPillsLoading: Boolean,
+    // A mapping between node name to the tf.graph.scene.HealthPill to render.
+    nodeNamesToHealthPills: Object,
+    // Whether the user can request health pills for individual steps from the server. This can be
+    // slow compared the default of showing sampled health pills.
+    allStepsModeEnabled: {
+      type: Boolean,
+      notify: true,
+      value: false,
+    },
+    // Relevant if allStepsModeEnabled. The specific step for which to fetch health pills from the
+    // server for.
+    specificHealthPillStep: {
+      type: Number,
+      notify: true,
+      value: 0,
+    },
+    // The step of health pills to show throughout the graph.
+    healthPillStepIndex: Number,
+    // Private API: Data routing between child components.
+    _selectedNode: String,
+    // The enum value of the include property of the selected node.
+    _selectedNodeInclude: Number,
+    _highlightedNode: String
+  },
+  listeners: {
+    'node-toggle-extract': '_nodeToggleExtract'
+  },
+  observers: [
+    '_updateNodeInclude(_selectedNode)'
+  ],
+  /** True if the progress is not complete yet (< 100 %). */
+  _isNotComplete: function(progress) {
+    return progress.value < 100;
+  },
+  _getContainerClass: function(progress) {
+    var result = 'container';
+    if (progress.error) {
+      result += ' error';
+    }
+    if (this._isNotComplete(progress)) {
+      result += ' loading';
+    }
+    return result;
+  },
+  _updateNodeInclude: function(nodeName) {
+    var node = this.graphHierarchy.node(nodeName);
+    this.set("_selectedNodeInclude",
+      node ? node.include : tf.graph.InclusionType.UNSPECIFIED);
+  },
+  _nodeToggleExtract: function() {
+    this._updateNodeInclude(this._selectedNode);
+  }
+});
+</script>
diff --git a/tensorflow/tensorboard/components/tf_graph_common/BUILD b/tensorflow/tensorboard/components/tf_graph_common/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..50f7a309ed9fdd73b3588ac8a5235c93bdc8d891
--- /dev/null
+++ b/tensorflow/tensorboard/components/tf_graph_common/BUILD
@@ -0,0 +1,65 @@
+package(default_visibility = ["//tensorflow:internal"])
+
+load("@io_bazel_rules_closure//closure:defs.bzl", "web_library")
+load("//tensorflow/tensorboard:defs.bzl", "tensorboard_ts_library")
+load("//tensorflow/tensorboard:defs.bzl", "tensorboard_typescript_genrule")
+load("//tensorflow/tensorboard:defs.bzl", "tensorboard_webcomponent_library")
+
+licenses(["notice"])  # Apache 2.0
+
+web_library(
+    name = "tf_graph_common",
+    srcs = [
+        "tf-graph-common.html",
+        ":ts",
+    ],
+    path = "/tf-graph-common",
+    deps = [
+        "//tensorflow/tensorboard/components/tf_imports:d3",
+        "//tensorflow/tensorboard/components/tf_imports:dagre",
+        "//tensorflow/tensorboard/components/tf_imports:graphlib",
+        "//tensorflow/tensorboard/components/tf_imports:lodash",
+        "@org_polymer",
+    ],
+)
+
+tensorboard_typescript_genrule(
+    name = "ts",
+    srcs = glob(["*.ts"]),
+    typings = [
+        "@org_definitelytyped//:d3.d.ts",
+        "@org_definitelytyped//:lodash.d.ts",
+        "@org_definitelytyped//:polymer.d.ts",
+        "@org_definitelytyped//:webcomponents.js.d.ts",
+    ],
+)
+
+filegroup(
+    name = "all_files",
+    srcs = glob(["**"]),
+    tags = ["notsan"],
+)
+
+################################################################################
+# MARKED FOR DELETION
+
+tensorboard_webcomponent_library(
+    name = "legacy",
+    srcs = [
+        "tf-graph-common.html",
+        ":legacy_ts",
+    ],
+    destdir = "tf-graph-common",
+    deps = [
+        "//tensorflow/tensorboard/components:tf_imports",
+        "//third_party/javascript/polymer/v1/polymer:lib",
+    ],
+)
+
+tensorboard_ts_library(
+    name = "legacy_ts",
+    srcs = glob(["*.ts"]),
+    deps_mgmt = "off",
+    runtime = "nodejs",
+    deps = ["//tensorflow/tensorboard/components:common_deps"],
+)
diff --git a/tensorflow/tensorboard/components/tf_graph_common/graph.ts b/tensorflow/tensorboard/components/tf_graph_common/graph.ts
index 1b0abcfd85311e7c66481e76fa7f5351eaafded0..e60ee0e0f293fbe83d804fa39afdc3c00a19696d 100644
--- a/tensorflow/tensorboard/components/tf_graph_common/graph.ts
+++ b/tensorflow/tensorboard/components/tf_graph_common/graph.ts
@@ -409,9 +409,9 @@ export function joinStatsInfoWithGraph(
       // Lookup the node in the graph by its original name, e.g. A. If not
       // found, lookup by the rewritten name A/(A) in case the name is both
       // a namespace and a node name.
-      let nodeName = nodeStats.node_name in graph.nodes ? nodeStats.node_name :
-                                                          nodeStats.node_name +
-              NAMESPACE_DELIM + '(' + nodeStats.node_name + ')';
+      const nodeName = nodeStats.node_name in graph.nodes ?
+          nodeStats.node_name :
+          getStrictName(nodeStats.node_name);
 
       // Couldn't find a matching node.
       if (!(nodeName in graph.nodes)) {
diff --git a/tensorflow/tensorboard/components/tf_graph_common/test/graph-test.ts b/tensorflow/tensorboard/components/tf_graph_common/test/graph-test.ts
index af3030197e0824aaa808a8ad5b77fadf0cc856f9..5b546e9dd325d791ee83ede964e2cb1f726d0c9e 100644
--- a/tensorflow/tensorboard/components/tf_graph_common/test/graph-test.ts
+++ b/tensorflow/tensorboard/components/tf_graph_common/test/graph-test.ts
@@ -33,6 +33,12 @@ suite('graph', () => {
         op: "MatMul"
         input: "Q:2"
         input: "W"
+      }
+      node {
+        name: "XX/YY/(YY)"
+        op: "MatMul"
+        input: "Q:2"
+        input: "W"
       }`);
     let statsPbtxt = tf.graph.test.util.stringToArrayBuffer(`step_stats {
       dev_stats {
@@ -47,6 +53,11 @@ suite('graph', () => {
           all_start_micros: 12
           all_end_rel_micros: 4
         }
+        node_stats {
+          node_name: "XX/YY"
+          all_start_micros: 20
+          all_end_rel_micros: 4
+        }
       }
     }`);
 
@@ -64,6 +75,7 @@ suite('graph', () => {
             assert.isTrue(slimGraph.nodes['X'] != null);
             assert.isTrue(slimGraph.nodes['W'] != null);
             assert.isTrue(slimGraph.nodes['Q'] != null);
+            assert.isTrue(slimGraph.nodes['XX/YY(YY)'] != null);
 
             let firstInputOfX = slimGraph.nodes['X'].inputs[0];
             assert.equal(firstInputOfX.name, 'Q');
@@ -76,6 +88,8 @@ suite('graph', () => {
             tf.graph.parser.parseStatsPbTxt(statsPbtxt).then(stepStats => {
               tf.graph.joinStatsInfoWithGraph(slimGraph, stepStats);
               assert.equal(slimGraph.nodes['Q'].stats.getTotalMicros(), 6);
+              assert.equal(
+                  slimGraph.nodes['XX/YY/(YY)'].stats.getTotalMicros(), 4);
               done();
             });
           });
diff --git a/tensorflow/tensorboard/components/tf_graph_common_d3v4/BUILD b/tensorflow/tensorboard/components/tf_graph_common_d3v4/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..9b7dcef8bc39a9baadd7ccbf0a3bf13f32d2d0b4
--- /dev/null
+++ b/tensorflow/tensorboard/components/tf_graph_common_d3v4/BUILD
@@ -0,0 +1,41 @@
+package(default_visibility = ["//tensorflow:internal"])
+
+load("@io_bazel_rules_closure//closure:defs.bzl", "web_library")
+load("//tensorflow/tensorboard:defs.bzl", "tensorboard_ts_library")
+load("//tensorflow/tensorboard:defs.bzl", "tensorboard_typescript_genrule")
+load("//tensorflow/tensorboard:defs.bzl", "tensorboard_webcomponent_library")
+
+licenses(["notice"])  # Apache 2.0
+
+web_library(
+    name = "tf_graph_common_d3v4",
+    srcs = [
+        "tf-graph-common.html",
+        ":ts",
+    ],
+    path = "/tf-graph-common",
+    deps = [
+        "//tensorflow/tensorboard/components/tf_imports_d3v4:d3",
+        "//tensorflow/tensorboard/components/tf_imports_d3v4:dagre",
+        "//tensorflow/tensorboard/components/tf_imports_d3v4:graphlib",
+        "//tensorflow/tensorboard/components/tf_imports_d3v4:lodash",
+        "@org_polymer",
+    ],
+)
+
+tensorboard_typescript_genrule(
+    name = "ts",
+    srcs = glob(["*.ts"]),
+    typings = [
+        "//tensorflow/tensorboard/components/tf_imports_d3v4:d3.d.ts",
+        "@org_definitelytyped//:lodash.d.ts",
+        "@org_definitelytyped//:polymer.d.ts",
+        "@org_definitelytyped//:webcomponents.js.d.ts",
+    ],
+)
+
+filegroup(
+    name = "all_files",
+    srcs = glob(["**"]),
+    tags = ["notsan"],
+)
diff --git a/tensorflow/tensorboard/components/tf_graph_common_d3v4/annotation.ts b/tensorflow/tensorboard/components/tf_graph_common_d3v4/annotation.ts
new file mode 100644
index 0000000000000000000000000000000000000000..bde382977858d7a3a3a69ea233c801c41ab7b4f0
--- /dev/null
+++ b/tensorflow/tensorboard/components/tf_graph_common_d3v4/annotation.ts
@@ -0,0 +1,235 @@
+/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the 'License');
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an 'AS IS' BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+module tf.graph.scene.annotation {
+  /**
+   * Populate a given annotation container group
+   *
+   *     <g class='{in|out}-annotations'></g>
+   *
+   * with annotation group of the following structure:
+   *
+   * <g class='annotation'>
+   *   <g class='annotation-node'>
+   *   <!--
+   *   Content here determined by Scene.node.buildGroup.
+   *   -->
+   *   </g>
+   * </g>
+   *
+   * @param container selection of the container.
+   * @param annotationData node.{in|out}Annotations
+   * @param d node to build group for.
+   * @param sceneElement <tf-graph-scene> polymer element.
+   * @return selection of appended objects
+   */
+  export function buildGroup(
+      container, annotationData: render.AnnotationList,
+      d: render.RenderNodeInfo, sceneElement) {
+    // Select all children and join with data.
+    let annotationGroups =
+        container
+            .selectAll(function() {
+              // using d3's selector function
+              // See https://github.com/mbostock/d3/releases/tag/v2.0.0
+              // (It's not listed in the d3 wiki.)
+              return this.childNodes;
+            })
+            .data(annotationData.list, d => { return d.node.name; });
+
+    annotationGroups.enter()
+        .append('g')
+        .attr('data-name', a => { return a.node.name; })
+        .each(function(a) {
+          let aGroup = d3.select(this);
+
+          // Add annotation to the index in the scene
+          sceneElement.addAnnotationGroup(a, d, aGroup);
+          // Append annotation edge
+          let edgeType = Class.Annotation.EDGE;
+          let metaedge = a.renderMetaedgeInfo && a.renderMetaedgeInfo.metaedge;
+          if (metaedge && !metaedge.numRegularEdges) {
+            edgeType += ' ' + Class.Annotation.CONTROL_EDGE;
+          }
+          // If any edges are reference edges, add the reference edge class.
+          if (metaedge && metaedge.numRefEdges) {
+            edgeType += ' ' + Class.Edge.REF_LINE;
+          }
+          edge.appendEdge(aGroup, a, sceneElement, edgeType);
+
+          if (a.annotationType !== render.AnnotationType.ELLIPSIS) {
+            addAnnotationLabelFromNode(aGroup, a);
+            buildShape(aGroup, a);
+          } else {
+            addAnnotationLabel(
+                aGroup, a.node.name, a, Class.Annotation.ELLIPSIS);
+          }
+        }).merge(annotationGroups)
+        .attr(
+            'class',
+            a => {
+              return Class.Annotation.GROUP + ' ' +
+                  annotationToClassName(a.annotationType) + ' ' +
+                  node.nodeClass(a);
+            })
+        .each(function(a) {
+          let aGroup = d3.select(this);
+          update(aGroup, d, a, sceneElement);
+          if (a.annotationType !== render.AnnotationType.ELLIPSIS) {
+            addInteraction(aGroup, d, a, sceneElement);
+          }
+        });
+
+    annotationGroups.exit()
+        .each(function(a) {
+          let aGroup = d3.select(this);
+
+          // Remove annotation from the index in the scene
+          sceneElement.removeAnnotationGroup(a, d, aGroup);
+        })
+        .remove();
+    return annotationGroups;
+};
+
+/**
+ * Maps an annotation enum to a class name used in css rules.
+ */
+function annotationToClassName(annotationType: render.AnnotationType) {
+  return (render.AnnotationType[annotationType] || '').toLowerCase() || null;
+}
+
+function buildShape(aGroup, a: render.Annotation) {
+  if (a.annotationType === render.AnnotationType.SUMMARY) {
+    let summary = selectOrCreateChild(aGroup, 'use');
+    summary
+      .attr('class', 'summary')
+      .attr('xlink:href', '#summary-icon')
+      .attr('cursor', 'pointer');
+  } else {
+    let shape = node.buildShape(aGroup, a, Class.Annotation.NODE);
+    // add title tag to get native tooltips
+    selectOrCreateChild(shape, 'title').text(a.node.name);
+  }
+}
+
+function addAnnotationLabelFromNode(aGroup, a: render.Annotation) {
+  let namePath = a.node.name.split('/');
+  let text = namePath[namePath.length - 1];
+  return addAnnotationLabel(aGroup, text, a, null);
+}
+
+function addAnnotationLabel(
+    aGroup, label: string, a: render.Annotation, additionalClassNames) {
+  let classNames = Class.Annotation.LABEL;
+  if (additionalClassNames) {
+    classNames += ' ' + additionalClassNames;
+  }
+  let txtElement = aGroup.append('text')
+                       .attr('class', classNames)
+                       .attr('dy', '.35em')
+                       .attr('text-anchor', a.isIn ? 'end' : 'start')
+                       .text(label);
+
+  return tf.graph.scene.node.enforceLabelWidth(txtElement, -1);
+}
+
+function addInteraction(selection, d: render.RenderNodeInfo,
+    annotation: render.Annotation, sceneElement) {
+  selection
+      .on('mouseover',
+          a => {
+            sceneElement.fire(
+                'annotation-highlight',
+                {name: a.node.name, hostName: d.node.name});
+          })
+      .on('mouseout',
+          a => {
+            sceneElement.fire(
+                'annotation-unhighlight',
+                {name: a.node.name, hostName: d.node.name});
+          })
+      .on('click', a => {
+        // Stop this event's propagation so that it isn't also considered a
+        // graph-select.
+        (<Event>d3.event).stopPropagation();
+        sceneElement.fire(
+            'annotation-select', {name: a.node.name, hostName: d.node.name});
+      });
+  if (annotation.annotationType !== render.AnnotationType.SUMMARY &&
+      annotation.annotationType !== render.AnnotationType.CONSTANT) {
+    selection.on(
+        'contextmenu', contextmenu.getMenu(
+                           node.getContextMenu(annotation.node, sceneElement)));
+  }
+};
+
+/**
+ * Adjust annotation's position.
+ *
+ * @param aGroup selection of a 'g.annotation' element.
+ * @param d Host node data.
+ * @param a annotation node data.
+ * @param sceneElement <tf-graph-scene> polymer element.
+ */
+function update(aGroup, d: render.RenderNodeInfo, a: render.Annotation,
+    sceneElement) {
+  let cx = layout.computeCXPositionOfNodeShape(d);
+  // Annotations that point to embedded nodes (constants,summary)
+  // don't have a render information attached so we don't stylize these.
+  // Also we don't stylize ellipsis annotations (the string '... and X more').
+  if (a.renderNodeInfo &&
+      a.annotationType !== render.AnnotationType.ELLIPSIS) {
+    node.stylize(aGroup, a.renderNodeInfo, sceneElement,
+      Class.Annotation.NODE);
+  }
+
+  if (a.annotationType === render.AnnotationType.SUMMARY) {
+    // Update the width of the annotation to give space for the image.
+    a.width += 10;
+  }
+
+  // label position
+  aGroup.select('text.' + Class.Annotation.LABEL).transition()
+    .attr('x', cx + a.dx + (a.isIn ? -1 : 1) * (a.width / 2 + a.labelOffset))
+    .attr('y', d.y + a.dy);
+
+  // Some annotations (such as summary) are represented using a 12x12 image tag.
+  // Purposely omitted units (e.g. pixels) since the images are vector graphics.
+  // If there is an image, we adjust the location of the image to be vertically
+  // centered with the node and horizontally centered between the arrow and the
+  // text label.
+  aGroup.select('use.summary').transition()
+    .attr('x', cx + a.dx - 3)
+    .attr('y', d.y + a.dy - 6);
+
+  // Node position (only one of the shape selection will be non-empty.)
+  positionEllipse(
+      aGroup.select('.' + Class.Annotation.NODE + ' ellipse'), cx + a.dx,
+      d.y + a.dy, a.width, a.height);
+  positionRect(
+      aGroup.select('.' + Class.Annotation.NODE + ' rect'), cx + a.dx,
+      d.y + a.dy, a.width, a.height);
+  positionRect(
+      aGroup.select('.' + Class.Annotation.NODE + ' use'), cx + a.dx,
+      d.y + a.dy, a.width, a.height);
+
+  // Edge position
+  aGroup.select('path.' + Class.Annotation.EDGE).transition().attr('d', a => {
+    // map relative position to absolute position
+    let points = a.points.map(p => { return {x: p.dx + cx, y: p.dy + d.y}; });
+    return edge.interpolate(points);
+  });
+};
+
+} // close module
diff --git a/tensorflow/tensorboard/components/tf_graph_common_d3v4/colors.ts b/tensorflow/tensorboard/components/tf_graph_common_d3v4/colors.ts
new file mode 100644
index 0000000000000000000000000000000000000000..40f91f7d2dbde23d20fe7f5f694994a4beb3b94f
--- /dev/null
+++ b/tensorflow/tensorboard/components/tf_graph_common_d3v4/colors.ts
@@ -0,0 +1,130 @@
+/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the 'License');
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an 'AS IS' BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+module tf {
+  /**
+   * Mapping from color palette name to color palette, which contains
+   * exact colors for multiple states of a single color palette.
+   */
+  export let COLORS = [
+    {
+      'name': 'Google Blue',
+      'color': '#4184f3',
+      'active': '#3a53c5',
+      'disabled': '#cad8fc'
+    },
+    {
+      'name': 'Google Red',
+      'color': '#db4437',
+      'active': '#8f2a0c',
+      'disabled': '#e8c6c1'
+    },
+    {
+      'name': 'Google Yellow',
+      'color': '#f4b400',
+      'active': '#db9200',
+      'disabled': '#f7e8b0'
+    },
+    {
+      'name': 'Google Green',
+      'color': '#0f9d58',
+      'active': '#488046',
+      'disabled': '#c2e1cc'
+    },
+    {
+      'name': 'Purple',
+      'color': '#aa46bb',
+      'active': '#5c1398',
+      'disabled': '#d7bce6'
+    },
+    {
+      'name': 'Teal',
+      'color': '#00abc0',
+      'active': '#47828e',
+      'disabled': '#c2eaf2'
+    },
+    {
+      'name': 'Deep Orange',
+      'color': '#ff6f42',
+      'active': '#ca4a06',
+      'disabled': '#f2cbba'
+    },
+    {
+      'name': 'Lime',
+      'color': '#9d9c23',
+      'active': '#7f771d',
+      'disabled': '#f1f4c2'
+    },
+    {
+      'name': 'Indigo',
+      'color': '#5b6abf',
+      'active': '#3e47a9',
+      'disabled': '#c5c8e8'
+    },
+    {
+      'name': 'Pink',
+      'color': '#ef6191',
+      'active': '#ca1c60',
+      'disabled': '#e9b9ce'
+    },
+    {
+      'name': 'Deep Teal',
+      'color': '#00786a',
+      'active': '#2b4f43',
+      'disabled': '#bededa'
+    },
+    {
+      'name': 'Deep Pink',
+      'color': '#c1175a',
+      'active': '#75084f',
+      'disabled': '#de8cae'
+    },
+    {
+      'name': 'Gray',
+      'color': '#9E9E9E',   // 500
+      'active': '#424242',  // 800
+      'disabled': 'F5F5F5'  // 100
+    }
+  ].reduce((m, c) => {
+    m[c.name] = c;
+    return m;
+  }, {});
+
+  /**
+   * Mapping from op category to color palette name
+   * e.g.,  OP_GROUP_COLORS['state_ops'] = 'Google Blue';
+   */
+  export let OP_GROUP_COLORS = [
+    {
+      color: 'Google Red',
+      groups: [
+        'gen_legacy_ops', 'legacy_ops', 'legacy_flogs_input',
+        'legacy_image_input', 'legacy_input_example_input',
+        'legacy_sequence_input', 'legacy_seti_input_input'
+      ]
+    },
+    {color: 'Deep Orange', groups: ['constant_ops']},
+    {color: 'Indigo', groups: ['state_ops']},
+    {color: 'Purple', groups: ['nn_ops', 'nn']},
+    {color: 'Google Green', groups: ['math_ops']},
+    {color: 'Lime', groups: ['array_ops']},
+    {color: 'Teal', groups: ['control_flow_ops', 'data_flow_ops']},
+    {color: 'Pink', groups: ['summary_ops']},
+    {color: 'Deep Pink', groups: ['io_ops']}
+  ].reduce((m, c) => {
+    c.groups.forEach(function(group) { m[group] = c.color; });
+    return m;
+  }, {});
+}
diff --git a/tensorflow/tensorboard/components/tf_graph_common_d3v4/common.ts b/tensorflow/tensorboard/components/tf_graph_common_d3v4/common.ts
new file mode 100644
index 0000000000000000000000000000000000000000..e7eac54e58fa50407c4a979b6eb6f2d22baf88af
--- /dev/null
+++ b/tensorflow/tensorboard/components/tf_graph_common_d3v4/common.ts
@@ -0,0 +1,31 @@
+/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the 'License');
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an 'AS IS' BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+/**
+ * @fileoverview Common interfaces for the tensorflow graph visualizer.
+ */
+
+module tf {
+  /**
+   * Tracks task progress. Each task being passed a progress tracker needs
+   * to call the below-defined methods to notify the caller about the gradual
+   * progress of the task.
+   */
+  export interface ProgressTracker {
+    updateProgress(incrementValue: number): void;
+    setMessage(msg: string): void;
+    reportError(msg: string, err: Error): void;
+  }
+} // close module tf
diff --git a/tensorflow/tensorboard/components/tf_graph_common_d3v4/contextmenu.ts b/tensorflow/tensorboard/components/tf_graph_common_d3v4/contextmenu.ts
new file mode 100644
index 0000000000000000000000000000000000000000..8121cf9f6dab97347efa33e388ecc8f2fb4e9d38
--- /dev/null
+++ b/tensorflow/tensorboard/components/tf_graph_common_d3v4/contextmenu.ts
@@ -0,0 +1,75 @@
+/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the 'License');
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an 'AS IS' BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+module tf.graph.scene.contextmenu {
+
+/** Function that converts data to a title string. */
+export interface TitleFunction {
+  (data: any): string;
+}
+
+/** Function that takes action based on item clicked in the context menu. */
+export interface ActionFunction {
+  (elem: any, d: any, i: number): void;
+}
+
+/**
+ * The interface for an item in the context menu
+ */
+export interface ContextMenuItem {
+  title: TitleFunction;
+  action: ActionFunction;
+}
+
+/**
+ * Returns the event listener, which can be used as an argument for the d3
+ * selection.on function. Renders the context menu that is to be displayed
+ * in response to the event.
+ */
+export function getMenu(menu: ContextMenuItem[]) {
+  let menuSelection = d3.select('.context-menu');
+  // Close the menu when anything else is clicked.
+  d3.select('body').on(
+      'click.context', function() { menuSelection.style('display', 'none'); });
+
+  // Function called to populate the context menu.
+  return function(data, index: number): void {
+    // Position and display the menu.
+    let event = <MouseEvent>d3.event;
+    menuSelection
+      .style('display', 'block')
+      .style('left', (event.layerX + 1) + 'px')
+      .style('top', (event.layerY + 1) + 'px');
+
+    // Stop the event from propagating further.
+    event.preventDefault();
+    event.stopPropagation();
+
+    // Add provided items to the context menu.
+    menuSelection.html('');
+    let list = menuSelection.append('ul');
+    list.selectAll('li')
+        .data(menu)
+        .enter()
+        .append('li')
+        .html(function(d) { return d.title(data); })
+        .on('click', (d, i) => {
+          d.action(this, data, index);
+          menuSelection.style('display', 'none');
+        });
+  };
+};
+
+} // close module
diff --git a/tensorflow/tensorboard/components/tf_graph_common_d3v4/edge.ts b/tensorflow/tensorboard/components/tf_graph_common_d3v4/edge.ts
new file mode 100644
index 0000000000000000000000000000000000000000..f3768e169b70945aa64541359c818168751d7d3f
--- /dev/null
+++ b/tensorflow/tensorboard/components/tf_graph_common_d3v4/edge.ts
@@ -0,0 +1,343 @@
+/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the 'License');
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an 'AS IS' BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+module tf.graph.scene.edge {
+
+/** Delimiter between dimensions when showing sizes of tensors. */
+const TENSOR_SHAPE_DELIM = '×';
+
+/** The minimum stroke width of an edge. */
+export const MIN_EDGE_WIDTH = 0.75;
+
+/** The maximum stroke width of an edge. */
+export const MAX_EDGE_WIDTH = 12;
+
+/** The exponent used in the power scale for edge thickness. */
+const EDGE_WIDTH_SCALE_EXPONENT = 0.3;
+
+/** The domain (min and max value) for the edge width. */
+const DOMAIN_EDGE_WIDTH_SCALE = [1, 5E6];
+
+export const EDGE_WIDTH_SCALE = d3.scalePow()
+      .exponent(EDGE_WIDTH_SCALE_EXPONENT)
+      .domain(DOMAIN_EDGE_WIDTH_SCALE)
+      .range([MIN_EDGE_WIDTH, MAX_EDGE_WIDTH])
+      .clamp(true);
+
+let arrowheadMap =
+    d3.scaleQuantize<String>().domain([MIN_EDGE_WIDTH, MAX_EDGE_WIDTH]).range([
+      'small', 'medium', 'large', 'xlarge'
+    ]);
+
+/** Minimum stroke width to put edge labels in the middle of edges */
+const CENTER_EDGE_LABEL_MIN_STROKE_WIDTH = 2.5;
+
+export type EdgeData = {v: string, w: string, label: render.RenderMetaedgeInfo};
+
+export function getEdgeKey(edgeObj: EdgeData) {
+  return edgeObj.v + EDGE_KEY_DELIM + edgeObj.w;
+}
+
+/**
+ * Select or Create a 'g.edges' group to a given sceneGroup
+ * and builds a number of 'g.edge' groups inside the group.
+ *
+ * Structure Pattern:
+ *
+ * <g class='edges'>
+ *   <g class='edge'>
+ *     <path class='edgeline'/>
+ *   </g>
+ *   ...
+ * </g>
+ *
+ *
+ * @param sceneGroup container
+ * @param graph
+ * @param sceneElement <tf-graph-scene> polymer element.
+ * @return selection of the created nodeGroups
+ */
+export function buildGroup(sceneGroup,
+    graph: graphlib.Graph<render.RenderNodeInfo, render.RenderMetaedgeInfo>,
+    sceneElement) {
+  let edges: EdgeData[] = [];
+  edges = _.reduce(graph.edges(), (edges, edgeObj) => {
+    let edgeLabel = graph.edge(edgeObj);
+    edges.push({
+      v: edgeObj.v,
+      w: edgeObj.w,
+      label: edgeLabel
+    });
+    return edges;
+  }, edges);
+
+  let container =
+      scene.selectOrCreateChild(sceneGroup, 'g', Class.Edge.CONTAINER);
+
+  // Select all children and join with data.
+  // (Note that all children of g.edges are g.edge)
+  let edgeGroups = (container as any).selectAll('g.edge').data(edges, getEdgeKey);
+
+  // Make edges a group to support rendering multiple lines for metaedge
+  edgeGroups.enter()
+      .append('g')
+      .attr('class', Class.Edge.GROUP)
+      .attr('data-edge', getEdgeKey)
+      .each(function(d: EdgeData) {
+        let edgeGroup = d3.select(this);
+        d.label.edgeGroup = edgeGroup;
+        // index node group for quick highlighting
+        sceneElement._edgeGroupIndex[getEdgeKey(d)] = edgeGroup;
+
+        // Add line during enter because we're assuming that type of line
+        // normally does not change.
+        appendEdge(edgeGroup, d, sceneElement);
+      })
+      .merge(edgeGroups)
+      .each(position)
+      .each(function(d) {
+    stylize(d3.select(this), d, sceneElement);
+  });
+
+  edgeGroups.exit()
+    .each(d => {
+      delete sceneElement._edgeGroupIndex[getEdgeKey(d)];
+    })
+    .remove();
+  return edgeGroups;
+};
+
+/**
+ * Returns the label for the given base edge.
+ * The label is the shape of the underlying tensor.
+ */
+export function getLabelForBaseEdge(
+    baseEdge: BaseEdge, renderInfo: render.RenderGraphInfo): string {
+  let node = <OpNode>renderInfo.getNodeByName(baseEdge.v);
+  if (node.outputShapes == null || node.outputShapes.length === 0) {
+    return null;
+  }
+  let shape = node.outputShapes[baseEdge.outputTensorIndex];
+  if (shape == null) {
+    return null;
+  }
+  if (shape.length === 0) {
+    return 'scalar';
+  }
+  return shape.map(size => { return size === -1 ? '?' : size; })
+      .join(TENSOR_SHAPE_DELIM);
+}
+
+/**
+ * Creates the label for the given metaedge. If the metaedge consists
+ * of only 1 tensor, and it's shape is known, the label will contain that
+ * shape. Otherwise, the label will say the number of tensors in the metaedge.
+ */
+export function getLabelForEdge(metaedge: Metaedge,
+    renderInfo: render.RenderGraphInfo): string {
+  let isMultiEdge = metaedge.baseEdgeList.length > 1;
+  return isMultiEdge ?
+      metaedge.baseEdgeList.length + ' tensors' :
+      getLabelForBaseEdge(metaedge.baseEdgeList[0], renderInfo);
+}
+
+/**
+ * Shortens the path enought such that the tip of the start/end marker will
+ * point to the start/end of the path. The marker can be of arbitrary size.
+ *
+ * @param points Array of path control points.
+ * @param marker D3 selection of the <marker> svg element.
+ * @param isStart Is the marker a `start-marker`. If false, the marker is
+ *     an `end-marker`.
+ * @return The new array of control points.
+ */
+function adjustPathPointsForMarker(points: render.Point[],
+    marker: d3.Selection<any, any, any, any>, isStart: boolean): render.Point[] {
+  let lineFunc = d3.line<render.Point>()
+    .x(d => d.x)
+    .y(d => d.y);
+  let path =
+      d3.select(document.createElementNS('http://www.w3.org/2000/svg', 'path'))
+          .attr('d', lineFunc(points));
+  let markerWidth = +marker.attr('markerWidth');
+  let viewBox = marker.attr('viewBox').split(' ').map(Number);
+  let viewBoxWidth = viewBox[2] - viewBox[0];
+  let refX = +marker.attr('refX');
+  let pathNode = <SVGPathElement> path.node();
+  if (isStart) {
+    let fractionStickingOut = refX / viewBoxWidth;
+    let length = markerWidth * fractionStickingOut;
+    let point = pathNode.getPointAtLength(length);
+    // Figure out how many segments of the path we need to remove in order
+    // to shorten the path.
+    let segIndex = pathNode.getPathSegAtLength(length);
+    // Update the very first segment.
+    points[segIndex - 1] = {x: point.x, y: point.y};
+    // Ignore every point before segIndex - 1.
+    return points.slice(segIndex - 1);
+  } else {
+    let fractionStickingOut = 1 - refX / viewBoxWidth;
+    let length = pathNode.getTotalLength() - markerWidth * fractionStickingOut;
+    let point = pathNode.getPointAtLength(length);
+    // Figure out how many segments of the path we need to remove in order
+    // to shorten the path.
+    let segIndex = pathNode.getPathSegAtLength(length);
+    // Update the very last segment.
+    points[segIndex] = {x: point.x, y: point.y};
+    // Ignore every point after segIndex.
+    return points.slice(0, segIndex + 1);
+  }
+}
+
+/**
+ * For a given d3 selection and data object, create a path to represent the
+ * edge described in d.label.
+ *
+ * If d.label is defined, it will be a RenderMetaedgeInfo instance. It
+ * will sometimes be undefined, for example for some Annotation edges for which
+ * there is no underlying Metaedge in the hierarchical graph.
+ */
+export function appendEdge(edgeGroup, d: EdgeData,
+    sceneElement: {renderHierarchy: render.RenderGraphInfo},
+    edgeClass?: string) {
+  let size = 1;
+  if (d.label != null && d.label.metaedge != null) {
+    // There is an underlying Metaedge.
+    size = d.label.metaedge.totalSize;
+  }
+  edgeClass = edgeClass || Class.Edge.LINE; // set default type
+
+  if (d.label && d.label.structural) {
+    edgeClass += ' ' + Class.Edge.STRUCTURAL;
+  }
+  // Give the path a unique id, which will be used to link
+  // the textPath (edge label) to this path.
+  let pathId = 'path_' + getEdgeKey(d);
+  let strokeWidth = sceneElement.renderHierarchy.edgeWidthScale(size);
+
+  let path = edgeGroup.append('path')
+                 .attr('id', pathId)
+                 .attr('class', edgeClass)
+                 .style('stroke-width', strokeWidth + 'px');
+
+  // Check if there is a reference edge and add an arrowhead of the right size.
+  if (d.label && d.label.metaedge && d.label.metaedge.numRefEdges) {
+    let markerId = `ref-arrowhead-${arrowheadMap(strokeWidth)}`;
+    path.style('marker-start', `url(#${markerId})`);
+    d.label.startMarkerId = markerId;
+  }
+
+  if (d.label == null || d.label.metaedge == null) {
+    // There is no associated metaedge, thus no text.
+    // This happens for annotation edges.
+    return;
+  }
+  let labelForEdge = getLabelForEdge(d.label.metaedge,
+      sceneElement.renderHierarchy);
+  if (labelForEdge == null) {
+    // We have no information to show on this edge.
+    return;
+  }
+
+  // Put edge label in the middle of edge only if the edge is thick enough.
+  let baseline = strokeWidth > CENTER_EDGE_LABEL_MIN_STROKE_WIDTH ?
+      'central' :
+      'text-after-edge';
+
+  edgeGroup.append('text')
+      .append('textPath')
+        .attr('xlink:href', '#' + pathId)
+        .attr('startOffset', '50%')
+        .attr('text-anchor', 'middle')
+        .attr('dominant-baseline', 'central')
+      .text(labelForEdge);
+};
+
+export let interpolate = d3.line<{x: number, y: number}>()
+                             .curve(d3.curveBasis)
+                             .x((d) => { return d.x;})
+                             .y((d) => { return d.y;});
+
+/**
+ * Returns a tween interpolator for the endpoint of an edge path.
+ */
+function getEdgePathInterpolator(d: EdgeData, i: number, a: string) {
+  let renderMetaedgeInfo = <render.RenderMetaedgeInfo> d.label;
+  let adjoiningMetaedge = renderMetaedgeInfo.adjoiningMetaedge;
+  let points = renderMetaedgeInfo.points;
+
+  // Adjust the path so that start/end markers point to the end
+  // of the path.
+  if (d.label.startMarkerId) {
+    points = adjustPathPointsForMarker(
+        points, d3.select('#' + d.label.startMarkerId), true);
+  }
+  if (d.label.endMarkerId) {
+    points = adjustPathPointsForMarker(
+        points, d3.select('#' + d.label.endMarkerId), false);
+  }
+
+  if (!adjoiningMetaedge) {
+    return d3.interpolate(a, interpolate(points));
+  }
+
+  let renderPath = this;
+
+  // Get the adjoining path that matches the adjoining metaedge.
+  let adjoiningPath =
+    <SVGPathElement>((<HTMLElement>adjoiningMetaedge.edgeGroup.node())
+      .firstChild);
+
+  // Find the desired SVGPoint along the adjoining path, then convert those
+  // coordinates into the space of the renderPath using its Current
+  // Transformation Matrix (CTM).
+  let inbound = renderMetaedgeInfo.metaedge.inbound;
+
+  return function(t) {
+    let adjoiningPoint = adjoiningPath
+      .getPointAtLength(inbound ? adjoiningPath.getTotalLength() : 0)
+      .matrixTransform(adjoiningPath.getCTM())
+      .matrixTransform(renderPath.getCTM().inverse());
+
+    // Update the relevant point in the renderMetaedgeInfo's points list, then
+    // re-interpolate the path.
+    let index = inbound ? 0 : points.length - 1;
+    points[index].x = adjoiningPoint.x;
+    points[index].y = adjoiningPoint.y;
+    let dPath = interpolate(points);
+    return dPath;
+  };
+}
+
+function position(d) {
+  d3.select(this)
+      .select('path.' + Class.Edge.LINE)
+      .transition()
+      .attrTween('d', getEdgePathInterpolator as any);
+};
+
+/**
+ * For a given d3 selection and data object, mark the edge as a control
+ * dependency if it contains only control edges.
+ *
+ * d's label property will be a RenderMetaedgeInfo object.
+ */
+function stylize(edgeGroup, d: EdgeData, stylize) {
+  edgeGroup.classed('faded', d.label.isFadedOut);
+  let metaedge = d.label.metaedge;
+  edgeGroup.select('path.' + Class.Edge.LINE)
+      .classed('control-dep', metaedge && !metaedge.numRegularEdges);
+};
+
+} // close module
diff --git a/tensorflow/tensorboard/components/tf_graph_common_d3v4/externs.ts b/tensorflow/tensorboard/components/tf_graph_common_d3v4/externs.ts
new file mode 100644
index 0000000000000000000000000000000000000000..7c0d168a4298c30a3554c9079d6573a9b63a76f6
--- /dev/null
+++ b/tensorflow/tensorboard/components/tf_graph_common_d3v4/externs.ts
@@ -0,0 +1,85 @@
+/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the 'License');
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an 'AS IS' BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+/**
+ * @fileoverview Extern declarations for tensorflow graph visualizer.
+ *     This file contains compiler stubs for external dependencies whos
+ *     implementations are defined at runtime.
+ */
+
+declare module graphlib {
+  interface GraphOptions {
+    name?: string;
+    /**
+     * Direction for rank nodes. Can be TB, BT, LR, or RL, where T = top,
+     * B = bottom, L = left, and R = right.
+     */
+    rankdir?: string;
+    type?: string|number;
+    /** Number of pixels between each rank in the layout. */
+    ranksep?: number;
+    /** Number of pixels that separate nodes horizontally in the layout. */
+    nodesep?: number;
+    /** Number of pixels that separate edges horizontally in the layout */
+    edgesep?: number;
+  }
+
+  export interface EdgeObject {
+    v: string;
+    w: string;
+    name?: string;
+  }
+
+  export class Graph<N, E> {
+    constructor(opt?: Object);
+    setNode(name: string, value?: N): void;
+    hasNode(name: string): boolean;
+    setEdge(fromName: string, toName: string, value?: E): void;
+    hasEdge(fromName: string, toName: string): boolean;
+    edge(fromName: string, toName: string): E;
+    edge(edgeObject: EdgeObject): E;
+    removeEdge(v: string, w: string): void;
+    nodes(): string[];
+    node(name: string): N;
+    removeNode(name: string): void;
+    setGraph(graphOptions: GraphOptions): void;
+    graph(): GraphOptions;
+    nodeCount(): number;
+    neighbors(name: string): string[];
+    successors(name: string): string[];
+    predecessors(name: string): string[];
+    edges(): EdgeObject[];
+    outEdges(name: string): E[];
+    inEdges(name: string): E[];
+    /**
+     * Returns those nodes in the graph that have no in-edges.
+     * Takes O(|V|) time.
+     */
+    sources(): string[];
+    /**
+     * Remove the node with the id v in the graph or do nothing if
+     * the node is not in the graph. If the node was removed this
+     * function also removes any incident edges. Returns the graph,
+     * allowing this to be chained with other functions. Takes O(|E|) time.
+     */
+    removeNode(name: string): Graph<N, E>;
+    setParent(name: string, parentName: string): void;
+  }
+}
+
+/**
+ * Declaring dagre var used for dagre layout.
+ */
+declare var dagre: {layout(graph: graphlib.Graph<any, any>): void;};
diff --git a/tensorflow/tensorboard/components/tf_graph_common_d3v4/graph.ts b/tensorflow/tensorboard/components/tf_graph_common_d3v4/graph.ts
new file mode 100644
index 0000000000000000000000000000000000000000..1b0abcfd85311e7c66481e76fa7f5351eaafded0
--- /dev/null
+++ b/tensorflow/tensorboard/components/tf_graph_common_d3v4/graph.ts
@@ -0,0 +1,1257 @@
+/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the 'License');
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an 'AS IS' BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+module tf.graph {
+
+/** Delimiter used in node names to denote namespaces. */
+export const NAMESPACE_DELIM = '/';
+export const ROOT_NAME = '__root__';
+
+/** Attribute key used for storing attributes that are too large. */
+export const LARGE_ATTRS_KEY = '_too_large_attrs';
+/**
+ * Maximum allowed size in bytes, before the attribute is considered large
+ * and filtered out of the graph.
+ */
+export const LIMIT_ATTR_SIZE = 1024;
+
+// Separator between the source and the destination name of the edge.
+export const EDGE_KEY_DELIM = '--';
+
+export enum GraphType {FULL, EMBEDDED, META, SERIES, CORE, SHADOW, BRIDGE,
+    EDGE};
+export enum NodeType {META, OP, SERIES, BRIDGE, ELLIPSIS};
+
+/** Indicates if a node is to be included in the main graph when rendered. */
+export enum InclusionType {INCLUDE, EXCLUDE, UNSPECIFIED};
+
+/** Indicates if a series is to be grouped in the graph when rendered. */
+export enum SeriesGroupingType {GROUP, UNGROUP};
+
+/** Attribute key reserved for the shapes of the output tensors. */
+const OUTPUT_SHAPES_KEY = '_output_shapes';
+
+/** Attribute key reserved for the XLA cluster that an op runs on. */
+const _XLA_CLUSTER_KEY = '_XlaCluster';
+
+/**
+ * A BaseEdge is the label object (in the graphlib sense) for an edge in the
+ * original, full graph produced after parsing. Subsequent graphs, like those
+ * which belong to Metanodes, should not use BaseEdge objects, but instead
+ * contain Metaedges (which in turn may contain any number of BaseEdges).
+ */
+export interface BaseEdge extends graphlib.EdgeObject {
+  isControlDependency: boolean;
+  isReferenceEdge: boolean;
+  /** The index of the output tensor of the source node. */
+  outputTensorIndex: number;
+}
+
+/**
+ * A SlimGraph is inspired by graphlib.Graph, but having only the functionality
+ * that we need.
+ */
+export class SlimGraph {
+  nodes: { [nodeName: string]: OpNode };
+  edges: BaseEdge[];
+
+  constructor() {
+    this.nodes = {};
+    this.edges = [];
+  }
+}
+
+export interface NormalizedInput {
+  name: string;
+  /** The index of the output tensor of the source node. */
+  outputTensorIndex: number;
+  isControlDependency: boolean;
+}
+
+export interface BuildParams {
+  enableEmbedding: boolean;
+  inEmbeddingTypes: string[];
+  outEmbeddingTypes: string[];
+  refEdges: { [inputEdge: string]: boolean };
+}
+
+/**
+ * The most basic information about a node in the hierarchical graph.
+ */
+export interface Node {
+  /** The name of the node, used frequently to look up nodes by name. */
+  name: string;
+  /** Which type of node this is. */
+  type: NodeType;
+  /**
+   * Whether this node is a type that may contain other nodes. Those types
+   * should extend from GroupNode.
+   *
+   * For an OpNode, isGroupNode will be false, even though it may have
+   * embeddings. These embedding Nodes will have their parentNode set to the
+   * OpNode. However, embeddings are later rendered as annotations, not as
+   * children to be made visible on expansion (like a Metanode or SeriesNode).
+   */
+  isGroupNode: boolean;
+  /**
+   * The number of nodes this node represents. For OpNodes, this will be 1, and
+   * for GroupNodes it will be a count of the total number of descendents it
+   * contains.
+   */
+  cardinality: number;
+  /**
+   * The Node which is this Node's parent. This is of type Node and not
+   * GroupNode because of embeddings, which will have a parent OpNode.
+   */
+  parentNode: Node;
+  /** Runtime execution stats for this node, if available */
+  stats: NodeStats;
+  /** If the node is to be included or excluded from the main graph when
+   *  rendered. Defaults to UNSPECIFIED, which means that the rendering
+   *  algorithm determines if it will be included or not. Then can be set to
+   *  INCLUDE or EXCLUDE manually by the user.
+   */
+  include: InclusionType;
+  /**
+   * Node attributes specify customizable visual aspects of a node and
+   * application-specific metadata associated with a node. The name
+   * 'nodeAttributes' is meant to avoid naming-conflicts with the 'attr' in
+   * subclasses of Node.
+   */
+  nodeAttributes: {[key: string]: any;};
+}
+
+export type TensorShape = number[];
+
+export interface OpNode extends Node {
+  op: string;
+  // The device on which the op ran. Null if it is unknown.
+  device: string;
+  attr: {key: string, value: any}[];
+  inputs: NormalizedInput[];
+  inEmbeddings: OpNode[];
+  outEmbeddings: OpNode[];
+  // The name of the SeriesNode that can contain this node in its series.
+  // If there is no such node, then this is null.
+  owningSeries: string;
+  /**
+   * Array of tensor shapes. Null if the number of output tensors is unknown,
+   * otherwise the length will equal the number of output tensors.
+   *
+   * Each tensor shape is an array of numbers, or null. Details:
+   * - null means unknown rank, and therefore entire shape is unknown.
+   * - [4, 2, 1] means rank-3 tensor of size 4x2x1.
+   * - [] means a scalar (rank-0 tensor).
+   * - [1] means rank-1 tensor of size 1 (not the same as scalar).
+   * - [5, -1, 3] means rank-3 tensor of shape is 5x?x3. The size
+   *       of the middle dimension is unknown (encoded as -1).
+   */
+  outputShapes: TensorShape[];
+  // The XLA Cluster on which the op ran. Null if it is unknown.
+  xlaCluster: string;
+}
+
+export interface BridgeNode extends Node {
+  /**
+   * Whether this bridge node represents edges coming into its parent node.
+   */
+  inbound: boolean;
+}
+
+/**
+ * A node that is used when there are more than the maximum number of allowed
+ * annotations hanging off of a node.  This node represents an ellipsis
+ * annotation, indicating a number of additional annotations.
+ */
+export interface EllipsisNode extends Node {
+  /**
+   * The number of nodes this ellipsis represents.
+   */
+  numMoreNodes: number;
+
+  /**
+   * Sets the number of nodes this ellipsis represents and changes the node
+   * name accordingly.
+   */
+  setNumMoreNodes(numNodes: number);
+}
+
+export interface GroupNode extends Node {
+  /**
+   * The metagraph contains nodes and metaedges between the immediate children
+   * of this group. The node label objects may be other GroupNodes (like
+   * SeriesNodes and Metanodes) or individual OpNodes. All edge label objects
+   * are Metaedges, each of which contains references to the original
+   * BaseEdge(s) from which it was created.
+   */
+  metagraph: graphlib.Graph<GroupNode|OpNode, Metaedge>;
+
+  /**
+   * The bridgegraph contains only edges which link immediate children of this
+   * group with nodes outside of the metagraph. As in the metagraph, all edge
+   * label objects are Metaedges which contain references to the original
+   * BaseEdge(s) that contribute to it.
+   *
+   * For a Metaedge in the bridgegraph, its external endpoint will be the same
+   * as the metagraph edge from which it came. This is most easily explained
+   * by example.
+   *
+   * Consider an original graph that contains a BaseEdge A/B/C->Z/Y/X.
+   *
+   *     +-------+    (BaseEdge)     +-------+
+   *     | A/B/C |>----------------->| Z/Y/X |
+   *     +-------+                   +-------+
+   *
+   * When we construct the Root's metagraph, it will contain nodes for A and Z,
+   * and a Metaedge A->Z. The A->Z Metaedge will contain the original BaseEdge
+   * A/B/C->Z/Y/X in its baseEdgeGraph. The Root's bridgegraph will always be
+   * empty.
+   *
+   *     +---+    (Root.metagraph edge)    +---+
+   *     | A |>--------------------------->| Z |
+   *     +---+                             +---+
+   *
+   * Now consider the Metanode A. Its metagraph will contain a Metanode for A/B
+   * and no edges. A's bridgegraph will have one Metaedge from A/B->Z, which
+   * was derived from the Root's Metaedge A->Z. That Metaedge will contain the
+   * original BaseEdge in its baseEdgeGraph.
+   *
+   *     +---------+
+   *     | A       |
+   *     |  +---+  |   (A.bridgegraph edge)    +---+
+   *     |  | B |>---------------------------->| Z |
+   *     |  +---+  |                           +---+
+   *     +---------+
+   *
+   * Finally, consider the Metanode A/B. Its metagraph will contain a Metanode
+   * for A/B/C and again no edges. A/B's bridgegraph will have one Metaedge
+   * from A/B/C->Z, which was derived from A's bridgegraph Metaedge A/B->Z.
+   * As before, the A/B/C->Z Metaedge will contain the original BaseEdge in its
+   * baseEdgeGraph.
+   *
+   *     +---------------+
+   *     | A             |
+   *     |  +---------+  |
+   *     |  | B       |  |
+   *     |  |  +---+  |  |   (A/B.bridgegraph edge)      +---+
+   *     |  |  | C |>----------------------------------->| Z |
+   *     |  |  +---+  |  |                               +---+
+   *     |  +---------+  |
+   *     +---------------+
+   *
+   * Likewise, under the Metanode Z and Z/Y, to compute the bridgegraph, we'll
+   * end up with Metaedges A->Z/Y and A->Z/Y/X respectively. So the original
+   * BaseEdge A/B/C->Z/Y/X becomes four different Metaedges in four different
+   * bridgegraphs:
+   *
+   *   + A/B->Z in GroupNode A's bridgegraph,
+   *   + A/B/C->Z in GroupNode A/B's bridgegraph,
+   *   + A->Z/Y in GroupNode Z's bridgegraph, and
+   *   + A->Z/Y/X in GroupNode Z/Y's bridgegraph.
+   *
+   * Considering any BaseEdge then, if N is the number of path segments in the
+   * source and M is the number of path segments in the destination, then the
+   * total number of bridgegraph edges you could create would be (N-1)(M-1).
+   *
+   * For this reason, it is computationally expensive to generate all the
+   * bridgegraphs for all the Metanodes, and instead they should be computed
+   * on demand as needed.
+   */
+  bridgegraph: graphlib.Graph<GroupNode|OpNode, Metaedge>;
+
+  /**
+   * Stores how many times each device name appears in its children
+   * op nodes. Used to color group nodes by devices.
+   */
+  deviceHistogram: {[device: string]: number};
+
+  /**
+   * Flag indicating whether this GroupNode's metagraph contains any edges that
+   * are not control edges. Used to quickly determine how to draw a collapsed
+   * series (vertically or horizontally).
+   */
+  hasNonControlEdges: boolean;
+}
+
+export interface Metanode extends GroupNode {
+  depth: number;
+  templateId: string;
+  opHistogram: {[op: string]: number};
+  getFirstChild(): GroupNode|OpNode;
+  getRootOp(): OpNode;
+  /** Return name of all leaves inside a metanode. */
+  leaves(): string[];
+}
+
+export interface SeriesNode extends GroupNode {
+  hasLoop: boolean;
+  prefix: string;
+  suffix: string;
+  clusterId: number;
+  ids: number[];
+  parent: string;
+}
+
+export class EllipsisNodeImpl implements EllipsisNode {
+  name: string;
+  numMoreNodes: number;
+  stats: NodeStats;
+  type: NodeType;
+  isGroupNode: boolean;
+  cardinality: number;
+  parentNode: Node;
+  include: InclusionType;
+  nodeAttributes: {[key: string]: any;};
+  /**
+   * Constructs a new ellipsis annotation node.
+   *
+   * @param numNodes The number of additional annotations this node represents.
+   */
+  constructor(numNodes: number) {
+    this.type = NodeType.ELLIPSIS;
+    this.isGroupNode = false;
+    this.cardinality = 1;
+    this.parentNode = null;
+    this.stats = null;
+    this.setNumMoreNodes(numNodes);
+    this.include = InclusionType.UNSPECIFIED;
+  }
+
+  setNumMoreNodes(numNodes: number) {
+    this.numMoreNodes = numNodes;
+    this.name = '... ' + numNodes + ' more';
+  }
+};
+
+/**
+ * A label object for nodes in the full graph and leaf nodes in the render
+ * graph.
+ */
+export class OpNodeImpl implements OpNode {
+  name: string;
+  op: string;
+  device: string;
+  stats: NodeStats;
+  attr: {key: string, value: any}[];
+  inputs: NormalizedInput[];
+  type: NodeType;
+  isGroupNode: boolean;
+  cardinality: number;
+  inEmbeddings: OpNode[];
+  outEmbeddings: OpNode[];
+  parentNode: Node;
+  include: InclusionType;
+  owningSeries: string;
+  outputShapes: TensorShape[];
+  nodeAttributes: {[key: string]: any;};
+  xlaCluster: string;
+
+  /**
+   * Constructs a new Op node.
+   *
+   * @param rawNode The raw node.
+   */
+  constructor(rawNode: tf.graph.proto.NodeDef) {
+    this.op = rawNode.op;
+    this.name = rawNode.name;
+    this.device = rawNode.device;
+    this.attr = rawNode.attr;
+    // An array of normalized inputs that denote the incoming edges to
+    // the current node. Each input contains the normalized name of the
+    // source node, whether it has a number part and whether it is a
+    // control dependency.
+    this.inputs = normalizeInputs(rawNode.input);
+    this.outputShapes = extractOutputShapes(rawNode.attr);
+    this.xlaCluster = extractXlaCluster(rawNode.attr);
+    // additional properties
+    this.type = NodeType.OP;
+    this.isGroupNode = false;
+    this.cardinality = 1;
+    this.inEmbeddings = [];
+    this.outEmbeddings = [];
+    this.parentNode = null;
+    this.include = InclusionType.UNSPECIFIED;
+    this.owningSeries = null;
+  }
+};
+
+export function createMetanode(name: string, opt = {}): Metanode {
+  return new MetanodeImpl(name, opt);
+}
+
+/**
+ * Joins the information from the stats file (memory, compute time) with the
+ * graph information.
+ */
+export function joinStatsInfoWithGraph(
+    graph: SlimGraph, stats: tf.graph.proto.StepStats,
+    devicesForStats?: {[device: string]: boolean}): void {
+  // Reset stats for each node.
+  _.each(graph.nodes, node => { node.stats = null; });
+
+  _.each(stats.dev_stats, devStats => {
+    // Ignore devices that are not selected.
+    if (devicesForStats && !devicesForStats[devStats.device]) {
+      return;
+    }
+    _.each(devStats.node_stats, nodeStats => {
+      // Lookup the node in the graph by its original name, e.g. A. If not
+      // found, lookup by the rewritten name A/(A) in case the name is both
+      // a namespace and a node name.
+      let nodeName = nodeStats.node_name in graph.nodes ? nodeStats.node_name :
+                                                          nodeStats.node_name +
+              NAMESPACE_DELIM + '(' + nodeStats.node_name + ')';
+
+      // Couldn't find a matching node.
+      if (!(nodeName in graph.nodes)) {
+        return;
+      }
+
+      // Compute the total bytes used.
+      let totalBytes = 0;
+      if (nodeStats.memory) {
+        _.each(nodeStats.memory, alloc => {
+        if (alloc.total_bytes) {
+            if (alloc.total_bytes > 0) {
+              totalBytes += Number(alloc.total_bytes);
+            } else {
+              /* tslint:disable */
+              console.log(
+                  'ignoring negative memory allocation for ' + nodeName);
+              /* tslint:enable */
+            }
+          }
+        });
+      }
+      let outputSize: number[][] = null;
+      if (nodeStats.output) {
+        outputSize = _.map(nodeStats.output, output => {
+          return _.map(output.tensor_description.shape.dim,
+              dim => Number(dim.size));
+        });
+      }
+      graph.nodes[nodeName].device = devStats.device;
+      if (graph.nodes[nodeName].stats == null) {
+        graph.nodes[nodeName].stats = new NodeStats(outputSize);
+      }
+      graph.nodes[nodeName].stats.addBytesAllocation(totalBytes);
+      if (nodeStats.all_end_rel_micros) {
+        if (nodeStats.all_end_rel_micros > 0) {
+          graph.nodes[nodeName].stats.addExecutionTime(
+              nodeStats.all_start_micros,
+              nodeStats.all_start_micros + nodeStats.all_end_rel_micros);
+        } else {
+          /* tslint:disable */
+          console.log('ignoring negative runtime for ' + nodeName);
+          /* tslint:enable */
+        }
+      }
+    });
+  });
+}
+
+/**
+ * Execution stats for the node.
+ */
+export class NodeStats {
+  constructor(outputSize: number[][]) { this.outputSize = outputSize; }
+
+  /**
+   * Add the start and end time for a particular kernel execution of this op.
+   * Ops can have multiple kernel executions within the same session run.
+   */
+  addExecutionTime(startTime: number, endTime: number) {
+    if (this.startTime != null) {
+      this.startTime = Math.min(this.startTime, startTime);
+    } else {
+      this.startTime = startTime;
+    }
+    if (this.endTime != null) {
+      this.endTime = Math.max(this.endTime, endTime);
+    } else {
+      this.endTime = endTime;
+    }
+  }
+
+  /**
+   * Add the bytes allocated for a particular kernel execution of this op.
+   * Ops can have multiple kernel executions within the same session run.
+   */
+  addBytesAllocation(totalBytes: number) {
+    if (this.totalBytes != null) {
+      this.totalBytes = Math.max(this.totalBytes, totalBytes);
+    } else {
+      this.totalBytes = totalBytes;
+    }
+  }
+
+  /**
+   * Absolute start time for the very first kernel execution of this op.
+   */
+  startTime: number;
+  /**
+   * Absolute end time for the very last kernel execution of this op.
+   */
+  endTime: number;
+  /**
+   * Total number of bytes used for the node. Sum of all children
+   * if it is a Group node.
+   */
+  totalBytes = 0;
+
+  /**
+   * The shape of each output tensors, if there are any.
+   * Empty if it is a Group node.
+   */
+  outputSize: number[][];
+
+  /**
+   * Combines the specified stats with the current stats.
+   * Modifies the current object. This method is used to
+   * compute aggregate stats for group nodes.
+   */
+  combine(stats: NodeStats): void {
+    if (stats.totalBytes != null) {
+      this.totalBytes += stats.totalBytes;
+    }
+    if (stats.getTotalMicros() != null) {
+      this.addExecutionTime(stats.startTime, stats.endTime);
+    }
+  }
+
+  /**
+   * Total number of compute time in microseconds used for the node.
+   * Sum of all children if it is a Group node. Null if it is unknown.
+   * This method can not be scaffolded under a getter attribute because
+   * ECMAScript 5 does not support getter attributes.
+   */
+  getTotalMicros(): number {
+    if (this.startTime == null || this.endTime == null) {
+      return null;
+    }
+    return this.endTime - this.startTime;
+  }
+}
+
+export class MetanodeImpl implements Metanode {
+  name: string;
+  stats: NodeStats;
+  type: NodeType;
+  depth: number;
+  isGroupNode: boolean;
+  cardinality: number;
+  metagraph: graphlib.Graph<GroupNode|OpNode, Metaedge>;
+  bridgegraph: graphlib.Graph<GroupNode|OpNode, Metaedge>;
+  templateId: string;
+  opHistogram: {[op: string]: number};
+  deviceHistogram: {[op: string]: number};
+  parentNode: Node;
+  hasNonControlEdges: boolean;
+  include: InclusionType;
+  nodeAttributes: {[key: string]: any;};
+
+  /** A label object for meta-nodes in the graph hierarchy */
+  constructor(name: string, opt = {}) {
+    this.name = name;
+    this.type = NodeType.META;
+    /** number of levels under this group */
+    this.depth = 1;
+    this.isGroupNode = true;
+    /** # of leaf nodes (including embedded ones) */
+    this.cardinality = 0;
+    /** graph contains metanodes, nodes, edges
+     * and metaedges for main items within this metanode
+     */
+    this.metagraph =
+      createGraph<GroupNode|OpNode, Metaedge>(name, GraphType.META, opt);
+    /** bridgegraph must be constructed lazily-see hierarchy.getBridgegraph() */
+    this.bridgegraph = null;
+    /**
+     * A dictionary that count ops type of nodes in this metanode
+     * (op type => count).
+     */
+    this.opHistogram = {};
+    this.deviceHistogram = {};
+    /** unique id for a metanode of similar subgraph */
+    this.templateId = null;
+    /** Metanode which contains this node, if any */
+    this.parentNode = null;
+    this.hasNonControlEdges = false;
+    this.include = InclusionType.UNSPECIFIED;
+  }
+
+  getFirstChild(): GroupNode|OpNode {
+    return this.metagraph.node(this.metagraph.nodes()[0]);
+  }
+
+  /**
+   * Returns the op node associated with the metanode.
+   * For example, if the metanode is 'sgd', the associated
+   * op node is sgd/(sgd).
+   */
+  getRootOp(): OpNode {
+    let nameSplit = this.name.split('/');
+    let rootOpName = this.name + '/(' + nameSplit[nameSplit.length - 1] + ')';
+    return <OpNode>this.metagraph.node(rootOpName);
+  }
+
+  /**
+   * Return an array of the names of all the leaves (non-GroupNodes) inside
+   * this metanode. This performs a breadth-first search of the tree, so
+   * immediate child leaves will appear earlier in the output array than
+   * descendant leaves.
+   */
+  leaves(): string[] {
+    let leaves = [];
+    let queue = [<Node> this];
+    let metagraph; // Defined here due to a limitation of ES6->5 compilation.
+    while (queue.length) {
+      let node = queue.shift();
+      if (node.isGroupNode) {
+        metagraph = (<GroupNode> node).metagraph;
+        _.each(metagraph.nodes(), name => queue.push(metagraph.node(name)));
+      } else {
+        leaves.push(node.name);
+      }
+    }
+    return leaves;
+  }
+};
+
+export interface Metaedge extends graphlib.EdgeObject {
+
+  /**
+   * Stores the original BaseEdges represented by this Metaedge.
+   */
+  baseEdgeList: BaseEdge[];
+
+  /**
+   * Whether this edge represents a relationship that is inbound (or outbound)
+   * to the object which contains this information. For example, in a Metanode's
+   * bridgegraph, each edge connects an immediate child to something outside
+   * the Metanode. If the destination of the edge is inside the Metanode, then
+   * its inbound property should be true. If the destination is outside the
+   * Metanode, then its inbound property should be false.
+   *
+   * The property is optional because not all edges can be described as
+   * inbound/outbound. For example, in a Metanode's metagraph, all of the edges
+   * connect immediate children of the Metanode. None should have an inbound
+   * property, or they should be null/undefined.
+   */
+  inbound?: boolean;
+
+  /**
+   * Number of regular edges (not control dependency edges).
+   */
+  numRegularEdges: number;
+
+  /**
+   * Number of control dependency edges.
+   */
+  numControlEdges: number;
+
+  /**
+   * Number of reference edges, which is an edge to an operation
+   * that takes a reference to its input and changes its value.
+   */
+  numRefEdges: number;
+
+  /**
+   * Total size (number of units) of all the tensors flowing through this edge.
+   */
+  totalSize: number;
+
+  addBaseEdge(edge: BaseEdge, h: hierarchy.Hierarchy): void;
+}
+
+export function createMetaedge(v: string, w: string): Metaedge {
+  return new MetaedgeImpl(v, w);
+}
+
+/**
+ * A label object for edges between metanodes of subgraphs in the render graph.
+ */
+export class MetaedgeImpl implements Metaedge {
+  v: string;
+  w: string;
+  baseEdgeList: BaseEdge[];
+  inbound: boolean;
+  numRegularEdges: number;
+  numControlEdges: number;
+  numRefEdges: number;
+  totalSize: number;
+
+  constructor(v: string, w: string) {
+    this.v = v;
+    this.w = w;
+    this.baseEdgeList = [];
+    this.inbound = null;
+    this.numRegularEdges = 0;
+    this.numControlEdges = 0;
+    this.numRefEdges = 0;
+    this.totalSize = 0;
+  }
+
+  addBaseEdge(edge: BaseEdge, h: hierarchy.Hierarchy): void {
+    this.baseEdgeList.push(edge);
+    if (edge.isControlDependency) {
+      this.numControlEdges += 1;
+    } else {
+      this.numRegularEdges += 1;
+    }
+    if (edge.isReferenceEdge) {
+      this.numRefEdges += 1;
+    }
+    // Compute the size of the tensor flowing through this
+    // base edge.
+    this.totalSize += MetaedgeImpl.computeSizeOfEdge(edge, h);
+    h.maxMetaEdgeSize = Math.max(h.maxMetaEdgeSize, this.totalSize);
+  }
+
+  private static computeSizeOfEdge(edge: BaseEdge, h: hierarchy.Hierarchy):
+      number {
+    let opNode = <OpNode> h.node(edge.v);
+    if (opNode.outputShapes == null) {
+      // No shape information. Asssume a single number. This gives
+      // a lower bound for the total size.
+      return 1;
+    }
+    h.hasShapeInfo = true;
+    // Sum the sizes of all output tensors.
+    return _(opNode.outputShapes).map(shape => {
+      // If the shape is unknown, treat it as 1 when computing
+      // total size. This gives a lower bound for the total size.
+      if (shape == null) {
+        return 1;
+      }
+      // Multiply all shapes to get the total size of the tensor.
+      // E.g. The total size of [4, 2, 1] is 4 * 2 * 1.
+      return _(shape).reduce((accumulated, currSize) => {
+        // If this particular dimension is unknown, treat
+        // it as 1 when computing total size. This gives a lower bound
+        // for the total size.
+        if (currSize === -1) {
+          currSize = 1;
+        }
+        return accumulated * currSize;
+      }, 1);
+    }).sum();
+  }
+}
+
+export function createSeriesNode(prefix: string, suffix: string,
+    parent: string, clusterId: number, name: string): SeriesNode {
+  return new SeriesNodeImpl(prefix, suffix, parent, clusterId, name);
+}
+
+export function getSeriesNodeName(prefix: string, suffix: string,
+    parent: string, startId?: number, endId?: number): string {
+  let numRepresentation =
+      (typeof startId !== 'undefined' && typeof endId !== 'undefined') ?
+      '[' + startId + '-' + endId + ']' :
+      '#';
+  let pattern = prefix + numRepresentation + suffix;
+  return (parent ? parent + '/' : '') + pattern;
+}
+
+class SeriesNodeImpl implements SeriesNode {
+  name: string;
+  type: NodeType;
+  stats: NodeStats;
+  hasLoop: boolean;
+  prefix: string;
+  suffix: string;
+  clusterId: number;
+  ids: number[];
+  parent: string;
+  isGroupNode: boolean;
+  cardinality: number;
+  metagraph: graphlib.Graph<GroupNode|OpNode, Metaedge>;
+  bridgegraph: graphlib.Graph<GroupNode|OpNode, Metaedge>;
+  parentNode: Node;
+  deviceHistogram: {[op: string]: number};
+  hasNonControlEdges: boolean;
+  include: InclusionType;
+  nodeAttributes: {[key: string]: any;};
+
+  constructor(prefix: string, suffix: string, parent: string,
+      clusterId: number, name: string) {
+    this.name = name || getSeriesNodeName(prefix, suffix, parent);
+    this.type = NodeType.SERIES;
+    this.hasLoop = false;
+    this.prefix = prefix;
+    this.suffix = suffix;
+    this.clusterId = clusterId;
+    this.ids = [];
+    this.parent = parent;
+    this.isGroupNode = true;
+    this.cardinality = 0;
+    this.metagraph = createGraph<Metanode, Metaedge>(name, GraphType.SERIES);
+    // bridgegraph must be constructed lazily-see hierarchy.getBridgegraph()
+    this.bridgegraph = null;
+    this.parentNode = null;
+    this.deviceHistogram = {};
+    this.hasNonControlEdges = false;
+    this.include = InclusionType.UNSPECIFIED;
+  }
+}
+
+/**
+ * Extracts the shapes of the output tensors from the attr property in the
+ * node proto.
+ */
+// tslint:disable-next-line:no-any
+function extractOutputShapes(attr: Array<{key: string, value: any}>):
+    TensorShape[] {
+  let result = null;
+  // We don't know anything about the output tensors.
+  if (!attr) {
+    return null;
+  }
+  for (let i = 0; i < attr.length; i++) {
+    let {key, value} = attr[i];
+    if (key === OUTPUT_SHAPES_KEY) {
+      if (!value.list.shape) {
+        // The OUTPUT_SHAPES_KEY lacks a value. We know nothing about the shape.
+        return null;
+      }
+
+      // Map all output tensors into array of numbers denoting their shape.
+      let result = value.list.shape.map(shape => {
+        if (shape.unknown_rank) {
+          // This output tensor is of unknown rank. We don't know if it is a
+          // scalar, or a tensor, or of what shape it is.
+          return null;
+        }
+        if (shape.dim == null ||
+            (shape.dim.length === 1 && shape.dim[0].size == null)) {
+          // This output tensor is a scalar.
+          return [];
+        }
+        // This output tensor has a known rank. Map each dimension size
+        // into a number.
+        return shape.dim.map(dim => {
+          // Size can be -1 if this particular dimension is unknown.
+          return dim.size;
+        });
+      });
+      // Since we already processed it, remove the entry from the attribute
+      // list (saves memory).
+      attr.splice(i, 1);
+      return result;
+    }
+  }
+  // We didn't find OUTPUT_SHAPES_KEY in attributes, so we don't know anything
+  // about the output tensors.
+  return null;
+}
+
+/**
+ * Extracts the XLA Cluster that an op runs on from the attrs of the OpNode.
+ * @param attr The attr property.
+ * @return A string that is the name of the cluster. Or null if it could not be
+ *     determined.
+ */
+// tslint:disable-next-line:no-any
+function extractXlaCluster(attr: Array<{key: string, value: any}>): string|
+    null {
+  if (!attr) {
+    return null;
+  }
+
+  // Find the attribute for XLA cluster if there is one.
+  for (let i = 0; i < attr.length; i++) {
+    if (attr[i].key === _XLA_CLUSTER_KEY) {
+      return attr[i].value['s'] || null;
+    }
+  }
+  return null;
+}
+
+/**
+ * Normalizes the inputs and extracts associated metadata:
+ * 1) Inputs can contain a colon followed by a number at the end
+ *    (e.g. inputName:1) and we remove this from the input name, and take note
+ *    that the input was numbered.
+ * 2) Control dependency inputs contain caret at the beginning and we
+ *    remove this and annotate the edge as a control dependency.
+ * @param inputs Array of unnormalized names of input nodes.
+ */
+function normalizeInputs(inputs: string[]): NormalizedInput[] {
+  let normalizedInputs: NormalizedInput[] = [];
+  _.each(inputs, inputName => {
+    let start = inputName[0] === '^';
+    let colon = inputName.lastIndexOf(':');
+    let end = colon !== -1 &&
+      inputName.length - colon > 1 &&
+      !(/\D/).test(inputName.substring(colon + 1)) ?
+      colon : inputName.length;
+    let name = inputName.substring(start ? 1 : 0, end);
+    if (normalizedInputs.length === 0 ||
+      name !== normalizedInputs[normalizedInputs.length - 1].name) {
+      normalizedInputs.push({
+        name: name,
+        outputTensorIndex:
+            end === inputName.length ? 0 : Number(inputName.slice(colon + 1)),
+        isControlDependency: start
+      });
+    }
+  });
+  return normalizedInputs;
+}
+
+function addEdgeToGraph(
+    graph: SlimGraph, inputName: string, outputNode: OpNode,
+    input: NormalizedInput, params: BuildParams, index: number) {
+  // Don't allow loops in the graph.
+  if (inputName === outputNode.name) {
+    return;
+  }
+  // Check if this op type and input number corresponds to a
+  // reference edge using the refEdges dictionary in the params.
+  let isRefEdge = params.refEdges[outputNode.op + ' ' + index] === true;
+  graph.edges.push({
+    v: inputName,
+    w: outputNode.name,
+    outputTensorIndex: input.outputTensorIndex,
+    isControlDependency: input.isControlDependency,
+    isReferenceEdge: isRefEdge
+  });
+}
+
+export function build(
+    rawNodes: tf.graph.proto.NodeDef[], params: BuildParams,
+    tracker: ProgressTracker): Promise<SlimGraph|void> {
+  /**
+   * A dictionary that maps each in-embedding node name to the node
+   * object.
+   */
+  let inEmbedding: {[nodeName: string]: OpNode} = {};
+  /**
+   * A dictionary that maps each out-embedding node name to the node
+   * object.
+   */
+  let outEmbedding: {[nodeName: string]: OpNode} = {};
+  /**
+   * A dictionary that maps each node name to an array of the node's
+   * out-embedding node label objects.
+   */
+  let outEmbeddings: {[inputName: string]: OpNode[]} = {};
+  let isInEmbeddedPred = getEmbedPredicate(params.inEmbeddingTypes);
+  let isOutEmbeddedPred = getEmbedPredicate(params.outEmbeddingTypes);
+  let embeddingNodeNames: string[] = [];
+  /**
+   * A list of all the non-embedding node names which appear in the processed
+   * list of raw nodes. Here we pre-allocate enough room for all the rawNodes,
+   * even though there will some number of embeddings. The excess array length
+   * is spliced off later.
+   *
+   * Experimentation shows that around 30% of the array will go unused, and
+   * even for very large networks that amounts to less than 10k spaces.
+   */
+  let nodeNames = new Array<string>(rawNodes.length);
+
+  return tf.graph.util
+      .runAsyncTask(
+          'Normalizing names', 30,
+          () => {
+            let opNodes = new Array<OpNode>(rawNodes.length);
+            let index = 0;
+            _.each(rawNodes, rawNode => {
+              let opNode = new OpNodeImpl(rawNode);
+              if (isInEmbeddedPred(opNode)) {
+                embeddingNodeNames.push(opNode.name);
+                inEmbedding[opNode.name] = opNode;
+                return;
+              }
+
+              if (isOutEmbeddedPred(opNode)) {
+                embeddingNodeNames.push(opNode.name);
+                outEmbedding[opNode.name] = opNode;
+                _.each(opNode.inputs, input => {
+                  let inputName = input.name;
+                  outEmbeddings[inputName] = outEmbeddings[inputName] || [];
+                  outEmbeddings[inputName].push(opNode);
+                });
+                return;
+              }
+              // The node is not an embedding, so add it to the names and nodes
+              // lists.
+              opNodes[index] = opNode;
+              nodeNames[index] = opNode.name;
+              index++;
+            });
+            opNodes.splice(index);
+            nodeNames.splice(index);
+            return opNodes;
+          },
+          tracker)
+      .then((opNodes) => {
+        // Create the graph data structure from the graphlib library.
+        return tf.graph.util.runAsyncTask(
+            'Building the data structure', 70, () => {
+              let normalizedNameDict =
+                  mapStrictHierarchy(nodeNames, embeddingNodeNames);
+              let graph = new SlimGraph;
+
+              // Add the nodes to the graph.
+              _.each(opNodes, opNode => {
+                let normalizedName =
+                    normalizedNameDict[opNode.name] || opNode.name;
+                graph.nodes[normalizedName] = opNode;
+                // Check if the node has out-embeddings. If yes, add them to the
+                // node.
+                if (opNode.name in outEmbeddings) {
+                  opNode.outEmbeddings = outEmbeddings[opNode.name];
+                  // Normalize the names of the out-embeddings.
+                  _.each(opNode.outEmbeddings, node => {
+                    node.name = normalizedNameDict[node.name] || node.name;
+                  });
+                }
+                // Update the name of the node.
+                opNode.name = normalizedName;
+              });
+
+              // Visit each node's inputs to add the edges to the graph. If the
+              // input
+              // is an in-embedding, then add it to the node's in-embeddings
+              // instead.
+              _.each(opNodes, opNode => {
+                _.each(opNode.inputs, (input, i) => {
+                  let inputName = input.name;
+                  if (inputName in inEmbedding) {
+                    let inEmbedNode = inEmbedding[inputName];
+                    opNode.inEmbeddings.push(inEmbedNode);
+                    // Move the inputs of the in-embedding node into incoming
+                    // edges of
+                    // the main node. E.g. the control dependency of a constant
+                    // node
+                    // should be moved to the op node where the constant is
+                    // embedded.
+                    for (let embedInput of inEmbedNode.inputs) {
+                      addEdgeToGraph(
+                          graph, normalizedNameDict[embedInput.name] ||
+                              embedInput.name,
+                          opNode, embedInput, params, i);
+                    }
+                  } else if (inputName in outEmbedding) {
+                    // Move the inputs of the out-embedding node into inputs of
+                    // the main node where the out-embedding points to.
+                    let outEmbedNode = outEmbedding[inputName];
+                    for (let embedInput of outEmbedNode.inputs) {
+                      addEdgeToGraph(
+                          graph, normalizedNameDict[embedInput.name] ||
+                              embedInput.name,
+                          opNode, input, params, i);
+                    }
+                  } else {
+                    addEdgeToGraph(
+                        graph, normalizedNameDict[inputName] || inputName,
+                        opNode, input, params, i);
+                  }
+                });
+              });
+
+              // Normalize the names of in-embeddings.
+              _.each(inEmbedding, (node, name) => {
+                node.name = normalizedNameDict[node.name] || node.name;
+              });
+
+              return graph;
+            }, tracker);
+      });
+};
+
+/**
+ * Create a new graphlib.Graph() instance with default parameters
+ */
+export function createGraph<N, E>(name: string, type, opt = {}):
+    graphlib.Graph<N, E> {
+  let graph = new graphlib.Graph<N, E>(opt);
+  graph.setGraph({
+    name: name,
+    rankdir: 'BT',  // BT,TB,LR,RL
+    type: type
+  });
+  return graph;
+};
+
+/**
+ * Create a predicate for checking whether a node should be embedded based on
+ * the specified types.
+ */
+function getEmbedPredicate(types: string[]) {
+  return function(node: OpNode) {
+    // check types
+    for (let i = 0; i < types.length; i++) {
+      let regExp = new RegExp(types[i]);
+      if (node.op.match(regExp)) { return true; }
+    }
+    return false;
+  };
+};
+
+/**
+ * Returns a strict node name (name => name/(name)) to avoid conflicts
+ * where the node name is also a namespace.
+ */
+export function getStrictName(name: string): string {
+  let parts = name.split(NAMESPACE_DELIM);
+  return name + NAMESPACE_DELIM + '(' + parts[parts.length - 1] + ')';
+}
+
+/**
+ * For each op node (embedding or non-embedding), rename it if there is a
+ * non-embedding node under its namespace. For example, assume node name 'A'.
+ * If there is a non-embedding node under its namespace (e.g. 'A/B'), 'A' will
+ * be renamed to 'A/(A)'. Then the namespace 'A' will contain 2 nodes: '(A)'
+ * and 'B'. If all the nodes under 'A' are embedding nodes (e.g. constant and
+ * summary), keep 'A' as an Op node and don't create a namespace.
+ *
+ * @param nodeNames An array of regular (non-embedding) node names.
+ * @param embeddingNodeNames An array of embedding node names.
+ * @return Dictionary object mapping names that need to be renamed to
+ *     new names.
+ */
+function mapStrictHierarchy(nodeNames: string[],
+    embeddingNodeNames: string[]): {[oldName: string]: string} {
+  /** Dictionary that maps the old new to the new name */
+  let newNameDictionary: {[oldName: string]: string} = {};
+  /** Set used to store all namespaces. */
+  let namespaceSet: {[namespace: string]: boolean} = {};
+  // sort the nodes to make prefix check faster
+  nodeNames.sort();
+  // look for nodes with a prefix a,a/b -> a/(a),a/b
+  for (let i = 0; i < nodeNames.length - 1; ++i) {
+    let a = nodeNames[i];
+    // Get all the parent namespaces of the current node
+    // and add them in the namespace set.
+    _.each(getHierarchicalPath(a).slice(0, -1), ns => {
+      namespaceSet[ns] = true;
+    });
+    for (let j = i + 1; j < nodeNames.length; ++j) {
+      let b = nodeNames[j];
+      if (_.startsWith(b, a)) {
+        if (b.length > a.length && b.charAt(a.length) === NAMESPACE_DELIM) {
+          newNameDictionary[a] = getStrictName(a);
+          break;
+        }
+      } else {
+        break;
+      }
+    }
+  }
+  // Go through all the embedding node names and rename them in case they
+  // collide with namespaces.
+  _.each(embeddingNodeNames, embeddingName => {
+    if (embeddingName in namespaceSet) {
+      // Rename to follow strict hierarchy.
+      newNameDictionary[embeddingName] = getStrictName(embeddingName);
+    }
+  });
+  return newNameDictionary;
+};
+
+/**
+ * Returns a list of the degrees of each node in the graph.
+ */
+function degreeSequence(graph: graphlib.Graph<any, any>): number[] {
+  let degrees = graph.nodes().map(function(name) {
+    return graph.neighbors(name).length;
+  });
+  degrees.sort();
+  return degrees;
+};
+
+/**
+ * Returns if the degree sequence of the two graphs is the same.
+ */
+export function hasSimilarDegreeSequence(graph1: graphlib.Graph<any, any>,
+    graph2: graphlib.Graph<any, any>): boolean {
+  let dg1 = degreeSequence(graph1);
+  let dg2 = degreeSequence(graph2);
+
+  for (let i = 0; i < dg1.length; i++) {
+    if (dg1[i] !== dg2[i]) {
+      return false;
+    }
+  }
+  return true;
+};
+
+/**
+ * Returns the hierarchical path of the current node, based on the node's name.
+ * For example, if the name is 'a/b/c', the returned path is
+ * ['a', 'a/b', 'a/b/c'].
+ */
+export function getHierarchicalPath(name: string,
+  seriesNames?: { [name: string]: string }): string[] {
+  let path: string[] = [];
+  let i = name.indexOf(NAMESPACE_DELIM);
+  // Push all parent portions of the path.
+  while (i >= 0) {
+    path.push(name.substring(0, i));
+    i = name.indexOf(NAMESPACE_DELIM, i + 1);
+  }
+  // If the node's path is under a series, then add the series node name to the
+  // hierarchical path as the parent of the leaf.
+  if (seriesNames) {
+    let seriesName = seriesNames[name];
+    if (seriesName) {
+      path.push(seriesName);
+    }
+  }
+  // Push the leaf of the path.
+  path.push(name);
+  return path;
+};
+
+/**
+ * Returns the string for the node inclusion toggle button, dependant
+ * on the provided current InclusionType.
+ */
+export function getIncludeNodeButtonString(include: InclusionType) {
+  if (include === tf.graph.InclusionType.EXCLUDE) {
+    return 'Add to main graph';
+  } else {
+    return 'Remove from main graph';
+  }
+};
+
+/**
+ * Returns the string for the series node grouping toggle button, dependant
+ * on the provided current SeriesGroupingType.
+ */
+export function getGroupSeriesNodeButtonString(group: SeriesGroupingType) {
+  if (group === tf.graph.SeriesGroupingType.GROUP) {
+    return 'Ungroup this series of nodes';
+  } else {
+    return 'Group this series of nodes';
+  }
+};
+
+/**
+ * Toggle the node series grouping option in the provided map, setting it
+ * to ungroup if the series is not already in the map.
+ */
+export function toggleNodeSeriesGroup(
+  map: { [name: string]: tf.graph.SeriesGroupingType }, name: string) {
+  if (!(name in map) || map[name] === tf.graph.SeriesGroupingType.GROUP) {
+    map[name] = tf.graph.SeriesGroupingType.UNGROUP;
+  } else {
+    map[name] = tf.graph.SeriesGroupingType.GROUP;
+  }
+};
+
+} // close module tf.graph
diff --git a/tensorflow/tensorboard/components/tf_graph_common_d3v4/hierarchy.ts b/tensorflow/tensorboard/components/tf_graph_common_d3v4/hierarchy.ts
new file mode 100644
index 0000000000000000000000000000000000000000..889607ac5006bf75c698f7d121e1e0b6f9da6e8e
--- /dev/null
+++ b/tensorflow/tensorboard/components/tf_graph_common_d3v4/hierarchy.ts
@@ -0,0 +1,807 @@
+/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the 'License');
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an 'AS IS' BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+/**
+ * Package for the Graph Hierarchy for TensorFlow graph.
+ */
+module tf.graph.hierarchy {
+
+/**
+ * Class used as output for getPredecessors and getSuccessors methods
+ */
+export interface Edges {
+  control: Metaedge[];
+  regular: Metaedge[];
+}
+
+export interface Hierarchy {
+  root: Metanode;
+  templates: {[templateId: string]: string[]};
+  /** List of all device names */
+  devices: string[];
+  /** List of all XLA cluster names */
+  xlaClusters: string[];
+  /** True if at least one tensor in the graph has shape information */
+  hasShapeInfo: boolean;
+  /** The maximum size across all meta edges. Used for scaling thickness. */
+  maxMetaEdgeSize: number;
+  getNodeMap(): {[nodeName: string]: GroupNode|OpNode};
+  node(name: string): GroupNode|OpNode;
+  setNode(name: string, node: GroupNode|OpNode): void;
+  getBridgegraph(nodeName: string): graphlib.Graph<GroupNode|OpNode, Metaedge>;
+  getPredecessors(nodeName: string): Edges;
+  getSuccessors(nodeName: string): Edges;
+  getTopologicalOrdering(nodeName: string): { [childName: string]: number };
+  getTemplateIndex(): (string) => number;
+}
+
+/**
+ * Class for the Graph Hierarchy for TensorFlow graph.
+ */
+class HierarchyImpl implements Hierarchy {
+  root: Metanode;
+  templates: {[templateId: string]: string[]};
+  private index: {[nodeName: string]: GroupNode|OpNode};
+  devices: string[];
+  xlaClusters: string[];
+  hasShapeInfo = false;
+  maxMetaEdgeSize = 1;
+  orderings: { [nodeName: string]: { [childName: string]: number } };
+
+  constructor() {
+    this.root = createMetanode(ROOT_NAME, {compound: true});
+    this.templates = null;
+    this.devices = null;
+    /**
+     * @type {Object} Dictionary object that maps node name to the node
+     * (could be op-node, metanode, or series-node)
+     */
+    this.index = {};
+    this.index[ROOT_NAME] = this.root;
+    this.orderings = {};
+  }
+
+  getNodeMap(): {[nodeName: string]: GroupNode|OpNode} {
+    return this.index;
+  }
+
+  node(name: string): GroupNode|OpNode {
+    return this.index[name];
+  }
+
+  setNode(name: string, node: GroupNode|OpNode): void {
+    this.index[name] = node;
+  }
+
+  /**
+   * Given the name of a node in this hierarchy, get its bridgegraph, creating
+   * it on the fly if necessary. If the node is not a GroupNode, then this
+   * method returns null. If the provided name does not map to a node in the
+   * hierarchy, an error will be thrown.
+   */
+  getBridgegraph(nodeName: string): graphlib.Graph<GroupNode|OpNode, Metaedge> {
+    let node = this.index[nodeName];
+    if (!node) {
+      throw Error('Could not find node in hierarchy: ' + nodeName);
+    }
+    if (!('metagraph' in node)) {
+      return null;
+    }
+    let groupNode = <GroupNode> node;
+    if (groupNode.bridgegraph) {
+      return groupNode.bridgegraph;
+    }
+    let bridgegraph = groupNode.bridgegraph =
+        createGraph<GroupNode|OpNode, Metaedge>(
+            'BRIDGEGRAPH', GraphType.BRIDGE);
+    if (!node.parentNode || !('metagraph' in node.parentNode)) {
+      return bridgegraph;
+    }
+
+    let parentNode = <GroupNode>node.parentNode;
+    let parentMetagraph = parentNode.metagraph;
+    let parentBridgegraph = this.getBridgegraph(parentNode.name);
+
+    // For each of the parent node's two Metaedge containing graphs, process
+    // each Metaedge involving this node.
+    _.each([parentMetagraph, parentBridgegraph], parentGraph => {
+      _(parentGraph.edges())
+        .filter(e => e.v === nodeName || e.w === nodeName)
+        .each(parentEdgeObj => {
+
+          let inbound = parentEdgeObj.w === nodeName;
+          let parentMetaedge = parentGraph.edge(parentEdgeObj);
+
+          // The parent's Metaedge represents some number of underlying
+          // BaseEdges from the original full graph. For each of those, we need
+          // to determine which immediate child is involved and make sure
+          // there's a Metaedge in the bridgegraph that covers it.
+          _.each(parentMetaedge.baseEdgeList, baseEdge => {
+
+            // Based on the direction, figure out which is the descendant node
+            // and which is the 'other' node (sibling of parent or ancestor).
+            let [descendantName, otherName] =
+              inbound ?
+                [baseEdge.w, parentEdgeObj.v] :
+                [baseEdge.v, parentEdgeObj.w];
+
+            // Determine the immediate child containing this descendant node.
+            let childName = this.getChildName(nodeName, descendantName);
+
+            // Look for an existing Metaedge in the bridgegraph (or create a
+            // new one) that covers the relationship between child and other.
+            let bridgeEdgeObj = <graphlib.EdgeObject> {
+              v: inbound ? otherName : childName,
+              w: inbound ? childName : otherName,
+            };
+            let bridgeMetaedge = bridgegraph.edge(bridgeEdgeObj);
+            if (!bridgeMetaedge) {
+              bridgeMetaedge = createMetaedge(bridgeEdgeObj.v, bridgeEdgeObj.w);
+              bridgeMetaedge.inbound = inbound;
+              bridgegraph.setEdge(bridgeEdgeObj.v, bridgeEdgeObj.w,
+                  bridgeMetaedge);
+            }
+
+            // Copy the BaseEdge from the parent's Metaedge into this
+            // bridgegraph Metaedge.
+            bridgeMetaedge.addBaseEdge(baseEdge, this);
+          });
+        })
+        .value(); // force lodash chain execution.
+    });
+
+    return bridgegraph;
+  }
+
+  /**
+   * Utility function for determining the name of the immediate child under a
+   * node for a given descendant path. If the descendant corresponds to no
+   * immediate child, an error is thrown.
+   */
+  getChildName(nodeName: string, descendantName: string): string {
+    // Walk up the hierarchy from the descendant to find the child.
+    let currentNode: Node = this.index[descendantName];
+    while (currentNode) {
+      if (currentNode.parentNode && currentNode.parentNode.name === nodeName) {
+        return currentNode.name;
+      }
+      currentNode = currentNode.parentNode;
+    }
+    throw Error(
+        'Could not find immediate child for descendant: ' + descendantName);
+  };
+
+  /** Given the name of a node, return its incoming metaedges. */
+  getPredecessors(nodeName: string): Edges {
+    let node = this.index[nodeName];
+    if (!node) {
+      throw Error('Could not find node with name: ' + nodeName);
+    }
+
+    let predecessors = this.getOneWayEdges(node, true);
+    // Add embedded predecessors, such as constants.
+    if (!node.isGroupNode) {
+      _.each((<OpNode>node).inEmbeddings, embeddedNode => {
+        _.each((<OpNode>node).inputs, input => {
+          if (input.name === embeddedNode.name) {
+            // Make a new metaedge holding the edge between the
+            // node and the in-embedding.
+            let metaedge = new MetaedgeImpl(embeddedNode.name, nodeName);
+            metaedge.addBaseEdge(
+                {
+                  isControlDependency: input.isControlDependency,
+                  outputTensorIndex: input.outputTensorIndex,
+                  isReferenceEdge: false,
+                  v: embeddedNode.name,
+                  w: nodeName
+                },
+                this);
+            predecessors.regular.push(metaedge);
+          }
+        });
+      });
+    }
+    return predecessors;
+  }
+
+  /**
+   * Given the name of a node, return its outgoing metaedges.
+   *
+   * This is the inverse of getPredecessors(). See that method's documentation
+   * for an in-depth example.
+   */
+  getSuccessors(nodeName: string): Edges {
+    let node = this.index[nodeName];
+    if (!node) {
+      throw Error('Could not find node with name: ' + nodeName);
+    }
+
+    let successors = this.getOneWayEdges(node, false);
+
+    // Add embedded successors, such as summaries.
+    if (!node.isGroupNode) {
+      _.each((<OpNode>node).outEmbeddings, embeddedNode => {
+        _.each(embeddedNode.inputs, input => {
+          if (input.name === nodeName) {
+            // Make a new metaedge holding the edge between the
+            // node and the out-embedding.
+            let metaedge = new MetaedgeImpl(nodeName, embeddedNode.name);
+            metaedge.addBaseEdge(
+                {
+                  isControlDependency: input.isControlDependency,
+                  outputTensorIndex: input.outputTensorIndex,
+                  isReferenceEdge: false,
+                  v: nodeName,
+                  w: embeddedNode.name
+                },
+                this);
+            successors.regular.push(metaedge);
+          }
+        });
+      });
+    }
+    return successors;
+  }
+
+  /** Helper method for getPredecessors and getSuccessors */
+  getOneWayEdges(node: GroupNode|OpNode, inEdges: boolean) {
+    let edges: Edges = {control: [], regular: []};
+    // A node with no parent cannot have any edges.
+    if (!node.parentNode || !node.parentNode.isGroupNode) {
+      return edges;
+    }
+    let parentNode = <GroupNode> node.parentNode;
+    let metagraph = parentNode.metagraph;
+    let bridgegraph = this.getBridgegraph(parentNode.name);
+    findEdgeTargetsInGraph(metagraph, node, inEdges, edges);
+    findEdgeTargetsInGraph(bridgegraph, node, inEdges, edges);
+    return edges;
+  }
+
+  /**
+   * For a given GroupNode, get or calculate an object which describes a
+   * topological ordering of child nodes within that GroupNode's metagraph.
+   *
+   * This ordering is used when rendering bridge control edges which are
+   * sometimes backwards relative to the dataflow.
+   *
+   * For example, say we have a graph with two edges A->B and A->C, and we're
+   * interested in the ordering under ROOT. In this case, any of the following
+   * would be legitimate return values:
+   *
+   *  - { 'A': 0, 'B': 1, 'C': 2 } -- most likely
+   *  - { 'A': 0, 'B': 2, 'C': 1 } -- less likely
+   *  - { 'A': 12, 'B': 100, 'C': 99 } -- unlikely, but still OK
+   *
+   * The algorithm does not guarantee that all numbers from 0-N (where N is
+   * the number of nodes) appear exactly once. Rather it guarantees that if
+   * there is a path between two nodes, the earlier one will have a lower
+   * number in the ordering hash.
+   *
+   * When generating the ordering, we ignore control Metaedges (those which
+   * represent only BaseEdges that have isControlDependency set to true).
+   *
+   * If there is no node with the specified name, an error is thrown. If the
+   * node with the specified name is not a group node, null is returned.
+   */
+  getTopologicalOrdering(nodeName: string): { [childName: string]: number } {
+    let node = this.index[nodeName];
+    if (!node) {
+      throw Error('Could not find node with name: ' + nodeName);
+    }
+    if (!node.isGroupNode) {
+      return null;
+    }
+    if (nodeName in this.orderings) {
+      return this.orderings[nodeName];
+    }
+
+    // Mapping of a child node names to lists of their successors.
+    let successors: { [childName: string]: string[] } = {};
+
+    // Set of node names which have appeared as a destination.
+    let destinations: { [childName: string]: boolean } = {};
+
+    let metagraph = (<GroupNode> node).metagraph;
+    _.each(metagraph.edges(), (e: graphlib.EdgeObject) => {
+      if (!metagraph.edge(e).numRegularEdges) {
+        return; // Skip control edges.
+      }
+
+      // Keep track of successors and destinations.
+      if (!(e.v in successors)) {
+        successors[e.v] = [];
+      }
+      successors[e.v].push(e.w);
+      destinations[e.w] = true;
+    });
+
+    // Seed the queue with true sources (those that are not destinations).
+    let queue: string[] =
+      _.difference(_.keys(successors), _.keys(destinations));
+
+    // Produce an ordering by traversing the graph breadth first.
+    let ordering = this.orderings[nodeName] = {};
+    let index = 0;
+    while (queue.length) {
+      let childName = queue.shift();
+      ordering[childName] = index++;
+      _.each(successors[childName], succName => queue.push(succName));
+      delete successors[childName]; // Prevent cycles from infinite looping.
+    }
+    return ordering;
+  }
+
+  /**
+   * Returns a d3 Ordinal function that can be used to look up the index of
+   * a node based on its template id.
+   */
+  getTemplateIndex(): (string) => number {
+    let templateNames = d3.keys(this.templates);
+    let templateIndex = d3.scaleOrdinal()
+        .domain(templateNames)
+        .range(d3.range(0, templateNames.length));
+    return (templateId: string) => <number>templateIndex(templateId);
+  }
+}
+
+/**
+ * Internal utility function - given a graph (should be either a metagraph or a
+ * bridgegraph) and a node which is known to be in that graph, determine
+ * the other ends of edges that involve that node in the direction specified
+ * by whether it's inbound.
+ *
+ * For example if you wanted to find the predecessors of a node, you'd call
+ * this method for the parent's metagraph and bridgegraph, specifying inbound
+ * as true (look at the source of inbound edges to the specified node).
+ *
+ * Discovered target names are appended to the targets array.
+ */
+function findEdgeTargetsInGraph(
+    graph: graphlib.Graph<GroupNode|OpNode, Metaedge>,
+    node: Node, inbound: boolean, targets: Edges): void {
+  let edges = inbound ? graph.inEdges(node.name) : graph.outEdges(node.name);
+  _.each(edges, e => {
+    let metaedge = graph.edge(e);
+    let targetList =
+        metaedge.numRegularEdges ? targets.regular : targets.control;
+    targetList.push(metaedge);
+  });
+}
+
+export interface HierarchyParams {
+  verifyTemplate: boolean;
+  seriesNodeMinSize: number;
+  seriesMap: { [name: string]: tf.graph.SeriesGroupingType };
+}
+
+/**
+ * @param graph The raw graph.
+ * @param params Parameters used when building a hierarchy.
+ */
+export function build(graph: tf.graph.SlimGraph, params: HierarchyParams,
+    tracker: ProgressTracker): Promise<Hierarchy|void> {
+  let h = new HierarchyImpl();
+  let seriesNames: { [name: string]: string } = {};
+  return tf.graph.util
+      .runAsyncTask(
+          'Adding nodes', 20,
+          () => {
+            // Get all the possible device and XLA cluster names.
+            let deviceNames = {};
+            let xlaClusterNames = {};
+            _.each(graph.nodes, (node, nodeName) => {
+              if (node.device) {
+                deviceNames[node.device] = true;
+              }
+
+              if (node.xlaCluster) {
+                xlaClusterNames[node.xlaCluster] = true;
+              }
+            });
+
+            h.devices = _.keys(deviceNames);
+            h.xlaClusters = _.keys(xlaClusterNames);
+
+            addNodes(h, graph);
+          },
+          tracker)
+      .then(() => {
+        return tf.graph.util.runAsyncTask('Detect series', 20, () => {
+          if (params.seriesNodeMinSize > 0) {
+            groupSeries(
+                h.root, h, seriesNames, params.seriesNodeMinSize,
+                params.seriesMap);
+          }
+        }, tracker);
+      })
+      .then(() => {
+        return tf.graph.util.runAsyncTask('Adding edges', 30, () => {
+          addEdges(h, graph, seriesNames);
+        }, tracker);
+      })
+      .then(() => {
+        return tf.graph.util.runAsyncTask(
+            'Finding similar subgraphs', 30, () => {
+              h.templates = template.detect(h, params.verifyTemplate);
+            }, tracker);
+      })
+      .then(() => {
+        return h;
+      });
+};
+
+export function joinAndAggregateStats(
+    h: Hierarchy, stats: tf.graph.proto.StepStats) {
+  // Get all the possible device names.
+  let deviceNames = {};
+  _.each(h.root.leaves(), nodeName => {
+    let leaf = <OpNode> h.node(nodeName);
+    if (leaf.device != null) {
+      deviceNames[leaf.device] = true;
+    }
+  });
+  h.devices = _.keys(deviceNames);
+
+  // Reset stats for each group node.
+  _.each(h.getNodeMap(), (node, nodeName) => {
+    if (node.isGroupNode) {
+      node.stats = new NodeStats(null);
+      (<GroupNode>node).deviceHistogram = {};
+    }
+  });
+
+  // Bubble-up the stats and device distribution from leaves to parents.
+  _.each(h.root.leaves(), nodeName => {
+    let leaf = <OpNode> h.node(nodeName);
+    let node = <GroupNode|OpNode> leaf;
+    while (node.parentNode != null) {
+      if (leaf.device != null) {
+        let deviceHistogram = (<GroupNode>node.parentNode).deviceHistogram;
+        deviceHistogram[leaf.device] = (deviceHistogram[leaf.device] || 0) + 1;
+      }
+      if (leaf.stats != null) {
+        node.parentNode.stats.combine(leaf.stats);
+      }
+      node = <GroupNode> node.parentNode;
+    }
+  });
+}
+
+/**
+ * Creates the metanodes in the hierarchical graph and assigns parent-child
+ * relationship between them.
+ */
+function addNodes(h: Hierarchy, graph: SlimGraph) {
+  _.each(graph.nodes, (node, nodeName) => {
+    let path = getHierarchicalPath(node.name);
+    let parent: Metanode = h.root;
+
+    parent.depth = Math.max(path.length, parent.depth);
+
+    // Create parent metanodes for each depth. For example if the node name
+    // is 'a/b/c', then create metanodes 'a' and 'a/b', where 'a/b' is a child
+    // of a.
+    for (let i = 0; i < path.length; i++) {
+      parent.depth = Math.max(parent.depth, path.length - i);
+      parent.cardinality += node.cardinality;
+      parent.opHistogram[node.op] = (parent.opHistogram[node.op] || 0) + 1;
+      if (node.device != null) {
+        parent.deviceHistogram[node.device] =
+            (parent.deviceHistogram[node.device] || 0) + 1;
+      }
+      if (i === path.length - 1) { break; }
+      let name = path[i];
+      let child = <Metanode>h.node(name);
+      if (!child) {
+        child = createMetanode(name);
+        child.parentNode = parent;
+        h.setNode(name, child);
+        parent.metagraph.setNode(name, child);
+      }
+      parent = child;
+    }
+    // Assuming node name is 'a/b/c', assign the OpNode as a child of the
+    // metanode 'a/b'.
+    h.setNode(node.name, node);
+    node.parentNode = parent;
+    parent.metagraph.setNode(node.name, node);
+
+    // Add each of the in-embeddings and out-embeddings in the hierarchy.
+    _.each(node.inEmbeddings, function(embedding) {
+      h.setNode(embedding.name, embedding);
+      embedding.parentNode = node;
+    });
+    _.each(node.outEmbeddings, function(embedding) {
+      h.setNode(embedding.name, embedding);
+      embedding.parentNode = node;
+    });
+  });
+};
+
+/**
+ * For each metanode in the hierarchical graph, this method adds:
+ * the edges in the metagraph. These are edges between nodes
+ * that share the same parent.
+ */
+function addEdges(h: Hierarchy, graph: SlimGraph,
+    seriesNames: { [name: string]: string }) {
+
+  let nodeIndex = h.getNodeMap();
+
+  // Ancestor paths for the source and destination nodes of an edge. These are
+  // reused for each edge rather than allocating new ones. It's about 10% faster
+  // than allocating new ones on each pass through the loop.
+  let sourcePath: string[] = [];
+  let destPath: string[] = [];
+
+  // Insert the ancestor path for a node into the provided array, including the
+  // node itself. Return the index of the last node inserted (always ROOT).
+  let getPath = (node: Node, path: string[]): number => {
+    let i = 0;
+    while (node) {
+      path[i++] = node.name;
+      node = node.parentNode;
+    }
+    return i - 1;
+  };
+
+  _.each(graph.edges, baseEdge => {
+
+    // Get the hierarchical paths for the source and destination of the edge.
+    let sourceAncestorIndex = getPath(graph.nodes[baseEdge.v], sourcePath);
+    let destAncestorIndex = getPath(graph.nodes[baseEdge.w], destPath);
+
+    // If the hierarchical path cannot be found for either endpoint, then we
+    // cannot create the edge. This happens for example when a node has a
+    // control dependency on a summary node, which are embedded.
+    if (sourceAncestorIndex === -1 || destAncestorIndex === -1) {
+      return;
+    }
+
+    // Find the lowest shared ancestor between source and dest by looking for
+    // the highest nodes that differ between their ancestor paths.
+    while (sourcePath[sourceAncestorIndex] === destPath[destAncestorIndex]) {
+      sourceAncestorIndex--;
+      destAncestorIndex--;
+      if (sourceAncestorIndex < 0 || destAncestorIndex < 0) {
+        // This would only occur if the two nodes were the same (a cycle in the
+        // graph), or if one endpoint was a strict ancestor of the other. The
+        // latter shouldn't happen because we rename nodes which are both
+        // metanodes and op nodes. E.g. 'A/B' becomes 'A/B/(B)'.
+        throw Error('No difference found between ancestor paths.');
+      }
+    }
+
+    let sharedAncestorNode =
+      <GroupNode>nodeIndex[sourcePath[sourceAncestorIndex + 1]];
+    let sourceAncestorName = sourcePath[sourceAncestorIndex];
+    let destAncestorName = destPath[destAncestorIndex];
+
+    // Find or create the Metaedge which should contain this BaseEdge inside
+    // the shared ancestor.
+    let metaedge =
+      sharedAncestorNode.metagraph.edge(sourceAncestorName, destAncestorName);
+    if (!metaedge) {
+      metaedge = createMetaedge(sourceAncestorName, destAncestorName);
+      sharedAncestorNode.metagraph
+        .setEdge(sourceAncestorName, destAncestorName, metaedge);
+    }
+    if (!sharedAncestorNode.hasNonControlEdges &&
+        !baseEdge.isControlDependency) {
+      sharedAncestorNode.hasNonControlEdges = true;
+    }
+    metaedge.addBaseEdge(baseEdge, h);
+  });
+};
+
+/**
+ * Using the hierarchy template information, detect series in the provided
+ * metanode.  For each detected series, create a new SeriesNode
+ * and remove series members from the metanode's metagraph and move them to
+ * the new series node's metagraph.
+ *
+ * @param metanode
+ * @param hierarchy
+ * @param seriesNames Map of node names to their series they are contained in.
+ *     This should be provided empty and is populated by this method.
+ * @param threshold If the series has this many nodes or more, then group them
+ *     into a series.
+ * @param map Map of series names to their series grouping type, if one has
+ *     been set.
+ * @return A dictionary from node name to series node name that contains the
+ *     node.
+ */
+function groupSeries(metanode: Metanode, hierarchy: Hierarchy,
+    seriesNames: { [name: string]: string }, threshold: number,
+    map: { [name: string]: tf.graph.SeriesGroupingType }) {
+  let metagraph = metanode.metagraph;
+  _.each(metagraph.nodes(), n => {
+    let child = metagraph.node(n);
+    if (child.type === tf.graph.NodeType.META) {
+      groupSeries(<Metanode>child, hierarchy, seriesNames, threshold, map);
+    }
+  });
+
+  let clusters = clusterNodes(metagraph);
+  let seriesDict = detectSeries(clusters, metagraph);
+
+  // Add each series node to the graph and add its grouped children to its own
+  // metagraph.
+  _.each(seriesDict, function(seriesNode: SeriesNode, seriesName: string) {
+    let nodeMemberNames = seriesNode.metagraph.nodes();
+    _.each(nodeMemberNames, n => {
+      let child = <OpNode>metagraph.node(n);
+      if (!child.owningSeries) {
+        child.owningSeries = seriesName;
+      }
+    });
+    // If the series contains less than the threshold number of nodes and
+    // this series has not been adding to the series map, then set this
+    // series to be shown ungrouped in the map.
+    if (nodeMemberNames.length < threshold && !(seriesNode.name in map)) {
+      map[seriesNode.name] = tf.graph.SeriesGroupingType.UNGROUP;
+    }
+    // If the series is in the map as ungrouped then do not group the series.
+    if (seriesNode.name in map
+      && map[seriesNode.name] === tf.graph.SeriesGroupingType.UNGROUP) {
+      return;
+    }
+    hierarchy.setNode(seriesName, seriesNode); // add to the index
+    metagraph.setNode(seriesName, seriesNode);
+    _.each(nodeMemberNames, n => {
+      let child = <OpNode> metagraph.node(n);
+      seriesNode.metagraph.setNode(n, child);
+      seriesNode.parentNode = child.parentNode;
+      seriesNode.cardinality++;
+      if (child.device != null) {
+        seriesNode.deviceHistogram[child.device] =
+            (seriesNode.deviceHistogram[child.device] || 0) + 1;
+      }
+      child.parentNode = seriesNode;
+      seriesNames[n] = seriesName;
+      // Remove now-grouped node from its original parent's metagraph.
+      metagraph.removeNode(n);
+    });
+  });
+};
+
+/** cluster op-nodes with similar op */
+function clusterNodes(metagraph: graphlib.Graph<GroupNode|OpNode, Metaedge>):
+    {[clusterId: string]: string[]} {
+  let result: {[clusterId: string]: string[]} = {};
+  return  _.reduce(metagraph.nodes(),
+      (clusters: {[clusterId: string]: string[]}, n: string) => {
+    let child = metagraph.node(n);
+    if (child.type === NodeType.META) {
+      // skip metanodes
+      return clusters;
+    }
+    let template = (<OpNode>child).op;
+    if (template) {
+      clusters[template] = clusters[template] || [];
+      clusters[template].push(child.name);
+    }
+    return clusters;
+  }, result);
+}
+
+/**
+ * For each cluster of op-nodes based op type, try to detect groupings.
+ * Infer series name using by trying to find pattern '<number>' in the node
+ * name.
+ *
+ * @param clusters Dictionary output from clusterNodes().
+ * @param metagraph
+ * @return A dictionary from series name => seriesNode
+ */
+function detectSeries(clusters: {[clusterId: string]: string[]},
+     metagraph: graphlib.Graph<GroupNode|OpNode, Metaedge>):
+     {[seriesName: string]: SeriesNode} {
+  let seriesDict: {[seriesName: string]: SeriesNode} = {};
+  _.each(clusters, function(members, clusterId: string) {
+    if (members.length <= 1) { return; } // isolated clusters can't make series
+
+    /** @type {Object}  A dictionary mapping seriesName to seriesInfoArray,
+     * which is an array that contains objects with name, id, prefix, suffix,
+     * and parent properties.
+     */
+    let candidatesDict: {[seriesName: string]: SeriesNode[]} = {};
+
+    // Group all nodes that have the same name, with the exception of a
+    // number at the end of the name after an underscore, which is allowed to
+    // vary.
+    _.each(members, function(name: string) {
+      let isGroup = name.charAt(name.length - 1) === '*';
+      let namepath = name.split('/');
+      let leaf = namepath[namepath.length - 1];
+      let parent = namepath.slice(0, namepath.length - 1).join('/');
+      let matches = leaf.match(/^(\D*)_(\d+)$/);
+
+      let prefix;
+      let id;
+      let suffix = '';
+      if (matches) {         // if found '<number>' in the name, assign id.
+        prefix = matches[1]; // the front non-numeric characters
+        id = matches[2]; // the digits
+      } else {  // for node without '_<number>', make them zero-th items.
+        prefix = isGroup ? leaf.substr(0, leaf.length - 1) : leaf;
+        id = 0;
+        suffix = isGroup ? '*' : '';
+      }
+      let seriesName = getSeriesNodeName(prefix, suffix, parent);
+      candidatesDict[seriesName] = candidatesDict[seriesName] || [];
+      let seriesNode = createSeriesNode(prefix, suffix, parent, +id, name);
+      candidatesDict[seriesName].push(seriesNode);
+    });
+
+    // In each group of nodes, group nodes in bunches that have monotonically
+    // increasing numbers in their names.  Each of these bunches is a series.
+    _.each(candidatesDict, function(seriesInfoArray: SeriesNode[], seriesName) {
+      if (seriesInfoArray.length < 2) {
+        return;
+      }
+      seriesInfoArray.sort(function(a, b) {
+        return (+a.clusterId) - (+b.clusterId);
+      });
+
+      // Loop through the nodes sorted by its detected series number, grouping
+      // all nodes with monotonically-increasing series numbers.
+      let seriesNodes = [seriesInfoArray[0]];
+      for (let index = 1; index < seriesInfoArray.length; index++) {
+        let nextNode = seriesInfoArray[index];
+        if (nextNode.clusterId === seriesNodes[seriesNodes.length - 1].clusterId
+            + 1) {
+          seriesNodes.push(nextNode);
+          continue;
+        }
+        addSeriesToDict(seriesNodes, seriesDict, +clusterId, metagraph);
+        seriesNodes = [nextNode];
+      }
+      addSeriesToDict(seriesNodes, seriesDict, +clusterId, metagraph);
+    });
+  });
+  return seriesDict;
+}
+
+/**
+ * Add a series to the provided dictionary mapping series names to series.
+ *
+ * @param seriesNodes the nodes in the series. Contains
+ *     name, id, prefix, suffix and parent properties of the node.
+ * @param seriesDict the dictionary of series
+ * @param clusterId ID of the template of the nodes of the series
+ * @param metagraph
+ */
+function addSeriesToDict(seriesNodes: SeriesNode[],
+    seriesDict: {[seriesName: string]: SeriesNode},
+    clusterId: number,
+    metagraph: graphlib.Graph<GroupNode|OpNode, Metaedge>) {
+  if (seriesNodes.length > 1) {
+    let curSeriesName = getSeriesNodeName(
+      seriesNodes[0].prefix, seriesNodes[0].suffix,
+      seriesNodes[0].parent, seriesNodes[0].clusterId,
+      seriesNodes[seriesNodes.length - 1].clusterId);
+    let curSeriesNode = createSeriesNode(seriesNodes[0].prefix,
+      seriesNodes[0].suffix, seriesNodes[0].parent, clusterId,
+      curSeriesName);
+    _.each(seriesNodes, function(node) {
+      curSeriesNode.ids.push(node.clusterId);
+      curSeriesNode.metagraph.setNode(node.name, metagraph.node(node.name));
+    });
+    seriesDict[curSeriesName] = curSeriesNode;
+  }
+}
+
+} // close module tf.graph.hierarchy
diff --git a/tensorflow/tensorboard/components/tf_graph_common_d3v4/layout.ts b/tensorflow/tensorboard/components/tf_graph_common_d3v4/layout.ts
new file mode 100644
index 0000000000000000000000000000000000000000..11f41cfdd08a06bf02fcab0951cf4a02082b6f01
--- /dev/null
+++ b/tensorflow/tensorboard/components/tf_graph_common_d3v4/layout.ts
@@ -0,0 +1,758 @@
+/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the 'License');
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an 'AS IS' BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+module tf.graph.layout {
+
+/** Set of parameters that define the look and feel of the graph. */
+export const PARAMS = {
+  animation: {
+    /** Default duration for graph animations in ms. */
+    duration: 250
+  },
+  graph: {
+    /** Graph parameter for metanode. */
+    meta: {
+      /**
+       * Dagre's nodesep param - number of pixels that
+       * separate nodes horizontally in the layout.
+       *
+       * See https://github.com/cpettitt/dagre/wiki#configuring-the-layout
+       */
+      nodeSep: 5,
+      /**
+       * Dagre's ranksep param - number of pixels
+       * between each rank in the layout.
+       *
+       * See https://github.com/cpettitt/dagre/wiki#configuring-the-layout
+       */
+      rankSep: 25,
+      /**
+       * Dagre's edgesep param - number of pixels that separate
+       * edges horizontally in the layout.
+       */
+      edgeSep: 5,
+    },
+    /** Graph parameter for metanode. */
+    series: {
+      /**
+       * Dagre's nodesep param - number of pixels that
+       * separate nodes horizontally in the layout.
+       *
+       * See https://github.com/cpettitt/dagre/wiki#configuring-the-layout
+       */
+      nodeSep: 5,
+      /**
+       * Dagre's ranksep param - number of pixels
+       * between each rank in the layout.
+       *
+       * See https://github.com/cpettitt/dagre/wiki#configuring-the-layout
+       */
+      rankSep: 25,
+      /**
+       * Dagre's edgesep param - number of pixels that separate
+       * edges horizontally in the layout.
+       */
+      edgeSep: 5
+    },
+    /**
+     * Padding is used to correctly position the graph SVG inside of its parent
+     * element. The padding amounts are applied using an SVG transform of X and
+     * Y coordinates.
+     */
+    padding: {paddingTop: 40, paddingLeft: 20}
+  },
+  subscene: {
+    meta: {
+      paddingTop: 10,
+      paddingBottom: 10,
+      paddingLeft: 10,
+      paddingRight: 10,
+      /**
+       * Used to leave room for the label on top of the highest node in
+       * the core graph.
+       */
+      labelHeight: 20,
+      /** X-space between each extracted node and the core graph. */
+      extractXOffset: 15,
+      /** Y-space between each extracted node. */
+      extractYOffset: 20
+    },
+    series: {
+      paddingTop: 10,
+      paddingBottom: 10,
+      paddingLeft: 10,
+      paddingRight: 10,
+      labelHeight: 10
+    }
+  },
+  nodeSize: {
+    /** Size of meta nodes. */
+    meta: {
+      radius: 5,
+      width: 60,
+      maxLabelWidth: 52,
+      /** A scale for the node's height based on number of nodes inside */
+      height: d3.scaleLinear().domain([1, 200]).range([15, 60]).clamp(true),
+      /** The radius of the circle denoting the expand button. */
+      expandButtonRadius: 3
+    },
+    /** Size of op nodes. */
+    op: {
+      width: 15,
+      height: 6,
+      radius: 3,  // for making annotation touching ellipse
+      labelOffset: -8,
+      maxLabelWidth: 30
+    },
+    /** Size of series nodes. */
+    series: {
+      expanded: {
+        // For expanded series nodes, width and height will be
+        // computed to account for the subscene.
+        radius: 10,
+        labelOffset: 0,
+      },
+      vertical: {
+        // When unexpanded, series whose underlying metagraphs contain
+        // one or more non-control edges will show as a vertical stack
+        // of ellipses.
+        width: 16,
+        height: 13,
+        labelOffset: -13,
+      },
+      horizontal: {
+        // When unexpanded, series whose underlying metagraphs contain
+        // no non-control edges will show as a horizontal stack of
+        // ellipses.
+        width: 24,
+        height: 8,
+        radius: 10,  // Forces annotations to center line.
+        labelOffset: -10,
+      },
+    },
+    /** Size of bridge nodes. */
+    bridge: {
+      // NOTE: bridge nodes will normally be invisible, but they must
+      // take up some space so that the layout step leaves room for
+      // their edges.
+      width: 20,
+      height: 20,
+      radius: 2,
+      labelOffset: 0
+    }
+  },
+  shortcutSize: {
+    /** Size of shortcuts for op nodes */
+    op: {width: 10, height: 4},
+    /** Size of shortcuts for meta nodes */
+    meta: {width: 12, height: 4, radius: 1},
+    /** Size of shortcuts for series nodes */
+    series: {
+      width: 14,
+      height: 4,
+    }
+  },
+  annotations: {
+    /** Maximum possible width of the bounding box for in annotations */
+    inboxWidth: 50,
+    /** Maximum possible width of the bounding box for out annotations */
+    outboxWidth: 50,
+    /** X-space between the shape and each annotation-node. */
+    xOffset: 10,
+    /** Y-space between each annotation-node. */
+    yOffset: 3,
+    /** X-space between each annotation-node and its label. */
+    labelOffset: 2,
+    /** Defines the max width for annotation label */
+    maxLabelWidth: 120
+  },
+  constant: {size: {width: 4, height: 4}},
+  series: {
+    /** Maximum number of repeated item for unexpanded series node. */
+    maxStackCount: 3,
+    /**
+     * Positioning offset ratio for collapsed stack
+     * of parallel series (series without edges between its members).
+     */
+    parallelStackOffsetRatio: 0.2,
+    /**
+     * Positioning offset ratio for collapsed stack
+     * of tower series (series with edges between its members).
+     */
+    towerStackOffsetRatio: 0.5
+  },
+  minimap: {
+    /** The maximum width/height the minimap can have. */
+    size: 150
+  }
+};
+
+/** Calculate layout for a scene of a group node. */
+export function layoutScene(renderNodeInfo: render.RenderGroupNodeInfo): void {
+  // Update layout, size, and annotations of its children nodes and edges.
+  if (renderNodeInfo.node.isGroupNode) {
+    layoutChildren(renderNodeInfo);
+  }
+
+  // Update position of its children nodes and edges
+  if (renderNodeInfo.node.type === NodeType.META) {
+    layoutMetanode(renderNodeInfo);
+  } else if (renderNodeInfo.node.type === NodeType.SERIES) {
+    layoutSeriesNode(renderNodeInfo);
+  }
+};
+
+/**
+ * Updates the total width of an unexpanded node which includes the size of its
+ * in and out annotations.
+ */
+function updateTotalWidthOfNode(renderInfo: render.RenderNodeInfo): void {
+  renderInfo.inboxWidth = renderInfo.inAnnotations.list.length > 0 ?
+      PARAMS.annotations.inboxWidth : 0;
+  renderInfo.outboxWidth = renderInfo.outAnnotations.list.length > 0 ?
+      PARAMS.annotations.outboxWidth : 0;
+  // Assign the width of the core box (the main shape of the node).
+  renderInfo.coreBox.width = renderInfo.width;
+  renderInfo.coreBox.height = renderInfo.height;
+  // TODO(jimbo): Account for font width rather than using a magic number.
+  let labelLength = renderInfo.node.name.length -
+      renderInfo.node.name.lastIndexOf(NAMESPACE_DELIM) - 1;
+  let charWidth = 3; // 3 pixels per character.
+  // Compute the total width of the node.
+  renderInfo.width = Math.max(renderInfo.coreBox.width +
+      renderInfo.inboxWidth + renderInfo.outboxWidth,
+      labelLength * charWidth);
+
+}
+
+/**
+ * Update layout, size, and annotations of its children nodes and edges.
+ */
+function layoutChildren(renderNodeInfo: render.RenderGroupNodeInfo): void {
+  let children = renderNodeInfo.coreGraph.nodes().map(n => {
+    return renderNodeInfo.coreGraph.node(n);
+  }).concat(renderNodeInfo.isolatedInExtract,
+      renderNodeInfo.isolatedOutExtract);
+
+  _.each(children, childNodeInfo => {
+    // Set size of each child
+    switch (childNodeInfo.node.type) {
+      case NodeType.OP:
+        _.extend(childNodeInfo, PARAMS.nodeSize.op);
+        break;
+      case NodeType.BRIDGE:
+        _.extend(childNodeInfo, PARAMS.nodeSize.bridge);
+        break;
+      case NodeType.META:
+        if (!childNodeInfo.expanded) {
+          // Set fixed width and scalable height based on cardinality
+          _.extend(childNodeInfo, PARAMS.nodeSize.meta);
+          childNodeInfo.height =
+              PARAMS.nodeSize.meta.height(childNodeInfo.node.cardinality);
+        } else {
+          let childGroupNodeInfo =
+            <render.RenderGroupNodeInfo>childNodeInfo;
+          layoutScene(childGroupNodeInfo); // Recursively layout its subscene.
+        }
+        break;
+      case NodeType.SERIES:
+        if (childNodeInfo.expanded) {
+          _.extend(childNodeInfo, PARAMS.nodeSize.series.expanded);
+          let childGroupNodeInfo =
+            <render.RenderGroupNodeInfo>childNodeInfo;
+          layoutScene(childGroupNodeInfo); // Recursively layout its subscene.
+        } else {
+          let childGroupNodeInfo =
+            <render.RenderGroupNodeInfo>childNodeInfo;
+          let seriesParams =
+            childGroupNodeInfo.node.hasNonControlEdges ?
+              PARAMS.nodeSize.series.vertical :
+              PARAMS.nodeSize.series.horizontal;
+          _.extend(childNodeInfo, seriesParams);
+        }
+        break;
+      default:
+        throw Error('Unrecognized node type: ' + childNodeInfo.node.type);
+    }
+    // Compute total width of un-expanded nodes. Width of expanded nodes
+    // has already been computed.
+    if (!childNodeInfo.expanded) {
+      updateTotalWidthOfNode(childNodeInfo);
+    }
+    // Layout each child's annotations
+    layoutAnnotation(childNodeInfo);
+  });
+}
+
+/**
+ * Calculate layout for a graph using dagre
+ * @param graph the graph to be laid out
+ * @param params layout parameters
+ * @return width and height of the core graph
+ */
+function dagreLayout(
+    graph: graphlib.Graph<render.RenderNodeInfo, render.RenderMetaedgeInfo>,
+    params): {height: number, width: number} {
+  _.extend(graph.graph(), {
+    nodesep: params.nodeSep,
+    ranksep: params.rankSep,
+    edgesep: params.edgeSep
+  });
+  let bridgeNodeNames = [];
+  let nonBridgeNodeNames = [];
+
+  // Split out nodes into bridge and non-bridge nodes, and calculate the total
+  // width we should use for bridge nodes.
+  _.each(graph.nodes(), nodeName => {
+    let nodeInfo = graph.node(nodeName);
+    if (nodeInfo.node.type === NodeType.BRIDGE) {
+      bridgeNodeNames.push(nodeName);
+    } else {
+      nonBridgeNodeNames.push(nodeName);
+    }
+  });
+
+  // If there are no non-bridge nodes, then the graph has zero size.
+  if (!nonBridgeNodeNames.length) {
+    return {
+      width: 0,
+      height: 0,
+    };
+  }
+  dagre.layout(graph);
+
+  // Calculate the true bounding box of the graph by iterating over nodes and
+  // edges rather than accepting dagre's word for it. In particular, we should
+  // ignore the extra-wide bridge nodes and bridge edges, and allow for
+  // annotation boxes and labels.
+  let minX = Infinity;
+  let minY = Infinity;
+  let maxX = -Infinity;
+  let maxY = -Infinity;
+  _.each(nonBridgeNodeNames, nodeName => {
+    let nodeInfo = graph.node(nodeName);
+    let w = 0.5 * nodeInfo.width;
+    let x1 = nodeInfo.x - w;
+    let x2 = nodeInfo.x + w;
+    minX = x1 < minX ? x1 : minX;
+    maxX = x2 > maxX ? x2 : maxX;
+    // TODO(jimbo): Account for the height of labels above op nodes here.
+    let h = 0.5 * nodeInfo.height;
+    let y1 = nodeInfo.y - h;
+    let y2 = nodeInfo.y + h;
+    minY = y1 < minY ? y1 : minY;
+    maxY = y2 > maxY ? y2 : maxY;
+  });
+  _.each(graph.edges(), edgeObj => {
+    let edgeInfo = graph.edge(edgeObj);
+    if (edgeInfo.structural) {
+      return; // Skip structural edges from min/max calculations.
+    }
+
+    // Since the node size passed to dagre includes the in and out
+    // annotations, the endpoints of the edge produced by dagre may not
+    // point to the actual node shape (rectangle, ellipse). We correct the
+    // end-points by finding the intersection of a line between the
+    // next-to-last (next-to-first) point and the destination (source)
+    // rectangle.
+    let sourceNode = graph.node(edgeInfo.metaedge.v);
+    let destNode = graph.node(edgeInfo.metaedge.w);
+
+    // Straight 3-points edges are special case, since they are curved after
+    // our default correction. To keep them straight, we remove the mid point
+    // and correct the first and the last point to be the center of the
+    // source and destination node respectively.
+    if (edgeInfo.points.length === 3 && isStraightLine(edgeInfo.points)) {
+      if (sourceNode != null) {
+        let cxSource = sourceNode.expanded ?
+            sourceNode.x : computeCXPositionOfNodeShape(sourceNode);
+        edgeInfo.points[0].x = cxSource;
+      }
+      if (destNode != null) {
+        let cxDest = destNode.expanded ?
+            destNode.x : computeCXPositionOfNodeShape(destNode);
+        edgeInfo.points[2].x = cxDest;
+      }
+      // Remove the middle point so the edge doesn't curve.
+      edgeInfo.points = [edgeInfo.points[0], edgeInfo.points[1]];
+    }
+    // Correct the destination endpoint of the edge.
+    let nextToLastPoint = edgeInfo.points[edgeInfo.points.length - 2];
+    // The destination node might be null if this is a bridge edge.
+    if (destNode != null) {
+      edgeInfo.points[edgeInfo.points.length - 1] =
+          intersectPointAndNode(nextToLastPoint, destNode);
+    }
+    // Correct the source endpoint of the edge.
+    let secondPoint = edgeInfo.points[1];
+    // The source might be null if this is a bridge edge.
+    if (sourceNode != null) {
+      edgeInfo.points[0] = intersectPointAndNode(secondPoint, sourceNode);
+    }
+
+    _.each(edgeInfo.points, (point: render.Point) => {
+        minX = point.x < minX ? point.x : minX;
+        maxX = point.x > maxX ? point.x : maxX;
+        minY = point.y < minY ? point.y : minY;
+        maxY = point.y > maxY ? point.y : maxY;
+      });
+  });
+
+  // Shift all nodes and edge points to account for the left-padding amount,
+  // and the invisible bridge nodes.
+  _.each(graph.nodes(), nodeName => {
+    let nodeInfo = graph.node(nodeName);
+    nodeInfo.x -= minX;
+    nodeInfo.y -= minY;
+  });
+  _.each(graph.edges(), edgeObj => {
+    _.each(graph.edge(edgeObj).points, (point: render.Point) => {
+        point.x -= minX;
+        point.y -= minY;
+      });
+  });
+
+  return {
+    width: maxX - minX,
+    height: maxY - minY
+  };
+}
+
+/** Layout a metanode. Only called for an expanded node. */
+function layoutMetanode(renderNodeInfo: render.RenderGroupNodeInfo): void {
+  // First, copy params specific to meta nodes onto this render info object.
+  let params = PARAMS.subscene.meta;
+  _.extend(renderNodeInfo, params);
+  // Invoke dagre.layout() on the core graph and record the bounding box
+  // dimensions.
+  _.extend(renderNodeInfo.coreBox,
+      dagreLayout(renderNodeInfo.coreGraph, PARAMS.graph.meta));
+
+  // Calculate the position of nodes in isolatedInExtract relative to the
+  // top-left corner of inExtractBox (the bounding box for all inExtract nodes)
+  // and calculate the size of the inExtractBox.
+  let maxInExtractWidth = _.max(renderNodeInfo.isolatedInExtract,
+      renderNode => renderNode.width).width;
+  renderNodeInfo.inExtractBox.width = maxInExtractWidth != null ?
+      maxInExtractWidth : 0;
+
+  renderNodeInfo.inExtractBox.height =
+    _.reduce(renderNodeInfo.isolatedInExtract, (height, child, i) => {
+      let yOffset = i > 0 ? params.extractYOffset : 0;
+      // use width/height here to avoid overlaps between extracts
+      child.x = 0;
+      child.y = height + yOffset + child.height / 2;
+      return height + yOffset + child.height;
+    }, 0);
+
+  // Calculate the position of nodes in isolatedOutExtract relative to the
+  // top-left corner of outExtractBox (the bounding box for all outExtract
+  // nodes) and calculate the size of the outExtractBox.
+  let maxOutExtractWidth = _.max(renderNodeInfo.isolatedOutExtract,
+      renderNode => renderNode.width).width;
+  renderNodeInfo.outExtractBox.width = maxOutExtractWidth != null ?
+      maxOutExtractWidth : 0;
+
+  renderNodeInfo.outExtractBox.height =
+    _.reduce(renderNodeInfo.isolatedOutExtract, (height, child, i) => {
+      let yOffset = i > 0 ? params.extractYOffset : 0;
+      // use width/height here to avoid overlaps between extracts
+      child.x = 0;
+      child.y = height + yOffset + child.height / 2;
+      return height + yOffset + child.height;
+    }, 0);
+
+  // Compute the total padding between the core graph, in-extract and
+  // out-extract boxes.
+  let numParts = 0;
+  if (renderNodeInfo.isolatedInExtract.length > 0) {
+    numParts++;
+  }
+  if (renderNodeInfo.isolatedOutExtract.length > 0) {
+    numParts++;
+  }
+  if (renderNodeInfo.coreGraph.nodeCount() > 0) {
+    numParts++;
+  }
+  let offset = PARAMS.subscene.meta.extractXOffset;
+  let padding = numParts <= 1 ? 0 : (numParts  <= 2 ? offset : 2 * offset);
+
+  // Add the in-extract and out-extract width to the core box width.
+  renderNodeInfo.coreBox.width += renderNodeInfo.inExtractBox.width +
+      renderNodeInfo.outExtractBox.width + padding;
+  renderNodeInfo.coreBox.height =
+    params.labelHeight +
+    Math.max(
+      renderNodeInfo.inExtractBox.height,
+      renderNodeInfo.coreBox.height,
+      renderNodeInfo.outExtractBox.height
+  );
+  // Determine the whole metanode's width (from left to right).
+  renderNodeInfo.width = renderNodeInfo.coreBox.width +
+      params.paddingLeft + params.paddingRight;
+
+  // Determine the whole metanode's height (from top to bottom).
+  renderNodeInfo.height =
+      renderNodeInfo.paddingTop +
+      renderNodeInfo.coreBox.height +
+      renderNodeInfo.paddingBottom;
+}
+
+/**
+ * Calculate layout for series node's core graph. Only called for an expanded
+ * series.
+ */
+function layoutSeriesNode(node: render.RenderGroupNodeInfo): void {
+  let graph = node.coreGraph;
+
+  let params = PARAMS.subscene.series;
+  _.extend(node, params);
+
+  // Layout the core.
+  _.extend(node.coreBox, dagreLayout(node.coreGraph, PARAMS.graph.series));
+
+  _.each(graph.nodes(), nodeName => {
+    graph.node(nodeName).excluded = false;
+  });
+
+  // Series do not have in/outExtractBox so no need to include them here.
+  node.width = node.coreBox.width + params.paddingLeft + params.paddingRight;
+  node.height = node.coreBox.height + params.paddingTop + params.paddingBottom;
+}
+
+/**
+ * Calculate layout for annotations of a given node.
+ * This will modify positions of the given node and its annotations.
+ *
+ * @see tf.graph.render.Node and tf.graph.render.Annotation
+ * for description of each property of each render node.
+ *
+ */
+function layoutAnnotation(renderNodeInfo: render.RenderNodeInfo): void {
+  // If the render node is an expanded metanode, then its annotations will not
+  // be visible and we should skip the annotation calculations.
+  if (renderNodeInfo.expanded) {
+    return;
+  }
+
+  let inAnnotations = renderNodeInfo.inAnnotations.list;
+  let outAnnotations = renderNodeInfo.outAnnotations.list;
+
+  // Calculate size for in-annotations
+  _.each(inAnnotations, a => sizeAnnotation(a));
+
+  // Calculate size for out-annotations
+  _.each(outAnnotations, a => sizeAnnotation(a));
+
+  let params = PARAMS.annotations;
+
+  // Calculate annotation node position (a.dx, a.dy)
+  // and total height for in-annotations
+  // After this chunk of code:
+  // inboxHeight = sum of annotation heights+ (annotation.length - 1 * yOffset)
+  let inboxHeight = _.reduce(inAnnotations,
+      (height, a, i) => {
+        let yOffset = i > 0 ? params.yOffset : 0;
+        a.dx = -(renderNodeInfo.coreBox.width + a.width) / 2 - params.xOffset;
+        a.dy = height + yOffset + a.height / 2;
+        return height + yOffset + a.height;
+      }, 0);
+
+  _.each(inAnnotations, a => {
+    a.dy -= inboxHeight / 2;
+
+    a.labelOffset = params.labelOffset;
+  });
+
+  // Calculate annotation node position (a.dx, a.dy)
+  // and total height for out-annotations
+  // After this chunk of code:
+  // outboxHeight = sum of annotation heights +
+  //                (annotation.length - 1 * yOffset)
+  let outboxHeight = _.reduce(outAnnotations,
+      (height, a, i) => {
+        let yOffset = i > 0 ? params.yOffset : 0;
+        a.dx = (renderNodeInfo.coreBox.width + a.width) / 2 + params.xOffset;
+        a.dy = height + yOffset + a.height / 2;
+        return height + yOffset + a.height;
+      }, 0);
+
+  _.each(outAnnotations, a => {
+    // adjust by (half of ) the total height
+    // so dy is relative to the host node's center.
+    a.dy -= outboxHeight / 2;
+
+    a.labelOffset = params.labelOffset;
+  });
+
+  // Creating scales for touch point between the in-annotation edges
+  // and their hosts.
+
+  let inTouchHeight =
+      Math.min(renderNodeInfo.height / 2 - renderNodeInfo.radius,
+          inboxHeight / 2);
+  inTouchHeight = inTouchHeight < 0 ? 0 : inTouchHeight;
+
+  let inY = d3.scaleLinear()
+    .domain([0, inAnnotations.length - 1])
+    .range([-inTouchHeight, inTouchHeight]);
+
+  // Calculate annotation edge position
+  _.each(inAnnotations, (a, i) => {
+    a.points = [
+      // The annotation node end
+      {
+        dx: a.dx + a.width / 2,
+        dy: a.dy
+      },
+
+      // The host node end
+      {
+        dx: - renderNodeInfo.coreBox.width / 2,
+        // only use scale if there are more than one,
+        // otherwise center it vertically
+        dy: inAnnotations.length > 1 ? inY(i) : 0
+      }
+    ];
+  });
+
+  // Creating scales for touch point between the out-annotation edges
+  // and their hosts.
+  let outTouchHeight =
+      Math.min(renderNodeInfo.height / 2 - renderNodeInfo.radius,
+          outboxHeight / 2);
+  outTouchHeight = outTouchHeight < 0 ? 0 : outTouchHeight;
+  let outY = d3.scaleLinear()
+    .domain([0, outAnnotations.length - 1])
+    .range([-outTouchHeight, outTouchHeight]);
+
+  _.each(outAnnotations, (a, i) => {
+    // Add point from the border of the annotation node
+    a.points = [
+      // The host node end
+      {
+        dx: renderNodeInfo.coreBox.width / 2,
+        // only use scale if there are more than one,
+        // otherwise center it vertically
+        dy: outAnnotations.length > 1 ? outY(i) : 0
+      },
+      // The annotation node end
+      {
+        dx: a.dx - a.width / 2,
+        dy: a.dy
+      }
+    ];
+  });
+
+  renderNodeInfo.height =
+      Math.max(renderNodeInfo.height, inboxHeight, outboxHeight);
+}
+
+/**
+ * Set size of an annotation node.
+ */
+function sizeAnnotation(a: render.Annotation): void {
+  switch (a.annotationType) {
+    case render.AnnotationType.CONSTANT:
+      _.extend(a, PARAMS.constant.size);
+      break;
+    case render.AnnotationType.SHORTCUT:
+      if (a.node.type === NodeType.OP) {
+        _.extend(a, PARAMS.shortcutSize.op);
+      } else if (a.node.type === NodeType.META) {
+        _.extend(a, PARAMS.shortcutSize.meta);
+      } else if (a.node.type === NodeType.SERIES) {
+        _.extend(a, PARAMS.shortcutSize.series);
+      } else {
+        throw Error('Invalid node type: ' + a.node.type);
+      }
+      break;
+    case render.AnnotationType.SUMMARY:
+      _.extend(a, PARAMS.constant.size);
+      break;
+  }
+}
+
+/**
+ * Determines the center position of the node's shape. The position depends
+ * on if the node has in and out-annotations.
+ */
+export function computeCXPositionOfNodeShape(renderInfo: render.RenderNodeInfo):
+    number {
+  if (renderInfo.expanded) {
+    return renderInfo.x;
+  }
+  let dx = renderInfo.inAnnotations.list.length ? renderInfo.inboxWidth : 0;
+  return renderInfo.x - renderInfo.width / 2 + dx +
+      renderInfo.coreBox.width / 2;
+}
+
+/** Returns the angle (in degrees) between two points. */
+function angleBetweenTwoPoints(a: render.Point, b: render.Point): number {
+  let dx = b.x - a.x;
+  let dy = b.y - a.y;
+  return 180 * Math.atan(dy / dx) / Math.PI;
+}
+
+/**
+ * Returns if a line going through the specified points is a straight line.
+ */
+function isStraightLine(points: render.Point[]) {
+  let angle = angleBetweenTwoPoints(points[0], points[1]);
+  for (let i = 1; i < points.length - 1; i++) {
+    let newAngle = angleBetweenTwoPoints(points[i], points[i + 1]);
+    // Have a tolerance of 1 degree.
+    if (Math.abs(newAngle - angle) > 1) {
+      return false;
+    }
+    angle = newAngle;
+  }
+  return true;
+}
+
+/**
+ * Returns the intersection of a line between the provided point
+ * and the provided rectangle.
+ */
+function intersectPointAndNode(
+    point: render.Point, node: render.RenderNodeInfo): render.Point {
+  // cx and cy are the center of the rectangle.
+  let cx = node.expanded ?
+     node.x : computeCXPositionOfNodeShape(node);
+  let cy = node.y;
+  // Calculate the slope
+  let dx = point.x - cx;
+  let dy = point.y - cy;
+  let w = node.expanded ? node.width : node.coreBox.width;
+  let h = node.expanded ? node.height : node.coreBox.height;
+  let deltaX, deltaY;
+  if (Math.abs(dy) * w / 2  > Math.abs(dx) * h / 2) {
+    // The intersection is above or below the rectangle.
+    if (dy < 0) {
+      h = -h;
+    }
+    deltaX = dy === 0 ? 0 : h / 2 * dx / dy;
+    deltaY = h / 2;
+  } else {
+    // The intersection is left or right of the rectangle.
+    if (dx < 0) {
+      w = -w;
+    }
+    deltaX = w / 2;
+    deltaY = dx === 0 ? 0 : w / 2 * dy / dx;
+  }
+  return {x: cx + deltaX, y: cy + deltaY};
+}
+
+} // close module
diff --git a/tensorflow/tensorboard/components/tf_graph_common_d3v4/minimap.ts b/tensorflow/tensorboard/components/tf_graph_common_d3v4/minimap.ts
new file mode 100644
index 0000000000000000000000000000000000000000..9a07323a1d49638e97ad5614d63deb7c17eeb989
--- /dev/null
+++ b/tensorflow/tensorboard/components/tf_graph_common_d3v4/minimap.ts
@@ -0,0 +1,327 @@
+/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the 'License');
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an 'AS IS' BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+module tf.scene {
+
+/** Show minimap when the viewpoint area is less than X% of the whole area. */
+const FRAC_VIEWPOINT_AREA: number = 0.8;
+
+export class Minimap {
+  /** The minimap container. */
+  private minimap: HTMLElement;
+  /** The canvas used for drawing the mini version of the svg. */
+  private canvas: HTMLCanvasElement;
+  /** A buffer canvas used for temporary drawing to avoid flickering. */
+  private canvasBuffer: HTMLCanvasElement;
+  private download: HTMLLinkElement;
+  private downloadCanvas: HTMLCanvasElement;
+
+  /** The minimap svg used for holding the viewpoint rectangle. */
+  private minimapSvg: SVGSVGElement;
+  /** The rectangle showing the current viewpoint. */
+  private viewpoint: SVGRectElement;
+  /**
+   * The scale factor for the minimap. The factor is determined automatically
+   * so that the minimap doesn't violate the maximum width/height specified
+   * in the constructor. The minimap maintains the same aspect ratio as the
+   * original svg.
+   */
+  private scaleMinimap: number;
+  /** The main svg element. */
+  private svg: SVGSVGElement;
+  /** The svg group used for panning and zooming the main svg. */
+  private zoomG: SVGGElement;
+  /** The zoom behavior of the main svg. */
+  private mainZoom: d3.ZoomBehavior<any, any>;
+  /** The maximum width and height for the minimap. */
+  private maxWandH: number;
+  /** The last translation vector used in the main svg. */
+  private translate: [number, number];
+  /** The last scaling factor used in the main svg. */
+  private scaleMain: number;
+  /** The coordinates of the viewpoint rectangle. */
+  private viewpointCoord: {x: number, y: number};
+  /** The current size of the minimap */
+  private minimapSize: {width: number, height: number};
+  /** Padding (px) due to the main labels of the graph. */
+  private labelPadding: number;
+  /**
+   * Constructs a new minimap.
+   *
+   * @param svg The main svg element.
+   * @param zoomG The svg group used for panning and zooming the main svg.
+   * @param mainZoom The main zoom behavior.
+   * @param minimap The minimap container.
+   * @param maxWandH The maximum width/height for the minimap.
+   * @param labelPadding Padding in pixels due to the main graph labels.
+   */
+  constructor(svg: SVGSVGElement, zoomG: SVGGElement,
+      mainZoom: d3.ZoomBehavior<any, any>, minimap: HTMLElement,
+      maxWandH: number, labelPadding: number) {
+    this.svg = svg;
+    this.labelPadding = labelPadding;
+    this.zoomG = zoomG;
+    this.mainZoom = mainZoom;
+    this.maxWandH = maxWandH;
+    let $minimap = d3.select(minimap);
+    // The minimap will have 2 main components: the canvas showing the content
+    // and an svg showing a rectangle of the currently zoomed/panned viewpoint.
+    let $minimapSvg = $minimap.select('svg');
+
+    // Make the viewpoint rectangle draggable.
+    let $viewpoint = $minimapSvg.select('rect');
+    let dragmove = (d) => {
+      this.viewpointCoord.x = (<DragEvent>d3.event).x;
+      this.viewpointCoord.y = (<DragEvent>d3.event).y;
+      this.updateViewpoint();
+    };
+    this.viewpointCoord = {x: 0, y: 0};
+    let drag = d3.drag().subject(Object).on('drag', dragmove);
+    $viewpoint.datum(this.viewpointCoord as any).call(drag);
+
+    // Make the minimap clickable.
+    $minimapSvg.on('click', () => {
+      if ((<Event>d3.event).defaultPrevented) {
+        // This click was part of a drag event, so suppress it.
+        return;
+      }
+      // Update the coordinates of the viewpoint.
+      let width = Number($viewpoint.attr('width'));
+      let height = Number($viewpoint.attr('height'));
+      let clickCoords = d3.mouse($minimapSvg.node() as any);
+      this.viewpointCoord.x = clickCoords[0] - width / 2;
+      this.viewpointCoord.y = clickCoords[1] - height / 2;
+      this.updateViewpoint();
+    });
+    this.viewpoint = <SVGRectElement>$viewpoint.node();
+    this.minimapSvg = <SVGSVGElement>$minimapSvg.node();
+    this.minimap = minimap;
+    this.canvas = <HTMLCanvasElement>$minimap.select('canvas.first').node();
+    this.canvasBuffer =
+        <HTMLCanvasElement>$minimap.select('canvas.second').node();
+    this.downloadCanvas =
+        <HTMLCanvasElement>$minimap.select('canvas.download').node();
+    d3.select(this.downloadCanvas).style('display', 'none');
+    this.update();
+  }
+
+  /**
+   * Updates the position and the size of the viewpoint rectangle.
+   * It also notifies the main svg about the new panned position.
+   */
+  private updateViewpoint(): void {
+    // Update the coordinates of the viewpoint rectangle.
+    d3.select(this.viewpoint)
+        .attr('x', this.viewpointCoord.x)
+        .attr('y', this.viewpointCoord.y);
+    // Update the translation vector of the main svg to reflect the
+    // new viewpoint.
+    let mainX = - this.viewpointCoord.x * this.scaleMain / this.scaleMinimap;
+    let mainY = - this.viewpointCoord.y * this.scaleMain / this.scaleMinimap;
+    this.mainZoom.translateBy(d3.select(this.zoomG), mainX, mainY);
+
+  }
+
+  /**
+   * Redraws the minimap. Should be called whenever the main svg
+   * was updated (e.g. when a node was expanded).
+   */
+  update(): void {
+    let sceneSize = null;
+    try {
+      // Get the size of the entire scene.
+      sceneSize = this.zoomG.getBBox();
+      if (sceneSize.width === 0) {
+        // There is no scene anymore. We have been detached from the dom.
+        return;
+      }
+    } catch (e) {
+      // Firefox produced NS_ERROR_FAILURE if we have been
+      // detached from the dom.
+      return;
+    }
+    let $download = d3.select('#graphdownload');
+    this.download = <HTMLLinkElement>$download.node();
+    $download.on('click', d => {
+      this.download.href = this.downloadCanvas.toDataURL('image/png');
+    });
+
+    let $svg = d3.select(this.svg);
+    // Read all the style rules in the document and embed them into the svg.
+    // The svg needs to be self contained, i.e. all the style rules need to be
+    // embedded so the canvas output matches the origin.
+    let stylesText = '';
+    for (let k = 0; k < document.styleSheets.length; k++) {
+      try {
+        let cssRules = (<any>document.styleSheets[k]).cssRules ||
+          (<any>document.styleSheets[k]).rules;
+        if (cssRules == null) {
+          continue;
+        }
+        for (let i = 0; i < cssRules.length; i++) {
+          // Remove tf-* selectors from the styles.
+          stylesText +=
+              cssRules[i].cssText.replace(/ ?tf-[\w-]+ ?/g, '') + '\n';
+        }
+      } catch (e) {
+        if (e.name !== 'SecurityError') {
+          throw e;
+        }
+      }
+    }
+
+    // Temporarily add the css rules to the main svg.
+    let svgStyle = $svg.append('style');
+    svgStyle.text(stylesText);
+
+    // Temporarily remove the zoom/pan transform from the main svg since we
+    // want the minimap to show a zoomed-out and centered view.
+    let $zoomG = d3.select(this.zoomG);
+    let zoomTransform = $zoomG.attr('transform');
+    $zoomG.attr('transform', null);
+
+    // Since we add padding, account for that here.
+    sceneSize.height += this.labelPadding * 2;
+    sceneSize.width += this.labelPadding * 2;
+
+    // Temporarily assign an explicit width/height to the main svg, since
+    // it doesn't have one (uses flex-box), but we need it for the canvas
+    // to work.
+    $svg
+      .attr('width', sceneSize.width)
+      .attr('height', sceneSize.height);
+
+    // Since the content inside the svg changed (e.g. a node was expanded),
+    // the aspect ratio have also changed. Thus, we need to update the scale
+    // factor of the minimap. The scale factor is determined such that both
+    // the width and height of the minimap are <= maximum specified w/h.
+    this.scaleMinimap =
+        this.maxWandH / Math.max(sceneSize.width, sceneSize.height);
+
+    this.minimapSize = {
+      width: sceneSize.width * this.scaleMinimap,
+      height: sceneSize.height * this.scaleMinimap
+    };
+
+    // Update the size of the minimap's svg, the buffer canvas and the
+    // viewpoint rect.
+    d3.select(this.minimapSvg).attr(<any>this.minimapSize);
+    d3.select(this.canvasBuffer).attr(<any>this.minimapSize);
+
+    // Download canvas width and height are multiples of the style width and
+    // height in order to increase pixel density of the PNG for clarity.
+    d3.select(this.downloadCanvas).style(
+      <any>{ width: sceneSize.width, height: sceneSize.height });
+    d3.select(this.downloadCanvas).attr(
+      <any>{ width: sceneSize.width * 3, height: sceneSize.height * 3 });
+
+    if (this.translate != null && this.zoom != null) {
+      // Update the viewpoint rectangle shape since the aspect ratio of the
+      // map has changed.
+      requestAnimationFrame(() => this.zoom());
+    }
+
+    // Serialize the main svg to a string which will be used as the rendering
+    // content for the canvas.
+    let svgXml = (new XMLSerializer()).serializeToString(this.svg);
+
+    // Now that the svg is serialized for rendering, remove the temporarily
+    // assigned styles, explicit width and height and bring back the pan/zoom
+    // transform.
+    svgStyle.remove();
+    $svg.attr('width', null).attr('height', null);
+
+    $zoomG.attr('transform', zoomTransform);
+    let image = new Image();
+    image.onload = () => {
+      // Draw the svg content onto the buffer canvas.
+      let context = this.canvasBuffer.getContext('2d');
+      context.clearRect(0, 0, this.canvasBuffer.width,
+          this.canvasBuffer.height);
+      context.drawImage(image, 0, 0,
+        this.minimapSize.width, this.minimapSize.height);
+      requestAnimationFrame(() => {
+        // Hide the old canvas and show the new buffer canvas.
+        d3.select(this.canvasBuffer).style('display', null);
+        d3.select(this.canvas).style('display', 'none');
+        // Swap the two canvases.
+        [this.canvas, this.canvasBuffer] = [this.canvasBuffer, this.canvas];
+      });
+      let downloadContext = this.downloadCanvas.getContext('2d');
+      downloadContext.clearRect(0, 0, this.downloadCanvas.width,
+        this.downloadCanvas.height);
+      downloadContext.drawImage(image, 0, 0,
+        this.downloadCanvas.width, this.downloadCanvas.height);
+    };
+    image.onerror = () => {
+      let blob = new Blob([svgXml], {type: 'image/svg+xml;charset=utf-8'});
+      image.src = URL.createObjectURL(blob);
+    };
+    image.src =
+        'data:image/svg+xml;charset=utf-8,' + encodeURIComponent(svgXml);
+  }
+
+  /**
+   * Handles changes in zooming/panning. Should be called from the main svg
+   * to notify that a zoom/pan was performed and this minimap will update it's
+   * viewpoint rectangle.
+   *
+   * @param translate The translate vector, or none to use the last used one.
+   * @param scale The scaling factor, or none to use the last used one.
+   */
+  zoom(transform?: d3.ZoomTransform): void {
+    if (this.scaleMinimap == null) {
+      // Scene is not ready yet.
+      return;
+    }
+    // Update the new translate and scale params, only if specified.
+    if (transform) {
+      this.translate = [transform.x, transform.y];
+      this.scaleMain = transform.k;
+    }
+
+    // Update the location of the viewpoint rectangle.
+    let svgRect = this.svg.getBoundingClientRect();
+    let $viewpoint = d3.select(this.viewpoint);
+    this.viewpointCoord.x = -this.translate[0] * this.scaleMinimap /
+        this.scaleMain;
+    this.viewpointCoord.y = -this.translate[1] * this.scaleMinimap /
+        this.scaleMain;
+    let viewpointWidth = svgRect.width * this.scaleMinimap / this.scaleMain;
+    let viewpointHeight = svgRect.height * this.scaleMinimap / this.scaleMain;
+    $viewpoint
+      .attr('x', this.viewpointCoord.x)
+      .attr('y', this.viewpointCoord.y)
+      .attr('width', viewpointWidth)
+      .attr('height', viewpointHeight);
+    // Show/hide the minimap depending on the viewpoint area as fraction of the
+    // whole minimap.
+    let mapWidth = this.minimapSize.width;
+    let mapHeight = this.minimapSize.height;
+    let x = this.viewpointCoord.x;
+    let y = this.viewpointCoord.y;
+    let w = Math.min(Math.max(0, x + viewpointWidth), mapWidth) -
+        Math.min(Math.max(0, x), mapWidth);
+    let h = Math.min(Math.max(0, y + viewpointHeight), mapHeight) -
+        Math.min(Math.max(0, y), mapHeight);
+    let fracIntersect = (w * h) / (mapWidth * mapHeight);
+    if (fracIntersect < FRAC_VIEWPOINT_AREA) {
+      this.minimap.classList.remove('hidden');
+    } else {
+      this.minimap.classList.add('hidden');
+    }
+  }
+}
+
+} // close module tf.scene
diff --git a/tensorflow/tensorboard/components/tf_graph_common_d3v4/node.ts b/tensorflow/tensorboard/components/tf_graph_common_d3v4/node.ts
new file mode 100644
index 0000000000000000000000000000000000000000..e66818f4c82c81866fe4eab17c8d0596cb86ed5d
--- /dev/null
+++ b/tensorflow/tensorboard/components/tf_graph_common_d3v4/node.ts
@@ -0,0 +1,1072 @@
+/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the 'License');
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an 'AS IS' BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+module tf.graph.scene.node {
+  import RenderNodeInfo = tf.graph.render.RenderNodeInfo;
+  /**
+   * Select or Create a 'g.nodes' group to a given sceneGroup
+   * and builds a number of 'g.node' groups inside the group.
+   *
+   * Structure Pattern:
+   *
+   * <g class='nodes'>
+   *   <g class='node'>
+   *     <g class='in-annotations'>
+   *       ...
+   *     </g>
+   *     <g class='out-annotations'>
+   *       ...
+   *     </g>
+   *     <g class='nodeshape'>
+   *      <!--
+   *      Content of the node shape should be for the node itself. For example a
+   *      Metanode would have a <rect> with rounded edges, an op would have an
+   *      <ellipse>. More complex nodes like series may contain multiple
+   *      elements which are conditionally visible based on whether the node is
+   *      expanded.
+   *      -->
+   *     </g>
+   *     <text class='label'>node name</text>
+   *     <g class='subscene'>
+   *       <!--
+   *       Content of  the subscene (only for metanode and series node).
+   *
+   *       Subscene is a svg group that contains content of the
+   *       metanode's metagraph that is recursively generated by Scene.build().
+   *
+   *       When the graph is expanded multiple times, a subscene can contain
+   *       nested subscenes inside.
+   *       -->
+   *     </g>
+   *   </g>
+   *   ...
+   * </g>
+   *
+   *
+   * @param sceneGroup selection of the container
+   * @param nodeData array of render node information to map
+   * @param sceneElement <tf-graph-scene> polymer element
+   * @return selection of the created nodeGroups
+   */
+  export function buildGroup(
+      sceneGroup, nodeData: render.RenderNodeInfo[], sceneElement) {
+    let container =
+        scene.selectOrCreateChild(sceneGroup, 'g', Class.Node.CONTAINER);
+    // Select all children and join with data.
+    // (Note that all children of g.nodes are g.node)
+    let nodeGroups =
+        (container as any)
+            .selectAll('g')
+            .data(nodeData, (d) => {
+              // make sure that we don't have to swap shape type
+              return d.node.name + ':' + d.node.type;
+            });
+
+    // ENTER
+    nodeGroups.enter()
+        .append('g')
+        .attr('data-name', d => { return d.node.name; })
+        .each(function(d) {
+          let nodeGroup = d3.select(this);
+          // index node group for quick stylizing
+          sceneElement.addNodeGroup(d.node.name, nodeGroup);
+        })
+        .merge(nodeGroups)
+        // ENTER + UPDATE
+        .attr('class', d => { return Class.Node.GROUP + ' ' + nodeClass(d); })
+        .each(function(d) {
+          let nodeGroup = d3.select(this);
+          // Add g.in-annotations (always add -- to keep layer order
+          // consistent.)
+          let inAnnotationBox =
+              scene.selectOrCreateChild(nodeGroup, 'g', Class.Annotation.INBOX);
+          annotation.buildGroup(
+              inAnnotationBox, d.inAnnotations, d, sceneElement);
+
+          // Add g.out-annotations  (always add -- to keep layer order
+          // consistent.)
+          let outAnnotationBox = scene.selectOrCreateChild(
+              nodeGroup, 'g', Class.Annotation.OUTBOX);
+          annotation.buildGroup(
+              outAnnotationBox, d.outAnnotations, d, sceneElement);
+
+          // Build .shape first (background of the node).
+          let shape = buildShape(nodeGroup, d, Class.Node.SHAPE);
+          if (d.node.isGroupNode) {
+            addButton(shape, d, sceneElement);
+          }
+          addInteraction(shape, d, sceneElement);
+
+          // Build subscene on the top.
+          subsceneBuild(nodeGroup, <render.RenderGroupNodeInfo>d, sceneElement);
+
+          // Build label last. Should be on top of everything else.
+          let label = labelBuild(nodeGroup, d, sceneElement);
+          // Do not add interaction to metanode labels as they live inside the
+          // metanode shape which already has the same interactions.
+          addInteraction(label, d, sceneElement, d.node.type === NodeType.META);
+
+          stylize(nodeGroup, d, sceneElement);
+          position(nodeGroup, d);
+        });
+
+    // EXIT
+    nodeGroups.exit()
+        .each(function(d) {
+          // remove all indices on remove
+          sceneElement.removeNodeGroup(d.node.name);
+
+          let nodeGroup = d3.select(this);
+          if (d.inAnnotations.list.length > 0) {
+            nodeGroup.select('.' + Class.Annotation.INBOX)
+                .selectAll('.' + Class.Annotation.GROUP)
+                .each(a => { sceneElement.removeAnnotationGroup(a, d); });
+          }
+          if (d.outAnnotations.list.length > 0) {
+            nodeGroup.select('.' + Class.Annotation.OUTBOX)
+                .selectAll('.' + Class.Annotation.GROUP)
+                .each(a => { sceneElement.removeAnnotationGroup(a, d); });
+          }
+        })
+        .remove();
+    return nodeGroups;
+};
+
+/**
+ * Update or remove the subscene of a render group node depending on whether it
+ * is a expanded. If the node is not a group node, this method has no effect.
+ *
+ * @param nodeGroup selection of the container
+ * @param renderNodeInfo the render information for the node.
+ * @param sceneElement <tf-graph-scene> polymer element.
+ * @return Selection of the subscene group, or null if node group does not have
+ *        a subscene. Op nodes, bridge nodes and unexpanded group nodes will
+ *        not have a subscene.
+ */
+function subsceneBuild(nodeGroup,
+    renderNodeInfo: render.RenderGroupNodeInfo, sceneElement) {
+  if (renderNodeInfo.node.isGroupNode) {
+    if (renderNodeInfo.expanded) {
+      // Recursively build the subscene.
+      return scene.buildGroup(nodeGroup, renderNodeInfo, sceneElement,
+        Class.Subscene.GROUP);
+    }
+    // Clean out existing subscene if the node is not expanded.
+    scene.selectChild(nodeGroup, 'g', Class.Subscene.GROUP).remove();
+  }
+  return null;
+};
+
+/**
+ * Translate the subscene of the given node group
+ */
+function subscenePosition(nodeGroup, d: render.RenderNodeInfo) {
+  let x0 = d.x - d.width / 2.0 + d.paddingLeft;
+  let y0 = d.y - d.height / 2.0 + d.paddingTop;
+
+  let subscene = scene.selectChild(nodeGroup, 'g', Class.Subscene.GROUP);
+  scene.translate(subscene, x0, y0);
+};
+
+/**
+ * Add an expand/collapse button to a group node
+ *
+ * @param selection The group node selection.
+ * @param d Info about the node being rendered.
+ * @param sceneElement <tf-graph-scene> polymer element.
+ */
+function addButton(selection, d: render.RenderNodeInfo, sceneElement) {
+  let group =
+      scene.selectOrCreateChild(selection, 'g', Class.Node.BUTTON_CONTAINER);
+  scene.selectOrCreateChild(group, 'circle', Class.Node.BUTTON_CIRCLE);
+  scene.selectOrCreateChild(group, 'path', Class.Node.EXPAND_BUTTON)
+      .attr('d', 'M0,-2.2 V2.2 M-2.2,0 H2.2');
+  scene.selectOrCreateChild(group, 'path', Class.Node.COLLAPSE_BUTTON)
+      .attr('d', 'M-2.2,0 H2.2');
+  (group as any).on('click', (d: any) => {
+    // Stop this event's propagation so that it isn't also considered a
+    // node-select.
+    (<Event>d3.event).stopPropagation();
+    sceneElement.fire('node-toggle-expand', {name: d.node.name});
+  });
+  scene.positionButton(group, d);
+};
+
+/**
+ * Fire node-* events when the selection is interacted.
+ *
+ * @param disableInteraction When true, have the provided selection
+ * ignore all pointer events. Used for text labels inside of metanodes, which
+ * don't need interaction as their surrounding shape has interaction, and if
+ * given interaction would cause conflicts with the expand/collapse button.
+ */
+function addInteraction(selection, d: render.RenderNodeInfo,
+    sceneElement, disableInteraction?: boolean) {
+  if (disableInteraction) {
+    selection.attr('pointer-events', 'none');
+    return;
+  }
+
+  let contextMenuFunction = contextmenu.getMenu(
+    getContextMenu(d.node, sceneElement));
+  selection
+      .on('dblclick',
+          d => {
+            sceneElement.fire('node-toggle-expand', {name: d.node.name});
+          })
+      .on('mouseover',
+          d => {
+            // don't send mouseover over expanded group,
+            // otherwise it is causing too much glitches
+            if (sceneElement.isNodeExpanded(d)) {
+              return;
+            }
+
+            sceneElement.fire('node-highlight', {name: d.node.name});
+          })
+      .on('mouseout',
+          d => {
+            // don't send mouseover over expanded group,
+            // otherwise it is causing too much glitches
+            if (sceneElement.isNodeExpanded(d)) {
+              return;
+            }
+
+            sceneElement.fire('node-unhighlight', {name: d.node.name});
+          })
+      .on('click',
+          d => {
+            // Stop this event's propagation so that it isn't also considered
+            // a graph-select.
+            (<Event>d3.event).stopPropagation();
+            sceneElement.fire('node-select', {name: d.node.name});
+          })
+      .on('contextmenu', (d, i) => {
+        sceneElement.fire('node-select', {name: d.node.name});
+        contextMenuFunction.call(d, i);
+      });
+};
+
+/**
+ * Returns the d3 context menu specification for the provided node.
+ */
+export function getContextMenu(node: Node, sceneElement) {
+  let menu = [{
+    title: (d): string => {
+      return getIncludeNodeButtonString(node.include);
+    },
+    action: (elm, d, i) => {
+      sceneElement.fire('node-toggle-extract', {name: node.name});
+    }
+  }];
+  if (canBeInSeries(node)) {
+    menu.push({
+      title: d => { return getGroupSettingLabel(node); },
+      action: (elm, d, i) => {
+        sceneElement.fire(
+            'node-toggle-seriesgroup', {name: getSeriesName(node)});
+      }
+    });
+  }
+  return menu;
+}
+
+/** Returns if a node can be part of a grouped series */
+export function canBeInSeries(node: Node) {
+  return getSeriesName(node) !== null;
+}
+
+/**
+ * Returns the name of the possible grouped series containing this node.
+ * Returns null if the node cannot be part of a grouped series of nodes.
+ */
+export function getSeriesName(node: Node) {
+  if (!node) {
+    return null;
+  }
+  if (node.type === NodeType.SERIES) {
+    return node.name;
+  }
+  if (node.type === NodeType.OP) {
+    let op = <OpNode>node;
+    return op.owningSeries;
+  }
+  return null;
+}
+
+/**
+ * Returns the SeriesNode that represents the series that the provided node
+ * is contained in (or itself if the provided node is itself a SeriesNode).
+ * Returns null if the node is not rendered as part of a series.
+ */
+function getContainingSeries(node: Node) {
+  let s: SeriesNode = null;
+  if (!node) {
+    return null;
+  } else if (node.type === NodeType.SERIES) {
+    s = <SeriesNode>node;
+  } else if (node.parentNode && node.parentNode.type === NodeType.SERIES) {
+    s = <SeriesNode>node.parentNode;
+  }
+  return s;
+}
+
+/**
+ * Returns the label for a button to toggle the group setting of the provided
+ * node.
+ */
+export function getGroupSettingLabel(node: Node) {
+  return tf.graph.getGroupSeriesNodeButtonString(
+    getContainingSeries(node) !== null ? tf.graph.SeriesGroupingType.GROUP :
+     tf.graph.SeriesGroupingType.UNGROUP);
+}
+
+/**
+ * Append svg text for label and assign data.
+ * @param nodeGroup
+ * @param renderNodeInfo The render node information for the label.
+ * @param sceneElement <tf-graph-scene> polymer element.
+ */
+function labelBuild(nodeGroup, renderNodeInfo: render.RenderNodeInfo,
+    sceneElement) {
+  let namePath = renderNodeInfo.node.name.split('/');
+  let text = namePath[namePath.length - 1];
+
+  // Truncate long labels for unexpanded Metanodes.
+  let useFontScale = renderNodeInfo.node.type === NodeType.META &&
+    !renderNodeInfo.expanded;
+
+  let label = scene.selectOrCreateChild(nodeGroup, 'text', Class.Node.LABEL);
+
+  // Make sure the label is visually on top among its siblings.
+  let labelNode = <HTMLElement> label.node();
+  labelNode.parentNode.appendChild(labelNode);
+
+  label.attr('dy', '.35em').attr('text-anchor', 'middle');
+  if (useFontScale) {
+    if (text.length > sceneElement.maxMetanodeLabelLength) {
+      text = text.substr(0, sceneElement.maxMetanodeLabelLength - 2) + '...';
+    }
+    let scale = getLabelFontScale(sceneElement);
+    label.attr('font-size', scale(text.length) + 'px');
+  }
+
+  let txtElement = <d3.Selection<any, any, any, any>>label.text(text);
+  enforceLabelWidth(txtElement, renderNodeInfo.node.type, renderNodeInfo);
+  return label;
+}
+/**
+ * This function shortens text which would exceed the maximum pixel width of
+ * a label.
+ *
+ * @param txtElementSelection The text element containing the label's text as d3
+ * selection.
+ * @param nodeType The type of the node the label belongs to. If the node is
+ * an annotation, the value is -1. Label widths are defined in
+ * layout.PARAMS.nodeSize.{meta|op|...}.maxLabelWidth for nodes and
+ * layout.PARAMS.annotations.labelWidth for annotations.
+ * @param renderNodeInfo The render information about the node, required to
+ * determine whether META nodes are collapsed or expanded.
+ */
+export function enforceLabelWidth(
+    txtElementSelection: d3.Selection<any, any, any, any>, nodeType: NodeType | number,
+    renderNodeInfo?: render.RenderNodeInfo) {
+  // Get text element itself and its on-screen width.
+  let txtNode = <SVGTextElement>txtElementSelection.node();
+  let computedTxtLength = txtNode.getComputedTextLength();
+  let labelContent = txtNode.textContent;
+
+  // Get maximum length from settings.
+  let maxLength = null;
+  switch (nodeType) {
+    case NodeType.META:
+      if (renderNodeInfo && !renderNodeInfo.expanded) {  // Only trim text if
+        // node expanded.
+        maxLength = layout.PARAMS.nodeSize.meta.maxLabelWidth;
+      }
+      break;
+
+    case NodeType.OP:
+      maxLength = layout.PARAMS.nodeSize.op.maxLabelWidth;
+      break;
+
+    case -1:
+      maxLength = layout.PARAMS.annotations.maxLabelWidth;
+      break;
+
+    default:
+      break;
+  }
+
+  // Return if no max length provided for node type, or current label length is
+  // less than or equal to the provided length limit.
+  if (maxLength === null || computedTxtLength <= maxLength) {
+    return;
+  }
+
+  // Find the index of the character which exceeds the width.
+  // getSubStringLength performs far better than getComputedTextLength, and
+  // results in a 3x speed-up on average.
+  let index = 1;
+  while (txtNode.getSubStringLength(0, index) < maxLength) {
+    index++;
+  }
+
+  // Shorten the label starting at the string length known to be one
+  // character above max pixel length.
+  // When shortened the original label's substring is concatenated with
+  // '...', baseText contains the substring not including the '...'.
+  let baseText = <string>txtNode.textContent.substr(0, index);
+  do {
+    baseText = baseText.substr(0, baseText.length - 1);
+
+    // Recompute text length.
+    txtNode.textContent = baseText + '...';
+    computedTxtLength = txtNode.getComputedTextLength();
+  } while (computedTxtLength > maxLength && baseText.length > 0);
+
+  // Add tooltip with full name and return.
+  return txtElementSelection.append('title').text(labelContent);
+}
+
+/**
+ * d3 scale used for sizing font of labels, used by labelBuild,
+ * initialized once by getLabelFontScale.
+ */
+let fontScale = null;
+function getLabelFontScale(sceneElement) {
+  if (!fontScale) {
+    fontScale = d3.scaleLinear()
+      .domain([sceneElement.maxMetanodeLabelLengthLargeFont,
+        sceneElement.maxMetanodeLabelLength])
+      .range([sceneElement.maxMetanodeLabelLengthFontSize,
+        sceneElement.minMetanodeLabelLengthFontSize]).clamp(true);
+  }
+  return fontScale;
+}
+
+/**
+ * Set label position of a given node group
+ */
+function labelPosition(nodeGroup, cx: number, cy: number,
+    yOffset: number) {
+  scene.selectChild(nodeGroup, 'text', Class.Node.LABEL)
+      .transition()
+      .attr('x', cx)
+      .attr('y', cy + yOffset);
+};
+
+/**
+ * Select or append/insert shape for a node and assign renderNode
+ * as the shape's data.
+ *
+ * @param nodeGroup
+ * @param d Render node information.
+ * @param nodeClass class for the element.
+ * @return Selection of the shape.
+ */
+export function buildShape(nodeGroup, d, nodeClass: string) {
+  // Create a group to house the underlying visual elements.
+  let shapeGroup = scene.selectOrCreateChild(nodeGroup, 'g', nodeClass);
+  // TODO(jimbo): DOM structure should be templated in HTML somewhere, not JS.
+  switch (d.node.type) {
+    case NodeType.OP:
+      scene.selectOrCreateChild(shapeGroup, 'ellipse', Class.Node.COLOR_TARGET);
+      break;
+    case NodeType.SERIES:
+      // Choose the correct stamp to use to represent this series.
+      let stampType = 'annotation';
+      let groupNodeInfo = <render.RenderGroupNodeInfo>d;
+      if (groupNodeInfo.coreGraph) {
+        stampType =
+            groupNodeInfo.node.hasNonControlEdges ? 'vertical' : 'horizontal';
+      }
+      let classList = [Class.Node.COLOR_TARGET];
+      if (groupNodeInfo.isFadedOut) {
+        classList.push('faded-ellipse');
+      }
+      scene.selectOrCreateChild(shapeGroup, 'use', classList)
+          .attr('xlink:href', '#op-series-' + stampType + '-stamp');
+      scene.selectOrCreateChild(shapeGroup, 'rect', Class.Node.COLOR_TARGET)
+          .attr('rx', d.radius).attr('ry', d.radius);
+      break;
+    case NodeType.BRIDGE:
+      scene.selectOrCreateChild(shapeGroup, 'rect', Class.Node.COLOR_TARGET)
+          .attr('rx', d.radius).attr('ry', d.radius);
+      break;
+    case NodeType.META:
+      scene.selectOrCreateChild(shapeGroup, 'rect', Class.Node.COLOR_TARGET)
+          .attr('rx', d.radius).attr('ry', d.radius);
+      break;
+    default:
+      throw Error('Unrecognized node type: ' + d.node.type);
+  }
+  return shapeGroup;
+};
+
+export function nodeClass(d: render.RenderNodeInfo) {
+  switch (d.node.type) {
+    case NodeType.OP:
+      return Class.OPNODE;
+    case NodeType.META:
+      return Class.METANODE;
+    case NodeType.SERIES:
+      return Class.SERIESNODE;
+    case NodeType.BRIDGE:
+      return Class.BRIDGENODE;
+    case NodeType.ELLIPSIS:
+      return Class.ELLIPSISNODE;
+  };
+  throw Error('Unrecognized node type: ' + d.node.type);
+};
+
+/** Modify node and its subscene and its label's positional attributes */
+function position(nodeGroup, d: render.RenderNodeInfo) {
+  let shapeGroup = scene.selectChild(nodeGroup, 'g', Class.Node.SHAPE);
+  let cx = layout.computeCXPositionOfNodeShape(d);
+  switch (d.node.type) {
+    case NodeType.OP: {
+      // position shape
+      let shape = scene.selectChild(shapeGroup, 'ellipse');
+      scene.positionEllipse(shape, cx, d.y, d.coreBox.width, d.coreBox.height);
+      labelPosition(nodeGroup, cx, d.y, d.labelOffset);
+      break;
+    }
+    case NodeType.META: {
+      // position shape
+      let shape = scene.selectChild(shapeGroup, 'rect');
+      if (d.expanded) {
+        scene.positionRect(shape, d.x, d.y, d.width, d.height);
+        subscenePosition(nodeGroup, d);
+        // put label on top
+        labelPosition(nodeGroup, cx, d.y,
+          - d.height / 2 + d.labelHeight / 2);
+      } else {
+        scene.positionRect(shape, cx, d.y, d.coreBox.width, d.coreBox.height);
+        labelPosition(nodeGroup, cx, d.y, 0);
+      }
+      break;
+    }
+    case NodeType.SERIES: {
+      let shape = scene.selectChild(shapeGroup, 'use');
+      if (d.expanded) {
+        scene.positionRect(shape, d.x, d.y, d.width, d.height);
+        subscenePosition(nodeGroup, d);
+        // put label on top
+        labelPosition(nodeGroup, cx, d.y,
+          - d.height / 2 + d.labelHeight / 2);
+      } else {
+        scene.positionRect(shape, cx, d.y, d.coreBox.width, d.coreBox.height);
+        labelPosition(nodeGroup, cx, d.y, d.labelOffset);
+      }
+      break;
+    }
+    case NodeType.BRIDGE: {
+      // position shape
+      // NOTE: In reality, these will not be visible, but it helps to put them
+      // in the correct position for debugging purposes.
+      let shape = scene.selectChild(shapeGroup, 'rect');
+      scene.positionRect(shape, d.x, d.y, d.width, d.height);
+      break;
+    }
+    default: { throw Error('Unrecognized node type: ' + d.node.type); }
+  }
+};
+
+/** Enum specifying the options to color nodes by */
+export enum ColorBy {STRUCTURE, DEVICE, XLA_CLUSTER, COMPUTE_TIME, MEMORY}
+;
+
+/**
+ * Returns the fill color for the node given its state and the 'color by'
+ * option.
+ */
+export function getFillForNode(templateIndex, colorBy,
+    renderInfo: render.RenderNodeInfo, isExpanded: boolean): string {
+  let colorParams = render.MetanodeColors;
+  switch (colorBy) {
+    case ColorBy.STRUCTURE:
+      if (renderInfo.node.type === NodeType.META) {
+        let tid = (<Metanode>renderInfo.node).templateId;
+        return tid === null ?
+          colorParams.UNKNOWN :
+          colorParams.STRUCTURE_PALETTE(templateIndex(tid), isExpanded);
+      } else if (renderInfo.node.type === NodeType.SERIES) {
+        // If expanded, we're showing the background rect, which we want to
+        // appear gray. Otherwise we're showing a stack of ellipses which we
+        // want to show white.
+        return isExpanded ? colorParams.EXPANDED_COLOR : 'white';
+      } else if (renderInfo.node.type === NodeType.BRIDGE) {
+        return renderInfo.structural ?
+            '#f0e' :
+            (<BridgeNode>renderInfo.node).inbound ? '#0ef' : '#fe0';
+      } else {
+        // Op nodes are white.
+        return 'white';
+      }
+    case ColorBy.DEVICE:
+      if (renderInfo.deviceColors == null) {
+        // Return the hue for unknown device.
+        return colorParams.UNKNOWN;
+      }
+      let id = renderInfo.node.name;
+      let escapedId = tf.graph.util.escapeQuerySelector(id);
+      let gradientDefs = d3.select('svg#svg defs #linearGradients');
+      let linearGradient = gradientDefs.select('linearGradient#' + escapedId);
+      // If the linear gradient is not there yet, create it.
+      if (linearGradient.size() === 0) {
+        linearGradient = gradientDefs.append('linearGradient').attr('id', id);
+        // Re-create the stops of the linear gradient.
+        linearGradient.selectAll('*').remove();
+        let cumulativeProportion = 0;
+        // For each device, create a stop using the proportion of that device.
+        _.each(renderInfo.deviceColors, d => {
+          let color = d.color;
+          linearGradient.append('stop')
+              .attr('offset', cumulativeProportion)
+              .attr('stop-color', color);
+          linearGradient.append('stop')
+              .attr('offset', cumulativeProportion + d.proportion)
+              .attr('stop-color', color);
+          cumulativeProportion += d.proportion;
+        });
+      }
+      return isExpanded ? colorParams.EXPANDED_COLOR : `url(#${escapedId})`;
+    case ColorBy.XLA_CLUSTER:
+      return isExpanded ? colorParams.EXPANDED_COLOR :
+                          renderInfo.xlaClusterColor || colorParams.UNKNOWN;
+    case ColorBy.COMPUTE_TIME:
+      return isExpanded ?
+        colorParams.EXPANDED_COLOR : renderInfo.computeTimeColor ||
+        colorParams.UNKNOWN;
+    case ColorBy.MEMORY:
+      return isExpanded ?
+        colorParams.EXPANDED_COLOR : renderInfo.memoryColor ||
+        colorParams.UNKNOWN;
+    default:
+      throw new Error('Unknown case to color nodes by');
+  }
+}
+
+/**
+ * Modify node style by toggling class and assign attributes (only for things
+ * that can't be done in css).
+ */
+export function stylize(nodeGroup, renderInfo: render.RenderNodeInfo,
+    sceneElement, nodeClass?) {
+  nodeClass = nodeClass || Class.Node.SHAPE;
+  let isHighlighted = sceneElement.isNodeHighlighted(renderInfo.node.name);
+  let isSelected = sceneElement.isNodeSelected(renderInfo.node.name);
+  let isExtract = renderInfo.isInExtract || renderInfo.isOutExtract;
+  let isExpanded = renderInfo.expanded;
+  let isFadedOut = renderInfo.isFadedOut;
+  nodeGroup.classed('highlighted', isHighlighted);
+  nodeGroup.classed('selected', isSelected);
+  nodeGroup.classed('extract', isExtract);
+  nodeGroup.classed('expanded', isExpanded);
+  nodeGroup.classed('faded', isFadedOut);
+
+  // Main node always exists here and it will be reached before subscene,
+  // so d3 selection is fine here.
+  let node = nodeGroup.select('.' + nodeClass + ' .' + Class.Node.COLOR_TARGET);
+  let fillColor = getFillForNode(sceneElement.templateIndex,
+    ColorBy[sceneElement.colorBy.toUpperCase()],
+    renderInfo, isExpanded);
+  node.style('fill', fillColor);
+
+  // Choose outline to be darker version of node color if the node is a single
+  // color and is not selected.
+  node.style('stroke', isSelected ? null : getStrokeForFill(fillColor));
+};
+
+/**
+ * Given a node's fill color/gradient, determine the stroke for the node.
+ */
+export function getStrokeForFill(fill: string) {
+  // If node is colored by a gradient, then use a dark gray outline.
+  return fill.substring(0, 3) === 'url' ?
+      render.MetanodeColors.GRADIENT_OUTLINE :
+      d3.rgb(fill).darker().toString();
+}
+
+/**
+ * Finds selected node and highlights all nodes which are providing direct
+ * or indirect input to the node and all edges connecting these nodes
+ * together and to the selected node.
+ *
+ * @param renderGraphInfo Information on the rendered state of the graph.
+ */
+export function traceInputs(renderGraphInfo: tf.graph.render.RenderGraphInfo) {
+  // Reset all styling.
+  d3.selectAll('.input-highlight').classed('input-highlight', false);
+  d3.selectAll('.non-input').classed('non-input', false);
+  d3.selectAll('.input-parent').classed('input-parent', false);
+  d3.selectAll('.input-child').classed('input-child', false);
+  d3.selectAll('.input-edge-highlight').classed('input-edge-highlight', false);
+  d3.selectAll('.non-input-edge-highlight')
+      .classed('non-input-edge-highlight', false);
+  d3.selectAll('.input-highlight-selected')
+      .classed('input-highlight-selected', false);
+
+  // Extract currently selected node. Return if input tracing disabled or no
+  // node is selected.
+  let selectedNodeSelectorString = 'g.node.selected,g.op.selected';
+  let node = d3.select(selectedNodeSelectorString);
+  let currentNode = undefined;
+  if (renderGraphInfo && renderGraphInfo.traceInputs && node && node[0] &&
+      node[0][0]) {
+    currentNode = node[0][0] as Element;
+  } else {
+    return;
+  }
+  let nodeName = currentNode.getAttribute('data-name');
+  let opNodes = _getAllContainedOpNodes(nodeName, renderGraphInfo);
+  let allTracedNodes = {};
+  _.each(opNodes, function(nodeInstance) {
+    allTracedNodes =
+        traceAllInputsOfOpNode(renderGraphInfo, nodeInstance, allTracedNodes);
+  });
+
+  d3.selectAll(selectedNodeSelectorString)
+    // Remove the input-highlight from the selected node.
+    .classed('input-highlight', false)
+    // Add input-highlight-selected class to selected node, which allows
+    // treating the selected not as a special case of an input node.
+    .classed('input-highlight-selected', true)
+
+  // Highlight all parent nodes of each OpNode as input parent to allow
+  // specific highlighting.
+  let highlightedNodes = Object.keys(allTracedNodes);
+  let visibleNodes =
+      _findVisibleParentsFromOpNodes(renderGraphInfo, highlightedNodes);
+  _markParentsOfNodes(visibleNodes);
+
+  // Attach class to all non-input nodes and edges for styling.
+  d3.selectAll(
+        'g.node:not(.selected):not(.input-highlight)' +
+        ':not(.input-parent):not(.input-children)')
+      .classed('non-input', true)
+      .each(function(d: RenderNodeInfo) {
+        // Mark all nodes with the specified name as non-inputs. This
+        // results in Annotation nodes which are attached to inputs to be
+        // tagged as well.
+        let nodeName = d.node.name;
+        d3.selectAll(`[data-name="${nodeName}"]`).classed('non-input', true);
+      });
+  d3.selectAll('g.edge:not(.input-edge-highlight)')
+      .classed('non-input-edge-highlight', true);
+}
+
+/**
+ * Recursively find all op nodes contained by the node identified by the
+ * provided name.
+ * @param nodeName The meta or op node of which the OpNode instances are
+ * required.
+ * @param renderGraphInfo The rendered graph information object.
+ * @returns {Array} An array of OpNodeImpl instances.
+ */
+export function _getAllContainedOpNodes(
+    nodeName: string, renderGraphInfo: tf.graph.render.RenderGraphInfo) {
+  let opNodes = [];
+
+  // Get current node.
+  let node = renderGraphInfo.getNodeByName(nodeName) as tf.graph.GroupNode |
+      tf.graph.OpNode;
+
+  // If node is already OpNode then return the node plus its input embeddings.
+  if (node instanceof tf.graph.OpNodeImpl) {
+    return [node].concat(node.inEmbeddings);
+  }
+
+  // Otherwise, make recursive call for each node contained by the GroupNode.
+  let childNodeNames = (node as tf.graph.GroupNode).metagraph.nodes();
+  _.each(childNodeNames, function(childNodeName) {
+    opNodes =
+        opNodes.concat(_getAllContainedOpNodes(childNodeName, renderGraphInfo));
+  });
+
+  return opNodes;
+}
+
+/**
+ * When resolving inputs of a node the visible parent node of each input
+ * node (i.e. the first parent which is rendered to the screen) needs to be
+ * found, and since such a node may contain several input OpNodes a map
+ * of the visible parent to all the input OpNodes it contains is provided by
+ * opNodes.
+ */
+interface VisibleParent {
+  visibleParent: Node;
+  opNodes: OpNode[];
+}
+
+export function traceAllInputsOfOpNode(
+    renderGraphInfo: tf.graph.render.RenderGraphInfo, startNode: OpNode,
+    allTracedNodes: Object) {
+  // To prevent infinite loops due to cyclical relationships and improving
+  // performance by tracing OpNode which is input to 2+ nodes only once.
+  if (allTracedNodes[startNode.name]) {
+    return allTracedNodes;
+  } else {
+    allTracedNodes[startNode.name] = true;
+  }
+  // Extract the inputs.
+  let inputs = startNode.inputs;
+  // Get visible parent.
+  let currentVisibleParent = getVisibleParent(renderGraphInfo, startNode);
+  // Mark as input node.
+  d3.select(`.node[data-name="${currentVisibleParent.name}"]`)
+      .classed('input-highlight', true);
+
+  // Find the visible parent of each input.
+  let visibleInputs = {};
+  _.each(inputs, function(nodeInstance) {
+    let resolvedNode = renderGraphInfo.getNodeByName(nodeInstance.name);
+    if (resolvedNode === undefined) {
+      // Node could not be found in rendered Hierarchy, which happens when
+      // tracing inputs of a SummaryNode.
+      return;
+    }
+    // Ensure node is resolved to OpNode if name collision with Metanode exists.
+    if (resolvedNode instanceof MetanodeImpl) {
+      let resolvedNodeName = tf.graph.getStrictName(resolvedNode.name);
+      resolvedNode = renderGraphInfo.getNodeByName(resolvedNodeName) as OpNode;
+    }
+
+    let visibleParent = getVisibleParent(renderGraphInfo, resolvedNode);
+
+    // Append OpNode to visible parent entry.
+    let visibleInputsEntry = visibleInputs[visibleParent.name];
+    if (visibleInputsEntry) {
+      visibleInputsEntry.opNodes.push(resolvedNode);
+    } else {  // Create new entry.
+      visibleInputs[visibleParent.name] = {
+        visibleParent: visibleParent,
+        opNodes: [resolvedNode]
+      } as VisibleParent;
+    }
+  });
+
+  // Find all parents of the start node.
+  let startNodeParents = {};
+  let indexedStartNodeParents = [currentVisibleParent];
+  startNodeParents[currentVisibleParent.name] = {
+    traced: false,
+    index: 0,
+    connectionEndpoints: []
+  };
+
+  let currentNode = currentVisibleParent as Node;
+  for (let index = 1; currentNode.name !== tf.graph.ROOT_NAME; index++) {
+    currentNode = currentNode.parentNode;
+    startNodeParents[currentNode.name] = {
+      traced: false,
+      index: index,
+      connectionEndpoints: []
+    };
+    indexedStartNodeParents[index] = currentNode;
+  }
+
+  // Find first mutual parent of each input node and highlight connection.
+  _.forOwn(visibleInputs, function(visibleParentInfo: VisibleParent, key) {
+    let nodeInstance = visibleParentInfo.visibleParent;
+    // Make recursive call for each input-OpNode contained by the visible
+    // parent.
+    _.each(visibleParentInfo.opNodes, function(opNode: OpNode) {
+      allTracedNodes =
+          traceAllInputsOfOpNode(renderGraphInfo, opNode, allTracedNodes);
+    });
+
+    if (nodeInstance.name !== currentVisibleParent.name) {
+      _createVisibleTrace(
+          nodeInstance, startNodeParents, indexedStartNodeParents);
+    }
+  });
+
+  return allTracedNodes;
+}
+
+/**
+ * Colors the edges to connect the passed node to the start node. This is
+ * done by:
+ *
+ * a) Finding the first (visible) common parent in the rendered
+ * hierarchy.
+ * NB: There are 2 types of connections:
+ * 1) Direct connections between node A
+ * and B, marked below as II,
+ * 2) Connections from any node A to its parent, A'. Marked below as I and III.
+ * For type 2 connection you need to know the inner-nested node, the
+ * direct parent, and the ultimate destination of the connection.
+ *
+ *  A_parent      B_parent
+ * +--------+    +---------+
+ * |        |    |         |
+ * |  +--+ I| II |III+--+  |
+ * |  |A +---------->+B |  |
+ * |  +--+  |    |   +--+  |
+ * |        |    |         |
+ * +--------+    +---------+
+ *
+ *
+ * b) Highlighting the direct connection between the parents of A and B,
+ * called A_parent and B_parent, s.t. A_parent and B_parent are children of the
+ * mutual parent of A and B found in a), marked above as II.
+ *
+ * c) Highlighting the connection from A to A_parent and B to B_parent
+ * (through all layers of parents between A and A_parent and B and B_parent,
+ * respectively). Marked above as I and III.
+ *
+ * @param nodeInstance The instance of the node to use as destination node, B.
+ * @param startNodeParents Map of startNodeParent names to information objects
+ * about the parent.
+ * @param indexedStartNodeParents An array of all parents of the start node.
+ * This is required to find the child of the mutual parent which is a parent
+ * of the start node.
+ * @private
+ */
+function _createVisibleTrace(
+    nodeInstance: Node, startNodeParents, indexedStartNodeParents: Node[]) {
+  let currentNode = nodeInstance;
+  let previousNode = nodeInstance;
+
+  // Ascend through parents until a mutual parent is found with the start
+  // node.
+  let destinationParentPairs = [];
+  while (!startNodeParents[currentNode.name]) {
+    if (previousNode.name !== currentNode.name) {
+      destinationParentPairs.push([previousNode, currentNode]);
+    }
+    previousNode = currentNode;
+    currentNode = currentNode.parentNode;
+  }
+
+  // Connection between nodes is drawn between the parents of each
+  // respective node, both of which share the mutual parent.
+  let startNodeIndex = startNodeParents[currentNode.name].index;
+  let startNodeName =
+      indexedStartNodeParents[Math.max(startNodeIndex - 1, 0)].name;
+
+  let startNodeTopParentName = startNodeName;
+  let targetNodeTopParentName = previousNode.name;
+
+  let endNodeName = previousNode.name;
+  d3.selectAll(`[data-edge="${endNodeName}--${startNodeName}"]`)
+      .classed('input-edge-highlight', true);
+
+  // Trace up the parents of the input.
+  _.each(destinationParentPairs, function(value) {
+    let inner = value[0];
+    let outer = value[1];
+    let edgeSelector = `[data-edge="${inner.name}--${startNodeTopParentName}` +
+        `~~${outer.name}~~OUT"]`;
+    d3.selectAll(edgeSelector).classed('input-edge-highlight', true);
+  });
+
+  // Trace up the parents of the start node.
+  for (let index = 1; index < startNodeIndex; index++) {
+    let inner = indexedStartNodeParents[index - 1];
+    let outer = indexedStartNodeParents[index];
+    let edgeSelector = `[data-edge="${targetNodeTopParentName}~~${outer.name}` +
+        `~~IN--${inner.name}"]`;
+    d3.selectAll(edgeSelector).classed('input-edge-highlight', true);
+  }
+}
+
+/**
+ * Creates map { [name: string] -> Node } of all visible / rendered parents
+ * of the nodes identified by the node names passed in.
+ *
+ * @param renderGraphInfo The information on the rendered graph.
+ * @param nodeNames String array of node names.
+ * @returns {[nodeName: string]: Node}
+ * @private
+ */
+function _findVisibleParentsFromOpNodes(renderGraphInfo, nodeNames: string[]) {
+  let visibleParents: {[nodeName: string]: Node} = {};
+  _.each(nodeNames, function(nodeName) {
+    let currentNode = renderGraphInfo.getNodeByName(nodeName);
+    let visibleParent = getVisibleParent(renderGraphInfo, currentNode);
+    visibleParents[visibleParent.name] = visibleParent;
+  });
+
+  return visibleParents;
+}
+
+/**
+ * Traverse through the parents of all nodes in the list and mark each
+ * encountered node as input-parent.
+ * @param visibleNodes Map of input nodes, have to be visible/rendered when
+ * called.
+ * @private
+ */
+function _markParentsOfNodes(visibleNodes: {[nodeName: string]: Node}) {
+  _.forOwn(visibleNodes, function(nodeInstance: Node) {
+    // Mark all parents of the node as input-parents.
+    let currentNode = nodeInstance;
+
+    while (currentNode.name !== tf.graph.ROOT_NAME) {
+      let renderedElement = d3.select(`.node[data-name="${currentNode.name}"]`);
+      // Only mark the element as a parent node to an input if it is not
+      // marked as input node itself.
+      if (renderedElement[0][0] &&
+          !renderedElement.classed('input-highlight') &&
+          !renderedElement.classed('selected') &&
+          // OpNode only parent if start node is embedded node, in which case
+          // the OpNode should be faded as well.
+          !renderedElement.classed('op')) {
+        renderedElement.classed('input-parent', true);
+      }
+      currentNode = currentNode.parentNode;
+    }
+  });
+}
+
+/**
+ * Find the parent of the passed in op node which is expanded. This is done
+ * by going through all parents until the parent's parent is expanded, thus
+ * finding the first unexpanded parent which is rendered on the screen.
+ * @param renderGraphInfo The graph info object used to gain access to the
+ * render info of the parents.
+ * @param currentNode The node whose parent is to be found.
+ * @returns Node
+ */
+export function getVisibleParent(
+    renderGraphInfo: tf.graph.render.RenderGraphInfo,
+    currentNode: tf.graph.Node) {
+  let found = false;
+  let currentParent = currentNode;
+
+  while (!found) {
+    // Get parent element, to extract name.
+    currentNode = currentParent;
+    currentParent = currentNode.parentNode;
+
+    if (currentParent === undefined) {
+      found = true;
+    } else {
+      let renderNode = renderGraphInfo.getRenderNodeByName(currentParent.name);
+      // Found if node is rendered on the screen (renderNode truthy), and
+      // the parent is either expanded (i.e. it is a metanode or seriesnode)
+      // or the parent is an OpNode in which case currentNode is an embedded
+      // node which has another OpNode as parent.
+      if (renderNode &&
+          (renderNode.expanded || currentParent instanceof graph.OpNodeImpl)) {
+        found = true;
+      }
+    }
+  }  // Close while loop.
+  return currentNode;
+}
+}  // Close module.
diff --git a/tensorflow/tensorboard/components/tf_graph_common_d3v4/parser.ts b/tensorflow/tensorboard/components/tf_graph_common_d3v4/parser.ts
new file mode 100644
index 0000000000000000000000000000000000000000..04d879ef9108fafbf1e99bd43ac868bea11860f2
--- /dev/null
+++ b/tensorflow/tensorboard/components/tf_graph_common_d3v4/parser.ts
@@ -0,0 +1,284 @@
+/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the 'License');
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an 'AS IS' BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+module tf.graph.parser {
+
+/**
+ * Parses a native js value, which can be either a string, boolean or number.
+ *
+ * @param value The value to be parsed.
+ */
+function parseValue(value: string): string|number|boolean {
+  if (value === 'true') {
+    return true;
+  }
+  if (value === 'false') {
+    return false;
+  }
+  let firstChar = value[0];
+  if (firstChar === '"') {
+    return value.substring(1, value.length - 1);
+  }
+  let num = parseFloat(value);
+  return isNaN(num) ? value : num;
+}
+
+/**
+ * Fetches a text file and returns a promise of the result.
+ */
+export function fetchPbTxt(filepath: string): Promise<ArrayBuffer> {
+  return new Promise<ArrayBuffer>(function(resolve, reject) {
+    const request = new XMLHttpRequest();
+    request.open('GET', filepath);
+    request.responseType = 'arraybuffer';
+
+    request.onerror = () => reject(request.status);
+    request.onload = () => resolve(request.response);
+
+    request.send(null);
+  });
+}
+
+/**
+ * Fetches the metadata file, parses it and returns a promise of the result.
+ */
+export function fetchAndParseMetadata(path: string, tracker: ProgressTracker) {
+  return tf.graph.util
+      .runTask(
+          'Reading metadata pbtxt', 40,
+          () => {
+            if (path == null) {
+              return Promise.resolve(null);
+            }
+            return fetchPbTxt(path);
+          },
+          tracker)
+      .then((arrayBuffer: ArrayBuffer) => {
+        return tf.graph.util.runAsyncPromiseTask(
+            'Parsing metadata.pbtxt', 60, () => {
+              return arrayBuffer != null ? parseStatsPbTxt(arrayBuffer) :
+                                           Promise.resolve(null);
+            }, tracker);
+      });
+}
+
+/**
+ * Fetches the graph file, parses it and returns a promise of the result. The
+ * result will be undefined if the graph is empty.
+ */
+export function fetchAndParseGraphData(path: string, pbTxtFile: Blob,
+    tracker: ProgressTracker) {
+  return tf.graph.util
+      .runTask(
+          'Reading graph pbtxt', 40,
+          () => {
+            if (pbTxtFile) {
+              return new Promise<ArrayBuffer>(function(resolve, reject) {
+                let fileReader = new FileReader();
+                fileReader.onload = () => resolve(fileReader.result);
+                fileReader.onerror = () => reject(fileReader.error);
+                fileReader.readAsArrayBuffer(pbTxtFile);
+              });
+            } else {
+              return fetchPbTxt(path);
+            }
+          },
+          tracker)
+      .then((arrayBuffer: ArrayBuffer) => {
+        return tf.graph.util.runTask('Parsing graph.pbtxt', 60, () => {
+          return parseGraphPbTxt(arrayBuffer);
+        }, tracker);
+      });
+}
+
+/**
+ * Parse a file object in a streaming fashion line by line (or custom delim).
+ * Can handle very large files.
+ * @param input The file object as an array buffer.
+ * @param callback The callback called on each line
+ * @param chunkSize The size of each read chunk. (optional)
+ * @param delim The delimiter used to split a line. (optional)
+ * @returns A promise for when it is finished.
+ */
+export function streamParse(
+    arrayBuffer: ArrayBuffer, callback: (string) => void,
+    chunkSize: number = 1000000, delim: string = '\n'): Promise<boolean> {
+  return new Promise<boolean>(function(resolve, reject) {
+    let offset = 0;
+    let bufferSize = arrayBuffer.byteLength - 1;
+    let data = '';
+
+    function readHandler(str) {
+      offset += chunkSize;
+      let parts = str.split(delim);
+      let first = data + parts[0];
+      if (parts.length === 1) {
+        data = first;
+        readChunk(offset, chunkSize);
+        return;
+      }
+      data = parts[parts.length - 1];
+      callback(first);
+      for (let i = 1; i < parts.length - 1; i++) {
+        callback(parts[i]);
+      }
+      if (offset >= bufferSize) {
+        if (data) {
+          callback(data);
+        }
+        resolve(true);
+        return;
+      }
+      readChunk(offset, chunkSize);
+    }
+
+    function readChunk(offset: number, size: number) {
+      const arrayBufferChunk = arrayBuffer.slice(offset, offset + size);
+
+      const blob = new Blob([arrayBufferChunk]);
+      const file = new FileReader();
+      file.onload = (e: any) => readHandler(e.target.result);
+      file.readAsText(blob);
+    }
+
+    readChunk(offset, chunkSize);
+  });
+}
+
+/**
+ * Since proto-txt doesn't explicitly say whether an attribute is repeated
+ * (an array) or not, we keep a hard-coded list of attributes that are known
+ * to be repeated. This list is used in parsing time to convert repeated
+ * attributes into arrays even when the attribute only shows up once in the
+ * object.
+ */
+const GRAPH_REPEATED_FIELDS: {[attrPath: string]: boolean} = {
+  'node': true,
+  'node.input': true,
+  'node.attr': true,
+  'node.attr.value.list.type': true,
+  'node.attr.value.shape.dim': true,
+  'node.attr.value.tensor.string_val': true,
+  'node.attr.value.tensor.tensor_shape.dim': true,
+  'node.attr.value.list.shape': true,
+  'node.attr.value.list.shape.dim': true,
+  'node.attr.value.list.s': true
+};
+
+const METADATA_REPEATED_FIELDS: {[attrPath: string]: boolean} = {
+  'step_stats.dev_stats': true,
+  'step_stats.dev_stats.node_stats': true,
+  'step_stats.dev_stats.node_stats.output': true,
+  'step_stats.dev_stats.node_stats.memory': true,
+  'step_stats.dev_stats.node_stats.output.tensor_description.shape.dim': true
+};
+
+/**
+ * Parses an ArrayBuffer of a proto txt file into a raw Graph object.
+ */
+export function parseGraphPbTxt(input: ArrayBuffer):
+    Promise<tf.graph.proto.NodeDef[]> {
+  return parsePbtxtFile(input, GRAPH_REPEATED_FIELDS).then(obj => obj['node']);
+}
+
+/**
+ * Parses an ArrayBuffer of a proto txt file into a StepStats object.
+ */
+export function parseStatsPbTxt(input: ArrayBuffer):
+    Promise<tf.graph.proto.StepStats> {
+  return parsePbtxtFile(input, METADATA_REPEATED_FIELDS)
+      .then(obj => obj['step_stats']);
+}
+
+/**
+ * Parses a ArrayBuffer of a proto txt file into javascript object.
+ *
+ * @param input The ArrayBuffer or file object implementing slice.
+ * @param repeatedFields Map (Set) of all the repeated fields, since you can't
+ *   tell directly from the pbtxt if a field is repeated or not.
+ * @returns The parsed object.
+ */
+function parsePbtxtFile(
+    input: ArrayBuffer,
+    repeatedFields: {[attrPath: string]: boolean}): Promise<Object> {
+  let output: { [name: string]: any; } = {};
+  let stack = [];
+  let path: string[] = [];
+  let current: { [name: string]: any; } = output;
+
+  function splitNameAndValueInAttribute(line: string) {
+    let colonIndex = line.indexOf(':');
+    let name = line.substring(0, colonIndex).trim();
+    let value = parseValue(line.substring(colonIndex + 2).trim());
+    return {
+      name: name,
+      value: value
+    };
+  }
+
+  /**
+   * Adds a value, given the attribute name and the host object. If the
+   * attribute already exists, but is not an array, it will convert it to an
+   * array of values.
+   *
+   * @param obj The host object that holds the attribute.
+   * @param name The attribute name (key).
+   * @param value The attribute value.
+   * @param path A path that identifies the attribute. Used to check if
+   *     an attribute is an array or not.
+   */
+  function addAttribute(obj: Object, name: string,
+      value: Object|string|number|boolean, path: string[]): void {
+    // We treat 'node' specially since it is done so often.
+    let existingValue = obj[name];
+    if (existingValue == null) {
+      obj[name] = path.join('.') in repeatedFields ? [value] : value;
+    } else if (Array.isArray(existingValue)) {
+      existingValue.push(value);
+    } else {
+      obj[name] = [existingValue, value];
+    }
+  }
+
+  // Run through the file a line at a time.
+  return streamParse(input, function(line: string) {
+    if (!line) {
+      return;
+    }
+    line = line.trim();
+
+    switch (line[line.length - 1]) {
+      case '{':  // create new object
+        let name = line.substring(0, line.length - 2).trim();
+        let newValue: { [name: string]: any; } = {};
+        stack.push(current);
+        path.push(name);
+        addAttribute(current, name, newValue, path);
+        current = newValue;
+        break;
+      case '}':
+        current = stack.pop();
+        path.pop();
+        break;
+      default:
+        let x = splitNameAndValueInAttribute(line);
+        addAttribute(current, x.name, x.value, path.concat(x.name));
+        break;
+    }
+  }).then(function() {
+    return output;
+  });
+}
+
+} // Close module tf.graph.parser.
diff --git a/tensorflow/tensorboard/components/tf_graph_common_d3v4/proto.ts b/tensorflow/tensorboard/components/tf_graph_common_d3v4/proto.ts
new file mode 100644
index 0000000000000000000000000000000000000000..eda73e45c3b27f77d5fc5790f57fd97ae3518382
--- /dev/null
+++ b/tensorflow/tensorboard/components/tf_graph_common_d3v4/proto.ts
@@ -0,0 +1,143 @@
+/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the 'License');
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an 'AS IS' BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+/**
+ * @fileoverview Interfaces that parallel proto definitions in
+ * third_party/tensorflow/core/framework/...
+ *     graph.proto
+ *     step_stats.proto
+ * These should stay in sync.
+ */
+module tf.graph.proto {
+  /**
+   * TensorFlow node definition as defined in the graph.proto file.
+   */
+  export interface NodeDef {
+    /** Name of the node */
+    name: string;
+    /** List of nodes that are inputs for this node. */
+    input: string[];
+    /** The name of the device where the computation will run. */
+    device: string;
+    /** The name of the operation associated with this node. */
+    op: string;
+    /** List of attributes that describe/modify the operation. */
+    attr: {key: string, value: Object}[];
+  }
+
+  /**
+   * Generic graph as defined in the graph_explorer.proto file.
+   */
+  export interface GenericGraph {
+    /** List of nodes in the graph */
+    node: GenericNode[];
+    /** List of nodes in the graph */
+    edge: GenericEdge[];
+    /** List of attributes that describe/modify the operation. */
+    attr: Array<{[key: string]: any}>;
+  }
+
+  /**
+   * GenericEdge corresponds to the Edge message in graph_explorer.proto.
+   */
+  export interface GenericEdge {
+    /** Name of the source node. */
+    source: string;
+    /** Name of the target node. */
+    target: string;
+    /** Attributes of the edge. */
+    edge_attr: Array<{[key: string]: any}>;
+  }
+
+  /**
+   * GenericNode corresponds to the Node message in graph_explorer.proto.
+   */
+  export interface GenericNode {
+    /** Name of the node */
+    name: string;
+    /** Attributes of a leaf node or leaf nodes within a metanode. */
+    node_attr: Array<{[key: string]: any}>;
+    /** Attributes of a metanode. */
+    metanode_attr: Array<{[key: string]: any}>;
+  }
+
+  /**
+   * TensorFlow stats file definition as defined in the stats proto file.
+   */
+  export interface StepStats {
+    dev_stats: {device: string, node_stats: NodeExecStats[]}[];
+  }
+
+  /**
+   * TensorFlow stats for a node as defined in the step_stats proto file.
+   */
+  export interface NodeExecStats {
+    node_name: string;
+    // The next 4 properties are currently stored as string in json
+    // and must be parsed.
+    all_start_micros: number;
+    op_start_rel_micros: number;
+    op_end_rel_micros: number;
+    all_end_rel_micros: number;
+    memory: {
+      allocator_name: string;
+      total_bytes: number;  // Stored as string in json and should be parsed.
+      peak_bytes: number;   // Stored as string in json and should be parsed.
+    }[];
+    /** Output sizes recorded for a single execution of a graph node */
+    output: NodeOutput[];
+    timeline_label: string;
+    scheduled_micros: string;
+    thread_id: string;
+  }
+
+  /**
+   * Description for the output tensor(s) of an operation in the graph as
+   * defined in the step_stats.proto file.
+   */
+  export interface NodeOutput {
+    slot: number;  // Stored as string in json and should be parsed.
+    tensor_description: {
+      /** Data type of tensor elements */
+      dtype: string;
+      /** Shape of the tensor */
+      shape: {
+        /**
+         * Dimensions of the tensor, such as [{name: 'input', size: 30},
+         * {name: 'output', size: 40}] for a 30 x 40 2D tensor.  The names
+         * are optional. The order of entries in 'dim' matters: It indicates
+         * the layout of the values in the tensor in-memory representation.
+         */
+        dim: {
+          /** Size of the tensor in that dimension */
+          size: number,  // Stored as string in json and should be parsed.
+          /** Optional name of the tensor dimension */
+          name?: string
+        }[];
+      };
+      /** Information about the size and allocator used for the data */
+      allocation_description: {
+        // The next 2 properties are stored as string in json and
+        // should be parsed.
+        /** Total number of bytes requested */
+        requested_bytes: number;
+        /** Total number of bytes allocated, if known */
+        allocated_bytes?: number;
+        /** Name of the allocator used */
+        allocator_name: string;
+      };
+    };
+  }
+}
diff --git a/tensorflow/tensorboard/components/tf_graph_common_d3v4/render.ts b/tensorflow/tensorboard/components/tf_graph_common_d3v4/render.ts
new file mode 100644
index 0000000000000000000000000000000000000000..474e358ba95f3cd00ccadb1ce7a3535341030c1e
--- /dev/null
+++ b/tensorflow/tensorboard/components/tf_graph_common_d3v4/render.ts
@@ -0,0 +1,1633 @@
+/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the 'License');
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an 'AS IS' BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+/**
+ * Package for the Render Hierarchy for TensorFlow graph.
+ */
+module tf.graph.render {
+
+export type Point = {x: number, y: number};
+
+/**
+ * Color parameters for op nodes.
+ */
+export let OpNodeColors = {DEFAULT_FILL: 'white', DEFAULT_STROKE: '#b2b2b2'};
+
+/**
+ * Color parameters for node encoding.
+ * @type {Object}
+ */
+export let MetanodeColors = {
+  /**
+   * Default fill and stroke to use when no other information is available.
+   */
+  DEFAULT_FILL: '#d9d9d9',
+  DEFAULT_STROKE: '#a6a6a6',
+  SATURATION: 0.6,
+  LIGHTNESS: 0.85,
+  /**
+   * Neutral color to use when the node is expanded (used when coloring by
+   * compute time, memory and device).
+   */
+  EXPANDED_COLOR: '#f0f0f0',
+  /**
+   * Standard hue values for node color palette.
+   */
+  HUES: [220, 100, 180, 40, 20, 340, 260, 300, 140, 60],
+  STRUCTURE_PALETTE(id: number, lightened?: boolean) {
+    // The code below is a flexible way to computationally create a set
+    // of colors that go well together.
+    let hues = MetanodeColors.HUES;
+    let n = hues.length;
+    let hue = hues[id % n];
+    let m = Math.sin(hue * Math.PI / 360);
+    let sat = lightened ? 30 : 90 - 60 * m;
+    let light = lightened ? 95 : 80;
+    return d3.hsl(hue, .01 * sat, .01 * light).toString();
+  },
+  DEVICE_PALETTE(index: number): string {
+    return MetanodeColors.STRUCTURE_PALETTE(index);
+  },
+  XLA_CLUSTER_PALETTE(index: number): string {
+    return MetanodeColors.STRUCTURE_PALETTE(index);
+  },
+  UNKNOWN: '#eee',
+  GRADIENT_OUTLINE: '#888'
+};
+
+/**
+ * Color parameters for op nodes.
+ */
+export let SeriesNodeColors = {
+  DEFAULT_FILL: 'white',
+  DEFAULT_STROKE: '#b2b2b2'
+};
+
+/**
+ * Parameters that affect how the graph is rendered on the screen.
+ */
+const PARAMS = {
+  /**
+   * Whether to extract high degree nodes from the core part of the graph.
+   */
+  enableExtraction: true,
+  /**
+   * The minimum number of nodes for a graph to have in order for high in and
+   * out degree nodes to be extracted in auxiliary. The aim here is to prevent
+   * nodes from being extracted from small graphs.
+   */
+  minNodeCountForExtraction: 15,
+  /**
+   * The minimum in or out degree a node must have in order to be possibly
+   * extracted.
+   */
+  minDegreeForExtraction: 5,
+  /**
+   * Maximum number of control edges a node can have before they aren't
+   * displayed.
+   */
+  maxControlDegree: 4,
+  /**
+   * Maximum in (for outbound bridge paths) or out (for inbound bridge paths)
+   * degree of a node allowed for a bridge path to be rendered to it from a
+   * subhierarchy of nodes. Having a max prevents having too many nodes emanate
+   * from a subhierarchy and crowding up.
+   */
+  maxBridgePathDegree: 4,
+  /**
+   * Types patterns for predefined out-extract nodes, which are
+   * sink-like nodes that will be extracted from the main graph.
+   */
+  outExtractTypes: [
+    'NoOp'  // NoOps are sink-like used for managing control dependencies.
+  ],
+
+  /**
+   * Types patterns for predefined in-extract nodes, which are
+   * source-like nodes that will be extracted from the main graph.
+   */
+  inExtractTypes: [],
+
+  /**
+   * When removing edges from a high degree node, remove all of its edges if
+   * detachAllEdgesForHighDegree is true.  Otherwise remove all in-edges if
+   * the node has high in-degree, or all out-edges if the node has high
+   * out-degree.
+   */
+  detachAllEdgesForHighDegree: true,
+
+  /**
+   * After extracting high in/out degree nodes and predefined
+   * source-like/sink-like, extract isolated nodes to the side
+   * if this extractIsolatedNodesWithAnnotationsOnOneSide is true.
+   */
+  extractIsolatedNodesWithAnnotationsOnOneSide: true,
+
+  /**
+   * Whether to add bridge nodes and edges to the core when building the
+   * subhierarchy of an expanded metanode. See buildSubhierarchy().
+   */
+  enableBridgegraph: true,
+
+  /**
+   * 2 colors, for the minimum and maximum value respectively, whenever we
+   * have a gradient scale.
+   */
+  minMaxColors: ['#fff5f0', '#fb6a4a'],
+
+  /**
+   * Maximum number of annotations to be displayed on a node before an
+   * ellipsis is used.
+   */
+  maxAnnotations: 5
+};
+
+/**
+ * Stores the rendering information, such as x and y coordinates,
+ * for each node in the graph.
+ */
+export class RenderGraphInfo {
+  hierarchy: hierarchy.Hierarchy;
+  private displayingStats: boolean;
+  private index: {[nodeName: string]: RenderNodeInfo};
+  private renderedOpNames: string[];
+  private deviceColorMap: d3.ScaleOrdinal<string, string>;
+  private xlaClusterColorMap: d3.ScaleOrdinal<string, string>;
+  private memoryUsageScale: d3.ScaleLinear<string, string>;
+  private computeTimeScale: d3.ScaleLinear<string, string>;
+  /** Scale for the thickness of edges when there is no shape information. */
+  edgeWidthScale:
+      d3.ScaleLinear<number, number> | d3.ScalePower<number, number>;
+  // Since the rendering information for each node is constructed lazily,
+  // upon node's expansion by the user, we keep a map between the node's name
+  // and whether the rendering information was already constructed for that
+  // node.
+  private hasSubhierarchy: {[nodeName: string]: boolean};
+  root: RenderGroupNodeInfo;
+  traceInputs: Boolean;
+
+  constructor(hierarchy: hierarchy.Hierarchy, displayingStats: boolean) {
+    this.hierarchy = hierarchy;
+    this.displayingStats = displayingStats;
+    this.index = {};
+    this.renderedOpNames = [];
+
+    this.computeScales();
+    // Maps node name to whether the rendering hierarchy was already
+    // constructed.
+    this.hasSubhierarchy = {};
+    this.root = new RenderGroupNodeInfo(hierarchy.root);
+    this.index[hierarchy.root.name] = this.root;
+    this.renderedOpNames.push(hierarchy.root.name);
+    this.buildSubhierarchy(hierarchy.root.name);
+    this.root.expanded = true;
+    this.traceInputs = false;
+  }
+
+  computeScales() {
+    this.deviceColorMap = d3.scaleOrdinal<string>()
+        .domain(this.hierarchy.devices)
+        .range(_.map(d3.range(this.hierarchy.devices.length),
+                     MetanodeColors.DEVICE_PALETTE));
+
+    this.xlaClusterColorMap =
+        d3.scaleOrdinal<string>()
+            .domain(this.hierarchy.xlaClusters)
+            .range(_.map(
+                d3.range(this.hierarchy.xlaClusters.length),
+                MetanodeColors.XLA_CLUSTER_PALETTE));
+
+    let topLevelGraph = this.hierarchy.root.metagraph;
+    // Find the maximum and minimum memory usage.
+    let memoryExtent = d3.extent(topLevelGraph.nodes(),
+        (nodeName, index) => {
+      let node = topLevelGraph.node(nodeName);
+      // Some ops don't have stats at all.
+      if (node.stats != null) {
+        return node.stats.totalBytes;
+      }
+    });
+    this.memoryUsageScale = d3.scaleLinear<string, string>()
+        .domain(memoryExtent)
+        .range(PARAMS.minMaxColors);
+
+    // Find also the minimum and maximum compute time.
+    let computeTimeExtent = d3.extent(topLevelGraph.nodes(),
+        (nodeName, index) => {
+      let node = topLevelGraph.node(nodeName);
+      // Some ops don't have stats at all.
+      if (node.stats != null) {
+        return node.stats.getTotalMicros();
+      }
+    });
+    this.computeTimeScale = d3.scaleLinear<string, string>()
+        .domain(computeTimeExtent)
+        .range(PARAMS.minMaxColors);
+
+    this.edgeWidthScale = this.hierarchy.hasShapeInfo ?
+      scene.edge.EDGE_WIDTH_SCALE :
+      d3.scaleLinear()
+        .domain([1, this.hierarchy.maxMetaEdgeSize])
+        .range([scene.edge.MIN_EDGE_WIDTH, scene.edge.MAX_EDGE_WIDTH]);
+  }
+
+  /**
+   * Get a previously created RenderNodeInfo by its node name.
+   */
+  getRenderNodeByName(nodeName: string): RenderNodeInfo {
+    return this.index[nodeName];
+  }
+
+  /**
+   * Get the underlying node in the hierarchical graph by its name.
+   */
+  getNodeByName(nodeName: string): Node {
+    return this.hierarchy.node(nodeName);
+  }
+
+  /**
+   * Get a previously created RenderNodeInfo for the specified node name,
+   * or create one if it hasn't been created yet.
+   */
+  getOrCreateRenderNodeByName(nodeName: string): RenderNodeInfo {
+    // Polymer may invoke this with null.
+    if (!nodeName) {
+      return null;
+    }
+
+    if (nodeName in this.index) {
+      return this.index[nodeName];
+    }
+
+    let node = this.hierarchy.node(nodeName);
+    // Exit early if the node does not exist in the hierarchy. This can happen
+    // when a graph is reloaded while the infocard points to a node not visible
+    // at the top-level.
+    if (!node) {
+      return null;
+    }
+    let renderInfo = node.isGroupNode ?
+        new RenderGroupNodeInfo(<GroupNode>node) :
+        new RenderNodeInfo(node);
+    this.index[nodeName] = renderInfo;
+    this.renderedOpNames.push(nodeName);
+
+    if (node.stats) {
+      renderInfo.memoryColor = this.memoryUsageScale(node.stats.totalBytes);
+      renderInfo.computeTimeColor =
+          this.computeTimeScale(node.stats.getTotalMicros());
+    }
+
+    if (!node.isGroupNode) {
+      let clusterName = (node as OpNode).xlaCluster;
+      if (clusterName) {
+        renderInfo.xlaClusterColor = this.xlaClusterColorMap(clusterName);
+      }
+    }
+
+    // We only fade nodes when we're displaying stats.
+    renderInfo.isFadedOut = this.displayingStats &&
+        !tf.graph.util.hasDisplayableNodeStats(node.stats);
+
+    if (node.isGroupNode) {
+      // Make a list of tuples (device, proportion), where proportion
+      // is the fraction of op nodes that have that device.
+      let pairs = _.pairs((<GroupNode>node).deviceHistogram);
+      if (pairs.length > 0) {
+        // Compute the total # of devices.
+        let numDevices = _.sum(pairs, _.last);
+        renderInfo.deviceColors = _.map(pairs, pair => ({
+              color: this.deviceColorMap(pair[0]),
+              // Normalize to a proportion of total # of devices.
+              proportion: pair[1] / numDevices
+            }));
+      }
+    } else {
+      let device = (<OpNode>renderInfo.node).device;
+      if (device) {
+        renderInfo.deviceColors = [{
+          color: this.deviceColorMap(device),
+          proportion: 1.0
+        }];
+      }
+    }
+
+    return this.index[nodeName];
+  }
+
+  /**
+   * Return the nearest ancestor node, including itself, that is visible
+   * in the visualization. This method is used so that we can select
+   * (highlight) a node that isn't drawn yet, by selecting (highlighting)
+   * its nearest ancestor that has been drawn.
+   */
+  getNearestVisibleAncestor(name: string): string {
+    let path = getHierarchicalPath(name);
+    for (let i = 0; i < path.length; i++) {
+      let nodeName = path[i];
+      // Op nodes have expanded set to false by default.
+      if (!this.getRenderNodeByName(nodeName).expanded) {
+        return nodeName;
+      }
+    }
+    // Fallthrough. If everything was expanded return the node.
+    return name;
+  }
+
+  // TODO(jimbo): Delete this an any code it touches (all deprecated).
+  setDepth(depth: number): void {
+    setGroupNodeDepth(this.root, +depth);
+  }
+
+  /**
+   * Returns true if the renderNode is an isolated node within its parent node.
+   */
+  isNodeAuxiliary(renderNode: RenderNodeInfo): boolean {
+    let parentNode = <RenderGroupNodeInfo>this.getRenderNodeByName(
+      renderNode.node.parentNode.name);
+    let found = _.find(parentNode.isolatedInExtract, node => {
+      return node.node.name === renderNode.node.name;
+    });
+    if (found) {
+      return true;
+    }
+    found = _.find(parentNode.isolatedOutExtract, node => {
+      return node.node.name === renderNode.node.name;
+    });
+    return !!found;
+  }
+
+  /**
+   * Returns a list of ops that have been rendered so far for this graph. More
+   * ops may later be rendered if the user expands nodes for instance. The list
+   * returned here can only stay the same size or grow on successive calls.
+   */
+  getNamesOfRenderedOps(): string[] {
+    return this.renderedOpNames;
+  }
+
+  buildSubhierarchy(nodeName: string): void {
+    // Terminate if the rendering hierarchy was already constructed
+    // for this node.
+    if (nodeName in this.hasSubhierarchy) {
+      return;
+    }
+
+    let renderNodeInfo = this.index[nodeName];
+
+    // If it is not a meta node or a series node, don't do anything.
+    if (renderNodeInfo.node.type !== NodeType.META &&
+        renderNodeInfo.node.type !== NodeType.SERIES) {
+      return;
+    }
+
+    // At this point we know the rendering information is about a group node.
+    let renderGroupNodeInfo = <RenderGroupNodeInfo> renderNodeInfo;
+    let metagraph = renderGroupNodeInfo.node.metagraph;
+    let coreGraph = renderGroupNodeInfo.coreGraph;
+
+    // Create render nodes to represent each child from the metagraph. Although
+    // these will initially be added to the coreGraph, they may later be
+    // extracted. Also, due to extraction, the coreGraph may contain disjoint
+    // groups between which there is no visible path (other than annotations).
+    _.each(metagraph.nodes(), childName => {
+
+      let childRenderInfo = this.getOrCreateRenderNodeByName(childName);
+      let childNode = childRenderInfo.node;
+
+      coreGraph.setNode(childName, childRenderInfo);
+
+      if (!childNode.isGroupNode) {
+        _.each((<OpNode>childNode).inEmbeddings, embedding => {
+          let renderMetaedgeInfo = new RenderMetaedgeInfo(null);
+          addInAnnotation(childRenderInfo, embedding, null, renderMetaedgeInfo,
+              AnnotationType.CONSTANT);
+          this.index[embedding.name] = new RenderNodeInfo(embedding);
+        });
+        _.each((<OpNode>childNode).outEmbeddings, embedding => {
+          let renderMetaedgeInfo = new RenderMetaedgeInfo(null);
+          addOutAnnotation(childRenderInfo, embedding, null, renderMetaedgeInfo,
+              AnnotationType.SUMMARY);
+          this.index[embedding.name] = new RenderNodeInfo(embedding);
+        });
+      }
+
+    });
+
+    // Add render metaedge info for edges in the metagraph.
+    _.each(metagraph.edges(), edgeObj => {
+      let metaedge = metagraph.edge(edgeObj);
+      let renderMetaedgeInfo = new RenderMetaedgeInfo(metaedge);
+      renderMetaedgeInfo.isFadedOut =
+          this.index[edgeObj.v].isFadedOut || this.index[edgeObj.w].isFadedOut;
+      coreGraph.setEdge(edgeObj.v, edgeObj.w, renderMetaedgeInfo);
+    });
+
+    if (PARAMS.enableExtraction &&
+        renderGroupNodeInfo.node.type === NodeType.META) {
+      extractHighDegrees(renderGroupNodeInfo);
+    }
+
+    // Record that we constructed the rendering hierarchy for this node, so we
+    // don't construct it another time.
+    this.hasSubhierarchy[nodeName] = true;
+
+    // Look up the parent node's render information and short circuit if none.
+    let parentNode = renderGroupNodeInfo.node.parentNode;
+    if (!parentNode) {
+      return;
+    }
+    let parentNodeInfo =
+      <RenderGroupNodeInfo> this.index[parentNode.name];
+
+    // Utility function for computing the name of a bridge node.
+    let getBridgeNodeName = (inbound, ...rest) =>
+        rest.concat([inbound ? 'IN' : 'OUT']).join('~~');
+
+    // Build out the bridgegraph.
+    let bridgegraph = this.hierarchy.getBridgegraph(nodeName);
+
+    // Look for popular nodes so we can make annotations instead of paths.
+    let otherCounts = {
+      // Counts of edges coming INTO other nodes by name (outgoing from self).
+      in: <{[nodeName: string]: number}> {},
+      // Counts of edges going OUT from other nodes by name (coming into self).
+      out: <{[nodeName: string]: number}> {},
+      // Counts of all control edges involving other nodes by name.
+      control: <{[nodeName: string]: number}> {},
+    };
+    _.each(bridgegraph.edges(), e => {
+      // An edge is inbound if its destination node is in the metagraph.
+      let inbound = !!metagraph.node(e.w);
+      let otherName = inbound ? e.v : e.w;
+      let metaedge = bridgegraph.edge(e);
+      if (!metaedge.numRegularEdges) {
+        otherCounts.control[otherName] =
+          (otherCounts.control[otherName] || 0) + 1;
+      } else if (inbound) {
+        otherCounts.out[otherName] = (otherCounts.out[otherName] || 0) + 1;
+      } else {
+        otherCounts.in[otherName] = (otherCounts.in[otherName] || 0) + 1;
+      }
+    });
+
+    // Add annotations and edges for bridgegraph relationships.
+    let hierarchyNodeMap = this.hierarchy.getNodeMap();
+    _.each(bridgegraph.edges(), bridgeEdgeObj => {
+      let bridgeMetaedge = bridgegraph.edge(bridgeEdgeObj);
+
+      // Determine whether this bridge edge is incoming by checking the
+      // metagraph for a node that matches the destination end.
+      let inbound = !!metagraph.node(bridgeEdgeObj.w);
+
+      // Based on the direction of the edge, one endpoint will be an immediate
+      // child of this renderNodeInfo, and the other endpoint will be a sibling
+      // of the parent (or an ancestor further up).
+      let [childName, otherName] =
+        inbound ?
+          [bridgeEdgeObj.w, bridgeEdgeObj.v] :
+          [bridgeEdgeObj.v, bridgeEdgeObj.w];
+
+      let childRenderInfo = this.index[childName];
+      let otherRenderInfo = this.index[otherName];
+      let otherNode =
+        otherRenderInfo ?
+          otherRenderInfo.node :
+          hierarchyNodeMap[otherName];
+
+      // Determine whether this edge is a control edge between nodes where
+      // either node is high-degree with respect to control edges. This will
+      // be a signal to show it as an annotation instead of a bridge edge.
+      let isHighDegreeControlEdge = !bridgeMetaedge.numRegularEdges &&
+        otherCounts.control[otherName] > PARAMS.maxControlDegree;
+
+      let [, childAnnotations] =
+        inbound ?
+          [renderNodeInfo.inAnnotations, childRenderInfo.inAnnotations] :
+          [renderNodeInfo.outAnnotations, childRenderInfo.outAnnotations];
+
+      // Don't render a bridge path if the other node has in or out degree above
+      // a threshold, lest bridge paths emanating out of a metagraph crowd up,
+      // as was the case for the Fatcat LSTM lstm_1 > lstm_1 metagraph.
+      let otherDegreeCount =
+          (inbound ? otherCounts.out : otherCounts.in)[otherName];
+      let isOtherHighDegree = otherDegreeCount > PARAMS.maxBridgePathDegree;
+
+      // The adjoining render metaedge info from the parent's coreGraph, if any.
+      // It will either be a Metaedge involving this node directly, if it
+      // previously came from a metagraph, or it'll be a Metaedge involving
+      // a previously created bridge node standing in for the other node.
+      let adjoiningMetaedge = null;
+
+      // We can only hope to render a bridge path if:
+      //  - bridgegraph paths are enabled,
+      //  - the other node is not too high-degree,
+      //  - the child is in the core (not extracted for being high-degree), and
+      //  - there's a path (in the traversal sense) between child and other.
+      let canDrawBridgePath = false;
+      if (PARAMS.enableBridgegraph &&
+          !isOtherHighDegree &&
+          !isHighDegreeControlEdge &&
+          childRenderInfo.isInCore()) {
+
+        // Utility function for finding an adjoining metaedge.
+        let findAdjoiningMetaedge = targetName => {
+          let adjoiningEdgeObj: graphlib.EdgeObject =
+            inbound ?
+              { v: targetName, w: nodeName } :
+              { v: nodeName, w: targetName };
+          return <RenderMetaedgeInfo>
+            parentNodeInfo.coreGraph.edge(adjoiningEdgeObj);
+        };
+
+        adjoiningMetaedge = findAdjoiningMetaedge(otherName);
+        if (!adjoiningMetaedge) {
+          adjoiningMetaedge = findAdjoiningMetaedge(
+              getBridgeNodeName(inbound, otherName, parentNode.name));
+        }
+
+        canDrawBridgePath = !!adjoiningMetaedge;
+      }
+
+      // Although dataflow edges are acyclic, control dependency edges may
+      // actually point 'backwards' in the graph. If this bridgeMetaedge is
+      // a control dependency, we need to determine whether it's backwards
+      // pointing so that we render it appropriately.
+      //
+      // For instance, say we're rendering a graph with nodes named A/B and Z/Y,
+      // and we're currently rendering the bridgegraph for A. Further, let's say
+      // that there was an original BaseEdge from A/B->Z/Y and a CONTROL EDGE
+      // from Z/Y=>A/B.
+      //
+      //     +----------------+
+      //     | A              |
+      //     |  +-----+       |         +------+
+      //     |  | B   |>----->|>------->| Z    |
+      //     |  |     |       |         |      |
+      //     |  |     |   *   |         |      |
+      //     |  |     |<=====<|<=======<|      |
+      //     |  +-----+       |         +------+
+      //     +----------------+
+      //
+      // When we render the subhierarchy for Metanode A, we'll come across a
+      // control-only Metaedge in the bridgegraph from Z=>A/B (*). The question
+      // is whether this edge is backwards.
+      //
+      // To answer that question, we follow the chain of adjoining metaedges
+      // until we reach the topmost one. In this case, that's the control-only
+      // Metaedge Z=>A in the ROOT's metagraph. We determine that this edge
+      // is backwards by looking at the topological ordering of ROOT's metagraph
+      // (which ignores control edges) and seeing that Z comes AFTER A.
+      //
+      // The property of being backwards is independent of whether the edge
+      // is inbound or outbound. In the preceding example, if we were building
+      // the subhierarchy for Z, we'd find bridge edge Z/Y=>A, walk to its
+      // topmost adjoining metaedge Z=>A and discover that it's backwards.
+      let backwards = false;
+      if (adjoiningMetaedge && !bridgeMetaedge.numRegularEdges) {
+        // Find the top-most adjoining render metaedge information, and the
+        // GroupNode whose metagraph must contain the associated metaedge.
+        let topAdjoiningMetaedge = adjoiningMetaedge;
+        let topGroupNode = parentNodeInfo.node;
+        while (topAdjoiningMetaedge.adjoiningMetaedge) {
+          topAdjoiningMetaedge = topAdjoiningMetaedge.adjoiningMetaedge;
+          topGroupNode = <GroupNode>topGroupNode.parentNode;
+        }
+
+        // Check against the topological ordering for the top node. The current
+        // bridge metaedge we're evaluating is backwards if its source comes
+        // after its destination.
+        let ordering = this.hierarchy.getTopologicalOrdering(topGroupNode.name);
+        let e = topAdjoiningMetaedge.metaedge;
+        backwards = ordering[e.v] > ordering[e.w];
+      }
+
+      // Render backwards control edges as annotations.
+      canDrawBridgePath = canDrawBridgePath && !backwards;
+
+      // If we can't make a bridge path for any reason, then we add an
+      // annotation instead.
+      if (!canDrawBridgePath) {
+        childAnnotations.push(new Annotation(
+            otherNode,
+            otherRenderInfo,
+            new RenderMetaedgeInfo(bridgeMetaedge),
+            AnnotationType.SHORTCUT,
+            inbound));
+        return;
+      }
+
+      // At this point, all conditions have been met for drawing a bridge path.
+
+      // Find or create the IN/OUT node representing otherNode.
+      let bridgeContainerName = getBridgeNodeName(inbound, nodeName);
+      let bridgeNodeName = getBridgeNodeName(inbound, otherName, nodeName);
+      let bridgeNodeRenderInfo = coreGraph.node(bridgeNodeName);
+      if (!bridgeNodeRenderInfo) {
+
+        // Find or create the directional container for the bridge node.
+        let bridgeContainerInfo = coreGraph.node(bridgeContainerName);
+        if (!bridgeContainerInfo) {
+          let bridgeContainerNode: BridgeNode = {
+            // Important node properties.
+            name: bridgeContainerName,
+            type: NodeType.BRIDGE,
+            // Unused node properties.
+            isGroupNode: false,
+            cardinality: 0,
+            parentNode: null,
+            stats: null,
+            include: InclusionType.UNSPECIFIED,
+            // BridgeNode properties.
+            inbound: inbound,
+            nodeAttributes: {},
+          };
+          bridgeContainerInfo =
+            new RenderNodeInfo(bridgeContainerNode);
+          this.index[bridgeContainerName] = bridgeContainerInfo;
+          coreGraph.setNode(bridgeContainerName, bridgeContainerInfo);
+        }
+
+        let bridgeNode: BridgeNode = {
+          // Important node properties.
+          name: bridgeNodeName,
+          type: NodeType.BRIDGE,
+          // Unimportant node properties.
+          isGroupNode: false,
+          cardinality: 1,
+          parentNode: null,
+          stats: null,
+          include: InclusionType.UNSPECIFIED,
+          // BridgeNode properties.
+          inbound: inbound,
+          nodeAttributes: {},
+        };
+        bridgeNodeRenderInfo = new RenderNodeInfo(bridgeNode);
+        this.index[bridgeNodeName] = bridgeNodeRenderInfo;
+        coreGraph.setNode(bridgeNodeName, bridgeNodeRenderInfo);
+
+        // Set bridgeNode to be a graphlib child of the container node.
+        coreGraph.setParent(bridgeNodeName, bridgeContainerName);
+        bridgeContainerInfo.node.cardinality++;
+      }
+
+      // Create and add a bridge render metaedge.
+      let bridgeRenderMetaedge =
+        new RenderMetaedgeInfo(bridgeMetaedge);
+      bridgeRenderMetaedge.adjoiningMetaedge = adjoiningMetaedge;
+      inbound ?
+        coreGraph.setEdge(bridgeNodeName, childName, bridgeRenderMetaedge) :
+        coreGraph.setEdge(childName, bridgeNodeName, bridgeRenderMetaedge);
+
+    }); // End _.each(bridgegraph.edges).
+
+    // For each bridge container (IN and/or OUT), add structural edges between
+    // terminal nodes and that container. A terminal node is one which has no
+    // non-bridge edges in the direction of the container.
+    //
+    // For example, consider a Metanode A which contains two child nodes A/B
+    // and A/C. Let's say it has one edge in the metagraph from A/B->A/C, and
+    // one edge in the bridgegraph from Z->A/C.
+    //
+    // At this point, we've added a container bridge node IN to house all
+    // incoming bridge nodes. We've also added a bridge node Z' (with parent IN)
+    // to A, and a bridge edge from Z'->C.
+    //
+    //     +----------------------+
+    //     | A          +---+     |
+    //     |    +------>| C |     |
+    //     |    |       +---+     |
+    //     |    |         ^       |
+    //     |    |         |       |
+    //     |    |    +----|----+  |
+    //     |    |    | IN |    |  |
+    //     |  +---+  |  +---+  |  |
+    //     |  | B |  |  | Z'|  |  |
+    //     |  +---+  |  +---+  |  |
+    //     |         +---------+  |
+    //     +----------------------+
+    //
+    // With no other help, dagre would lay out B and Z' on the same level,
+    // because both of them have no incoming edges. In other words, B is a
+    // terminal node in the INCOMING direction.
+    //
+    // But we want to force dagre to lay out Z' (and everything in IN) lower
+    // than all non-bridge nodes, so that there's enough room for the bridge
+    // edges after they've been adjusted to meet up with paths coming in from
+    // outside.
+    //
+    // To force Z' (and all other bridge nodes) to be lowest in the graph, we
+    // identify terminal nodes like B and give them structural edges to
+    // a new structural bridge node S which we add to IN.
+    //
+    //     +----------------------+
+    //     | A          +---+     |
+    //     |       +--->| C |     |
+    //     |       |    +---+     |
+    //     |     +---+    ^       |
+    //     |     | B |    |       |
+    //     |     +---+    |       |
+    //     |       ^      |       |
+    //     |       |      |       |
+    //     |  +----|------|----+  |
+    //     |  |IN  |      |    |  |
+    //     |  |  +---+  +---+  |  |
+    //     |  |  | S |  | Z'|  |  |
+    //     |  |  +---+  +---+  |  |
+    //     |  +----------------+  |
+    //     +----------------------+
+    //
+    // This ensures that dagre will lay out the bridge containers strictly at
+    // the ends of the graph. The structural edges will never be seen in the
+    // visualization except as a debugging aid.
+    _.each([true, false], inbound => {
+      let bridgeContainerName = getBridgeNodeName(inbound, nodeName);
+      let bridgeContainerInfo = coreGraph.node(bridgeContainerName);
+      if (!bridgeContainerInfo) {
+        return;
+      }
+      _.each(coreGraph.nodes(), childName => {
+        // Short-circuit if this child is a bridge node or it's not a terminal
+        // node in the direction we're interested in.
+        let childNodeInfo = coreGraph.node(childName);
+        if (childNodeInfo.node.type === NodeType.BRIDGE) {
+          return;
+        }
+        let isTerminal = inbound ?
+          !coreGraph.predecessors(childName).length :
+          !coreGraph.successors(childName).length;
+        if (!isTerminal) {
+          return;
+        }
+
+        // Find or create a bridge node in the container for all structural
+        // metaedges. It would have been nice to skip this step and simply
+        // set a metaedge between the terminal node and the container node, but
+        // in that case, something about the graph upsets dagre.layout()'s
+        // longestPath algorithm (was getting errors due to an undefined).
+        let structuralNodeName =
+            getBridgeNodeName(inbound, nodeName, 'STRUCTURAL_TARGET');
+        let structuralRenderInfo = coreGraph.node(structuralNodeName);
+        if (!structuralRenderInfo) {
+          let bridgeNode: BridgeNode = {
+            // Important Node properties.
+            name: structuralNodeName,
+            type: NodeType.BRIDGE,
+            // Unimportant Node properties.
+            isGroupNode: false,
+            cardinality: 1,
+            parentNode: null,
+            stats: null,
+            include: InclusionType.UNSPECIFIED,
+            // BridgeNode properties.
+            inbound: inbound,
+            nodeAttributes: {},
+          };
+          structuralRenderInfo = new RenderNodeInfo(bridgeNode);
+          structuralRenderInfo.structural = true;
+          this.index[structuralNodeName] = structuralRenderInfo;
+          coreGraph.setNode(structuralNodeName, structuralRenderInfo);
+          bridgeContainerInfo.node.cardinality++;
+          coreGraph.setParent(structuralNodeName, bridgeContainerName);
+        }
+
+        // Create the structural Metaedge and insert it.
+        let structuralMetaedgeInfo = new RenderMetaedgeInfo(null);
+        structuralMetaedgeInfo.structural = true;
+        structuralMetaedgeInfo.weight--; // Reduce weight for dagre layout.
+        inbound ?
+          coreGraph.setEdge(
+              structuralNodeName, childName, structuralMetaedgeInfo) :
+          coreGraph.setEdge(
+              childName, structuralNodeName, structuralMetaedgeInfo);
+      });
+    });
+  }
+}
+
+/**
+ * A class for rendering annotation object which contains label
+ * about the node embedded as annotation, type of annotation and the location
+ * of both the annotation's node and edge.
+ *
+ * Annotation objects include embedded constants, embedded summary, and
+ * edge shortcuts.
+ */
+export class Annotation {
+  node: Node;
+  renderNodeInfo: RenderNodeInfo;
+  renderMetaedgeInfo: RenderMetaedgeInfo;
+  annotationType: AnnotationType;
+  /**
+   * Center position of annotation relative to the host
+   * node's center x.
+   */
+  dx: number;
+  /**
+   * Center position of annotation relative to the host
+   * node's center y.
+   */
+  dy: number;
+  width: number;
+  height: number;
+  /**
+   * The names of nodes on either side of this edge.
+   */
+  v: string;
+  w: string;
+  /**
+   * A flag whether it is an in-annotation (if true) or
+   * out-annotation  (if false).
+   */
+  isIn: boolean;
+  /** Label horizontal offset from the end of the node shape */
+  labelOffset: number;
+  /**
+   * Array of points for edges from the annotation to its host
+   * node. Each point contains the point location, relative to
+   * the host node's center.
+   */
+  points: {dx: number, dy: number}[];
+
+  /**
+   * Creates a new Annotation.
+   *
+   * @param node The underlying node this annotation points to.
+   * @param renderNodeInfo The render information for the underlying node
+   *     this annotation points to. This can be null if the annotation
+   *     denotes an embedding (constant, summary), in which case we
+   *     use the node property.
+   * @param renderMetaedgeInfo The render information for the edge associated
+   *     with the annotation.
+   * @param type The type of the annotation.
+   * @param isIn True if it is an in-annotation. False if it is an
+   *     out-annotation.
+   */
+  constructor(node: Node, renderNodeInfo: RenderNodeInfo,
+      renderMetaedgeInfo: RenderMetaedgeInfo, type: AnnotationType,
+      isIn: boolean) {
+    this.node = node;
+    this.renderNodeInfo = renderNodeInfo;
+    this.renderMetaedgeInfo = renderMetaedgeInfo;
+    this.annotationType = type;
+    // Properties specified by layout
+    this.dx = 0;
+    this.dy = 0;
+    this.width = 0;
+    this.height = 0;
+    // Properties needed for generating an ID for the edge's path element if
+    // this annotation is associated with a metaedge.
+    if (renderMetaedgeInfo && renderMetaedgeInfo.metaedge) {
+      this.v = renderMetaedgeInfo.metaedge.v;
+      this.w = renderMetaedgeInfo.metaedge.w;
+    }
+
+    this.isIn = isIn;
+    this.points = [];
+  }
+};
+
+export enum AnnotationType {SHORTCUT, CONSTANT, SUMMARY, ELLIPSIS};
+
+/**
+ * Manages a list of annotations. Two will be used for each
+ * RenderNodeInfo, one for in annotations and one for out annotations.
+ */
+export class AnnotationList {
+  /**
+   * List of visually drawable annotations, may include an ellipses annotation
+   * if the number added exceeds the number specified by maxAnnotations.
+   */
+  list: Annotation[];
+
+  /**
+   * Set of nodes which have been added as annotations to this list, so we can
+   * prevent duplicates.
+   */
+  nodeNames: { [nodeName: string]: boolean };
+
+  constructor() {
+    this.list = [];
+    this.nodeNames = {};
+  }
+
+  /**
+   * Append an annotation to the list, or a stand-in ellipsis annotation instead
+   * if this would make it too many.
+   */
+  push(annotation: Annotation): void {
+    if (annotation.node.name in this.nodeNames) {
+      return; // Skip duplicate annotation.
+    }
+    this.nodeNames[annotation.node.name] = true;
+
+    if (this.list.length < PARAMS.maxAnnotations) {
+      this.list.push(annotation);
+      return;
+    }
+
+    let lastAnnotation = this.list[this.list.length - 1];
+    if (lastAnnotation.annotationType === AnnotationType.ELLIPSIS) {
+      let ellipsisNode = <EllipsisNode>lastAnnotation.node;
+      ellipsisNode.setNumMoreNodes(++ellipsisNode.numMoreNodes);
+      return;
+    }
+
+    let ellipsisNode = new tf.graph.EllipsisNodeImpl(1);
+    this.list.push(new Annotation(ellipsisNode,
+        new RenderNodeInfo(ellipsisNode), null,
+        AnnotationType.ELLIPSIS, annotation.isIn));
+  }
+}
+
+/**
+ * Contains rendering information about a node in the hierarchical graph.
+ */
+export class RenderNodeInfo {
+  /** Reference to the original underlying Node from the hierarchical graph. */
+  node: Node;
+  /** Whether the node is expanded or not. */
+  expanded: boolean;
+  /**
+   * List of rendering information about in-annotations like constants and
+   * shortcuts to high-degree nodes.
+   */
+  inAnnotations: AnnotationList;
+  /**
+   * List of rendering information about out-annotations (e.g. summary nodes)
+   */
+  outAnnotations: AnnotationList;
+
+  // --- Params specified by layout --- //
+
+  /** Center x position */
+  x: number;
+  /** Center y position */
+  y: number;
+  /**
+   * Total width of the node's shape, including in- and out-annotations. This
+   * property is used by dagre to layout the graph.
+   */
+  width: number;
+  /**
+   * Total height of the node's shape, including in- and out-annotations. This
+   * property is used by dagre to layout the graph.
+   */
+  height: number;
+  /**
+   * Size of the main box of the node, excluding in- and out-annotations. This
+   * property is used to draw the rectangle/ellipse shape denoting the node.
+   */
+  coreBox: {
+    width: number,
+    height: number,
+  };
+
+  /** Width of the bounding box for all in-annotations. */
+  inboxWidth: number;
+  /** Width of the bounding box for all out-annotations. */
+  outboxWidth: number;
+  /**
+   * Whether the node should be excluded from the scene.
+   * This is only used when there are too many items in a series so we only
+   * want to include top N ones.
+   */
+  // TODO(jimbo): Now that series rendering is non-recursive, remove this and
+  // all its uses from the code base.
+  excluded: boolean;
+
+  // --- Params used in drawing the bridge paths --- //
+
+  /**
+   * All bridge nodes are meant to be invisible, but whereas most represent a
+   * relationship from the underlying graph hierarchy, some exist solely for
+   * layout reasons. Specifically, those bridge nodes which have only structural
+   * rendering metaedges.
+   */
+  structural: boolean;
+
+  // --- Params for the size of the node box --- //
+
+  /** Label vertical offset from the center of node shape */
+  labelOffset: number;
+  /** Rectangle radius (for making rounded rectangle) */
+  radius: number;
+
+  // --- Params for expanded node --- //
+
+  /** Label height for expanded node. */
+  labelHeight: number;
+  // Paddings between inner subscene and the border of the expanded node.
+  paddingTop: number;
+  paddingLeft: number;
+  paddingRight: number;
+  paddingBottom: number;
+
+  /**
+   * Whether a node is extracted as source-like (having high out-degree or
+   * matching predefined in-extract pattern.)
+   */
+  isInExtract: boolean;
+  /**
+   * Whether a node is extracted as sink-like (having high in-degree or matching
+   * predefined out-extract pattern.)
+   */
+  isOutExtract: boolean;
+
+  /**
+   * List of (color, proportion) tuples based on the proportion of devices of
+   * its children. If this node is an op node, this list will have only one
+   * color with proportion 1.0.
+   */
+  deviceColors: Array<{color: string, proportion: number}>;
+
+  /**
+   * Color according to the XLA cluster of this node.
+   */
+  xlaClusterColor: string;
+
+  /**
+   * Color according to the memory usage of this node.
+   */
+  memoryColor: string;
+
+  /**
+   * Color according to the compute time of this node.
+   */
+  computeTimeColor: string;
+
+  /**
+   * Whether this node is faded out. Used when displaying stats.
+   */
+  isFadedOut: boolean;
+
+  constructor(node: Node) {
+    this.node = node;
+    this.expanded = false;
+    this.inAnnotations = new AnnotationList();
+    this.outAnnotations = new AnnotationList();
+    // Params specified by layout
+    this.x = 0;
+    this.y = 0;
+    this.width = 0;
+    this.height = 0;
+    this.inboxWidth = 0;
+    this.outboxWidth = 0;
+
+    this.excluded = false;
+
+    // Params for bridge paths.
+    this.structural = false;
+
+    // Params for node box.
+    this.labelOffset = 0;
+    this.radius = 0;
+
+    // Params for expanded node
+    this.labelHeight = 0;
+    this.paddingTop = 0;
+    this.paddingLeft = 0;
+    this.paddingRight = 0;
+    this.paddingBottom = 0;
+    this.isInExtract = false;
+    this.isOutExtract = false;
+    this.coreBox = {width: 0, height: 0};
+
+    // By default, we don't fade nodes out. Default to false for safety.
+    this.isFadedOut = false;
+  }
+
+  isInCore(): boolean {
+    return !this.isInExtract && !this.isOutExtract;
+  }
+}
+
+/**
+ * Contains rendering information about a Metaedge from the underlying
+ * hierarchical graph. It may be from either a metagraph or a bridgegraph.
+ */
+export class RenderMetaedgeInfo {
+  /**
+   * Reference to the original underlying Metaedge from the hierarchical graph,
+   * if any. This will be null for the edges which connect OpNodes to their
+   * embeddings, for example.
+   */
+  metaedge: Metaedge;
+
+  /**
+   * Reference to the adjoining RenderMetaedgeInfo from the parent's
+   * coreGraph. This is used during layout to determine the point at which this
+   * edge should touch the node's bounding box. This property will be null for
+   * edges which terminate at a node on both ends (all non-bridge edges).
+   */
+  adjoiningMetaedge: RenderMetaedgeInfo;
+
+  /**
+   * Most of the time, a RenderMetaedgeInfo object represents a real
+   * edge between nodes in the underlying graph structure. But sometimes, an
+   * edge only exists for layout purposes. These structural edges are added
+   * during buildSubhierarchy() to force dagre.layout() to put bridge nodes
+   * at the ends of the flow.
+   * @see buildSubhierarchy()
+   */
+  structural: boolean;
+
+  /**
+   * Weight of the edge, used by dagre when deciding how important an edge is.
+   * Edges with higher weight are made shorter and straighter. The default
+   * dagre uses is 1.
+   */
+  weight: number;
+
+  /**
+   * X and Y coordinate pairs of the points in the path of the edge.
+   * @see tf.graph.node.subsceneAdjustPaths
+   */
+  points: Point[];
+
+  /**
+   * D3 selection of the group containing the path that displays this edge.
+   */
+  edgeGroup: d3.Selection<RenderMetaedgeInfo & any, any, any, any>;
+
+  /** Id of the <marker> used as a start-marker for the edge path. */
+  startMarkerId: string;
+
+  /** Id of the <marker> used as an end-marker for the edge path. */
+  endMarkerId: string;
+
+  /**
+   * Whether this edge is faded out. Used for fading out unused edges when
+   * displaying run statistics.
+   */
+  isFadedOut: boolean;
+
+  constructor(metaedge: Metaedge) {
+    this.metaedge = metaedge;
+    this.adjoiningMetaedge = null;
+    this.structural = false;
+    this.weight = 1;
+    this.isFadedOut = false;
+  }
+}
+
+function addInAnnotation(node: RenderNodeInfo, predecessor: Node,
+    predecessorRenderInfo: RenderNodeInfo,
+    edge: RenderMetaedgeInfo, type: AnnotationType): void {
+  let annotation = new Annotation(predecessor, predecessorRenderInfo, edge,
+      type, true);
+  node.inAnnotations.push(annotation);
+}
+
+function addOutAnnotation(node: RenderNodeInfo, successor: Node,
+    successorRenderInfo: RenderNodeInfo, edge: RenderMetaedgeInfo,
+    type: AnnotationType): void {
+  let annotation = new Annotation(successor, successorRenderInfo, edge,
+      type, false);
+  node.outAnnotations.push(annotation);
+}
+
+function setGraphDepth(graph: graphlib.Graph<RenderNodeInfo, any>,
+    depth: number) {
+  _.each(graph.nodes(), nodeName => {
+    let child = graph.node(nodeName);
+    child.expanded = depth > 1; // set all child of depth 1 to collapsed
+    if (depth > 0) {
+      switch (child.node.type) {
+        case NodeType.META:
+        case NodeType.SERIES:
+          setGroupNodeDepth(<RenderGroupNodeInfo>child, depth - 1);
+          break;
+        // Do nothing for leaf
+      }
+    }
+  });
+};
+
+export class RenderGroupNodeInfo extends RenderNodeInfo {
+  node: GroupNode;
+  /**
+   * The core graph is derived from the underlying node's metagraph, minus
+   * the extracted source-like and sink-like nodes.
+   */
+  coreGraph: graphlib.Graph<RenderNodeInfo, RenderMetaedgeInfo>;
+  /** Size of the bounding box for a metanode's isolated in-extract children. */
+  inExtractBox: {width: number, height: number};
+  /**
+   * Size of the bounding box for a metanode's isolated out-extract children.
+   */
+  outExtractBox: {width: number, height: number};
+  /** Array of isolated in-extract nodes. */
+  isolatedInExtract: RenderNodeInfo[];
+  /** Array of isolated out-extract nodes. */
+  isolatedOutExtract: RenderNodeInfo[];
+
+  constructor(groupNode: GroupNode) {
+    super(groupNode);
+    let metagraph = groupNode.metagraph;
+    let gl = metagraph.graph();
+    this.coreGraph =
+        createGraph<RenderNodeInfo, RenderMetaedgeInfo>(
+            gl.name, GraphType.CORE, { compound: true });
+    this.inExtractBox = {width: 0, height: 0};
+    this.outExtractBox = {width: 0, height: 0};
+    this.isolatedInExtract = [];
+    this.isolatedOutExtract = [];
+  }
+}
+
+function setGroupNodeDepth(renderInfo: RenderGroupNodeInfo,
+    depth: number): void {
+  if (renderInfo.coreGraph) {
+    setGraphDepth(renderInfo.coreGraph, depth);
+  }
+}
+
+/**
+ * Remove an edge from the graph and add annotations to both ends of the edge.
+ *
+ * @param The core graph.
+ * @param v Source name.
+ * @param w Sink name.
+ */
+function createShortcut(
+    graph: graphlib.Graph<RenderNodeInfo, RenderMetaedgeInfo>,
+    v: string, w: string) {
+  let src = graph.node(v);
+  let sink = graph.node(w);
+  let edge = graph.edge(v, w);
+
+  // If either of the nodes is explicitly included in the main graph and
+  // both nodes are in the main graph then do not create the shortcut
+  // and instead keep the real edge.
+  if ((src.node.include === InclusionType.INCLUDE ||
+       sink.node.include === InclusionType.INCLUDE) &&
+      src.node.include !== InclusionType.EXCLUDE &&
+      sink.node.include !== InclusionType.EXCLUDE) {
+    return;
+  }
+
+  // Add each annotation.
+  addOutAnnotation(src, sink.node, sink, edge, AnnotationType.SHORTCUT);
+  addInAnnotation(sink, src.node, src, edge, AnnotationType.SHORTCUT);
+
+  // Remove the edge from the core graph.
+  graph.removeEdge(v, w);
+}
+
+/**
+ * Remove edges from a node, and set its isOutExtract property to true,
+ * and remove the node and move it to isolatedOutExtract.
+ *
+ * If detachAllEdgesForHighDegree or forceDetach is true, extract all of its
+ * edges. Otherwise, only extract all in-edges.
+ */
+function makeOutExtract(renderNode: RenderGroupNodeInfo, n: string,
+    forceDetach?: boolean) {
+  let graph = renderNode.coreGraph;
+  let child = graph.node(n);
+  child.isOutExtract = true;
+
+  _.each(graph.predecessors(n), (p, index) => {
+    createShortcut(graph, p, n);
+  });
+
+  if (PARAMS.detachAllEdgesForHighDegree || forceDetach) {
+    _.each(graph.successors(n), (s, index) => {
+      createShortcut(graph, n, s);
+    });
+  }
+
+  // Remove the node from the core graph if it no longer has neighbors.
+  if (graph.neighbors(n).length === 0) {
+    child.node.include = InclusionType.EXCLUDE;
+    renderNode.isolatedOutExtract.push(child);
+    graph.removeNode(n);
+  }
+}
+
+/**
+ * Remove edges from a node, set its isInExtract property to true,
+ * and remove the node and move it to isolatedInExtract.
+ *
+ * If detachAllEdgesForHighDegree or forceDetach is true, extract all of its
+ * edges. Otherwise, only remove all out-edges.
+ */
+export function makeInExtract(renderNode: RenderGroupNodeInfo, n: string,
+    forceDetach?: boolean) {
+  let graph = renderNode.coreGraph;
+  let child = graph.node(n);
+  child.isInExtract = true;
+
+  _.each(graph.successors(n), (s, index) => {
+    createShortcut(graph, n, s);
+  });
+
+  if (PARAMS.detachAllEdgesForHighDegree || forceDetach) {
+    _.each(graph.predecessors(n), (p, index) => {
+      createShortcut(graph, p, n);
+    });
+  }
+
+  // Remove the node from the core graph if it no longer has neighbors.
+  if (graph.neighbors(n).length === 0) {
+    child.node.include = InclusionType.EXCLUDE;
+    renderNode.isolatedInExtract.push(child);
+    graph.removeNode(n);
+  }
+}
+
+/**
+ * Check whether the node's type is a member of the given list of types.
+ *
+ * @param node Node.
+ * @param types List of type to match.
+ */
+function hasTypeIn(node: Node, types: string[]): boolean {
+  if (node.type === NodeType.OP) {
+    for (let i = 0; i < types.length; i++) {
+      if ((<OpNode>node).op === types[i]) { return true; }
+    }
+  } else if (node.type === NodeType.META) {
+    let rootOpNode = (<Metanode>node).getRootOp();
+    if (rootOpNode) {
+      for (let i = 0; i < types.length; i++) {
+        if (rootOpNode.op === types[i]) { return true; }
+      }
+    }
+  }
+  return false;
+}
+
+/** Move nodes that are specified to be excluded out of the core graph. */
+function extractSpecifiedNodes(renderNode: RenderGroupNodeInfo) {
+  let graph = renderNode.coreGraph;
+  _.each(graph.nodes(), n => {
+    let renderInfo = graph.node(n);
+    if (renderInfo.node.include === InclusionType.EXCLUDE) {
+      if (renderNode.coreGraph.outEdges(n).length >
+          renderNode.coreGraph.inEdges(n).length) {
+        makeOutExtract(renderNode, n, true);
+      } else {
+        makeInExtract(renderNode, n, true);
+      }
+    }
+  });
+}
+
+/** Remove edges from pre-defined out-extract patterns */
+function extractPredefinedSink(renderNode: RenderGroupNodeInfo) {
+  let graph = renderNode.coreGraph;
+  _.each(graph.nodes(), n => {
+    let renderInfo = graph.node(n);
+    if (renderInfo.node.include !== InclusionType.UNSPECIFIED) {
+      return;
+    }
+    if (hasTypeIn(renderInfo.node, PARAMS.outExtractTypes)) {
+      makeOutExtract(renderNode, n);
+    }
+  });
+}
+
+/** Remove edges from pre-defined in-extract patterns */
+function extractPredefinedSource(renderNode) {
+  let graph = renderNode.coreGraph;
+  _.each(graph.nodes(), n => {
+    let renderInfo = graph.node(n);
+    if (renderInfo.node.include !== InclusionType.UNSPECIFIED) {
+      return;
+    }
+    if (hasTypeIn(renderInfo.node, PARAMS.inExtractTypes)) {
+      makeInExtract(renderNode, n);
+    }
+  });
+}
+
+/** Extract nodes deemed to have either high in-degree or high out-degree. */
+function extractHighInOrOutDegree(renderNode: RenderGroupNodeInfo) {
+  let graph = renderNode.coreGraph;
+
+  // Create mappings from node to in and out degrees. Count the number of valid
+  // nodes along the way.
+  let nodeToInDegree = {};
+  let nodeToOutDegree = {};
+  let validNodeCount = 0;
+  _.each(graph.nodes(), currentNode => {
+    if (graph.node(currentNode).node.include !== InclusionType.UNSPECIFIED) {
+      // This node is not included in the first place.
+      return;
+    }
+
+    // Count the in and out degrees based on only regular edges, unless there
+    // are no regular edges, in which case use the number of control edges.
+    // This is done so that control edges don't affect if nodes are extracted
+    // from the core graph, unless the node is only used for control.
+    let inDegree =
+        _.reduce(graph.predecessors(currentNode), (inDegree, pred) => {
+          let metaedge = graph.edge(pred, currentNode).metaedge;
+          return inDegree + (metaedge.numRegularEdges ? 1 : 0);
+        }, 0);
+    if (inDegree === 0 && graph.predecessors(currentNode).length > 0) {
+      inDegree = graph.predecessors(currentNode).length;
+    }
+
+    let outDegree =
+        _.reduce(graph.successors(currentNode), (outDegree, succ) => {
+          let metaedge = graph.edge(currentNode, succ).metaedge;
+          return outDegree + (metaedge.numRegularEdges ? 1 : 0);
+        }, 0);
+    if (outDegree === 0 && graph.successors(currentNode).length > 0) {
+      outDegree = graph.successors(currentNode).length;
+    }
+
+    // Store the in and out degrees of this node to avoid recomputing.
+    nodeToInDegree[currentNode] = inDegree;
+    nodeToOutDegree[currentNode] = outDegree;
+    validNodeCount++;
+  });
+
+  if (validNodeCount < PARAMS.minNodeCountForExtraction) {
+    // This graph has few nodes. Do not extract any nodes.
+    return;
+  }
+
+  // We only extract if the node has a min in or out degree greater than this.
+  let minUpperBound = PARAMS.minDegreeForExtraction - 1;
+
+  // Mark for extraction nodes with in-degree > Q3 + (Q3 - Q1).
+  let q3Index = Math.round(validNodeCount * 0.75);
+  let q1Index = Math.round(validNodeCount * 0.25);
+  let sortedByInDegree = Object.keys(nodeToInDegree).sort((node0, node1) => {
+    return nodeToInDegree[node0] - nodeToInDegree[node1];
+  });
+  let inDegreeQ3 = nodeToInDegree[sortedByInDegree[q3Index]];
+  let inDegreeQ1 = nodeToInDegree[sortedByInDegree[q1Index]];
+  let inDegreeUpperBound = inDegreeQ3 + inDegreeQ3 - inDegreeQ1;
+  // Only extract if the upper bound is high enough.
+  inDegreeUpperBound = Math.max(inDegreeUpperBound, minUpperBound);
+  for (let i = validNodeCount - 1;
+       nodeToInDegree[sortedByInDegree[i]] > inDegreeUpperBound; i--) {
+    // Extract a high in-degree node.
+    makeInExtract(renderNode, sortedByInDegree[i]);
+  }
+
+  // Mark for extraction nodes with out-degree > Q3 + (Q3 - Q1) * 4.
+  let sortedByOutDegree = Object.keys(nodeToOutDegree).sort((node0, node1) => {
+    return nodeToOutDegree[node0] - nodeToOutDegree[node1];
+  });
+  let outDegreeQ3 = nodeToOutDegree[sortedByOutDegree[q3Index]];
+  let outDegreeQ1 = nodeToOutDegree[sortedByOutDegree[q1Index]];
+  // The upper bound for extracting out-degree nodes is higher than that for
+  // extracting in-degree ones (Note the "* 4") because, in practice, some
+  // graphs look worse with a smaller out-degree bound. For instance, a smaller
+  // out-degree bound removes the convolution nodes from cifar 10 train's graph.
+  let outDegreeUpperBound = outDegreeQ3 + (outDegreeQ3 - outDegreeQ1) * 4;
+  // Only extract if the upper bound is high enough.
+  outDegreeUpperBound = Math.max(outDegreeUpperBound, minUpperBound);
+  for (let i = validNodeCount - 1;
+       nodeToOutDegree[sortedByOutDegree[i]] > outDegreeUpperBound; i--) {
+    let node = graph.node(sortedByOutDegree[i]);
+    if (!node || node.isInExtract) {
+      // This node has already been extracted due to high in-degree. It might
+      // have been removed from the graph in general (during in-degree
+      // extraction) due to a lack of neighbors. Do not extract this node twice.
+      continue;
+    }
+
+    // Extract a high out-degree node that has not already been extracted.
+    makeOutExtract(renderNode, sortedByOutDegree[i]);
+  }
+}
+
+/** Remove control edges from nodes that have too many control edges */
+function removeControlEdges(renderNode: RenderGroupNodeInfo) {
+  let graph = renderNode.coreGraph;
+
+  // Collect control edges into a map by node name.
+  let map = <{[nodeName: string]: graphlib.EdgeObject[]}>{};
+  _.each(graph.edges(), e => {
+    if (!graph.edge(e).metaedge.numRegularEdges) {
+      (map[e.v] = map[e.v] || []).push(e);
+      (map[e.w] = map[e.w] || []).push(e);
+    }
+  });
+
+  // For each node with too many control edges, turn them into annotations.
+  _.each(map, (edges, nodeName) => {
+    if (edges.length > PARAMS.maxControlDegree) {
+      _.each(edges, e => createShortcut(graph, e.v, e.w));
+    }
+  });
+}
+
+/**
+ * Given an integer, picks a hue that is far apart from other colors.
+ * The formula for picking color that avoid collision is:
+ *     hue = (color range * golden ratio * index) % color range
+ */
+export function mapIndexToHue(id: number): number {
+  let GOLDEN_RATIO = 1.61803398875;
+  // Hue of 0 is reserved for the gray nodes.
+  let MIN_HUE = 1;
+  let MAX_HUE = 359;
+  let COLOR_RANGE = MAX_HUE - MIN_HUE;
+  return MIN_HUE + ((COLOR_RANGE * GOLDEN_RATIO * id) % COLOR_RANGE);
+};
+
+/**
+ * Remove edges and add to annotation instead.
+ *
+ * For root node, consider predefined types for source and sink.
+ * We do not extract predefined type from non-root so that Variables and the
+ * sgd node (op type = 'NoOp') do not get extract from inside own group.
+ *
+ * The order of extraction is important here as swapping the order can totally
+ * screw up the graph layout.
+ *
+ * @param {Render.Node} renderNode Node to manipulate.
+ */
+function extractHighDegrees(renderNode: RenderGroupNodeInfo) {
+
+  extractSpecifiedNodes(renderNode);
+
+  if (PARAMS.outExtractTypes) {
+    extractPredefinedSink(renderNode);
+  }
+
+  // This has to come before extract high in-degree to protect the core part
+  // that takes many variables.
+  if (PARAMS.inExtractTypes) {
+    extractPredefinedSource(renderNode);
+  }
+
+  extractHighInOrOutDegree(renderNode);
+
+  if (PARAMS.maxControlDegree) {
+    removeControlEdges(renderNode);
+  }
+
+  // Extract isolated nodes, which can be
+  // (1) source-like and sink-like nodes that are not originally isolated but
+  //     become isolated after further removal.
+  // (2) isolated nodes with annotations on one-side.  These might be either
+  //     - nodes that originally have high out-degree but because we remove
+  //       high in-degree nodes first, they no longer have high in-degree when
+  //       we check.  (Detecting all high-degree before removing also leads to
+  //       another problem.)
+  //     - nodes that do not have high degree, but their neighbors are all
+  //       extracted, so it might make sense to extract them too.
+
+  let graph = renderNode.coreGraph;
+  _.each(graph.nodes(), n => {
+    let child = graph.node(n);
+    let degree = graph.neighbors(n).length;
+    if (child.node.include !== InclusionType.UNSPECIFIED) {
+      return;
+    }
+    if (degree === 0) {
+      let hasOutAnnotations = child.outAnnotations.list.length > 0;
+      let hasInAnnotations = child.inAnnotations.list.length > 0;
+
+      if (child.isInExtract) { // Is source-like.
+        // This case only happens if detachAllEdgesForHighDegree is false.
+        // (Otherwise all source-like nodes are all isolated already.)
+        renderNode.isolatedInExtract.push(child);
+        child.node.include = InclusionType.EXCLUDE;
+        graph.removeNode(n);
+      } else if (child.isOutExtract) { // Is sink-like.
+        // This case only happens if detachAllEdgesForHighDegree is false.
+        // // (Otherwise all sink-like nodes are all isolated already.)
+        renderNode.isolatedOutExtract.push(child);
+        child.node.include = InclusionType.EXCLUDE;
+        graph.removeNode(n);
+      } else if (PARAMS.extractIsolatedNodesWithAnnotationsOnOneSide) {
+        if (hasOutAnnotations && !hasInAnnotations) {
+          child.isInExtract = true; // for ones with high out-annotations
+          renderNode.isolatedInExtract.push(child);
+          child.node.include = InclusionType.EXCLUDE;
+          graph.removeNode(n);
+        } else if (hasInAnnotations && !hasOutAnnotations) {
+          child.isOutExtract = true; // for ones with high in-annotations
+          renderNode.isolatedOutExtract.push(child);
+          child.node.include = InclusionType.EXCLUDE;
+          graph.removeNode(n);
+        } else {
+          // if a low degree node has both in- & out- annotations, do nothing
+          // because it is unclear which side it should go to.
+        }
+      }
+    }
+  });
+}
+} // close module tf.graph.render
diff --git a/tensorflow/tensorboard/components/tf_graph_common_d3v4/scene.ts b/tensorflow/tensorboard/components/tf_graph_common_d3v4/scene.ts
new file mode 100644
index 0000000000000000000000000000000000000000..29f9b446b3637c10221a6485a6e0a2402e42b95d
--- /dev/null
+++ b/tensorflow/tensorboard/components/tf_graph_common_d3v4/scene.ts
@@ -0,0 +1,678 @@
+/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the 'License');
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an 'AS IS' BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+module tf.graph.scene {
+  const svgNamespace = 'http://www.w3.org/2000/svg';
+
+  /** Enums element class of objects in the scene */
+  export let Class = {
+    Node: {
+      // <g> element that contains nodes.
+      CONTAINER: 'nodes',
+      // <g> element that contains detail about a node.
+      GROUP: 'node',
+      // <g> element that contains visual elements (like rect, ellipse).
+      SHAPE: 'nodeshape',
+      // <*> element(s) under SHAPE that should receive color updates.
+      COLOR_TARGET: 'nodecolortarget',
+      // <text> element showing the node's label.
+      LABEL: 'nodelabel',
+      // <g> element that contains all visuals for the expand/collapse
+      // button for expandable group nodes.
+      BUTTON_CONTAINER: 'buttoncontainer',
+      // <circle> element that surrounds expand/collapse buttons.
+      BUTTON_CIRCLE: 'buttoncircle',
+      // <path> element of the expand button.
+      EXPAND_BUTTON: 'expandbutton',
+      // <path> element of the collapse button.
+      COLLAPSE_BUTTON: 'collapsebutton'
+    },
+    Edge: {
+      CONTAINER: 'edges',
+      GROUP: 'edge',
+      LINE: 'edgeline',
+      REF_LINE: 'refline',
+      STRUCTURAL: 'structural'
+    },
+    Annotation: {
+      OUTBOX: 'out-annotations',
+      INBOX: 'in-annotations',
+      GROUP: 'annotation',
+      NODE: 'annotation-node',
+      EDGE: 'annotation-edge',
+      CONTROL_EDGE: 'annotation-control-edge',
+      LABEL: 'annotation-label',
+      ELLIPSIS: 'annotation-ellipsis'
+    },
+    Scene: {
+      GROUP: 'scene',
+      CORE: 'core',
+      INEXTRACT: 'in-extract',
+      OUTEXTRACT: 'out-extract'
+    },
+    Subscene: {GROUP: 'subscene'},
+    OPNODE: 'op',
+    METANODE: 'meta',
+    SERIESNODE: 'series',
+    BRIDGENODE: 'bridge',
+    ELLIPSISNODE: 'ellipsis'
+  };
+
+  /**
+   * A health pill encapsulates an overview of tensor element values. The value
+   * field is a list of 12 numbers that shed light on the status of the tensor.
+   * Visualized in health pills are the 3rd through 8th (inclusive) numbers of
+   * health pill values. Those 6 numbers are counts of tensor elements that fall
+   * under -Inf, negative, 0, positive, +Inf, NaN (in that order).
+   *
+   * Please keep this interface consistent with HealthPillDatum within
+   * backend.ts.
+   */
+  export interface HealthPill {
+    node_name: string;
+    output_slot: number;
+    value: number[];
+    wall_time: number;
+    step: number;
+  }
+  ;
+
+  /**
+   * Encapsulates how to render a single entry in a health pill. Each entry
+   * corresponds to a category of tensor element values.
+   */
+  export interface HealthPillEntry {
+    background_color: string;
+    label: string;
+  }
+  ;
+  export let healthPillEntries: HealthPillEntry[] = [
+    {
+      background_color: '#CC2F2C',
+      label: 'NaN',
+    },
+    {
+      background_color: '#FF8D00',
+      label: '- ∞',
+    },
+    {
+      background_color: '#EAEAEA',
+      label: '-',
+    },
+    {
+      background_color: '#A5A5A5',
+      label: '0',
+    },
+    {
+      background_color: '#262626',
+      label: '+',
+    },
+    {
+      background_color: '#003ED4',
+      label: '+ ∞',
+    },
+  ];
+
+  /**
+   * Helper method for fitting the graph in the svg view.
+   *
+   * @param svg The main svg.
+   * @param zoomG The svg group used for panning and zooming.
+   * @param d3zoom The zoom behavior.
+   * @param callback Called when the fitting is done.
+   */
+  export function fit(svg, zoomG, d3zoom, callback) {
+    let svgRect = svg.getBoundingClientRect();
+    let sceneSize = null;
+    try {
+      sceneSize = zoomG.getBBox();
+      if (sceneSize.width === 0) {
+        // There is no scene anymore. We have been detached from the dom.
+        return;
+      }
+    } catch (e) {
+      // Firefox produced NS_ERROR_FAILURE if we have been
+      // detached from the dom.
+      return;
+    }
+    let scale = 0.9 *
+        Math.min(
+            svgRect.width / sceneSize.width, svgRect.height / sceneSize.height,
+            2);
+    let params = layout.PARAMS.graph;
+    const transform = d3.zoomIdentity
+        .scale(scale)
+        .translate(params.padding.paddingLeft, params.padding.paddingTop);
+
+    d3.select(svg)
+        .transition()
+        .duration(500)
+        .call(d3zoom.transform, transform)
+        .on('end.fitted', () => {
+          // Remove the listener for the zoomend event,
+          // so we don't get called at the end of regular zoom events,
+          // just those that fit the graph to screen.
+          d3zoom.on('end.fitted', null);
+          callback();
+        });
+};
+
+/**
+ * Helper method for panning the graph to center on the provided node,
+ * if the node is currently off-screen.
+ *
+ * @param nodeName The node to center the graph on
+ * @param svg The root SVG element for the graph
+ * @param zoomG The svg group used for panning and zooming.
+ * @param d3zoom The zoom behavior.
+ * @return True if the graph had to be panned to display the
+ *            provided node.
+ */
+export function panToNode(nodeName: String, svg, zoomG, d3zoom): boolean {
+  let node = <SVGAElement>d3
+                 .select('[data-name="' + nodeName + '"].' + Class.Node.GROUP)
+                 .node();
+  if (!node) {
+    return false;
+  }
+  let translate = d3zoom.translate();
+  // Check if the selected node is off-screen in either
+  // X or Y dimension in either direction.
+  let nodeBox = node.getBBox();
+  let nodeCtm = node.getScreenCTM();
+  let pointTL = svg.createSVGPoint();
+  let pointBR = svg.createSVGPoint();
+  pointTL.x = nodeBox.x;
+  pointTL.y = nodeBox.y;
+  pointBR.x = nodeBox.x + nodeBox.width;
+  pointBR.y = nodeBox.y + nodeBox.height;
+  pointTL = pointTL.matrixTransform(nodeCtm);
+  pointBR = pointBR.matrixTransform(nodeCtm);
+  let isOutsideOfBounds = (start, end, bound) => {
+    return end < 0 || start > bound;
+  };
+  let svgRect = svg.getBoundingClientRect();
+  if (isOutsideOfBounds(pointTL.x, pointBR.x, svgRect.width) ||
+      isOutsideOfBounds(pointTL.y, pointBR.y, svgRect.height)) {
+    // Determine the amount to transform the graph in both X and Y
+    // dimensions in order to center the selected node. This takes into
+    // acount the position of the node, the size of the svg scene, the
+    // amount the scene has been scaled by through zooming, and any previous
+    // transform already performed by this logic.
+    let centerX = (pointTL.x + pointBR.x) / 2;
+    let centerY = (pointTL.y + pointBR.y) / 2;
+    let dx = ((svgRect.width / 2) - centerX);
+    let dy = ((svgRect.height / 2) - centerY);
+    let zoomEvent = d3zoom.translate([translate[0] + dx, translate[1] + dy])
+        .event;
+    d3.select(zoomG).transition().duration(500).call(zoomEvent);
+    return true;
+  }
+  return false;
+};
+
+/**
+ * Given a container d3 selection, select a child svg element of a given tag
+ * and class if exists or append / insert one otherwise.  If multiple children
+ * matches the tag and class name, returns only the first one.
+ *
+ * @param container
+ * @param tagName tag name.
+ * @param className (optional) Class name or a list of class names.
+ * @param before (optional) reference DOM node for insertion.
+ * @return selection of the element
+ */
+export function selectOrCreateChild(
+    container, tagName: string, className?: string | string[], before?) {
+  let child = selectChild(container, tagName, className);
+  if (!child.empty()) {
+    return child;
+  }
+  let newElement =
+      document.createElementNS('http://www.w3.org/2000/svg', tagName);
+
+  if (className instanceof Array) {
+    for (let i = 0; i < className.length; i++) {
+      newElement.classList.add(className[i]);
+    }
+  } else {
+    newElement.classList.add(className);
+  }
+
+  if (before) { // if before exists, insert
+    container.node().insertBefore(newElement, before);
+  } else { // otherwise, append
+    container.node().appendChild(newElement);
+  }
+  return d3.select(newElement)
+           // need to bind data to emulate d3_selection.append
+           .datum(container.datum());
+};
+
+/**
+ * Given a container d3 selection, select a child element of a given tag and
+ * class. If multiple children matches the tag and class name, returns only
+ * the first one.
+ *
+ * @param container
+ * @param tagName tag name.
+ * @param className (optional) Class name or list of class names.
+ * @return selection of the element, or an empty selection
+ */
+export function selectChild(
+    container, tagName: string, className?: string | string[]) {
+  let children = container.node().childNodes;
+  for (let i = 0; i < children.length; i++) {
+    let child = children[i];
+    if (child.tagName === tagName) {
+      if (className instanceof Array) {
+        let hasAllClasses = true;
+        for (let j = 0; j < className.length; j++) {
+          hasAllClasses =
+              hasAllClasses && child.classList.contains(className[j]);
+        }
+        if (hasAllClasses) {
+          return d3.select(child);
+        }
+      } else if ((!className || child.classList.contains(className))) {
+        return d3.select(child);
+      }
+    }
+  }
+  return d3.select(null);
+};
+
+/**
+ * Select or create a sceneGroup and build/update its nodes and edges.
+ *
+ * Structure Pattern:
+ *
+ * <g class='scene'>
+ *   <g class='core'>
+ *     <g class='edges'>
+ *       ... stuff from tf.graph.scene.edges.build ...
+ *     </g>
+ *     <g class='nodes'>
+ *       ... stuff from tf.graph.scene.nodes.build ...
+ *     </g>
+ *   </g>
+ *   <g class='in-extract'>
+ *     <g class='nodes'>
+ *       ... stuff from tf.graph.scene.nodes.build ...
+ *     </g>
+ *   </g>
+ *   <g class='out-extract'>
+ *     <g class='nodes'>
+ *       ... stuff from tf.graph.scene.nodes.build ...
+ *     </g>
+ *   </g>
+ * </g>
+ *
+ * @param container D3 selection of the parent.
+ * @param renderNode render node of a metanode or series node.
+ * @param sceneElement <tf-graph-scene> polymer element.
+ * @param sceneClass class attribute of the scene (default='scene').
+ */
+export function buildGroup(container,
+    renderNode: render.RenderGroupNodeInfo,
+    sceneElement,
+    sceneClass: string) {
+  sceneClass = sceneClass || Class.Scene.GROUP;
+  let isNewSceneGroup = selectChild(container, 'g', sceneClass).empty();
+  let sceneGroup = selectOrCreateChild(container, 'g', sceneClass);
+
+  // core
+  let coreGroup = selectOrCreateChild(sceneGroup, 'g', Class.Scene.CORE);
+  let coreNodes = _.reduce(renderNode.coreGraph.nodes(), (nodes, name) => {
+                    let node = renderNode.coreGraph.node(name);
+                    if (!node.excluded) {
+                      nodes.push(node);
+                    }
+                    return nodes;
+                  }, []);
+
+  if (renderNode.node.type === NodeType.SERIES) {
+    // For series, we want the first item on top, so reverse the array so
+    // the first item in the series becomes last item in the top, and thus
+    // is rendered on the top.
+    coreNodes.reverse();
+  }
+
+  // Create the layer of edges for this scene (paths).
+  edge.buildGroup(coreGroup, renderNode.coreGraph, sceneElement);
+
+  // Create the layer of nodes for this scene (ellipses, rects etc).
+  node.buildGroup(coreGroup, coreNodes, sceneElement);
+
+  // In-extract
+  if (renderNode.isolatedInExtract.length > 0) {
+    let inExtractGroup =
+        selectOrCreateChild(sceneGroup, 'g', Class.Scene.INEXTRACT);
+    node.buildGroup(inExtractGroup, renderNode.isolatedInExtract,
+        sceneElement);
+  } else {
+    selectChild(sceneGroup, 'g', Class.Scene.INEXTRACT).remove();
+  }
+
+  // Out-extract
+  if (renderNode.isolatedOutExtract.length > 0) {
+    let outExtractGroup =
+        selectOrCreateChild(sceneGroup, 'g', Class.Scene.OUTEXTRACT);
+    node.buildGroup(outExtractGroup, renderNode.isolatedOutExtract,
+        sceneElement);
+  } else {
+    selectChild(sceneGroup, 'g', Class.Scene.OUTEXTRACT).remove();
+  }
+
+  position(sceneGroup, renderNode);
+
+  // Fade in the scene group if it didn't already exist.
+  if (isNewSceneGroup) {
+    sceneGroup.attr('opacity', 0).transition().attr('opacity', 1);
+  }
+
+  return sceneGroup;
+};
+
+/**
+ * Given a scene's svg group, set  g.in-extract, g.coreGraph, g.out-extract svg
+ * groups' position relative to the scene.
+ *
+ * @param sceneGroup
+ * @param renderNode render node of a metanode or series node.
+ */
+function position(sceneGroup, renderNode: render.RenderGroupNodeInfo) {
+  // Translate scenes down by the label height so that when showing graphs in
+  // expanded metanodes, the graphs are below the labels.  Do not shift them
+  // down for series nodes as series nodes don't have labels inside of their
+  // bounding boxes.
+  let yTranslate = renderNode.node.type === NodeType.SERIES ?
+    0 : layout.PARAMS.subscene.meta.labelHeight;
+
+  // core
+  translate(selectChild(sceneGroup, 'g', Class.Scene.CORE), 0, yTranslate);
+
+  // in-extract
+  let hasInExtract = renderNode.isolatedInExtract.length > 0;
+  let hasOutExtract = renderNode.isolatedOutExtract.length > 0;
+
+  if (hasInExtract) {
+    let offset = layout.PARAMS.subscene.meta.extractXOffset;
+    let inExtractX = renderNode.coreBox.width -
+      renderNode.inExtractBox.width / 2 - renderNode.outExtractBox.width -
+          (hasOutExtract ? offset : 0);
+    translate(
+        selectChild(sceneGroup, 'g', Class.Scene.INEXTRACT), inExtractX,
+        yTranslate);
+  }
+
+  // out-extract
+  if (hasOutExtract) {
+    let outExtractX = renderNode.coreBox.width -
+      renderNode.outExtractBox.width / 2;
+    translate(
+        selectChild(sceneGroup, 'g', Class.Scene.OUTEXTRACT), outExtractX,
+        yTranslate);
+  }
+};
+
+/** Adds a click listener to a group that fires a graph-select event */
+export function addGraphClickListener(graphGroup, sceneElement) {
+  d3.select(graphGroup).on('click', () => {
+    sceneElement.fire('graph-select');
+  });
+};
+
+/** Helper for adding transform: translate(x0, y0) */
+export function translate(selection, x0: number, y0: number) {
+  // If it is already placed on the screen, make it a transition.
+  if (selection.attr('transform') != null) {
+    selection = selection.transition('position');
+  }
+  selection.attr('transform', 'translate(' + x0 + ',' + y0 + ')');
+};
+
+/**
+ * Helper for setting position of a svg rect
+ * @param rect rect to set position of.
+ * @param cx Center x.
+ * @param cy Center x.
+ * @param width Width to set.
+ * @param height Height to set.
+ */
+export function positionRect(rect, cx: number, cy: number, width: number,
+    height: number) {
+  rect.transition()
+    .attr('x', cx - width / 2)
+    .attr('y', cy - height / 2)
+    .attr('width', width)
+    .attr('height', height);
+};
+
+/**
+ * Helper for setting position of a svg expand/collapse button
+ * @param button container group
+ * @param renderNode the render node of the group node to position
+ *        the button on.
+ */
+export function positionButton(button, renderNode: render.RenderNodeInfo) {
+  let cx = layout.computeCXPositionOfNodeShape(renderNode);
+  // Position the button in the top-right corner of the group node,
+  // with space given the draw the button inside of the corner.
+  let width = renderNode.expanded ?
+      renderNode.width : renderNode.coreBox.width;
+  let height = renderNode.expanded ?
+      renderNode.height : renderNode.coreBox.height;
+  let x = cx + width / 2 - 6;
+  let y = renderNode.y - height / 2 + 6;
+  // For unexpanded series nodes, the button has special placement due
+  // to the unique visuals of this group node.
+  if (renderNode.node.type === NodeType.SERIES && !renderNode.expanded) {
+    x += 10;
+    y -= 2;
+  }
+  let translateStr = 'translate(' + x + ',' + y + ')';
+  button.selectAll('path').transition().attr('transform', translateStr);
+  button.select('circle').transition().attr(
+      {cx: x, cy: y, r: layout.PARAMS.nodeSize.meta.expandButtonRadius});
+};
+
+/**
+ * Helper for setting position of a svg ellipse
+ * @param ellipse ellipse to set position of.
+ * @param cx Center x.
+ * @param cy Center x.
+ * @param width Width to set.
+ * @param height Height to set.
+ */
+export function positionEllipse(ellipse, cx: number, cy: number,
+    width: number, height: number) {
+  ellipse.transition()
+    .attr('cx', cx)
+    .attr('cy', cy)
+    .attr('rx', width / 2)
+    .attr('ry', height / 2);
+};
+
+/**
+ * @param {number} stat A stat for a health pill (such as mean or variance).
+ * @param {boolean} shouldRoundOnesDigit Whether to round this number to the
+ *     ones digit. Useful for say int, uint, and bool output types.
+ * @return {string} A human-friendly string representation of that stat.
+ */
+export function humanizeHealthPillStat(stat, shouldRoundOnesDigit) {
+  if (shouldRoundOnesDigit) {
+    return stat.toFixed(0);
+  }
+
+  if (Math.abs(stat) >= 1) {
+    return stat.toFixed(1);
+  }
+  return stat.toExponential(1);
+}
+
+/**
+ * Renders a health pill for an op atop a node.
+ */
+function _addHealthPill(
+    nodeGroupElement: SVGElement, healthPill: HealthPill,
+    nodeInfo: render.RenderNodeInfo) {
+  // Check if text already exists at location.
+  d3.select(nodeGroupElement.parentNode as any).selectAll('.health-pill').remove();
+
+  if (!nodeInfo || !healthPill) {
+    return;
+  }
+
+  let lastHealthPillData = healthPill.value;
+
+  // For now, we only visualize the 6 values that summarize counts of tensor
+  // elements of various categories: -Inf, negative, 0, positive, Inf, and NaN.
+  let lastHealthPillOverview = lastHealthPillData.slice(2, 8);
+  let totalCount = lastHealthPillData[1];
+
+  let healthPillWidth = 60;
+  let healthPillHeight = 10;
+  if (nodeInfo.node.type === tf.graph.NodeType.OP) {
+    // Use a smaller health pill for op nodes (rendered as smaller ellipses).
+    healthPillWidth /= 2;
+    healthPillHeight /= 2;
+  }
+
+  let healthPillGroup = document.createElementNS(svgNamespace, 'g');
+  healthPillGroup.classList.add('health-pill');
+
+  // Define the gradient for the health pill.
+  let healthPillDefs = document.createElementNS(svgNamespace, 'defs');
+  healthPillGroup.appendChild(healthPillDefs);
+  let healthPillGradient =
+      document.createElementNS(svgNamespace, 'linearGradient');
+  const healthPillGradientId = 'health-pill-gradient';
+  healthPillGradient.setAttribute('id', healthPillGradientId);
+  let titleOnHoverTextEntries = [];
+  let cumulativeCount = 0;
+  let previousOffset = '0%';
+  for (let i = 0; i < lastHealthPillOverview.length; i++) {
+    if (!lastHealthPillOverview[i]) {
+      // Exclude empty categories.
+      continue;
+    }
+    cumulativeCount += lastHealthPillOverview[i];
+
+    // Create a color interval using 2 stop elements.
+    let stopElement0 = document.createElementNS(svgNamespace, 'stop');
+    stopElement0.setAttribute('offset', previousOffset);
+    stopElement0.setAttribute(
+        'stop-color', healthPillEntries[i].background_color);
+    healthPillGradient.appendChild(stopElement0);
+
+    let stopElement1 = document.createElementNS(svgNamespace, 'stop');
+    let percent = (cumulativeCount * 100 / totalCount) + '%';
+    stopElement1.setAttribute('offset', percent);
+    stopElement1.setAttribute(
+        'stop-color', healthPillEntries[i].background_color);
+    healthPillGradient.appendChild(stopElement1);
+    previousOffset = percent;
+
+    // Include this number in the title that appears on hover.
+    titleOnHoverTextEntries.push(
+        healthPillEntries[i].label + ': ' + lastHealthPillOverview[i]);
+  }
+  healthPillDefs.appendChild(healthPillGradient);
+
+  // Create the rectangle for the health pill.
+  let rect = document.createElementNS(svgNamespace, 'rect');
+  rect.setAttribute('fill', 'url(#' + healthPillGradientId + ')');
+  rect.setAttribute('width', String(healthPillWidth));
+  rect.setAttribute('height', String(healthPillHeight));
+  healthPillGroup.appendChild(rect);
+
+  // Show a title with specific counts on hover.
+  let titleSvg = document.createElementNS(svgNamespace, 'title');
+  titleSvg.textContent = titleOnHoverTextEntries.join(', ');
+  healthPillGroup.appendChild(titleSvg);
+
+  // Center this health pill just right above the node for the op.
+  let healthPillX = nodeInfo.x - healthPillWidth / 2;
+  let healthPillY = nodeInfo.y - healthPillHeight - nodeInfo.height / 2 - 2;
+  if (nodeInfo.labelOffset < 0) {
+    // The label is positioned above the node. Do not occlude the label.
+    healthPillY += nodeInfo.labelOffset;
+  }
+
+  if (lastHealthPillOverview[2] || lastHealthPillOverview[3] ||
+      lastHealthPillOverview[4]) {
+    // At least 1 "non-Inf and non-NaN" value exists (a -, 0, or + value). Show
+    // stats on tensor values.
+
+    // Determine if we should display the output range as integers.
+    let shouldRoundOnesDigit = false;
+    let node = nodeInfo.node as OpNode;
+    let attributes = node.attr;
+    if (attributes && attributes.length) {
+      // Find the attribute for output type if there is one.
+      for (let i = 0; i < attributes.length; i++) {
+        if (attributes[i].key === 'T') {
+          // Note whether the output type is an integer.
+          let outputType = attributes[i].value['type'];
+          shouldRoundOnesDigit =
+              outputType && /^DT_(BOOL|INT|UINT)/.test(outputType);
+          break;
+        }
+      }
+    }
+
+    let statsSvg = document.createElementNS(svgNamespace, 'text');
+    const minString =
+        humanizeHealthPillStat(lastHealthPillData[8], shouldRoundOnesDigit);
+    const maxString =
+        humanizeHealthPillStat(lastHealthPillData[9], shouldRoundOnesDigit);
+    statsSvg.textContent = minString + ' ~ ' + maxString;
+    statsSvg.classList.add('health-pill-stats');
+    statsSvg.setAttribute('x', String(healthPillWidth / 2));
+    statsSvg.setAttribute('y', '-2');
+    healthPillGroup.appendChild(statsSvg);
+  }
+
+  healthPillGroup.setAttribute(
+      'transform', 'translate(' + healthPillX + ', ' + healthPillY + ')');
+
+  Polymer.dom(nodeGroupElement.parentNode).appendChild(healthPillGroup);
+}
+
+/**
+ * Adds health pills (which visualize tensor summaries) to a graph group.
+ * @param svgRoot The root SVG element of the graph to add heath pills to.
+ * @param nodeNamesToHealthPills An object mapping node name to health pill.
+ * @param colors A list of colors to use.
+ */
+export function addHealthPills(
+    svgRoot: SVGElement, nodeNamesToHealthPills: {[key: string]: HealthPill[]},
+    healthPillStepIndex: number) {
+  if (!nodeNamesToHealthPills) {
+    // No health pill information available.
+    return;
+  }
+
+  let svgRootSelection = d3.select(svgRoot);
+  svgRootSelection.selectAll('g.nodeshape')
+      .each(function(nodeInfo: render.RenderNodeInfo) {
+        // Only show health pill data for this node if it is available.
+        let healthPills = nodeNamesToHealthPills[nodeInfo.node.name];
+        let healthPill = healthPills ? healthPills[healthPillStepIndex] : null;
+        _addHealthPill((this as SVGElement), healthPill, nodeInfo);
+      });
+};
+
+} // close module
diff --git a/tensorflow/tensorboard/components/tf_graph_common_d3v4/template.ts b/tensorflow/tensorboard/components/tf_graph_common_d3v4/template.ts
new file mode 100644
index 0000000000000000000000000000000000000000..7800d46029b7672c9c32debe36383be25e374c96
--- /dev/null
+++ b/tensorflow/tensorboard/components/tf_graph_common_d3v4/template.ts
@@ -0,0 +1,305 @@
+/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the 'License');
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an 'AS IS' BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+module tf.graph.template {
+
+/**
+ * Detect repeating patterns of subgraphs.
+ * Assign templateId to each subgraph if it belongs to a template.
+ * Returns clusters of similar subgraphs .
+ *
+ * @param graph
+ * @param verifyTemplate whether to run the template verification algorithm
+ * @return a dict (template id => Array of node names)
+ */
+export function detect(h, verifyTemplate): {[templateId: string]: string[]} {
+  // In any particular subgraph, there are either
+  // - leaf nodes (which do not have subgraph)
+  // - metanode nodes - some of them have only one member (singular metanode)
+  //                    and some have multiple members (non-singular metanode)
+
+  // First, generate a nearest neighbor hash of metanode nodes.
+  let nnGroups = clusterSimilarSubgraphs(h);
+
+  // For each metanode, compare its subgraph (starting from shallower groups)
+  // and assign template id.
+  let templates = groupTemplateAndAssignId(nnGroups, verifyTemplate);
+
+  // Sort the templates by minimum level in the graph at which they appear,
+  // as this leads to optimal setting of the colors of each template for
+  // maximum differentiation.
+  return <{[templateId: string]: string[]}>_(templates)
+      .pairs()
+      .sortBy(function(pair: {level: number, nodes: string[]}[]) {
+        return pair[1].level;
+      })
+      .map(function(pair: {level: number, nodes: string[]}[]) {
+        return [pair[0], pair[1].nodes];
+      })
+      .object()
+      .value();
+};
+
+/**
+ * @return Unique string for a metanode based on depth, |V|, |E| and
+ * op type histogram.
+ */
+function getSignature(metanode) {
+  // depth=<number> |V|=<number> |E|=<number>
+  let props = _.map(
+                   {
+                     'depth': metanode.depth,
+                     '|V|': metanode.metagraph.nodes().length,
+                     '|E|': metanode.metagraph.edges().length
+                   },
+                   function(v, k) { return k + '=' + v; })
+                  .join(' ');
+
+  // optype1=count1,optype2=count2
+  let ops = _.map(metanode.opHistogram, function(count, op) {
+               return op + '=' + count;
+             }).join(',');
+
+  return props + ' [ops] ' + ops;
+}
+
+/**
+ * Generate a nearest neighbor hash of metanodes
+ * based on depth, |V|, |E|, and opHistogram of their subgraph
+ * (excluding leaf nodes and singular metanodes).
+ * @param graph The graph
+ * @return Array of pairs of [signature,
+ *   Object with min level of the template and an Array of tf.graph.Group]
+ *   sort by ascending order of minimum depth at which metanode appears.
+ */
+function clusterSimilarSubgraphs(h: hierarchy.Hierarchy) {
+  /** a dict from metanode.signature() => Array of tf.graph.Groups */
+  let hashDict = _(h.getNodeMap()).reduce(
+      (hash, node: OpNode|Metanode, name) => {
+    if (node.type !== NodeType.META) {
+        return hash;
+    }
+    let levelOfMetaNode = name.split('/').length - 1;
+    let signature = getSignature(node);
+    let templateInfo = hash[signature] ||
+      {nodes: [], level: levelOfMetaNode};
+    hash[signature] = templateInfo;
+    templateInfo.nodes.push(node);
+    if (templateInfo.level > levelOfMetaNode) {
+      templateInfo.level = levelOfMetaNode;
+    }
+    return hash;
+  }, {});
+
+  return _(hashDict)
+      .pairs()
+      // filter nn metanode with only one member
+      .filter(function(pair: {level: number, nodes: string[]}) {
+        return pair[1].nodes.length > 1;
+      })
+      .sortBy(function(pair: {level: number, nodes: string[]}) {
+        // sort by depth
+        // (all members in the same nnGroup has equal depth)
+        return pair[1].nodes[0].depth;
+      })
+      .value();
+}
+
+function groupTemplateAndAssignId(nnGroups, verifyTemplate) {
+  // For each metanode, compare its subgraph (starting from shallower groups)
+  // and assign template id.
+  let result: {[templateId: string]: {level: number, nodes: string[]}} = {};
+  return _.reduce(nnGroups, function(templates, nnGroupPair) {
+    let signature = nnGroupPair[0],
+      nnGroup = nnGroupPair[1].nodes,
+      clusters = [];
+
+    nnGroup.forEach(function(metanode) {
+      // check with each existing cluster
+      for (let i = 0; i < clusters.length; i++) {
+        let similar = !verifyTemplate ||
+                      isSimilarSubgraph(
+                        clusters[i].metanode.metagraph,
+                        metanode.metagraph
+                      );
+        // if similar, just add this metanode to the cluster
+        if (similar) {
+          // get template from the first one
+          metanode.templateId = clusters[i].metanode.templateId;
+          clusters[i].members.push(metanode.name);
+          return;
+        }
+      }
+      // otherwise create a new cluster with id 'signature [count] '
+      metanode.templateId = signature + '[' + clusters.length + ']';
+      clusters.push({
+        metanode: metanode,
+        members: [metanode.name]
+      });
+    });
+
+    clusters.forEach(function(c) {
+      templates[c.metanode.templateId] = {
+        level: nnGroupPair[1].level,
+        nodes: c.members
+      };
+    });
+    return templates;
+  }, result);
+}
+
+function sortNodes(names: string[],
+    graph: graphlib.Graph<Metanode|OpNode, Metaedge>, prefix: string) {
+  return _.sortByAll(names,
+    function(name) {
+      let node = graph.node(name);
+      return (<OpNode>node).op;
+    },
+    function(name) {
+      let node = graph.node(name);
+      return (<Metanode>node).templateId;
+    },
+    function(name) {
+      return graph.neighbors(name).length;
+    },
+    function(name) {
+      return graph.predecessors(name).length;
+    },
+    function(name) {
+      return graph.successors(name).length;
+    },
+    function(name) {
+      return name.substr(prefix.length);
+    });
+}
+
+function isSimilarSubgraph(g1: graphlib.Graph<any, any>,
+    g2: graphlib.Graph<any, any>) {
+  if (!tf.graph.hasSimilarDegreeSequence(g1, g2)) {
+      return false;
+  }
+
+  // if we want to skip, just return true here.
+  // return true;
+
+  // Verify sequence by running DFS
+  let g1prefix = g1.graph().name;
+  let g2prefix = g2.graph().name;
+
+  let visited1 = {};
+  let visited2 = {};
+  let stack = [];
+
+  /**
+   * push sources or successors into the stack
+   * if the visiting pattern has been similar.
+   */
+  function stackPushIfNotDifferent(n1, n2) {
+    let sub1 = n1.substr(g1prefix.length),
+      sub2 = n2.substr(g2prefix.length);
+
+    /* tslint:disable */
+    if (visited1[sub1] ^ visited2[sub1]) {
+      console.warn(
+          'different visit pattern', '[' + g1prefix + ']', sub1,
+          '[' + g2prefix + ']', sub2);
+      return true;
+    }
+    /* tslint:enable */
+    if (!visited1[sub1]) { // implied && !visited2[sub2]
+      visited1[sub1] = visited2[sub2] = true;
+      stack.push({n1: n1, n2: n2});
+    }
+
+    return false;
+  }
+
+  // check if have same # of sources then sort and push
+  let sources1 = g1.sources();
+  let sources2 = g2.sources();
+  if (sources1.length !== sources2.length) {
+    /* tslint:disable */
+    console.log('different source length');
+    /* tslint:enable */
+    return false;
+  }
+  sources1 = sortNodes(sources1, g1, g1prefix);
+  sources2 = sortNodes(sources2, g2, g2prefix);
+
+  for (let i = 0; i < sources1.length; i++) {
+    let different = stackPushIfNotDifferent(sources1[i], sources2[i]);
+    if (different) {
+        return false;
+    }
+  }
+
+  while (stack.length > 0) {
+    let cur = stack.pop();
+
+    // check node
+    let similar = isSimilarNode(g1.node(cur.n1), g2.node(cur.n2));
+    if (!similar) {
+        return false;
+    }
+
+    // check if have same # of successors then sort and push
+    let succ1 = g1.successors(cur.n1), succ2 = g2.successors(cur.n2);
+    if (succ1.length !== succ2.length) {
+      /* tslint:disable */
+      console.log('# of successors mismatch', succ1, succ2);
+      /* tslint:enable */
+      return false;
+    }
+    succ1 = sortNodes(succ1, g1, g1prefix);
+    succ2 = sortNodes(succ2, g2, g2prefix);
+
+    for (let j = 0; j < succ1.length; j++) {
+      let different = stackPushIfNotDifferent(succ1[j], succ2[j]);
+      if (different) {
+          return false;
+      }
+    }
+  }
+
+  return true;
+}
+
+/**
+ * Returns if two nodes have identical structure.
+ */
+function isSimilarNode(n1: OpNode|Metanode|SeriesNode,
+    n2: OpNode|Metanode|SeriesNode): boolean {
+  if (n1.type === NodeType.META) {
+    // compare metanode
+    let metanode1 = <Metanode> n1;
+    let metanode2 = <Metanode> n2;
+    return metanode1.templateId && metanode2.templateId &&
+        metanode1.templateId === metanode2.templateId;
+  } else if (n1.type === NodeType.OP && n2.type === NodeType.OP) {
+    // compare leaf node
+    return (<OpNode>n1).op === (<OpNode>n2).op;
+  } else if (n1.type === NodeType.SERIES && n2.type === NodeType.SERIES) {
+    // compare series node sizes and operations
+    // (only need to check one op as all op nodes are identical in series)
+    let sn1 = <SeriesNode> n1;
+    let sn2 = <SeriesNode> n2;
+    let seriesnode1Count = sn1.metagraph.nodeCount();
+    return (seriesnode1Count === sn2.metagraph.nodeCount() &&
+      (seriesnode1Count === 0 ||
+      ((<OpNode>sn1.metagraph.node(sn1.metagraph.nodes()[0])).op ===
+          (<OpNode>sn2.metagraph.node(sn2.metagraph.nodes()[0])).op)));
+  }
+  return false;
+}
+}
diff --git a/tensorflow/tensorboard/components/tf_graph_common_d3v4/test/graph-test.ts b/tensorflow/tensorboard/components/tf_graph_common_d3v4/test/graph-test.ts
new file mode 100644
index 0000000000000000000000000000000000000000..af3030197e0824aaa808a8ad5b77fadf0cc856f9
--- /dev/null
+++ b/tensorflow/tensorboard/components/tf_graph_common_d3v4/test/graph-test.ts
@@ -0,0 +1,103 @@
+/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the 'License');
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an 'AS IS' BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+suite('graph', () => {
+  let assert = chai.assert;
+
+  test('graphlib exists', () => { assert.isTrue(graphlib != null); });
+
+  test('simple graph contruction', done => {
+    let pbtxt = tf.graph.test.util.stringToArrayBuffer(`
+      node {
+        name: "Q"
+        op: "Input"
+      }
+      node {
+        name: "W"
+        op: "Input"
+      }
+      node {
+        name: "X"
+        op: "MatMul"
+        input: "Q:2"
+        input: "W"
+      }`);
+    let statsPbtxt = tf.graph.test.util.stringToArrayBuffer(`step_stats {
+      dev_stats {
+        device: "cpu"
+        node_stats {
+          node_name: "Q"
+          all_start_micros: 10
+          all_end_rel_micros: 4
+        }
+        node_stats {
+          node_name: "Q"
+          all_start_micros: 12
+          all_end_rel_micros: 4
+        }
+      }
+    }`);
+
+    let buildParams: tf.graph.BuildParams = {
+      enableEmbedding: true,
+      inEmbeddingTypes: ['Const'],
+      outEmbeddingTypes: ['^[a-zA-Z]+Summary$'],
+      refEdges: {}
+    };
+    let dummyTracker =
+        tf.graph.util.getTracker({set: () => { return; }, progress: 0});
+    tf.graph.parser.parseGraphPbTxt(pbtxt).then(nodes => {
+      tf.graph.build(nodes, buildParams, dummyTracker)
+          .then((slimGraph: tf.graph.SlimGraph) => {
+            assert.isTrue(slimGraph.nodes['X'] != null);
+            assert.isTrue(slimGraph.nodes['W'] != null);
+            assert.isTrue(slimGraph.nodes['Q'] != null);
+
+            let firstInputOfX = slimGraph.nodes['X'].inputs[0];
+            assert.equal(firstInputOfX.name, 'Q');
+            assert.equal(firstInputOfX.outputTensorIndex, 2);
+
+            let secondInputOfX = slimGraph.nodes['X'].inputs[1];
+            assert.equal(secondInputOfX.name, 'W');
+            assert.equal(secondInputOfX.outputTensorIndex, 0);
+
+            tf.graph.parser.parseStatsPbTxt(statsPbtxt).then(stepStats => {
+              tf.graph.joinStatsInfoWithGraph(slimGraph, stepStats);
+              assert.equal(slimGraph.nodes['Q'].stats.getTotalMicros(), 6);
+              done();
+            });
+          });
+    });
+  });
+
+  test('health pill numbers round correctly', () => {
+    // Integers are rounded to the ones place.
+    assert.equal(tf.graph.scene.humanizeHealthPillStat(42.0, true), '42');
+
+    // Numbers with magnitude >= 1 are rounded to the tenths place.
+    assert.equal(tf.graph.scene.humanizeHealthPillStat(1, false), '1.0');
+    assert.equal(tf.graph.scene.humanizeHealthPillStat(42.42, false), '42.4');
+    assert.equal(tf.graph.scene.humanizeHealthPillStat(-42.42, false), '-42.4');
+
+    // Numbers with magnitude < 1 are written in scientific notation rounded to
+    // the tenths place.
+    assert.equal(tf.graph.scene.humanizeHealthPillStat(0, false), '0.0e+0');
+    assert.equal(tf.graph.scene.humanizeHealthPillStat(0.42, false), '4.2e-1');
+    assert.equal(
+        tf.graph.scene.humanizeHealthPillStat(-0.042, false), '-4.2e-2');
+  });
+
+  // TODO(bp): write tests.
+});
diff --git a/tensorflow/tensorboard/components/tf_graph_common_d3v4/test/hierarchy-test.ts b/tensorflow/tensorboard/components/tf_graph_common_d3v4/test/hierarchy-test.ts
new file mode 100644
index 0000000000000000000000000000000000000000..fa62ffe2c7048a50d51a57894976820a720d636b
--- /dev/null
+++ b/tensorflow/tensorboard/components/tf_graph_common_d3v4/test/hierarchy-test.ts
@@ -0,0 +1,23 @@
+/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the 'License');
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an 'AS IS' BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+suite('graph', () => {
+  let assert = chai.assert;
+
+  test('graphlib exists', () => { assert.isTrue(graphlib != null); });
+
+  // TODO(bp): write tests.
+
+});
diff --git a/tensorflow/tensorboard/components/tf_graph_common_d3v4/test/index.html b/tensorflow/tensorboard/components/tf_graph_common_d3v4/test/index.html
new file mode 100644
index 0000000000000000000000000000000000000000..7564167129d67d4f0e2d8f14de11f780ba262d67
--- /dev/null
+++ b/tensorflow/tensorboard/components/tf_graph_common_d3v4/test/index.html
@@ -0,0 +1,34 @@
+<!doctype html>
+<!--
+@license
+Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+-->
+
+<html>
+<head>
+  <meta charset="utf-8">
+  <script src="../../webcomponentsjs/webcomponents-lite.min.js"></script>
+  <script src="../../web-component-tester/browser.js"></script>
+  <link rel="import" href="../tf-graph-common.html">
+</head>
+<body>
+  <script src="parser-test.js"></script>
+  <script src="graph-test.js"></script>
+  <script src="hierarchy-test.js"></script>
+  <script src="layout-test.js"></script>
+  <script src="util-test.js"></script>
+  <script src="util.js"></script>
+</body>
+</html>
diff --git a/tensorflow/tensorboard/components/tf_graph_common_d3v4/test/layout-test.ts b/tensorflow/tensorboard/components/tf_graph_common_d3v4/test/layout-test.ts
new file mode 100644
index 0000000000000000000000000000000000000000..b4884413c9d4f0b2e3d61d283736174f6549819b
--- /dev/null
+++ b/tensorflow/tensorboard/components/tf_graph_common_d3v4/test/layout-test.ts
@@ -0,0 +1,23 @@
+/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the 'License');
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an 'AS IS' BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+suite('layout', () => {
+  let assert = chai.assert;
+
+  test('dagre exists', () => { assert.isTrue(dagre != null); });
+
+  // TODO(bp): write tests.
+
+});
diff --git a/tensorflow/tensorboard/components/tf_graph_common_d3v4/test/parser-test.ts b/tensorflow/tensorboard/components/tf_graph_common_d3v4/test/parser-test.ts
new file mode 100644
index 0000000000000000000000000000000000000000..7c73178c1ce34e327afe6847cc96ad3f5f702185
--- /dev/null
+++ b/tensorflow/tensorboard/components/tf_graph_common_d3v4/test/parser-test.ts
@@ -0,0 +1,83 @@
+/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the 'License');
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an 'AS IS' BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+suite('parser', () => {
+  let assert = chai.assert;
+
+  test('simple pbtxt', done => {
+    let pbtxt = tf.graph.test.util.stringToArrayBuffer(`node {
+       name: "Q"
+       op: "Input"
+     }
+     node {
+       name: "W"
+       op: "Input"
+     }
+     node {
+       name: "X"
+       op: "MatMul"
+       input: "Q"
+       input: "W"
+     }`);
+    tf.graph.parser.parseGraphPbTxt(pbtxt).then(nodes => {
+      assert.isTrue(nodes != null && nodes.length === 3);
+
+      assert.equal('Q', nodes[0].name);
+      assert.equal('Input', nodes[0].op);
+
+      assert.equal('W', nodes[1].name);
+      assert.equal('Input', nodes[1].op);
+
+      assert.equal('X', nodes[2].name);
+      assert.equal('MatMul', nodes[2].op);
+      assert.equal('Q', nodes[2].input[0]);
+      assert.equal('W', nodes[2].input[1]);
+
+      done();
+    });
+  });
+
+  test('stats pbtxt parsing', done => {
+    let statsPbtxt = tf.graph.test.util.stringToArrayBuffer(`step_stats {
+      dev_stats {
+        device: "cpu"
+        node_stats {
+          node_name: "Q"
+          all_start_micros: 10
+          all_end_rel_micros: 4
+        }
+        node_stats {
+          node_name: "Q"
+          all_start_micros: 12
+          all_end_rel_micros: 4
+        }
+      }
+    }`);
+    tf.graph.parser.parseStatsPbTxt(statsPbtxt).then(stepStats => {
+      assert.equal(stepStats.dev_stats.length, 1);
+      assert.equal(stepStats.dev_stats[0].device, 'cpu');
+      assert.equal(stepStats.dev_stats[0].node_stats.length, 2);
+      assert.equal(stepStats.dev_stats[0].node_stats[0].all_start_micros, 10);
+      assert.equal(stepStats.dev_stats[0].node_stats[1].node_name, 'Q');
+      assert.equal(stepStats.dev_stats[0].node_stats[1].all_end_rel_micros, 4);
+      done();
+    });
+  });
+
+  test('d3 exists', () => { assert.isTrue(d3 != null); });
+
+  // TODO(nsthorat): write tests.
+
+});
diff --git a/tensorflow/tensorboard/components/tf_graph_common_d3v4/test/util-test.ts b/tensorflow/tensorboard/components/tf_graph_common_d3v4/test/util-test.ts
new file mode 100644
index 0000000000000000000000000000000000000000..4535d24888f0777c5bdfa40bd537ac885604a8d7
--- /dev/null
+++ b/tensorflow/tensorboard/components/tf_graph_common_d3v4/test/util-test.ts
@@ -0,0 +1,56 @@
+/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the 'License');
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an 'AS IS' BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+suite('util', () => {
+  let assert = chai.assert;
+
+  test('remove common prefix', () => {
+
+    // Empty array.
+    let result = tf.graph.util.removeCommonPrefix([]);
+    assert.deepEqual(result, []);
+
+    // No common prefix.
+    result = tf.graph.util.removeCommonPrefix(['a', 'b', 'c']);
+    assert.deepEqual(result, ['a', 'b', 'c']);
+
+    // One of the elements is empty string.
+    result = tf.graph.util.removeCommonPrefix(['a/b', '', 'a/c']);
+    assert.deepEqual(result, ['a/b', '', 'a/c']);
+
+    // Only one string.
+    result = tf.graph.util.removeCommonPrefix(['a/b/c']);
+    assert.deepEqual(result, ['a/b/c']);
+
+    // `q/w/` is the common prefix. Expect `q/w/` to be removed.
+    result = tf.graph.util.removeCommonPrefix(['q/w/a', 'q/w/b', 'q/w/c/f']);
+    assert.deepEqual(result, ['a', 'b', 'c/f']);
+
+    // `q/w/` is the common prefix and also an element. Expect nothing to be
+    // removed since the common prefix is also an element in the array.
+    result = tf.graph.util.removeCommonPrefix(['q/w/', 'q/w/b', 'q/w/c/f']);
+    assert.deepEqual(result, ['q/w/', 'q/w/b', 'q/w/c/f']);
+  });
+
+  test('query params', () => {
+    // Starts with question mark.
+    let queryParams = tf.graph.util.getQueryParams('?foo=1&bar=2');
+    assert.deepEqual(queryParams, {'foo': '1', 'bar': '2'});
+
+    // No question mark.
+    queryParams = tf.graph.util.getQueryParams('foo=1&bar=2');
+    assert.deepEqual(queryParams, {'foo': '1', 'bar': '2'});
+  });
+});
diff --git a/tensorflow/tensorboard/components/tf_graph_common_d3v4/test/util.ts b/tensorflow/tensorboard/components/tf_graph_common_d3v4/test/util.ts
new file mode 100644
index 0000000000000000000000000000000000000000..bc73b735ed2bd6335c8f72d8903a118897dd1738
--- /dev/null
+++ b/tensorflow/tensorboard/components/tf_graph_common_d3v4/test/util.ts
@@ -0,0 +1,31 @@
+/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the 'License');
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an 'AS IS' BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+
+/* tslint:disable:no-namespace */
+module tf.graph.test.util {
+  /**
+   * Converts a utf-8 string to an ArrayBuffer.
+   */
+  export function stringToArrayBuffer(str): ArrayBuffer {
+    let buf = new ArrayBuffer(str.length);
+    let bufView = new Uint8Array(buf);
+    for (let i = 0, strLen = str.length; i < strLen; i++) {
+      bufView[i] = str.charCodeAt(i);
+    }
+    return buf;
+  }
+
+}  // module
diff --git a/tensorflow/tensorboard/components/tf_graph_common_d3v4/tf-graph-common.html b/tensorflow/tensorboard/components/tf_graph_common_d3v4/tf-graph-common.html
new file mode 100644
index 0000000000000000000000000000000000000000..a460072a38f3c0fcd868b70f8c2325320df95028
--- /dev/null
+++ b/tensorflow/tensorboard/components/tf_graph_common_d3v4/tf-graph-common.html
@@ -0,0 +1,38 @@
+<!--
+@license
+Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+-->
+
+<link rel="import" href="../tf-imports/d3.html">
+<link rel="import" href="../tf-imports/dagre.html">
+<link rel="import" href="../tf-imports/graphlib.html">
+<link rel="import" href="../tf-imports/lodash.html">
+
+<script src="colors.js"></script>
+<script src="common.js"></script>
+<script src="externs.js"></script>
+<script src="graph.js"></script>
+<script src="hierarchy.js"></script>
+<script src="layout.js"></script>
+<script src="parser.js"></script>
+<script src="proto.js"></script>
+<script src="render.js"></script>
+<script src="annotation.js"></script>
+<script src="contextmenu.js"></script>
+<script src="edge.js"></script>
+<script src="node.js"></script>
+<script src="scene.js"></script>
+<script src="template.js"></script>
+<script src="util.js"></script>
diff --git a/tensorflow/tensorboard/components/tf_graph_common_d3v4/util.ts b/tensorflow/tensorboard/components/tf_graph_common_d3v4/util.ts
new file mode 100644
index 0000000000000000000000000000000000000000..7f4d329e795956b19f8d2ff869eb4d52717315c0
--- /dev/null
+++ b/tensorflow/tensorboard/components/tf_graph_common_d3v4/util.ts
@@ -0,0 +1,291 @@
+/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the 'License');
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an 'AS IS' BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+/**
+ * @fileoverview Utility functions for the tensorflow graph visualizer.
+ */
+
+module tf.graph.util {
+  /**
+   * Recommended delay (ms) when running an expensive task asynchronously
+   * that gives enough time for the progress bar to update its UI.
+   */
+  const ASYNC_TASK_DELAY = 20;
+
+  export function time<T>(msg: string, task: () => T) {
+    let start = Date.now();
+    let result = task();
+    /* tslint:disable */
+    console.log(msg, ':', Date.now() - start, 'ms');
+    /* tslint:enable */
+    return result;
+  }
+
+  /**
+   * Creates a tracker that sets the progress property of the
+   * provided polymer component. The provided component must have
+   * a property called 'progress' that is not read-only. The progress
+   * property is an object with a numerical 'value' property and a
+   * string 'msg' property.
+   */
+  export function getTracker(polymerComponent: any) {
+    return {
+      setMessage: function(msg) {
+        polymerComponent.set(
+            'progress', {value: polymerComponent.progress.value, msg: msg});
+      },
+      updateProgress: function(value) {
+        polymerComponent.set('progress', {
+          value: polymerComponent.progress.value + value,
+          msg: polymerComponent.progress.msg
+        });
+      },
+      reportError: function(msg: string, err) {
+        // Log the stack trace in the console.
+        console.error(err.stack);
+        // And send a user-friendly message to the UI.
+        polymerComponent.set(
+            'progress',
+            {value: polymerComponent.progress.value, msg: msg, error: true});
+      },
+    };
+  }
+
+  /**
+   * Creates a tracker for a subtask given the parent tracker, the total
+   * progress
+   * of the subtask and the subtask message. The parent task should pass a
+   * subtracker to its subtasks. The subtask reports its own progress which
+   * becames relative to the main task.
+   */
+  export function getSubtaskTracker(
+      parentTracker: ProgressTracker, impactOnTotalProgress: number,
+      subtaskMsg: string): ProgressTracker {
+    return {
+      setMessage: function(progressMsg) {
+        // The parent should show a concatenation of its message along with
+        // its subtask tracker message.
+        parentTracker.setMessage(subtaskMsg + ': ' + progressMsg);
+      },
+      updateProgress: function(incrementValue) {
+        // Update the parent progress relative to the child progress.
+        // For example, if the sub-task progresses by 30%, and the impact on the
+        // total progress is 50%, then the task progresses by 30% * 50% = 15%.
+        parentTracker.updateProgress(
+            incrementValue * impactOnTotalProgress / 100);
+      },
+      reportError: function(msg: string, err: Error) {
+        // The parent should show a concatenation of its message along with
+        // its subtask error message.
+        parentTracker.reportError(subtaskMsg + ': ' + msg, err);
+      }
+    };
+  }
+
+  /**
+   * Runs an expensive task and return the result.
+   */
+  export function runTask<T>(
+      msg: string, incProgressValue: number, task: () => T,
+      tracker: ProgressTracker): T {
+    // Update the progress message to say the current running task.
+    tracker.setMessage(msg);
+    // Run the expensive task with a delay that gives enough time for the
+    // UI to update.
+    try {
+      let result = tf.graph.util.time(msg, task);
+      // Update the progress value.
+      tracker.updateProgress(incProgressValue);
+      // Return the result to be used by other tasks.
+      return result;
+    } catch (e) {
+      // Errors that happen inside asynchronous tasks are
+      // reported to the tracker using a user-friendly message.
+      tracker.reportError('Failed ' + msg, e);
+    }
+  }
+
+  /**
+   * Runs an expensive task asynchronously and returns a promise of the result.
+   */
+  export function runAsyncTask<T>(
+      msg: string, incProgressValue: number, task: () => T,
+      tracker: ProgressTracker): Promise<T> {
+    return new Promise((resolve, reject) => {
+      // Update the progress message to say the current running task.
+      tracker.setMessage(msg);
+      // Run the expensive task with a delay that gives enough time for the
+      // UI to update.
+      setTimeout(function() {
+        try {
+          let result = tf.graph.util.time(msg, task);
+          // Update the progress value.
+          tracker.updateProgress(incProgressValue);
+          // Return the result to be used by other tasks.
+          resolve(result);
+        } catch (e) {
+          // Errors that happen inside asynchronous tasks are
+          // reported to the tracker using a user-friendly message.
+          tracker.reportError('Failed ' + msg, e);
+        }
+      }, ASYNC_TASK_DELAY);
+    });
+  }
+
+  /**
+   * Asynchronously runs an expensive task that returns a promise. Updates the
+   * tracker's progress after the promise resolves. Returns a new promise that
+   * resolves after the progress is updated.
+   */
+  export function runAsyncPromiseTask<T>(
+      msg: string, incProgressValue: number, task: () => Promise<T>,
+      tracker: ProgressTracker): Promise<T> {
+    return new Promise((resolve, reject) => {
+      let handleError = function(e) {
+        // Errors that happen inside asynchronous tasks are
+        // reported to the tracker using a user-friendly message.
+        tracker.reportError('Failed ' + msg, e);
+        reject(e);
+      };
+
+      // Update the progress message to say the current running task.
+      tracker.setMessage(msg);
+      // Run the expensive task with a delay that gives enough time for the
+      // UI to update.
+      setTimeout(function() {
+        try {
+          let start = Date.now();
+          task()
+              .then(function(value) {
+                /* tslint:disable */
+                console.log(msg, ':', Date.now() - start, 'ms');
+                // Update the progress value.
+                tracker.updateProgress(incProgressValue);
+                // Return the result to be used by other tasks.
+                resolve(value);
+              })
+              .catch(handleError);
+        } catch (e) {
+          handleError(e);
+        }
+      }, ASYNC_TASK_DELAY);
+    });
+  }
+
+  /**
+   * Returns a query selector with escaped special characters that are not
+   * allowed in a query selector.
+   */
+  export function escapeQuerySelector(querySelector: string): string {
+    return querySelector.replace(/([:.\[\],/\\\(\)])/g, '\\$1');
+  }
+
+  // For unit conversion.
+  export const MEMORY_UNITS = [
+    // Atomic unit.
+    {symbol: 'B'},
+    // numUnits specifies how many previous units this unit contains.
+    {symbol: 'KB', numUnits: 1024}, {symbol: 'MB', numUnits: 1024},
+    {symbol: 'GB', numUnits: 1024}, {symbol: 'TB', numUnits: 1024},
+    {symbol: 'PB', numUnits: 1024}
+  ];
+  export const TIME_UNITS = [
+    // Atomic unit. Finest granularity in TensorFlow stat collection.
+    {symbol: 'µs'},
+    // numUnits specifies how many previous units this unit contains.
+    {symbol: 'ms', numUnits: 1000}, {symbol: 's', numUnits: 1000},
+    {symbol: 'min', numUnits: 60}, {symbol: 'hr', numUnits: 60},
+    {symbol: 'days', numUnits: 24}
+  ];
+
+  /**
+   * Returns the human readable version of the unit.
+   * (e.g. 1.35 GB, 23 MB, 34 ms, 6.53 min etc).
+   */
+  export function convertUnitsToHumanReadable(value, units, unitIndex) {
+    unitIndex = unitIndex == null ? 0 : unitIndex;
+    if (unitIndex + 1 < units.length &&
+        value >= units[unitIndex + 1].numUnits) {
+      return tf.graph.util.convertUnitsToHumanReadable(
+          value / units[unitIndex + 1].numUnits, units, unitIndex + 1);
+    }
+    // toPrecision() has the tendency to return a number in scientific
+    // notation and (number - 0) brings it back to normal notation.
+    return (value.toPrecision(3) - 0) + ' ' + units[unitIndex].symbol;
+  }
+
+  export function hasDisplayableNodeStats(stats: NodeStats) {
+    if (stats &&
+        (stats.totalBytes > 0 || stats.getTotalMicros() > 0 ||
+         stats.outputSize)) {
+      return true;
+    }
+    return false;
+  }
+
+  /**
+   * Given a list of strings, it returns a new list of strings with the longest
+   * common prefix removed. If the common prefix is one of the strings in the
+   * list, it returns the original strings.
+   */
+  export function removeCommonPrefix(strings: string[]) {
+    if (strings.length < 2) {
+      return strings;
+    }
+
+    let index = 0;
+    let largestIndex = 0;
+    // Find the shortest name across all strings.
+    let minLength = _.min(_.map(strings, str => str.length));
+    while (true) {
+      index++;
+      let prefixes = _.map(strings, str => str.substring(0, index));
+      let allTheSame = prefixes.every((prefix, i) => {
+        return (i === 0 ? true : prefix === prefixes[i - 1]);
+      });
+      if (allTheSame) {
+        if (index >= minLength) {
+          // There is a string whose whole name is a prefix to other string.
+          // In this case, we return the original list of string.
+          return strings;
+        }
+        largestIndex = index;
+      } else {
+        break;
+      }
+    }
+    return _.map(strings, str => str.substring(largestIndex));
+  }
+
+  /**
+   * Given a queryString, aka ?foo=1&bar=2, return the object representation.
+   */
+  export function getQueryParams(queryString: string) {
+    if (queryString.charAt(0) === '?') {
+      queryString = queryString.slice(1);
+    }
+
+    let queryParams = _.chain(queryString.split('&'))
+                          .map((item) => {
+                            if (item) {
+                              return item.split('=');
+                            }
+                          })
+                          .compact()
+                          .value();
+
+    return _.object(queryParams);
+  }
+}
diff --git a/tensorflow/tensorboard/components/tf_graph_controls/BUILD b/tensorflow/tensorboard/components/tf_graph_controls/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..2e2d3b7b475211e9ae4a6e871de8542a6d337ebc
--- /dev/null
+++ b/tensorflow/tensorboard/components/tf_graph_controls/BUILD
@@ -0,0 +1,54 @@
+package(default_visibility = ["//tensorflow:internal"])
+
+load("@io_bazel_rules_closure//closure:defs.bzl", "web_library")
+load("//tensorflow/tensorboard:defs.bzl", "tensorboard_ts_library")
+load("//tensorflow/tensorboard:defs.bzl", "tensorboard_webcomponent_library")
+
+licenses(["notice"])  # Apache 2.0
+
+web_library(
+    name = "tf_graph_controls",
+    srcs = [
+        "tf-graph-controls.html",
+    ],
+    path = "/tf-graph-controls",
+    deps = [
+        "//tensorflow/tensorboard/components/tf_dashboard_common",
+        "//tensorflow/tensorboard/components/tf_graph_common",
+        "@org_polymer",
+        "@org_polymer_paper_button",
+        "@org_polymer_paper_dropdown_menu",
+        "@org_polymer_paper_menu",
+        "@org_polymer_paper_radio_group",
+        "@org_polymer_paper_toggle_button",
+        "@org_polymer_paper_tooltip",
+    ],
+)
+
+filegroup(
+    name = "all_files",
+    srcs = glob(["**"]),
+    tags = ["notsan"],
+)
+
+################################################################################
+# MARKED FOR DELETION
+
+tensorboard_webcomponent_library(
+    name = "legacy",
+    srcs = [
+        "tf-graph-controls.html",
+    ],
+    destdir = "tf-graph-controls",
+    deps = [
+        "//tensorflow/tensorboard/components/tf_dashboard_common:legacy",
+        "//tensorflow/tensorboard/components/tf_graph_common:legacy",
+    ],
+)
+
+# This is needed despite how this component lacks TypeScript files because
+# components/BUILD seeks a legacy_ts rule in this package.
+tensorboard_ts_library(
+    name = "legacy_ts",
+    srcs = [],
+)
diff --git a/tensorflow/tensorboard/components/tf_graph_controls/demo/BUILD b/tensorflow/tensorboard/components/tf_graph_controls/demo/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..c47cb90a03ecc1762d5a46912b83cd82fe1021ea
--- /dev/null
+++ b/tensorflow/tensorboard/components/tf_graph_controls/demo/BUILD
@@ -0,0 +1,24 @@
+package(default_visibility = ["//tensorflow:internal"])
+
+load("@io_bazel_rules_closure//closure:defs.bzl", "web_library")
+
+licenses(["notice"])  # Apache 2.0
+
+# bazel run //third_party/tensorflow/tensorboard/components/tf_graph_controls/demo
+web_library(
+    name = "demo",
+    srcs = ["index.html"],
+    path = "/tf-graph-controls/demo",
+    deps = [
+        "//tensorflow/tensorboard/components/tf_graph_controls",
+        "@org_polymer_iron_demo_helpers",
+        "@org_polymer_paper_styles",
+        "@org_polymer_webcomponentsjs",
+    ],
+)
+
+filegroup(
+    name = "all_files",
+    srcs = glob(["**"]),
+    tags = ["notsan"],
+)
diff --git a/tensorflow/tensorboard/components/tf_graph_controls/demo/index.html b/tensorflow/tensorboard/components/tf_graph_controls/demo/index.html
new file mode 100644
index 0000000000000000000000000000000000000000..8b12641b28e328351bd7321c43959a91fba56dcc
--- /dev/null
+++ b/tensorflow/tensorboard/components/tf_graph_controls/demo/index.html
@@ -0,0 +1,49 @@
+<!doctype html>
+<!--
+@license
+Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+-->
+<script src="../../webcomponentsjs/webcomponents-lite.min.js"></script>
+<link rel="import" href="../tf-graph-controls.html">
+<link rel="import" href="../../iron-demo-helpers/demo-snippet.html">
+<title>TF Graph Controls Demo</title>
+<style>
+  #demo-container {
+    border: 2px solid #808080;
+    width: 1000px;
+    height: 700px;
+    position: relative;
+  }
+</style>
+<demo-snippet>
+  <template>
+    <dom-module id="tf-graph-controls-demo">
+      <template>
+        <tf-graph-controls
+            id="controls"
+            color-by="structure"
+        ></tf-graph-controls>
+      </template>
+      <script>
+        Polymer({
+          is: "tf-graph-controls-demo",
+        });
+      </script>
+    </dom-module>
+    <div id="demo-container">
+      <tf-graph-controls-demo></tf-graph-controls-demo>
+    </div>
+  </template>
+</demo-snippet>
diff --git a/tensorflow/tensorboard/components/tf_graph/tf-graph-controls.html b/tensorflow/tensorboard/components/tf_graph_controls/tf-graph-controls.html
similarity index 99%
rename from tensorflow/tensorboard/components/tf_graph/tf-graph-controls.html
rename to tensorflow/tensorboard/components/tf_graph_controls/tf-graph-controls.html
index f2a1b5658f21f19c58140f907cba370d6ceb36db..10faf29bbccd5b2448d31cb4c8b9378bdfb46ff4 100644
--- a/tensorflow/tensorboard/components/tf_graph/tf-graph-controls.html
+++ b/tensorflow/tensorboard/components/tf_graph_controls/tf-graph-controls.html
@@ -22,6 +22,7 @@ limitations under the License.
 <link rel="import" href="../paper-tooltip/paper-tooltip.html">
 <link rel="import" href="../paper-toggle-button/paper-toggle-button.html">
 <link rel="import" href="../tf-dashboard-common/tensorboard-color.html">
+<link rel="import" href="../tf-graph-common/tf-graph-common.html">
 
 <dom-module id="tf-graph-controls">
 <template>
@@ -586,6 +587,8 @@ table.tf-graph-controls td.input-element-table-data {
   </template>
   </div>
 </template>
+</dom-module>
+
 <script>
 (function() { // Private scope.
 /**
@@ -817,8 +820,10 @@ Polymer({
       graphPath = graphPath.substring(slashIndex + 1);
     }
     this.$.graphdownload.setAttribute('download', graphPath + '.png');
-  }
+  },
+  _statsNotNull: function(stats) {
+    return stats !== null;
+  },
 });
 })(); // Closing private scope.
 </script>
-</dom-module>
diff --git a/tensorflow/tensorboard/components/tf_graph_controls_d3v4/BUILD b/tensorflow/tensorboard/components/tf_graph_controls_d3v4/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..d5f9a76eb2a66535083b9ffb3cc1c465b95648db
--- /dev/null
+++ b/tensorflow/tensorboard/components/tf_graph_controls_d3v4/BUILD
@@ -0,0 +1,32 @@
+package(default_visibility = ["//tensorflow:internal"])
+
+load("@io_bazel_rules_closure//closure:defs.bzl", "web_library")
+load("//tensorflow/tensorboard:defs.bzl", "tensorboard_ts_library")
+load("//tensorflow/tensorboard:defs.bzl", "tensorboard_webcomponent_library")
+
+licenses(["notice"])  # Apache 2.0
+
+web_library(
+    name = "tf_graph_controls_d3v4",
+    srcs = [
+        "tf-graph-controls.html",
+    ],
+    path = "/tf-graph-controls",
+    deps = [
+        "//tensorflow/tensorboard/components/tf_dashboard_common_d3v4",
+        "//tensorflow/tensorboard/components/tf_graph_common_d3v4",
+        "@org_polymer",
+        "@org_polymer_paper_button",
+        "@org_polymer_paper_dropdown_menu",
+        "@org_polymer_paper_menu",
+        "@org_polymer_paper_radio_group",
+        "@org_polymer_paper_toggle_button",
+        "@org_polymer_paper_tooltip",
+    ],
+)
+
+filegroup(
+    name = "all_files",
+    srcs = glob(["**"]),
+    tags = ["notsan"],
+)
diff --git a/tensorflow/tensorboard/components/tf_graph_controls_d3v4/demo/BUILD b/tensorflow/tensorboard/components/tf_graph_controls_d3v4/demo/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..c47cb90a03ecc1762d5a46912b83cd82fe1021ea
--- /dev/null
+++ b/tensorflow/tensorboard/components/tf_graph_controls_d3v4/demo/BUILD
@@ -0,0 +1,24 @@
+package(default_visibility = ["//tensorflow:internal"])
+
+load("@io_bazel_rules_closure//closure:defs.bzl", "web_library")
+
+licenses(["notice"])  # Apache 2.0
+
+# bazel run //third_party/tensorflow/tensorboard/components/tf_graph_controls/demo
+web_library(
+    name = "demo",
+    srcs = ["index.html"],
+    path = "/tf-graph-controls/demo",
+    deps = [
+        "//tensorflow/tensorboard/components/tf_graph_controls",
+        "@org_polymer_iron_demo_helpers",
+        "@org_polymer_paper_styles",
+        "@org_polymer_webcomponentsjs",
+    ],
+)
+
+filegroup(
+    name = "all_files",
+    srcs = glob(["**"]),
+    tags = ["notsan"],
+)
diff --git a/tensorflow/tensorboard/components/tf_graph_controls_d3v4/demo/index.html b/tensorflow/tensorboard/components/tf_graph_controls_d3v4/demo/index.html
new file mode 100644
index 0000000000000000000000000000000000000000..8b12641b28e328351bd7321c43959a91fba56dcc
--- /dev/null
+++ b/tensorflow/tensorboard/components/tf_graph_controls_d3v4/demo/index.html
@@ -0,0 +1,49 @@
+<!doctype html>
+<!--
+@license
+Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+-->
+<script src="../../webcomponentsjs/webcomponents-lite.min.js"></script>
+<link rel="import" href="../tf-graph-controls.html">
+<link rel="import" href="../../iron-demo-helpers/demo-snippet.html">
+<title>TF Graph Controls Demo</title>
+<style>
+  #demo-container {
+    border: 2px solid #808080;
+    width: 1000px;
+    height: 700px;
+    position: relative;
+  }
+</style>
+<demo-snippet>
+  <template>
+    <dom-module id="tf-graph-controls-demo">
+      <template>
+        <tf-graph-controls
+            id="controls"
+            color-by="structure"
+        ></tf-graph-controls>
+      </template>
+      <script>
+        Polymer({
+          is: "tf-graph-controls-demo",
+        });
+      </script>
+    </dom-module>
+    <div id="demo-container">
+      <tf-graph-controls-demo></tf-graph-controls-demo>
+    </div>
+  </template>
+</demo-snippet>
diff --git a/tensorflow/tensorboard/components/tf_graph_controls_d3v4/tf-graph-controls.html b/tensorflow/tensorboard/components/tf_graph_controls_d3v4/tf-graph-controls.html
new file mode 100644
index 0000000000000000000000000000000000000000..10faf29bbccd5b2448d31cb4c8b9378bdfb46ff4
--- /dev/null
+++ b/tensorflow/tensorboard/components/tf_graph_controls_d3v4/tf-graph-controls.html
@@ -0,0 +1,829 @@
+<!--
+@license
+Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+-->
+
+<link rel="import" href="../polymer/polymer.html">
+<link rel="import" href="../paper-menu/paper-menu.html">
+<link rel="import" href="../paper-dropdown-menu/paper-dropdown-menu.html">
+<link rel="import" href="../paper-radio-group/paper-radio-group.html">
+<link rel="import" href="../paper-tooltip/paper-tooltip.html">
+<link rel="import" href="../paper-toggle-button/paper-toggle-button.html">
+<link rel="import" href="../tf-dashboard-common/tensorboard-color.html">
+<link rel="import" href="../tf-graph-common/tf-graph-common.html">
+
+<dom-module id="tf-graph-controls">
+<template>
+<style>
+:host {
+  font-size: 12px;
+  color: gray;
+  --paper-font-subhead: {
+    font-size: 14px;
+    color: gray;
+  };
+  --paper-dropdown-menu-icon: {
+    width: 15px;
+    height: 15px;
+  };
+  --paper-dropdown-menu-button: {
+    padding: 0;
+  };
+  --paper-dropdown-menu-input: {
+    padding: 0;
+  };
+  --paper-item-min-height: 30px;
+}
+
+paper-button[raised].keyboard-focus {
+  font-weight: normal;
+}
+
+.run-dropdown {
+  --paper-input-container: {
+    padding: 9px 0 0 25px;
+  };
+}
+
+.color-dropdown {
+  --paper-input-container: {
+    padding: 9px 0 0 13px;
+  };
+}
+
+table {
+  border-collapse: collapse;
+  border-spacing: 0;
+}
+
+table td {
+  padding: 0;
+  margin: 0;
+}
+
+.allcontrols {
+  width: 188px;
+  padding: 0 30px;
+}
+
+.legend-holder {
+  position: absolute;
+  bottom: 0;
+  padding-bottom: 10px;
+}
+
+paper-radio-button {
+  display: block;
+  padding: 5px;
+}
+svg.icon {
+  width: 60px;
+  height: 18px;
+}
+.icon ellipse {
+  rx: 10px;
+  ry: 5px;
+  stroke: #CCC;
+  stroke-width: 1px;
+  fill: #FFFFFF;
+  cy: 10px;
+}
+.icon rect {
+  height: 14px;
+  width: 35px;
+  rx: 5px;
+  ry: 5px;
+  stroke: #CCC;
+  stroke-width: 2px;
+  fill: #D9D9D9;
+}
+.domainValues {
+  margin-bottom: 10px;
+  width: 165px;
+}
+.domainStart {
+  float: left;
+}
+.domainEnd {
+  float: right;
+}
+.colorBox {
+  width: 20px;
+}
+
+.image-icon {
+  width: 24px;
+  height: 24px;
+}
+
+.help-icon {
+  height: 15px;
+  margin: 0;
+  padding: 0;
+}
+
+.gray {
+  color: #666;
+}
+
+.title {
+  font-size: 16px;
+  margin: 8px 5px 8px 0;
+  color: black;
+}
+.title small {
+  font-weight: normal;
+}
+.deviceList, .xlaClusterList {
+  max-height: 200px;
+  overflow-y: auto;
+}
+
+#file {
+  padding: 8px 0;
+}
+
+.color-legend-row {
+  clear: both;
+  height: 20px;
+  margin-top: 5px;
+  position: relative;
+}
+
+.color-legend-row svg {
+  position: absolute;
+  top: -1px;
+  width: 40px;
+}
+
+.color-legend-row span.color-legend-value {
+  margin-left: 60px;
+}
+
+#grey-rect {
+  fill: #eee;
+  stroke: #a6a6a6;
+}
+
+#faded-rect {
+  fill: url(#rectHatch);
+  stroke: var(--tb-graph-faded);
+}
+
+.button-text {
+  text-transform: none;
+  padding: 8px 18px 0 18px;
+  font-size: 14px
+}
+
+.upload-button {
+  width: 165px;
+  height: 25px;
+  text-transform: none;
+  margin-top: 4px;
+}
+
+.iconbutton {
+  padding: 2px;
+  width: 30px;
+  height: 30px;
+  color: var(--paper-orange-500);
+}
+
+.hidden-input {
+  height: 0px;
+  width: 0px;
+  overflow:hidden;
+}
+
+.allcontrols .control-holder {
+  display: flex;
+  clear: both;
+}
+
+.allcontrols .control-holder paper-radio-group {
+  margin-top: 5px;
+}
+
+span.counter {
+  font-size: 13px;
+  color: gray;
+}
+
+.runs paper-item {
+  --paper-item: {
+    white-space: nowrap;
+  }
+}
+
+table.control-holder {
+  border: 0;
+  border-collapse: collapse;
+}
+
+table.tf-graph-controls td.input-element-table-data {
+  padding: 0 0 0 20px;
+}
+
+/** Override inline styles that suppress pointer events for disabled buttons. Otherwise, the */
+/*  tooltips do not appear. */
+#color-by-radio-group paper-radio-button {
+  pointer-events: auto !important;
+}
+</style>
+<svg width="0" height="0">
+  <defs>
+    <g id="legend-rect">
+      <rect x="1" y="1" stroke-width="2px" height="14" width="35" rx="5" ry="5"></rect>
+    </g>
+    <g id="grey-rect">
+       <use xmlns:xlink="http://www.w3.org/1999/xlink"
+            xlink:href="#legend-rect"/>
+     </g>
+     <g id="faded-rect">
+       <use xmlns:xlink="http://www.w3.org/1999/xlink"
+            xlink:href="#legend-rect"/>
+     </g>
+  </defs>
+</svg>
+<div class="allcontrols">
+  <div class="control-holder">
+    <paper-icon-button icon="aspect-ratio" class="iconbutton" on-click="fit" alt="Fit to screen">
+    </paper-icon-button>
+    <paper-button class="button-text" on-click="fit">Fit to screen
+    </paper-button>
+  </div>
+  <div class="control-holder">
+    <paper-icon-button icon="file-download" class="iconbutton" on-click="download" alt="Download PNG">
+    </paper-icon-button>
+    <paper-button class="button-text" on-click="download">Download PNG
+    </paper-button>
+    <a href="#" id="graphdownload" class="title" download="graph.png">
+    </a>
+  </div>
+  <div class="control-holder runs">
+    <div class="title">Run <span class="counter">([[datasets.length]])</span></div>
+    <paper-dropdown-menu no-label-float no-animations noink class="run-dropdown">
+      <paper-menu id="select" class="dropdown-content" selected="{{selectedDataset}}">
+        <template is="dom-repeat" items="[[datasets]]">
+          <paper-item>[[item.name]]</paper-item>
+        </template>
+      </paper-menu>
+    </paper-dropdown-menu>
+  </div>
+  <template is="dom-if" if="[[showSessionRunsDropdown]]">
+    <div class="control-holder">
+      <div class="title">Session runs <span class="counter">([[_numSessionRuns(metadataTags)]])</span></div>
+      <paper-dropdown-menu no-label-float no-animations noink class="run-dropdown">
+        <paper-menu id="select" class="dropdown-content" selected="{{selectedMetadataTag}}">
+          <template is="dom-repeat" items="[[metadataTags]]">
+            <paper-item>[[item.tag]]</paper-item>
+          </template>
+          <paper-item>None</paper-item>
+        </paper-menu>
+      </paper-dropdown-menu>
+    </div>
+  </template>
+  <template is="dom-if" if="[[showUploadButton]]">
+    <div class="control-holder">
+      <div class="title">Upload</div>
+      <paper-button raised class="text-button upload-button"
+          on-click="_getFile">Choose File</paper-button>
+      <div class="hidden-input">
+        <input type="file" id="file" name="file" on-change="_updateFileInput" />
+      </div>
+    </div>
+  </template>
+  <table class="control-holder">
+    <tr>
+      <td class="title">Trace inputs</td>
+      <td class="input-element-table-data">
+        <paper-toggle-button id="trace-inputs"></paper-toggle-button>
+      </td>
+    </tr>
+    <template is="dom-if" if="[[healthPillsFeatureEnabled]]">
+      <tr>
+        <td class="title">Show health pills</td>
+        <td class="input-element-table-data">
+          <paper-toggle-button checked="{{healthPillsToggledOn}}"></paper-toggle-button>
+        </td>
+      </tr>
+    </template>
+  </table>
+  <div class="control-holder">
+    <div class="title">Color</div>
+    <paper-radio-group id="color-by-radio-group" selected="{{colorBy}}">
+      <paper-radio-button name="structure">Structure</paper-radio-button>
+
+      <paper-radio-button name="device">Device</paper-radio-button>
+
+      <paper-radio-button id="xla-cluster-radio-button"
+                          name="xla_cluster"
+                          disabled="[[!_xlaClustersProvided(renderHierarchy)]]">
+        XLA Cluster
+      </paper-radio-button>
+      <paper-tooltip for="xla-cluster-radio-button" position="right">
+        Coloring by XLA cluster is only enabled if at least 1 op specifies an XLA cluster.
+      </paper-tooltip>
+
+      <paper-radio-button id="compute-time-radio-button"
+                          name="compute_time"
+                          disabled="[[!stats]]">
+        Compute time
+      </paper-radio-button>
+      <paper-tooltip for="compute-time-radio-button" position="right">
+        Coloring by compute time is only enabled if the RunMetadata proto is passed to the
+        FileWriter when a specific session is run.
+      </paper-tooltip>
+
+      <paper-radio-button id="memory-radio-button"
+                          name="memory"
+                          disabled="[[!stats]]">
+        Memory
+      </paper-radio-button>
+      <paper-tooltip for="memory-radio-button" position="right">
+        Coloring by memory is only enabled if the RunMetadata proto is passed to the
+        FileWriter when a specific session is run.
+      </paper-tooltip>
+    </paper-radio-group>
+  </div>
+  <div>
+    <template is="dom-if" if="[[_isGradientColoring(stats, colorBy)]]">
+      <svg width="140" height="20" style="margin: 0 5px" class="color-text">
+        <defs>
+          <linearGradient id="linearGradient" x1="0%" y1="0%" x2="100%" y2="0%">
+            <stop class="start" offset="0%"
+                stop-color$="[[_currentGradientParams.startColor]]"/>
+            <stop class="end" offset="100%"
+                stop-color$="[[_currentGradientParams.endColor]]"/>
+          </linearGradient>
+        </defs>
+        <rect x="0" y="0" width="135" height="20" fill="url(#linearGradient)"
+            stroke="black" />
+      </svg>
+      <div class="domainValues color-text">
+        <div class="domainStart">[[_currentGradientParams.minValue]]</div>
+        <div class="domainEnd">[[_currentGradientParams.maxValue]]</div>
+      </div>
+      <br style="clear: both">
+      <div>Devices included in stats:</div>
+      <div class="deviceList">
+        <table>
+        <template is="dom-repeat" items="[[_getDevices(devicesForStats)]]">
+          <tr>
+            <td>
+              <input type="checkbox" value$="[[item.device]]" checked$="[[item.used]]" on-click="_deviceCheckboxClicked"/>
+            </td>
+            <td>
+              <div>
+                <span>[[item.suffix]]</span>
+                <template is="dom-if" if="[[item.ignoredMsg]]">
+                  <paper-icon-button icon="help" class="help-icon"></paper-icon-button>
+                  <paper-tooltip position="right" animation-delay="0">[[item.ignoredMsg]]</paper-tooltip>
+                </template>
+              </div>
+            </td>
+          </tr>
+        </template>
+        </table>
+      </div>
+    </template>
+    <template is="dom-if" if="[[_equals(colorBy, 'structure')]]">
+      <div class="color-text">
+        <div class="color-legend-row">
+          <div style="position: absolute;">
+            colors
+          </div>
+          <span class="color-legend-value">same substructure</span>
+        </div>
+        <div class="color-legend-row">
+          <svg>
+            <use xmlns:xlink="http://www.w3.org/1999/xlink"
+                 xlink:href="#grey-rect" x="0" y="0"/>
+          </svg>
+          <span class="color-legend-value">unique substructure</span>
+        </div>
+      </div>
+    </template>
+    <template is="dom-if" if="[[_equals(colorBy, 'device')]]">
+      <div class="color-text">
+        <div class="deviceList">
+          <table>
+          <template is="dom-repeat" items="[[colorByParams.device]]">
+            <tr>
+              <td style$="[[_getBackgroundColor(item.color)]]">
+                <div class="colorBox"></div>
+              </td>
+              <td>
+                <div>[[item.device]]</div>
+              </td>
+            </tr>
+          </template>
+          </table>
+        </div>
+        <br/>
+        <div class="color-legend-row">
+          <svg>
+            <use xmlns:xlink="http://www.w3.org/1999/xlink"
+                 xlink:href="#grey-rect" x="0" y="0"/>
+          </svg>
+          <span class="color-legend-value">unknown device</span>
+        </div>
+      </div>
+    </template>
+    <template is="dom-if" if="[[_equals(colorBy, 'xla_cluster')]]">
+      <div class="color-text">
+        <div class="xlaClusterList">
+          <table>
+          <template is="dom-repeat" items="[[colorByParams.xla_cluster]]">
+            <tr>
+              <td style$="[[_getBackgroundColor(item.color)]]">
+                <div class="colorBox"></div>
+              </td>
+              <td>
+                <div>[[item.xla_cluster]]</div>
+              </td>
+            </tr>
+          </template>
+          </table>
+        </div>
+        <br/>
+        <div class="color-legend-row">
+          <svg>
+            <use xmlns:xlink="http://www.w3.org/1999/xlink"
+                 xlink:href="#grey-rect" x="0" y="0"/>
+          </svg>
+          <span class="color-legend-value">unknown XLA cluster</span>
+        </div>
+      </div>
+    </template>
+    <template is="dom-if" if="[[_statsNotNull(stats)]]">
+      <div class="color-legend-row">
+        <svg>
+          <use xmlns:xlink="http://www.w3.org/1999/xlink"
+                xlink:href="#faded-rect" x="0" y="0"/>
+        </svg>
+        <span class="color-legend-value">unused substructure</span>
+      </div>
+    </template>
+  </div>
+  <!--
+    Due to limited vertical space on the left sidebar, hide the legend whenever
+    we show a list of devices to include in stats.
+  -->
+  <template is="dom-if" if="[[!_isGradientColoring(stats, colorBy)]]">
+    <div class="legend-holder">
+      <table>
+        <tr>
+          <td><div class="title">Graph</div></td>
+          <td>(* = expandable)</td>
+        </tr>
+        <tr>
+          <td>
+            <svg class="icon">
+              <rect transform="translate(3, 1)" height="14" width="35"
+                  rx="5" ry="5"/>
+            </svg>
+          </td>
+          <td>Namespace<span class="gray">*</span></td>
+        </tr>
+        <tr>
+          <td>
+            <svg class="icon" preserveAspectRatio="xMinYMid meet"
+                viewBox="0 0 10 10">
+              <use xlink:href="#op-node-stamp" fill="white" stroke="#ccc" x="9.5"
+                y="6" />
+            </svg>
+          </td>
+          <td>OpNode</td>
+        </tr>
+        <tr>
+          <td>
+            <svg class="icon" height="15px" preserveAspectRatio="xMinYMid meet"
+                viewBox="0 0 12 12">
+              <use xlink:href="#op-series-horizontal-stamp" fill="white"
+                  stroke="#ccc" x="2" y="2"/>
+            </svg>
+          </td>
+          <td>Unconnected series<span class="gray">*</span></td>
+        </tr>
+        <tr>
+          <td>
+            <svg class="icon" height="15px"
+                preserveAspectRatio="xMinYMid meet" viewBox="0 0 15 15">
+              <use xlink:href="#op-series-vertical-stamp"
+                  fill="white" stroke="#ccc" x="2" y="2"/>
+            </svg>
+          </td>
+          <td>Connected series<span class="gray">*</span></td>
+        </tr>
+        <tr>
+          <td>
+            <svg class="icon">
+              <circle fill="white" stroke="#848484" cx="10" cy="10" r="5"/>
+            </svg>
+          </td>
+          <td>Constant</td>
+        </tr>
+        <tr>
+          <td>
+            <svg class="image-icon" viewBox="0 0 12 12" width="24" height="24">
+              <use x="0" y="0" class="image-icon" xlink:href="#summary-icon"/>
+            </svg>
+          </td>
+          <td>Summary</td>
+        </tr>
+        <tr>
+          <td>
+            <svg class="icon" height="15px"
+                preserveAspectRatio="xMinYMid meet" viewBox="0 0 15 15">
+              <defs>
+                <marker id="ref-arrowhead-legend" fill="#bbb" markerWidth="10"
+                    markerHeight="10" refX="1" refY="5" orient="auto">
+                  <path d="M 10,0 L 0,5 L 10,10 C 7,7 7,3 10,0"/>
+                </marker>
+              </defs>
+              <path stroke="#bbb"
+                  d="M2 9 l 23 0" stroke-linecap="round" />
+            </svg>
+          </td>
+          <td>Dataflow edge</td>
+        </tr>
+        <tr>
+          <td>
+            <svg class="icon" height="15px"
+                preserveAspectRatio="xMinYMid meet" viewBox="0 0 15 15">
+              <path stroke="#bbb"
+                d="M2 9 l 23 0" stroke-linecap="round" stroke-dasharray="2, 2" />
+            </svg>
+          </td>
+          <td>Control dependency edge</td>
+        </tr>
+        <tr>
+          <td>
+            <svg class="icon" height="15px"
+                preserveAspectRatio="xMinYMid meet" viewBox="0 0 15 15">
+              <path marker-start="url(#ref-arrowhead-legend)"
+                stroke="#bbb" d="M2 9 l 23 0"
+                stroke-linecap="round" />
+            </svg>
+          </td>
+          <td>Reference edge</td>
+        </tr>
+      </table>
+    </div>
+  </template>
+  </div>
+</template>
+</dom-module>
+
+<script>
+(function() { // Private scope.
+/**
+ * Stats from device names that match these regexes will be excluded by default.
+ * The user can still turn on a device by selecting the checkbox in the device list.
+ * See b/29089982 for context.
+ */
+var DEVICE_NAMES_EXCLUDE = [
+  {
+    regex: /gpu:[0-9]+$/,
+    msg: 'Excluded by default since this is a CPU thread setting up GPU kernels.'
+  }
+];
+
+Polymer({
+  is: 'tf-graph-controls',
+  properties: {
+    // Public API.
+    stats: {
+      value: null,
+      type: Object,
+      observer: '_statsChanged'
+    },
+    devicesForStats: {
+      value: null,
+      type: Object,
+      notify: true,
+      readonly: true,
+    },
+    colorBy: {
+      type: String,
+      value: 'structure',
+      notify: true,
+      readonly: true
+    },
+    colorByParams: Object,
+    datasets: {
+      type: Array,
+      observer: '_datasetsChanged'
+    },
+    renderHierarchy: {
+      type: Object,
+      notify: true,
+    },
+    metadataTags: {
+      type: Array,
+      computed: '_getMetadataTags(selectedDataset, datasets)'
+    },
+    selectedDataset: {
+      type: Number,
+      notify: true,
+      value: 0,
+      observer: '_selectedDatasetChanged'
+    },
+    selectedFile: {
+      type: Object,
+      notify: true
+    },
+    selectedMetadataTag: {
+      type: Number,
+      notify: true,
+      value: -1
+    },
+    _currentGradientParams: {
+      type: Object,
+      computed: '_getCurrentGradientParams(colorByParams, colorBy)'
+    },
+    showSessionRunsDropdown: {
+      type: Boolean,
+      value: true
+    },
+    showUploadButton: {
+      type: Boolean,
+      value: true
+    },
+    // This stores whether the feature for showing health pills is enabled in the first place.
+    healthPillsFeatureEnabled: Boolean,
+    // This stores whether to show health pills. Only relevant if healthPillsFeatureEnabled. The
+    // user can toggle this value.
+    healthPillsToggledOn: {
+      type: Boolean,
+      notify: true,
+    },
+  },
+  listeners: {
+    'trace-inputs.change': '_traceInputToggleChanged'
+  },
+  _traceInputToggleChanged: function(event) {
+    // Flip the state of the trace inputs flag.
+    this.renderHierarchy.traceInputs = event.target.active;
+    tf.graph.scene.node.traceInputs(this.renderHierarchy);
+  },
+  _xlaClustersProvided: function(renderHierarchy) {
+    return renderHierarchy &&
+        renderHierarchy.hierarchy &&
+        renderHierarchy.hierarchy.xlaClusters.length > 0;
+  },
+  _statsChanged: function(stats) {
+    if (stats == null) {
+      return;
+    }
+    var devicesForStats = {};
+    var devices = _.each(stats.dev_stats, function(d) {
+      // Avoid device names that are ignored by default.
+      var exclude = _.some(DEVICE_NAMES_EXCLUDE, function(rule) {
+        return rule.regex.test(d.device);
+      });
+      if (!exclude) {
+        devicesForStats[d.device] = true;
+      }
+    });
+    this.set('devicesForStats', devicesForStats);
+  },
+  _getDevices: function(devicesForStats) {
+    var devices = _.map(this.stats.dev_stats, function(d) {
+      return d.device;
+    });
+    // Devices names can be long so we remove the longest common prefix
+    // before showing the devices in a list.
+    var suffixes = tf.graph.util.removeCommonPrefix(devices);
+    return _.map(devices, function(device, i) {
+      var ignoredMsg = null;
+      _.each(DEVICE_NAMES_EXCLUDE, function(rule) {
+        if (rule.regex.test(device)) {
+          ignoredMsg = rule.msg;
+        }
+      });
+      return {
+        device: device,
+        suffix: suffixes[i],
+        used: devicesForStats[device],
+        ignoredMsg: ignoredMsg
+      };
+    });
+  },
+  _deviceCheckboxClicked: function(checkbox) {
+    // Update the device map.
+    var devicesForStats = _.extend({}, this.devicesForStats);
+    var device = checkbox.target.value;
+    if (checkbox.target.checked) {
+      devicesForStats[device] = true;
+    } else {
+      delete devicesForStats[device];
+    }
+    this.set('devicesForStats', devicesForStats);
+  },
+  _numSessionRuns: function(metadataTags) {
+    return metadataTags != null ? metadataTags.length : 0;
+  },
+  _getBackgroundColor: function(color) {
+    return 'background-color:' + color;
+  },
+  fit: function() {
+    document.querySelector('#scene').fit();
+  },
+  _isGradientColoring: function(stats, colorBy) {
+    return ["compute_time", "memory"].indexOf(colorBy) !== -1
+        && stats != null;
+  },
+  _equals: function(a, b) {
+    return a === b;
+  },
+  _getCurrentGradientParams: function(colorByParams, colorBy) {
+    if (!this._isGradientColoring(this.stats, colorBy)) {
+      return;
+    }
+    var params = colorByParams[colorBy];
+    var minValue = params.minValue;
+    var maxValue = params.maxValue;
+    if (colorBy === 'memory') {
+      minValue = tf.graph.util.convertUnitsToHumanReadable(
+          minValue, tf.graph.util.MEMORY_UNITS);
+      maxValue = tf.graph.util.convertUnitsToHumanReadable(
+          maxValue, tf.graph.util.MEMORY_UNITS);
+    } else if (colorBy === 'compute_time') {
+      minValue = tf.graph.util.convertUnitsToHumanReadable(
+          minValue, tf.graph.util.TIME_UNITS);
+      maxValue = tf.graph.util.convertUnitsToHumanReadable(
+          maxValue, tf.graph.util.TIME_UNITS);
+    }
+    return {
+      minValue: minValue,
+      maxValue: maxValue,
+      startColor: params.startColor,
+      endColor: params.endColor
+    };
+  },
+  download: function() {
+    this.$.graphdownload.click();
+  },
+  _updateFileInput: function(e) {
+    var file = e.target.files[0];
+    if (!file) {
+      return;
+    }
+    this._setDownloadFilename(file.name);
+    this.set('selectedFile', e);
+  },
+  _datasetsChanged: function(newDatasets, oldDatasets) {
+    if (oldDatasets != null || this.selected == null) {
+      // Select the first dataset by default.
+      this.set('selectedDataset', 0);
+      this._setDownloadFilename(this.datasets[this.selectedDataset].path);
+    }
+  },
+  _getMetadataTags: function(selectedDataset, datasets) {
+    return this.datasets[selectedDataset].runMetadata;
+  },
+  _selectedDatasetChanged: function(newDataset, oldDataset) {
+    if (this.datasets) {
+      this.set('selectedMetadataTag', -1);
+      this.set('colorBy', 'structure');
+      this.$['trace-inputs'].active = false; // Set trace input to off-state.
+      this._setDownloadFilename(this.datasets[newDataset].path);
+    }
+  },
+  _getFile: function() {
+    this.$$("#file").click();
+  },
+  _setDownloadFilename: function(graphPath) {
+    // Strip off everything before the last "/" and strip off the file
+    // extension in order to get the name of the PNG for the graph.
+    var dotIndex = graphPath.lastIndexOf('.');
+    if (dotIndex) {
+      graphPath = graphPath.substring(0, dotIndex);
+    }
+    var slashIndex = graphPath.lastIndexOf('/');
+    if (slashIndex) {
+      graphPath = graphPath.substring(slashIndex + 1);
+    }
+    this.$.graphdownload.setAttribute('download', graphPath + '.png');
+  },
+  _statsNotNull: function(stats) {
+    return stats !== null;
+  },
+});
+})(); // Closing private scope.
+</script>
diff --git a/tensorflow/tensorboard/components/tf_graph_d3v4/BUILD b/tensorflow/tensorboard/components/tf_graph_d3v4/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..367baeb67b83d668840061a93c05d1e1df17d08f
--- /dev/null
+++ b/tensorflow/tensorboard/components/tf_graph_d3v4/BUILD
@@ -0,0 +1,37 @@
+package(default_visibility = ["//tensorflow:internal"])
+
+load("@io_bazel_rules_closure//closure:defs.bzl", "web_library")
+load("//tensorflow/tensorboard:defs.bzl", "tensorboard_ts_library")
+load("//tensorflow/tensorboard:defs.bzl", "tensorboard_webcomponent_library")
+
+licenses(["notice"])  # Apache 2.0
+
+web_library(
+    name = "tf_graph_d3v4",
+    srcs = [
+        "tf-graph.html",
+        "tf-graph-minimap.html",
+        "tf-graph-scene.html",
+    ],
+    path = "/tf-graph",
+    deps = [
+        "//tensorflow/tensorboard/components/tf_dashboard_common_d3v4",
+        "//tensorflow/tensorboard/components/tf_graph_common_d3v4",
+        "@org_polymer",
+        "@org_polymer_iron_flex_layout",
+        "@org_polymer_iron_icons",
+        "@org_polymer_paper_button",
+        "@org_polymer_paper_dropdown_menu",
+        "@org_polymer_paper_input",
+        "@org_polymer_paper_menu",
+        "@org_polymer_paper_radio_group",
+        "@org_polymer_paper_toggle_button",
+        "@org_polymer_paper_tooltip",
+    ],
+)
+
+filegroup(
+    name = "all_files",
+    srcs = glob(["**"]),
+    tags = ["notsan"],
+)
diff --git a/tensorflow/tensorboard/components/tf_graph_d3v4/demo/BUILD b/tensorflow/tensorboard/components/tf_graph_d3v4/demo/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..524d0ff7679a40b470502b028fca3b76c761108f
--- /dev/null
+++ b/tensorflow/tensorboard/components/tf_graph_d3v4/demo/BUILD
@@ -0,0 +1,26 @@
+package(default_visibility = ["//tensorflow:internal"])
+
+load("@io_bazel_rules_closure//closure:defs.bzl", "web_library")
+
+licenses(["notice"])  # Apache 2.0
+
+# bazel run //third_party/tensorflow/tensorboard/components/tf_graph/demo
+web_library(
+    name = "demo",
+    srcs = ["index.html"] + glob(["data/**"]),
+    path = "/tf-graph/demo",
+    deps = [
+        "//tensorflow/tensorboard/components/tf_graph",
+        "//tensorflow/tensorboard/components/tf_graph_common",
+        "//tensorflow/tensorboard/components/tf_graph_loader",
+        "@org_polymer_iron_demo_helpers",
+        "@org_polymer_paper_styles",
+        "@org_polymer_webcomponentsjs",
+    ],
+)
+
+filegroup(
+    name = "all_files",
+    srcs = glob(["**"]),
+    tags = ["notsan"],
+)
diff --git a/tensorflow/tensorboard/components/tf_graph_d3v4/demo/data/graph.pbtxt b/tensorflow/tensorboard/components/tf_graph_d3v4/demo/data/graph.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..30b206453469801d31b46856c29cdda78164f18f
--- /dev/null
+++ b/tensorflow/tensorboard/components/tf_graph_d3v4/demo/data/graph.pbtxt
@@ -0,0 +1,4606 @@
+node {
+  name: "GradientDescent/learning_rate"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_3"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: 0.1
+      }
+    }
+  }
+}
+node {
+  name: "gradients/add_grad/Shape_1"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 100
+      }
+    }
+  }
+}
+node {
+  name: "gradients/add_grad/Shape"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 2
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\310\000\000\000d\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "gradients/add_grad/BroadcastGradientArgs"
+  op: "BroadcastGradientArgs"
+  input: "gradients/add_grad/Shape"
+  input: "gradients/add_grad/Shape_1"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+        }
+        shape {
+          dim {
+            size: -1
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "gradients/add_1_grad/Shape_1"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 10
+      }
+    }
+  }
+}
+node {
+  name: "gradients/add_1_grad/Shape"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 2
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\310\000\000\000\n\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "gradients/add_1_grad/BroadcastGradientArgs"
+  op: "BroadcastGradientArgs"
+  input: "gradients/add_1_grad/Shape"
+  input: "gradients/add_1_grad/Shape_1"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+        }
+        shape {
+          dim {
+            size: -1
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "gradients/Reshape_1_grad/Shape"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 2
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\310\000\000\000\n\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "gradients/SoftmaxCrossEntropyWithLogits_grad/ExpandDims/dim"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: -1
+      }
+    }
+  }
+}
+node {
+  name: "gradients/Reshape_3_grad/Shape"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 200
+      }
+    }
+  }
+}
+node {
+  name: "gradients/Mean_grad/Maximum/y"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "gradients/Mean_grad/Const_1"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "gradients/Mean_grad/Const"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "gradients/Mean_grad/Shape_1"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "gradients/Mean_grad/Prod_1"
+  op: "Prod"
+  input: "gradients/Mean_grad/Shape_1"
+  input: "gradients/Mean_grad/Const_1"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+  attr {
+    key: "keep_dims"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "gradients/Mean_grad/Maximum"
+  op: "Maximum"
+  input: "gradients/Mean_grad/Prod_1"
+  input: "gradients/Mean_grad/Maximum/y"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+}
+node {
+  name: "gradients/Mean_grad/Shape"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 200
+      }
+    }
+  }
+}
+node {
+  name: "gradients/Mean_grad/Prod"
+  op: "Prod"
+  input: "gradients/Mean_grad/Shape"
+  input: "gradients/Mean_grad/Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+  attr {
+    key: "keep_dims"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "gradients/Mean_grad/floordiv"
+  op: "FloorDiv"
+  input: "gradients/Mean_grad/Prod"
+  input: "gradients/Mean_grad/Maximum"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+}
+node {
+  name: "gradients/Mean_grad/Cast"
+  op: "Cast"
+  input: "gradients/Mean_grad/floordiv"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "DstT"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "SrcT"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+}
+node {
+  name: "gradients/Mean_grad/Tile/multiples"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 200
+      }
+    }
+  }
+}
+node {
+  name: "gradients/Mean_grad/Reshape/shape"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "gradients/Const"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "gradients/Shape"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "gradients/Fill"
+  op: "Fill"
+  input: "gradients/Shape"
+  input: "gradients/Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+}
+node {
+  name: "gradients/Mean_grad/Reshape"
+  op: "Reshape"
+  input: "gradients/Fill"
+  input: "gradients/Mean_grad/Reshape/shape"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "gradients/Mean_grad/Tile"
+  op: "Tile"
+  input: "gradients/Mean_grad/Reshape"
+  input: "gradients/Mean_grad/Tile/multiples"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tmultiples"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "gradients/Mean_grad/truediv"
+  op: "RealDiv"
+  input: "gradients/Mean_grad/Tile"
+  input: "gradients/Mean_grad/Cast"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "gradients/Reshape_3_grad/Reshape"
+  op: "Reshape"
+  input: "gradients/Mean_grad/truediv"
+  input: "gradients/Reshape_3_grad/Shape"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "gradients/SoftmaxCrossEntropyWithLogits_grad/ExpandDims"
+  op: "ExpandDims"
+  input: "gradients/Reshape_3_grad/Reshape"
+  input: "gradients/SoftmaxCrossEntropyWithLogits_grad/ExpandDims/dim"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "Const"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "Slice_2/begin"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "Sub_2/y"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "concat_1/axis"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "concat_1/values_0"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: -1
+      }
+    }
+  }
+}
+node {
+  name: "Slice_1/size"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "Sub_1/y"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "Shape_2"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 2
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\310\000\000\000\n\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "Rank_2"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 2
+      }
+    }
+  }
+}
+node {
+  name: "Sub_1"
+  op: "Sub"
+  input: "Rank_2"
+  input: "Sub_1/y"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+}
+node {
+  name: "Slice_1/begin"
+  op: "Pack"
+  input: "Sub_1"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "N"
+    value {
+      i: 1
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "Slice_1"
+  op: "Slice"
+  input: "Shape_2"
+  input: "Slice_1/begin"
+  input: "Slice_1/size"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "concat_1"
+  op: "ConcatV2"
+  input: "concat_1/values_0"
+  input: "Slice_1"
+  input: "concat_1/axis"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 2
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "concat/axis"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "concat/values_0"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: -1
+      }
+    }
+  }
+}
+node {
+  name: "Slice/size"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "Sub/y"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "Shape_1"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 2
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\310\000\000\000\n\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "Rank_1"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 2
+      }
+    }
+  }
+}
+node {
+  name: "Sub"
+  op: "Sub"
+  input: "Rank_1"
+  input: "Sub/y"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+}
+node {
+  name: "Slice/begin"
+  op: "Pack"
+  input: "Sub"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "N"
+    value {
+      i: 1
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "Slice"
+  op: "Slice"
+  input: "Shape_1"
+  input: "Slice/begin"
+  input: "Slice/size"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "concat"
+  op: "ConcatV2"
+  input: "concat/values_0"
+  input: "Slice"
+  input: "concat/axis"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 2
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "Shape"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 2
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\310\000\000\000\n\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "Rank"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 2
+      }
+    }
+  }
+}
+node {
+  name: "Sub_2"
+  op: "Sub"
+  input: "Rank"
+  input: "Sub_2/y"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+}
+node {
+  name: "Slice_2/size"
+  op: "Pack"
+  input: "Sub_2"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "N"
+    value {
+      i: 1
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "Slice_2"
+  op: "Slice"
+  input: "Shape"
+  input: "Slice_2/begin"
+  input: "Slice_2/size"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "logits_biases"
+  op: "VariableV2"
+  device: "/job:localhost/replica:0/task:0/cpu:0"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@logits_biases"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 10
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "container"
+    value {
+      s: ""
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "shape"
+    value {
+      shape {
+        dim {
+          size: 10
+        }
+      }
+    }
+  }
+  attr {
+    key: "shared_name"
+    value {
+      s: ""
+    }
+  }
+}
+node {
+  name: "logits_biases/read"
+  op: "Identity"
+  input: "logits_biases"
+  device: "/job:localhost/replica:0/task:0/cpu:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@logits_biases"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 10
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "logits_weights"
+  op: "VariableV2"
+  device: "/job:localhost/replica:0/task:0/cpu:0"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@logits_weights"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 100
+          }
+          dim {
+            size: 10
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "container"
+    value {
+      s: ""
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "shape"
+    value {
+      shape {
+        dim {
+          size: 100
+        }
+        dim {
+          size: 10
+        }
+      }
+    }
+  }
+  attr {
+    key: "shared_name"
+    value {
+      s: ""
+    }
+  }
+}
+node {
+  name: "logits_weights/read"
+  op: "Identity"
+  input: "logits_weights"
+  device: "/job:localhost/replica:0/task:0/cpu:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@logits_weights"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 100
+          }
+          dim {
+            size: 10
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "hidden_biases"
+  op: "VariableV2"
+  device: "/job:localhost/replica:0/task:0/cpu:0"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@hidden_biases"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 100
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "container"
+    value {
+      s: ""
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "shape"
+    value {
+      shape {
+        dim {
+          size: 100
+        }
+      }
+    }
+  }
+  attr {
+    key: "shared_name"
+    value {
+      s: ""
+    }
+  }
+}
+node {
+  name: "hidden_biases/read"
+  op: "Identity"
+  input: "hidden_biases"
+  device: "/job:localhost/replica:0/task:0/cpu:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@hidden_biases"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 100
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "hidden_weights"
+  op: "VariableV2"
+  device: "/job:localhost/replica:0/task:0/cpu:0"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@hidden_weights"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 784
+          }
+          dim {
+            size: 100
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "container"
+    value {
+      s: ""
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "shape"
+    value {
+      shape {
+        dim {
+          size: 784
+        }
+        dim {
+          size: 100
+        }
+      }
+    }
+  }
+  attr {
+    key: "shared_name"
+    value {
+      s: ""
+    }
+  }
+}
+node {
+  name: "hidden_weights/read"
+  op: "Identity"
+  input: "hidden_weights"
+  device: "/job:localhost/replica:0/task:0/cpu:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@hidden_weights"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 784
+          }
+          dim {
+            size: 100
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "Reshape/shape"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/cpu:0"
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 2
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\310\000\000\000\377\377\377\377"
+      }
+    }
+  }
+}
+node {
+  name: "mnist_dataset_train_2/one_hot/depth"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/cpu:0"
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 10
+      }
+    }
+  }
+}
+node {
+  name: "mnist_dataset_train_2/one_hot/off_value"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/cpu:0"
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "mnist_dataset_train_2/one_hot/on_value"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/cpu:0"
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "mnist_dataset_train_2/random_shuffle_queue_DequeueMany/n"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/cpu:0"
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 200
+      }
+    }
+  }
+}
+node {
+  name: "mnist_dataset_train_1/random_shuffle_queue"
+  op: "RandomShuffleQueueV2"
+  device: "/job:localhost/replica:0/task:0/cpu:0"
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+  attr {
+    key: "capacity"
+    value {
+      i: 20000
+    }
+  }
+  attr {
+    key: "component_types"
+    value {
+      list {
+        type: DT_FLOAT
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    key: "container"
+    value {
+      s: ""
+    }
+  }
+  attr {
+    key: "min_after_dequeue"
+    value {
+      i: 4000
+    }
+  }
+  attr {
+    key: "seed"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "seed2"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 28
+          }
+          dim {
+            size: 28
+          }
+          dim {
+            size: 1
+          }
+        }
+        shape {
+        }
+      }
+    }
+  }
+  attr {
+    key: "shared_name"
+    value {
+      s: ""
+    }
+  }
+}
+node {
+  name: "mnist_dataset_train_2/random_shuffle_queue_DequeueMany"
+  op: "QueueDequeueManyV2"
+  input: "mnist_dataset_train_1/random_shuffle_queue"
+  input: "mnist_dataset_train_2/random_shuffle_queue_DequeueMany/n"
+  device: "/job:localhost/replica:0/task:0/cpu:0"
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          unknown_rank: true
+        }
+        shape {
+          unknown_rank: true
+        }
+      }
+    }
+  }
+  attr {
+    key: "component_types"
+    value {
+      list {
+        type: DT_FLOAT
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    key: "timeout_ms"
+    value {
+      i: -1
+    }
+  }
+}
+node {
+  name: "Reshape"
+  op: "Reshape"
+  input: "mnist_dataset_train_2/random_shuffle_queue_DequeueMany"
+  input: "Reshape/shape"
+  device: "/job:localhost/replica:0/task:0/cpu:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+          dim {
+            size: -1
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "MatMul"
+  op: "MatMul"
+  input: "Reshape"
+  input: "hidden_weights/read"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+          dim {
+            size: 100
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "transpose_a"
+    value {
+      b: false
+    }
+  }
+  attr {
+    key: "transpose_b"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "add"
+  op: "Add"
+  input: "MatMul"
+  input: "hidden_biases/read"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+          dim {
+            size: 100
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "Relu"
+  op: "Relu"
+  input: "add"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+          dim {
+            size: 100
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "MatMul_1"
+  op: "MatMul"
+  input: "Relu"
+  input: "logits_weights/read"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+          dim {
+            size: 10
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "transpose_a"
+    value {
+      b: false
+    }
+  }
+  attr {
+    key: "transpose_b"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "add_1"
+  op: "Add"
+  input: "MatMul_1"
+  input: "logits_biases/read"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+          dim {
+            size: 10
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "Reshape_1"
+  op: "Reshape"
+  input: "add_1"
+  input: "concat"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+          dim {
+            size: 10
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "mnist_dataset_train_2/one_hot"
+  op: "OneHot"
+  input: "mnist_dataset_train_2/random_shuffle_queue_DequeueMany:1"
+  input: "mnist_dataset_train_2/one_hot/depth"
+  input: "mnist_dataset_train_2/one_hot/on_value"
+  input: "mnist_dataset_train_2/one_hot/off_value"
+  device: "/job:localhost/replica:0/task:0/cpu:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "TI"
+    value {
+      type: DT_INT64
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          unknown_rank: true
+        }
+      }
+    }
+  }
+  attr {
+    key: "axis"
+    value {
+      i: -1
+    }
+  }
+}
+node {
+  name: "Reshape_2"
+  op: "Reshape"
+  input: "mnist_dataset_train_2/one_hot"
+  input: "concat_1"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 10
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "SoftmaxCrossEntropyWithLogits"
+  op: "SoftmaxCrossEntropyWithLogits"
+  input: "Reshape_1"
+  input: "Reshape_2"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+        }
+        shape {
+          dim {
+            size: 200
+          }
+          dim {
+            size: 10
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "gradients/SoftmaxCrossEntropyWithLogits_grad/PreventGradient"
+  op: "PreventGradient"
+  input: "SoftmaxCrossEntropyWithLogits:1"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+          dim {
+            size: 10
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "message"
+    value {
+      s: "Currently there is no way to take the second derivative of softmax_cross_entropy_with_logits due to the fused  implementation\'s interaction with tf.gradients()"
+    }
+  }
+}
+node {
+  name: "gradients/SoftmaxCrossEntropyWithLogits_grad/mul"
+  op: "Mul"
+  input: "gradients/SoftmaxCrossEntropyWithLogits_grad/ExpandDims"
+  input: "gradients/SoftmaxCrossEntropyWithLogits_grad/PreventGradient"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+          dim {
+            size: 10
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "gradients/Reshape_1_grad/Reshape"
+  op: "Reshape"
+  input: "gradients/SoftmaxCrossEntropyWithLogits_grad/mul"
+  input: "gradients/Reshape_1_grad/Shape"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+          dim {
+            size: 10
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "gradients/add_1_grad/Sum_1"
+  op: "Sum"
+  input: "gradients/Reshape_1_grad/Reshape"
+  input: "gradients/add_1_grad/BroadcastGradientArgs:1"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 10
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "keep_dims"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "gradients/add_1_grad/Reshape_1"
+  op: "Reshape"
+  input: "gradients/add_1_grad/Sum_1"
+  input: "gradients/add_1_grad/Shape_1"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 10
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "gradients/add_1_grad/Sum"
+  op: "Sum"
+  input: "gradients/Reshape_1_grad/Reshape"
+  input: "gradients/add_1_grad/BroadcastGradientArgs"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+          dim {
+            size: 10
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "keep_dims"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "gradients/add_1_grad/Reshape"
+  op: "Reshape"
+  input: "gradients/add_1_grad/Sum"
+  input: "gradients/add_1_grad/Shape"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+          dim {
+            size: 10
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "gradients/add_1_grad/tuple/group_deps"
+  op: "NoOp"
+  input: "^gradients/add_1_grad/Reshape"
+  input: "^gradients/add_1_grad/Reshape_1"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+      }
+    }
+  }
+}
+node {
+  name: "gradients/add_1_grad/tuple/control_dependency_1"
+  op: "Identity"
+  input: "gradients/add_1_grad/Reshape_1"
+  input: "^gradients/add_1_grad/tuple/group_deps"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@gradients/add_1_grad/Reshape_1"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 10
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "GradientDescent/update_logits_biases/ApplyGradientDescent"
+  op: "ApplyGradientDescent"
+  input: "logits_biases"
+  input: "GradientDescent/learning_rate"
+  input: "gradients/add_1_grad/tuple/control_dependency_1"
+  device: "/job:localhost/replica:0/task:0/cpu:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@logits_biases"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 10
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "use_locking"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "gradients/add_1_grad/tuple/control_dependency"
+  op: "Identity"
+  input: "gradients/add_1_grad/Reshape"
+  input: "^gradients/add_1_grad/tuple/group_deps"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@gradients/add_1_grad/Reshape"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+          dim {
+            size: 10
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "gradients/MatMul_1_grad/MatMul_1"
+  op: "MatMul"
+  input: "Relu"
+  input: "gradients/add_1_grad/tuple/control_dependency"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 100
+          }
+          dim {
+            size: 10
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "transpose_a"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "transpose_b"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "gradients/MatMul_1_grad/MatMul"
+  op: "MatMul"
+  input: "gradients/add_1_grad/tuple/control_dependency"
+  input: "logits_weights/read"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+          dim {
+            size: 100
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "transpose_a"
+    value {
+      b: false
+    }
+  }
+  attr {
+    key: "transpose_b"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "gradients/MatMul_1_grad/tuple/group_deps"
+  op: "NoOp"
+  input: "^gradients/MatMul_1_grad/MatMul"
+  input: "^gradients/MatMul_1_grad/MatMul_1"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+      }
+    }
+  }
+}
+node {
+  name: "gradients/MatMul_1_grad/tuple/control_dependency_1"
+  op: "Identity"
+  input: "gradients/MatMul_1_grad/MatMul_1"
+  input: "^gradients/MatMul_1_grad/tuple/group_deps"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@gradients/MatMul_1_grad/MatMul_1"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 100
+          }
+          dim {
+            size: 10
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "GradientDescent/update_logits_weights/ApplyGradientDescent"
+  op: "ApplyGradientDescent"
+  input: "logits_weights"
+  input: "GradientDescent/learning_rate"
+  input: "gradients/MatMul_1_grad/tuple/control_dependency_1"
+  device: "/job:localhost/replica:0/task:0/cpu:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@logits_weights"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 100
+          }
+          dim {
+            size: 10
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "use_locking"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "gradients/MatMul_1_grad/tuple/control_dependency"
+  op: "Identity"
+  input: "gradients/MatMul_1_grad/MatMul"
+  input: "^gradients/MatMul_1_grad/tuple/group_deps"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@gradients/MatMul_1_grad/MatMul"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+          dim {
+            size: 100
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "gradients/Relu_grad/ReluGrad"
+  op: "ReluGrad"
+  input: "gradients/MatMul_1_grad/tuple/control_dependency"
+  input: "Relu"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+          dim {
+            size: 100
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "gradients/add_grad/Sum_1"
+  op: "Sum"
+  input: "gradients/Relu_grad/ReluGrad"
+  input: "gradients/add_grad/BroadcastGradientArgs:1"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 100
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "keep_dims"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "gradients/add_grad/Reshape_1"
+  op: "Reshape"
+  input: "gradients/add_grad/Sum_1"
+  input: "gradients/add_grad/Shape_1"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 100
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "gradients/add_grad/Sum"
+  op: "Sum"
+  input: "gradients/Relu_grad/ReluGrad"
+  input: "gradients/add_grad/BroadcastGradientArgs"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+          dim {
+            size: 100
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "keep_dims"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "gradients/add_grad/Reshape"
+  op: "Reshape"
+  input: "gradients/add_grad/Sum"
+  input: "gradients/add_grad/Shape"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+          dim {
+            size: 100
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "gradients/add_grad/tuple/group_deps"
+  op: "NoOp"
+  input: "^gradients/add_grad/Reshape"
+  input: "^gradients/add_grad/Reshape_1"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+      }
+    }
+  }
+}
+node {
+  name: "gradients/add_grad/tuple/control_dependency_1"
+  op: "Identity"
+  input: "gradients/add_grad/Reshape_1"
+  input: "^gradients/add_grad/tuple/group_deps"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@gradients/add_grad/Reshape_1"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 100
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "GradientDescent/update_hidden_biases/ApplyGradientDescent"
+  op: "ApplyGradientDescent"
+  input: "hidden_biases"
+  input: "GradientDescent/learning_rate"
+  input: "gradients/add_grad/tuple/control_dependency_1"
+  device: "/job:localhost/replica:0/task:0/cpu:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@hidden_biases"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 100
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "use_locking"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "gradients/add_grad/tuple/control_dependency"
+  op: "Identity"
+  input: "gradients/add_grad/Reshape"
+  input: "^gradients/add_grad/tuple/group_deps"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@gradients/add_grad/Reshape"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+          dim {
+            size: 100
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "gradients/MatMul_grad/MatMul_1"
+  op: "MatMul"
+  input: "Reshape"
+  input: "gradients/add_grad/tuple/control_dependency"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 100
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "transpose_a"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "transpose_b"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "gradients/MatMul_grad/MatMul"
+  op: "MatMul"
+  input: "gradients/add_grad/tuple/control_dependency"
+  input: "hidden_weights/read"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+          dim {
+            size: 784
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "transpose_a"
+    value {
+      b: false
+    }
+  }
+  attr {
+    key: "transpose_b"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "gradients/MatMul_grad/tuple/group_deps"
+  op: "NoOp"
+  input: "^gradients/MatMul_grad/MatMul"
+  input: "^gradients/MatMul_grad/MatMul_1"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+      }
+    }
+  }
+}
+node {
+  name: "gradients/MatMul_grad/tuple/control_dependency_1"
+  op: "Identity"
+  input: "gradients/MatMul_grad/MatMul_1"
+  input: "^gradients/MatMul_grad/tuple/group_deps"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@gradients/MatMul_grad/MatMul_1"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 100
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "GradientDescent/update_hidden_weights/ApplyGradientDescent"
+  op: "ApplyGradientDescent"
+  input: "hidden_weights"
+  input: "GradientDescent/learning_rate"
+  input: "gradients/MatMul_grad/tuple/control_dependency_1"
+  device: "/job:localhost/replica:0/task:0/cpu:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@hidden_weights"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 784
+          }
+          dim {
+            size: 100
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "use_locking"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "GradientDescent"
+  op: "NoOp"
+  input: "^GradientDescent/update_hidden_weights/ApplyGradientDescent"
+  input: "^GradientDescent/update_hidden_biases/ApplyGradientDescent"
+  input: "^GradientDescent/update_logits_weights/ApplyGradientDescent"
+  input: "^GradientDescent/update_logits_biases/ApplyGradientDescent"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_2"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+      }
+    }
+  }
+}
+node {
+  name: "Reshape_3"
+  op: "Reshape"
+  input: "SoftmaxCrossEntropyWithLogits"
+  input: "Slice_2"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "Mean"
+  op: "Mean"
+  input: "Reshape_3"
+  input: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+  attr {
+    key: "keep_dims"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "_send_Mean_0"
+  op: "_Send"
+  input: "Mean"
+  device: "/job:localhost/replica:0/task:0/cpu:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "client_terminated"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "recv_device"
+    value {
+      s: "/job:localhost/replica:0/task:0/cpu:0"
+    }
+  }
+  attr {
+    key: "send_device"
+    value {
+      s: "/job:localhost/replica:0/task:0/cpu:0"
+    }
+  }
+  attr {
+    key: "send_device_incarnation"
+    value {
+      i: -5924635994370253548
+    }
+  }
+  attr {
+    key: "tensor_name"
+    value {
+      s: "Mean:0"
+    }
+  }
+}
+library {
+}
+versions {
+  producer: 21
+}
diff --git a/tensorflow/tensorboard/components/tf_graph_d3v4/demo/index.html b/tensorflow/tensorboard/components/tf_graph_d3v4/demo/index.html
new file mode 100644
index 0000000000000000000000000000000000000000..52e2f0b9340950ed5f873cba17c8bbf2aee62e6a
--- /dev/null
+++ b/tensorflow/tensorboard/components/tf_graph_d3v4/demo/index.html
@@ -0,0 +1,92 @@
+<!doctype html>
+<!--
+@license
+Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+-->
+<script src="../../webcomponentsjs/webcomponents-lite.min.js"></script>
+<link rel="import" href="../tf-graph.html">
+<link rel="import" href="../../tf-graph-common/tf-graph-common.html">
+<link rel="import" href="../../tf-graph-loader/tf-graph-loader.html">
+<link rel="import" href="../../iron-demo-helpers/demo-snippet.html">
+<title>TF Graph Demo</title>
+<style>
+  #demo-container {
+    border: 2px solid #808080;
+    width: 1000px;
+    height: 600px;
+  }
+</style>
+<demo-snippet>
+  <template>
+    <dom-module id="tf-graph-demo">
+      <template>
+        <!-- We first use the graph loader to load and parse a pbtxt file into a graph object. -->
+        <tf-graph-loader
+            id="loader"
+            datasets="[[_datasets]]"
+            selected-dataset="[[_selectedDataset]]"
+            out-graph="{{_graph}}">
+        </tf-graph-loader>
+
+        <!-- We color ops in the graph by XLA cluster. -->
+        <tf-graph id="graph" color-by="xla_cluster"></tf-graph>
+      </template>
+      <script>
+        "use strict";
+
+        Polymer({
+          is: "tf-graph-demo",
+          properties: {
+            // We tell the graph loader to load a specific pbtxt file.
+            _datasets: {
+              type: Array,
+              value: [{
+                "name": "Graph with XLA Clusters Specified",
+                "path": "data/graph.pbtxt"
+              }],
+            },
+            _selectedDataset: {
+              type: Number,
+              value: 0,
+            },
+
+            // This property will be updated by the graph loader.
+            _graph: {
+              type: Object,
+            },
+          },
+          observers: [
+            '_graphUpdated(_graph)',
+          ],
+          _graphUpdated: function(slimGraph) {
+            const tracker = tf.graph.util.getTracker(this.$.loader);
+            const hierarchyTracker = tf.graph.util.getSubtaskTracker(
+                tracker, 100, 'Namespace hierarchy');
+            const hierarchyOptions = {};
+            tf.graph.hierarchy.build(slimGraph, hierarchyOptions, hierarchyTracker).then(
+                function(graphHierarchy) {
+              // We have parsed and built the graph object from a pbtxt file. Render the graph.
+              this.$.graph.set('basicGraph', slimGraph);
+              this.$.graph.set('graphHierarchy', graphHierarchy);
+            }.bind(this));
+          },
+        });
+      </script>
+    </dom-module>
+    <div id='demo-container'>
+      <tf-graph-demo></tf-graph-demo>
+    </div>
+  </template>
+</demo-snippet>
diff --git a/tensorflow/tensorboard/components/tf_graph_d3v4/tf-graph-minimap.html b/tensorflow/tensorboard/components/tf_graph_d3v4/tf-graph-minimap.html
new file mode 100644
index 0000000000000000000000000000000000000000..5fc16c05207fd082336717a6da2563e9eafc3985
--- /dev/null
+++ b/tensorflow/tensorboard/components/tf_graph_d3v4/tf-graph-minimap.html
@@ -0,0 +1,88 @@
+<!--
+@license
+Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+-->
+
+<link rel="import" href="../polymer/polymer.html">
+<script src="../tf-graph-common/minimap.js"></script>
+
+<dom-module id="tf-graph-minimap">
+<template>
+<style>
+:host {
+  background-color:white;
+  transition: opacity .3s linear;
+  pointer-events: auto;
+}
+
+:host.hidden {
+  opacity: 0;
+  pointer-events: none;
+}
+
+canvas {
+  border: 1px solid #999;
+}
+
+rect {
+  fill: white;
+  stroke: #111111;
+  stroke-width: 1px;
+  fill-opacity: 0;
+  filter: url(#minimapDropShadow);
+  cursor: move;
+}
+
+svg {
+  position: absolute;
+}
+</style>
+<svg>
+  <defs>
+    <filter id="minimapDropShadow" x="-20%" y="-20%" width="150%" height="150%">
+      <feOffset result="offOut" in="SourceGraphic" dx="1" dy="1"></feOffset>
+      <feColorMatrix result="matrixOut" in="offOut" type="matrix" values="0.1 0 0 0 0 0 0.1 0 0 0 0 0 0.1 0 0 0 0 0 0.5 0"></feColorMatrix>
+      <feGaussianBlur result="blurOut" in="matrixOut" stdDeviation="2"></feGaussianBlur>
+      <feBlend in="SourceGraphic" in2="blurOut" mode="normal"></feBlend>
+    </filter>
+  </defs>
+  <rect></rect>
+</svg>
+<canvas class="first"></canvas>
+<!-- Additional canvas to use as buffer to avoid flickering between updates -->
+<canvas class="second"></canvas>
+<canvas class="download"></canvas>
+</template>
+<script>
+Polymer({
+  is: 'tf-graph-minimap',
+
+  /**
+   * Initializes the minimap and returns a minimap object to notify when
+   * things update.
+   *
+   * @param svg The main svg element.
+   * @param zoomG The svg group used for panning and zooming the main svg.
+   * @param mainZoom The main zoom behavior.
+   * @param maxWandH The maximum width/height for the minimap.
+   * @param labelPadding Padding in pixels due to the main graph labels.
+   */
+  init: function(svg, zoomG, mainZoom, maxWAndH, labelPadding) {
+    return new tf.scene.Minimap(svg, zoomG, mainZoom, this, maxWAndH,
+        labelPadding);
+  }
+});
+</script>
+</dom-module>
diff --git a/tensorflow/tensorboard/components/tf_graph_d3v4/tf-graph-scene.html b/tensorflow/tensorboard/components/tf_graph_d3v4/tf-graph-scene.html
new file mode 100644
index 0000000000000000000000000000000000000000..10a65f54d52035d5d4efdd1e0b24f95e0c14c207
--- /dev/null
+++ b/tensorflow/tensorboard/components/tf_graph_d3v4/tf-graph-scene.html
@@ -0,0 +1,1052 @@
+<!--
+@license
+Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+-->
+
+<link rel="import" href="../polymer/polymer.html">
+<link rel="import" href="../tf-dashboard-common/tensorboard-color.html">
+<link rel="import" href="tf-graph-minimap.html">
+
+<!--
+  A module that takes a render hierarchy as input and produces an SVG DOM using
+  dagre and d3.
+-->
+<dom-module id="tf-graph-scene">
+<template>
+<style>
+:host {
+  display: flex;
+  width: 100%;
+  font-size: 20px;
+}
+
+::content #svg {
+  overflow: hidden;
+  flex: 1;
+  height: 100%;
+  width: 100%;
+}
+
+::content #hidden {
+  position: fixed;
+  top: 0px;
+  visibility: hidden;
+}
+
+/* --- Node and annotation-node for Metanode --- */
+
+::content .meta > .nodeshape > rect,
+::content .meta > .annotation-node > rect {
+  cursor: pointer;
+  fill: hsl(0, 0%, 70%);
+}
+
+::content .node.meta.highlighted > .nodeshape > rect,
+::content .node.meta.highlighted > .annotation-node > rect {
+  stroke-width: 2;
+}
+
+::content .annotation.meta.highlighted > .nodeshape > rect,
+::content .annotation.meta.highlighted > .annotation-node > rect {
+  stroke-width: 1;
+}
+
+::content .meta.selected > .nodeshape > rect,
+::content .meta.selected > .annotation-node > rect {
+  stroke: red;
+  stroke-width: 2;
+}
+
+::content .node.meta.selected.expanded > .nodeshape > rect,
+::content .node.meta.selected.expanded > .annotation-node > rect {
+  stroke: red;
+  stroke-width: 3;
+}
+
+::content .annotation.meta.selected > .nodeshape > rect,
+::content .annotation.meta.selected > .annotation-node > rect {
+  stroke: red;
+  stroke-width: 2;
+}
+
+::content .node.meta.selected.expanded.highlighted > .nodeshape > rect,
+::content .node.meta.selected.expanded.highlighted > .annotation-node > rect {
+  stroke: red;
+  stroke-width: 4;
+}
+
+::content .faded,
+::content .faded rect,
+::content .faded ellipse,
+::content .faded path,
+::content .faded use,
+::content #rectHatch line,
+::content #ellipseHatch line {
+  color: #e0d4b3 !important;
+  fill: white;
+  stroke: #e0d4b3 !important;
+}
+
+
+::content .faded path {
+  stroke-width: 1px !important;
+}
+
+::content .faded rect {
+  fill: url(#rectHatch) !important;
+}
+
+::content .faded ellipse,
+::content .faded use {
+  fill: url(#ellipseHatch) !important;
+}
+
+::content .faded text {
+  opacity: 0;
+}
+
+/* Rules used for input-tracing. */
+::content .input-highlight > * > rect,
+::content .input-highlight > * > ellipse,
+::content .input-highlight > * > use
+{
+  fill: white;
+  stroke: #ff9800 !important;
+}
+
+/*  - Faded non-input styling */
+::content .non-input > * > rect,
+::content .non-input > * > ellipse,
+::content .non-input > * > use,
+/* For Const nodes. */
+::content .non-input > * > .constant:not([class*="input-highlight"]) >
+  .annotation-node > ellipse,
+/* For styling of annotation nodes of non-input nodes. */
+::content .non-input > g > .annotation > .annotation-node > rect {
+  stroke: #e0d4b3 !important;
+  stroke-width: inherit;
+  stroke-dasharray: inherit;
+}
+
+
+::content .non-input path {
+  visibility: hidden;
+}
+
+::content .non-input > .nodeshape > rect,
+::content .non-input > .annotation-node > rect,
+/* For styling of annotation nodes of non-input nodes. */
+::content .non-input > g > .annotation > .annotation-node > rect
+{
+  fill: url(#rectHatch) !important;
+}
+
+::content .non-input ellipse,
+::content .non-input use {
+  fill: url(#ellipseHatch) !important;
+}
+
+::content .non-input > text {
+  opacity: 0;
+}
+
+::content .non-input .annotation > .annotation-edge {
+  marker-end: url(#annotation-arrowhead-faded);
+}
+
+::content .non-input .annotation > .annotation-edge.refline {
+  marker-start: url(#ref-annotation-arrowhead-faded);
+}
+
+/* Input edges. */
+::content .input-edge-highlight > text {
+  fill: black !important;
+}
+::content .input-edge-highlight > path,
+::content .input-highlight > .in-annotations > .annotation > .annotation-edge,
+::content .input-highlight-selected > .in-annotations > .annotation >
+.annotation-edge {
+  stroke: #999 !important;
+}
+
+/* Non-input edges. */
+::content .non-input-edge-highlight,
+::content .non-input > g > .annotation > path,
+/* Annotation styles (label and edges respectively). */
+::content .non-input > g >
+.annotation:not(.input-highlight):not(.input-highlight-selected) >
+.annotation-label
+/*.annotation-edge*/
+{
+  visibility: hidden;
+}
+
+/* --- Op Node --- */
+
+::content .op > .nodeshape > ellipse,
+::content .op > .annotation-node > ellipse {
+  cursor: pointer;
+  fill: #fff;
+  stroke: #ccc;
+}
+
+::content .op.selected > .nodeshape > ellipse,
+::content .op.selected > .annotation-node > ellipse {
+  stroke: red;
+  stroke-width: 2;
+}
+
+::content .op.highlighted > .nodeshape > ellipse,
+::content .op.highlighted > .annotation-node > ellipse {
+  stroke-width: 2;
+}
+
+/* --- Series Node --- */
+
+/* By default, don't show the series background <rect>. */
+::content .series > .nodeshape > rect {
+  fill: hsl(0, 0%, 70%);
+  fill-opacity: 0;
+  stroke-dasharray: 5, 5;
+  stroke-opacity: 0;
+  cursor: pointer;
+}
+
+/* Once expanded, show the series background <rect> and hide the <use>. */
+::content .series.expanded > .nodeshape > rect {
+  fill-opacity: 0.15;
+  stroke: hsl(0, 0%, 70%);
+  stroke-opacity: 1;
+}
+::content .series.expanded > .nodeshape > use {
+  visibility: hidden;
+}
+
+/**
+ * TODO(jimbo): Simplify this by applying a stable class name to all <g>
+ * elements that currently have either the nodeshape or annotation-node classes.
+ */
+::content .series > .nodeshape > use ,
+::content .series > .annotation-node > use {
+  stroke: #ccc;
+}
+::content .series.highlighted > .nodeshape > use ,
+::content .series.highlighted > .annotation-node > use {
+  stroke-width: 2;
+}
+::content .series.selected > .nodeshape > use ,
+::content .series.selected > .annotation-node > use {
+  stroke: red;
+  stroke-width: 2;
+}
+
+::content .series.selected > .nodeshape > rect {
+  stroke: red;
+  stroke-width: 2;
+}
+
+::content .annotation.series.selected > .annotation-node > use {
+  stroke: red;
+  stroke-width: 2;
+}
+
+/* --- Bridge Node --- */
+::content .bridge > .nodeshape > rect {
+  stroke: #f0f;
+  opacity: 0.2;
+  display: none;
+}
+
+/* --- Structural Elements --- */
+::content .edge > path.edgeline.structural {
+  stroke: #f0f;
+  opacity: 0.2;
+  display: none;
+}
+
+/* --- Series Nodes --- */
+
+/* Hide the rect for a series' annotation. */
+::content .series > .annotation-node > rect {
+  display: none;
+}
+
+/* --- Node label --- */
+
+
+::content .node > text.nodelabel {
+  cursor: pointer;
+  fill: #444;
+}
+
+::content .meta.expanded > text.nodelabel {
+  font-size: 9px;
+}
+
+::content .series > text.nodelabel {
+  font-size: 8px;
+}
+
+::content .op > text.nodelabel {
+  font-size: 6px;
+}
+
+::content .bridge > text.nodelabel {
+  display: none;
+}
+
+::content .node.meta.expanded > text.nodelabel{
+  cursor: normal;
+}
+
+::content .annotation.meta.highlighted > text.annotation-label {
+  fill: #50A3F7;
+}
+
+::content .annotation.meta.selected > text.annotation-label {
+  fill: #4285F4;
+}
+
+/* --- Annotation --- */
+
+/* only applied for annotations that are not summary or constant.
+(.summary, .constant gets overriden below) */
+::content .annotation > .annotation-node > * {
+  stroke-width: 0.5;
+  stroke-dasharray: 1, 1;
+}
+
+::content .annotation.summary > .annotation-node > *,
+::content .annotation.constant > .annotation-node > * {
+  stroke-width: 1;
+  stroke-dasharray: none;
+}
+
+::content .annotation > .annotation-edge {
+  fill: none;
+  stroke: #aaa;
+  stroke-width: 0.5;
+  marker-end: url(#annotation-arrowhead);
+}
+
+::content .faded .annotation > .annotation-edge {
+  marker-end: url(#annotation-arrowhead-faded);
+}
+
+::content .annotation > .annotation-edge.refline {
+  marker-start: url(#ref-annotation-arrowhead);
+}
+
+::content .faded .annotation > .annotation-edge.refline {
+  marker-start: url(#ref-annotation-arrowhead-faded);
+}
+
+::content .annotation > .annotation-control-edge {
+  stroke-dasharray: 1, 1;
+}
+
+::content #annotation-arrowhead {
+  fill: #aaa;
+}
+
+::content #annotation-arrowhead-faded {
+  fill: #e0d4b3;
+}
+
+::content #ref-annotation-arrowhead {
+  fill: #aaa;
+}
+
+::content #ref-annotation-arrowhead-faded {
+  fill: #e0d4b3;
+}
+
+::content .annotation > .annotation-label {
+  font-size: 5px;
+  cursor: pointer;
+}
+::content .annotation > .annotation-label.annotation-ellipsis {
+  cursor: default;
+}
+
+/* Hide annotations on expanded meta nodes since they're redundant. */
+::content .expanded > .in-annotations,
+::content .expanded > .out-annotations {
+  display: none;
+}
+
+/* --- Annotation: Constant --- */
+
+::content .constant > .annotation-node > ellipse {
+  cursor: pointer;
+  fill: white;
+  stroke: #848484;
+}
+
+::content .constant.selected > .annotation-node > ellipse {
+  fill: white;
+  stroke: red;
+}
+
+::content .constant.highlighted > .annotation-node > ellipse {
+  stroke-width: 1.5;
+}
+
+/* --- Annotation: Summary --- */
+
+::content .summary > .annotation-node > ellipse {
+  cursor: pointer;
+  fill: #DB4437;
+  stroke: #DB4437;
+}
+
+::content .summary.selected > .annotation-node > ellipse {
+  fill: #A52714;
+  stroke: #A52714;
+}
+
+::content .summary.highlighted > .annotation-node > ellipse {
+  stroke-width: 1.5;
+}
+
+/* --- Edge --- */
+
+::content .edge > path.edgeline {
+  fill: none;
+  stroke: #bbb;
+  stroke-linecap: round;
+  stroke-width: 0.75;
+}
+
+/* Labels showing tensor shapes on edges */
+::content .edge > text {
+  font-size: 3.5px;
+  fill: #666;
+}
+
+::content .ref-arrowhead {
+  fill: #bbb;
+}
+
+::content .edge .control-dep {
+  stroke-dasharray: 2, 2;
+}
+
+/* --- Group node expand/collapse button --- */
+
+/* Hides expand/collapse buttons when a node isn't expanded or highlighted. Using
+   incredibly small opacity so that the bounding box of the <g> parent still takes
+   this container into account even when it isn't visible */
+::content .node:not(.highlighted):not(.expanded) > .nodeshape > .buttoncontainer {
+  opacity: 0.01;
+}
+::content .node.highlighted > .nodeshape > .buttoncontainer {
+  cursor: pointer;
+}
+::content .buttoncircle {
+  fill: #E7811D;
+}
+::content .buttoncircle:hover {
+  fill: #B96717;
+}
+::content .expandbutton,
+::content .collapsebutton {
+  stroke: white;
+}
+/* Do not let the path elements in the button take pointer focus */
+::content .node > .nodeshape > .buttoncontainer > .expandbutton,
+::content .node > .nodeshape > .buttoncontainer > .collapsebutton {
+  pointer-events: none;
+}
+/* Only show the expand button when a node is collapsed and only show the
+   collapse button when a node is expanded. */
+::content .node.expanded > .nodeshape > .buttoncontainer > .expandbutton {
+  display: none;
+}
+::content .node:not(.expanded) > .nodeshape > .buttoncontainer > .collapsebutton {
+  display: none;
+}
+
+::content .health-pill-stats {
+  font-size: 4px;
+  text-anchor: middle;
+}
+
+::content .health-pill rect {
+  filter: url(#health-pill-shadow);
+  rx: 3;
+  ry: 3;
+}
+
+.titleContainer {
+  position: relative;
+  top: 20px;
+}
+
+.title {
+  position: absolute;
+}
+
+.auxTitle {
+  position: absolute;
+}
+
+#minimap {
+  position: absolute;
+  right: 20px;
+  bottom: 20px;
+}
+</style>
+<div class="titleContainer">
+  <div id="title" class="title">Main Graph</div>
+  <div id="auxTitle" class="auxTitle">Auxiliary Nodes</div>
+</div>
+<svg id="svg">
+  <defs>
+
+    <!-- Arrow heads for edge paths of different predefined sizes. -->
+    <path id="ref-arrowhead-path" d="M 10,0 L 0,5 L 10,10 C 7,7 7,3 10,0"/>
+    <marker class="ref-arrowhead" id="ref-arrowhead-small" viewBox="0 0 10 10" markerWidth="10" markerHeight="10"
+      refX="8" refY="5" orient="auto" markerUnits="userSpaceOnUse">
+      <use xlink:href="#ref-arrowhead-path" />
+    </marker>
+    <marker class="ref-arrowhead" id="ref-arrowhead-medium" viewBox="0 0 10 10" markerWidth="13" markerHeight="13"
+        refX="8" refY="5" orient="auto" markerUnits="userSpaceOnUse">
+      <use xlink:href="#ref-arrowhead-path" />
+    </marker>
+    <marker class="ref-arrowhead" id="ref-arrowhead-large" viewBox="0 0 10 10" markerWidth="16" markerHeight="16"
+        refX="8" refY="5" orient="auto" markerUnits="userSpaceOnUse">
+      <use xlink:href="#ref-arrowhead-path" />
+    </marker>
+    <marker class="ref-arrowhead" id="ref-arrowhead-xlarge" viewBox="0 0 10 10" markerWidth="20" markerHeight="20"
+        refX="8" refY="5" orient="auto" markerUnits="userSpaceOnUse">
+      <use xlink:href="#ref-arrowhead-path" />
+    </marker>
+
+    <!-- Arrow head for annotation edge paths. -->
+    <marker id="annotation-arrowhead" markerWidth="5" markerHeight="5"
+      refX="5" refY="2.5" orient="auto">
+      <path d="M 0,0 L 5,2.5 L 0,5 L 0,0"/>
+    </marker>
+    <marker id="annotation-arrowhead-faded" markerWidth="5" markerHeight="5"
+      refX="5" refY="2.5" orient="auto">
+      <path d="M 0,0 L 5,2.5 L 0,5 L 0,0"/>
+    </marker>
+    <marker id="ref-annotation-arrowhead" markerWidth="5" markerHeight="5"
+      refX="0" refY="2.5" orient="auto">
+      <path d="M 5,0 L 0,2.5 L 5,5 L 5,0"/>
+    </marker>
+    <marker id="ref-annotation-arrowhead-faded" markerWidth="5" markerHeight="5"
+      refX="0" refY="2.5" orient="auto">
+      <path d="M 5,0 L 0,2.5 L 5,5 L 5,0"/>
+    </marker>
+    <!-- Template for an Op node ellipse. -->
+    <ellipse id="op-node-stamp"
+        rx="7.5" ry="3" stroke="inherit" fill="inherit" />
+    <!-- Template for an Op node annotation ellipse (smaller). -->
+    <ellipse id="op-node-annotation-stamp"
+        rx="5" ry="2" stroke="inherit" fill="inherit" />
+    <!-- Vertically stacked series of Op nodes when unexpanded. -->
+    <g id="op-series-vertical-stamp">
+      <use xlink:href="#op-node-stamp" x="8" y="9" />
+      <use xlink:href="#op-node-stamp" x="8" y="6" />
+      <use xlink:href="#op-node-stamp" x="8" y="3" />
+    </g>
+    <!-- Horizontally stacked series of Op nodes when unexpanded. -->
+    <g id="op-series-horizontal-stamp">
+      <use xlink:href="#op-node-stamp" x="16" y="4" />
+      <use xlink:href="#op-node-stamp" x="12" y="4" />
+      <use xlink:href="#op-node-stamp" x="8" y="4" />
+    </g>
+    <!-- Horizontally stacked series of Op nodes for annotation. -->
+    <g id="op-series-annotation-stamp">
+      <use xlink:href="#op-node-annotation-stamp" x="9" y="2" />
+      <use xlink:href="#op-node-annotation-stamp" x="7" y="2" />
+      <use xlink:href="#op-node-annotation-stamp" x="5" y="2" />
+    </g>
+    <svg id="summary-icon" fill="#848484" height="12" viewBox="0 0 24 24" width="12">
+      <path d="M19 3H5c-1.1 0-2 .9-2 2v14c0 1.1.9 2 2 2h14c1.1 0 2-.9 2-2V5c0-1.1-.9-2-2-2zM9 17H7v-7h2v7zm4 0h-2V7h2v10zm4 0h-2v-4h2v4z" />
+    </svg>
+    <!--
+      Where the linearGradient for each node is stored. Used when coloring
+      by proportions of devices.
+    -->
+    <g id="linearGradients"></g>
+
+    <!-- Hatch patterns for faded out nodes. -->
+    <pattern id="rectHatch" patternTransform="rotate(45 0 0)" width="5" height="5" patternUnits="userSpaceOnUse">
+      <line x1="0" y1="0" x2="0" y2="5" style="stroke-width: 1"/>
+    </pattern>
+    <pattern id="ellipseHatch" patternTransform="rotate(45 0 0)" width="2" height="2" patternUnits="userSpaceOnUse">
+      <line x1="0" y1="0" x2="0" y2="2" style="stroke-width: 1"/>
+    </pattern>
+
+    <!-- A shadow for health pills. -->
+    <filter id="health-pill-shadow" x="-40%" y="-40%" width="180%" height="180%">
+      <feGaussianBlur in="SourceAlpha" stdDeviation="0.8"/>
+      <feOffset dx="0" dy="0" result="offsetblur"/>
+      <feFlood flood-color="#000000"/>
+      <feComposite in2="offsetblur" operator="in"/>
+      <feMerge>
+        <feMergeNode/>
+        <feMergeNode in="SourceGraphic"/>
+      </feMerge>
+    </filter>
+  </defs>
+  <!-- Make a large rectangle that fills the svg space so that
+  zoom events get captured on safari -->
+  <rect fill="white" width="10000" height="10000"></rect>
+  <g id="root"></g>
+</svg>
+<tf-graph-minimap id="minimap"></tf-graph-minimap>
+</template>
+<script>
+Polymer({
+  is: 'tf-graph-scene',
+  properties: {
+    renderHierarchy: Object,
+    name: String,
+    colorBy: String,
+
+    // For each render hierarchy, we only fit it to the viewport once (when the scene is attached to
+    // the DOM). We do not fit the hierarchy again (unless the user clicks the reset button). For
+    // instance, if the user enters a certain view in the graph, switches to another dashboard, and
+    // returns to the graph dashboard, the user expects the previous view. These properties enable
+    // that behavior.
+
+    /** Whether the scene has fit the current render hierarchy (to the viewport) at least once. */
+    _hasRenderHierarchyBeenFitOnce: Boolean,
+    /** Whether this scene element is currently attached to a parent element. */
+    _isAttached: Boolean,
+
+    /** @type {d3_zoom} d3 zoom object */
+    _zoom: Object,
+    highlightedNode: {
+      type: String,
+      observer: '_highlightedNodeChanged'
+    },
+    selectedNode: {
+      type: String,
+      observer: '_selectedNodeChanged'
+    },
+    /** Keeps track of if the graph has been zoomed/panned since loading */
+    _zoomed: {
+      type: Boolean,
+      observer: '_onZoomChanged',
+      value: false
+    },
+    /** Keeps track of the starting coordinates of a graph zoom/pan */
+    _zoomStartCoords: {
+      type: Object,
+      value: null
+    },
+    /** Keeps track of the current coordinates of a graph zoom/pan */
+    _zoomTransform: {
+      type: Object,
+      value: null
+    },
+    /** Maximum distance of a zoom event for it to be interpreted as a click */
+    _maxZoomDistanceForClick: {
+      type: Number,
+      value: 20
+    },
+    /**
+     * @type {d3.scale.ordinal}
+     * Scale mapping from template name to a number between 0 and N-1
+     * where N is the number of different template names. Used by
+     * tf.graph.scene.node when computing node color by structure.
+     */
+    templateIndex: Function,
+    /**
+     * @type {tf.scene.Minimap}
+     * A minimap object to notify for zoom events.
+     */
+    minimap: Object,
+    /*
+     * Dictionary for easily stylizing nodes when state changes.
+     * _nodeGroupIndex[nodeName] = d3_selection of the nodeGroup
+     */
+    _nodeGroupIndex: {
+      type: Object,
+      value: function() { return {}; }
+    },
+    /*
+     * Dictionary for easily stylizing annotation nodes when state changes.
+     * _annotationGroupIndex[nodeName][hostNodeName] =
+     *   d3_selection of the annotationGroup
+     */
+    _annotationGroupIndex: {
+      type: Object,
+      value: function() { return {}; }
+    },
+    /*
+     * Dictionary for easily stylizing edges when state changes.
+     * _edgeGroupIndex[edgeName] = d3_selection of the edgeGroup
+     */
+    _edgeGroupIndex: {
+      type: Object,
+      value: function() { return {}; }
+    },
+    /**
+     * Max font size for metanode label strings.
+     */
+    maxMetanodeLabelLengthFontSize: {
+      type: Number,
+      value: 9
+    },
+    /**
+     * Min font size for metanode label strings.
+     */
+    minMetanodeLabelLengthFontSize: {
+      type: Number,
+      value: 6
+    },
+    /**
+     * Metanode label strings longer than this are given smaller fonts.
+     */
+    maxMetanodeLabelLengthLargeFont: {
+      type: Number,
+      value: 11
+    },
+    /**
+     * Metanode label strings longer than this are truncated with ellipses.
+     */
+    maxMetanodeLabelLength: {
+      type: Number,
+      value: 18
+    },
+    progress: Object,
+    // A mapping between node name to the tf.graph.scene.HealthPill to render.
+    nodeNamesToHealthPills: Object,
+    // The step of health pills to show throughout the graph.
+    healthPillStepIndex: Number,
+  },
+  observers: [
+    '_colorByChanged(colorBy)',
+    '_renderHierarchyChanged(renderHierarchy)',
+    // Animation and fitting must come after the observer for the hierarchy changing because we must
+    // first build the render hierarchy.
+    '_animateAndFit(_isAttached, renderHierarchy)',
+    '_updateHealthPills(nodeNamesToHealthPills, healthPillStepIndex)',
+  ],
+  getNode: function(nodeName) {
+    return this.renderHierarchy.getRenderNodeByName(nodeName);
+  },
+  isNodeExpanded: function(node) {
+    return node.expanded;
+  },
+  setNodeExpanded: function(renderNode) {
+    this._build(this.renderHierarchy);
+    this._updateLabels(!this._zoomed);
+  },
+  /**
+   * Resets the state of the component. Called whenever the whole graph
+   * (dataset) changes.
+   */
+  _resetState: function() {
+    // Reset the state of the component.
+    this._nodeGroupIndex = {};
+    this._annotationGroupIndex = {};
+    this._edgeGroupIndex = {};
+    this._updateLabels(false);
+    // Remove all svg elements under the 'root' svg group.
+    d3.select(this.$.svg).select('#root').selectAll('*').remove();
+    // And the defs.
+    d3.select(this.$.svg).select('defs #linearGradients')
+        .selectAll('*').remove();
+  },
+  /** Main method for building the scene */
+  _build: function(renderHierarchy) {
+    this.templateIndex = renderHierarchy.hierarchy.getTemplateIndex();
+    tf.graph.util.time('tf-graph-scene (layout):', function() {
+      // layout the scene for this meta / series node
+      tf.graph.layout.layoutScene(renderHierarchy.root, this);
+    }.bind(this));
+
+    tf.graph.util.time('tf-graph-scene (build scene):', function() {
+      tf.graph.scene.buildGroup(d3.select(this.$.root), renderHierarchy.root, this);
+      tf.graph.scene.addGraphClickListener(this.$.svg, this);
+      tf.graph.scene.node.traceInputs(renderHierarchy);
+    }.bind(this));
+    // Update the minimap again when the graph is done animating.
+    setTimeout(function() {
+      this._updateHealthPills(this.nodeNamesToHealthPills, this.healthPillStepIndex);
+      this.minimap.update();
+    }.bind(this), tf.graph.layout.PARAMS.animation.duration);
+  },
+  ready: function() {
+    this._zoom = d3.zoom()
+      .on('end', function() {
+        if (this._zoomStartCoords) {
+          // Calculate the total distance dragged during the zoom event.
+          // If it is sufficiently small, then fire an event indicating
+          // that zooming has ended. Otherwise wait to fire the zoom end
+          // event, so that a mouse click registered as part of this zooming
+          // is ignored (as this mouse click was part of a zooming, and should
+          // not be used to indicate an actual click on the graph).
+          var dragDistance = Math.sqrt(
+            Math.pow(this._zoomStartCoords.x - this._zoomTransform.x, 2) +
+            Math.pow(this._zoomStartCoords.y - this._zoomTransform.y, 2));
+          if (dragDistance < this._maxZoomDistanceForClick) {
+            this._fireEnableClick();
+          } else {
+            setTimeout(this._fireEnableClick.bind(this), 50);
+          }
+        }
+        this._zoomStartCoords = null;
+      }.bind(this))
+      .on('zoom', function() {
+        // Store the coordinates of the zoom event.
+        this._zoomTransform = d3.event.transform;
+
+        // If this is the first zoom event after a zoom-end, then
+        // store the coordinates as the start coordinates as well,
+        // and fire an event to indicate that zooming has started.
+        // This doesn't use the zoomstart event, as d3 sends this
+        // event on mouse-down, even if there has been no dragging
+        // done to translate the graph around.
+        if (!this._zoomStartCoords) {
+          this._zoomStartCoords = this._zoomTransform;
+          this.fire('disable-click');
+        }
+        this._zoomed = true;
+        d3.select(this.$.root).attr('transform', d3.event.transform);
+        // Notify the minimap.
+        this.minimap.zoom(d3.event.transform);
+      }.bind(this));
+    d3.select(this.$.svg).call(this._zoom)
+      .on('dblclick.zoom', null);
+    d3.select(window).on('resize', function() {
+      // Notify the minimap that the user's window was resized.
+      // The minimap will figure out the new dimensions of the main svg
+      // and will use the existing translate and scale params.
+      this.minimap.zoom();
+    }.bind(this));
+    // Initialize the minimap.
+    this.minimap = this.$.minimap.init(this.$.svg, this.$.root, this._zoom,
+        tf.graph.layout.PARAMS.minimap.size,
+        tf.graph.layout.PARAMS.subscene.meta.labelHeight);
+  },
+  attached: function() {
+    this.set('_isAttached', true);
+  },
+  detached: function() {
+    this.set('_isAttached', false);
+  },
+  _renderHierarchyChanged: function(renderHierarchy) {
+    this._hasRenderHierarchyBeenFitOnce = false;
+    this._resetState();
+    this._build(renderHierarchy);
+  },
+  _animateAndFit: function(isAttached, renderHierarchy) {
+    if (this._hasRenderHierarchyBeenFitOnce || !isAttached) {
+      // Do not animate and fit if the scene has already fitted this render hierarchy once. Or if
+      // the graph dashboard is not attached (in which case the scene lacks DOM info for fitting).
+      return;
+    }
+
+    // Fit to screen after the graph is done animating.
+    setTimeout(this.fit.bind(this), tf.graph.layout.PARAMS.animation.duration);
+  },
+  _updateLabels: function(showLabels) {
+    var mainGraphTitleElement = this.getElementsByClassName('title')[0];
+    var titleStyle = mainGraphTitleElement.style;
+    var auxTitleStyle = this.getElementsByClassName('auxTitle')[0].style;
+    var core = d3.select("." + tf.graph.scene.Class.Scene.GROUP + ">." +
+      tf.graph.scene.Class.Scene.CORE).node();
+    // Only show labels if the graph is fully loaded.
+    if (showLabels && core && this.progress && this.progress.value === 100) {
+      var aux =
+        d3.select("." + tf.graph.scene.Class.Scene.GROUP + ">." +
+          tf.graph.scene.Class.Scene.INEXTRACT).node() ||
+        d3.select("." + tf.graph.scene.Class.Scene.GROUP + ">." +
+          tf.graph.scene.Class.Scene.OUTEXTRACT).node();
+      var coreX = core.getCTM().e;
+      var auxX = aux ? aux.getCTM().e : null;
+      titleStyle.display = 'inline';
+      titleStyle.left = coreX + 'px';
+      if (auxX !== null && auxX !== coreX) {
+        auxTitleStyle.display = 'inline';
+
+        // Make sure that the aux title is positioned rightwards enough so as to
+        // prevent overlap with the main graph title.
+        auxX = Math.max(
+            coreX + mainGraphTitleElement.getBoundingClientRect().width, auxX);
+
+        auxTitleStyle.left = auxX + 'px';
+      } else {
+        auxTitleStyle.display = 'none';
+      }
+    } else {
+      titleStyle.display='none';
+      auxTitleStyle.display = 'none';
+    }
+  },
+  /**
+    * Called whenever the user changed the 'color by' option in the
+    * UI controls.
+    */
+  _colorByChanged: function() {
+    if (this.renderHierarchy != null) {
+      // We iterate through each svg node and update its state.
+      _.each(this._nodeGroupIndex, function(nodeGroup, nodeName) {
+        this._updateNodeState(nodeName);
+      }, this);
+      // Notify also the minimap.
+      this.minimap.update();
+    }
+  },
+  fit: function() {
+    this._hasRenderHierarchyBeenFitOnce = true;
+    tf.graph.scene.fit(this.$.svg, this.$.root, this._zoom, function() {
+      this._zoomed = false;
+    }.bind(this));
+  },
+  isNodeSelected: function(n) {
+    return n === this.selectedNode;
+  },
+  isNodeHighlighted: function(n) {
+    return n === this.highlightedNode;
+  },
+  addAnnotationGroup: function(a, d, selection) {
+    var an = a.node.name;
+    this._annotationGroupIndex[an] = this._annotationGroupIndex[an] || {};
+    this._annotationGroupIndex[an][d.node.name] = selection;
+  },
+  getAnnotationGroupsIndex: function(a) {
+    return this._annotationGroupIndex[a];
+  },
+  removeAnnotationGroup: function(a, d) {
+    delete this._annotationGroupIndex[a.node.name][d.node.name];
+  },
+  addNodeGroup: function(n, selection) {
+    this._nodeGroupIndex[n] = selection;
+  },
+  getNodeGroup: function(n) {
+    return this._nodeGroupIndex[n];
+  },
+  removeNodeGroup: function(n) {
+    delete this._nodeGroupIndex[n];
+  },
+  addEdgeGroup: function(n, selection) {
+    this._edgeGroupIndex[e] = selection;
+  },
+  getEdgeGroup: function(e) {
+    return this._edgeGroupIndex[e];
+  },
+  _updateHealthPills: function(nodeNamesToHealthPills, healthPillStepIndex) {
+    tf.graph.scene.addHealthPills(
+        this.$.svg, nodeNamesToHealthPills, healthPillStepIndex);
+  },
+  /**
+   * Update node and annotation node of the given name.
+   * @param  {String} n node name
+   */
+  _updateNodeState: function(n) {
+    var node = this.getNode(n);
+    var nodeGroup = this.getNodeGroup(n);
+
+    if (nodeGroup) {
+      tf.graph.scene.node.stylize(nodeGroup, node, this);
+    }
+
+    var annotationGroupIndex = this.getAnnotationGroupsIndex(n);
+    _.each(annotationGroupIndex, function(aGroup, hostName) {
+      tf.graph.scene.node.stylize(aGroup, node, this,
+          tf.graph.scene.Class.Annotation.NODE);
+    }, this);
+  },
+
+  /**
+   * Handles new node selection. 1) Updates the selected-state of each node,
+   * 2) triggers input tracing.
+   * @param selectedNode {string} The name of the newly selected node.
+   * @param oldSelectedNode {string} The name of the previously selected node.
+   * @private
+   */
+  _selectedNodeChanged: function(selectedNode, oldSelectedNode) {
+    if (selectedNode === oldSelectedNode) {
+      return;
+    }
+
+    if (selectedNode) {
+      this._updateNodeState(selectedNode);
+    }
+    if (oldSelectedNode) {
+      this._updateNodeState(oldSelectedNode);
+    }
+
+    tf.graph.scene.node.traceInputs(this.renderHierarchy);
+
+    if (!selectedNode) {
+      return;
+    }
+
+
+    // Update the minimap to reflect the highlighted (selected) node.
+    this.minimap.update();
+    var node = this.renderHierarchy.hierarchy.node(selectedNode);
+    var nodeParents = [];
+    // Create list of all metanode parents of the selected node.
+    while (node.parentNode != null
+        && node.parentNode.name != tf.graph.ROOT_NAME) {
+      node = node.parentNode;
+      nodeParents.push(node.name);
+    }
+    // Ensure each parent metanode is built and expanded.
+    var topParentNodeToBeExpanded;
+    _.forEachRight(nodeParents, function(parentName) {
+      this.renderHierarchy.buildSubhierarchy(parentName);
+      var renderNode = this.renderHierarchy.getRenderNodeByName(parentName);
+      if (renderNode.node.isGroupNode && !renderNode.expanded) {
+        renderNode.expanded = true;
+        if (!topParentNodeToBeExpanded) {
+          topParentNodeToBeExpanded = renderNode;
+        }
+      }
+    }, this);
+    // If any expansion was needed to display this selected node, then
+    // inform the scene of the top-most expansion.
+    if (topParentNodeToBeExpanded) {
+      this.setNodeExpanded(topParentNodeToBeExpanded);
+      this._zoomed = true;
+    }
+
+    if (tf.graph.scene.panToNode(selectedNode, this.$.svg, this.$.root,
+        this._zoom)) {
+      this._zoomed = true;
+    }
+  },
+  _highlightedNodeChanged: function(highlightedNode, oldHighlightedNode) {
+    if (highlightedNode === oldHighlightedNode) {
+      return;
+    }
+
+    if (highlightedNode) {
+      this._updateNodeState(highlightedNode);
+    }
+    if (oldHighlightedNode) {
+      this._updateNodeState(oldHighlightedNode);
+    }
+  },
+  _onZoomChanged: function() {
+    this._updateLabels(!this._zoomed);
+  },
+  _fireEnableClick: function() {
+    this.fire('enable-click');
+  },
+});
+</script>
+</dom-module>
diff --git a/tensorflow/tensorboard/components/tf_graph_d3v4/tf-graph.html b/tensorflow/tensorboard/components/tf_graph_d3v4/tf-graph.html
new file mode 100644
index 0000000000000000000000000000000000000000..efbf065a40ac80d3a45f6fe304841c98ed51a02b
--- /dev/null
+++ b/tensorflow/tensorboard/components/tf_graph_d3v4/tf-graph.html
@@ -0,0 +1,316 @@
+<!--
+@license
+Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+-->
+
+<link rel="import" href="../polymer/polymer.html">
+<link rel="import" href="../iron-flex-layout/iron-flex-layout.html">
+<link rel="import" href="../iron-icons/iron-icons.html">
+<link rel="import" href="../paper-button/paper-button.html">
+<link rel="import" href="../paper-input/paper-input.html">
+<link rel="import" href="../paper-toggle-button/paper-toggle-button.html">
+<link rel="import" href="../tf-graph-common/tf-graph-common.html">
+<link rel="import" href="tf-graph-scene.html">
+
+<dom-module id="tf-graph">
+<template>
+<style>
+.container {
+  width: 100%;
+  height: 100%;
+  background: white;
+  box-shadow: 0 1px 5px rgba(0,0,0,0.2);
+}
+
+.vertical {
+  width:100%;
+  height:100%;
+  @apply(--layout-vertical);
+}
+
+.auto {
+  @apply(--layout-flex-auto);
+  @apply(--layout-vertical);
+}
+
+h2 {
+  text-align: center;
+}
+
+paper-button {
+  text-transform: none;
+}
+</style>
+<div class="container">
+  <div class="vertical">
+    <template is="dom-if" if="[[title]]">
+      <h2>[[title]]</h2>
+    </template>
+    <tf-graph-scene id="scene" class="auto"
+          render-hierarchy="[[renderHierarchy]]"
+          highlighted-node="[[_getVisible(highlightedNode)]]"
+          selected-node="{{selectedNode}}"
+          color-by="[[colorBy]]"
+          progress="[[progress]]"
+          node-names-to-health-pills="[[nodeNamesToHealthPills]]"
+          health-pill-step-index="{{healthPillStepIndex}}"
+    ></tf-graph-scene>
+  </div>
+</div>
+</template>
+</dom-module>
+
+<script>
+Polymer({
+
+  is: 'tf-graph',
+
+  properties: {
+    graphHierarchy: {
+      type: Object,
+      notify: true,
+      observer: '_graphChanged'
+    },
+    basicGraph: Object,
+    stats: Object,
+    devicesForStats: Object,
+    hierarchyParams: Object,
+    progress: {
+      type: Object,
+      notify: true,
+    },
+    title: String,
+    selectedNode: {
+      type: String,
+      notify: true,
+    },
+    highlightedNode: {
+      type: String,
+      notify: true
+    },
+    /** What to color the nodes by (compute time, memory, device etc.) */
+    colorBy: String,
+    colorByParams: {
+      type: Object,
+      notify: true,
+      readOnly: true, // Produces and doesn't consume.
+    },
+    renderHierarchy: {
+      type: Object,
+      readOnly: true,
+      notify: true,
+    },
+    _renderDepth: {
+      type: Number,
+      value: 1
+    },
+    _allowGraphSelect: {
+      type: Boolean,
+      value: true
+    },
+    // A mapping between node name to the tf.graph.scene.HealthPill to render.
+    nodeNamesToHealthPills: Object,
+    // The step of health pills to show throughout the graph.
+    healthPillStepIndex: Number,
+  },
+  observers: [
+    '_statsChanged(stats, devicesForStats)',
+    '_buildRenderHierarchy(graphHierarchy)'
+  ],
+  _statsChanged: function(stats, devicesForStats) {
+    if (this.graphHierarchy) {
+      if (stats && devicesForStats) {
+        tf.graph.joinStatsInfoWithGraph(this.basicGraph, stats, devicesForStats);
+        tf.graph.hierarchy.joinAndAggregateStats(this.graphHierarchy, stats);
+      }
+      // Recompute the rendering information.
+      this._buildRenderHierarchy(this.graphHierarchy);
+    }
+  },
+  _buildRenderHierarchy: function(graphHierarchy) {
+    tf.graph.util.time('new tf.graph.render.Hierarchy', function() {
+      if (graphHierarchy.root.type !== tf.graph.NodeType.META) {
+        // root must be metanode but sometimes Polymer's dom-if has not
+        // remove tf-graph element yet in <tf-node-info>
+        // and thus mistakenly pass non-metanode to this module.
+        return;
+      }
+      var renderGraph = new tf.graph.render.RenderGraphInfo(
+          graphHierarchy, !!this.stats /** displayingStats */);
+      // Producing the 'color by' parameters to be consumed
+      // by the tf-graph-controls panel. It contains information about the
+      // min and max values and their respective colors, as well as list
+      // of devices with their respective colors.
+
+      function getColorParamsFromScale(scale) {
+        return {
+          minValue: scale.domain()[0],
+          maxValue: scale.domain()[1],
+          startColor: scale.range()[0],
+          endColor: scale.range()[1]
+        };
+      }
+
+      this._setColorByParams({
+        compute_time: getColorParamsFromScale(renderGraph.computeTimeScale),
+        memory: getColorParamsFromScale(renderGraph.memoryUsageScale),
+        device: _.map(renderGraph.deviceColorMap.domain(),
+            function(deviceName) {
+          return {
+            device: deviceName,
+            color: renderGraph.deviceColorMap(deviceName)
+          };
+        }),
+        xla_cluster: _.map(renderGraph.xlaClusterColorMap.domain(),
+            function(xlaClusterName) {
+          return {
+            xla_cluster: xlaClusterName,
+            color: renderGraph.xlaClusterColorMap(xlaClusterName)
+          };
+        }),
+      });
+      this._setRenderHierarchy(renderGraph);
+      this.async(function() {
+        this.fire("rendered");
+      });
+    }.bind(this));
+  },
+  _getVisible: function(name) {
+    if (!name) {
+      return name;
+    }
+    return this.renderHierarchy.getNearestVisibleAncestor(name);
+  },
+  listeners: {
+    'graph-select': '_graphSelected',
+    'disable-click': '_disableClick',
+    'enable-click': '_enableClick',
+    // Nodes
+    'node-toggle-expand': '_nodeToggleExpand',
+    'node-select': '_nodeSelected',
+    'node-highlight': '_nodeHighlighted',
+    'node-unhighlight': '_nodeUnhighlighted',
+    'node-toggle-extract': '_nodeToggleExtract',
+    'node-toggle-seriesgroup': '_nodeToggleSeriesGroup',
+
+    // Annotations
+
+    /* Note: currently highlighting/selecting annotation node has the same
+      * behavior as highlighting/selecting actual node so we point to the same
+      * set of event listeners.  However, we might redesign this to be a bit
+      * different.
+      */
+    'annotation-select': '_nodeSelected',
+    'annotation-highlight': '_nodeHighlighted',
+    'annotation-unhighlight': '_nodeUnhighlighted',
+  },
+  _graphChanged: function() {
+    // When a new graph is loaded, fire this event so that there is no
+    // info-card being displayed for the previously-loaded graph.
+    this.fire('graph-select');
+  },
+  _graphSelected: function(event) {
+    // Graph selection is not allowed during an active zoom event, as the
+    // click seen during a zoom/pan is part of the zooming and does not
+    // indicate a user desire to click on a specific section of the graph.
+    if (this._allowGraphSelect) {
+      this.set('selectedNode', null);
+    }
+    // Reset this variable as a bug in d3 zoom behavior can cause zoomend
+    // callback not to be called if a right-click happens during a zoom event.
+    this._allowGraphSelect = true;
+  },
+  _disableClick: function(event) {
+    this._allowGraphSelect = false;
+  },
+  _enableClick: function(event) {
+    this._allowGraphSelect = true;
+  },
+  _nodeSelected: function(event) {
+    if (this._allowGraphSelect) {
+      this.set('selectedNode', event.detail.name);
+    }
+    // Reset this variable as a bug in d3 zoom behavior can cause zoomend
+    // callback not to be called if a right-click happens during a zoom event.
+    this._allowGraphSelect = true;
+  },
+  _nodeHighlighted: function(event) {
+    this.set('highlightedNode', event.detail.name);
+  },
+  _nodeUnhighlighted: function(event) {
+    this.set('highlightedNode', null);
+  },
+  _nodeToggleExpand: function(event) {
+    // Immediately select the node that is about to be expanded.
+    this._nodeSelected(event);
+
+    // Compute the sub-hierarchy scene.
+    var nodeName = event.detail.name;
+    var renderNode = this.renderHierarchy.getRenderNodeByName(nodeName);
+    // Op nodes are not expandable.
+    if (renderNode.node.type === tf.graph.NodeType.OP) {
+      return;
+    }
+    this.renderHierarchy.buildSubhierarchy(nodeName);
+    renderNode.expanded = !renderNode.expanded;
+
+    // Expand the node with some delay so that the user can immediately see
+    // the visual effect of selecting that node, before the expansion is
+    // done.
+    this.async(function() {
+      this.querySelector('#scene').setNodeExpanded(renderNode);
+    }, 75);
+  },
+  _nodeToggleExtract: function(event) {
+    // Toggle the include setting of the specified node appropriately.
+    var nodeName = event.detail.name;
+    var renderNode = this.renderHierarchy.getRenderNodeByName(nodeName);
+    if (renderNode.node.include == tf.graph.InclusionType.INCLUDE) {
+      renderNode.node.include = tf.graph.InclusionType.EXCLUDE;
+    } else if (renderNode.node.include == tf.graph.InclusionType.EXCLUDE) {
+      renderNode.node.include = tf.graph.InclusionType.INCLUDE;
+    } else {
+      renderNode.node.include =
+       this.renderHierarchy.isNodeAuxiliary(renderNode)
+          ? tf.graph.InclusionType.INCLUDE : tf.graph.InclusionType.EXCLUDE;
+    }
+
+    // Rebuild the render hierarchy.
+    this._buildRenderHierarchy(this.graphHierarchy);
+  },
+  _nodeToggleSeriesGroup: function(event) {
+    // Toggle the group setting of the specified node appropriately.
+    var nodeName = event.detail.name;
+    tf.graph.toggleNodeSeriesGroup(this.hierarchyParams.seriesMap, nodeName);
+
+    // Rebuild the render hierarchy with the updated series grouping map.
+    this.set('progress', {
+      value: 0,
+      msg: ''
+    });
+    var tracker = tf.graph.util.getTracker(this);
+    var hierarchyTracker = tf.graph.util.getSubtaskTracker(tracker, 100,
+          'Namespace hierarchy');
+    tf.graph.hierarchy.build(this.basicGraph, this.hierarchyParams, hierarchyTracker)
+    .then(function(graphHierarchy) {
+      this.set('graphHierarchy', graphHierarchy);
+      this._buildRenderHierarchy(this.graphHierarchy);
+    }.bind(this));
+  },
+  not: function(x) {
+    return !x;
+  }
+});
+</script>
diff --git a/tensorflow/tensorboard/components/tf_graph_dashboard/BUILD b/tensorflow/tensorboard/components/tf_graph_dashboard/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..de8f744d058365c37a63ce5742f3e789391e8c6e
--- /dev/null
+++ b/tensorflow/tensorboard/components/tf_graph_dashboard/BUILD
@@ -0,0 +1,57 @@
+package(default_visibility = ["//tensorflow:internal"])
+
+load("@io_bazel_rules_closure//closure:defs.bzl", "web_library")
+load("//tensorflow/tensorboard:defs.bzl", "tensorboard_ts_library")
+load("//tensorflow/tensorboard:defs.bzl", "tensorboard_webcomponent_library")
+
+licenses(["notice"])  # Apache 2.0
+
+web_library(
+    name = "tf_graph_dashboard",
+    srcs = [
+        "tf-graph-dashboard.html",
+    ],
+    path = "/tf-graph-dashboard",
+    deps = [
+        "//tensorflow/tensorboard/components/tf_backend",
+        "//tensorflow/tensorboard/components/tf_dashboard_common",
+        "//tensorflow/tensorboard/components/tf_graph",
+        "//tensorflow/tensorboard/components/tf_graph_board",
+        "//tensorflow/tensorboard/components/tf_graph_controls",
+        "//tensorflow/tensorboard/components/tf_graph_loader",
+        "@org_polymer",
+    ],
+)
+
+filegroup(
+    name = "all_files",
+    srcs = glob(["**"]),
+    tags = ["notsan"],
+)
+
+################################################################################
+# MARKED FOR DELETION
+
+tensorboard_webcomponent_library(
+    name = "legacy",
+    srcs = [
+        "tf-graph-dashboard.html",
+    ],
+    destdir = "tf-graph-dashboard",
+    deps = [
+        "//tensorflow/tensorboard/components/tf_dashboard_common:legacy",
+        "//tensorflow/tensorboard/components/tf_graph:legacy",
+        "//tensorflow/tensorboard/components/tf_graph_board:legacy",
+        "//tensorflow/tensorboard/components/tf_graph_controls:legacy",
+        "//tensorflow/tensorboard/components/tf_graph_loader:legacy",
+    ],
+)
+
+# This is needed despite how this component lacks TypeScript files because
+# components/BUILD seeks a legacy_ts rule in this package.
+tensorboard_ts_library(
+    name = "legacy_ts",
+    srcs = [],
+    deps_mgmt = "off",
+    runtime = "nodejs",
+)
diff --git a/tensorflow/tensorboard/components/tf_graph_dashboard/demo/BUILD b/tensorflow/tensorboard/components/tf_graph_dashboard/demo/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..3658f45b153c365cc49499c7906ebcb3739a0854
--- /dev/null
+++ b/tensorflow/tensorboard/components/tf_graph_dashboard/demo/BUILD
@@ -0,0 +1,24 @@
+package(default_visibility = ["//tensorflow:internal"])
+
+load("@io_bazel_rules_closure//closure:defs.bzl", "web_library")
+
+licenses(["notice"])  # Apache 2.0
+
+# bazel run //third_party/tensorflow/tensorboard/components/tf_graph_dashboard/demo
+web_library(
+    name = "demo",
+    srcs = ["index.html"] + glob(["data/**"]),
+    path = "/tf-graph-dashboard/demo",
+    deps = [
+        "//tensorflow/tensorboard/components/tf_graph_dashboard",
+        "@org_polymer_iron_demo_helpers",
+        "@org_polymer_paper_styles",
+        "@org_polymer_webcomponentsjs",
+    ],
+)
+
+filegroup(
+    name = "all_files",
+    srcs = glob(["**"]),
+    tags = ["notsan"],
+)
diff --git a/tensorflow/tensorboard/components/tf_graph_dashboard/demo/data/graph_run_run1.pbtxt b/tensorflow/tensorboard/components/tf_graph_dashboard/demo/data/graph_run_run1.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..30b206453469801d31b46856c29cdda78164f18f
--- /dev/null
+++ b/tensorflow/tensorboard/components/tf_graph_dashboard/demo/data/graph_run_run1.pbtxt
@@ -0,0 +1,4606 @@
+node {
+  name: "GradientDescent/learning_rate"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_3"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: 0.1
+      }
+    }
+  }
+}
+node {
+  name: "gradients/add_grad/Shape_1"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 100
+      }
+    }
+  }
+}
+node {
+  name: "gradients/add_grad/Shape"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 2
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\310\000\000\000d\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "gradients/add_grad/BroadcastGradientArgs"
+  op: "BroadcastGradientArgs"
+  input: "gradients/add_grad/Shape"
+  input: "gradients/add_grad/Shape_1"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+        }
+        shape {
+          dim {
+            size: -1
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "gradients/add_1_grad/Shape_1"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 10
+      }
+    }
+  }
+}
+node {
+  name: "gradients/add_1_grad/Shape"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 2
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\310\000\000\000\n\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "gradients/add_1_grad/BroadcastGradientArgs"
+  op: "BroadcastGradientArgs"
+  input: "gradients/add_1_grad/Shape"
+  input: "gradients/add_1_grad/Shape_1"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+        }
+        shape {
+          dim {
+            size: -1
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "gradients/Reshape_1_grad/Shape"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 2
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\310\000\000\000\n\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "gradients/SoftmaxCrossEntropyWithLogits_grad/ExpandDims/dim"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: -1
+      }
+    }
+  }
+}
+node {
+  name: "gradients/Reshape_3_grad/Shape"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 200
+      }
+    }
+  }
+}
+node {
+  name: "gradients/Mean_grad/Maximum/y"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "gradients/Mean_grad/Const_1"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "gradients/Mean_grad/Const"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "gradients/Mean_grad/Shape_1"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "gradients/Mean_grad/Prod_1"
+  op: "Prod"
+  input: "gradients/Mean_grad/Shape_1"
+  input: "gradients/Mean_grad/Const_1"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+  attr {
+    key: "keep_dims"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "gradients/Mean_grad/Maximum"
+  op: "Maximum"
+  input: "gradients/Mean_grad/Prod_1"
+  input: "gradients/Mean_grad/Maximum/y"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+}
+node {
+  name: "gradients/Mean_grad/Shape"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 200
+      }
+    }
+  }
+}
+node {
+  name: "gradients/Mean_grad/Prod"
+  op: "Prod"
+  input: "gradients/Mean_grad/Shape"
+  input: "gradients/Mean_grad/Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+  attr {
+    key: "keep_dims"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "gradients/Mean_grad/floordiv"
+  op: "FloorDiv"
+  input: "gradients/Mean_grad/Prod"
+  input: "gradients/Mean_grad/Maximum"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+}
+node {
+  name: "gradients/Mean_grad/Cast"
+  op: "Cast"
+  input: "gradients/Mean_grad/floordiv"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "DstT"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "SrcT"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+}
+node {
+  name: "gradients/Mean_grad/Tile/multiples"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 200
+      }
+    }
+  }
+}
+node {
+  name: "gradients/Mean_grad/Reshape/shape"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "gradients/Const"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "gradients/Shape"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "gradients/Fill"
+  op: "Fill"
+  input: "gradients/Shape"
+  input: "gradients/Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+}
+node {
+  name: "gradients/Mean_grad/Reshape"
+  op: "Reshape"
+  input: "gradients/Fill"
+  input: "gradients/Mean_grad/Reshape/shape"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "gradients/Mean_grad/Tile"
+  op: "Tile"
+  input: "gradients/Mean_grad/Reshape"
+  input: "gradients/Mean_grad/Tile/multiples"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tmultiples"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "gradients/Mean_grad/truediv"
+  op: "RealDiv"
+  input: "gradients/Mean_grad/Tile"
+  input: "gradients/Mean_grad/Cast"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "gradients/Reshape_3_grad/Reshape"
+  op: "Reshape"
+  input: "gradients/Mean_grad/truediv"
+  input: "gradients/Reshape_3_grad/Shape"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "gradients/SoftmaxCrossEntropyWithLogits_grad/ExpandDims"
+  op: "ExpandDims"
+  input: "gradients/Reshape_3_grad/Reshape"
+  input: "gradients/SoftmaxCrossEntropyWithLogits_grad/ExpandDims/dim"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "Const"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "Slice_2/begin"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "Sub_2/y"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "concat_1/axis"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "concat_1/values_0"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: -1
+      }
+    }
+  }
+}
+node {
+  name: "Slice_1/size"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "Sub_1/y"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "Shape_2"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 2
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\310\000\000\000\n\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "Rank_2"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 2
+      }
+    }
+  }
+}
+node {
+  name: "Sub_1"
+  op: "Sub"
+  input: "Rank_2"
+  input: "Sub_1/y"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+}
+node {
+  name: "Slice_1/begin"
+  op: "Pack"
+  input: "Sub_1"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "N"
+    value {
+      i: 1
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "Slice_1"
+  op: "Slice"
+  input: "Shape_2"
+  input: "Slice_1/begin"
+  input: "Slice_1/size"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "concat_1"
+  op: "ConcatV2"
+  input: "concat_1/values_0"
+  input: "Slice_1"
+  input: "concat_1/axis"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 2
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "concat/axis"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "concat/values_0"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: -1
+      }
+    }
+  }
+}
+node {
+  name: "Slice/size"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "Sub/y"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "Shape_1"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 2
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\310\000\000\000\n\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "Rank_1"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 2
+      }
+    }
+  }
+}
+node {
+  name: "Sub"
+  op: "Sub"
+  input: "Rank_1"
+  input: "Sub/y"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+}
+node {
+  name: "Slice/begin"
+  op: "Pack"
+  input: "Sub"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "N"
+    value {
+      i: 1
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "Slice"
+  op: "Slice"
+  input: "Shape_1"
+  input: "Slice/begin"
+  input: "Slice/size"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "concat"
+  op: "ConcatV2"
+  input: "concat/values_0"
+  input: "Slice"
+  input: "concat/axis"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 2
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "Shape"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 2
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\310\000\000\000\n\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "Rank"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 2
+      }
+    }
+  }
+}
+node {
+  name: "Sub_2"
+  op: "Sub"
+  input: "Rank"
+  input: "Sub_2/y"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+}
+node {
+  name: "Slice_2/size"
+  op: "Pack"
+  input: "Sub_2"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "N"
+    value {
+      i: 1
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "Slice_2"
+  op: "Slice"
+  input: "Shape"
+  input: "Slice_2/begin"
+  input: "Slice_2/size"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "logits_biases"
+  op: "VariableV2"
+  device: "/job:localhost/replica:0/task:0/cpu:0"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@logits_biases"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 10
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "container"
+    value {
+      s: ""
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "shape"
+    value {
+      shape {
+        dim {
+          size: 10
+        }
+      }
+    }
+  }
+  attr {
+    key: "shared_name"
+    value {
+      s: ""
+    }
+  }
+}
+node {
+  name: "logits_biases/read"
+  op: "Identity"
+  input: "logits_biases"
+  device: "/job:localhost/replica:0/task:0/cpu:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@logits_biases"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 10
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "logits_weights"
+  op: "VariableV2"
+  device: "/job:localhost/replica:0/task:0/cpu:0"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@logits_weights"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 100
+          }
+          dim {
+            size: 10
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "container"
+    value {
+      s: ""
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "shape"
+    value {
+      shape {
+        dim {
+          size: 100
+        }
+        dim {
+          size: 10
+        }
+      }
+    }
+  }
+  attr {
+    key: "shared_name"
+    value {
+      s: ""
+    }
+  }
+}
+node {
+  name: "logits_weights/read"
+  op: "Identity"
+  input: "logits_weights"
+  device: "/job:localhost/replica:0/task:0/cpu:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@logits_weights"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 100
+          }
+          dim {
+            size: 10
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "hidden_biases"
+  op: "VariableV2"
+  device: "/job:localhost/replica:0/task:0/cpu:0"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@hidden_biases"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 100
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "container"
+    value {
+      s: ""
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "shape"
+    value {
+      shape {
+        dim {
+          size: 100
+        }
+      }
+    }
+  }
+  attr {
+    key: "shared_name"
+    value {
+      s: ""
+    }
+  }
+}
+node {
+  name: "hidden_biases/read"
+  op: "Identity"
+  input: "hidden_biases"
+  device: "/job:localhost/replica:0/task:0/cpu:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@hidden_biases"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 100
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "hidden_weights"
+  op: "VariableV2"
+  device: "/job:localhost/replica:0/task:0/cpu:0"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@hidden_weights"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 784
+          }
+          dim {
+            size: 100
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "container"
+    value {
+      s: ""
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "shape"
+    value {
+      shape {
+        dim {
+          size: 784
+        }
+        dim {
+          size: 100
+        }
+      }
+    }
+  }
+  attr {
+    key: "shared_name"
+    value {
+      s: ""
+    }
+  }
+}
+node {
+  name: "hidden_weights/read"
+  op: "Identity"
+  input: "hidden_weights"
+  device: "/job:localhost/replica:0/task:0/cpu:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@hidden_weights"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 784
+          }
+          dim {
+            size: 100
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "Reshape/shape"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/cpu:0"
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 2
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\310\000\000\000\377\377\377\377"
+      }
+    }
+  }
+}
+node {
+  name: "mnist_dataset_train_2/one_hot/depth"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/cpu:0"
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 10
+      }
+    }
+  }
+}
+node {
+  name: "mnist_dataset_train_2/one_hot/off_value"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/cpu:0"
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "mnist_dataset_train_2/one_hot/on_value"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/cpu:0"
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "mnist_dataset_train_2/random_shuffle_queue_DequeueMany/n"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/cpu:0"
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 200
+      }
+    }
+  }
+}
+node {
+  name: "mnist_dataset_train_1/random_shuffle_queue"
+  op: "RandomShuffleQueueV2"
+  device: "/job:localhost/replica:0/task:0/cpu:0"
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+  attr {
+    key: "capacity"
+    value {
+      i: 20000
+    }
+  }
+  attr {
+    key: "component_types"
+    value {
+      list {
+        type: DT_FLOAT
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    key: "container"
+    value {
+      s: ""
+    }
+  }
+  attr {
+    key: "min_after_dequeue"
+    value {
+      i: 4000
+    }
+  }
+  attr {
+    key: "seed"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "seed2"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 28
+          }
+          dim {
+            size: 28
+          }
+          dim {
+            size: 1
+          }
+        }
+        shape {
+        }
+      }
+    }
+  }
+  attr {
+    key: "shared_name"
+    value {
+      s: ""
+    }
+  }
+}
+node {
+  name: "mnist_dataset_train_2/random_shuffle_queue_DequeueMany"
+  op: "QueueDequeueManyV2"
+  input: "mnist_dataset_train_1/random_shuffle_queue"
+  input: "mnist_dataset_train_2/random_shuffle_queue_DequeueMany/n"
+  device: "/job:localhost/replica:0/task:0/cpu:0"
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          unknown_rank: true
+        }
+        shape {
+          unknown_rank: true
+        }
+      }
+    }
+  }
+  attr {
+    key: "component_types"
+    value {
+      list {
+        type: DT_FLOAT
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    key: "timeout_ms"
+    value {
+      i: -1
+    }
+  }
+}
+node {
+  name: "Reshape"
+  op: "Reshape"
+  input: "mnist_dataset_train_2/random_shuffle_queue_DequeueMany"
+  input: "Reshape/shape"
+  device: "/job:localhost/replica:0/task:0/cpu:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+          dim {
+            size: -1
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "MatMul"
+  op: "MatMul"
+  input: "Reshape"
+  input: "hidden_weights/read"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+          dim {
+            size: 100
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "transpose_a"
+    value {
+      b: false
+    }
+  }
+  attr {
+    key: "transpose_b"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "add"
+  op: "Add"
+  input: "MatMul"
+  input: "hidden_biases/read"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+          dim {
+            size: 100
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "Relu"
+  op: "Relu"
+  input: "add"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+          dim {
+            size: 100
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "MatMul_1"
+  op: "MatMul"
+  input: "Relu"
+  input: "logits_weights/read"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+          dim {
+            size: 10
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "transpose_a"
+    value {
+      b: false
+    }
+  }
+  attr {
+    key: "transpose_b"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "add_1"
+  op: "Add"
+  input: "MatMul_1"
+  input: "logits_biases/read"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+          dim {
+            size: 10
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "Reshape_1"
+  op: "Reshape"
+  input: "add_1"
+  input: "concat"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+          dim {
+            size: 10
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "mnist_dataset_train_2/one_hot"
+  op: "OneHot"
+  input: "mnist_dataset_train_2/random_shuffle_queue_DequeueMany:1"
+  input: "mnist_dataset_train_2/one_hot/depth"
+  input: "mnist_dataset_train_2/one_hot/on_value"
+  input: "mnist_dataset_train_2/one_hot/off_value"
+  device: "/job:localhost/replica:0/task:0/cpu:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "TI"
+    value {
+      type: DT_INT64
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          unknown_rank: true
+        }
+      }
+    }
+  }
+  attr {
+    key: "axis"
+    value {
+      i: -1
+    }
+  }
+}
+node {
+  name: "Reshape_2"
+  op: "Reshape"
+  input: "mnist_dataset_train_2/one_hot"
+  input: "concat_1"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 10
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "SoftmaxCrossEntropyWithLogits"
+  op: "SoftmaxCrossEntropyWithLogits"
+  input: "Reshape_1"
+  input: "Reshape_2"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+        }
+        shape {
+          dim {
+            size: 200
+          }
+          dim {
+            size: 10
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "gradients/SoftmaxCrossEntropyWithLogits_grad/PreventGradient"
+  op: "PreventGradient"
+  input: "SoftmaxCrossEntropyWithLogits:1"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+          dim {
+            size: 10
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "message"
+    value {
+      s: "Currently there is no way to take the second derivative of softmax_cross_entropy_with_logits due to the fused  implementation\'s interaction with tf.gradients()"
+    }
+  }
+}
+node {
+  name: "gradients/SoftmaxCrossEntropyWithLogits_grad/mul"
+  op: "Mul"
+  input: "gradients/SoftmaxCrossEntropyWithLogits_grad/ExpandDims"
+  input: "gradients/SoftmaxCrossEntropyWithLogits_grad/PreventGradient"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+          dim {
+            size: 10
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "gradients/Reshape_1_grad/Reshape"
+  op: "Reshape"
+  input: "gradients/SoftmaxCrossEntropyWithLogits_grad/mul"
+  input: "gradients/Reshape_1_grad/Shape"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+          dim {
+            size: 10
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "gradients/add_1_grad/Sum_1"
+  op: "Sum"
+  input: "gradients/Reshape_1_grad/Reshape"
+  input: "gradients/add_1_grad/BroadcastGradientArgs:1"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 10
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "keep_dims"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "gradients/add_1_grad/Reshape_1"
+  op: "Reshape"
+  input: "gradients/add_1_grad/Sum_1"
+  input: "gradients/add_1_grad/Shape_1"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 10
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "gradients/add_1_grad/Sum"
+  op: "Sum"
+  input: "gradients/Reshape_1_grad/Reshape"
+  input: "gradients/add_1_grad/BroadcastGradientArgs"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+          dim {
+            size: 10
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "keep_dims"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "gradients/add_1_grad/Reshape"
+  op: "Reshape"
+  input: "gradients/add_1_grad/Sum"
+  input: "gradients/add_1_grad/Shape"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+          dim {
+            size: 10
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "gradients/add_1_grad/tuple/group_deps"
+  op: "NoOp"
+  input: "^gradients/add_1_grad/Reshape"
+  input: "^gradients/add_1_grad/Reshape_1"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+      }
+    }
+  }
+}
+node {
+  name: "gradients/add_1_grad/tuple/control_dependency_1"
+  op: "Identity"
+  input: "gradients/add_1_grad/Reshape_1"
+  input: "^gradients/add_1_grad/tuple/group_deps"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@gradients/add_1_grad/Reshape_1"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 10
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "GradientDescent/update_logits_biases/ApplyGradientDescent"
+  op: "ApplyGradientDescent"
+  input: "logits_biases"
+  input: "GradientDescent/learning_rate"
+  input: "gradients/add_1_grad/tuple/control_dependency_1"
+  device: "/job:localhost/replica:0/task:0/cpu:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@logits_biases"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 10
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "use_locking"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "gradients/add_1_grad/tuple/control_dependency"
+  op: "Identity"
+  input: "gradients/add_1_grad/Reshape"
+  input: "^gradients/add_1_grad/tuple/group_deps"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@gradients/add_1_grad/Reshape"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+          dim {
+            size: 10
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "gradients/MatMul_1_grad/MatMul_1"
+  op: "MatMul"
+  input: "Relu"
+  input: "gradients/add_1_grad/tuple/control_dependency"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 100
+          }
+          dim {
+            size: 10
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "transpose_a"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "transpose_b"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "gradients/MatMul_1_grad/MatMul"
+  op: "MatMul"
+  input: "gradients/add_1_grad/tuple/control_dependency"
+  input: "logits_weights/read"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+          dim {
+            size: 100
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "transpose_a"
+    value {
+      b: false
+    }
+  }
+  attr {
+    key: "transpose_b"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "gradients/MatMul_1_grad/tuple/group_deps"
+  op: "NoOp"
+  input: "^gradients/MatMul_1_grad/MatMul"
+  input: "^gradients/MatMul_1_grad/MatMul_1"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+      }
+    }
+  }
+}
+node {
+  name: "gradients/MatMul_1_grad/tuple/control_dependency_1"
+  op: "Identity"
+  input: "gradients/MatMul_1_grad/MatMul_1"
+  input: "^gradients/MatMul_1_grad/tuple/group_deps"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@gradients/MatMul_1_grad/MatMul_1"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 100
+          }
+          dim {
+            size: 10
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "GradientDescent/update_logits_weights/ApplyGradientDescent"
+  op: "ApplyGradientDescent"
+  input: "logits_weights"
+  input: "GradientDescent/learning_rate"
+  input: "gradients/MatMul_1_grad/tuple/control_dependency_1"
+  device: "/job:localhost/replica:0/task:0/cpu:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@logits_weights"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 100
+          }
+          dim {
+            size: 10
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "use_locking"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "gradients/MatMul_1_grad/tuple/control_dependency"
+  op: "Identity"
+  input: "gradients/MatMul_1_grad/MatMul"
+  input: "^gradients/MatMul_1_grad/tuple/group_deps"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@gradients/MatMul_1_grad/MatMul"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+          dim {
+            size: 100
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "gradients/Relu_grad/ReluGrad"
+  op: "ReluGrad"
+  input: "gradients/MatMul_1_grad/tuple/control_dependency"
+  input: "Relu"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+          dim {
+            size: 100
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "gradients/add_grad/Sum_1"
+  op: "Sum"
+  input: "gradients/Relu_grad/ReluGrad"
+  input: "gradients/add_grad/BroadcastGradientArgs:1"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 100
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "keep_dims"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "gradients/add_grad/Reshape_1"
+  op: "Reshape"
+  input: "gradients/add_grad/Sum_1"
+  input: "gradients/add_grad/Shape_1"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 100
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "gradients/add_grad/Sum"
+  op: "Sum"
+  input: "gradients/Relu_grad/ReluGrad"
+  input: "gradients/add_grad/BroadcastGradientArgs"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+          dim {
+            size: 100
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "keep_dims"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "gradients/add_grad/Reshape"
+  op: "Reshape"
+  input: "gradients/add_grad/Sum"
+  input: "gradients/add_grad/Shape"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+          dim {
+            size: 100
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "gradients/add_grad/tuple/group_deps"
+  op: "NoOp"
+  input: "^gradients/add_grad/Reshape"
+  input: "^gradients/add_grad/Reshape_1"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+      }
+    }
+  }
+}
+node {
+  name: "gradients/add_grad/tuple/control_dependency_1"
+  op: "Identity"
+  input: "gradients/add_grad/Reshape_1"
+  input: "^gradients/add_grad/tuple/group_deps"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@gradients/add_grad/Reshape_1"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 100
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "GradientDescent/update_hidden_biases/ApplyGradientDescent"
+  op: "ApplyGradientDescent"
+  input: "hidden_biases"
+  input: "GradientDescent/learning_rate"
+  input: "gradients/add_grad/tuple/control_dependency_1"
+  device: "/job:localhost/replica:0/task:0/cpu:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@hidden_biases"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 100
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "use_locking"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "gradients/add_grad/tuple/control_dependency"
+  op: "Identity"
+  input: "gradients/add_grad/Reshape"
+  input: "^gradients/add_grad/tuple/group_deps"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@gradients/add_grad/Reshape"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+          dim {
+            size: 100
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "gradients/MatMul_grad/MatMul_1"
+  op: "MatMul"
+  input: "Reshape"
+  input: "gradients/add_grad/tuple/control_dependency"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 100
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "transpose_a"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "transpose_b"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "gradients/MatMul_grad/MatMul"
+  op: "MatMul"
+  input: "gradients/add_grad/tuple/control_dependency"
+  input: "hidden_weights/read"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+          dim {
+            size: 784
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "transpose_a"
+    value {
+      b: false
+    }
+  }
+  attr {
+    key: "transpose_b"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "gradients/MatMul_grad/tuple/group_deps"
+  op: "NoOp"
+  input: "^gradients/MatMul_grad/MatMul"
+  input: "^gradients/MatMul_grad/MatMul_1"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+      }
+    }
+  }
+}
+node {
+  name: "gradients/MatMul_grad/tuple/control_dependency_1"
+  op: "Identity"
+  input: "gradients/MatMul_grad/MatMul_1"
+  input: "^gradients/MatMul_grad/tuple/group_deps"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@gradients/MatMul_grad/MatMul_1"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 100
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "GradientDescent/update_hidden_weights/ApplyGradientDescent"
+  op: "ApplyGradientDescent"
+  input: "hidden_weights"
+  input: "GradientDescent/learning_rate"
+  input: "gradients/MatMul_grad/tuple/control_dependency_1"
+  device: "/job:localhost/replica:0/task:0/cpu:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@hidden_weights"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 784
+          }
+          dim {
+            size: 100
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "use_locking"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "GradientDescent"
+  op: "NoOp"
+  input: "^GradientDescent/update_hidden_weights/ApplyGradientDescent"
+  input: "^GradientDescent/update_hidden_biases/ApplyGradientDescent"
+  input: "^GradientDescent/update_logits_weights/ApplyGradientDescent"
+  input: "^GradientDescent/update_logits_biases/ApplyGradientDescent"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_2"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+      }
+    }
+  }
+}
+node {
+  name: "Reshape_3"
+  op: "Reshape"
+  input: "SoftmaxCrossEntropyWithLogits"
+  input: "Slice_2"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "Mean"
+  op: "Mean"
+  input: "Reshape_3"
+  input: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+  attr {
+    key: "keep_dims"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "_send_Mean_0"
+  op: "_Send"
+  input: "Mean"
+  device: "/job:localhost/replica:0/task:0/cpu:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "client_terminated"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "recv_device"
+    value {
+      s: "/job:localhost/replica:0/task:0/cpu:0"
+    }
+  }
+  attr {
+    key: "send_device"
+    value {
+      s: "/job:localhost/replica:0/task:0/cpu:0"
+    }
+  }
+  attr {
+    key: "send_device_incarnation"
+    value {
+      i: -5924635994370253548
+    }
+  }
+  attr {
+    key: "tensor_name"
+    value {
+      s: "Mean:0"
+    }
+  }
+}
+library {
+}
+versions {
+  producer: 21
+}
diff --git a/tensorflow/tensorboard/components/tf_graph_dashboard/demo/data/runs.json b/tensorflow/tensorboard/components/tf_graph_dashboard/demo/data/runs.json
new file mode 100644
index 0000000000000000000000000000000000000000..0429aa71f8271a291450f898e2a4b73da738b267
--- /dev/null
+++ b/tensorflow/tensorboard/components/tf_graph_dashboard/demo/data/runs.json
@@ -0,0 +1,6 @@
+{
+  "run1": {
+    "graph": true,
+    "scalars": ["foo/sin"]
+  }
+}
diff --git a/tensorflow/tensorboard/components/tf_graph_dashboard/demo/index.html b/tensorflow/tensorboard/components/tf_graph_dashboard/demo/index.html
new file mode 100644
index 0000000000000000000000000000000000000000..67756cc1298a15818263b1825b3d8a381b38ac7a
--- /dev/null
+++ b/tensorflow/tensorboard/components/tf_graph_dashboard/demo/index.html
@@ -0,0 +1,56 @@
+<!doctype html>
+<!--
+@license
+Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+-->
+
+<script src="../../webcomponentsjs/webcomponents-lite.min.js"></script>
+<link rel="import" href="../../iron-demo-helpers/demo-snippet.html">
+<link rel="import" href="../tf-graph-dashboard.html">
+<link rel="import" href="../../paper-styles/typography.html">
+
+<title>Graph Dashboard Demo</title>
+<style>
+  #demo-container {
+    display: block;
+    height: 900px;
+    position: relative;
+    width: 100%;
+  }
+</style>
+<demo-snippet>
+  <template>
+    <dom-module id="graph-dashboard-demo">
+      <template>
+        <tf-graph-dashboard backend="[[backend]]"></tf-graph-dashboard>
+      </template>
+      <script>
+        Polymer({
+          is: "graph-dashboard-demo",
+          properties: {
+            backend: {
+              type: Object,
+              value: function() {
+                var router = new TF.Backend.router("data", true);
+                return new TF.Backend.Backend(router);
+              },
+            },
+          },
+        });
+      </script>
+    </dom-module>
+    <graph-dashboard-demo id="demo-container"></graph-dashboard-demo>
+  </template>
+</demo-snippet>
diff --git a/tensorflow/tensorboard/components/tf_graph_dashboard/tf-graph-dashboard.html b/tensorflow/tensorboard/components/tf_graph_dashboard/tf-graph-dashboard.html
index 573c3dfa6021263fd859be9ad59e195e7a71fe79..891905e7c470aae627a03edf263d32ff8ed19c07 100644
--- a/tensorflow/tensorboard/components/tf_graph_dashboard/tf-graph-dashboard.html
+++ b/tensorflow/tensorboard/components/tf_graph_dashboard/tf-graph-dashboard.html
@@ -18,7 +18,7 @@ limitations under the License.
 <link rel="import" href="../polymer/polymer.html">
 <link rel="import" href="../tf-graph-loader/tf-graph-loader.html">
 <link rel="import" href="../tf-graph-board/tf-graph-board.html">
-<link rel="import" href="../tf-graph/tf-graph-controls.html">
+<link rel="import" href="../tf-graph-controls/tf-graph-controls.html">
 <link rel="import" href="../tf-dashboard-common/tf-dashboard.html">
 <link rel="import" href="../tf-backend/tf-backend.html">
 
@@ -103,6 +103,8 @@ out-hierarchy-params="{{_hierarchyParams}}"
 </dom-module>
 
 <script>
+"use strict";
+
 (function() {
 TF.Dashboard.TfGraphDashboard = Polymer({
   is: 'tf-graph-dashboard',
@@ -153,7 +155,7 @@ TF.Dashboard.TfGraphDashboard = Polymer({
     'node-toggle-expand': '_handleNodeToggleExpand',
   },
   observers: [
-    '_maybeFetchHealthPillsAtSpecificStep(allStepsModeEnabled, specificHealthPillStep)',
+    '_maybeFetchHealthPills(allStepsModeEnabled, specificHealthPillStep)',
     '_maybeInitializeDashboard(backend, _isAttached)',
   ],
   attached: function() {
@@ -163,19 +165,16 @@ TF.Dashboard.TfGraphDashboard = Polymer({
     this.set('_isAttached', false);
   },
   reload: function() {
-    if (!this.debuggerDataEnabled ||
-        !this.healthPillsToggledOn ||
-        !this._renderHierarchy ||
-        this._datasetsEmpty(this._datasets)) {
-      // Do not load debugger data if the feature is disabled, if the user toggled off the feature,
-      // or if the graph itself has not loaded yet. We need the graph to load so that we know which
-      // nodes to request health pills for.
-      return;
-    }
-
-    // Request debugger data on graph reloads, but do not re-request the graph itself. The graph
-    // would not change across reloads.
-    this._requestHealthPills();
+    this._maybeFetchHealthPills();
+  },
+  _shouldRequestHealthPills: function() {
+    // Do not load debugger data if the feature is disabled, if the user toggled off the feature,
+    // or if the graph itself has not loaded yet. We need the graph to load so that we know which
+    // nodes to request health pills for.
+    return this.debuggerDataEnabled &&
+        this.healthPillsToggledOn &&
+        this._renderHierarchy &&
+        !this._datasetsEmpty(this._datasets);
   },
   _maybeInitializeDashboard: function(backend, isAttached) {
     if (this._initialized || !backend || !isAttached) {
@@ -277,7 +276,7 @@ TF.Dashboard.TfGraphDashboard = Polymer({
   },
   _handleNodeToggleExpand: function() {
     // Nodes were toggled. We may need to request health pills for more nodes.
-    this._requestHealthPills();
+    this._maybeFetchHealthPills();
   },
   _healthPillsToggledOnChanged: function(healthPillsToggledOn) {
     if (healthPillsToggledOn) {
@@ -289,9 +288,8 @@ TF.Dashboard.TfGraphDashboard = Polymer({
     }
   },
   // Fetch health pills for a specific step if applicable.
-  _maybeFetchHealthPillsAtSpecificStep: function(allStepsModeEnabled, specificHealthPillStep) {
-    if (!this._renderHierarchy) {
-      // The graph is not ready yet.
+  _maybeFetchHealthPills: function() {
+    if (!this._shouldRequestHealthPills()) {
       return;
     }
 
diff --git a/tensorflow/tensorboard/components/tf_graph_dashboard_d3v4/BUILD b/tensorflow/tensorboard/components/tf_graph_dashboard_d3v4/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..0cee324f48b4d163225186114c76f89a117fd013
--- /dev/null
+++ b/tensorflow/tensorboard/components/tf_graph_dashboard_d3v4/BUILD
@@ -0,0 +1,30 @@
+package(default_visibility = ["//tensorflow:internal"])
+
+load("@io_bazel_rules_closure//closure:defs.bzl", "web_library")
+load("//tensorflow/tensorboard:defs.bzl", "tensorboard_ts_library")
+load("//tensorflow/tensorboard:defs.bzl", "tensorboard_webcomponent_library")
+
+licenses(["notice"])  # Apache 2.0
+
+web_library(
+    name = "tf_graph_dashboard_d3v4",
+    srcs = [
+        "tf-graph-dashboard.html",
+    ],
+    path = "/tf-graph-dashboard",
+    deps = [
+        "//tensorflow/tensorboard/components/tf_backend_d3v4",
+        "//tensorflow/tensorboard/components/tf_dashboard_common_d3v4",
+        "//tensorflow/tensorboard/components/tf_graph_board_d3v4",
+        "//tensorflow/tensorboard/components/tf_graph_controls_d3v4",
+        "//tensorflow/tensorboard/components/tf_graph_d3v4",
+        "//tensorflow/tensorboard/components/tf_graph_loader_d3v4",
+        "@org_polymer",
+    ],
+)
+
+filegroup(
+    name = "all_files",
+    srcs = glob(["**"]),
+    tags = ["notsan"],
+)
diff --git a/tensorflow/tensorboard/components/tf_graph_dashboard_d3v4/demo/BUILD b/tensorflow/tensorboard/components/tf_graph_dashboard_d3v4/demo/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..74238d78e2f2f97a054d3588abf7d3b08ef02867
--- /dev/null
+++ b/tensorflow/tensorboard/components/tf_graph_dashboard_d3v4/demo/BUILD
@@ -0,0 +1,24 @@
+package(default_visibility = ["//tensorflow:internal"])
+
+load("@io_bazel_rules_closure//closure:defs.bzl", "web_library")
+
+licenses(["notice"])  # Apache 2.0
+
+# bazel run //third_party/tensorflow/tensorboard/components/tf_graph_dashboard/demo
+web_library(
+    name = "demo",
+    srcs = ["index.html"] + glob(["data/**"]),
+    path = "/tf-graph-dashboard/demo",
+    deps = [
+        "//tensorflow/tensorboard/components/tf_graph_dashboard_d3v4",
+        "@org_polymer_iron_demo_helpers",
+        "@org_polymer_paper_styles",
+        "@org_polymer_webcomponentsjs",
+    ],
+)
+
+filegroup(
+    name = "all_files",
+    srcs = glob(["**"]),
+    tags = ["notsan"],
+)
diff --git a/tensorflow/tensorboard/components/tf_graph_dashboard_d3v4/demo/data/graph_run_run1.pbtxt b/tensorflow/tensorboard/components/tf_graph_dashboard_d3v4/demo/data/graph_run_run1.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..30b206453469801d31b46856c29cdda78164f18f
--- /dev/null
+++ b/tensorflow/tensorboard/components/tf_graph_dashboard_d3v4/demo/data/graph_run_run1.pbtxt
@@ -0,0 +1,4606 @@
+node {
+  name: "GradientDescent/learning_rate"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_3"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: 0.1
+      }
+    }
+  }
+}
+node {
+  name: "gradients/add_grad/Shape_1"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 100
+      }
+    }
+  }
+}
+node {
+  name: "gradients/add_grad/Shape"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 2
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\310\000\000\000d\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "gradients/add_grad/BroadcastGradientArgs"
+  op: "BroadcastGradientArgs"
+  input: "gradients/add_grad/Shape"
+  input: "gradients/add_grad/Shape_1"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+        }
+        shape {
+          dim {
+            size: -1
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "gradients/add_1_grad/Shape_1"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 10
+      }
+    }
+  }
+}
+node {
+  name: "gradients/add_1_grad/Shape"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 2
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\310\000\000\000\n\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "gradients/add_1_grad/BroadcastGradientArgs"
+  op: "BroadcastGradientArgs"
+  input: "gradients/add_1_grad/Shape"
+  input: "gradients/add_1_grad/Shape_1"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+        }
+        shape {
+          dim {
+            size: -1
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "gradients/Reshape_1_grad/Shape"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 2
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\310\000\000\000\n\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "gradients/SoftmaxCrossEntropyWithLogits_grad/ExpandDims/dim"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: -1
+      }
+    }
+  }
+}
+node {
+  name: "gradients/Reshape_3_grad/Shape"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 200
+      }
+    }
+  }
+}
+node {
+  name: "gradients/Mean_grad/Maximum/y"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "gradients/Mean_grad/Const_1"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "gradients/Mean_grad/Const"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "gradients/Mean_grad/Shape_1"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "gradients/Mean_grad/Prod_1"
+  op: "Prod"
+  input: "gradients/Mean_grad/Shape_1"
+  input: "gradients/Mean_grad/Const_1"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+  attr {
+    key: "keep_dims"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "gradients/Mean_grad/Maximum"
+  op: "Maximum"
+  input: "gradients/Mean_grad/Prod_1"
+  input: "gradients/Mean_grad/Maximum/y"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+}
+node {
+  name: "gradients/Mean_grad/Shape"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 200
+      }
+    }
+  }
+}
+node {
+  name: "gradients/Mean_grad/Prod"
+  op: "Prod"
+  input: "gradients/Mean_grad/Shape"
+  input: "gradients/Mean_grad/Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+  attr {
+    key: "keep_dims"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "gradients/Mean_grad/floordiv"
+  op: "FloorDiv"
+  input: "gradients/Mean_grad/Prod"
+  input: "gradients/Mean_grad/Maximum"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+}
+node {
+  name: "gradients/Mean_grad/Cast"
+  op: "Cast"
+  input: "gradients/Mean_grad/floordiv"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "DstT"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "SrcT"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+}
+node {
+  name: "gradients/Mean_grad/Tile/multiples"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 200
+      }
+    }
+  }
+}
+node {
+  name: "gradients/Mean_grad/Reshape/shape"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "gradients/Const"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "gradients/Shape"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "gradients/Fill"
+  op: "Fill"
+  input: "gradients/Shape"
+  input: "gradients/Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+}
+node {
+  name: "gradients/Mean_grad/Reshape"
+  op: "Reshape"
+  input: "gradients/Fill"
+  input: "gradients/Mean_grad/Reshape/shape"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "gradients/Mean_grad/Tile"
+  op: "Tile"
+  input: "gradients/Mean_grad/Reshape"
+  input: "gradients/Mean_grad/Tile/multiples"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tmultiples"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "gradients/Mean_grad/truediv"
+  op: "RealDiv"
+  input: "gradients/Mean_grad/Tile"
+  input: "gradients/Mean_grad/Cast"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "gradients/Reshape_3_grad/Reshape"
+  op: "Reshape"
+  input: "gradients/Mean_grad/truediv"
+  input: "gradients/Reshape_3_grad/Shape"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "gradients/SoftmaxCrossEntropyWithLogits_grad/ExpandDims"
+  op: "ExpandDims"
+  input: "gradients/Reshape_3_grad/Reshape"
+  input: "gradients/SoftmaxCrossEntropyWithLogits_grad/ExpandDims/dim"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "Const"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "Slice_2/begin"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "Sub_2/y"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "concat_1/axis"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "concat_1/values_0"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: -1
+      }
+    }
+  }
+}
+node {
+  name: "Slice_1/size"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "Sub_1/y"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "Shape_2"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 2
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\310\000\000\000\n\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "Rank_2"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 2
+      }
+    }
+  }
+}
+node {
+  name: "Sub_1"
+  op: "Sub"
+  input: "Rank_2"
+  input: "Sub_1/y"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+}
+node {
+  name: "Slice_1/begin"
+  op: "Pack"
+  input: "Sub_1"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "N"
+    value {
+      i: 1
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "Slice_1"
+  op: "Slice"
+  input: "Shape_2"
+  input: "Slice_1/begin"
+  input: "Slice_1/size"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "concat_1"
+  op: "ConcatV2"
+  input: "concat_1/values_0"
+  input: "Slice_1"
+  input: "concat_1/axis"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 2
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "concat/axis"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "concat/values_0"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: -1
+      }
+    }
+  }
+}
+node {
+  name: "Slice/size"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "Sub/y"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "Shape_1"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 2
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\310\000\000\000\n\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "Rank_1"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 2
+      }
+    }
+  }
+}
+node {
+  name: "Sub"
+  op: "Sub"
+  input: "Rank_1"
+  input: "Sub/y"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+}
+node {
+  name: "Slice/begin"
+  op: "Pack"
+  input: "Sub"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "N"
+    value {
+      i: 1
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "Slice"
+  op: "Slice"
+  input: "Shape_1"
+  input: "Slice/begin"
+  input: "Slice/size"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "concat"
+  op: "ConcatV2"
+  input: "concat/values_0"
+  input: "Slice"
+  input: "concat/axis"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 2
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "Shape"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 2
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\310\000\000\000\n\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "Rank"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 2
+      }
+    }
+  }
+}
+node {
+  name: "Sub_2"
+  op: "Sub"
+  input: "Rank"
+  input: "Sub_2/y"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+}
+node {
+  name: "Slice_2/size"
+  op: "Pack"
+  input: "Sub_2"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "N"
+    value {
+      i: 1
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "Slice_2"
+  op: "Slice"
+  input: "Shape"
+  input: "Slice_2/begin"
+  input: "Slice_2/size"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "logits_biases"
+  op: "VariableV2"
+  device: "/job:localhost/replica:0/task:0/cpu:0"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@logits_biases"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 10
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "container"
+    value {
+      s: ""
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "shape"
+    value {
+      shape {
+        dim {
+          size: 10
+        }
+      }
+    }
+  }
+  attr {
+    key: "shared_name"
+    value {
+      s: ""
+    }
+  }
+}
+node {
+  name: "logits_biases/read"
+  op: "Identity"
+  input: "logits_biases"
+  device: "/job:localhost/replica:0/task:0/cpu:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@logits_biases"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 10
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "logits_weights"
+  op: "VariableV2"
+  device: "/job:localhost/replica:0/task:0/cpu:0"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@logits_weights"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 100
+          }
+          dim {
+            size: 10
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "container"
+    value {
+      s: ""
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "shape"
+    value {
+      shape {
+        dim {
+          size: 100
+        }
+        dim {
+          size: 10
+        }
+      }
+    }
+  }
+  attr {
+    key: "shared_name"
+    value {
+      s: ""
+    }
+  }
+}
+node {
+  name: "logits_weights/read"
+  op: "Identity"
+  input: "logits_weights"
+  device: "/job:localhost/replica:0/task:0/cpu:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@logits_weights"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 100
+          }
+          dim {
+            size: 10
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "hidden_biases"
+  op: "VariableV2"
+  device: "/job:localhost/replica:0/task:0/cpu:0"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@hidden_biases"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 100
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "container"
+    value {
+      s: ""
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "shape"
+    value {
+      shape {
+        dim {
+          size: 100
+        }
+      }
+    }
+  }
+  attr {
+    key: "shared_name"
+    value {
+      s: ""
+    }
+  }
+}
+node {
+  name: "hidden_biases/read"
+  op: "Identity"
+  input: "hidden_biases"
+  device: "/job:localhost/replica:0/task:0/cpu:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@hidden_biases"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 100
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "hidden_weights"
+  op: "VariableV2"
+  device: "/job:localhost/replica:0/task:0/cpu:0"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@hidden_weights"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 784
+          }
+          dim {
+            size: 100
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "container"
+    value {
+      s: ""
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "shape"
+    value {
+      shape {
+        dim {
+          size: 784
+        }
+        dim {
+          size: 100
+        }
+      }
+    }
+  }
+  attr {
+    key: "shared_name"
+    value {
+      s: ""
+    }
+  }
+}
+node {
+  name: "hidden_weights/read"
+  op: "Identity"
+  input: "hidden_weights"
+  device: "/job:localhost/replica:0/task:0/cpu:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@hidden_weights"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 784
+          }
+          dim {
+            size: 100
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "Reshape/shape"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/cpu:0"
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 2
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\310\000\000\000\377\377\377\377"
+      }
+    }
+  }
+}
+node {
+  name: "mnist_dataset_train_2/one_hot/depth"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/cpu:0"
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 10
+      }
+    }
+  }
+}
+node {
+  name: "mnist_dataset_train_2/one_hot/off_value"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/cpu:0"
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "mnist_dataset_train_2/one_hot/on_value"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/cpu:0"
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "mnist_dataset_train_2/random_shuffle_queue_DequeueMany/n"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/cpu:0"
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 200
+      }
+    }
+  }
+}
+node {
+  name: "mnist_dataset_train_1/random_shuffle_queue"
+  op: "RandomShuffleQueueV2"
+  device: "/job:localhost/replica:0/task:0/cpu:0"
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+  attr {
+    key: "capacity"
+    value {
+      i: 20000
+    }
+  }
+  attr {
+    key: "component_types"
+    value {
+      list {
+        type: DT_FLOAT
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    key: "container"
+    value {
+      s: ""
+    }
+  }
+  attr {
+    key: "min_after_dequeue"
+    value {
+      i: 4000
+    }
+  }
+  attr {
+    key: "seed"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "seed2"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 28
+          }
+          dim {
+            size: 28
+          }
+          dim {
+            size: 1
+          }
+        }
+        shape {
+        }
+      }
+    }
+  }
+  attr {
+    key: "shared_name"
+    value {
+      s: ""
+    }
+  }
+}
+node {
+  name: "mnist_dataset_train_2/random_shuffle_queue_DequeueMany"
+  op: "QueueDequeueManyV2"
+  input: "mnist_dataset_train_1/random_shuffle_queue"
+  input: "mnist_dataset_train_2/random_shuffle_queue_DequeueMany/n"
+  device: "/job:localhost/replica:0/task:0/cpu:0"
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          unknown_rank: true
+        }
+        shape {
+          unknown_rank: true
+        }
+      }
+    }
+  }
+  attr {
+    key: "component_types"
+    value {
+      list {
+        type: DT_FLOAT
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    key: "timeout_ms"
+    value {
+      i: -1
+    }
+  }
+}
+node {
+  name: "Reshape"
+  op: "Reshape"
+  input: "mnist_dataset_train_2/random_shuffle_queue_DequeueMany"
+  input: "Reshape/shape"
+  device: "/job:localhost/replica:0/task:0/cpu:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+          dim {
+            size: -1
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "MatMul"
+  op: "MatMul"
+  input: "Reshape"
+  input: "hidden_weights/read"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+          dim {
+            size: 100
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "transpose_a"
+    value {
+      b: false
+    }
+  }
+  attr {
+    key: "transpose_b"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "add"
+  op: "Add"
+  input: "MatMul"
+  input: "hidden_biases/read"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+          dim {
+            size: 100
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "Relu"
+  op: "Relu"
+  input: "add"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+          dim {
+            size: 100
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "MatMul_1"
+  op: "MatMul"
+  input: "Relu"
+  input: "logits_weights/read"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+          dim {
+            size: 10
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "transpose_a"
+    value {
+      b: false
+    }
+  }
+  attr {
+    key: "transpose_b"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "add_1"
+  op: "Add"
+  input: "MatMul_1"
+  input: "logits_biases/read"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+          dim {
+            size: 10
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "Reshape_1"
+  op: "Reshape"
+  input: "add_1"
+  input: "concat"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+          dim {
+            size: 10
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "mnist_dataset_train_2/one_hot"
+  op: "OneHot"
+  input: "mnist_dataset_train_2/random_shuffle_queue_DequeueMany:1"
+  input: "mnist_dataset_train_2/one_hot/depth"
+  input: "mnist_dataset_train_2/one_hot/on_value"
+  input: "mnist_dataset_train_2/one_hot/off_value"
+  device: "/job:localhost/replica:0/task:0/cpu:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "TI"
+    value {
+      type: DT_INT64
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          unknown_rank: true
+        }
+      }
+    }
+  }
+  attr {
+    key: "axis"
+    value {
+      i: -1
+    }
+  }
+}
+node {
+  name: "Reshape_2"
+  op: "Reshape"
+  input: "mnist_dataset_train_2/one_hot"
+  input: "concat_1"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 10
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "SoftmaxCrossEntropyWithLogits"
+  op: "SoftmaxCrossEntropyWithLogits"
+  input: "Reshape_1"
+  input: "Reshape_2"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+        }
+        shape {
+          dim {
+            size: 200
+          }
+          dim {
+            size: 10
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "gradients/SoftmaxCrossEntropyWithLogits_grad/PreventGradient"
+  op: "PreventGradient"
+  input: "SoftmaxCrossEntropyWithLogits:1"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+          dim {
+            size: 10
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "message"
+    value {
+      s: "Currently there is no way to take the second derivative of softmax_cross_entropy_with_logits due to the fused  implementation\'s interaction with tf.gradients()"
+    }
+  }
+}
+node {
+  name: "gradients/SoftmaxCrossEntropyWithLogits_grad/mul"
+  op: "Mul"
+  input: "gradients/SoftmaxCrossEntropyWithLogits_grad/ExpandDims"
+  input: "gradients/SoftmaxCrossEntropyWithLogits_grad/PreventGradient"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+          dim {
+            size: 10
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "gradients/Reshape_1_grad/Reshape"
+  op: "Reshape"
+  input: "gradients/SoftmaxCrossEntropyWithLogits_grad/mul"
+  input: "gradients/Reshape_1_grad/Shape"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+          dim {
+            size: 10
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "gradients/add_1_grad/Sum_1"
+  op: "Sum"
+  input: "gradients/Reshape_1_grad/Reshape"
+  input: "gradients/add_1_grad/BroadcastGradientArgs:1"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 10
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "keep_dims"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "gradients/add_1_grad/Reshape_1"
+  op: "Reshape"
+  input: "gradients/add_1_grad/Sum_1"
+  input: "gradients/add_1_grad/Shape_1"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 10
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "gradients/add_1_grad/Sum"
+  op: "Sum"
+  input: "gradients/Reshape_1_grad/Reshape"
+  input: "gradients/add_1_grad/BroadcastGradientArgs"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+          dim {
+            size: 10
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "keep_dims"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "gradients/add_1_grad/Reshape"
+  op: "Reshape"
+  input: "gradients/add_1_grad/Sum"
+  input: "gradients/add_1_grad/Shape"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+          dim {
+            size: 10
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "gradients/add_1_grad/tuple/group_deps"
+  op: "NoOp"
+  input: "^gradients/add_1_grad/Reshape"
+  input: "^gradients/add_1_grad/Reshape_1"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+      }
+    }
+  }
+}
+node {
+  name: "gradients/add_1_grad/tuple/control_dependency_1"
+  op: "Identity"
+  input: "gradients/add_1_grad/Reshape_1"
+  input: "^gradients/add_1_grad/tuple/group_deps"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@gradients/add_1_grad/Reshape_1"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 10
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "GradientDescent/update_logits_biases/ApplyGradientDescent"
+  op: "ApplyGradientDescent"
+  input: "logits_biases"
+  input: "GradientDescent/learning_rate"
+  input: "gradients/add_1_grad/tuple/control_dependency_1"
+  device: "/job:localhost/replica:0/task:0/cpu:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@logits_biases"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 10
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "use_locking"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "gradients/add_1_grad/tuple/control_dependency"
+  op: "Identity"
+  input: "gradients/add_1_grad/Reshape"
+  input: "^gradients/add_1_grad/tuple/group_deps"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@gradients/add_1_grad/Reshape"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+          dim {
+            size: 10
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "gradients/MatMul_1_grad/MatMul_1"
+  op: "MatMul"
+  input: "Relu"
+  input: "gradients/add_1_grad/tuple/control_dependency"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 100
+          }
+          dim {
+            size: 10
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "transpose_a"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "transpose_b"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "gradients/MatMul_1_grad/MatMul"
+  op: "MatMul"
+  input: "gradients/add_1_grad/tuple/control_dependency"
+  input: "logits_weights/read"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+          dim {
+            size: 100
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "transpose_a"
+    value {
+      b: false
+    }
+  }
+  attr {
+    key: "transpose_b"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "gradients/MatMul_1_grad/tuple/group_deps"
+  op: "NoOp"
+  input: "^gradients/MatMul_1_grad/MatMul"
+  input: "^gradients/MatMul_1_grad/MatMul_1"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+      }
+    }
+  }
+}
+node {
+  name: "gradients/MatMul_1_grad/tuple/control_dependency_1"
+  op: "Identity"
+  input: "gradients/MatMul_1_grad/MatMul_1"
+  input: "^gradients/MatMul_1_grad/tuple/group_deps"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@gradients/MatMul_1_grad/MatMul_1"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 100
+          }
+          dim {
+            size: 10
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "GradientDescent/update_logits_weights/ApplyGradientDescent"
+  op: "ApplyGradientDescent"
+  input: "logits_weights"
+  input: "GradientDescent/learning_rate"
+  input: "gradients/MatMul_1_grad/tuple/control_dependency_1"
+  device: "/job:localhost/replica:0/task:0/cpu:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@logits_weights"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 100
+          }
+          dim {
+            size: 10
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "use_locking"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "gradients/MatMul_1_grad/tuple/control_dependency"
+  op: "Identity"
+  input: "gradients/MatMul_1_grad/MatMul"
+  input: "^gradients/MatMul_1_grad/tuple/group_deps"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@gradients/MatMul_1_grad/MatMul"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+          dim {
+            size: 100
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "gradients/Relu_grad/ReluGrad"
+  op: "ReluGrad"
+  input: "gradients/MatMul_1_grad/tuple/control_dependency"
+  input: "Relu"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+          dim {
+            size: 100
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "gradients/add_grad/Sum_1"
+  op: "Sum"
+  input: "gradients/Relu_grad/ReluGrad"
+  input: "gradients/add_grad/BroadcastGradientArgs:1"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 100
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "keep_dims"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "gradients/add_grad/Reshape_1"
+  op: "Reshape"
+  input: "gradients/add_grad/Sum_1"
+  input: "gradients/add_grad/Shape_1"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 100
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "gradients/add_grad/Sum"
+  op: "Sum"
+  input: "gradients/Relu_grad/ReluGrad"
+  input: "gradients/add_grad/BroadcastGradientArgs"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+          dim {
+            size: 100
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "keep_dims"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "gradients/add_grad/Reshape"
+  op: "Reshape"
+  input: "gradients/add_grad/Sum"
+  input: "gradients/add_grad/Shape"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+          dim {
+            size: 100
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "gradients/add_grad/tuple/group_deps"
+  op: "NoOp"
+  input: "^gradients/add_grad/Reshape"
+  input: "^gradients/add_grad/Reshape_1"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+      }
+    }
+  }
+}
+node {
+  name: "gradients/add_grad/tuple/control_dependency_1"
+  op: "Identity"
+  input: "gradients/add_grad/Reshape_1"
+  input: "^gradients/add_grad/tuple/group_deps"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@gradients/add_grad/Reshape_1"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 100
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "GradientDescent/update_hidden_biases/ApplyGradientDescent"
+  op: "ApplyGradientDescent"
+  input: "hidden_biases"
+  input: "GradientDescent/learning_rate"
+  input: "gradients/add_grad/tuple/control_dependency_1"
+  device: "/job:localhost/replica:0/task:0/cpu:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@hidden_biases"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 100
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "use_locking"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "gradients/add_grad/tuple/control_dependency"
+  op: "Identity"
+  input: "gradients/add_grad/Reshape"
+  input: "^gradients/add_grad/tuple/group_deps"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@gradients/add_grad/Reshape"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+          dim {
+            size: 100
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "gradients/MatMul_grad/MatMul_1"
+  op: "MatMul"
+  input: "Reshape"
+  input: "gradients/add_grad/tuple/control_dependency"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 100
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "transpose_a"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "transpose_b"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "gradients/MatMul_grad/MatMul"
+  op: "MatMul"
+  input: "gradients/add_grad/tuple/control_dependency"
+  input: "hidden_weights/read"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+          dim {
+            size: 784
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "transpose_a"
+    value {
+      b: false
+    }
+  }
+  attr {
+    key: "transpose_b"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "gradients/MatMul_grad/tuple/group_deps"
+  op: "NoOp"
+  input: "^gradients/MatMul_grad/MatMul"
+  input: "^gradients/MatMul_grad/MatMul_1"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+      }
+    }
+  }
+}
+node {
+  name: "gradients/MatMul_grad/tuple/control_dependency_1"
+  op: "Identity"
+  input: "gradients/MatMul_grad/MatMul_1"
+  input: "^gradients/MatMul_grad/tuple/group_deps"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@gradients/MatMul_grad/MatMul_1"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 100
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "GradientDescent/update_hidden_weights/ApplyGradientDescent"
+  op: "ApplyGradientDescent"
+  input: "hidden_weights"
+  input: "GradientDescent/learning_rate"
+  input: "gradients/MatMul_grad/tuple/control_dependency_1"
+  device: "/job:localhost/replica:0/task:0/cpu:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@hidden_weights"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 784
+          }
+          dim {
+            size: 100
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "use_locking"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "GradientDescent"
+  op: "NoOp"
+  input: "^GradientDescent/update_hidden_weights/ApplyGradientDescent"
+  input: "^GradientDescent/update_hidden_biases/ApplyGradientDescent"
+  input: "^GradientDescent/update_logits_weights/ApplyGradientDescent"
+  input: "^GradientDescent/update_logits_biases/ApplyGradientDescent"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_2"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+      }
+    }
+  }
+}
+node {
+  name: "Reshape_3"
+  op: "Reshape"
+  input: "SoftmaxCrossEntropyWithLogits"
+  input: "Slice_2"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "Mean"
+  op: "Mean"
+  input: "Reshape_3"
+  input: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+  attr {
+    key: "keep_dims"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "_send_Mean_0"
+  op: "_Send"
+  input: "Mean"
+  device: "/job:localhost/replica:0/task:0/cpu:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "client_terminated"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "recv_device"
+    value {
+      s: "/job:localhost/replica:0/task:0/cpu:0"
+    }
+  }
+  attr {
+    key: "send_device"
+    value {
+      s: "/job:localhost/replica:0/task:0/cpu:0"
+    }
+  }
+  attr {
+    key: "send_device_incarnation"
+    value {
+      i: -5924635994370253548
+    }
+  }
+  attr {
+    key: "tensor_name"
+    value {
+      s: "Mean:0"
+    }
+  }
+}
+library {
+}
+versions {
+  producer: 21
+}
diff --git a/tensorflow/tensorboard/components/tf_graph_dashboard_d3v4/demo/data/runs.json b/tensorflow/tensorboard/components/tf_graph_dashboard_d3v4/demo/data/runs.json
new file mode 100644
index 0000000000000000000000000000000000000000..0429aa71f8271a291450f898e2a4b73da738b267
--- /dev/null
+++ b/tensorflow/tensorboard/components/tf_graph_dashboard_d3v4/demo/data/runs.json
@@ -0,0 +1,6 @@
+{
+  "run1": {
+    "graph": true,
+    "scalars": ["foo/sin"]
+  }
+}
diff --git a/tensorflow/tensorboard/components/tf_graph_dashboard_d3v4/demo/index.html b/tensorflow/tensorboard/components/tf_graph_dashboard_d3v4/demo/index.html
new file mode 100644
index 0000000000000000000000000000000000000000..67756cc1298a15818263b1825b3d8a381b38ac7a
--- /dev/null
+++ b/tensorflow/tensorboard/components/tf_graph_dashboard_d3v4/demo/index.html
@@ -0,0 +1,56 @@
+<!doctype html>
+<!--
+@license
+Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+-->
+
+<script src="../../webcomponentsjs/webcomponents-lite.min.js"></script>
+<link rel="import" href="../../iron-demo-helpers/demo-snippet.html">
+<link rel="import" href="../tf-graph-dashboard.html">
+<link rel="import" href="../../paper-styles/typography.html">
+
+<title>Graph Dashboard Demo</title>
+<style>
+  #demo-container {
+    display: block;
+    height: 900px;
+    position: relative;
+    width: 100%;
+  }
+</style>
+<demo-snippet>
+  <template>
+    <dom-module id="graph-dashboard-demo">
+      <template>
+        <tf-graph-dashboard backend="[[backend]]"></tf-graph-dashboard>
+      </template>
+      <script>
+        Polymer({
+          is: "graph-dashboard-demo",
+          properties: {
+            backend: {
+              type: Object,
+              value: function() {
+                var router = new TF.Backend.router("data", true);
+                return new TF.Backend.Backend(router);
+              },
+            },
+          },
+        });
+      </script>
+    </dom-module>
+    <graph-dashboard-demo id="demo-container"></graph-dashboard-demo>
+  </template>
+</demo-snippet>
diff --git a/tensorflow/tensorboard/components/tf_graph_dashboard_d3v4/tf-graph-dashboard.html b/tensorflow/tensorboard/components/tf_graph_dashboard_d3v4/tf-graph-dashboard.html
new file mode 100644
index 0000000000000000000000000000000000000000..891905e7c470aae627a03edf263d32ff8ed19c07
--- /dev/null
+++ b/tensorflow/tensorboard/components/tf_graph_dashboard_d3v4/tf-graph-dashboard.html
@@ -0,0 +1,300 @@
+<!--
+@license
+Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+-->
+
+<link rel="import" href="../polymer/polymer.html">
+<link rel="import" href="../tf-graph-loader/tf-graph-loader.html">
+<link rel="import" href="../tf-graph-board/tf-graph-board.html">
+<link rel="import" href="../tf-graph-controls/tf-graph-controls.html">
+<link rel="import" href="../tf-dashboard-common/tf-dashboard.html">
+<link rel="import" href="../tf-backend/tf-backend.html">
+
+<!--
+tf-graph-dashboard displays a graph from a TensorFlow run.
+
+It has simple behavior: Creates a url-generator and run-generator
+to talk to the backend, and then passes the runsWithGraph (list of runs with
+associated graphs) along with the url generator into tf-graph-board for display.
+
+If there are multiple runs with graphs, the first run's graph is shown
+by default. The user can select a different run from a dropdown menu.
+-->
+<dom-module id="tf-graph-dashboard">
+<template>
+<tf-no-data-warning
+  data-type="graph"
+  show-warning="[[_datasetsEmpty(_datasets)]]"
+></tf-no-data-warning>
+<template is="dom-if" if="[[!_datasetsEmpty(_datasets)]]">
+<tf-dashboard-layout>
+<div class="sidebar">
+  <tf-graph-controls id="controls"
+        devices-for-stats="{{_devicesForStats}}"
+        color-by-params="[[_colorByParams]]"
+        stats="[[_stats]]"
+        color-by="{{_colorBy}}"
+        datasets="[[_datasets]]"
+        render-hierarchy="[[_renderHierarchy]]"
+        selected-dataset="{{_selectedDataset}}"
+        selected-file="{{_selectedFile}}"
+        selected-metadata-tag="{{_selectedMetadataTag}}"
+        health-pills-feature-enabled="[[debuggerDataEnabled]]"
+        health-pills-toggled-on="{{healthPillsToggledOn}}"
+  ></tf-graph-controls>
+  <tf-graph-loader id="loader"
+        datasets="[[_datasets]]"
+        selected-dataset="[[_selectedDataset]]"
+        selected-metadata-tag="[[_selectedMetadataTag]]"
+        selected-file="[[_selectedFile]]"
+        out-graph-hierarchy="{{_graphHierarchy}}"
+        out-graph="{{_graph}}"
+        out-stats="{{_stats}}"
+        progress="{{_progress}}"
+out-hierarchy-params="{{_hierarchyParams}}"
+  ></tf-graph-loader>
+</div>
+<div class="center">
+    <tf-graph-board id="graphboard"
+        devices-for-stats="[[_devicesForStats]]"
+        color-by="[[_colorBy]]"
+        color-by-params="{{_colorByParams}}"
+        graph-hierarchy="[[_graphHierarchy]]"
+        graph="[[_graph]]"
+        hierarchy-params="[[_hierarchyParams]]"
+        progress="[[_progress]]"
+        debugger-data-enabled="[[debuggerDataEnabled]]"
+        are-health-pills-loading="[[_areHealthPillsLoading]]"
+        node-names-to-health-pills="[[_nodeNamesToHealthPills]]"
+        all-steps-mode-enabled="{{allStepsModeEnabled}}"
+        specific-health-pill-step="{{specificHealthPillStep}}"
+        health-pill-step-index="[[_healthPillStepIndex]]"
+        render-hierarchy="{{_renderHierarchy}}"
+        stats="[[_stats]]"
+    ></tf-graph-board>
+</div>
+</tf-dashboard-layout>
+</template>
+<style>
+
+:host /deep/ {
+  font-family: 'Roboto', sans-serif;
+}
+
+.center {
+  position: relative;
+  height: 100%;
+}
+
+</style>
+</template>
+</dom-module>
+
+<script>
+"use strict";
+
+(function() {
+TF.Dashboard.TfGraphDashboard = Polymer({
+  is: 'tf-graph-dashboard',
+  factoryImpl: function(backend, debuggerDataEnabled) {
+    this.backend = backend;
+    this.debuggerDataEnabled = debuggerDataEnabled;
+  },
+  behaviors: [
+    TF.Dashboard.DashboardBehavior("graphs"),
+    TF.Dashboard.ReloadBehavior("tf-graph-dashboard"),
+    TF.Backend.Behavior,
+  ],
+  properties: {
+    _datasets: Object,
+    _renderHierarchy: {type: Object, observer: '_renderHierarchyChanged'},
+    backend: Object,
+    debuggerDataEnabled: Boolean,
+    allStepsModeEnabled: Boolean,
+    specificHealthPillStep: {type: Number, value: 0},
+    healthPillsToggledOn: {type: Boolean, value: true, observer: '_healthPillsToggledOnChanged'},
+    _isAttached: Boolean,
+    // Whether this dashboard is initialized. This dashboard should only be initialized once.
+    _initialized: Boolean,
+    // Whether health pills are currently being loaded, in which case we may want to say show a
+    // spinner.
+    _areHealthPillsLoading: Boolean,
+    // Maps the names of nodes to an array of health pills (HealthPillDatums).
+    _nodeNamesToHealthPills: {
+      type: Object,
+      value: {},
+    },
+    _healthPillStepIndex: Number,
+    // A strictly increasing ID. Each request for health pills has a unique ID. This helps us
+    // identify stale requests.
+    _healthPillRequestId: {type: Number, value: 1},
+    // The setTimeout ID for the pending request for health pills at a specific step.
+    _healthPillStepRequestTimerId: Number,
+    // The request for health pills at a specific step (as opposed to all sampled health pills) may
+    // involve slow disk reads. Hence, we throttle to 1 of those requests every this many ms.
+    _healthPillStepRequestTimerDelay: {
+      type: Number,
+      value: 500,
+      readOnly: true,
+    },
+    runs: Array,
+  },
+  listeners: {
+    'node-toggle-expand': '_handleNodeToggleExpand',
+  },
+  observers: [
+    '_maybeFetchHealthPills(allStepsModeEnabled, specificHealthPillStep)',
+    '_maybeInitializeDashboard(backend, _isAttached)',
+  ],
+  attached: function() {
+    this.set('_isAttached', true);
+  },
+  detached: function() {
+    this.set('_isAttached', false);
+  },
+  reload: function() {
+    this._maybeFetchHealthPills();
+  },
+  _shouldRequestHealthPills: function() {
+    // Do not load debugger data if the feature is disabled, if the user toggled off the feature,
+    // or if the graph itself has not loaded yet. We need the graph to load so that we know which
+    // nodes to request health pills for.
+    return this.debuggerDataEnabled &&
+        this.healthPillsToggledOn &&
+        this._renderHierarchy &&
+        !this._datasetsEmpty(this._datasets);
+  },
+  _maybeInitializeDashboard: function(backend, isAttached) {
+    if (this._initialized || !backend || !isAttached) {
+      // Either this dashboard is already initialized ... or we are not yet ready to initialize.
+      return;
+    }
+    if (typeof ga !== 'undefined' && ga != null) {
+      ga('send', {hitType: 'pageview', page: '/v/graph'});
+    }
+    // Set this to true so we only initialize once.
+    this._initialized = true;
+    Promise.all([backend.graphRuns(), backend.runMetadataRuns()])
+      .then(function(result) {
+        var runsWithGraph = result[0].sort(VZ.Sorting.compareTagNames);
+        var runToMetadata = result[1];
+        var datasets = _.map(runsWithGraph, function(runName) {
+          return {
+            name: runName,
+            path: backend.router.graph(
+                runName, tf.graph.LIMIT_ATTR_SIZE, tf.graph.LARGE_ATTRS_KEY),
+            runMetadata: runToMetadata[runName] ? _.map(
+              runToMetadata[runName].sort(VZ.Sorting.compareTagNames), function(tag) {
+                return {
+                  tag: tag,
+                  path: backend.router.runMetadata(tag, runName)
+                };
+              }, this) : []
+          };
+        }, this);
+        this.set('_datasets', datasets);
+      }.bind(this));
+  },
+  _requestHealthPills: function() {
+    this.set('_areHealthPillsLoading', true);
+    const requestId = ++this._healthPillRequestId;
+
+    if (this._healthPillStepRequestTimerId !== null) {
+      // A request for health pills is already scheduled to be initiated. Clear it, and schedule a
+      // new request.
+      window.clearTimeout(this._healthPillStepRequestTimerId);
+      this._healthPillStepRequestTimerId = null;
+    }
+
+    if (this.allStepsModeEnabled) {
+      // This path may be slow. Schedule network requests to start some time later. If another
+      // request is scheduled in the mean time, drop this current request.
+      this._healthPillStepRequestTimerId = setTimeout(function() {
+        this._healthPillStepRequestTimerId = null;
+        this._initiateNetworkRequestForHealthPills(requestId);
+      }.bind(this), this._healthPillStepRequestTimerDelay);
+    } else {
+      // The user is fetching sampled steps. This path is fast, so no need to throttle. Directly
+      // fetch the health pills across the network.
+      this._initiateNetworkRequestForHealthPills(requestId);
+    }
+  },
+  // Initiates the network request for health pills. Do not directly call this method - network
+  // requests may be throttled. Instead, call _requestHealthPills, which uses this method.
+  _initiateNetworkRequestForHealthPills: function(requestId) {
+    if (this._healthPillRequestId !== requestId) {
+      // This possibly scheduled request was outdated before it was even sent across the network. Do
+      // not bother initiating it.
+      return;
+    }
+
+    const specificStep = this.allStepsModeEnabled ? this.specificHealthPillStep : undefined;
+    this.backend.healthPills(this._renderHierarchy.getNamesOfRenderedOps(), specificStep).then(
+        function(result) {
+      if (!this.healthPillsToggledOn) {
+        // The user has opted to hide health pills via the toggle button.
+        return;
+      }
+
+      if (requestId !== this._healthPillRequestId) {
+        // This response is no longer relevant.
+        return;
+      }
+
+      // Set the index for which step to show for the health pills. By default, show the last step.
+      // A precondition we assume (that Tensorboard's reservoir sampling guarantees) is that all
+      // node names should be mapped to the same number of steps.
+      for (let nodeName in result) {
+        this.set('_healthPillStepIndex', result[nodeName].length - 1);
+        break;
+      }
+
+      this.set('_nodeNamesToHealthPills', result);
+      this.set('_areHealthPillsLoading', false);
+      this.set('_healthPillStepRequestTimerId', null);
+    }.bind(this));
+  },
+  _datasetsEmpty: function(datasets) {
+    return !datasets || !datasets.length;
+  },
+  _renderHierarchyChanged: function(renderHierarchy) {
+    // Reload any data on the graph when the render hierarchy (which determines which nodes are
+    // rendered) changes.
+    this.reload();
+  },
+  _handleNodeToggleExpand: function() {
+    // Nodes were toggled. We may need to request health pills for more nodes.
+    this._maybeFetchHealthPills();
+  },
+  _healthPillsToggledOnChanged: function(healthPillsToggledOn) {
+    if (healthPillsToggledOn) {
+      // Load health pills.
+      this.reload();
+    } else {
+      // Remove all health pills by setting an empty mapping.
+      this.set('_nodeNamesToHealthPills', {});
+    }
+  },
+  // Fetch health pills for a specific step if applicable.
+  _maybeFetchHealthPills: function() {
+    if (!this._shouldRequestHealthPills()) {
+      return;
+    }
+
+    this._requestHealthPills();
+  },
+});
+})();
+</script>
diff --git a/tensorflow/tensorboard/components/tf_graph_info/BUILD b/tensorflow/tensorboard/components/tf_graph_info/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..cf40db7965b4a6685bfb48e8f5238b86e59d9229
--- /dev/null
+++ b/tensorflow/tensorboard/components/tf_graph_info/BUILD
@@ -0,0 +1,64 @@
+package(default_visibility = ["//tensorflow:internal"])
+
+load("@io_bazel_rules_closure//closure:defs.bzl", "web_library")
+load("//tensorflow/tensorboard:defs.bzl", "tensorboard_ts_library")
+load("//tensorflow/tensorboard:defs.bzl", "tensorboard_webcomponent_library")
+
+licenses(["notice"])  # Apache 2.0
+
+web_library(
+    name = "tf_graph_info",
+    srcs = [
+        "tf-graph-icon.html",
+        "tf-graph-info.html",
+        "tf-node-info.html",
+        "tf-node-list-item.html",
+    ],
+    path = "/tf-graph-info",
+    suppress = [
+        "strictDependencies",
+        "superfluousSuppress",
+    ],
+    deps = [
+        "//tensorflow/tensorboard/components/tf_dashboard_common",
+        "//tensorflow/tensorboard/components/tf_graph_common",
+        "@org_polymer",
+        "@org_polymer_iron_collapse",
+        "@org_polymer_iron_list",
+        "@org_polymer_paper_icon_button",
+        "@org_polymer_paper_item",
+        "@org_polymer_paper_slider",
+        "@org_polymer_paper_spinner",
+    ],
+)
+
+filegroup(
+    name = "all_files",
+    srcs = glob(["**"]),
+    tags = ["notsan"],
+)
+
+################################################################################
+# MARKED FOR DELETION
+
+tensorboard_webcomponent_library(
+    name = "legacy",
+    srcs = [
+        "tf-graph-icon.html",
+        "tf-graph-info.html",
+        "tf-node-info.html",
+        "tf-node-list-item.html",
+    ],
+    destdir = "tf-graph-info",
+    deps = [
+        "//tensorflow/tensorboard/components/tf_dashboard_common:legacy",
+        "//tensorflow/tensorboard/components/tf_graph_common:legacy",
+    ],
+)
+
+# This is needed despite how this component lacks TypeScript files because
+# components/BUILD seeks a legacy_ts rule in this package.
+tensorboard_ts_library(
+    name = "legacy_ts",
+    srcs = [],
+)
diff --git a/tensorflow/tensorboard/components/tf_graph_info/demo/BUILD b/tensorflow/tensorboard/components/tf_graph_info/demo/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..a7d59418fd0ff71149a8bdbf04dac3e5d418c154
--- /dev/null
+++ b/tensorflow/tensorboard/components/tf_graph_info/demo/BUILD
@@ -0,0 +1,26 @@
+package(default_visibility = ["//tensorflow:internal"])
+
+load("@io_bazel_rules_closure//closure:defs.bzl", "web_library")
+
+licenses(["notice"])  # Apache 2.0
+
+# bazel run //third_party/tensorflow/tensorboard/components/tf_graph_info/demo
+web_library(
+    name = "demo",
+    srcs = ["index.html"] + glob(["data/**"]),
+    path = "/tf-graph-info/demo",
+    deps = [
+        "//tensorflow/tensorboard/components/tf_graph_common",
+        "//tensorflow/tensorboard/components/tf_graph_info",
+        "//tensorflow/tensorboard/components/tf_graph_loader",
+        "@org_polymer_iron_demo_helpers",
+        "@org_polymer_paper_styles",
+        "@org_polymer_webcomponentsjs",
+    ],
+)
+
+filegroup(
+    name = "all_files",
+    srcs = glob(["**"]),
+    tags = ["notsan"],
+)
diff --git a/tensorflow/tensorboard/components/tf_graph_info/demo/data/graph.pbtxt b/tensorflow/tensorboard/components/tf_graph_info/demo/data/graph.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..30b206453469801d31b46856c29cdda78164f18f
--- /dev/null
+++ b/tensorflow/tensorboard/components/tf_graph_info/demo/data/graph.pbtxt
@@ -0,0 +1,4606 @@
+node {
+  name: "GradientDescent/learning_rate"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_3"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: 0.1
+      }
+    }
+  }
+}
+node {
+  name: "gradients/add_grad/Shape_1"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 100
+      }
+    }
+  }
+}
+node {
+  name: "gradients/add_grad/Shape"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 2
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\310\000\000\000d\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "gradients/add_grad/BroadcastGradientArgs"
+  op: "BroadcastGradientArgs"
+  input: "gradients/add_grad/Shape"
+  input: "gradients/add_grad/Shape_1"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+        }
+        shape {
+          dim {
+            size: -1
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "gradients/add_1_grad/Shape_1"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 10
+      }
+    }
+  }
+}
+node {
+  name: "gradients/add_1_grad/Shape"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 2
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\310\000\000\000\n\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "gradients/add_1_grad/BroadcastGradientArgs"
+  op: "BroadcastGradientArgs"
+  input: "gradients/add_1_grad/Shape"
+  input: "gradients/add_1_grad/Shape_1"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+        }
+        shape {
+          dim {
+            size: -1
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "gradients/Reshape_1_grad/Shape"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 2
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\310\000\000\000\n\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "gradients/SoftmaxCrossEntropyWithLogits_grad/ExpandDims/dim"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: -1
+      }
+    }
+  }
+}
+node {
+  name: "gradients/Reshape_3_grad/Shape"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 200
+      }
+    }
+  }
+}
+node {
+  name: "gradients/Mean_grad/Maximum/y"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "gradients/Mean_grad/Const_1"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "gradients/Mean_grad/Const"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "gradients/Mean_grad/Shape_1"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "gradients/Mean_grad/Prod_1"
+  op: "Prod"
+  input: "gradients/Mean_grad/Shape_1"
+  input: "gradients/Mean_grad/Const_1"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+  attr {
+    key: "keep_dims"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "gradients/Mean_grad/Maximum"
+  op: "Maximum"
+  input: "gradients/Mean_grad/Prod_1"
+  input: "gradients/Mean_grad/Maximum/y"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+}
+node {
+  name: "gradients/Mean_grad/Shape"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 200
+      }
+    }
+  }
+}
+node {
+  name: "gradients/Mean_grad/Prod"
+  op: "Prod"
+  input: "gradients/Mean_grad/Shape"
+  input: "gradients/Mean_grad/Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+  attr {
+    key: "keep_dims"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "gradients/Mean_grad/floordiv"
+  op: "FloorDiv"
+  input: "gradients/Mean_grad/Prod"
+  input: "gradients/Mean_grad/Maximum"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+}
+node {
+  name: "gradients/Mean_grad/Cast"
+  op: "Cast"
+  input: "gradients/Mean_grad/floordiv"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "DstT"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "SrcT"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+}
+node {
+  name: "gradients/Mean_grad/Tile/multiples"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 200
+      }
+    }
+  }
+}
+node {
+  name: "gradients/Mean_grad/Reshape/shape"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "gradients/Const"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "gradients/Shape"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "gradients/Fill"
+  op: "Fill"
+  input: "gradients/Shape"
+  input: "gradients/Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+}
+node {
+  name: "gradients/Mean_grad/Reshape"
+  op: "Reshape"
+  input: "gradients/Fill"
+  input: "gradients/Mean_grad/Reshape/shape"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "gradients/Mean_grad/Tile"
+  op: "Tile"
+  input: "gradients/Mean_grad/Reshape"
+  input: "gradients/Mean_grad/Tile/multiples"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tmultiples"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "gradients/Mean_grad/truediv"
+  op: "RealDiv"
+  input: "gradients/Mean_grad/Tile"
+  input: "gradients/Mean_grad/Cast"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "gradients/Reshape_3_grad/Reshape"
+  op: "Reshape"
+  input: "gradients/Mean_grad/truediv"
+  input: "gradients/Reshape_3_grad/Shape"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "gradients/SoftmaxCrossEntropyWithLogits_grad/ExpandDims"
+  op: "ExpandDims"
+  input: "gradients/Reshape_3_grad/Reshape"
+  input: "gradients/SoftmaxCrossEntropyWithLogits_grad/ExpandDims/dim"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "Const"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "Slice_2/begin"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "Sub_2/y"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "concat_1/axis"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "concat_1/values_0"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: -1
+      }
+    }
+  }
+}
+node {
+  name: "Slice_1/size"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "Sub_1/y"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "Shape_2"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 2
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\310\000\000\000\n\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "Rank_2"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 2
+      }
+    }
+  }
+}
+node {
+  name: "Sub_1"
+  op: "Sub"
+  input: "Rank_2"
+  input: "Sub_1/y"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+}
+node {
+  name: "Slice_1/begin"
+  op: "Pack"
+  input: "Sub_1"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "N"
+    value {
+      i: 1
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "Slice_1"
+  op: "Slice"
+  input: "Shape_2"
+  input: "Slice_1/begin"
+  input: "Slice_1/size"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "concat_1"
+  op: "ConcatV2"
+  input: "concat_1/values_0"
+  input: "Slice_1"
+  input: "concat_1/axis"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 2
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "concat/axis"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "concat/values_0"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: -1
+      }
+    }
+  }
+}
+node {
+  name: "Slice/size"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "Sub/y"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "Shape_1"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 2
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\310\000\000\000\n\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "Rank_1"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 2
+      }
+    }
+  }
+}
+node {
+  name: "Sub"
+  op: "Sub"
+  input: "Rank_1"
+  input: "Sub/y"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+}
+node {
+  name: "Slice/begin"
+  op: "Pack"
+  input: "Sub"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "N"
+    value {
+      i: 1
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "Slice"
+  op: "Slice"
+  input: "Shape_1"
+  input: "Slice/begin"
+  input: "Slice/size"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "concat"
+  op: "ConcatV2"
+  input: "concat/values_0"
+  input: "Slice"
+  input: "concat/axis"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 2
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "Shape"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 2
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\310\000\000\000\n\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "Rank"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 2
+      }
+    }
+  }
+}
+node {
+  name: "Sub_2"
+  op: "Sub"
+  input: "Rank"
+  input: "Sub_2/y"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+}
+node {
+  name: "Slice_2/size"
+  op: "Pack"
+  input: "Sub_2"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "N"
+    value {
+      i: 1
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "Slice_2"
+  op: "Slice"
+  input: "Shape"
+  input: "Slice_2/begin"
+  input: "Slice_2/size"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "logits_biases"
+  op: "VariableV2"
+  device: "/job:localhost/replica:0/task:0/cpu:0"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@logits_biases"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 10
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "container"
+    value {
+      s: ""
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "shape"
+    value {
+      shape {
+        dim {
+          size: 10
+        }
+      }
+    }
+  }
+  attr {
+    key: "shared_name"
+    value {
+      s: ""
+    }
+  }
+}
+node {
+  name: "logits_biases/read"
+  op: "Identity"
+  input: "logits_biases"
+  device: "/job:localhost/replica:0/task:0/cpu:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@logits_biases"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 10
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "logits_weights"
+  op: "VariableV2"
+  device: "/job:localhost/replica:0/task:0/cpu:0"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@logits_weights"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 100
+          }
+          dim {
+            size: 10
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "container"
+    value {
+      s: ""
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "shape"
+    value {
+      shape {
+        dim {
+          size: 100
+        }
+        dim {
+          size: 10
+        }
+      }
+    }
+  }
+  attr {
+    key: "shared_name"
+    value {
+      s: ""
+    }
+  }
+}
+node {
+  name: "logits_weights/read"
+  op: "Identity"
+  input: "logits_weights"
+  device: "/job:localhost/replica:0/task:0/cpu:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@logits_weights"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 100
+          }
+          dim {
+            size: 10
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "hidden_biases"
+  op: "VariableV2"
+  device: "/job:localhost/replica:0/task:0/cpu:0"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@hidden_biases"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 100
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "container"
+    value {
+      s: ""
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "shape"
+    value {
+      shape {
+        dim {
+          size: 100
+        }
+      }
+    }
+  }
+  attr {
+    key: "shared_name"
+    value {
+      s: ""
+    }
+  }
+}
+node {
+  name: "hidden_biases/read"
+  op: "Identity"
+  input: "hidden_biases"
+  device: "/job:localhost/replica:0/task:0/cpu:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@hidden_biases"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 100
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "hidden_weights"
+  op: "VariableV2"
+  device: "/job:localhost/replica:0/task:0/cpu:0"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@hidden_weights"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 784
+          }
+          dim {
+            size: 100
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "container"
+    value {
+      s: ""
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "shape"
+    value {
+      shape {
+        dim {
+          size: 784
+        }
+        dim {
+          size: 100
+        }
+      }
+    }
+  }
+  attr {
+    key: "shared_name"
+    value {
+      s: ""
+    }
+  }
+}
+node {
+  name: "hidden_weights/read"
+  op: "Identity"
+  input: "hidden_weights"
+  device: "/job:localhost/replica:0/task:0/cpu:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@hidden_weights"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 784
+          }
+          dim {
+            size: 100
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "Reshape/shape"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/cpu:0"
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 2
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\310\000\000\000\377\377\377\377"
+      }
+    }
+  }
+}
+node {
+  name: "mnist_dataset_train_2/one_hot/depth"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/cpu:0"
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 10
+      }
+    }
+  }
+}
+node {
+  name: "mnist_dataset_train_2/one_hot/off_value"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/cpu:0"
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "mnist_dataset_train_2/one_hot/on_value"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/cpu:0"
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "mnist_dataset_train_2/random_shuffle_queue_DequeueMany/n"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/cpu:0"
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 200
+      }
+    }
+  }
+}
+node {
+  name: "mnist_dataset_train_1/random_shuffle_queue"
+  op: "RandomShuffleQueueV2"
+  device: "/job:localhost/replica:0/task:0/cpu:0"
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+  attr {
+    key: "capacity"
+    value {
+      i: 20000
+    }
+  }
+  attr {
+    key: "component_types"
+    value {
+      list {
+        type: DT_FLOAT
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    key: "container"
+    value {
+      s: ""
+    }
+  }
+  attr {
+    key: "min_after_dequeue"
+    value {
+      i: 4000
+    }
+  }
+  attr {
+    key: "seed"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "seed2"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 28
+          }
+          dim {
+            size: 28
+          }
+          dim {
+            size: 1
+          }
+        }
+        shape {
+        }
+      }
+    }
+  }
+  attr {
+    key: "shared_name"
+    value {
+      s: ""
+    }
+  }
+}
+node {
+  name: "mnist_dataset_train_2/random_shuffle_queue_DequeueMany"
+  op: "QueueDequeueManyV2"
+  input: "mnist_dataset_train_1/random_shuffle_queue"
+  input: "mnist_dataset_train_2/random_shuffle_queue_DequeueMany/n"
+  device: "/job:localhost/replica:0/task:0/cpu:0"
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          unknown_rank: true
+        }
+        shape {
+          unknown_rank: true
+        }
+      }
+    }
+  }
+  attr {
+    key: "component_types"
+    value {
+      list {
+        type: DT_FLOAT
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    key: "timeout_ms"
+    value {
+      i: -1
+    }
+  }
+}
+node {
+  name: "Reshape"
+  op: "Reshape"
+  input: "mnist_dataset_train_2/random_shuffle_queue_DequeueMany"
+  input: "Reshape/shape"
+  device: "/job:localhost/replica:0/task:0/cpu:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+          dim {
+            size: -1
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "MatMul"
+  op: "MatMul"
+  input: "Reshape"
+  input: "hidden_weights/read"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+          dim {
+            size: 100
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "transpose_a"
+    value {
+      b: false
+    }
+  }
+  attr {
+    key: "transpose_b"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "add"
+  op: "Add"
+  input: "MatMul"
+  input: "hidden_biases/read"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+          dim {
+            size: 100
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "Relu"
+  op: "Relu"
+  input: "add"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+          dim {
+            size: 100
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "MatMul_1"
+  op: "MatMul"
+  input: "Relu"
+  input: "logits_weights/read"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+          dim {
+            size: 10
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "transpose_a"
+    value {
+      b: false
+    }
+  }
+  attr {
+    key: "transpose_b"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "add_1"
+  op: "Add"
+  input: "MatMul_1"
+  input: "logits_biases/read"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+          dim {
+            size: 10
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "Reshape_1"
+  op: "Reshape"
+  input: "add_1"
+  input: "concat"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+          dim {
+            size: 10
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "mnist_dataset_train_2/one_hot"
+  op: "OneHot"
+  input: "mnist_dataset_train_2/random_shuffle_queue_DequeueMany:1"
+  input: "mnist_dataset_train_2/one_hot/depth"
+  input: "mnist_dataset_train_2/one_hot/on_value"
+  input: "mnist_dataset_train_2/one_hot/off_value"
+  device: "/job:localhost/replica:0/task:0/cpu:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "TI"
+    value {
+      type: DT_INT64
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          unknown_rank: true
+        }
+      }
+    }
+  }
+  attr {
+    key: "axis"
+    value {
+      i: -1
+    }
+  }
+}
+node {
+  name: "Reshape_2"
+  op: "Reshape"
+  input: "mnist_dataset_train_2/one_hot"
+  input: "concat_1"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 10
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "SoftmaxCrossEntropyWithLogits"
+  op: "SoftmaxCrossEntropyWithLogits"
+  input: "Reshape_1"
+  input: "Reshape_2"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+        }
+        shape {
+          dim {
+            size: 200
+          }
+          dim {
+            size: 10
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "gradients/SoftmaxCrossEntropyWithLogits_grad/PreventGradient"
+  op: "PreventGradient"
+  input: "SoftmaxCrossEntropyWithLogits:1"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+          dim {
+            size: 10
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "message"
+    value {
+      s: "Currently there is no way to take the second derivative of softmax_cross_entropy_with_logits due to the fused  implementation\'s interaction with tf.gradients()"
+    }
+  }
+}
+node {
+  name: "gradients/SoftmaxCrossEntropyWithLogits_grad/mul"
+  op: "Mul"
+  input: "gradients/SoftmaxCrossEntropyWithLogits_grad/ExpandDims"
+  input: "gradients/SoftmaxCrossEntropyWithLogits_grad/PreventGradient"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+          dim {
+            size: 10
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "gradients/Reshape_1_grad/Reshape"
+  op: "Reshape"
+  input: "gradients/SoftmaxCrossEntropyWithLogits_grad/mul"
+  input: "gradients/Reshape_1_grad/Shape"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+          dim {
+            size: 10
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "gradients/add_1_grad/Sum_1"
+  op: "Sum"
+  input: "gradients/Reshape_1_grad/Reshape"
+  input: "gradients/add_1_grad/BroadcastGradientArgs:1"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 10
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "keep_dims"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "gradients/add_1_grad/Reshape_1"
+  op: "Reshape"
+  input: "gradients/add_1_grad/Sum_1"
+  input: "gradients/add_1_grad/Shape_1"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 10
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "gradients/add_1_grad/Sum"
+  op: "Sum"
+  input: "gradients/Reshape_1_grad/Reshape"
+  input: "gradients/add_1_grad/BroadcastGradientArgs"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+          dim {
+            size: 10
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "keep_dims"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "gradients/add_1_grad/Reshape"
+  op: "Reshape"
+  input: "gradients/add_1_grad/Sum"
+  input: "gradients/add_1_grad/Shape"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+          dim {
+            size: 10
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "gradients/add_1_grad/tuple/group_deps"
+  op: "NoOp"
+  input: "^gradients/add_1_grad/Reshape"
+  input: "^gradients/add_1_grad/Reshape_1"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+      }
+    }
+  }
+}
+node {
+  name: "gradients/add_1_grad/tuple/control_dependency_1"
+  op: "Identity"
+  input: "gradients/add_1_grad/Reshape_1"
+  input: "^gradients/add_1_grad/tuple/group_deps"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@gradients/add_1_grad/Reshape_1"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 10
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "GradientDescent/update_logits_biases/ApplyGradientDescent"
+  op: "ApplyGradientDescent"
+  input: "logits_biases"
+  input: "GradientDescent/learning_rate"
+  input: "gradients/add_1_grad/tuple/control_dependency_1"
+  device: "/job:localhost/replica:0/task:0/cpu:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@logits_biases"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 10
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "use_locking"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "gradients/add_1_grad/tuple/control_dependency"
+  op: "Identity"
+  input: "gradients/add_1_grad/Reshape"
+  input: "^gradients/add_1_grad/tuple/group_deps"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@gradients/add_1_grad/Reshape"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+          dim {
+            size: 10
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "gradients/MatMul_1_grad/MatMul_1"
+  op: "MatMul"
+  input: "Relu"
+  input: "gradients/add_1_grad/tuple/control_dependency"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 100
+          }
+          dim {
+            size: 10
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "transpose_a"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "transpose_b"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "gradients/MatMul_1_grad/MatMul"
+  op: "MatMul"
+  input: "gradients/add_1_grad/tuple/control_dependency"
+  input: "logits_weights/read"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+          dim {
+            size: 100
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "transpose_a"
+    value {
+      b: false
+    }
+  }
+  attr {
+    key: "transpose_b"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "gradients/MatMul_1_grad/tuple/group_deps"
+  op: "NoOp"
+  input: "^gradients/MatMul_1_grad/MatMul"
+  input: "^gradients/MatMul_1_grad/MatMul_1"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+      }
+    }
+  }
+}
+node {
+  name: "gradients/MatMul_1_grad/tuple/control_dependency_1"
+  op: "Identity"
+  input: "gradients/MatMul_1_grad/MatMul_1"
+  input: "^gradients/MatMul_1_grad/tuple/group_deps"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@gradients/MatMul_1_grad/MatMul_1"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 100
+          }
+          dim {
+            size: 10
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "GradientDescent/update_logits_weights/ApplyGradientDescent"
+  op: "ApplyGradientDescent"
+  input: "logits_weights"
+  input: "GradientDescent/learning_rate"
+  input: "gradients/MatMul_1_grad/tuple/control_dependency_1"
+  device: "/job:localhost/replica:0/task:0/cpu:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@logits_weights"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 100
+          }
+          dim {
+            size: 10
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "use_locking"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "gradients/MatMul_1_grad/tuple/control_dependency"
+  op: "Identity"
+  input: "gradients/MatMul_1_grad/MatMul"
+  input: "^gradients/MatMul_1_grad/tuple/group_deps"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@gradients/MatMul_1_grad/MatMul"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+          dim {
+            size: 100
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "gradients/Relu_grad/ReluGrad"
+  op: "ReluGrad"
+  input: "gradients/MatMul_1_grad/tuple/control_dependency"
+  input: "Relu"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+          dim {
+            size: 100
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "gradients/add_grad/Sum_1"
+  op: "Sum"
+  input: "gradients/Relu_grad/ReluGrad"
+  input: "gradients/add_grad/BroadcastGradientArgs:1"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 100
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "keep_dims"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "gradients/add_grad/Reshape_1"
+  op: "Reshape"
+  input: "gradients/add_grad/Sum_1"
+  input: "gradients/add_grad/Shape_1"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 100
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "gradients/add_grad/Sum"
+  op: "Sum"
+  input: "gradients/Relu_grad/ReluGrad"
+  input: "gradients/add_grad/BroadcastGradientArgs"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+          dim {
+            size: 100
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "keep_dims"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "gradients/add_grad/Reshape"
+  op: "Reshape"
+  input: "gradients/add_grad/Sum"
+  input: "gradients/add_grad/Shape"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+          dim {
+            size: 100
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "gradients/add_grad/tuple/group_deps"
+  op: "NoOp"
+  input: "^gradients/add_grad/Reshape"
+  input: "^gradients/add_grad/Reshape_1"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+      }
+    }
+  }
+}
+node {
+  name: "gradients/add_grad/tuple/control_dependency_1"
+  op: "Identity"
+  input: "gradients/add_grad/Reshape_1"
+  input: "^gradients/add_grad/tuple/group_deps"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@gradients/add_grad/Reshape_1"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 100
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "GradientDescent/update_hidden_biases/ApplyGradientDescent"
+  op: "ApplyGradientDescent"
+  input: "hidden_biases"
+  input: "GradientDescent/learning_rate"
+  input: "gradients/add_grad/tuple/control_dependency_1"
+  device: "/job:localhost/replica:0/task:0/cpu:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@hidden_biases"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 100
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "use_locking"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "gradients/add_grad/tuple/control_dependency"
+  op: "Identity"
+  input: "gradients/add_grad/Reshape"
+  input: "^gradients/add_grad/tuple/group_deps"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@gradients/add_grad/Reshape"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+          dim {
+            size: 100
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "gradients/MatMul_grad/MatMul_1"
+  op: "MatMul"
+  input: "Reshape"
+  input: "gradients/add_grad/tuple/control_dependency"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 100
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "transpose_a"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "transpose_b"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "gradients/MatMul_grad/MatMul"
+  op: "MatMul"
+  input: "gradients/add_grad/tuple/control_dependency"
+  input: "hidden_weights/read"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+          dim {
+            size: 784
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "transpose_a"
+    value {
+      b: false
+    }
+  }
+  attr {
+    key: "transpose_b"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "gradients/MatMul_grad/tuple/group_deps"
+  op: "NoOp"
+  input: "^gradients/MatMul_grad/MatMul"
+  input: "^gradients/MatMul_grad/MatMul_1"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+      }
+    }
+  }
+}
+node {
+  name: "gradients/MatMul_grad/tuple/control_dependency_1"
+  op: "Identity"
+  input: "gradients/MatMul_grad/MatMul_1"
+  input: "^gradients/MatMul_grad/tuple/group_deps"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@gradients/MatMul_grad/MatMul_1"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 100
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "GradientDescent/update_hidden_weights/ApplyGradientDescent"
+  op: "ApplyGradientDescent"
+  input: "hidden_weights"
+  input: "GradientDescent/learning_rate"
+  input: "gradients/MatMul_grad/tuple/control_dependency_1"
+  device: "/job:localhost/replica:0/task:0/cpu:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@hidden_weights"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 784
+          }
+          dim {
+            size: 100
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "use_locking"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "GradientDescent"
+  op: "NoOp"
+  input: "^GradientDescent/update_hidden_weights/ApplyGradientDescent"
+  input: "^GradientDescent/update_hidden_biases/ApplyGradientDescent"
+  input: "^GradientDescent/update_logits_weights/ApplyGradientDescent"
+  input: "^GradientDescent/update_logits_biases/ApplyGradientDescent"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_2"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+      }
+    }
+  }
+}
+node {
+  name: "Reshape_3"
+  op: "Reshape"
+  input: "SoftmaxCrossEntropyWithLogits"
+  input: "Slice_2"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "Mean"
+  op: "Mean"
+  input: "Reshape_3"
+  input: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+  attr {
+    key: "keep_dims"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "_send_Mean_0"
+  op: "_Send"
+  input: "Mean"
+  device: "/job:localhost/replica:0/task:0/cpu:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "client_terminated"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "recv_device"
+    value {
+      s: "/job:localhost/replica:0/task:0/cpu:0"
+    }
+  }
+  attr {
+    key: "send_device"
+    value {
+      s: "/job:localhost/replica:0/task:0/cpu:0"
+    }
+  }
+  attr {
+    key: "send_device_incarnation"
+    value {
+      i: -5924635994370253548
+    }
+  }
+  attr {
+    key: "tensor_name"
+    value {
+      s: "Mean:0"
+    }
+  }
+}
+library {
+}
+versions {
+  producer: 21
+}
diff --git a/tensorflow/tensorboard/components/tf_graph_info/demo/index.html b/tensorflow/tensorboard/components/tf_graph_info/demo/index.html
new file mode 100644
index 0000000000000000000000000000000000000000..f7d2ef7ee5e56a870b1b49cfff3dd416953f3fa3
--- /dev/null
+++ b/tensorflow/tensorboard/components/tf_graph_info/demo/index.html
@@ -0,0 +1,94 @@
+<!doctype html>
+<!--
+@license
+Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+-->
+<script src="../../webcomponentsjs/webcomponents-lite.min.js"></script>
+<link rel="import" href="../tf-graph-info.html">
+<link rel="import" href="../../tf-graph-common/tf-graph-common.html">
+<link rel="import" href="../../tf-graph-loader/tf-graph-loader.html">
+<link rel="import" href="../../iron-demo-helpers/demo-snippet.html">
+<title>TF Graph Info Demo</title>
+<style>
+  #demo-container {
+    border: 2px solid #808080;
+    width: 1000px;
+    height: 600px;
+  }
+</style>
+<demo-snippet>
+  <template>
+    <dom-module id="tf-graph-info-demo">
+      <template>
+        <!-- We first use the graph loader to load and parse a pbtxt file into a graph object. -->
+        <tf-graph-loader
+            id="loader"
+            datasets="[[_datasets]]"
+            selected-dataset="[[_selectedDataset]]"
+            out-graph="{{_graph}}">
+        </tf-graph-loader>
+
+        <tf-graph-info id="info" title="selected"></tf-graph-info>
+      </template>
+      <script>
+        "use strict";
+
+        Polymer({
+          is: "tf-graph-info-demo",
+          properties: {
+            // We tell the graph loader to load a specific pbtxt file.
+            _datasets: {
+              type: Array,
+              value: [{
+                "name": "Graph with XLA Clusters Specified",
+                "path": "data/graph.pbtxt"
+              }],
+            },
+            _selectedDataset: {
+              type: Number,
+              value: 0,
+            },
+
+            // This property will be updated by the graph loader.
+            _graph: {
+              type: Object,
+            },
+          },
+          observers: [
+            '_graphUpdated(_graph)',
+          ],
+          _graphUpdated: function(slimGraph) {
+            const tracker = tf.graph.util.getTracker(this.$.loader);
+            const hierarchyTracker = tf.graph.util.getSubtaskTracker(
+                tracker, 100, 'Namespace hierarchy');
+            const hierarchyOptions = {};
+            tf.graph.hierarchy.build(slimGraph, hierarchyOptions, hierarchyTracker).then(
+                function(graphHierarchy) {
+              // We have parsed and built the graph object from a pbtxt file. Show info.
+              this.$.info.set('graph', slimGraph);
+              this.$.info.set('graphHierarchy', graphHierarchy);
+
+              // Select a node within that graph.
+              this.$.info.set('selectedNode', 'GradientDescent/learning_rate');
+            }.bind(this));
+          },
+        });
+      </script>
+    </dom-module>
+    <div id='demo-container'>
+      <tf-graph-info-demo></tf-graph-info-demo>
+    </div>
+  </template>
+</demo-snippet>
diff --git a/tensorflow/tensorboard/components/tf_graph/tf-graph-icon.html b/tensorflow/tensorboard/components/tf_graph_info/tf-graph-icon.html
similarity index 100%
rename from tensorflow/tensorboard/components/tf_graph/tf-graph-icon.html
rename to tensorflow/tensorboard/components/tf_graph_info/tf-graph-icon.html
diff --git a/tensorflow/tensorboard/components/tf_graph_info/tf-graph-info.html b/tensorflow/tensorboard/components/tf_graph_info/tf-graph-info.html
index 45347fb1de5e91ca8e67e50013588d9c0e91f160..b33e1e00d04e4836322ce0975847aa88f6b0a5d2 100644
--- a/tensorflow/tensorboard/components/tf_graph_info/tf-graph-info.html
+++ b/tensorflow/tensorboard/components/tf_graph_info/tf-graph-info.html
@@ -169,6 +169,8 @@ h2 {
 </template>
 </template>
 <script>
+"use strict";
+
 (function() {
   Polymer({
     is: 'tf-graph-info',
diff --git a/tensorflow/tensorboard/components/tf_graph_info/tf-node-info.html b/tensorflow/tensorboard/components/tf_graph_info/tf-node-info.html
index c3e470b123dbe19dcbea366bc6d362affa6a46ac..f1455acaee2b9f9cc7c5ef30c0036b3301f378e3 100644
--- a/tensorflow/tensorboard/components/tf_graph_info/tf-node-info.html
+++ b/tensorflow/tensorboard/components/tf_graph_info/tf-node-info.html
@@ -19,9 +19,10 @@ limitations under the License.
 <link rel="import" href="../iron-list/iron-list.html">
 <link rel="import" href="../polymer/polymer.html">
 <link rel="import" href="../paper-icon-button/paper-icon-button.html">
-<link rel="import" href="../paper-item/all-imports.html">
+<link rel="import" href="../paper-item/paper-item.html">
+<link rel="import" href="../paper-item/paper-item-body.html">
 <link rel="import" href="../tf-graph-common/tf-graph-common.html">
-<link rel="import" href="../tf-graph/tf-graph-icon.html">
+<link rel="import" href="tf-graph-icon.html">
 <link rel="import" href="tf-node-list-item.html">
 
 <dom-module id="tf-node-info">
diff --git a/tensorflow/tensorboard/components/tf_graph_info/tf-node-list-item.html b/tensorflow/tensorboard/components/tf_graph_info/tf-node-list-item.html
index 9e9bface5de22709406921ac4008c3781d7e81cd..c15478d126ccbb055a7bbb46f3a29c897321a648 100644
--- a/tensorflow/tensorboard/components/tf_graph_info/tf-node-list-item.html
+++ b/tensorflow/tensorboard/components/tf_graph_info/tf-node-list-item.html
@@ -16,8 +16,8 @@ limitations under the License.
 -->
 
 <link rel="import" href="../polymer/polymer.html">
-<link rel="import" href="../tf-graph/tf-graph-icon.html">
 <link rel="import" href="../tf-dashboard-common/tensorboard-color.html">
+<link rel="import" href="tf-graph-icon.html">
 
 <dom-module id="tf-node-list-item">
   <style>
diff --git a/tensorflow/tensorboard/components/tf_graph_info_d3v4/BUILD b/tensorflow/tensorboard/components/tf_graph_info_d3v4/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..f84726e7c7c65905824bc5a49cad8e2c27fd70cb
--- /dev/null
+++ b/tensorflow/tensorboard/components/tf_graph_info_d3v4/BUILD
@@ -0,0 +1,35 @@
+package(default_visibility = ["//tensorflow:internal"])
+
+load("@io_bazel_rules_closure//closure:defs.bzl", "web_library")
+load("//tensorflow/tensorboard:defs.bzl", "tensorboard_ts_library")
+load("//tensorflow/tensorboard:defs.bzl", "tensorboard_webcomponent_library")
+
+licenses(["notice"])  # Apache 2.0
+
+web_library(
+    name = "tf_graph_info_d3v4",
+    srcs = [
+        "tf-graph-icon.html",
+        "tf-graph-info.html",
+        "tf-node-info.html",
+        "tf-node-list-item.html",
+    ],
+    path = "/tf-graph-info",
+    deps = [
+        "//tensorflow/tensorboard/components/tf_dashboard_common_d3v4",
+        "//tensorflow/tensorboard/components/tf_graph_common_d3v4",
+        "@org_polymer",
+        "@org_polymer_iron_collapse",
+        "@org_polymer_iron_list",
+        "@org_polymer_paper_icon_button",
+        "@org_polymer_paper_item",
+        "@org_polymer_paper_slider",
+        "@org_polymer_paper_spinner",
+    ],
+)
+
+filegroup(
+    name = "all_files",
+    srcs = glob(["**"]),
+    tags = ["notsan"],
+)
diff --git a/tensorflow/tensorboard/components/tf_graph_info_d3v4/demo/BUILD b/tensorflow/tensorboard/components/tf_graph_info_d3v4/demo/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..a7d59418fd0ff71149a8bdbf04dac3e5d418c154
--- /dev/null
+++ b/tensorflow/tensorboard/components/tf_graph_info_d3v4/demo/BUILD
@@ -0,0 +1,26 @@
+package(default_visibility = ["//tensorflow:internal"])
+
+load("@io_bazel_rules_closure//closure:defs.bzl", "web_library")
+
+licenses(["notice"])  # Apache 2.0
+
+# bazel run //third_party/tensorflow/tensorboard/components/tf_graph_info/demo
+web_library(
+    name = "demo",
+    srcs = ["index.html"] + glob(["data/**"]),
+    path = "/tf-graph-info/demo",
+    deps = [
+        "//tensorflow/tensorboard/components/tf_graph_common",
+        "//tensorflow/tensorboard/components/tf_graph_info",
+        "//tensorflow/tensorboard/components/tf_graph_loader",
+        "@org_polymer_iron_demo_helpers",
+        "@org_polymer_paper_styles",
+        "@org_polymer_webcomponentsjs",
+    ],
+)
+
+filegroup(
+    name = "all_files",
+    srcs = glob(["**"]),
+    tags = ["notsan"],
+)
diff --git a/tensorflow/tensorboard/components/tf_graph_info_d3v4/demo/data/graph.pbtxt b/tensorflow/tensorboard/components/tf_graph_info_d3v4/demo/data/graph.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..30b206453469801d31b46856c29cdda78164f18f
--- /dev/null
+++ b/tensorflow/tensorboard/components/tf_graph_info_d3v4/demo/data/graph.pbtxt
@@ -0,0 +1,4606 @@
+node {
+  name: "GradientDescent/learning_rate"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_3"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: 0.1
+      }
+    }
+  }
+}
+node {
+  name: "gradients/add_grad/Shape_1"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 100
+      }
+    }
+  }
+}
+node {
+  name: "gradients/add_grad/Shape"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 2
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\310\000\000\000d\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "gradients/add_grad/BroadcastGradientArgs"
+  op: "BroadcastGradientArgs"
+  input: "gradients/add_grad/Shape"
+  input: "gradients/add_grad/Shape_1"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+        }
+        shape {
+          dim {
+            size: -1
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "gradients/add_1_grad/Shape_1"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 10
+      }
+    }
+  }
+}
+node {
+  name: "gradients/add_1_grad/Shape"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 2
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\310\000\000\000\n\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "gradients/add_1_grad/BroadcastGradientArgs"
+  op: "BroadcastGradientArgs"
+  input: "gradients/add_1_grad/Shape"
+  input: "gradients/add_1_grad/Shape_1"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+        }
+        shape {
+          dim {
+            size: -1
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "gradients/Reshape_1_grad/Shape"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 2
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\310\000\000\000\n\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "gradients/SoftmaxCrossEntropyWithLogits_grad/ExpandDims/dim"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: -1
+      }
+    }
+  }
+}
+node {
+  name: "gradients/Reshape_3_grad/Shape"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 200
+      }
+    }
+  }
+}
+node {
+  name: "gradients/Mean_grad/Maximum/y"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "gradients/Mean_grad/Const_1"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "gradients/Mean_grad/Const"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "gradients/Mean_grad/Shape_1"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "gradients/Mean_grad/Prod_1"
+  op: "Prod"
+  input: "gradients/Mean_grad/Shape_1"
+  input: "gradients/Mean_grad/Const_1"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+  attr {
+    key: "keep_dims"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "gradients/Mean_grad/Maximum"
+  op: "Maximum"
+  input: "gradients/Mean_grad/Prod_1"
+  input: "gradients/Mean_grad/Maximum/y"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+}
+node {
+  name: "gradients/Mean_grad/Shape"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 200
+      }
+    }
+  }
+}
+node {
+  name: "gradients/Mean_grad/Prod"
+  op: "Prod"
+  input: "gradients/Mean_grad/Shape"
+  input: "gradients/Mean_grad/Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+  attr {
+    key: "keep_dims"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "gradients/Mean_grad/floordiv"
+  op: "FloorDiv"
+  input: "gradients/Mean_grad/Prod"
+  input: "gradients/Mean_grad/Maximum"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+}
+node {
+  name: "gradients/Mean_grad/Cast"
+  op: "Cast"
+  input: "gradients/Mean_grad/floordiv"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "DstT"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "SrcT"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+}
+node {
+  name: "gradients/Mean_grad/Tile/multiples"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 200
+      }
+    }
+  }
+}
+node {
+  name: "gradients/Mean_grad/Reshape/shape"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "gradients/Const"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "gradients/Shape"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "gradients/Fill"
+  op: "Fill"
+  input: "gradients/Shape"
+  input: "gradients/Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+}
+node {
+  name: "gradients/Mean_grad/Reshape"
+  op: "Reshape"
+  input: "gradients/Fill"
+  input: "gradients/Mean_grad/Reshape/shape"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "gradients/Mean_grad/Tile"
+  op: "Tile"
+  input: "gradients/Mean_grad/Reshape"
+  input: "gradients/Mean_grad/Tile/multiples"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tmultiples"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "gradients/Mean_grad/truediv"
+  op: "RealDiv"
+  input: "gradients/Mean_grad/Tile"
+  input: "gradients/Mean_grad/Cast"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "gradients/Reshape_3_grad/Reshape"
+  op: "Reshape"
+  input: "gradients/Mean_grad/truediv"
+  input: "gradients/Reshape_3_grad/Shape"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "gradients/SoftmaxCrossEntropyWithLogits_grad/ExpandDims"
+  op: "ExpandDims"
+  input: "gradients/Reshape_3_grad/Reshape"
+  input: "gradients/SoftmaxCrossEntropyWithLogits_grad/ExpandDims/dim"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "Const"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "Slice_2/begin"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "Sub_2/y"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "concat_1/axis"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "concat_1/values_0"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: -1
+      }
+    }
+  }
+}
+node {
+  name: "Slice_1/size"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "Sub_1/y"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "Shape_2"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 2
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\310\000\000\000\n\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "Rank_2"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 2
+      }
+    }
+  }
+}
+node {
+  name: "Sub_1"
+  op: "Sub"
+  input: "Rank_2"
+  input: "Sub_1/y"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+}
+node {
+  name: "Slice_1/begin"
+  op: "Pack"
+  input: "Sub_1"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "N"
+    value {
+      i: 1
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "Slice_1"
+  op: "Slice"
+  input: "Shape_2"
+  input: "Slice_1/begin"
+  input: "Slice_1/size"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "concat_1"
+  op: "ConcatV2"
+  input: "concat_1/values_0"
+  input: "Slice_1"
+  input: "concat_1/axis"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 2
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "concat/axis"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "concat/values_0"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: -1
+      }
+    }
+  }
+}
+node {
+  name: "Slice/size"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "Sub/y"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "Shape_1"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 2
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\310\000\000\000\n\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "Rank_1"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 2
+      }
+    }
+  }
+}
+node {
+  name: "Sub"
+  op: "Sub"
+  input: "Rank_1"
+  input: "Sub/y"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+}
+node {
+  name: "Slice/begin"
+  op: "Pack"
+  input: "Sub"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "N"
+    value {
+      i: 1
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "Slice"
+  op: "Slice"
+  input: "Shape_1"
+  input: "Slice/begin"
+  input: "Slice/size"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "concat"
+  op: "ConcatV2"
+  input: "concat/values_0"
+  input: "Slice"
+  input: "concat/axis"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 2
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "Shape"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 2
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\310\000\000\000\n\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "Rank"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 2
+      }
+    }
+  }
+}
+node {
+  name: "Sub_2"
+  op: "Sub"
+  input: "Rank"
+  input: "Sub_2/y"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+}
+node {
+  name: "Slice_2/size"
+  op: "Pack"
+  input: "Sub_2"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "N"
+    value {
+      i: 1
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "Slice_2"
+  op: "Slice"
+  input: "Shape"
+  input: "Slice_2/begin"
+  input: "Slice_2/size"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "logits_biases"
+  op: "VariableV2"
+  device: "/job:localhost/replica:0/task:0/cpu:0"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@logits_biases"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 10
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "container"
+    value {
+      s: ""
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "shape"
+    value {
+      shape {
+        dim {
+          size: 10
+        }
+      }
+    }
+  }
+  attr {
+    key: "shared_name"
+    value {
+      s: ""
+    }
+  }
+}
+node {
+  name: "logits_biases/read"
+  op: "Identity"
+  input: "logits_biases"
+  device: "/job:localhost/replica:0/task:0/cpu:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@logits_biases"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 10
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "logits_weights"
+  op: "VariableV2"
+  device: "/job:localhost/replica:0/task:0/cpu:0"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@logits_weights"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 100
+          }
+          dim {
+            size: 10
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "container"
+    value {
+      s: ""
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "shape"
+    value {
+      shape {
+        dim {
+          size: 100
+        }
+        dim {
+          size: 10
+        }
+      }
+    }
+  }
+  attr {
+    key: "shared_name"
+    value {
+      s: ""
+    }
+  }
+}
+node {
+  name: "logits_weights/read"
+  op: "Identity"
+  input: "logits_weights"
+  device: "/job:localhost/replica:0/task:0/cpu:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@logits_weights"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 100
+          }
+          dim {
+            size: 10
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "hidden_biases"
+  op: "VariableV2"
+  device: "/job:localhost/replica:0/task:0/cpu:0"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@hidden_biases"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 100
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "container"
+    value {
+      s: ""
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "shape"
+    value {
+      shape {
+        dim {
+          size: 100
+        }
+      }
+    }
+  }
+  attr {
+    key: "shared_name"
+    value {
+      s: ""
+    }
+  }
+}
+node {
+  name: "hidden_biases/read"
+  op: "Identity"
+  input: "hidden_biases"
+  device: "/job:localhost/replica:0/task:0/cpu:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@hidden_biases"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 100
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "hidden_weights"
+  op: "VariableV2"
+  device: "/job:localhost/replica:0/task:0/cpu:0"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@hidden_weights"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 784
+          }
+          dim {
+            size: 100
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "container"
+    value {
+      s: ""
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "shape"
+    value {
+      shape {
+        dim {
+          size: 784
+        }
+        dim {
+          size: 100
+        }
+      }
+    }
+  }
+  attr {
+    key: "shared_name"
+    value {
+      s: ""
+    }
+  }
+}
+node {
+  name: "hidden_weights/read"
+  op: "Identity"
+  input: "hidden_weights"
+  device: "/job:localhost/replica:0/task:0/cpu:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@hidden_weights"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 784
+          }
+          dim {
+            size: 100
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "Reshape/shape"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/cpu:0"
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 2
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\310\000\000\000\377\377\377\377"
+      }
+    }
+  }
+}
+node {
+  name: "mnist_dataset_train_2/one_hot/depth"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/cpu:0"
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 10
+      }
+    }
+  }
+}
+node {
+  name: "mnist_dataset_train_2/one_hot/off_value"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/cpu:0"
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "mnist_dataset_train_2/one_hot/on_value"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/cpu:0"
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "mnist_dataset_train_2/random_shuffle_queue_DequeueMany/n"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/cpu:0"
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 200
+      }
+    }
+  }
+}
+node {
+  name: "mnist_dataset_train_1/random_shuffle_queue"
+  op: "RandomShuffleQueueV2"
+  device: "/job:localhost/replica:0/task:0/cpu:0"
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+  attr {
+    key: "capacity"
+    value {
+      i: 20000
+    }
+  }
+  attr {
+    key: "component_types"
+    value {
+      list {
+        type: DT_FLOAT
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    key: "container"
+    value {
+      s: ""
+    }
+  }
+  attr {
+    key: "min_after_dequeue"
+    value {
+      i: 4000
+    }
+  }
+  attr {
+    key: "seed"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "seed2"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 28
+          }
+          dim {
+            size: 28
+          }
+          dim {
+            size: 1
+          }
+        }
+        shape {
+        }
+      }
+    }
+  }
+  attr {
+    key: "shared_name"
+    value {
+      s: ""
+    }
+  }
+}
+node {
+  name: "mnist_dataset_train_2/random_shuffle_queue_DequeueMany"
+  op: "QueueDequeueManyV2"
+  input: "mnist_dataset_train_1/random_shuffle_queue"
+  input: "mnist_dataset_train_2/random_shuffle_queue_DequeueMany/n"
+  device: "/job:localhost/replica:0/task:0/cpu:0"
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          unknown_rank: true
+        }
+        shape {
+          unknown_rank: true
+        }
+      }
+    }
+  }
+  attr {
+    key: "component_types"
+    value {
+      list {
+        type: DT_FLOAT
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    key: "timeout_ms"
+    value {
+      i: -1
+    }
+  }
+}
+node {
+  name: "Reshape"
+  op: "Reshape"
+  input: "mnist_dataset_train_2/random_shuffle_queue_DequeueMany"
+  input: "Reshape/shape"
+  device: "/job:localhost/replica:0/task:0/cpu:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+          dim {
+            size: -1
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "MatMul"
+  op: "MatMul"
+  input: "Reshape"
+  input: "hidden_weights/read"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+          dim {
+            size: 100
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "transpose_a"
+    value {
+      b: false
+    }
+  }
+  attr {
+    key: "transpose_b"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "add"
+  op: "Add"
+  input: "MatMul"
+  input: "hidden_biases/read"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+          dim {
+            size: 100
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "Relu"
+  op: "Relu"
+  input: "add"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+          dim {
+            size: 100
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "MatMul_1"
+  op: "MatMul"
+  input: "Relu"
+  input: "logits_weights/read"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+          dim {
+            size: 10
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "transpose_a"
+    value {
+      b: false
+    }
+  }
+  attr {
+    key: "transpose_b"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "add_1"
+  op: "Add"
+  input: "MatMul_1"
+  input: "logits_biases/read"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+          dim {
+            size: 10
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "Reshape_1"
+  op: "Reshape"
+  input: "add_1"
+  input: "concat"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+          dim {
+            size: 10
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "mnist_dataset_train_2/one_hot"
+  op: "OneHot"
+  input: "mnist_dataset_train_2/random_shuffle_queue_DequeueMany:1"
+  input: "mnist_dataset_train_2/one_hot/depth"
+  input: "mnist_dataset_train_2/one_hot/on_value"
+  input: "mnist_dataset_train_2/one_hot/off_value"
+  device: "/job:localhost/replica:0/task:0/cpu:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "TI"
+    value {
+      type: DT_INT64
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          unknown_rank: true
+        }
+      }
+    }
+  }
+  attr {
+    key: "axis"
+    value {
+      i: -1
+    }
+  }
+}
+node {
+  name: "Reshape_2"
+  op: "Reshape"
+  input: "mnist_dataset_train_2/one_hot"
+  input: "concat_1"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 10
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "SoftmaxCrossEntropyWithLogits"
+  op: "SoftmaxCrossEntropyWithLogits"
+  input: "Reshape_1"
+  input: "Reshape_2"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+        }
+        shape {
+          dim {
+            size: 200
+          }
+          dim {
+            size: 10
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "gradients/SoftmaxCrossEntropyWithLogits_grad/PreventGradient"
+  op: "PreventGradient"
+  input: "SoftmaxCrossEntropyWithLogits:1"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+          dim {
+            size: 10
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "message"
+    value {
+      s: "Currently there is no way to take the second derivative of softmax_cross_entropy_with_logits due to the fused  implementation\'s interaction with tf.gradients()"
+    }
+  }
+}
+node {
+  name: "gradients/SoftmaxCrossEntropyWithLogits_grad/mul"
+  op: "Mul"
+  input: "gradients/SoftmaxCrossEntropyWithLogits_grad/ExpandDims"
+  input: "gradients/SoftmaxCrossEntropyWithLogits_grad/PreventGradient"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+          dim {
+            size: 10
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "gradients/Reshape_1_grad/Reshape"
+  op: "Reshape"
+  input: "gradients/SoftmaxCrossEntropyWithLogits_grad/mul"
+  input: "gradients/Reshape_1_grad/Shape"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+          dim {
+            size: 10
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "gradients/add_1_grad/Sum_1"
+  op: "Sum"
+  input: "gradients/Reshape_1_grad/Reshape"
+  input: "gradients/add_1_grad/BroadcastGradientArgs:1"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 10
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "keep_dims"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "gradients/add_1_grad/Reshape_1"
+  op: "Reshape"
+  input: "gradients/add_1_grad/Sum_1"
+  input: "gradients/add_1_grad/Shape_1"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 10
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "gradients/add_1_grad/Sum"
+  op: "Sum"
+  input: "gradients/Reshape_1_grad/Reshape"
+  input: "gradients/add_1_grad/BroadcastGradientArgs"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+          dim {
+            size: 10
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "keep_dims"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "gradients/add_1_grad/Reshape"
+  op: "Reshape"
+  input: "gradients/add_1_grad/Sum"
+  input: "gradients/add_1_grad/Shape"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+          dim {
+            size: 10
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "gradients/add_1_grad/tuple/group_deps"
+  op: "NoOp"
+  input: "^gradients/add_1_grad/Reshape"
+  input: "^gradients/add_1_grad/Reshape_1"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+      }
+    }
+  }
+}
+node {
+  name: "gradients/add_1_grad/tuple/control_dependency_1"
+  op: "Identity"
+  input: "gradients/add_1_grad/Reshape_1"
+  input: "^gradients/add_1_grad/tuple/group_deps"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@gradients/add_1_grad/Reshape_1"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 10
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "GradientDescent/update_logits_biases/ApplyGradientDescent"
+  op: "ApplyGradientDescent"
+  input: "logits_biases"
+  input: "GradientDescent/learning_rate"
+  input: "gradients/add_1_grad/tuple/control_dependency_1"
+  device: "/job:localhost/replica:0/task:0/cpu:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@logits_biases"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 10
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "use_locking"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "gradients/add_1_grad/tuple/control_dependency"
+  op: "Identity"
+  input: "gradients/add_1_grad/Reshape"
+  input: "^gradients/add_1_grad/tuple/group_deps"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@gradients/add_1_grad/Reshape"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+          dim {
+            size: 10
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "gradients/MatMul_1_grad/MatMul_1"
+  op: "MatMul"
+  input: "Relu"
+  input: "gradients/add_1_grad/tuple/control_dependency"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 100
+          }
+          dim {
+            size: 10
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "transpose_a"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "transpose_b"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "gradients/MatMul_1_grad/MatMul"
+  op: "MatMul"
+  input: "gradients/add_1_grad/tuple/control_dependency"
+  input: "logits_weights/read"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+          dim {
+            size: 100
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "transpose_a"
+    value {
+      b: false
+    }
+  }
+  attr {
+    key: "transpose_b"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "gradients/MatMul_1_grad/tuple/group_deps"
+  op: "NoOp"
+  input: "^gradients/MatMul_1_grad/MatMul"
+  input: "^gradients/MatMul_1_grad/MatMul_1"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+      }
+    }
+  }
+}
+node {
+  name: "gradients/MatMul_1_grad/tuple/control_dependency_1"
+  op: "Identity"
+  input: "gradients/MatMul_1_grad/MatMul_1"
+  input: "^gradients/MatMul_1_grad/tuple/group_deps"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@gradients/MatMul_1_grad/MatMul_1"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 100
+          }
+          dim {
+            size: 10
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "GradientDescent/update_logits_weights/ApplyGradientDescent"
+  op: "ApplyGradientDescent"
+  input: "logits_weights"
+  input: "GradientDescent/learning_rate"
+  input: "gradients/MatMul_1_grad/tuple/control_dependency_1"
+  device: "/job:localhost/replica:0/task:0/cpu:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@logits_weights"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 100
+          }
+          dim {
+            size: 10
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "use_locking"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "gradients/MatMul_1_grad/tuple/control_dependency"
+  op: "Identity"
+  input: "gradients/MatMul_1_grad/MatMul"
+  input: "^gradients/MatMul_1_grad/tuple/group_deps"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@gradients/MatMul_1_grad/MatMul"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+          dim {
+            size: 100
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "gradients/Relu_grad/ReluGrad"
+  op: "ReluGrad"
+  input: "gradients/MatMul_1_grad/tuple/control_dependency"
+  input: "Relu"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+          dim {
+            size: 100
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "gradients/add_grad/Sum_1"
+  op: "Sum"
+  input: "gradients/Relu_grad/ReluGrad"
+  input: "gradients/add_grad/BroadcastGradientArgs:1"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 100
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "keep_dims"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "gradients/add_grad/Reshape_1"
+  op: "Reshape"
+  input: "gradients/add_grad/Sum_1"
+  input: "gradients/add_grad/Shape_1"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 100
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "gradients/add_grad/Sum"
+  op: "Sum"
+  input: "gradients/Relu_grad/ReluGrad"
+  input: "gradients/add_grad/BroadcastGradientArgs"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+          dim {
+            size: 100
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "keep_dims"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "gradients/add_grad/Reshape"
+  op: "Reshape"
+  input: "gradients/add_grad/Sum"
+  input: "gradients/add_grad/Shape"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+          dim {
+            size: 100
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "gradients/add_grad/tuple/group_deps"
+  op: "NoOp"
+  input: "^gradients/add_grad/Reshape"
+  input: "^gradients/add_grad/Reshape_1"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+      }
+    }
+  }
+}
+node {
+  name: "gradients/add_grad/tuple/control_dependency_1"
+  op: "Identity"
+  input: "gradients/add_grad/Reshape_1"
+  input: "^gradients/add_grad/tuple/group_deps"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@gradients/add_grad/Reshape_1"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 100
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "GradientDescent/update_hidden_biases/ApplyGradientDescent"
+  op: "ApplyGradientDescent"
+  input: "hidden_biases"
+  input: "GradientDescent/learning_rate"
+  input: "gradients/add_grad/tuple/control_dependency_1"
+  device: "/job:localhost/replica:0/task:0/cpu:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@hidden_biases"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 100
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "use_locking"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "gradients/add_grad/tuple/control_dependency"
+  op: "Identity"
+  input: "gradients/add_grad/Reshape"
+  input: "^gradients/add_grad/tuple/group_deps"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@gradients/add_grad/Reshape"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+          dim {
+            size: 100
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "gradients/MatMul_grad/MatMul_1"
+  op: "MatMul"
+  input: "Reshape"
+  input: "gradients/add_grad/tuple/control_dependency"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 100
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "transpose_a"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "transpose_b"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "gradients/MatMul_grad/MatMul"
+  op: "MatMul"
+  input: "gradients/add_grad/tuple/control_dependency"
+  input: "hidden_weights/read"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+          dim {
+            size: 784
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "transpose_a"
+    value {
+      b: false
+    }
+  }
+  attr {
+    key: "transpose_b"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "gradients/MatMul_grad/tuple/group_deps"
+  op: "NoOp"
+  input: "^gradients/MatMul_grad/MatMul"
+  input: "^gradients/MatMul_grad/MatMul_1"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+      }
+    }
+  }
+}
+node {
+  name: "gradients/MatMul_grad/tuple/control_dependency_1"
+  op: "Identity"
+  input: "gradients/MatMul_grad/MatMul_1"
+  input: "^gradients/MatMul_grad/tuple/group_deps"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@gradients/MatMul_grad/MatMul_1"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 100
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "GradientDescent/update_hidden_weights/ApplyGradientDescent"
+  op: "ApplyGradientDescent"
+  input: "hidden_weights"
+  input: "GradientDescent/learning_rate"
+  input: "gradients/MatMul_grad/tuple/control_dependency_1"
+  device: "/job:localhost/replica:0/task:0/cpu:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@hidden_weights"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 784
+          }
+          dim {
+            size: 100
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "use_locking"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "GradientDescent"
+  op: "NoOp"
+  input: "^GradientDescent/update_hidden_weights/ApplyGradientDescent"
+  input: "^GradientDescent/update_hidden_biases/ApplyGradientDescent"
+  input: "^GradientDescent/update_logits_weights/ApplyGradientDescent"
+  input: "^GradientDescent/update_logits_biases/ApplyGradientDescent"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_2"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+      }
+    }
+  }
+}
+node {
+  name: "Reshape_3"
+  op: "Reshape"
+  input: "SoftmaxCrossEntropyWithLogits"
+  input: "Slice_2"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "Mean"
+  op: "Mean"
+  input: "Reshape_3"
+  input: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+  attr {
+    key: "keep_dims"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "_send_Mean_0"
+  op: "_Send"
+  input: "Mean"
+  device: "/job:localhost/replica:0/task:0/cpu:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "client_terminated"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "recv_device"
+    value {
+      s: "/job:localhost/replica:0/task:0/cpu:0"
+    }
+  }
+  attr {
+    key: "send_device"
+    value {
+      s: "/job:localhost/replica:0/task:0/cpu:0"
+    }
+  }
+  attr {
+    key: "send_device_incarnation"
+    value {
+      i: -5924635994370253548
+    }
+  }
+  attr {
+    key: "tensor_name"
+    value {
+      s: "Mean:0"
+    }
+  }
+}
+library {
+}
+versions {
+  producer: 21
+}
diff --git a/tensorflow/tensorboard/components/tf_graph_info_d3v4/demo/index.html b/tensorflow/tensorboard/components/tf_graph_info_d3v4/demo/index.html
new file mode 100644
index 0000000000000000000000000000000000000000..f7d2ef7ee5e56a870b1b49cfff3dd416953f3fa3
--- /dev/null
+++ b/tensorflow/tensorboard/components/tf_graph_info_d3v4/demo/index.html
@@ -0,0 +1,94 @@
+<!doctype html>
+<!--
+@license
+Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+-->
+<script src="../../webcomponentsjs/webcomponents-lite.min.js"></script>
+<link rel="import" href="../tf-graph-info.html">
+<link rel="import" href="../../tf-graph-common/tf-graph-common.html">
+<link rel="import" href="../../tf-graph-loader/tf-graph-loader.html">
+<link rel="import" href="../../iron-demo-helpers/demo-snippet.html">
+<title>TF Graph Info Demo</title>
+<style>
+  #demo-container {
+    border: 2px solid #808080;
+    width: 1000px;
+    height: 600px;
+  }
+</style>
+<demo-snippet>
+  <template>
+    <dom-module id="tf-graph-info-demo">
+      <template>
+        <!-- We first use the graph loader to load and parse a pbtxt file into a graph object. -->
+        <tf-graph-loader
+            id="loader"
+            datasets="[[_datasets]]"
+            selected-dataset="[[_selectedDataset]]"
+            out-graph="{{_graph}}">
+        </tf-graph-loader>
+
+        <tf-graph-info id="info" title="selected"></tf-graph-info>
+      </template>
+      <script>
+        "use strict";
+
+        Polymer({
+          is: "tf-graph-info-demo",
+          properties: {
+            // We tell the graph loader to load a specific pbtxt file.
+            _datasets: {
+              type: Array,
+              value: [{
+                "name": "Graph with XLA Clusters Specified",
+                "path": "data/graph.pbtxt"
+              }],
+            },
+            _selectedDataset: {
+              type: Number,
+              value: 0,
+            },
+
+            // This property will be updated by the graph loader.
+            _graph: {
+              type: Object,
+            },
+          },
+          observers: [
+            '_graphUpdated(_graph)',
+          ],
+          _graphUpdated: function(slimGraph) {
+            const tracker = tf.graph.util.getTracker(this.$.loader);
+            const hierarchyTracker = tf.graph.util.getSubtaskTracker(
+                tracker, 100, 'Namespace hierarchy');
+            const hierarchyOptions = {};
+            tf.graph.hierarchy.build(slimGraph, hierarchyOptions, hierarchyTracker).then(
+                function(graphHierarchy) {
+              // We have parsed and built the graph object from a pbtxt file. Show info.
+              this.$.info.set('graph', slimGraph);
+              this.$.info.set('graphHierarchy', graphHierarchy);
+
+              // Select a node within that graph.
+              this.$.info.set('selectedNode', 'GradientDescent/learning_rate');
+            }.bind(this));
+          },
+        });
+      </script>
+    </dom-module>
+    <div id='demo-container'>
+      <tf-graph-info-demo></tf-graph-info-demo>
+    </div>
+  </template>
+</demo-snippet>
diff --git a/tensorflow/tensorboard/components/tf_graph_info_d3v4/tf-graph-icon.html b/tensorflow/tensorboard/components/tf_graph_info_d3v4/tf-graph-icon.html
new file mode 100644
index 0000000000000000000000000000000000000000..a3e9dc59c5abcb649d07362c1d60edf656c26d67
--- /dev/null
+++ b/tensorflow/tensorboard/components/tf_graph_info_d3v4/tf-graph-icon.html
@@ -0,0 +1,296 @@
+<!--
+@license
+Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+-->
+
+<link rel="import" href="../polymer/polymer.html">
+<link rel="import" href="../tf-dashboard-common/tensorboard-color.html">
+
+<dom-module id="tf-graph-icon">
+  <style>
+    .faded-rect {
+      fill: url(#rectHatch);
+    }
+
+    .faded-ellipse {
+      fill: url(#ellipseHatch);
+    }
+
+    .faded-rect, .faded-ellipse, .faded-series {
+      stroke:   var(--tb-graph-faded) !important;
+    }
+  </style>
+  <template>
+    <template is="dom-if" if="[[_isType(node, type, 'OP')]]">
+      <template is="dom-if" if="[[_isConst(node, const)]]">
+        <svg height$="[[height]]"
+            preserveAspectRatio="xMinYMid meet" viewBox="0 0 10 10">
+          <circle cx="5" cy="5" r="3"
+              fill$="[[_getFill(_computedFill, 'OP')]]"
+              stroke$="[[_getStroke(_computedFill, 'OP')]]" />
+        </svg>
+      </template>
+      <template is="dom-if" if="[[_isSummary(node, summary)]]">
+        <svg width$="[[height]]" height$="[[height]]" viewBox="0 0 12 12">
+          <use x="0" y="0" xlink:href="#summary-icon" />
+        </svg>
+      </template>
+      <template is="dom-if" if="[[_isRegularOp(node, const, summary)]]">
+        <svg height$="[[height]]"
+            preserveAspectRatio="xMinYMid meet" viewBox="0 0 16 8">
+          <use xmlns:xlink="http://www.w3.org/1999/xlink"
+              xlink:href="#op-node-stamp"
+              fill$="[[_getFill(_computedFill, 'OP')]]"
+              stroke$="[[_getStroke(_computedFill, 'OP')]]"
+              class$="{{_fadedClass(renderInfo, 'ellipse')}}"
+              x="8" y="4" />
+        </svg>
+      </template>
+    </template>
+    <template is="dom-if" if="[[_isType(node, type, 'META')]]">
+      <svg height$="[[height]]"
+            preserveAspectRatio="xMinYMid meet" viewBox="0 0 37 16">
+        <rect x="1" y="1"
+            fill$="[[_getFill(_computedFill, 'META')]]"
+            stroke$="[[_getStroke(_computedFill, 'META')]]"
+            class$="{{_fadedClass(renderInfo, 'rect')}}"
+            stroke-width="2px"
+            height="14" width="35"
+            rx="5" ry="5"/>
+      </svg>
+    </template>
+    <template is="dom-if" if="[[_isType(node, type, 'SERIES')]]">
+      <template is="dom-if" if="[[_isVertical(node, vertical)]]">
+        <svg height$="[[height]]"
+            preserveAspectRatio="xMinYMid meet" viewBox="0 0 16 15">
+          <use xmlns:xlink="http://www.w3.org/1999/xlink"
+              xlink:href="#op-series-vertical-stamp"
+              fill$="[[_getFill(_computedFill, 'SERIES')]]"
+              stroke$="[[_getStroke(_computedFill, 'SERIES')]]"
+              class$="{{_fadedClass(renderInfo, 'series')}}"
+              x="0" y="2" />
+        </svg>
+      </template>
+      <template is="dom-if" if="[[!_isVertical(node, vertical)]]">
+        <svg height$="[[height]]"
+            preserveAspectRatio="xMinYMid meet" viewBox="0 0 24 10">
+          <use xmlns:xlink="http://www.w3.org/1999/xlink"
+              xlink:href="#op-series-horizontal-stamp"
+              fill$="[[_getFill(_computedFill, 'SERIES')]]"
+              stroke$="[[_getStroke(_computedFill, 'SERIES')]]"
+              class$="{{_fadedClass(renderInfo, 'series')}}"
+              x="0" y="1" />
+        </svg>
+      </template>
+    </template>
+  </template>
+
+  <script>
+    (function() {
+      Polymer({
+        is: 'tf-graph-icon',
+
+        properties: {
+          /**
+           * Node to represent with an icon. Optional, but if specified, its
+           * properties override those defined in the type, vertical, const and
+           * summary properties.
+           * @type {tf.graph.Node}
+           */
+          node: {
+            type: Object,
+            value: null
+          },
+
+          /**
+           * Render node information associated with this node. Optional. If
+           * specified, this is only used when computing the fill of the icon
+           * element.
+           * @type {tf.graph.render.RenderNodeInfo}
+           */
+          renderInfo: {
+            type: Object,
+            value: null
+          },
+
+          /**
+           * String indicating the type of coloring to use for this node, used
+           * only for determining the fill.
+           */
+          colorBy: {
+            type: Object,
+            value: "structural"
+          },
+
+          /**
+           * Function used by structural coloring algorithm to determine which
+           * color to use based on the template ID of the node. Optional.
+           */
+          templateIndex: {
+            type: Function,
+            value: null
+          },
+
+          /** Type of node to draw (ignored if node is set). */
+          type: {
+            type: String,
+            value: null
+          },
+
+          /** Direction for series (ignored for other types). */
+          vertical: {
+            type: Boolean,
+            value: false
+          },
+
+          /** Whether the op is Const (ignored for non-ops). */
+          const: {
+            type: Boolean,
+            value: false
+          },
+
+          /** Whether the op is a Summary (ignored for non-ops). */
+          summary: {
+            type: Boolean,
+            value: false
+          },
+
+          /**
+           * Fill for the icon, optional. If fill is specified and node is not
+           * specified, then this value will override the default for the
+           * element. However, if node is specified, this value will be ignored.
+           */
+          fill: {
+            type: String,
+            value: null
+          },
+
+          /** Height of the SVG element in pixels, used for scaling. */
+          height: {
+            type: Number,
+            value: 20
+          },
+
+          /** The computed fill for the node. **/
+          _computedFill: {
+            type: String,
+            computed:
+              "_getComputedFill(node, renderInfo, colorBy, templateIndex, fill)"
+          }
+
+        },
+
+        /**
+         * Get the computed fill value for the element.
+         */
+        _getComputedFill: function(inputNode, inputRenderInfo, inputColorBy,
+            inputTemplateIndex, inputFill) {
+          if (inputNode && inputRenderInfo &&
+              inputColorBy && inputTemplateIndex) {
+            var ns = tf.graph.scene.node;
+            var colorBy = ns.ColorBy[inputColorBy.toUpperCase()];
+            return ns.getFillForNode(inputTemplateIndex, colorBy,
+                inputRenderInfo, false);
+          }
+          return inputFill;
+        },
+
+        /**
+         * Get the fill value for the element, or if that's not possible, return
+         * the default fill value for the node type.
+         */
+        _getFill: function(inputComputedFill, inputNodeType) {
+          return inputComputedFill || ({
+            OP: tf.graph.render.OpNodeColors.DEFAULT_FILL,
+            META: tf.graph.render.MetanodeColors.DEFAULT_FILL,
+            SERIES: tf.graph.render.SeriesNodeColors.DEFAULT_FILL
+          })[inputNodeType];
+        },
+
+        /**
+         * Get the stroke value for the element, or if that's not possible,
+         * return the default stroke value for the node type.
+         */
+        _getStroke: function(inputComputedFill, inputNodeType) {
+          return inputComputedFill ?
+            tf.graph.scene.node.getStrokeForFill(inputComputedFill) :
+            ({
+              OP: tf.graph.render.OpNodeColors.DEFAULT_STROKE,
+              META: tf.graph.render.MetanodeColors.DEFAULT_STROKE,
+              SERIES: tf.graph.render.SeriesNodeColors.DEFAULT_STROKE
+            })[inputNodeType];
+        },
+
+        /**
+         * Test whether the specified node's type, or the literal type string,
+         * match a particular other type.
+         */
+        _isType: function(inputNode, inputType, targetType) {
+          if (inputNode) {
+            return tf.graph.NodeType[inputNode.type] === targetType;
+          }
+          return inputType === targetType;
+        },
+
+        /**
+         * Test whether the specified node should be represented as a vertical
+         * series. Defaults to the value of the vertical property if node is
+         * not specified.
+         */
+        _isVertical: function(inputNode, inputVertical) {
+          if (inputNode) {
+            return inputNode.hasNonControlEdges;
+          }
+          return !!inputVertical;
+        },
+
+        /**
+         * Test whether the specified node is a constant. Defaults to the value
+         * of the const property if node is not specified.
+         */
+        _isConst: function(inputNode, inputConst) {
+          if (inputNode) {
+            return inputNode.op === 'Const';
+          }
+          return !!inputConst;
+        },
+
+        /**
+         * Test whether the specified node is a summary. Defaults to the value
+         * of the summary property if node is not specified.
+         */
+        _isSummary: function(inputNode, inputSummary) {
+          if (inputNode) {
+            return this._isType(inputNode, null, 'OP') &&
+                inputNode.op.substr(-7) === 'Summary';
+          }
+          return !!inputSummary;
+        },
+
+        /**
+         * Test whether the op node is a regular non-summary non-const node.
+         */
+        _isRegularOp: function(inputNode, inputConst, inputSummary) {
+          return !this._isConst(inputNode, inputConst) &&
+              !this._isSummary(inputNode, inputSummary);
+        },
+
+        _fadedClass: function(itemRenderInfo, shape) {
+          return itemRenderInfo && itemRenderInfo.isFadedOut ? 'faded-' + shape : '';
+        }
+      });
+    })();
+  </script>
+</dom-module>
diff --git a/tensorflow/tensorboard/components/tf_graph_info_d3v4/tf-graph-info.html b/tensorflow/tensorboard/components/tf_graph_info_d3v4/tf-graph-info.html
new file mode 100644
index 0000000000000000000000000000000000000000..b33e1e00d04e4836322ce0975847aa88f6b0a5d2
--- /dev/null
+++ b/tensorflow/tensorboard/components/tf_graph_info_d3v4/tf-graph-info.html
@@ -0,0 +1,354 @@
+<!--
+@license
+Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+-->
+
+<link rel="import" href="../polymer/polymer.html">
+<link rel="import" href="../paper-slider/paper-slider.html">
+<link rel="import" href="../paper-spinner/paper-spinner-lite.html">
+<link rel="import" href="../tf-graph-common/tf-graph-common.html">
+<link rel="import" href="tf-node-info.html">
+
+<dom-module id="tf-graph-info">
+<template>
+<style>
+:host {
+  font-size: 12px;
+  margin: 0;
+  padding: 0;
+  display: block;
+}
+
+h2 {
+  padding: 0;
+  text-align: center;
+  margin: 0;
+}
+
+.health-pill-legend {
+  padding: 15px;
+}
+
+.health-pill-legend h2 {
+  text-align: left;
+}
+
+.health-pill-entry {
+  margin: 10px 10px 10px 0;
+}
+
+.health-pill-entry .color-preview {
+  width: 26px;
+  height: 26px;
+  border-radius: 3px;
+  display: inline-block;
+  margin: 0 10px 0 0;
+}
+
+.health-pill-entry .color-label, .health-pill-entry .tensor-count {
+  color: #777;
+  display: inline-block;
+  height: 26px;
+  font-size: 22px;
+  line-height: 26px;
+  vertical-align: top;
+}
+
+.health-pill-entry .tensor-count {
+  float: right;
+}
+
+#health-pill-step-slider {
+  width: 100%;
+  margin: 0 0 0 -15px;
+  /* 31 comes from adding a padding of 15px from both sides of the paper-slider, subtracting
+   * 1px so that the slider width aligns with the image (the last slider marker takes up 1px),
+   * and adding 2px to account for a border of 1px on both sides of the image. 30 - 1 + 2.
+   * Apparently, the paper-slider lacks a mixin for those padding values. */
+  width: calc(100% + 31px);
+}
+
+#health-pills-loading-spinner {
+  width: 20px;
+  height: 20px;
+  vertical-align: top;
+}
+
+#health-pill-step-number-input {
+  text-align: center;
+  vertical-align: top;
+}
+</style>
+<template is="dom-if" if="{{selectedNode}}">
+  <paper-material elevation="1" class="card">
+    <tf-node-info graph-hierarchy="[[graphHierarchy]]"
+                  render-hierarchy="[[renderHierarchy]]"
+                  flat-graph="[[graph]]"
+                  node-name="[[selectedNode]]"
+                  node-include="[[selectedNodeInclude]]"
+                  highlighted-node="{{highlightedNode}}"
+                  color-by="[[colorBy]]">
+    </tf-node-info>
+  </paper-material>
+</template>
+<template is="dom-if" if="[[_healthPillsAvailable(debuggerDataEnabled, nodeNamesToHealthPills)]]">
+  <paper-material elevation="1" class="card health-pill-legend">
+    <div class="title">
+      Enable all (not just sampled) steps. Requires slow disk read.
+    </div>
+    <paper-toggle-button id="enableAllStepsModeToggle" checked="{{allStepsModeEnabled}}">
+    </paper-toggle-button>
+    <h2>
+      Step of Health Pills:
+      <template is="dom-if" if="[[allStepsModeEnabled]]">
+        <input type="number"
+               id="health-pill-step-number-input"
+               min="0"
+               max="[[_biggestStepEverSeen]]"
+               value="{{specificHealthPillStep::input}}">
+      </template>
+      <template is="dom-if" if="[[!allStepsModeEnabled]]">
+        [[_currentStepDisplayValue]]
+      </template>
+
+      <paper-spinner-lite active
+                          hidden$=[[!areHealthPillsLoading]]
+                          id="health-pills-loading-spinner"></paper-spinner-lite>
+    </h2>
+    <template is="dom-if" if="[[allStepsModeEnabled]]">
+      <paper-slider
+            id="health-pill-step-slider"
+            immediate-value="{{specificHealthPillStep}}"
+            max="[[_biggestStepEverSeen]]"
+            snaps
+            step="1"
+            value="{{specificHealthPillStep}}"></paper-slider>
+    </template>
+    <template is="dom-if" if="[[!allStepsModeEnabled]]">
+      <template is="dom-if" if="[[_maxStepIndex]]">
+        <paper-slider
+              id="health-pill-step-slider"
+              immediate-value="{{healthPillStepIndex}}"
+              max="[[_maxStepIndex]]"
+              snaps
+              step="1"
+              value="{{healthPillStepIndex}}"></paper-slider>
+      </template>
+    </template>
+    <h2>
+      Health Pill
+      <template is="dom-if" if="[[healthPillValuesForSelectedNode]]">
+        Counts for Selected Node
+      </template>
+      <template is="dom-if" if="[[!healthPillValuesForSelectedNode]]">
+        Legend
+      </template>
+    </h2>
+    <template is="dom-repeat" items="[[healthPillEntries]]">
+      <div class="health-pill-entry">
+        <div class="color-preview" style="background:[[item.background_color]]"></div>
+        <div class="color-label">[[item.label]]</div>
+        <div class="tensor-count">
+          [[_computeTensorCountString(healthPillValuesForSelectedNode, index)]]
+        </div>
+      </div>
+    </template>
+  </paper-material>
+</template>
+</template>
+<script>
+"use strict";
+
+(function() {
+  Polymer({
+    is: 'tf-graph-info',
+
+    properties: {
+      title: String,
+      graphHierarchy: Object,
+      graph: Object,
+      renderHierarchy: Object,
+      nodeNamesToHealthPills: Object,
+      healthPillStepIndex: {
+        type: Number,
+        notify: true,
+      },
+      // Only relevant if we are in all steps mode, in which case the user may want to view health
+      // pills for a specific step.
+      specificHealthPillStep: {
+        type: Number,
+        value: 0,
+        notify: true,
+      },
+      colorBy: String,
+      // Two-ways
+      selectedNode: {
+        type: String,
+        notify: true
+      },
+      highlightedNode: {
+        type: String,
+        notify: true
+      },
+      // The enum value of the include property of the selected node.
+      selectedNodeInclude: {
+        type: Number,
+        notify: true
+      },
+      // Whether debugger data is enabled for this instance of Tensorboard.
+      debuggerDataEnabled: Boolean,
+      // Whether health pills are currently being loaded, in which case we show a spinner (and the
+      // current health pills shown might be out of date).
+      areHealthPillsLoading: Boolean,
+      healthPillEntries: {
+        type: Array,
+        value: tf.graph.scene.healthPillEntries,
+        readOnly: true,
+      },
+      healthPillValuesForSelectedNode: {
+        type: Array,
+        computed: '_computeHealthPillForNode(nodeNamesToHealthPills, healthPillStepIndex, selectedNode, allStepsModeEnabled, areHealthPillsLoading)',
+      },
+      // When all-steps mode is enabled, the user can request health pills for any step. In this
+      // mode, Tensorboard makes a request every time the user drags the slider to a different step.
+      allStepsModeEnabled: {
+        type: Boolean,
+        notify: true,
+      },
+      // The biggest step value ever seen. Used to determine what steps of health pills to let the
+      // user fetch in all steps mode.
+      _biggestStepEverSeen: {
+        type: Number,
+        computed: '_computeBiggestStepEverSeen(nodeNamesToHealthPills)',
+      },
+      _maxStepIndex: {
+        type: Number,
+        computed: '_computeMaxStepIndex(nodeNamesToHealthPills)',
+      },
+      _currentStepDisplayValue: {
+        type: String,
+        computed: '_computeCurrentStepDisplayValue(nodeNamesToHealthPills, healthPillStepIndex, allStepsModeEnabled, specificHealthPillStep, areHealthPillsLoading)',
+      },
+    },
+    listeners: {
+      'node-list-item-click': '_nodeListItemClicked',
+      'node-list-item-mouseover': '_nodeListItemMouseover',
+      'node-list-item-mouseout': '_nodeListItemMouseout'
+    },
+    _nodeListItemClicked: function(event) {
+      this.selectedNode = event.detail.nodeName;
+    },
+    _nodeListItemMouseover: function(event) {
+      this.highlightedNode = event.detail.nodeName;
+    },
+    _nodeListItemMouseout: function() {
+      this.highlightedNode = null;
+    },
+    _healthPillsAvailable: function(debuggerDataEnabled, nodeNamesToHealthPills) {
+      // So long as there is a mapping (even if empty) from node name to health pills, show the
+      // legend and slider. We do that because, even if no health pills exist at the current step,
+      // the user may desire to change steps, and the slider must show for the user to do that.
+      return debuggerDataEnabled && nodeNamesToHealthPills;
+    },
+    _computeTensorCountString: function(healthPillValuesForSelectedNode, valueIndex) {
+      if (!healthPillValuesForSelectedNode) {
+        // No health pill data is available.
+        return '';
+      }
+
+      return healthPillValuesForSelectedNode[valueIndex].toFixed(0);
+    },
+    _computeHealthPillForNode: function(
+        nodeNamesToHealthPills, healthPillStepIndex, selectedNode, allStepsModeEnabled, areHealthPillsLoading) {
+      if (areHealthPillsLoading) {
+        // Health pills are loading. Do not render data that is out of date.
+        return null;
+      }
+
+      if (!selectedNode) {
+        // No node is selected.
+        return null;
+      }
+
+      const healthPills = nodeNamesToHealthPills[selectedNode];
+      if (!healthPills) {
+        // This node lacks a health pill.
+        return null;
+      }
+
+      // If all steps mode is enabled, we use the first health pill in the list because the JSON
+      // response from the server is a mapping between node name and a list of 1 health pill.
+      const healthPill = healthPills[allStepsModeEnabled ? 0 : healthPillStepIndex];
+      if (!healthPill) {
+        // This node lacks a health pill at the current step.
+        return null;
+      }
+
+      // The health pill count values start at 2. Each health pill contains 6 values.
+      return healthPill.value.slice(2, 8);
+    },
+    _computeCurrentStepDisplayValue: function(
+        nodeNamesToHealthPills,
+        healthPillStepIndex,
+        allStepsModeEnabled,
+        specificHealthPillStep,
+        areHealthPillsLoading) {
+      if (allStepsModeEnabled) {
+        // The user seeks health pills for specific step from the server.
+        return specificHealthPillStep.toFixed(0);
+      }
+
+      if (areHealthPillsLoading) {
+        // The current step is undefined.
+        return 0;
+      }
+
+      for (let nodeName in nodeNamesToHealthPills) {
+        // All nodes have the same number of steps stored, so only examine 1 node. We cannot
+        // directly index into the nodeNamesToHealthPills object because we do not have a key.
+        // If all steps mode is enabled, we only have 1 step to show.
+        return nodeNamesToHealthPills[nodeName][healthPillStepIndex].step.toFixed(0);
+      }
+
+      // The current step could not be computed.
+      return 0;
+    },
+    _computeBiggestStepEverSeen: function(nodeNamesToHealthPills) {
+      for (let nodeName in nodeNamesToHealthPills) {
+        // All nodes have the same number of steps stored, so only examine 1 node.
+        // The index is 1 less than the count. Tensorboard backend logic guarantees that the length
+        // of the array will be greater than 1.
+        var healthPills = nodeNamesToHealthPills[nodeName];
+        return Math.max(this._biggestStepEverSeen, healthPills[healthPills.length - 1].step);
+      }
+
+      // No steps seen so far. Default to 0.
+      return this._biggestStepEverSeen || 0;
+    },
+    _computeMaxStepIndex: function(nodeNamesToHealthPills) {
+      for (let nodeName in nodeNamesToHealthPills) {
+        // All nodes have the same number of steps stored, so only examine 1 node.
+        // The index is 1 less than the count. Tensorboard backend logic guarantees that the length
+        // of the array will be greater than 1.
+        return nodeNamesToHealthPills[nodeName].length - 1;
+      }
+
+      // Return a falsy value. The slider should be hidden.
+      return 0;
+    },
+  });
+})();
+</script>
+</dom-module>
diff --git a/tensorflow/tensorboard/components/tf_graph_info_d3v4/tf-node-info.html b/tensorflow/tensorboard/components/tf_graph_info_d3v4/tf-node-info.html
new file mode 100644
index 0000000000000000000000000000000000000000..f1455acaee2b9f9cc7c5ef30c0036b3301f378e3
--- /dev/null
+++ b/tensorflow/tensorboard/components/tf_graph_info_d3v4/tf-node-info.html
@@ -0,0 +1,652 @@
+<!--
+@license
+Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+-->
+
+<link rel="import" href="../iron-collapse/iron-collapse.html">
+<link rel="import" href="../iron-list/iron-list.html">
+<link rel="import" href="../polymer/polymer.html">
+<link rel="import" href="../paper-icon-button/paper-icon-button.html">
+<link rel="import" href="../paper-item/paper-item.html">
+<link rel="import" href="../paper-item/paper-item-body.html">
+<link rel="import" href="../tf-graph-common/tf-graph-common.html">
+<link rel="import" href="tf-graph-icon.html">
+<link rel="import" href="tf-node-list-item.html">
+
+<dom-module id="tf-node-info">
+  <style>
+  .sub-list-group {
+    font-weight: 500;
+    font-size: 12pt;
+    padding-bottom: 8px;
+    width: 100%;
+  }
+
+  .sub-list {
+    max-height: 300px;
+    overflow-y: scroll;
+  }
+
+  .attr-left {
+    float: left;
+    width: 30%;
+    word-wrap: break-word;
+    color: #565656;
+    font-size: 11pt;
+    font-weight: 400;
+  }
+
+  .attr-right {
+    margin-left: 30%;
+    word-wrap: break-word;
+    color: #565656;
+    font-weight: 400;
+  }
+
+  .sub-list-table {
+    display: table;
+    width: 100%;
+  }
+
+  .sub-list-table-row {
+    display: table-row;
+  }
+
+  .sub-list-table-row .sub-list-table-cell:last-child {
+    text-align: right;
+  }
+
+  .sub-list-table-cell {
+    color: #565656;
+    display: table-cell;
+    font-size: 11pt;
+    font-weight: 400;
+    max-width: 200px;
+    padding: 0 4px;
+  }
+
+  paper-item {
+    padding: 0;
+    background: #e9e9e9;
+  }
+
+  paper-item-body[two-line] {
+    min-height: 0;
+    padding: 8px 12px 4px;
+  }
+
+  .expandedInfo {
+    padding: 8px 12px;
+  }
+
+  .controlDeps {
+    padding: 0 0 0 8px;
+  }
+
+  .node-name {
+    white-space: normal;
+    word-wrap: break-word;
+    font-size: 14pt;
+    font-weight: 500;
+  }
+
+  .node-icon {
+    float: right;
+  }
+
+  .subtitle {
+    font-size: 12pt;
+    color: #5e5e5e;
+  }
+
+  .controlLine {
+    font-size: 11pt;
+    font-weight: 400;
+  }
+
+  .toggle-button {
+    float: right;
+    max-height: 20px;
+    max-width: 20px;
+    padding: 0;
+  }
+
+  .control-toggle-button {
+    float: left;
+    max-height: 20px;
+    max-width: 20px;
+    padding: 0;
+  }
+
+  .toggle-include-group {
+    padding-top: 4px;
+  }
+
+  .toggle-include {
+    margin: 5px 6px;
+    text-transform: none;
+    padding: 4px 6px;
+    font-size: 10pt;
+    background-color: #fafafa;
+    color: #666;
+  }
+
+  .toggle-include:hover {
+    background-color: var(--google-yellow-100);
+  }
+
+  .non-control-list-item {
+    padding-left: 10px;
+  }
+  </style>
+  <template>
+    <paper-item>
+      <paper-item-body two-line>
+        <div>
+          <paper-icon-button
+            icon="{{_getToggleIcon(_expanded)}}"
+            on-click="_toggleExpanded"
+            class="toggle-button">
+          </paper-icon-button>
+          <div class="node-name" id="nodetitle"></div>
+        </div>
+        <div secondary>
+          <tf-graph-icon class="node-icon" node="[[_node]]"
+              render-info="[[_getRenderInfo(nodeName, renderHierarchy)]]"
+              color-by="[[colorBy]]"
+              template-index="[[_templateIndex]]"
+              ></tf-graph-icon>
+          <template is="dom-if" if="{{_node.op}}">
+            <div class="subtitle">
+              Operation:
+              <span>[[_node.op]]</span>
+            </div>
+          </template>
+          <template is="dom-if" if="{{_node.metagraph}}">
+            <div class="subtitle">
+              Subgraph:
+              <span>[[_node.cardinality]]</span> nodes
+            </div>
+          </template>
+        </div>
+      </paper-item-body>
+    </paper-item>
+    <iron-collapse opened="{{_expanded}}">
+    <template is="dom-if" if="{{_expanded}}" restamp="true">
+      <div class="expandedInfo">
+        <div class="sub-list-group attributes">
+          Attributes
+          (<span>[[_attributes.length]]</span>)
+          <iron-list class="sub-list" id ="attributesList"
+                    items="[[_attributes]]">
+            <template>
+              <div>
+                <div class="attr-left">[[item.key]]</div>
+                <div class="attr-right">[[item.value]]</div>
+              </div>
+            </template>
+          </iron-list>
+        </div>
+
+        <template is="dom-if" if="{{_device}}">
+          <div class="sub-list-group device">
+            <div class="attr-left">Device</div>
+            <div class="attr-right">[[_device]]</div>
+          </div>
+        </template>
+
+        <div class="sub-list-group predecessors">
+          Inputs
+          (<span>[[_totalPredecessors]]</span>)
+          <iron-list class="sub-list" id ="inputsList"
+                    items="[[_predecessors.regular]]">
+            <template>
+              <tf-node-list-item
+                  class="non-control-list-item"
+                  card-node="[[_node]]"
+                  item-node="[[item.node]]"
+                  edge-label="[[item.edgeLabel]]"
+                  item-render-info="[[item.renderInfo]]"
+                  name="[[item.name]]"
+                  item-type="predecessors"
+                  color-by="[[colorBy]]"
+                  template-index="[[_templateIndex]]">
+              </tf-node-list-item>
+            </template>
+          </iron-list>
+          <template is="dom-if" if="[[_predecessors.control.length]]">
+            <div class="controlDeps">
+              <div class="controlLine">
+                <paper-icon-button
+                  icon="{{_getToggleIcon(_openedControlPred)}}"
+                  on-click="_toggleControlPred"
+                  class="control-toggle-button">
+                </paper-icon-button>
+                Control dependencies
+              </div>
+              <iron-collapse opened="{{_openedControlPred}}" no-animation>
+                <template is="dom-if" if="{{_openedControlPred}}" restamp="true">
+                  <iron-list class="sub-list" items="[[_predecessors.control]]">
+                    <template>
+                      <tf-node-list-item
+                          card-node="[[_node]]"
+                          item-node="[[item.node]]"
+                          item-render-info="[[item.renderInfo]]"
+                          name="[[item.name]]"
+                          item-type="predecessors"
+                          color-by="[[colorBy]]"
+                          template-index="[[_templateIndex]]">
+                      </tf-node-list-item>
+                    </template>
+                  </iron-list>
+                </template>
+              </iron-collapse>
+            </div>
+          </template>
+        </div>
+
+        <div class="sub-list-group successors">
+          Outputs
+          (<span>[[_totalSuccessors]]</span>)
+          <iron-list class="sub-list" id ="outputsList"
+                    items="[[_successors.regular]]">
+            <template>
+              <tf-node-list-item
+                  class="non-control-list-item"
+                  card-node="[[_node]]"
+                  item-node="[[item.node]]"
+                  edge-label="[[item.edgeLabel]]"
+                  item-render-info="[[item.renderInfo]]"
+                  name="[[item.name]]"
+                  item-type="successor"
+                  color-by="[[colorBy]]"
+                  template-index="[[_templateIndex]]">
+              </tf-node-list-item>
+            </template>
+          </iron-list>
+          <template is="dom-if" if="[[_successors.control.length]]">
+            <div class="controlDeps">
+              <div class="controlLine">
+                <paper-icon-button
+                  icon="{{_getToggleIcon(_openedControlSucc)}}"
+                  on-click="_toggleControlSucc"
+                  class="control-toggle-button">
+                </paper-icon-button>
+                Control dependencies
+              </div>
+              <iron-collapse opened="{{_openedControlSucc}}" no-animation>
+                <template is="dom-if" if="{{_openedControlSucc}}" restamp="true">
+                  <iron-list class="sub-list" items="[[_successors.control]]">
+                    <template>
+                      <tf-node-list-item
+                          card-node="[[_node]]"
+                          item-node="[[item.node]]"
+                          item-render-info="[[item.renderInfo]]"
+                          name="[[item.name]]"
+                          item-type="successors"
+                          color-by="[[colorBy]]"
+                          template-index="[[_templateIndex]]">
+                      </tf-node-list-item>
+                    </template>
+                  </iron-list>
+                </template>
+              </iron-collapse>
+            </div>
+          </template>
+        </div>
+        <template is="dom-if" if="{{_hasDisplayableNodeStats}}">
+          <div class="sub-list-group node-stats">
+            Node Stats
+            <div class="sub-list-table">
+              <template is="dom-if" if="{{_nodeStats.totalBytes}}">
+                <div class="sub-list-table-row">
+                  <div class="sub-list-table-cell">Memory</div>
+                  <div class="sub-list-table-cell">[[_nodeStatsFormattedBytes]]</div>
+                </div>
+              </template>
+              <template is="dom-if" if="{{_getTotalMicros(_nodeStats)}}">
+                <div class="sub-list-table-row">
+                  <div class="sub-list-table-cell">Compute Time</div>
+                  <div class="sub-list-table-cell">[[_nodeStatsFormattedComputeTime]]</div>
+                </div>
+              </template>
+              <template is="dom-if" if="{{_nodeStats.outputSize}}">
+                <div class="sub-list-table-row">
+                  <div class="sub-list-table-cell">Tensor Output Sizes</div>
+                  <div class="sub-list-table-cell">
+                    <template is="dom-repeat" items="{{_nodeStatsFormattedOutputSizes}}">
+                      [[item]] <br/>
+                    </template>
+                  </div>
+                </div>
+              </template>
+            </div>
+          </div>
+        </template>
+        <div class="toggle-include-group">
+          <paper-button raised class="toggle-include" on-click="_toggleInclude">
+            <span>[[_auxButtonText]]</span>
+          </paper-button>
+        </div>
+        <template is="dom-if" if="{{_isInSeries(_node)}}">
+          <div class="toggle-include-group">
+            <paper-button raised class="toggle-include" on-click="_toggleGroup">
+              <span>[[_groupButtonText]]</span>
+            </paper-button>
+          </div>
+        </template>
+      </div>
+    </template>
+    </iron-collapse>
+  </template>
+
+  <script>
+    (function() {
+      Polymer({
+        is: 'tf-node-info',
+
+        properties: {
+          nodeName: String,
+          graphHierarchy: Object,
+          renderHierarchy: Object,
+          /** What to color the nodes by (compute time, memory, device etc.) */
+          colorBy: String,
+          _templateIndex: {
+            type: Function,
+            computed: '_getTemplateIndex(graphHierarchy)'
+          },
+          _node: {
+            type: Object,
+            computed: '_getNode(nodeName, graphHierarchy)',
+            observer: '_resetState'
+          },
+          _nodeStats: {
+            type: Object,
+            computed: '_getNodeStats(nodeName, graphHierarchy)',
+            observer: '_resetState'
+          },
+          _hasDisplayableNodeStats: {
+            type: Object,
+            computed: '_getHasDisplayableNodeStats(_nodeStats)',
+          },
+          _nodeStatsFormattedBytes: {
+            type: String,
+            computed: '_getNodeStatsFormattedBytes(_nodeStats)',
+          },
+          _nodeStatsFormattedComputeTime: {
+            type: String,
+            computed: '_getNodeStatsFormattedComputeTime(_nodeStats)',
+          },
+          _nodeStatsFormattedOutputSizes: {
+            type: Array,
+            computed: '_getNodeStatsFormattedOutputSizes(_nodeStats)',
+          },
+          // The enum value of the include property of the selected node.
+          nodeInclude: {
+            type: Number,
+            observer: '_nodeIncludeStateChanged'
+          },
+          _attributes: {
+            type: Array,
+            computed: '_getAttributes(_node)'
+          },
+          _device: {
+            type: String,
+            computed: '_getDevice(_node)'
+          },
+          _successors: {
+            type: Object,
+            computed: '_getSuccessors(_node, graphHierarchy)'
+          },
+          _predecessors: {
+            type: Object,
+            computed: '_getPredecessors(_node, graphHierarchy)'
+          },
+          _subnodes: {
+            type: Array,
+            computed: '_getSubnodes(_node)'
+          },
+          _expanded: {
+            type: Boolean,
+            value: true
+          },
+          _totalPredecessors: {
+            type: Number,
+            computed: '_getTotalPred(_predecessors)'
+          },
+          _totalSuccessors: {
+            type: Number,
+            computed: '_getTotalSucc(_successors)'
+          },
+          _openedControlPred: {
+            type: Boolean,
+            value: false
+          },
+          _openedControlSucc: {
+            type: Boolean,
+            value: false
+          },
+          _auxButtonText: String,
+          _groupButtonText: String
+        },
+        expandNode: function() {
+          this.fire('_node.expand', this.node);
+        },
+        _getTemplateIndex: function(graphHierarchy) {
+          return graphHierarchy.getTemplateIndex();
+        },
+        _getNode: function(nodeName, graphHierarchy) {
+          return graphHierarchy.node(nodeName);
+        },
+        _getNodeStats: function(nodeName, graphHierarchy) {
+          var node = this._getNode(nodeName, graphHierarchy);
+          if (node) {
+            return node.stats;
+          }
+          return null;
+        },
+        _getTotalMicros: function(stats) {
+          return stats.getTotalMicros();
+        },
+        _getHasDisplayableNodeStats: function(stats) {
+          return tf.graph.util.hasDisplayableNodeStats(stats);
+        },
+        _getNodeStatsFormattedBytes: function(stats) {
+          if (!stats || !stats.totalBytes) {
+            return;
+          }
+
+          return tf.graph.util.convertUnitsToHumanReadable(
+              stats.totalBytes, tf.graph.util.MEMORY_UNITS);
+        },
+        _getNodeStatsFormattedComputeTime: function(stats) {
+          if (!stats || !stats.getTotalMicros()) {
+            return;
+          }
+
+          return tf.graph.util.convertUnitsToHumanReadable(
+              stats.getTotalMicros(), tf.graph.util.TIME_UNITS);
+        },
+        _getNodeStatsFormattedOutputSizes: function(stats) {
+          if (!stats || !stats.outputSize || !stats.outputSize.length) {
+            return;
+          }
+
+          return _.map(stats.outputSize, function(shape) {
+            if (shape.length === 0) {
+              return "scalar";
+            }
+            return "[" + shape.join(", ") + "]";
+          });
+        },
+        _getPrintableHTMLNodeName: function(nodeName) {
+          // Insert an optional line break before each slash so that
+          // long node names wrap cleanly at path boundaries.
+          return (nodeName || '').replace(/\//g, '<wbr>/');
+        },
+        _getRenderInfo: function(nodeName, renderHierarchy) {
+          return this.renderHierarchy.getOrCreateRenderNodeByName(nodeName);
+        },
+        _getAttributes: function(node) {
+          this.async(this._resizeList.bind(this, "#attributesList"));
+          if (!node || !node.attr) {
+            return [];
+          }
+          var attrs = [];
+          _.each(node.attr, function(entry) {
+            // Unpack the "too large" attributes into separate attributes
+            // in the info card, with values "too large to show".
+            if (entry.key === tf.graph.LARGE_ATTRS_KEY) {
+              attrs = attrs.concat(entry.value.list.s.map(function(key) {
+                return {key: key, value: "Too large to show..."};
+              }));
+            } else {
+              attrs.push({
+                key: entry.key,
+                value: JSON.stringify(entry.value)
+              });
+            }
+          });
+          return attrs;
+        },
+        _getDevice: function(node) {
+          return node ? node.device : null;
+        },
+        _getSuccessors: function(node, hierarchy) {
+          this.async(this._resizeList.bind(this, "#inputsList"));
+          if (!node) {
+            return {regular: [], control: []}
+          }
+          return this._convertEdgeListToEdgeInfoList(
+            hierarchy.getSuccessors(node.name), false, node.isGroupNode);
+        },
+        _getPredecessors: function(node, hierarchy) {
+          this.async(this._resizeList.bind(this, "#outputsList"));
+          if (!node) {
+            return {regular: [], control: []}
+          }
+          return this._convertEdgeListToEdgeInfoList(
+            hierarchy.getPredecessors(node.name), true, node.isGroupNode);
+        },
+        _convertEdgeListToEdgeInfoList: function(list, isPredecessor, isGroupNode) {
+
+          /**
+           * Unpacks the metaedge into a list of base edge information
+           * that can be rendered.
+           */
+          var unpackMetaedge = function(metaedge) {
+            return _.map(metaedge.baseEdgeList, function(baseEdge) {
+              name = isPredecessor ? baseEdge.v : baseEdge.w;
+              return {
+                name: name,
+                node: this._getNode(name, this.graphHierarchy),
+                edgeLabel: tf.graph.scene.edge.getLabelForBaseEdge(baseEdge,
+                    this.renderHierarchy),
+                renderInfo: this._getRenderInfo(name, this.renderHierarchy)
+              };
+            }, this);
+          }.bind(this);
+
+          /**
+           * Converts a list of metaedges to a list of edge information
+           * that can be rendered.
+           */
+          var toEdgeInfoList = function(edges) {
+            var edgeInfoList = [];
+            _.each(edges, function(metaedge) {
+              var name = isPredecessor ? metaedge.v : metaedge.w;
+              // Enumerate all the base edges if the node is an OpNode, or the
+              // metaedge has only 1 edge in it.
+              if (!isGroupNode || metaedge.baseEdgeList.length == 1) {
+                edgeInfoList = edgeInfoList.concat(unpackMetaedge(metaedge));
+              } else {
+                edgeInfoList.push({
+                  name: name,
+                  node: this._getNode(name, this.graphHierarchy),
+                  edgeLabel: tf.graph.scene.edge.getLabelForEdge(metaedge,
+                      this.renderHierarchy),
+                  renderInfo: this._getRenderInfo(name, this.renderHierarchy)
+                });
+              }
+            }, this);
+            return edgeInfoList;
+          }.bind(this);
+
+          return {
+            regular: toEdgeInfoList(list.regular),
+            control: toEdgeInfoList(list.control)
+          };
+        },
+        _getSubnodes: function(node) {
+          return node && node.metagraph ? node.metagraph.nodes() : null;
+        },
+        _getTotalPred: function(predecessors) {
+          return predecessors.regular.length + predecessors.control.length;
+        },
+        _getTotalSucc: function(successors) {
+          return successors.regular.length + successors.control.length;
+        },
+        _toggleControlPred: function() {
+          this._openedControlPred = !this._openedControlPred;
+        },
+        _toggleControlSucc: function() {
+          this._openedControlSucc = !this._openedControlSucc;
+        },
+        _toggleExpanded: function() {
+          this._expanded = !this._expanded;
+        },
+        _getToggleIcon: function(expanded) {
+          return expanded ? "expand-less" : "expand-more";
+        },
+        _resetState: function() {
+          this._openedControlPred = false;
+          this._openedControlSucc = false;
+
+          this.set("_groupButtonText",
+            tf.graph.scene.node.getGroupSettingLabel(this._node));
+
+          if (this._node) {
+            Polymer.dom(this.$.nodetitle).innerHTML =
+              this._getPrintableHTMLNodeName(this._node.name);
+          }
+        },
+        _resizeList: function(selector) {
+          var list = document.querySelector(selector);
+          if (list) {
+            list.fire('iron-resize');
+          }
+        },
+        _toggleInclude: function() {
+          var graphElem = document.querySelector("#graph");
+          graphElem.fire("node-toggle-extract", { name: this.nodeName });
+          var graphBoardElem = document.querySelector("#graphboard");
+          graphBoardElem.fire("node-toggle-extract");
+        },
+        _nodeIncludeStateChanged: function(include, oldInclude) {
+          this.set("_auxButtonText",
+            tf.graph.getIncludeNodeButtonString(include));
+        },
+        _toggleGroup: function() {
+          var graphElem = document.querySelector("#graph");
+          var seriesName = tf.graph.scene.node.getSeriesName(this._node);
+          graphElem.fire("node-toggle-seriesgroup", { name: seriesName });
+        },
+        _isInSeries: function(node) {
+          return tf.graph.scene.node.canBeInSeries(node);
+        }
+      });
+    })();
+  </script>
+</dom-module>
diff --git a/tensorflow/tensorboard/components/tf_graph_info_d3v4/tf-node-list-item.html b/tensorflow/tensorboard/components/tf_graph_info_d3v4/tf-node-list-item.html
new file mode 100644
index 0000000000000000000000000000000000000000..c15478d126ccbb055a7bbb46f3a29c897321a648
--- /dev/null
+++ b/tensorflow/tensorboard/components/tf_graph_info_d3v4/tf-node-list-item.html
@@ -0,0 +1,138 @@
+<!--
+@license
+Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+-->
+
+<link rel="import" href="../polymer/polymer.html">
+<link rel="import" href="../tf-dashboard-common/tensorboard-color.html">
+<link rel="import" href="tf-graph-icon.html">
+
+<dom-module id="tf-node-list-item">
+  <style>
+  #list-item {
+    width: 100%;
+    color: #565656;
+    font-size: 11pt;
+    font-weight: 400;
+    position: relative;
+    display: inline-block;
+  }
+
+  #list-item:hover {
+    background-color: var(--google-yellow-100);
+  }
+
+  .clickable {
+    cursor: pointer;
+  }
+
+  #list-item span {
+    margin-left: 40px;
+  }
+
+  #list-item.excluded span {
+    color: #999;
+  }
+
+  #list-item span.edge-label {
+    float: right;
+    font-size: 10px;
+    margin-left: 3px;
+    margin-right: 5px;
+  }
+
+  .node-icon {
+    position: absolute;
+    top: 1px;
+    left: 2px;
+  }
+
+  .faded span {
+    color: var(--tb-graph-faded);
+  }
+  </style>
+  <template>
+    <div id="list-item"
+         on-mouseover="_nodeListener"
+         on-mouseout="_nodeListener"
+         on-click="_nodeListener">
+      <div class$="{{_fadedClass(itemRenderInfo)}}">
+        <tf-graph-icon class="node-icon" height="12"
+            color-by="[[colorBy]]" color-by-params="[[colorByParams]]"
+            node="[[itemNode]]" render-info="[[itemRenderInfo]]"
+            template-index="[[templateIndex]]"></tf-graph-icon>
+        <span title$="[[name]]">[[name]]</span>
+        <span class="edge-label">[[edgeLabel]]</span>
+      </div>
+    </div>
+  </template>
+
+  <script>
+    (function() {
+      Polymer({
+        is: 'tf-node-list-item',
+
+        properties: {
+          /**
+           * The Node for the card itself, on which this item is being drawn.
+           * @type {tf.graph.Node}
+           */
+          cardNode: Object,
+          /**
+           * The Node for the item within the card, somehow related to cardNode.
+           * @type {tf.graph.Node}
+           */
+          itemNode: Object,
+          /** The edge label associated with this item. */
+          edgeLabel: String,
+          /**
+           * The render node information for the item node. Used by the graph
+           * icon in determining fill color.
+           */
+          itemRenderInfo: Object,
+          name: String,
+          itemType: {
+            type: String,
+            observer: '_itemTypeChanged'
+          },
+          colorBy: String,
+          colorByParams: Object,
+          templateIndex: Function
+        },
+
+        _itemTypeChanged: function() {
+          if (this.itemType !== 'subnode') {
+            this.$['list-item'].classList.add('clickable');
+          } else {
+            this.$['list-item'].classList.remove('clickable');
+          }
+        },
+
+        _nodeListener: function(event) {
+          // fire node.click/mouseover/mouseout
+          this.fire('node-list-item-' + event.type, {
+            cardNode: this.cardNode.name,
+            nodeName: this.name,
+            type: this.itemType
+          });
+        },
+
+        _fadedClass: function(itemRenderInfo) {
+          return itemRenderInfo && itemRenderInfo.isFadedOut ? 'faded' : '';
+        }
+      });
+    })();
+  </script>
+</dom-module>
diff --git a/tensorflow/tensorboard/components/tf_graph_loader/BUILD b/tensorflow/tensorboard/components/tf_graph_loader/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..f163f12e9a0d45d7b4e90e40f1c88ae8078b1417
--- /dev/null
+++ b/tensorflow/tensorboard/components/tf_graph_loader/BUILD
@@ -0,0 +1,46 @@
+package(default_visibility = ["//tensorflow:internal"])
+
+load("@io_bazel_rules_closure//closure:defs.bzl", "web_library")
+load("//tensorflow/tensorboard:defs.bzl", "tensorboard_ts_library")
+load("//tensorflow/tensorboard:defs.bzl", "tensorboard_webcomponent_library")
+
+licenses(["notice"])  # Apache 2.0
+
+web_library(
+    name = "tf_graph_loader",
+    srcs = [
+        "tf-graph-loader.html",
+    ],
+    path = "/tf-graph-loader",
+    deps = [
+        "//tensorflow/tensorboard/components/tf_graph_common",
+        "@org_polymer",
+    ],
+)
+
+filegroup(
+    name = "all_files",
+    srcs = glob(["**"]),
+    tags = ["notsan"],
+)
+
+################################################################################
+# MARKED FOR DELETION
+
+tensorboard_webcomponent_library(
+    name = "legacy",
+    srcs = [
+        "tf-graph-loader.html",
+    ],
+    destdir = "tf-graph-loader",
+    deps = [
+        "//tensorflow/tensorboard/components/tf_graph_common:legacy",
+    ],
+)
+
+# This is needed despite how this component lacks TypeScript files because
+# components/BUILD seeks a legacy_ts rule in this package.
+tensorboard_ts_library(
+    name = "legacy_ts",
+    srcs = [],
+)
diff --git a/tensorflow/tensorboard/components/tf_graph_loader/demo/BUILD b/tensorflow/tensorboard/components/tf_graph_loader/demo/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..b2fc04b2ebdaa16e72aa7881fecd91e87b3a87d5
--- /dev/null
+++ b/tensorflow/tensorboard/components/tf_graph_loader/demo/BUILD
@@ -0,0 +1,24 @@
+package(default_visibility = ["//tensorflow:internal"])
+
+load("@io_bazel_rules_closure//closure:defs.bzl", "web_library")
+
+licenses(["notice"])  # Apache 2.0
+
+# bazel run //third_party/tensorflow/tensorboard/components/tf_graph_loader/demo
+web_library(
+    name = "demo",
+    srcs = ["index.html"] + glob(["data/**"]),
+    path = "/tf-graph-loader/demo",
+    deps = [
+        "//tensorflow/tensorboard/components/tf_graph_loader",
+        "@org_polymer_iron_demo_helpers",
+        "@org_polymer_paper_styles",
+        "@org_polymer_webcomponentsjs",
+    ],
+)
+
+filegroup(
+    name = "all_files",
+    srcs = glob(["**"]),
+    tags = ["notsan"],
+)
diff --git a/tensorflow/tensorboard/components/tf_graph_loader/demo/data/graph.pbtxt b/tensorflow/tensorboard/components/tf_graph_loader/demo/data/graph.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..30b206453469801d31b46856c29cdda78164f18f
--- /dev/null
+++ b/tensorflow/tensorboard/components/tf_graph_loader/demo/data/graph.pbtxt
@@ -0,0 +1,4606 @@
+node {
+  name: "GradientDescent/learning_rate"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_3"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: 0.1
+      }
+    }
+  }
+}
+node {
+  name: "gradients/add_grad/Shape_1"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 100
+      }
+    }
+  }
+}
+node {
+  name: "gradients/add_grad/Shape"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 2
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\310\000\000\000d\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "gradients/add_grad/BroadcastGradientArgs"
+  op: "BroadcastGradientArgs"
+  input: "gradients/add_grad/Shape"
+  input: "gradients/add_grad/Shape_1"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+        }
+        shape {
+          dim {
+            size: -1
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "gradients/add_1_grad/Shape_1"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 10
+      }
+    }
+  }
+}
+node {
+  name: "gradients/add_1_grad/Shape"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 2
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\310\000\000\000\n\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "gradients/add_1_grad/BroadcastGradientArgs"
+  op: "BroadcastGradientArgs"
+  input: "gradients/add_1_grad/Shape"
+  input: "gradients/add_1_grad/Shape_1"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+        }
+        shape {
+          dim {
+            size: -1
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "gradients/Reshape_1_grad/Shape"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 2
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\310\000\000\000\n\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "gradients/SoftmaxCrossEntropyWithLogits_grad/ExpandDims/dim"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: -1
+      }
+    }
+  }
+}
+node {
+  name: "gradients/Reshape_3_grad/Shape"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 200
+      }
+    }
+  }
+}
+node {
+  name: "gradients/Mean_grad/Maximum/y"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "gradients/Mean_grad/Const_1"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "gradients/Mean_grad/Const"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "gradients/Mean_grad/Shape_1"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "gradients/Mean_grad/Prod_1"
+  op: "Prod"
+  input: "gradients/Mean_grad/Shape_1"
+  input: "gradients/Mean_grad/Const_1"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+  attr {
+    key: "keep_dims"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "gradients/Mean_grad/Maximum"
+  op: "Maximum"
+  input: "gradients/Mean_grad/Prod_1"
+  input: "gradients/Mean_grad/Maximum/y"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+}
+node {
+  name: "gradients/Mean_grad/Shape"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 200
+      }
+    }
+  }
+}
+node {
+  name: "gradients/Mean_grad/Prod"
+  op: "Prod"
+  input: "gradients/Mean_grad/Shape"
+  input: "gradients/Mean_grad/Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+  attr {
+    key: "keep_dims"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "gradients/Mean_grad/floordiv"
+  op: "FloorDiv"
+  input: "gradients/Mean_grad/Prod"
+  input: "gradients/Mean_grad/Maximum"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+}
+node {
+  name: "gradients/Mean_grad/Cast"
+  op: "Cast"
+  input: "gradients/Mean_grad/floordiv"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "DstT"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "SrcT"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+}
+node {
+  name: "gradients/Mean_grad/Tile/multiples"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 200
+      }
+    }
+  }
+}
+node {
+  name: "gradients/Mean_grad/Reshape/shape"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "gradients/Const"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "gradients/Shape"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "gradients/Fill"
+  op: "Fill"
+  input: "gradients/Shape"
+  input: "gradients/Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+}
+node {
+  name: "gradients/Mean_grad/Reshape"
+  op: "Reshape"
+  input: "gradients/Fill"
+  input: "gradients/Mean_grad/Reshape/shape"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "gradients/Mean_grad/Tile"
+  op: "Tile"
+  input: "gradients/Mean_grad/Reshape"
+  input: "gradients/Mean_grad/Tile/multiples"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tmultiples"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "gradients/Mean_grad/truediv"
+  op: "RealDiv"
+  input: "gradients/Mean_grad/Tile"
+  input: "gradients/Mean_grad/Cast"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "gradients/Reshape_3_grad/Reshape"
+  op: "Reshape"
+  input: "gradients/Mean_grad/truediv"
+  input: "gradients/Reshape_3_grad/Shape"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "gradients/SoftmaxCrossEntropyWithLogits_grad/ExpandDims"
+  op: "ExpandDims"
+  input: "gradients/Reshape_3_grad/Reshape"
+  input: "gradients/SoftmaxCrossEntropyWithLogits_grad/ExpandDims/dim"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "Const"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "Slice_2/begin"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "Sub_2/y"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "concat_1/axis"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "concat_1/values_0"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: -1
+      }
+    }
+  }
+}
+node {
+  name: "Slice_1/size"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "Sub_1/y"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "Shape_2"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 2
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\310\000\000\000\n\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "Rank_2"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 2
+      }
+    }
+  }
+}
+node {
+  name: "Sub_1"
+  op: "Sub"
+  input: "Rank_2"
+  input: "Sub_1/y"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+}
+node {
+  name: "Slice_1/begin"
+  op: "Pack"
+  input: "Sub_1"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "N"
+    value {
+      i: 1
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "Slice_1"
+  op: "Slice"
+  input: "Shape_2"
+  input: "Slice_1/begin"
+  input: "Slice_1/size"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "concat_1"
+  op: "ConcatV2"
+  input: "concat_1/values_0"
+  input: "Slice_1"
+  input: "concat_1/axis"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 2
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "concat/axis"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "concat/values_0"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: -1
+      }
+    }
+  }
+}
+node {
+  name: "Slice/size"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "Sub/y"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "Shape_1"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 2
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\310\000\000\000\n\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "Rank_1"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 2
+      }
+    }
+  }
+}
+node {
+  name: "Sub"
+  op: "Sub"
+  input: "Rank_1"
+  input: "Sub/y"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+}
+node {
+  name: "Slice/begin"
+  op: "Pack"
+  input: "Sub"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "N"
+    value {
+      i: 1
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "Slice"
+  op: "Slice"
+  input: "Shape_1"
+  input: "Slice/begin"
+  input: "Slice/size"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "concat"
+  op: "ConcatV2"
+  input: "concat/values_0"
+  input: "Slice"
+  input: "concat/axis"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 2
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "Shape"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 2
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\310\000\000\000\n\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "Rank"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 2
+      }
+    }
+  }
+}
+node {
+  name: "Sub_2"
+  op: "Sub"
+  input: "Rank"
+  input: "Sub_2/y"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+}
+node {
+  name: "Slice_2/size"
+  op: "Pack"
+  input: "Sub_2"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "N"
+    value {
+      i: 1
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "Slice_2"
+  op: "Slice"
+  input: "Shape"
+  input: "Slice_2/begin"
+  input: "Slice_2/size"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "logits_biases"
+  op: "VariableV2"
+  device: "/job:localhost/replica:0/task:0/cpu:0"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@logits_biases"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 10
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "container"
+    value {
+      s: ""
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "shape"
+    value {
+      shape {
+        dim {
+          size: 10
+        }
+      }
+    }
+  }
+  attr {
+    key: "shared_name"
+    value {
+      s: ""
+    }
+  }
+}
+node {
+  name: "logits_biases/read"
+  op: "Identity"
+  input: "logits_biases"
+  device: "/job:localhost/replica:0/task:0/cpu:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@logits_biases"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 10
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "logits_weights"
+  op: "VariableV2"
+  device: "/job:localhost/replica:0/task:0/cpu:0"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@logits_weights"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 100
+          }
+          dim {
+            size: 10
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "container"
+    value {
+      s: ""
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "shape"
+    value {
+      shape {
+        dim {
+          size: 100
+        }
+        dim {
+          size: 10
+        }
+      }
+    }
+  }
+  attr {
+    key: "shared_name"
+    value {
+      s: ""
+    }
+  }
+}
+node {
+  name: "logits_weights/read"
+  op: "Identity"
+  input: "logits_weights"
+  device: "/job:localhost/replica:0/task:0/cpu:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@logits_weights"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 100
+          }
+          dim {
+            size: 10
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "hidden_biases"
+  op: "VariableV2"
+  device: "/job:localhost/replica:0/task:0/cpu:0"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@hidden_biases"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 100
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "container"
+    value {
+      s: ""
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "shape"
+    value {
+      shape {
+        dim {
+          size: 100
+        }
+      }
+    }
+  }
+  attr {
+    key: "shared_name"
+    value {
+      s: ""
+    }
+  }
+}
+node {
+  name: "hidden_biases/read"
+  op: "Identity"
+  input: "hidden_biases"
+  device: "/job:localhost/replica:0/task:0/cpu:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@hidden_biases"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 100
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "hidden_weights"
+  op: "VariableV2"
+  device: "/job:localhost/replica:0/task:0/cpu:0"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@hidden_weights"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 784
+          }
+          dim {
+            size: 100
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "container"
+    value {
+      s: ""
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "shape"
+    value {
+      shape {
+        dim {
+          size: 784
+        }
+        dim {
+          size: 100
+        }
+      }
+    }
+  }
+  attr {
+    key: "shared_name"
+    value {
+      s: ""
+    }
+  }
+}
+node {
+  name: "hidden_weights/read"
+  op: "Identity"
+  input: "hidden_weights"
+  device: "/job:localhost/replica:0/task:0/cpu:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@hidden_weights"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 784
+          }
+          dim {
+            size: 100
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "Reshape/shape"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/cpu:0"
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 2
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\310\000\000\000\377\377\377\377"
+      }
+    }
+  }
+}
+node {
+  name: "mnist_dataset_train_2/one_hot/depth"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/cpu:0"
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 10
+      }
+    }
+  }
+}
+node {
+  name: "mnist_dataset_train_2/one_hot/off_value"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/cpu:0"
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "mnist_dataset_train_2/one_hot/on_value"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/cpu:0"
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "mnist_dataset_train_2/random_shuffle_queue_DequeueMany/n"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/cpu:0"
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 200
+      }
+    }
+  }
+}
+node {
+  name: "mnist_dataset_train_1/random_shuffle_queue"
+  op: "RandomShuffleQueueV2"
+  device: "/job:localhost/replica:0/task:0/cpu:0"
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+  attr {
+    key: "capacity"
+    value {
+      i: 20000
+    }
+  }
+  attr {
+    key: "component_types"
+    value {
+      list {
+        type: DT_FLOAT
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    key: "container"
+    value {
+      s: ""
+    }
+  }
+  attr {
+    key: "min_after_dequeue"
+    value {
+      i: 4000
+    }
+  }
+  attr {
+    key: "seed"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "seed2"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 28
+          }
+          dim {
+            size: 28
+          }
+          dim {
+            size: 1
+          }
+        }
+        shape {
+        }
+      }
+    }
+  }
+  attr {
+    key: "shared_name"
+    value {
+      s: ""
+    }
+  }
+}
+node {
+  name: "mnist_dataset_train_2/random_shuffle_queue_DequeueMany"
+  op: "QueueDequeueManyV2"
+  input: "mnist_dataset_train_1/random_shuffle_queue"
+  input: "mnist_dataset_train_2/random_shuffle_queue_DequeueMany/n"
+  device: "/job:localhost/replica:0/task:0/cpu:0"
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          unknown_rank: true
+        }
+        shape {
+          unknown_rank: true
+        }
+      }
+    }
+  }
+  attr {
+    key: "component_types"
+    value {
+      list {
+        type: DT_FLOAT
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    key: "timeout_ms"
+    value {
+      i: -1
+    }
+  }
+}
+node {
+  name: "Reshape"
+  op: "Reshape"
+  input: "mnist_dataset_train_2/random_shuffle_queue_DequeueMany"
+  input: "Reshape/shape"
+  device: "/job:localhost/replica:0/task:0/cpu:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+          dim {
+            size: -1
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "MatMul"
+  op: "MatMul"
+  input: "Reshape"
+  input: "hidden_weights/read"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+          dim {
+            size: 100
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "transpose_a"
+    value {
+      b: false
+    }
+  }
+  attr {
+    key: "transpose_b"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "add"
+  op: "Add"
+  input: "MatMul"
+  input: "hidden_biases/read"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+          dim {
+            size: 100
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "Relu"
+  op: "Relu"
+  input: "add"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+          dim {
+            size: 100
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "MatMul_1"
+  op: "MatMul"
+  input: "Relu"
+  input: "logits_weights/read"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+          dim {
+            size: 10
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "transpose_a"
+    value {
+      b: false
+    }
+  }
+  attr {
+    key: "transpose_b"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "add_1"
+  op: "Add"
+  input: "MatMul_1"
+  input: "logits_biases/read"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+          dim {
+            size: 10
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "Reshape_1"
+  op: "Reshape"
+  input: "add_1"
+  input: "concat"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+          dim {
+            size: 10
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "mnist_dataset_train_2/one_hot"
+  op: "OneHot"
+  input: "mnist_dataset_train_2/random_shuffle_queue_DequeueMany:1"
+  input: "mnist_dataset_train_2/one_hot/depth"
+  input: "mnist_dataset_train_2/one_hot/on_value"
+  input: "mnist_dataset_train_2/one_hot/off_value"
+  device: "/job:localhost/replica:0/task:0/cpu:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "TI"
+    value {
+      type: DT_INT64
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          unknown_rank: true
+        }
+      }
+    }
+  }
+  attr {
+    key: "axis"
+    value {
+      i: -1
+    }
+  }
+}
+node {
+  name: "Reshape_2"
+  op: "Reshape"
+  input: "mnist_dataset_train_2/one_hot"
+  input: "concat_1"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 10
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "SoftmaxCrossEntropyWithLogits"
+  op: "SoftmaxCrossEntropyWithLogits"
+  input: "Reshape_1"
+  input: "Reshape_2"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+        }
+        shape {
+          dim {
+            size: 200
+          }
+          dim {
+            size: 10
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "gradients/SoftmaxCrossEntropyWithLogits_grad/PreventGradient"
+  op: "PreventGradient"
+  input: "SoftmaxCrossEntropyWithLogits:1"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+          dim {
+            size: 10
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "message"
+    value {
+      s: "Currently there is no way to take the second derivative of softmax_cross_entropy_with_logits due to the fused  implementation\'s interaction with tf.gradients()"
+    }
+  }
+}
+node {
+  name: "gradients/SoftmaxCrossEntropyWithLogits_grad/mul"
+  op: "Mul"
+  input: "gradients/SoftmaxCrossEntropyWithLogits_grad/ExpandDims"
+  input: "gradients/SoftmaxCrossEntropyWithLogits_grad/PreventGradient"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+          dim {
+            size: 10
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "gradients/Reshape_1_grad/Reshape"
+  op: "Reshape"
+  input: "gradients/SoftmaxCrossEntropyWithLogits_grad/mul"
+  input: "gradients/Reshape_1_grad/Shape"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+          dim {
+            size: 10
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "gradients/add_1_grad/Sum_1"
+  op: "Sum"
+  input: "gradients/Reshape_1_grad/Reshape"
+  input: "gradients/add_1_grad/BroadcastGradientArgs:1"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 10
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "keep_dims"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "gradients/add_1_grad/Reshape_1"
+  op: "Reshape"
+  input: "gradients/add_1_grad/Sum_1"
+  input: "gradients/add_1_grad/Shape_1"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 10
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "gradients/add_1_grad/Sum"
+  op: "Sum"
+  input: "gradients/Reshape_1_grad/Reshape"
+  input: "gradients/add_1_grad/BroadcastGradientArgs"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+          dim {
+            size: 10
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "keep_dims"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "gradients/add_1_grad/Reshape"
+  op: "Reshape"
+  input: "gradients/add_1_grad/Sum"
+  input: "gradients/add_1_grad/Shape"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+          dim {
+            size: 10
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "gradients/add_1_grad/tuple/group_deps"
+  op: "NoOp"
+  input: "^gradients/add_1_grad/Reshape"
+  input: "^gradients/add_1_grad/Reshape_1"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+      }
+    }
+  }
+}
+node {
+  name: "gradients/add_1_grad/tuple/control_dependency_1"
+  op: "Identity"
+  input: "gradients/add_1_grad/Reshape_1"
+  input: "^gradients/add_1_grad/tuple/group_deps"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@gradients/add_1_grad/Reshape_1"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 10
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "GradientDescent/update_logits_biases/ApplyGradientDescent"
+  op: "ApplyGradientDescent"
+  input: "logits_biases"
+  input: "GradientDescent/learning_rate"
+  input: "gradients/add_1_grad/tuple/control_dependency_1"
+  device: "/job:localhost/replica:0/task:0/cpu:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@logits_biases"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 10
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "use_locking"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "gradients/add_1_grad/tuple/control_dependency"
+  op: "Identity"
+  input: "gradients/add_1_grad/Reshape"
+  input: "^gradients/add_1_grad/tuple/group_deps"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@gradients/add_1_grad/Reshape"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+          dim {
+            size: 10
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "gradients/MatMul_1_grad/MatMul_1"
+  op: "MatMul"
+  input: "Relu"
+  input: "gradients/add_1_grad/tuple/control_dependency"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 100
+          }
+          dim {
+            size: 10
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "transpose_a"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "transpose_b"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "gradients/MatMul_1_grad/MatMul"
+  op: "MatMul"
+  input: "gradients/add_1_grad/tuple/control_dependency"
+  input: "logits_weights/read"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+          dim {
+            size: 100
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "transpose_a"
+    value {
+      b: false
+    }
+  }
+  attr {
+    key: "transpose_b"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "gradients/MatMul_1_grad/tuple/group_deps"
+  op: "NoOp"
+  input: "^gradients/MatMul_1_grad/MatMul"
+  input: "^gradients/MatMul_1_grad/MatMul_1"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+      }
+    }
+  }
+}
+node {
+  name: "gradients/MatMul_1_grad/tuple/control_dependency_1"
+  op: "Identity"
+  input: "gradients/MatMul_1_grad/MatMul_1"
+  input: "^gradients/MatMul_1_grad/tuple/group_deps"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@gradients/MatMul_1_grad/MatMul_1"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 100
+          }
+          dim {
+            size: 10
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "GradientDescent/update_logits_weights/ApplyGradientDescent"
+  op: "ApplyGradientDescent"
+  input: "logits_weights"
+  input: "GradientDescent/learning_rate"
+  input: "gradients/MatMul_1_grad/tuple/control_dependency_1"
+  device: "/job:localhost/replica:0/task:0/cpu:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@logits_weights"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 100
+          }
+          dim {
+            size: 10
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "use_locking"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "gradients/MatMul_1_grad/tuple/control_dependency"
+  op: "Identity"
+  input: "gradients/MatMul_1_grad/MatMul"
+  input: "^gradients/MatMul_1_grad/tuple/group_deps"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@gradients/MatMul_1_grad/MatMul"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+          dim {
+            size: 100
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "gradients/Relu_grad/ReluGrad"
+  op: "ReluGrad"
+  input: "gradients/MatMul_1_grad/tuple/control_dependency"
+  input: "Relu"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+          dim {
+            size: 100
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "gradients/add_grad/Sum_1"
+  op: "Sum"
+  input: "gradients/Relu_grad/ReluGrad"
+  input: "gradients/add_grad/BroadcastGradientArgs:1"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 100
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "keep_dims"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "gradients/add_grad/Reshape_1"
+  op: "Reshape"
+  input: "gradients/add_grad/Sum_1"
+  input: "gradients/add_grad/Shape_1"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 100
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "gradients/add_grad/Sum"
+  op: "Sum"
+  input: "gradients/Relu_grad/ReluGrad"
+  input: "gradients/add_grad/BroadcastGradientArgs"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+          dim {
+            size: 100
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "keep_dims"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "gradients/add_grad/Reshape"
+  op: "Reshape"
+  input: "gradients/add_grad/Sum"
+  input: "gradients/add_grad/Shape"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+          dim {
+            size: 100
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "gradients/add_grad/tuple/group_deps"
+  op: "NoOp"
+  input: "^gradients/add_grad/Reshape"
+  input: "^gradients/add_grad/Reshape_1"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+      }
+    }
+  }
+}
+node {
+  name: "gradients/add_grad/tuple/control_dependency_1"
+  op: "Identity"
+  input: "gradients/add_grad/Reshape_1"
+  input: "^gradients/add_grad/tuple/group_deps"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@gradients/add_grad/Reshape_1"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 100
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "GradientDescent/update_hidden_biases/ApplyGradientDescent"
+  op: "ApplyGradientDescent"
+  input: "hidden_biases"
+  input: "GradientDescent/learning_rate"
+  input: "gradients/add_grad/tuple/control_dependency_1"
+  device: "/job:localhost/replica:0/task:0/cpu:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@hidden_biases"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 100
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "use_locking"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "gradients/add_grad/tuple/control_dependency"
+  op: "Identity"
+  input: "gradients/add_grad/Reshape"
+  input: "^gradients/add_grad/tuple/group_deps"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@gradients/add_grad/Reshape"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+          dim {
+            size: 100
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "gradients/MatMul_grad/MatMul_1"
+  op: "MatMul"
+  input: "Reshape"
+  input: "gradients/add_grad/tuple/control_dependency"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 100
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "transpose_a"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "transpose_b"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "gradients/MatMul_grad/MatMul"
+  op: "MatMul"
+  input: "gradients/add_grad/tuple/control_dependency"
+  input: "hidden_weights/read"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+          dim {
+            size: 784
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "transpose_a"
+    value {
+      b: false
+    }
+  }
+  attr {
+    key: "transpose_b"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "gradients/MatMul_grad/tuple/group_deps"
+  op: "NoOp"
+  input: "^gradients/MatMul_grad/MatMul"
+  input: "^gradients/MatMul_grad/MatMul_1"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+      }
+    }
+  }
+}
+node {
+  name: "gradients/MatMul_grad/tuple/control_dependency_1"
+  op: "Identity"
+  input: "gradients/MatMul_grad/MatMul_1"
+  input: "^gradients/MatMul_grad/tuple/group_deps"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@gradients/MatMul_grad/MatMul_1"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 100
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "GradientDescent/update_hidden_weights/ApplyGradientDescent"
+  op: "ApplyGradientDescent"
+  input: "hidden_weights"
+  input: "GradientDescent/learning_rate"
+  input: "gradients/MatMul_grad/tuple/control_dependency_1"
+  device: "/job:localhost/replica:0/task:0/cpu:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@hidden_weights"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 784
+          }
+          dim {
+            size: 100
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "use_locking"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "GradientDescent"
+  op: "NoOp"
+  input: "^GradientDescent/update_hidden_weights/ApplyGradientDescent"
+  input: "^GradientDescent/update_hidden_biases/ApplyGradientDescent"
+  input: "^GradientDescent/update_logits_weights/ApplyGradientDescent"
+  input: "^GradientDescent/update_logits_biases/ApplyGradientDescent"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_2"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+      }
+    }
+  }
+}
+node {
+  name: "Reshape_3"
+  op: "Reshape"
+  input: "SoftmaxCrossEntropyWithLogits"
+  input: "Slice_2"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "Mean"
+  op: "Mean"
+  input: "Reshape_3"
+  input: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+  attr {
+    key: "keep_dims"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "_send_Mean_0"
+  op: "_Send"
+  input: "Mean"
+  device: "/job:localhost/replica:0/task:0/cpu:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "client_terminated"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "recv_device"
+    value {
+      s: "/job:localhost/replica:0/task:0/cpu:0"
+    }
+  }
+  attr {
+    key: "send_device"
+    value {
+      s: "/job:localhost/replica:0/task:0/cpu:0"
+    }
+  }
+  attr {
+    key: "send_device_incarnation"
+    value {
+      i: -5924635994370253548
+    }
+  }
+  attr {
+    key: "tensor_name"
+    value {
+      s: "Mean:0"
+    }
+  }
+}
+library {
+}
+versions {
+  producer: 21
+}
diff --git a/tensorflow/tensorboard/components/tf_graph_loader/demo/index.html b/tensorflow/tensorboard/components/tf_graph_loader/demo/index.html
new file mode 100644
index 0000000000000000000000000000000000000000..2ffb2a1a59cba900252eec4169a93c4babbef094
--- /dev/null
+++ b/tensorflow/tensorboard/components/tf_graph_loader/demo/index.html
@@ -0,0 +1,75 @@
+<!doctype html>
+<!--
+@license
+Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+-->
+<script src="../../webcomponentsjs/webcomponents-lite.min.js"></script>
+<link rel="import" href="../tf-graph-loader.html">
+<link rel="import" href="../../iron-demo-helpers/demo-snippet.html">
+<title>TF Graph Loader Demo</title>
+<demo-snippet>
+  <template>
+    <dom-module id="tf-graph-loader-demo">
+      <template>
+        <tf-graph-loader
+            id="loader"
+            datasets="[[_datasets]]"
+            selected-dataset="[[_selectedDataset]]"
+            progress="{{_progress}}">
+        </tf-graph-loader>
+      </template>
+      <script>
+        "use strict";
+
+        Polymer({
+          is: "tf-graph-loader-demo",
+          properties: {
+            // We tell the graph loader to load a specific pbtxt file.
+            _datasets: {
+              type: Array,
+              value: [{
+                "name": "Graph with XLA Clusters Specified",
+                "path": "data/graph.pbtxt"
+              }],
+            },
+            _selectedDataset: {
+              type: Number,
+              value: 0,
+            },
+
+            // This property will be updated by the graph loader.
+            _progress: {
+              type: Object,
+            },
+          },
+          observers: [
+            '_progressUpdated(_progress)',
+          ],
+          _progressUpdated(progress) {
+            // console.log the progress.
+            console.log('Progress updated.', progress);
+
+            // The graph has loaded. console.log it.
+            if (progress.value == 100) {
+              console.log('graph', this.$.loader.outGraph);
+            }
+          },
+        });
+      </script>
+    </dom-module>
+    <!-- The graph loader lacks visual elements. -->
+    <tf-graph-loader-demo></tf-graph-loader-demo>
+  </template>
+</demo-snippet>
diff --git a/tensorflow/tensorboard/components/tf_graph_loader/tf-graph-loader.html b/tensorflow/tensorboard/components/tf_graph_loader/tf-graph-loader.html
index bdfb3aa2bfc4672eb2e5a8614ebf6fe14388e69a..8d59cbd2aacf4295fbfe3bfa12013b47c2c39285 100644
--- a/tensorflow/tensorboard/components/tf_graph_loader/tf-graph-loader.html
+++ b/tensorflow/tensorboard/components/tf_graph_loader/tf-graph-loader.html
@@ -16,6 +16,7 @@ limitations under the License.
 -->
 
 <link rel="import" href="../polymer/polymer.html">
+<link rel="import" href="../tf-graph-common/tf-graph-common.html">
 
 <!--
 An element which provides a filter parsing for pbtxt to graph output.
diff --git a/tensorflow/tensorboard/components/tf_graph_loader_d3v4/BUILD b/tensorflow/tensorboard/components/tf_graph_loader_d3v4/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..7e01811a578341605dbab7ed9f9fe5c88ede1d23
--- /dev/null
+++ b/tensorflow/tensorboard/components/tf_graph_loader_d3v4/BUILD
@@ -0,0 +1,25 @@
+package(default_visibility = ["//tensorflow:internal"])
+
+load("@io_bazel_rules_closure//closure:defs.bzl", "web_library")
+load("//tensorflow/tensorboard:defs.bzl", "tensorboard_ts_library")
+load("//tensorflow/tensorboard:defs.bzl", "tensorboard_webcomponent_library")
+
+licenses(["notice"])  # Apache 2.0
+
+web_library(
+    name = "tf_graph_loader_d3v4",
+    srcs = [
+        "tf-graph-loader.html",
+    ],
+    path = "/tf-graph-loader",
+    deps = [
+        "//tensorflow/tensorboard/components/tf_graph_common_d3v4",
+        "@org_polymer",
+    ],
+)
+
+filegroup(
+    name = "all_files",
+    srcs = glob(["**"]),
+    tags = ["notsan"],
+)
diff --git a/tensorflow/tensorboard/components/tf_graph_loader_d3v4/demo/BUILD b/tensorflow/tensorboard/components/tf_graph_loader_d3v4/demo/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..b2fc04b2ebdaa16e72aa7881fecd91e87b3a87d5
--- /dev/null
+++ b/tensorflow/tensorboard/components/tf_graph_loader_d3v4/demo/BUILD
@@ -0,0 +1,24 @@
+package(default_visibility = ["//tensorflow:internal"])
+
+load("@io_bazel_rules_closure//closure:defs.bzl", "web_library")
+
+licenses(["notice"])  # Apache 2.0
+
+# bazel run //third_party/tensorflow/tensorboard/components/tf_graph_loader/demo
+web_library(
+    name = "demo",
+    srcs = ["index.html"] + glob(["data/**"]),
+    path = "/tf-graph-loader/demo",
+    deps = [
+        "//tensorflow/tensorboard/components/tf_graph_loader",
+        "@org_polymer_iron_demo_helpers",
+        "@org_polymer_paper_styles",
+        "@org_polymer_webcomponentsjs",
+    ],
+)
+
+filegroup(
+    name = "all_files",
+    srcs = glob(["**"]),
+    tags = ["notsan"],
+)
diff --git a/tensorflow/tensorboard/components/tf_graph_loader_d3v4/demo/data/graph.pbtxt b/tensorflow/tensorboard/components/tf_graph_loader_d3v4/demo/data/graph.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..30b206453469801d31b46856c29cdda78164f18f
--- /dev/null
+++ b/tensorflow/tensorboard/components/tf_graph_loader_d3v4/demo/data/graph.pbtxt
@@ -0,0 +1,4606 @@
+node {
+  name: "GradientDescent/learning_rate"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_3"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: 0.1
+      }
+    }
+  }
+}
+node {
+  name: "gradients/add_grad/Shape_1"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 100
+      }
+    }
+  }
+}
+node {
+  name: "gradients/add_grad/Shape"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 2
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\310\000\000\000d\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "gradients/add_grad/BroadcastGradientArgs"
+  op: "BroadcastGradientArgs"
+  input: "gradients/add_grad/Shape"
+  input: "gradients/add_grad/Shape_1"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+        }
+        shape {
+          dim {
+            size: -1
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "gradients/add_1_grad/Shape_1"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 10
+      }
+    }
+  }
+}
+node {
+  name: "gradients/add_1_grad/Shape"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 2
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\310\000\000\000\n\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "gradients/add_1_grad/BroadcastGradientArgs"
+  op: "BroadcastGradientArgs"
+  input: "gradients/add_1_grad/Shape"
+  input: "gradients/add_1_grad/Shape_1"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+        }
+        shape {
+          dim {
+            size: -1
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "gradients/Reshape_1_grad/Shape"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 2
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\310\000\000\000\n\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "gradients/SoftmaxCrossEntropyWithLogits_grad/ExpandDims/dim"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: -1
+      }
+    }
+  }
+}
+node {
+  name: "gradients/Reshape_3_grad/Shape"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 200
+      }
+    }
+  }
+}
+node {
+  name: "gradients/Mean_grad/Maximum/y"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "gradients/Mean_grad/Const_1"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "gradients/Mean_grad/Const"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "gradients/Mean_grad/Shape_1"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "gradients/Mean_grad/Prod_1"
+  op: "Prod"
+  input: "gradients/Mean_grad/Shape_1"
+  input: "gradients/Mean_grad/Const_1"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+  attr {
+    key: "keep_dims"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "gradients/Mean_grad/Maximum"
+  op: "Maximum"
+  input: "gradients/Mean_grad/Prod_1"
+  input: "gradients/Mean_grad/Maximum/y"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+}
+node {
+  name: "gradients/Mean_grad/Shape"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 200
+      }
+    }
+  }
+}
+node {
+  name: "gradients/Mean_grad/Prod"
+  op: "Prod"
+  input: "gradients/Mean_grad/Shape"
+  input: "gradients/Mean_grad/Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+  attr {
+    key: "keep_dims"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "gradients/Mean_grad/floordiv"
+  op: "FloorDiv"
+  input: "gradients/Mean_grad/Prod"
+  input: "gradients/Mean_grad/Maximum"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+}
+node {
+  name: "gradients/Mean_grad/Cast"
+  op: "Cast"
+  input: "gradients/Mean_grad/floordiv"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "DstT"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "SrcT"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+}
+node {
+  name: "gradients/Mean_grad/Tile/multiples"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 200
+      }
+    }
+  }
+}
+node {
+  name: "gradients/Mean_grad/Reshape/shape"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "gradients/Const"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "gradients/Shape"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "gradients/Fill"
+  op: "Fill"
+  input: "gradients/Shape"
+  input: "gradients/Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+}
+node {
+  name: "gradients/Mean_grad/Reshape"
+  op: "Reshape"
+  input: "gradients/Fill"
+  input: "gradients/Mean_grad/Reshape/shape"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "gradients/Mean_grad/Tile"
+  op: "Tile"
+  input: "gradients/Mean_grad/Reshape"
+  input: "gradients/Mean_grad/Tile/multiples"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tmultiples"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "gradients/Mean_grad/truediv"
+  op: "RealDiv"
+  input: "gradients/Mean_grad/Tile"
+  input: "gradients/Mean_grad/Cast"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "gradients/Reshape_3_grad/Reshape"
+  op: "Reshape"
+  input: "gradients/Mean_grad/truediv"
+  input: "gradients/Reshape_3_grad/Shape"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "gradients/SoftmaxCrossEntropyWithLogits_grad/ExpandDims"
+  op: "ExpandDims"
+  input: "gradients/Reshape_3_grad/Reshape"
+  input: "gradients/SoftmaxCrossEntropyWithLogits_grad/ExpandDims/dim"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "Const"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "Slice_2/begin"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "Sub_2/y"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "concat_1/axis"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "concat_1/values_0"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: -1
+      }
+    }
+  }
+}
+node {
+  name: "Slice_1/size"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "Sub_1/y"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "Shape_2"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 2
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\310\000\000\000\n\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "Rank_2"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 2
+      }
+    }
+  }
+}
+node {
+  name: "Sub_1"
+  op: "Sub"
+  input: "Rank_2"
+  input: "Sub_1/y"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+}
+node {
+  name: "Slice_1/begin"
+  op: "Pack"
+  input: "Sub_1"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "N"
+    value {
+      i: 1
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "Slice_1"
+  op: "Slice"
+  input: "Shape_2"
+  input: "Slice_1/begin"
+  input: "Slice_1/size"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "concat_1"
+  op: "ConcatV2"
+  input: "concat_1/values_0"
+  input: "Slice_1"
+  input: "concat_1/axis"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 2
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "concat/axis"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "concat/values_0"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: -1
+      }
+    }
+  }
+}
+node {
+  name: "Slice/size"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "Sub/y"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "Shape_1"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 2
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\310\000\000\000\n\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "Rank_1"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 2
+      }
+    }
+  }
+}
+node {
+  name: "Sub"
+  op: "Sub"
+  input: "Rank_1"
+  input: "Sub/y"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+}
+node {
+  name: "Slice/begin"
+  op: "Pack"
+  input: "Sub"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "N"
+    value {
+      i: 1
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "Slice"
+  op: "Slice"
+  input: "Shape_1"
+  input: "Slice/begin"
+  input: "Slice/size"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "concat"
+  op: "ConcatV2"
+  input: "concat/values_0"
+  input: "Slice"
+  input: "concat/axis"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 2
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "Shape"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 2
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\310\000\000\000\n\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "Rank"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 2
+      }
+    }
+  }
+}
+node {
+  name: "Sub_2"
+  op: "Sub"
+  input: "Rank"
+  input: "Sub_2/y"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+}
+node {
+  name: "Slice_2/size"
+  op: "Pack"
+  input: "Sub_2"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "N"
+    value {
+      i: 1
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "Slice_2"
+  op: "Slice"
+  input: "Shape"
+  input: "Slice_2/begin"
+  input: "Slice_2/size"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "logits_biases"
+  op: "VariableV2"
+  device: "/job:localhost/replica:0/task:0/cpu:0"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@logits_biases"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 10
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "container"
+    value {
+      s: ""
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "shape"
+    value {
+      shape {
+        dim {
+          size: 10
+        }
+      }
+    }
+  }
+  attr {
+    key: "shared_name"
+    value {
+      s: ""
+    }
+  }
+}
+node {
+  name: "logits_biases/read"
+  op: "Identity"
+  input: "logits_biases"
+  device: "/job:localhost/replica:0/task:0/cpu:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@logits_biases"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 10
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "logits_weights"
+  op: "VariableV2"
+  device: "/job:localhost/replica:0/task:0/cpu:0"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@logits_weights"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 100
+          }
+          dim {
+            size: 10
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "container"
+    value {
+      s: ""
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "shape"
+    value {
+      shape {
+        dim {
+          size: 100
+        }
+        dim {
+          size: 10
+        }
+      }
+    }
+  }
+  attr {
+    key: "shared_name"
+    value {
+      s: ""
+    }
+  }
+}
+node {
+  name: "logits_weights/read"
+  op: "Identity"
+  input: "logits_weights"
+  device: "/job:localhost/replica:0/task:0/cpu:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@logits_weights"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 100
+          }
+          dim {
+            size: 10
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "hidden_biases"
+  op: "VariableV2"
+  device: "/job:localhost/replica:0/task:0/cpu:0"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@hidden_biases"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 100
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "container"
+    value {
+      s: ""
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "shape"
+    value {
+      shape {
+        dim {
+          size: 100
+        }
+      }
+    }
+  }
+  attr {
+    key: "shared_name"
+    value {
+      s: ""
+    }
+  }
+}
+node {
+  name: "hidden_biases/read"
+  op: "Identity"
+  input: "hidden_biases"
+  device: "/job:localhost/replica:0/task:0/cpu:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@hidden_biases"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 100
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "hidden_weights"
+  op: "VariableV2"
+  device: "/job:localhost/replica:0/task:0/cpu:0"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@hidden_weights"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 784
+          }
+          dim {
+            size: 100
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "container"
+    value {
+      s: ""
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "shape"
+    value {
+      shape {
+        dim {
+          size: 784
+        }
+        dim {
+          size: 100
+        }
+      }
+    }
+  }
+  attr {
+    key: "shared_name"
+    value {
+      s: ""
+    }
+  }
+}
+node {
+  name: "hidden_weights/read"
+  op: "Identity"
+  input: "hidden_weights"
+  device: "/job:localhost/replica:0/task:0/cpu:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@hidden_weights"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 784
+          }
+          dim {
+            size: 100
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "Reshape/shape"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/cpu:0"
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 2
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\310\000\000\000\377\377\377\377"
+      }
+    }
+  }
+}
+node {
+  name: "mnist_dataset_train_2/one_hot/depth"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/cpu:0"
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 10
+      }
+    }
+  }
+}
+node {
+  name: "mnist_dataset_train_2/one_hot/off_value"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/cpu:0"
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "mnist_dataset_train_2/one_hot/on_value"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/cpu:0"
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "mnist_dataset_train_2/random_shuffle_queue_DequeueMany/n"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/cpu:0"
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 200
+      }
+    }
+  }
+}
+node {
+  name: "mnist_dataset_train_1/random_shuffle_queue"
+  op: "RandomShuffleQueueV2"
+  device: "/job:localhost/replica:0/task:0/cpu:0"
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+  attr {
+    key: "capacity"
+    value {
+      i: 20000
+    }
+  }
+  attr {
+    key: "component_types"
+    value {
+      list {
+        type: DT_FLOAT
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    key: "container"
+    value {
+      s: ""
+    }
+  }
+  attr {
+    key: "min_after_dequeue"
+    value {
+      i: 4000
+    }
+  }
+  attr {
+    key: "seed"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "seed2"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 28
+          }
+          dim {
+            size: 28
+          }
+          dim {
+            size: 1
+          }
+        }
+        shape {
+        }
+      }
+    }
+  }
+  attr {
+    key: "shared_name"
+    value {
+      s: ""
+    }
+  }
+}
+node {
+  name: "mnist_dataset_train_2/random_shuffle_queue_DequeueMany"
+  op: "QueueDequeueManyV2"
+  input: "mnist_dataset_train_1/random_shuffle_queue"
+  input: "mnist_dataset_train_2/random_shuffle_queue_DequeueMany/n"
+  device: "/job:localhost/replica:0/task:0/cpu:0"
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          unknown_rank: true
+        }
+        shape {
+          unknown_rank: true
+        }
+      }
+    }
+  }
+  attr {
+    key: "component_types"
+    value {
+      list {
+        type: DT_FLOAT
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    key: "timeout_ms"
+    value {
+      i: -1
+    }
+  }
+}
+node {
+  name: "Reshape"
+  op: "Reshape"
+  input: "mnist_dataset_train_2/random_shuffle_queue_DequeueMany"
+  input: "Reshape/shape"
+  device: "/job:localhost/replica:0/task:0/cpu:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+          dim {
+            size: -1
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "MatMul"
+  op: "MatMul"
+  input: "Reshape"
+  input: "hidden_weights/read"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+          dim {
+            size: 100
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "transpose_a"
+    value {
+      b: false
+    }
+  }
+  attr {
+    key: "transpose_b"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "add"
+  op: "Add"
+  input: "MatMul"
+  input: "hidden_biases/read"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+          dim {
+            size: 100
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "Relu"
+  op: "Relu"
+  input: "add"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+          dim {
+            size: 100
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "MatMul_1"
+  op: "MatMul"
+  input: "Relu"
+  input: "logits_weights/read"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+          dim {
+            size: 10
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "transpose_a"
+    value {
+      b: false
+    }
+  }
+  attr {
+    key: "transpose_b"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "add_1"
+  op: "Add"
+  input: "MatMul_1"
+  input: "logits_biases/read"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+          dim {
+            size: 10
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "Reshape_1"
+  op: "Reshape"
+  input: "add_1"
+  input: "concat"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+          dim {
+            size: 10
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "mnist_dataset_train_2/one_hot"
+  op: "OneHot"
+  input: "mnist_dataset_train_2/random_shuffle_queue_DequeueMany:1"
+  input: "mnist_dataset_train_2/one_hot/depth"
+  input: "mnist_dataset_train_2/one_hot/on_value"
+  input: "mnist_dataset_train_2/one_hot/off_value"
+  device: "/job:localhost/replica:0/task:0/cpu:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "TI"
+    value {
+      type: DT_INT64
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          unknown_rank: true
+        }
+      }
+    }
+  }
+  attr {
+    key: "axis"
+    value {
+      i: -1
+    }
+  }
+}
+node {
+  name: "Reshape_2"
+  op: "Reshape"
+  input: "mnist_dataset_train_2/one_hot"
+  input: "concat_1"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 10
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "SoftmaxCrossEntropyWithLogits"
+  op: "SoftmaxCrossEntropyWithLogits"
+  input: "Reshape_1"
+  input: "Reshape_2"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+        }
+        shape {
+          dim {
+            size: 200
+          }
+          dim {
+            size: 10
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "gradients/SoftmaxCrossEntropyWithLogits_grad/PreventGradient"
+  op: "PreventGradient"
+  input: "SoftmaxCrossEntropyWithLogits:1"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+          dim {
+            size: 10
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "message"
+    value {
+      s: "Currently there is no way to take the second derivative of softmax_cross_entropy_with_logits due to the fused  implementation\'s interaction with tf.gradients()"
+    }
+  }
+}
+node {
+  name: "gradients/SoftmaxCrossEntropyWithLogits_grad/mul"
+  op: "Mul"
+  input: "gradients/SoftmaxCrossEntropyWithLogits_grad/ExpandDims"
+  input: "gradients/SoftmaxCrossEntropyWithLogits_grad/PreventGradient"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+          dim {
+            size: 10
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "gradients/Reshape_1_grad/Reshape"
+  op: "Reshape"
+  input: "gradients/SoftmaxCrossEntropyWithLogits_grad/mul"
+  input: "gradients/Reshape_1_grad/Shape"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+          dim {
+            size: 10
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "gradients/add_1_grad/Sum_1"
+  op: "Sum"
+  input: "gradients/Reshape_1_grad/Reshape"
+  input: "gradients/add_1_grad/BroadcastGradientArgs:1"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 10
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "keep_dims"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "gradients/add_1_grad/Reshape_1"
+  op: "Reshape"
+  input: "gradients/add_1_grad/Sum_1"
+  input: "gradients/add_1_grad/Shape_1"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 10
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "gradients/add_1_grad/Sum"
+  op: "Sum"
+  input: "gradients/Reshape_1_grad/Reshape"
+  input: "gradients/add_1_grad/BroadcastGradientArgs"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+          dim {
+            size: 10
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "keep_dims"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "gradients/add_1_grad/Reshape"
+  op: "Reshape"
+  input: "gradients/add_1_grad/Sum"
+  input: "gradients/add_1_grad/Shape"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+          dim {
+            size: 10
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "gradients/add_1_grad/tuple/group_deps"
+  op: "NoOp"
+  input: "^gradients/add_1_grad/Reshape"
+  input: "^gradients/add_1_grad/Reshape_1"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+      }
+    }
+  }
+}
+node {
+  name: "gradients/add_1_grad/tuple/control_dependency_1"
+  op: "Identity"
+  input: "gradients/add_1_grad/Reshape_1"
+  input: "^gradients/add_1_grad/tuple/group_deps"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@gradients/add_1_grad/Reshape_1"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 10
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "GradientDescent/update_logits_biases/ApplyGradientDescent"
+  op: "ApplyGradientDescent"
+  input: "logits_biases"
+  input: "GradientDescent/learning_rate"
+  input: "gradients/add_1_grad/tuple/control_dependency_1"
+  device: "/job:localhost/replica:0/task:0/cpu:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@logits_biases"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 10
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "use_locking"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "gradients/add_1_grad/tuple/control_dependency"
+  op: "Identity"
+  input: "gradients/add_1_grad/Reshape"
+  input: "^gradients/add_1_grad/tuple/group_deps"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@gradients/add_1_grad/Reshape"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+          dim {
+            size: 10
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "gradients/MatMul_1_grad/MatMul_1"
+  op: "MatMul"
+  input: "Relu"
+  input: "gradients/add_1_grad/tuple/control_dependency"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 100
+          }
+          dim {
+            size: 10
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "transpose_a"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "transpose_b"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "gradients/MatMul_1_grad/MatMul"
+  op: "MatMul"
+  input: "gradients/add_1_grad/tuple/control_dependency"
+  input: "logits_weights/read"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+          dim {
+            size: 100
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "transpose_a"
+    value {
+      b: false
+    }
+  }
+  attr {
+    key: "transpose_b"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "gradients/MatMul_1_grad/tuple/group_deps"
+  op: "NoOp"
+  input: "^gradients/MatMul_1_grad/MatMul"
+  input: "^gradients/MatMul_1_grad/MatMul_1"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+      }
+    }
+  }
+}
+node {
+  name: "gradients/MatMul_1_grad/tuple/control_dependency_1"
+  op: "Identity"
+  input: "gradients/MatMul_1_grad/MatMul_1"
+  input: "^gradients/MatMul_1_grad/tuple/group_deps"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@gradients/MatMul_1_grad/MatMul_1"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 100
+          }
+          dim {
+            size: 10
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "GradientDescent/update_logits_weights/ApplyGradientDescent"
+  op: "ApplyGradientDescent"
+  input: "logits_weights"
+  input: "GradientDescent/learning_rate"
+  input: "gradients/MatMul_1_grad/tuple/control_dependency_1"
+  device: "/job:localhost/replica:0/task:0/cpu:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@logits_weights"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 100
+          }
+          dim {
+            size: 10
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "use_locking"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "gradients/MatMul_1_grad/tuple/control_dependency"
+  op: "Identity"
+  input: "gradients/MatMul_1_grad/MatMul"
+  input: "^gradients/MatMul_1_grad/tuple/group_deps"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@gradients/MatMul_1_grad/MatMul"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+          dim {
+            size: 100
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "gradients/Relu_grad/ReluGrad"
+  op: "ReluGrad"
+  input: "gradients/MatMul_1_grad/tuple/control_dependency"
+  input: "Relu"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+          dim {
+            size: 100
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "gradients/add_grad/Sum_1"
+  op: "Sum"
+  input: "gradients/Relu_grad/ReluGrad"
+  input: "gradients/add_grad/BroadcastGradientArgs:1"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 100
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "keep_dims"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "gradients/add_grad/Reshape_1"
+  op: "Reshape"
+  input: "gradients/add_grad/Sum_1"
+  input: "gradients/add_grad/Shape_1"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 100
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "gradients/add_grad/Sum"
+  op: "Sum"
+  input: "gradients/Relu_grad/ReluGrad"
+  input: "gradients/add_grad/BroadcastGradientArgs"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+          dim {
+            size: 100
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "keep_dims"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "gradients/add_grad/Reshape"
+  op: "Reshape"
+  input: "gradients/add_grad/Sum"
+  input: "gradients/add_grad/Shape"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+          dim {
+            size: 100
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "gradients/add_grad/tuple/group_deps"
+  op: "NoOp"
+  input: "^gradients/add_grad/Reshape"
+  input: "^gradients/add_grad/Reshape_1"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+      }
+    }
+  }
+}
+node {
+  name: "gradients/add_grad/tuple/control_dependency_1"
+  op: "Identity"
+  input: "gradients/add_grad/Reshape_1"
+  input: "^gradients/add_grad/tuple/group_deps"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@gradients/add_grad/Reshape_1"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 100
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "GradientDescent/update_hidden_biases/ApplyGradientDescent"
+  op: "ApplyGradientDescent"
+  input: "hidden_biases"
+  input: "GradientDescent/learning_rate"
+  input: "gradients/add_grad/tuple/control_dependency_1"
+  device: "/job:localhost/replica:0/task:0/cpu:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@hidden_biases"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 100
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "use_locking"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "gradients/add_grad/tuple/control_dependency"
+  op: "Identity"
+  input: "gradients/add_grad/Reshape"
+  input: "^gradients/add_grad/tuple/group_deps"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@gradients/add_grad/Reshape"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+          dim {
+            size: 100
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "gradients/MatMul_grad/MatMul_1"
+  op: "MatMul"
+  input: "Reshape"
+  input: "gradients/add_grad/tuple/control_dependency"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 100
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "transpose_a"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "transpose_b"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "gradients/MatMul_grad/MatMul"
+  op: "MatMul"
+  input: "gradients/add_grad/tuple/control_dependency"
+  input: "hidden_weights/read"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+          dim {
+            size: 784
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "transpose_a"
+    value {
+      b: false
+    }
+  }
+  attr {
+    key: "transpose_b"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "gradients/MatMul_grad/tuple/group_deps"
+  op: "NoOp"
+  input: "^gradients/MatMul_grad/MatMul"
+  input: "^gradients/MatMul_grad/MatMul_1"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+      }
+    }
+  }
+}
+node {
+  name: "gradients/MatMul_grad/tuple/control_dependency_1"
+  op: "Identity"
+  input: "gradients/MatMul_grad/MatMul_1"
+  input: "^gradients/MatMul_grad/tuple/group_deps"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@gradients/MatMul_grad/MatMul_1"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 100
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "GradientDescent/update_hidden_weights/ApplyGradientDescent"
+  op: "ApplyGradientDescent"
+  input: "hidden_weights"
+  input: "GradientDescent/learning_rate"
+  input: "gradients/MatMul_grad/tuple/control_dependency_1"
+  device: "/job:localhost/replica:0/task:0/cpu:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@hidden_weights"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 784
+          }
+          dim {
+            size: 100
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "use_locking"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "GradientDescent"
+  op: "NoOp"
+  input: "^GradientDescent/update_hidden_weights/ApplyGradientDescent"
+  input: "^GradientDescent/update_hidden_biases/ApplyGradientDescent"
+  input: "^GradientDescent/update_logits_weights/ApplyGradientDescent"
+  input: "^GradientDescent/update_logits_biases/ApplyGradientDescent"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_2"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+      }
+    }
+  }
+}
+node {
+  name: "Reshape_3"
+  op: "Reshape"
+  input: "SoftmaxCrossEntropyWithLogits"
+  input: "Slice_2"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "Mean"
+  op: "Mean"
+  input: "Reshape_3"
+  input: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+  attr {
+    key: "keep_dims"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "_send_Mean_0"
+  op: "_Send"
+  input: "Mean"
+  device: "/job:localhost/replica:0/task:0/cpu:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "client_terminated"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "recv_device"
+    value {
+      s: "/job:localhost/replica:0/task:0/cpu:0"
+    }
+  }
+  attr {
+    key: "send_device"
+    value {
+      s: "/job:localhost/replica:0/task:0/cpu:0"
+    }
+  }
+  attr {
+    key: "send_device_incarnation"
+    value {
+      i: -5924635994370253548
+    }
+  }
+  attr {
+    key: "tensor_name"
+    value {
+      s: "Mean:0"
+    }
+  }
+}
+library {
+}
+versions {
+  producer: 21
+}
diff --git a/tensorflow/tensorboard/components/tf_graph_loader_d3v4/demo/index.html b/tensorflow/tensorboard/components/tf_graph_loader_d3v4/demo/index.html
new file mode 100644
index 0000000000000000000000000000000000000000..2ffb2a1a59cba900252eec4169a93c4babbef094
--- /dev/null
+++ b/tensorflow/tensorboard/components/tf_graph_loader_d3v4/demo/index.html
@@ -0,0 +1,75 @@
+<!doctype html>
+<!--
+@license
+Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+-->
+<script src="../../webcomponentsjs/webcomponents-lite.min.js"></script>
+<link rel="import" href="../tf-graph-loader.html">
+<link rel="import" href="../../iron-demo-helpers/demo-snippet.html">
+<title>TF Graph Loader Demo</title>
+<demo-snippet>
+  <template>
+    <dom-module id="tf-graph-loader-demo">
+      <template>
+        <tf-graph-loader
+            id="loader"
+            datasets="[[_datasets]]"
+            selected-dataset="[[_selectedDataset]]"
+            progress="{{_progress}}">
+        </tf-graph-loader>
+      </template>
+      <script>
+        "use strict";
+
+        Polymer({
+          is: "tf-graph-loader-demo",
+          properties: {
+            // We tell the graph loader to load a specific pbtxt file.
+            _datasets: {
+              type: Array,
+              value: [{
+                "name": "Graph with XLA Clusters Specified",
+                "path": "data/graph.pbtxt"
+              }],
+            },
+            _selectedDataset: {
+              type: Number,
+              value: 0,
+            },
+
+            // This property will be updated by the graph loader.
+            _progress: {
+              type: Object,
+            },
+          },
+          observers: [
+            '_progressUpdated(_progress)',
+          ],
+          _progressUpdated(progress) {
+            // console.log the progress.
+            console.log('Progress updated.', progress);
+
+            // The graph has loaded. console.log it.
+            if (progress.value == 100) {
+              console.log('graph', this.$.loader.outGraph);
+            }
+          },
+        });
+      </script>
+    </dom-module>
+    <!-- The graph loader lacks visual elements. -->
+    <tf-graph-loader-demo></tf-graph-loader-demo>
+  </template>
+</demo-snippet>
diff --git a/tensorflow/tensorboard/components/tf_storage_d3v4/tests.html b/tensorflow/tensorboard/components/tf_graph_loader_d3v4/test/index.html
similarity index 85%
rename from tensorflow/tensorboard/components/tf_storage_d3v4/tests.html
rename to tensorflow/tensorboard/components/tf_graph_loader_d3v4/test/index.html
index 6d395b070281db464a34ffb3f20c9652ebdaf304..c8e2027f42aa25ef1c8e2d2c1f1aa68329181ebf 100644
--- a/tensorflow/tensorboard/components/tf_storage_d3v4/tests.html
+++ b/tensorflow/tensorboard/components/tf_graph_loader_d3v4/test/index.html
@@ -21,10 +21,10 @@ limitations under the License.
   <meta charset="utf-8">
   <script src="../../webcomponentsjs/webcomponents-lite.min.js"></script>
   <script src="../../web-component-tester/browser.js"></script>
-  <link rel="import" href="../../polymer/polymer.html">
-  <link rel="import" href="../tf-storage.html">
+  <link rel="import" href="../tf-graph-loader.html">
 </head>
 <body>
-  <script src="storageTests.js"></script>
+  <tf-graph-loader id="loader"></tf-graph-loader>
+  <script src="loader.js"></script>
 </body>
 </html>
diff --git a/tensorflow/tensorboard/components/tf_graph_loader_d3v4/test/loader.ts b/tensorflow/tensorboard/components/tf_graph_loader_d3v4/test/loader.ts
new file mode 100644
index 0000000000000000000000000000000000000000..fcd9f7b5295756f863a6a72428862142cc716fb3
--- /dev/null
+++ b/tensorflow/tensorboard/components/tf_graph_loader_d3v4/test/loader.ts
@@ -0,0 +1,25 @@
+/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the 'License');
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an 'AS IS' BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+suite('graph loader', () => {
+  let assert = chai.assert;
+
+  test('loader exists', () => {
+    assert.isTrue(document.getElementById('loader') != null);
+  });
+
+  // TODO(bp): write tests.
+
+});
diff --git a/tensorflow/tensorboard/components/tf_graph_loader_d3v4/tf-graph-loader.html b/tensorflow/tensorboard/components/tf_graph_loader_d3v4/tf-graph-loader.html
new file mode 100644
index 0000000000000000000000000000000000000000..8d59cbd2aacf4295fbfe3bfa12013b47c2c39285
--- /dev/null
+++ b/tensorflow/tensorboard/components/tf_graph_loader_d3v4/tf-graph-loader.html
@@ -0,0 +1,184 @@
+<!--
+@license
+Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+-->
+
+<link rel="import" href="../polymer/polymer.html">
+<link rel="import" href="../tf-graph-common/tf-graph-common.html">
+
+<!--
+An element which provides a filter parsing for pbtxt to graph output.
+-->
+<dom-module id="tf-graph-loader">
+</dom-module>
+
+<script>
+Polymer({
+
+  is: 'tf-graph-loader',
+
+  properties: {
+    /**
+     * @type {value: number, msg: string}
+     *
+     * A number between 0 and 100 denoting the % of progress
+     * for the progress bar and the displayed message.
+     */
+    progress: {
+      type: Object,
+      notify: true,
+    },
+    datasets: Array,
+    selectedDataset: Number,
+    selectedFile: {
+      type: Object,
+      observer: '_selectedFileChanged'
+    },
+    outGraphHierarchy: {
+      type: Object,
+      readOnly: true, //readonly so outsider can't change this via binding
+      notify: true
+    },
+    outGraph: {
+      type: Object,
+      readOnly: true, //readonly so outsider can't change this via binding
+      notify: true
+    },
+    outHierarchyParams: {
+      type: Object,
+      readOnly: true,
+      notify: true
+    },
+    outStats: {
+      type: Object,
+      readOnly: true, // This property produces data.
+      notify: true
+    }
+  },
+  observers: [
+    '_selectedDatasetChanged(selectedDataset, datasets)',
+    '_readAndParseMetadata(selectedMetadataTag)'
+  ],
+  _readAndParseMetadata: function(metadataIndex) {
+    if (metadataIndex == -1 || this.datasets[this.selectedDataset] == null ||
+        this.datasets[this.selectedDataset].runMetadata == null ||
+        this.datasets[this.selectedDataset].runMetadata[metadataIndex] == null) {
+      this._setOutStats(null);
+      return;
+    }
+    var path = this.datasets[this.selectedDataset].runMetadata[metadataIndex].path;
+    // Reset the progress bar to 0.
+    this.set('progress', {
+      value: 0,
+      msg: ''
+    });
+    var tracker = tf.graph.util.getTracker(this);
+    tf.graph.parser.fetchAndParseMetadata(path, tracker)
+    .then(function(stats) {
+      this._setOutStats(stats);
+    }.bind(this));
+  },
+  _parseAndConstructHierarchicalGraph: function(path, pbTxtFile) {
+    // Reset the progress bar to 0.
+    this.set('progress', {
+      value: 0,
+      msg: ''
+    });
+    var tracker = tf.graph.util.getTracker(this);
+    var hierarchyParams = {
+      verifyTemplate: true,
+      // If a set of numbered op nodes has at least this number of nodes
+      // then group them into a series node.
+      seriesNodeMinSize: 5,
+      // A map of series node names to series grouping settings, to indicate
+      // if a series is to be rendered as grouped or ungrouped.
+      // Starts out empty which allows the renderer to decide which series
+      // are initially rendered grouped and which aren't.
+      seriesMap: {},
+    };
+    this._setOutHierarchyParams(hierarchyParams);
+    var dataTracker = tf.graph.util.getSubtaskTracker(tracker, 30, 'Data');
+    tf.graph.parser.fetchAndParseGraphData(path, pbTxtFile, dataTracker)
+    .then(function(graph) {
+      if (!graph) {
+        throw 'The graph is empty. Make sure that the graph is passed to the ' +
+            'SummaryWriter after the graph is defined.';
+      }
+
+      // Build the flat graph (consists only of Op nodes).
+
+      // This is the whitelist of inputs on op types that are considered
+      // reference edges. "Assign 0" indicates that the first input to
+      // an OpNode with operation type "Assign" is a reference edge.
+      var refEdges = {};
+      refEdges["Assign 0"] = true;
+      refEdges["AssignAdd 0"] = true;
+      refEdges["AssignSub 0"] = true;
+      refEdges["assign 0"] = true;
+      refEdges["assign_add 0"] = true;
+      refEdges["assign_sub 0"] = true;
+      refEdges["count_up_to 0"] = true;
+      refEdges["ScatterAdd 0"] = true;
+      refEdges["ScatterSub 0"] = true;
+      refEdges["ScatterUpdate 0"] = true;
+      refEdges["scatter_add 0"] = true;
+      refEdges["scatter_sub 0"] = true;
+      refEdges["scatter_update 0"] = true;
+      var buildParams = {
+        enableEmbedding: true,
+        inEmbeddingTypes: ['Const'],
+        outEmbeddingTypes: ['^[a-zA-Z]+Summary$'],
+        refEdges: refEdges
+      };
+      var graphTracker = tf.graph.util.getSubtaskTracker(tracker, 20, 'Graph');
+      return tf.graph.build(graph, buildParams, graphTracker);
+    })
+    .then(function(graph) {
+      this._setOutGraph(graph);
+      var hierarchyTracker = tf.graph.util.getSubtaskTracker(tracker, 50,
+          'Namespace hierarchy');
+      return tf.graph.hierarchy.build(graph, hierarchyParams, hierarchyTracker);
+    }.bind(this))
+    .then(function(graphHierarchy) {
+      // Update the properties which notify the parent with the
+      // graph hierarchy and whether the data has live stats or not.
+      this._setOutGraphHierarchy(graphHierarchy);
+    }.bind(this))
+    .catch(function(e) {
+      // Generic error catch, for errors that happened outside
+      // asynchronous tasks.
+      tracker.reportError("Graph visualization failed: " + e, e);
+    });
+  },
+  _selectedDatasetChanged: function(datasetIndex, datasets) {
+    this._parseAndConstructHierarchicalGraph(datasets[datasetIndex].path);
+  },
+  _selectedFileChanged: function(e) {
+    if (!e) {
+      return;
+    }
+    var file = e.target.files[0];
+    if (!file) {
+      return;
+    }
+
+    // Clear out the value of the file chooser. This ensures that if the user
+    // selects the same file, we'll re-read it.
+    e.target.value = '';
+
+    this._parseAndConstructHierarchicalGraph(null, file);
+  }
+});
+</script>
diff --git a/tensorflow/tensorboard/components/tf_histogram_dashboard/BUILD b/tensorflow/tensorboard/components/tf_histogram_dashboard/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..3f17bda34cdd67389ce85277ef1cb7b602ea8dab
--- /dev/null
+++ b/tensorflow/tensorboard/components/tf_histogram_dashboard/BUILD
@@ -0,0 +1,62 @@
+package(default_visibility = ["//tensorflow:internal"])
+
+load("@io_bazel_rules_closure//closure:defs.bzl", "web_library")
+load("//tensorflow/tensorboard:defs.bzl", "tensorboard_ts_library")
+load("//tensorflow/tensorboard:defs.bzl", "tensorboard_webcomponent_library")
+
+licenses(["notice"])  # Apache 2.0
+
+web_library(
+    name = "tf_histogram_dashboard",
+    srcs = [
+        "tf-histogram-dashboard.html",
+    ],
+    path = "/tf-histogram-dashboard",
+    deps = [
+        "//tensorflow/tensorboard/components/tf_backend",
+        "//tensorflow/tensorboard/components/tf_color_scale",
+        "//tensorflow/tensorboard/components/tf_dashboard_common",
+        "//tensorflow/tensorboard/components/tf_imports:lodash",
+        "//tensorflow/tensorboard/components/vz_histogram_timeseries",
+        "@org_polymer",
+        "@org_polymer_iron_collapse",
+        "@org_polymer_paper_icon_button",
+        "@org_polymer_paper_styles",
+    ],
+)
+
+filegroup(
+    name = "all_files",
+    srcs = glob(["**"]),
+    tags = ["notsan"],
+)
+
+################################################################################
+# MARKED FOR DELETION
+
+tensorboard_webcomponent_library(
+    name = "legacy",
+    srcs = [
+        "tf-histogram-dashboard.html",
+        ":legacy_ts",
+    ],
+    destdir = "tf-histogram-dashboard",
+    deps = [
+        "//tensorflow/tensorboard/components:tf_imports",
+        "//tensorflow/tensorboard/components/tf_backend:legacy",
+        "//tensorflow/tensorboard/components/tf_dashboard_common:legacy",
+        "//tensorflow/tensorboard/components/vz_histogram_timeseries:legacy",
+        "//third_party/javascript/polymer/v1/iron-collapse:lib",
+        "//third_party/javascript/polymer/v1/paper-icon-button:lib",
+        "//third_party/javascript/polymer/v1/paper-styles:lib",
+        "//third_party/javascript/polymer/v1/polymer:lib",
+    ],
+)
+
+# This is needed: components/BUILD seeks a legacy_ts rule in this package.
+tensorboard_ts_library(
+    name = "legacy_ts",
+    srcs = [],
+    deps_mgmt = "off",
+    runtime = "nodejs",
+)
diff --git a/tensorflow/tensorboard/components/tf_histogram_dashboard/demo/BUILD b/tensorflow/tensorboard/components/tf_histogram_dashboard/demo/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..6522d3ad9958b21c1d0285cdf37ddd561f725a59
--- /dev/null
+++ b/tensorflow/tensorboard/components/tf_histogram_dashboard/demo/BUILD
@@ -0,0 +1,26 @@
+package(default_visibility = ["//tensorflow:internal"])
+
+load("@io_bazel_rules_closure//closure:defs.bzl", "web_library")
+
+licenses(["notice"])  # Apache 2.0
+
+# bazel run //third_party/tensorflow/tensorboard/components/tf_histogram_dashboard/demo
+web_library(
+    name = "demo",
+    srcs = ["index.html"],
+    path = "/tf-histogram-dashboard/demo",
+    deps = [
+        "//tensorflow/tensorboard/components/tf_histogram_dashboard",
+        "//tensorflow/tensorboard/components/tf_histogram_dashboard/demo/data",
+        "//tensorflow/tensorboard/components/tf_imports:d3",
+        "@org_polymer_iron_demo_helpers",
+        "@org_polymer_paper_styles",
+        "@org_polymer_webcomponentsjs",
+    ],
+)
+
+filegroup(
+    name = "all_files",
+    srcs = glob(["**"]),
+    tags = ["notsan"],
+)
diff --git a/tensorflow/tensorboard/components/tf_histogram_dashboard/demo/data/BUILD b/tensorflow/tensorboard/components/tf_histogram_dashboard/demo/data/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..7e661b84f5d5248f19b77ba8cba65bd2abf7d665
--- /dev/null
+++ b/tensorflow/tensorboard/components/tf_histogram_dashboard/demo/data/BUILD
@@ -0,0 +1,17 @@
+package(default_visibility = ["//tensorflow:internal"])
+
+load("@io_bazel_rules_closure//closure:defs.bzl", "web_library")
+
+licenses(["notice"])  # Apache 2.0
+
+web_library(
+    name = "data",
+    srcs = glob(["*"]),
+    path = "/tf-histogram-dashboard/demo/data",
+)
+
+filegroup(
+    name = "all_files",
+    srcs = glob(["**"]),
+    tags = ["notsan"],
+)
diff --git a/tensorflow/tensorboard/components/tf_histogram_dashboard_d3v4/BUILD b/tensorflow/tensorboard/components/tf_histogram_dashboard_d3v4/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..8ef115e6d5ad92afc8fab222cba136082134fc52
--- /dev/null
+++ b/tensorflow/tensorboard/components/tf_histogram_dashboard_d3v4/BUILD
@@ -0,0 +1,41 @@
+package(default_visibility = ["//tensorflow:internal"])
+
+load("@io_bazel_rules_closure//closure:defs.bzl", "web_library")
+
+licenses(["notice"])  # Apache 2.0
+
+web_library(
+    name = "tf_histogram_dashboard_d3v4",
+    srcs = ["tf-histogram-dashboard.html"],
+    path = "/tf-histogram-dashboard",
+    deps = [
+        "//tensorflow/tensorboard/components/tf_backend_d3v4",
+        "//tensorflow/tensorboard/components/tf_color_scale_d3v4",
+        "//tensorflow/tensorboard/components/tf_dashboard_common_d3v4",
+        "//tensorflow/tensorboard/components/tf_imports_d3v4:lodash",
+        "//tensorflow/tensorboard/components/vz_histogram_timeseries_d3v4",
+        "@org_polymer",
+        "@org_polymer_iron_collapse",
+        "@org_polymer_paper_icon_button",
+        "@org_polymer_paper_styles",
+    ],
+)
+
+web_library(
+    name = "demo",
+    srcs = ["index.html"] + glob(["data/**"]),
+    path = "/tf-histogram-dashboard",
+    deps = [
+        ":tf_histogram_dashboard_d3v4",
+        "//tensorflow/tensorboard/components/tf_imports_d3v4:d3",
+        "@org_polymer_iron_demo_helpers",
+        "@org_polymer_paper_styles",
+        "@org_polymer_webcomponentsjs",
+    ],
+)
+
+filegroup(
+    name = "all_files",
+    srcs = glob(["**"]),
+    tags = ["notsan"],
+)
diff --git a/tensorflow/tensorboard/components/tf_histogram_dashboard_d3v4/data/histograms_run_run1_tag_histo1.json b/tensorflow/tensorboard/components/tf_histogram_dashboard_d3v4/data/histograms_run_run1_tag_histo1.json
new file mode 100644
index 0000000000000000000000000000000000000000..a5600a356e8277e58be3b2891c3e328d058b5d08
--- /dev/null
+++ b/tensorflow/tensorboard/components/tf_histogram_dashboard_d3v4/data/histograms_run_run1_tag_histo1.json
@@ -0,0 +1 @@
+[[400.0, 40, [-0.3584790755077172, 3.0267252195784047, 20.0, 24.012225532303315, 48.29045006426564, [-0.35363819004775493, -0.29226296698161564, -0.19961953895336082, 0.3214892636797772, 0.5177616740489182, 0.56953784145381, 0.6264916255991911, 0.7580548669750213, 0.8338603536725235, 1.220854943811942, 1.3429404381931362, 1.47723448201245, 1.624957930213695, 1.7874537232350647, 1.9661990955585713, 2.379100905625872, 2.6170109961884593, 3.1665833053880363], [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 2.0, 1.0, 1.0, 2.0]]]]
\ No newline at end of file
diff --git a/tensorflow/tensorboard/components/tf_histogram_dashboard_d3v4/data/histograms_run_run2_tag_histo1.json b/tensorflow/tensorboard/components/tf_histogram_dashboard_d3v4/data/histograms_run_run2_tag_histo1.json
new file mode 100644
index 0000000000000000000000000000000000000000..407c375d2fc710e70408a3238df3a6165e964e84
--- /dev/null
+++ b/tensorflow/tensorboard/components/tf_histogram_dashboard_d3v4/data/histograms_run_run2_tag_histo1.json
@@ -0,0 +1 @@
+[[400.0, 40, [-2.599286228987632, 3.5098048900144323, 20.0, 10.792285491200078, 66.66796979177158, [-2.379100905625872, -1.9661990955585713, -1.624957930213695, -1.47723448201245, -1.109868130738129, -1.0089710279437536, -0.42790220995778355, -0.2195814928486969, 0.47069243095356195, 0.7580548669750213, 0.917246389039776, 1.3429404381931362, 1.624957930213695, 1.7874537232350647, 2.1628190051144287, 2.6170109961884593, 2.8787120958073054, 3.8315657995195243], [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 2.0, 1.0, 1.0, 2.0, 1.0, 1.0]]]]
\ No newline at end of file
diff --git a/tensorflow/tensorboard/components/tf_histogram_dashboard_d3v4/data/histograms_run_run2_tag_histo2.json b/tensorflow/tensorboard/components/tf_histogram_dashboard_d3v4/data/histograms_run_run2_tag_histo2.json
new file mode 100644
index 0000000000000000000000000000000000000000..752b621ab032f24805574708e1659c7139a701a8
--- /dev/null
+++ b/tensorflow/tensorboard/components/tf_histogram_dashboard_d3v4/data/histograms_run_run2_tag_histo2.json
@@ -0,0 +1 @@
+[[400.0, 40, [-0.8286852465281818, 2.0954239138728523, 20.0, 13.546880465642861, 24.14836803774091, [-0.7580548669750213, -0.38900200905253046, -0.06996543062044111, 0.07696197368248522, 0.19961953895336082, 0.2656936063469233, 0.29226296698161564, 0.5177616740489182, 0.7580548669750213, 0.917246389039776, 1.109868130738129, 1.220854943811942, 1.624957930213695, 2.1628190051144287], [2.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 2.0, 2.0, 1.0, 1.0, 1.0, 2.0, 3.0]]]]
\ No newline at end of file
diff --git a/tensorflow/tensorboard/components/tf_histogram_dashboard_d3v4/data/logdir b/tensorflow/tensorboard/components/tf_histogram_dashboard_d3v4/data/logdir
new file mode 100644
index 0000000000000000000000000000000000000000..b6362b45d777266d6204b23884222a080f789f71
--- /dev/null
+++ b/tensorflow/tensorboard/components/tf_histogram_dashboard_d3v4/data/logdir
@@ -0,0 +1 @@
+{"logdir": "/foo/some/fake/logdir"}
\ No newline at end of file
diff --git a/tensorflow/tensorboard/components/tf_histogram_dashboard_d3v4/data/runs.json b/tensorflow/tensorboard/components/tf_histogram_dashboard_d3v4/data/runs.json
new file mode 100644
index 0000000000000000000000000000000000000000..cbe657af6b610f0cb3bd8b5f6ccc2b14f4e631e2
--- /dev/null
+++ b/tensorflow/tensorboard/components/tf_histogram_dashboard_d3v4/data/runs.json
@@ -0,0 +1,4 @@
+{
+	"run1": {"histograms": ["histo1"]}, 
+	"run2": {"histograms": ["histo2", "histo1"]}
+}
\ No newline at end of file
diff --git a/tensorflow/tensorboard/components/tf_histogram_dashboard_d3v4/index.html b/tensorflow/tensorboard/components/tf_histogram_dashboard_d3v4/index.html
new file mode 100644
index 0000000000000000000000000000000000000000..c8d02f990d37eb625f50ef8c28753b5f491d508f
--- /dev/null
+++ b/tensorflow/tensorboard/components/tf_histogram_dashboard_d3v4/index.html
@@ -0,0 +1,61 @@
+<!doctype html>
+<!--
+@license
+Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+-->
+
+<script src="../webcomponentsjs/webcomponents-lite.min.js"></script>
+<link rel="import" href="../iron-demo-helpers/demo-snippet.html">
+<link rel="import" href="../paper-styles/typography.html">
+<link rel="import" href="tf-histogram-dashboard.html">
+
+<title>Distribution Dashboard Demo</title>
+<style>
+  #container {
+    height: 800px;
+    display: block;
+  }
+
+  html, body {
+    margin: 0;
+    padding: 0;
+    font-family: "RobotoDraft","Roboto",sans-serif;
+  }
+
+</style>
+<demo-snippet>
+  <template>
+    <dom-module id="histogram-dash-demo">
+      <template>
+        <tf-histogram-dashboard id="demo" backend="[[backend]]"></tf-histogram-dashboard>
+      </template>
+      <script>
+        Polymer({
+          is: "histogram-dash-demo",
+          properties: {
+            backend: {
+              type: Object,
+              value: function() {
+                var router = new TF.Backend.router("data", true);
+                return new TF.Backend.Backend(router);
+              },
+            },
+          },
+        });
+      </script>
+    </dom-module>
+    <histogram-dash-demo id="container"></histogram-dash-demo>
+  </template>
+</demo-snippet>
diff --git a/tensorflow/tensorboard/components/tf_histogram_dashboard_d3v4/tf-histogram-dashboard.html b/tensorflow/tensorboard/components/tf_histogram_dashboard_d3v4/tf-histogram-dashboard.html
new file mode 100644
index 0000000000000000000000000000000000000000..d9967961ebb8ec0d8dd1460df83e73cad21f0ddc
--- /dev/null
+++ b/tensorflow/tensorboard/components/tf_histogram_dashboard_d3v4/tf-histogram-dashboard.html
@@ -0,0 +1,163 @@
+<!--
+@license
+Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+-->
+
+<link rel="import" href="../polymer/polymer.html">
+<link rel="import" href="../tf-backend/tf-backend.html">
+<link rel="import" href="../tf-color-scale/tf-color-scale.html">
+<link rel="import" href="../tf-dashboard-common/tf-categorizer.html">
+<link rel="import" href="../tf-dashboard-common/tf-dashboard.html">
+<link rel="import" href="../tf-dashboard-common/tf-panes-helper.html">
+<link rel="import" href="../tf-dashboard-common/tf-option-selector.html">
+<link rel="import" href="../tf-dashboard-common/tf-sidebar-helper.html">
+<link rel="import" href="../tf-imports/lodash.html">
+<link rel="import" href="../vz-histogram-timeseries/vz-histogram-timeseries.html">
+<link rel="import" href="../iron-collapse/iron-collapse.html">
+<link rel="import" href="../paper-icon-button/paper-icon-button.html">
+
+<!--
+tf-histogram-dashboard is a complete frontend that loads runs from a backend,
+and creates chart panes that display data for those runs.
+
+It provides a mode and time property selector, together with the selectors
+provided by tf-sidebar-helper, by which the user can customize how data is
+organized and displayed.
+
+Each chart has a button that can toggle whether it is "selected"; selectedRuns
+charts are larger.
+
+Organizationally, the #plumbing div contains components that have no concrete
+manifestation and just effect data bindings or data loading. The .sidebar div
+contains shared controls provided by tf-sidebar-helper. The .center div
+contains vz-histogram-timeseries embedded inside tf-panes-helper's.
+-->
+<dom-module id="tf-histogram-dashboard">
+  <template>
+    <div id="plumbing">
+      <tf-color-scale
+        id="colorScale"
+        runs="[[runs]]"
+        out-color-scale="{{_colorScale}}"
+      ></tf-color-scale>
+    </div>
+
+    <tf-dashboard-layout>
+      <div class="sidebar">
+        <tf-sidebar-helper
+          backend="[[backend]]"
+          categories="{{_categories}}"
+          color-scale="[[_colorScale]]"
+          run2tag="[[run2tag]]"
+          runs="[[runs]]"
+          selected-runs="{{_selectedRuns}}"
+          show-download-links="{{_showDownloadLinks}}"
+          >
+          <div class="sidebar-section">
+            <tf-option-selector
+              id="histogramModeSelector"
+              name="Histogram Mode"
+              selected-id="{{_histogramMode}}"
+              >
+              <paper-button id="overlay">overlay</paper-button>
+              <paper-button id="offset">offset</paper-button>
+            </tf-option-selector>
+          </div>
+          <div class="sidebar-section">
+            <tf-option-selector
+              id="timePropertySelector"
+              name="Offset Time Axis"
+              selected-id="{{_timeProperty}}"
+              >
+              <paper-button id="step">step</paper-button>
+              <paper-button id="relative">relative</paper-button>
+              <paper-button id="wall_time">wall</paper-button>
+            </tf-option-selector>
+          </tf-sidebar-helper>
+       </div>
+      </div>
+
+      <div class="center">
+        <tf-panes-helper
+          categories="[[_categories]]"
+          color-scale="[[_colorScale]]"
+          data-type="[[dataType]]"
+          data-provider="[[dataProvider]]"
+          data-not-found="[[dataNotFound]]"
+          run2tag="[[run2tag]]"
+          selected-runs="[[_selectedRuns]]"
+          repeat-for-runs
+          >
+          <template>
+            <vz-histogram-timeseries
+              time-property="[[_timeProperty]]"
+              mode="[[_histogramMode]]"
+              color-scale="[[_colorScaleFunction]]"
+              ></vz-histogram-timeseries>
+          </template>
+        </tf-panes-helper>
+      </div>
+    </tf-dashboard-layout>
+
+    <style include="dashboard-style"></style>
+    <style>
+      tf-panes-helper {
+        --card-expanded-height: 500px;
+        --card-expanded-width: 700px;
+      }
+    </style>
+  </template>
+
+  <script>
+    TF.Dashboard.TfHistogramDashboard = Polymer({
+      is: "tf-histogram-dashboard",
+      factoryImpl: function(backend) {
+        this.backend = backend;
+      },
+      behaviors: [
+        TF.Dashboard.DashboardBehavior("histograms"),
+        TF.Dashboard.ReloadBehavior("tf-chart-scaffold"),
+        TF.Backend.BackendBehavior,
+      ],
+      properties: {
+        backend: Object,
+        dataType: {
+          type: String,
+          value: "histogram"
+        },
+        _histogramMode: {
+          type: String,
+          value: "offset"
+        },
+        _timeProperty: {
+          type: String,
+          value: "step"
+        },
+        _colorScaleFunction: {
+          type: Function,
+          computed: "_getColorScaleFunction(_colorScale)"
+        },
+      },
+      attached: function() {
+        this.async(function() {
+          this.fire("rendered");
+        });
+      },
+      _getColorScaleFunction: function() {
+        return this._colorScale.scale.bind(this._colorScale);
+      },
+    });
+  </script>
+</dom-module>
diff --git a/tensorflow/tensorboard/components/tf_image_dashboard/BUILD b/tensorflow/tensorboard/components/tf_image_dashboard/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..a82decd16e444a84d191ed0c9958d6e524fa7c9f
--- /dev/null
+++ b/tensorflow/tensorboard/components/tf_image_dashboard/BUILD
@@ -0,0 +1,59 @@
+package(default_visibility = ["//tensorflow:internal"])
+
+load("@io_bazel_rules_closure//closure:defs.bzl", "web_library")
+load("//tensorflow/tensorboard:defs.bzl", "tensorboard_ts_library")
+load("//tensorflow/tensorboard:defs.bzl", "tensorboard_webcomponent_library")
+
+licenses(["notice"])  # Apache 2.0
+
+web_library(
+    name = "tf_image_dashboard",
+    srcs = [
+        "tf-image-dashboard.html",
+        "tf-image-loader.html",
+    ],
+    path = "/tf-image-dashboard",
+    deps = [
+        "//tensorflow/tensorboard/components/tf_backend",
+        "//tensorflow/tensorboard/components/tf_color_scale",
+        "//tensorflow/tensorboard/components/tf_dashboard_common",
+        "//tensorflow/tensorboard/components/tf_imports:d3",
+        "//tensorflow/tensorboard/components/tf_imports:lodash",
+        "@org_polymer",
+        "@org_polymer_paper_dialog",
+        "@org_polymer_paper_icon_button",
+        "@org_polymer_paper_slider",
+        "@org_polymer_paper_spinner",
+    ],
+)
+
+filegroup(
+    name = "all_files",
+    srcs = glob(["**"]),
+    tags = ["notsan"],
+)
+
+################################################################################
+# MARKED FOR DELETION
+
+tensorboard_webcomponent_library(
+    name = "legacy",
+    srcs = [
+        "tf-image-dashboard.html",
+        "tf-image-loader.html",
+        ":legacy_ts",
+    ],
+    destdir = "tf-image-dashboard",
+    deps = [
+        "//tensorflow/tensorboard/components/tf_backend:legacy",
+        "//tensorflow/tensorboard/components/tf_dashboard_common:legacy",
+    ],
+)
+
+# This is needed: components/BUILD seeks a legacy_ts rule in this package.
+tensorboard_ts_library(
+    name = "legacy_ts",
+    srcs = [],
+    deps_mgmt = "off",
+    runtime = "nodejs",
+)
diff --git a/tensorflow/tensorboard/components/tf_image_dashboard/demo/BUILD b/tensorflow/tensorboard/components/tf_image_dashboard/demo/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..730685ee2c44bc922e72c8505bc7b9752c0c1d63
--- /dev/null
+++ b/tensorflow/tensorboard/components/tf_image_dashboard/demo/BUILD
@@ -0,0 +1,25 @@
+package(default_visibility = ["//tensorflow:internal"])
+
+load("@io_bazel_rules_closure//closure:defs.bzl", "web_library")
+
+licenses(["notice"])  # Apache 2.0
+
+# bazel run //third_party/tensorflow/tensorboard/components/tf_image_dashboard/demo
+web_library(
+    name = "demo",
+    srcs = ["index.html"],
+    path = "/tf-image-dashboard/demo",
+    deps = [
+        "//tensorflow/tensorboard/components/tf_image_dashboard",
+        "//tensorflow/tensorboard/components/tf_image_dashboard/demo/data",
+        "@org_polymer_iron_demo_helpers",
+        "@org_polymer_paper_styles",
+        "@org_polymer_webcomponentsjs",
+    ],
+)
+
+filegroup(
+    name = "all_files",
+    srcs = glob(["**"]),
+    tags = ["notsan"],
+)
diff --git a/tensorflow/tensorboard/components/tf_image_dashboard/demo/data/BUILD b/tensorflow/tensorboard/components/tf_image_dashboard/demo/data/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..0e4c3040685aeb477f188f5c9b0729851600ed0f
--- /dev/null
+++ b/tensorflow/tensorboard/components/tf_image_dashboard/demo/data/BUILD
@@ -0,0 +1,17 @@
+package(default_visibility = ["//tensorflow:internal"])
+
+load("@io_bazel_rules_closure//closure:defs.bzl", "web_library")
+
+licenses(["notice"])  # Apache 2.0
+
+web_library(
+    name = "data",
+    srcs = glob(["*"]),
+    path = "/tf-image-dashboard/demo/data",
+)
+
+filegroup(
+    name = "all_files",
+    srcs = glob(["**"]),
+    tags = ["notsan"],
+)
diff --git a/tensorflow/tensorboard/components/tf_image_dashboard/tf-image-loader.html b/tensorflow/tensorboard/components/tf_image_dashboard/tf-image-loader.html
index f667520fb5734949e35c15e0a3af541dd378c4ed..d9ba013dcea072af71ce5f792b200c8acd1cf0a2 100644
--- a/tensorflow/tensorboard/components/tf_image_dashboard/tf-image-loader.html
+++ b/tensorflow/tensorboard/components/tf_image_dashboard/tf-image-loader.html
@@ -108,6 +108,8 @@ future for loading older images.
     </style>
   </template>
   <script>
+    "use strict";
+
     Polymer({
       is: "tf-image-loader",
       properties: {
diff --git a/tensorflow/tensorboard/components/tf_image_dashboard_d3v4/BUILD b/tensorflow/tensorboard/components/tf_image_dashboard_d3v4/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..fcb242d5da1c4372b2a05508202dd7c9a8b98e4e
--- /dev/null
+++ b/tensorflow/tensorboard/components/tf_image_dashboard_d3v4/BUILD
@@ -0,0 +1,44 @@
+package(default_visibility = ["//tensorflow:internal"])
+
+load("@io_bazel_rules_closure//closure:defs.bzl", "web_library")
+
+licenses(["notice"])  # Apache 2.0
+
+web_library(
+    name = "tf_image_dashboard_d3v4",
+    srcs = [
+        "tf-image-dashboard.html",
+        "tf-image-loader.html",
+    ],
+    path = "/tf-image-dashboard",
+    deps = [
+        "//tensorflow/tensorboard/components/tf_backend_d3v4",
+        "//tensorflow/tensorboard/components/tf_color_scale_d3v4",
+        "//tensorflow/tensorboard/components/tf_dashboard_common_d3v4",
+        "//tensorflow/tensorboard/components/tf_imports_d3v4:d3",
+        "//tensorflow/tensorboard/components/tf_imports_d3v4:lodash",
+        "@org_polymer",
+        "@org_polymer_paper_dialog",
+        "@org_polymer_paper_icon_button",
+        "@org_polymer_paper_slider",
+        "@org_polymer_paper_spinner",
+    ],
+)
+
+web_library(
+    name = "demo",
+    srcs = ["index.html"] + glob(["data/**"]),
+    path = "/tf-image-dashboard",
+    deps = [
+        "//tensorflow/tensorboard/components/tf_image_dashboard_d3v4",
+        "@org_polymer_iron_demo_helpers",
+        "@org_polymer_paper_styles",
+        "@org_polymer_webcomponentsjs",
+    ],
+)
+
+filegroup(
+    name = "all_files",
+    srcs = glob(["**"]),
+    tags = ["notsan"],
+)
diff --git a/tensorflow/tensorboard/components/tf_image_dashboard_d3v4/data/images_run_run1_tag_im1_2Fimage_2F0.json b/tensorflow/tensorboard/components/tf_image_dashboard_d3v4/data/images_run_run1_tag_im1_2Fimage_2F0.json
new file mode 100644
index 0000000000000000000000000000000000000000..3dec43221348810a9447e385ea3d17e12ce58bcf
--- /dev/null
+++ b/tensorflow/tensorboard/components/tf_image_dashboard_d3v4/data/images_run_run1_tag_im1_2Fimage_2F0.json
@@ -0,0 +1,9 @@
+[
+  {
+    "wall_time":1459200389.088045,
+    "width":4,
+    "height":4,
+    "step":0,
+    "query":"tag=im1%2Fimage%2F0&index=0&run=run1"
+  }
+]
diff --git a/tensorflow/tensorboard/components/tf_image_dashboard_d3v4/data/images_run_run1_tag_im2_2Fimage_2F0.json b/tensorflow/tensorboard/components/tf_image_dashboard_d3v4/data/images_run_run1_tag_im2_2Fimage_2F0.json
new file mode 100644
index 0000000000000000000000000000000000000000..16152b8626a3260227b4aad8deadf24306d8c4ba
--- /dev/null
+++ b/tensorflow/tensorboard/components/tf_image_dashboard_d3v4/data/images_run_run1_tag_im2_2Fimage_2F0.json
@@ -0,0 +1,9 @@
+[
+  {
+    "wall_time":1459200389.093653,
+    "width":4,
+    "height":4,
+    "step":0,
+    "query":"tag=im2%2Fimage%2F0&index=0&run=run1"
+  }
+]
diff --git a/tensorflow/tensorboard/components/tf_image_dashboard_d3v4/data/images_run_run2_tag_im1_2Fimage_2F0.json b/tensorflow/tensorboard/components/tf_image_dashboard_d3v4/data/images_run_run2_tag_im1_2Fimage_2F0.json
new file mode 100644
index 0000000000000000000000000000000000000000..a717b79c5def825bf7c7eec229e2f1a85971fc9b
--- /dev/null
+++ b/tensorflow/tensorboard/components/tf_image_dashboard_d3v4/data/images_run_run2_tag_im1_2Fimage_2F0.json
@@ -0,0 +1,9 @@
+[
+  {
+    "wall_time":1459200389.117463,
+    "width":4,
+    "height":4,
+    "step":0,
+    "query":"tag=im1%2Fimage%2F0&index=0&run=run2"
+  }
+]
diff --git a/tensorflow/tensorboard/components/tf_image_dashboard_d3v4/data/individualImage_tag_im1_2Fimage_2F0_index_0_run_run1.png b/tensorflow/tensorboard/components/tf_image_dashboard_d3v4/data/individualImage_tag_im1_2Fimage_2F0_index_0_run_run1.png
new file mode 100644
index 0000000000000000000000000000000000000000..346fd0076be28b9338152c4d49a32fc5ed685e44
Binary files /dev/null and b/tensorflow/tensorboard/components/tf_image_dashboard_d3v4/data/individualImage_tag_im1_2Fimage_2F0_index_0_run_run1.png differ
diff --git a/tensorflow/tensorboard/components/tf_image_dashboard_d3v4/data/individualImage_tag_im1_2Fimage_2F0_index_0_run_run2.png b/tensorflow/tensorboard/components/tf_image_dashboard_d3v4/data/individualImage_tag_im1_2Fimage_2F0_index_0_run_run2.png
new file mode 100644
index 0000000000000000000000000000000000000000..26d2d10acaf8511efeb03169853092d09252215b
Binary files /dev/null and b/tensorflow/tensorboard/components/tf_image_dashboard_d3v4/data/individualImage_tag_im1_2Fimage_2F0_index_0_run_run2.png differ
diff --git a/tensorflow/tensorboard/components/tf_image_dashboard_d3v4/data/individualImage_tag_im2_2Fimage_2F0_index_0_run_run1.png b/tensorflow/tensorboard/components/tf_image_dashboard_d3v4/data/individualImage_tag_im2_2Fimage_2F0_index_0_run_run1.png
new file mode 100644
index 0000000000000000000000000000000000000000..6c4190629429e0929962c4f20bd1a1602620e4bd
Binary files /dev/null and b/tensorflow/tensorboard/components/tf_image_dashboard_d3v4/data/individualImage_tag_im2_2Fimage_2F0_index_0_run_run1.png differ
diff --git a/tensorflow/tensorboard/components/tf_image_dashboard_d3v4/data/logdir b/tensorflow/tensorboard/components/tf_image_dashboard_d3v4/data/logdir
new file mode 100644
index 0000000000000000000000000000000000000000..c7d82022cc061502c5991a22e72c214918a9f87b
--- /dev/null
+++ b/tensorflow/tensorboard/components/tf_image_dashboard_d3v4/data/logdir
@@ -0,0 +1 @@
+{"logdir": "/some/fake/logdir"}
\ No newline at end of file
diff --git a/tensorflow/tensorboard/components/tf_image_dashboard_d3v4/data/runs.json b/tensorflow/tensorboard/components/tf_image_dashboard_d3v4/data/runs.json
new file mode 100644
index 0000000000000000000000000000000000000000..b75de5b6614a77e9f0e13ea6ab134f01413668ad
--- /dev/null
+++ b/tensorflow/tensorboard/components/tf_image_dashboard_d3v4/data/runs.json
@@ -0,0 +1,13 @@
+{
+   "run1":{
+      "images":[
+         "im1/image/0",
+         "im2/image/0"
+      ]
+   },
+   "run2":{
+      "images":[
+         "im1/image/0"
+      ]
+   }
+}
diff --git a/tensorflow/tensorboard/components/tf_image_dashboard_d3v4/index.html b/tensorflow/tensorboard/components/tf_image_dashboard_d3v4/index.html
new file mode 100644
index 0000000000000000000000000000000000000000..f9ea187952f0e55a9cd267a4395d8d55ddd820c8
--- /dev/null
+++ b/tensorflow/tensorboard/components/tf_image_dashboard_d3v4/index.html
@@ -0,0 +1,66 @@
+<!doctype html>
+<!--
+@license
+Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+-->
+
+<html>
+  <head>
+    <script src="../webcomponentsjs/webcomponents-lite.min.js"></script>
+    <link rel="import" href="../iron-demo-helpers/demo-snippet.html">
+    <link rel="import" href="tf-image-dashboard.html">
+    <title>Image Dashboard Demo</title>
+    <style>
+      #container{
+        width: 1000px;
+        height: 800px;
+        border: 2px solid grey;
+      }
+      html,body {
+        height: 100%;
+      }
+    </style>
+  </head>
+  <body>
+    <demo-snippet>
+      <template>
+        <dom-module id="image-dash-demo">
+          <template>
+            <tf-image-dashboard id="demo" backend="[[backend]]">
+            </tf-image-dashboard>
+          </template>
+          <script>
+            Polymer({
+              is: "image-dash-demo",
+              properties: {
+                backend: {
+                  type: Object,
+                  value: function() {
+                    var path = "data";
+                    var router = new TF.Backend.router(path, true);
+                    return new TF.Backend.Backend(router);
+                  },
+                },
+              },
+            });
+          </script>
+        </dom-module>
+        <div id="container">
+          <image-dash-demo></image-dash-demo>
+        </div>
+      </template>
+    </demo-snippet>
+  </body>
+</html>
diff --git a/tensorflow/tensorboard/components/tf_image_dashboard_d3v4/tf-image-dashboard.html b/tensorflow/tensorboard/components/tf_image_dashboard_d3v4/tf-image-dashboard.html
new file mode 100644
index 0000000000000000000000000000000000000000..0700a8c0e7622a35355315132511a8cd69a39ef1
--- /dev/null
+++ b/tensorflow/tensorboard/components/tf_image_dashboard_d3v4/tf-image-dashboard.html
@@ -0,0 +1,156 @@
+<!--
+@license
+Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+-->
+
+<link rel="import" href="../polymer/polymer.html">
+<link rel="import" href="../paper-dialog/paper-dialog.html">
+<link rel="import" href="../paper-icon-button/paper-icon-button.html">
+<link rel="import" href="../tf-backend/tf-backend.html">
+<link rel="import" href="../tf-color-scale/tf-color-scale.html">
+<link rel="import" href="../tf-dashboard-common/tf-dashboard.html">
+<link rel="import" href="../tf-dashboard-common/tf-panes-helper.html">
+<link rel="import" href="../tf-dashboard-common/tf-sidebar-helper.html">
+<link rel="import" href="tf-image-loader.html">
+
+<!--
+tf-image-dashboard displays a dashboard that loads images from a TensorFlow run.
+-->
+<dom-module id="tf-image-dashboard">
+  <template>
+    <paper-dialog with-backdrop id="actual-image-size-dialog"></paper-dialog>
+    <div id="plumbing">
+      <tf-color-scale
+        id="colorScale"
+        runs="[[runs]]"
+        out-color-scale="{{_colorScale}}"
+        ></tf-color-scale>
+    </div>
+
+    <tf-dashboard-layout>
+      <div class="sidebar">
+        <tf-sidebar-helper
+          backend="[[backend]]"
+          categories="{{_categories}}"
+          color-scale="[[_colorScale]]"
+          run2tag="[[run2tag]]"
+          runs="[[runs]]"
+          selected-runs="{{_selectedRuns}}"
+          >
+        </tf-sidebar-helper>
+      </div>
+      <div class="center">
+        <tf-panes-helper
+          categories="[[_categories]]"
+          color-scale="[[_colorScale]]"
+          data-type="[[dataType]]"
+          data-provider="[[dataProvider]]"
+          data-not-found="[[dataNotFound]]"
+          run2tag="[[run2tag]]"
+          selected-runs="[[_selectedRuns]]"
+          repeat-for-runs
+          >
+          <template>
+            <tf-image-loader color-scale="[[_colorScale]]"></tf-image-loader>
+            <paper-icon-button
+              class="actual-size-button"
+              icon="aspect-ratio"
+              on-tap="_showActualSize"
+              title="Show the image at its true pixel size"
+              ></paper-icon-button>
+          </template>
+        </tf-panes-helper>
+      </div>
+    </tf-dashboard-layout>
+    <style include="dashboard-style"></style>
+    <style>
+      tf-panes-helper {
+        --card-width: 340px;
+        --card-height: auto;
+        --card-expanded-width: 700px;
+        --card-expanded-height: auto;
+      }
+
+      .actual-size-button {
+        background: #fff;
+        border-radius: 100%;
+        bottom: -35px;
+        color: #2196f3;
+        height: 32px;
+        left: 35px;
+        padding: 4px;
+        pointer-events: auto;
+        position: absolute;
+        width: 32px;
+      }
+
+      .actual-size-button-selected {
+        background: var(--tb-ui-light-accent);
+      }
+
+      #actual-image-size-dialog {
+        overflow: auto;
+      }
+    </style>
+  </template>
+  <script>
+    TF.Dashboard.TfImageDashboard = Polymer({
+      is: "tf-image-dashboard",
+      factoryImpl: function(backend) {
+        this.backend = backend;
+      },
+      properties: {
+        backend: Object,
+        dataType: {
+          type: String,
+          value: "image"
+        },
+      },
+      behaviors: [
+        TF.Dashboard.DashboardBehavior("images"),
+        TF.Dashboard.ReloadBehavior("tf-chart-scaffold"),
+        TF.Backend.BackendBehavior,
+      ],
+      attached: function() {
+        this.async(function() {
+          this.fire("rendered");
+        });
+      },
+      _showActualSize: function(e) {
+        var currentTarget = Polymer.dom(e.currentTarget);
+        var card = currentTarget.node.closest('.card');
+
+        // Create a full-size copy of the image.
+        var newImage = card.querySelector('#img').cloneNode();
+        newImage.style.height = 'auto';
+        newImage.style.width = 'auto';
+        newImage.style.margin = 0;
+        newImage.style.padding = 0;
+        newImage.classList.add("actual-size-image");
+
+        // When the user clicks on the image, empty and close the dialog.
+        var dialog = this.$$('#actual-image-size-dialog');
+        newImage.addEventListener('click', function() {
+          dialog.close();
+        });
+
+        // Update dialog content. Show the dialog.
+        dialog.innerHTML = '';
+        dialog.appendChild(newImage);
+        dialog.open();
+      }
+    });
+  </script>
+</dom-module>
diff --git a/tensorflow/tensorboard/components/tf_image_dashboard_d3v4/tf-image-loader.html b/tensorflow/tensorboard/components/tf_image_dashboard_d3v4/tf-image-loader.html
new file mode 100644
index 0000000000000000000000000000000000000000..d9ba013dcea072af71ce5f792b200c8acd1cf0a2
--- /dev/null
+++ b/tensorflow/tensorboard/components/tf_image_dashboard_d3v4/tf-image-loader.html
@@ -0,0 +1,233 @@
+<!--
+@license
+Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+-->
+
+<link rel="import" href="../polymer/polymer.html">
+<link rel="import" href="../paper-slider/paper-slider.html">
+<link rel="import" href="../paper-spinner/paper-spinner-lite.html">
+<link rel="import" href="../tf-dashboard-common/tensorboard-color.html">
+<link rel="import" href="../tf-imports/lodash.html">
+<link rel="import" href="../tf-imports/d3.html">
+
+<!--
+tf-image-loader loads an individual image from the TensorBoard backend.
+
+Right now it always loads the most recent image. We should add support in the
+future for loading older images.
+-->
+<dom-module id="tf-image-loader">
+  <template>
+    <div id="image-annotation">
+      <template is="dom-if" if="[[_hasAtLeastOneStep]]">
+        step
+        <span class="step-value">
+          [[_stepValue]]
+        </span>
+        <template is="dom-if" if="[[_currentWallTime]]">
+          ([[_currentWallTime]])
+        </template>
+        <paper-spinner-lite active hidden$=[[!_isImageLoading]]></paper-spinner-lite>
+      </template>
+      <template is="dom-if" if="[[_hasMultipleSteps]]">
+        <paper-slider
+          id="steps"
+          immediate-value="{{_stepIndex}}"
+          max="[[_maxStepIndex]]"
+          max-markers="[[_maxStepIndex]]"
+          snaps
+          step="1"
+          value="{{_stepIndex}}"></paper-slider>
+      </template>
+    </div>
+
+    <div id="main-image-container"></div>
+
+    <style>
+      :host {
+        display: block;
+        width: 100%;
+        height: auto;
+        position: relative;
+        --step-slider-knob-color: #424242;
+      }
+
+      #image-annotation {
+        border-left: 4px solid;
+        padding-left: 5px;
+        font-size: 12px;
+        margin: -10px 0 10px 0;
+      }
+
+      #image-annotation .step-value {
+        font-weight: bold;
+      }
+
+      #image-annotation paper-spinner-lite {
+        width: 14px;
+        height: 14px;
+        vertical-align: text-bottom;
+        --paper-spinner-color: var(--tb-orange-strong)
+      }
+
+      #steps {
+        height: 15px;
+        margin: 0 0 0 -15px;
+        /* 31 comes from adding a padding of 15px from both sides of the paper-slider, subtracting
+         * 1px so that the slider width aligns with the image (the last slider marker takes up 1px),
+         * and adding 2px to account for a border of 1px on both sides of the image. 30 - 1 + 2. */
+        width: calc(100% + 31px);
+        --paper-slider-active-color: var(--step-slider-knob-color);
+        --paper-slider-knob-color: var(--step-slider-knob-color);
+        --paper-slider-pin-color: var(--step-slider-knob-color);
+        --paper-slider-knob-start-color: var(--step-slider-knob-color);
+        --paper-slider-knob-start-border-color: var(--step-slider-knob-color);
+        --paper-slider-pin-start-color: var(--step-slider-knob-color);
+      }
+
+      #main-image-container img {
+        border: 1px solid #f5f5f5;
+        image-rendering: -moz-crisp-edges;
+        image-rendering: pixelated;
+        display: block;
+        width: 100%;
+        height: auto;
+      }
+    </style>
+  </template>
+  <script>
+    "use strict";
+
+    Polymer({
+      is: "tf-image-loader",
+      properties: {
+        colorScale: Object,
+        run: String,
+        // This is an array of Tensorboard Image&Datum objects (See backend.ts for details). The
+        // properties of objects in this array are
+        // {
+        //   width: number,
+        //   height: number,
+        //   wall_time: Date,
+        //   step: number,
+        //   url: string,
+        // }
+        _steps: {
+          type: Array,
+          value: [],
+          notify: true,
+        },
+        _stepIndex: {
+          type: Number,
+          notify: true,
+        },
+        _hasAtLeastOneStep: {
+          type: Boolean,
+          computed: "_computeHasAtLeastOneStep(_steps)",
+        },
+        _hasMultipleSteps: {
+          type: Boolean,
+          computed: "_computeHasMultipleSteps(_steps)",
+        },
+        _stepValue: {
+          type: Number,
+          computed: "_computeStepValue(_stepIndex)",
+        },
+        _currentWallTime: {
+          type: Number,
+          computed: "_computeCurrentWallTime(_stepIndex)",
+        },
+        _maxStepIndex: {
+          type: Number,
+          computed: "_computeMaxStepIndex(_steps)",
+        },
+        // We use a strictly increasing index to make sure that we don't settle on a stale image.
+        _currentImageLoadIndex: {
+          type: Number,
+          value: 1,
+        },
+        _isImageLoading: {
+          type: Boolean,
+          value: false,
+        },
+      },
+      observers: [
+        "_updateImageUrl(_steps, _stepIndex)",
+      ],
+      redraw: function() {
+        // Other dashboards logic requires a redraw method to be defined. redraw is called at
+        // various places such as when the image is expanded.
+        this.setSeriesData(this.run, this._steps);
+      },
+      setVisibleSeries: function(runs) {
+        // Do nothing.
+      },
+      setSeriesData: function(run, steps) {
+        this.set("run", run);
+        this.set("_steps", steps);
+        this.set("_stepIndex", steps.length - 1);
+
+        // Update the border color based on the run.
+        var color = this.colorScale.scale(run);
+        this.$$("#image-annotation").style.borderColor = color;
+      },
+      _updateImageUrl: function(steps, stepIndex) {
+        // We manually change the image URL (instead of binding to the image's src attribute)
+        // because we would like to manage what happens when the image starts to / finishes loading.
+        if (!steps.length) {
+          return;
+        }
+
+        let img = new Image();
+        img.id = "img"; // '#img' used to select the image in tf-image-dashboard.
+
+        const loadIndex = ++this._currentImageLoadIndex;
+        img.onload = img.onerror = (function() {
+          if (loadIndex != this._currentImageLoadIndex) {
+            // This load is no longer relevant.
+            return;
+          }
+
+          // The new image has finished loading. Remove the old image. Add the new one.
+          let mainImageContainer = this.$$("#main-image-container");
+          mainImageContainer.innerHTML = "";
+          Polymer.dom(mainImageContainer).appendChild(img);
+
+          // The image has finished loading (or has erred and failed to load).
+          this.set("_isImageLoading", false);
+        }).bind(this);
+
+        // Load the new image.
+        this.set("_isImageLoading", true);
+        img.src = steps[stepIndex].url;
+      },
+      _computeHasAtLeastOneStep: function(steps) {
+        return !!steps && steps.length > 0;
+      },
+      _computeHasMultipleSteps: function(steps) {
+        return !!steps && steps.length > 1;
+      },
+      _computeStepValue: function(stepIndex) {
+        return this._steps[stepIndex].step;
+      },
+      _computeCurrentWallTime: function(stepIndex) {
+        return this._steps[stepIndex].wall_time.toString();
+      },
+      _computeMaxStepIndex: function(steps) {
+        return steps.length - 1;
+      },
+    });
+  </script>
+</dom-module>
diff --git a/tensorflow/tensorboard/components/tf_imports/BUILD b/tensorflow/tensorboard/components/tf_imports/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..ac4c41d00fa4d21df0b60912f237cada80334a75
--- /dev/null
+++ b/tensorflow/tensorboard/components/tf_imports/BUILD
@@ -0,0 +1,61 @@
+package(default_visibility = ["//tensorflow:internal"])
+
+load("@io_bazel_rules_closure//closure:defs.bzl", "web_library")
+
+licenses(["notice"])  # Apache 2.0
+
+web_library(
+    name = "d3",
+    srcs = [
+        "d3.html",
+        "@org_d3js",
+    ],
+    path = "/tf-imports",
+)
+
+web_library(
+    name = "lodash",
+    srcs = ["lodash.html"],
+    path = "/tf-imports",
+    deps = ["@com_lodash"],
+)
+
+web_library(
+    name = "graphlib",
+    srcs = [
+        "graphlib.html",
+        "@io_github_cpettitt_graphlib",
+    ],
+    path = "/tf-imports",
+    deps = [":lodash"],
+)
+
+web_library(
+    name = "dagre",
+    srcs = [
+        "dagre.html",
+        "@io_github_cpettitt_dagre",
+    ],
+    path = "/tf-imports",
+    deps = [
+        ":graphlib",
+        ":lodash",
+    ],
+)
+
+web_library(
+    name = "plottable",
+    srcs = [
+        "plottable.html",
+        "@com_palantir_plottable//:plottable.css",
+        "@com_palantir_plottable//:plottable.js",
+    ],
+    path = "/tf-imports",
+    deps = [":d3"],
+)
+
+filegroup(
+    name = "all_files",
+    srcs = glob(["**"]),
+    tags = ["notsan"],
+)
diff --git a/tensorflow/tensorboard/components/tf_imports/dagre.html b/tensorflow/tensorboard/components/tf_imports/dagre.html
index 48fe39da7936a77b6cb5801481c7a44109e44ba8..11164dc5042f068d50a5c4546c7c5fd659862cf7 100644
--- a/tensorflow/tensorboard/components/tf_imports/dagre.html
+++ b/tensorflow/tensorboard/components/tf_imports/dagre.html
@@ -15,10 +15,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 -->
 
-<!--
-HTML imports are non-blocking thus getting the dependency 'graphlib'
-and 'lodash' via script imports instead.
--->
-<script src="lodash.js"></script>
-<script src="graphlib.core.js"></script>
+<link rel="import" href="lodash.html">
+<link rel="import" href="graphlib.html">
+
 <script src="dagre.core.js"></script>
diff --git a/tensorflow/tensorboard/components/tf_imports/graphlib.html b/tensorflow/tensorboard/components/tf_imports/graphlib.html
index 4e19f7b008fe876d89c8a88d1067c9b1fd5646e3..783e33be0a6ee7cb2d9f54de38bf434f938eed85 100644
--- a/tensorflow/tensorboard/components/tf_imports/graphlib.html
+++ b/tensorflow/tensorboard/components/tf_imports/graphlib.html
@@ -15,5 +15,6 @@ See the License for the specific language governing permissions and
 limitations under the License.
 -->
 
-<script src="lodash.js"></script>
+<link rel="import" href="lodash.html">
+
 <script src="graphlib.core.js"></script>
diff --git a/tensorflow/tensorboard/components/tf_imports/lodash.html b/tensorflow/tensorboard/components/tf_imports/lodash.html
index f92aa8087999567e2f6c038b76e83dedafe05512..cbe35f10505686cb8527a92edc6aa95c164a9ec2 100644
--- a/tensorflow/tensorboard/components/tf_imports/lodash.html
+++ b/tensorflow/tensorboard/components/tf_imports/lodash.html
@@ -15,4 +15,4 @@ See the License for the specific language governing permissions and
 limitations under the License.
 -->
 
-<script src="lodash.js"></script>
+<script src="../lodash/lodash.js"></script>
diff --git a/tensorflow/tensorboard/components/tf_imports_d3v4/BUILD b/tensorflow/tensorboard/components/tf_imports_d3v4/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..edf759052fdecb8f0f6761a868ba6151e2f7bd2e
--- /dev/null
+++ b/tensorflow/tensorboard/components/tf_imports_d3v4/BUILD
@@ -0,0 +1,430 @@
+package(default_visibility = ["//tensorflow:internal"])
+
+load("@io_bazel_rules_closure//closure:defs.bzl", "web_library")
+load("//tensorflow/tensorboard:hacks.bzl", "tensorboard_typescript_bundle")
+
+licenses(["notice"])  # Apache 2.0
+
+web_library(
+    name = "lodash",
+    srcs = ["lodash.html"],
+    path = "/tf-imports",
+    deps = ["@com_lodash"],
+)
+
+web_library(
+    name = "threejs",
+    srcs = [
+        "threejs.html",
+        "@org_threejs//:OrbitControls.js",
+        "@org_threejs//:three.js",
+    ],
+    path = "/tf-imports",
+)
+
+web_library(
+    name = "numericjs",
+    srcs = [
+        "numericjs.html",
+        "@com_numericjs",
+    ],
+    path = "/tf-imports",
+)
+
+web_library(
+    name = "weblas",
+    srcs = [
+        "weblas.html",
+        "@io_github_waylonflinn_weblas",
+    ],
+    path = "/tf-imports",
+)
+
+web_library(
+    name = "graphlib",
+    srcs = [
+        "graphlib.html",
+        "@io_github_cpettitt_graphlib",
+    ],
+    path = "/tf-imports",
+    deps = [":lodash"],
+)
+
+web_library(
+    name = "dagre",
+    srcs = [
+        "dagre.html",
+        "@io_github_cpettitt_dagre",
+    ],
+    path = "/tf-imports",
+    deps = [
+        ":graphlib",
+        ":lodash",
+    ],
+)
+
+web_library(
+    name = "d3",
+    srcs = [
+        "d3.html",
+        "@org_d3js_v4",
+    ],
+    path = "/tf-imports",
+)
+
+web_library(
+    name = "plottable",
+    srcs = ["plottable.html"],
+    path = "/tf-imports",
+    deps = [
+        ":d3",
+        ":plottable_js_css",
+    ],
+)
+
+web_library(
+    name = "plottable_js_css",
+    srcs = [
+        "@com_palantir_plottable_v3//:package/plottable.css",
+        "@com_palantir_plottable_v3//:package/plottable.js",
+    ],
+    path = "/tf-imports",
+    strip_prefix = "package",
+    visibility = ["//visibility:private"],
+)
+
+# Generate single TypeScript typings file for d3.js with no ES6 imports.
+#
+# The DefinitelyTyped definition of d3 v4 was written under the assumption that
+# we want to use d3 in a modularized way. We don't want to do that because its
+# import statements use NodeJS namespaces, and the Web Compiler only supports
+# W3C, ECMA, and IETF standards.
+tensorboard_typescript_bundle(
+    name = "d3_typings",
+    out = "d3.d.ts",
+    namespace_srcs = {"d3": [
+        "d3-transition.d.ts",
+        "@org_definitelytyped_types_d3_path//:index.d.ts",
+        "@org_definitelytyped_types_d3_time//:index.d.ts",
+        "@org_definitelytyped_types_d3_dsv//:index.d.ts",
+        "@org_definitelytyped_types_d3_color//:index.d.ts",
+        "@org_definitelytyped_types_d3_selection//:index.d.ts",
+        "@org_definitelytyped_types_d3_shape//:index.d.ts",
+        "@org_definitelytyped_types_d3_scale//:index.d.ts",
+        "@org_definitelytyped_types_d3_request//:index.d.ts",
+        "@org_definitelytyped_types_d3_interpolate//:index.d.ts",
+        "@org_definitelytyped_types_d3_drag//:index.d.ts",
+        "@org_definitelytyped_types_d3_brush//:index.d.ts",
+        "@org_definitelytyped_types_d3_axis//:index.d.ts",
+        "@org_definitelytyped_types_d3_zoom//:index.d.ts",
+        "@org_definitelytyped_types_d3_array//:index.d.ts",
+        "@org_definitelytyped_types_d3_chord//:index.d.ts",
+        "@org_definitelytyped_types_d3_collection//:index.d.ts",
+        "@org_definitelytyped_types_d3_dispatch//:index.d.ts",
+        "@org_definitelytyped_types_d3_ease//:index.d.ts",
+        "@org_definitelytyped_types_d3_force//:index.d.ts",
+        "@org_definitelytyped_types_d3_format//:index.d.ts",
+        "@org_definitelytyped_types_d3_hierarchy//:index.d.ts",
+        "@org_definitelytyped_types_d3_polygon//:index.d.ts",
+        "@org_definitelytyped_types_d3_quadtree//:index.d.ts",
+        "@org_definitelytyped_types_d3_queue//:index.d.ts",
+        "@org_definitelytyped_types_d3_random//:index.d.ts",
+        "@org_definitelytyped_types_d3_timer//:index.d.ts",
+        "@org_definitelytyped_types_d3_voronoi//:index.d.ts",
+    ]},
+)
+
+# It would be nice if Plottable released a .d.ts file for plottable.js like
+# they did for previous versions.
+tensorboard_typescript_bundle(
+    name = "plottable_typings",
+    out = "plottable.d.ts",
+    namespace_srcs = {
+        "Plottable": [
+            "@com_palantir_plottable_v3//:package/build/src/core/dataset.d.ts",
+            "@com_palantir_plottable_v3//:package/build/src/core/interfaces.d.ts",
+            "@com_palantir_plottable_v3//:package/build/src/core/version.d.ts",
+        ],
+        "Plottable.Animators": [
+            "@com_palantir_plottable_v3//:package/build/src/animators/animator.d.ts",
+            "@com_palantir_plottable_v3//:package/build/src/animators/easingAnimator.d.ts",
+            "@com_palantir_plottable_v3//:package/build/src/animators/nullAnimator.d.ts",
+        ],
+        "Plottable.Axes": [
+            "@com_palantir_plottable_v3//:package/build/src/axes/axis.d.ts",
+            "@com_palantir_plottable_v3//:package/build/src/axes/categoryAxis.d.ts",
+            "@com_palantir_plottable_v3//:package/build/src/axes/numericAxis.d.ts",
+            "@com_palantir_plottable_v3//:package/build/src/axes/timeAxis.d.ts",
+        ],
+        "Plottable.Components": [
+            "@com_palantir_plottable_v3//:package/build/src/components/component.d.ts",
+            "@com_palantir_plottable_v3//:package/build/src/components/componentContainer.d.ts",
+            "@com_palantir_plottable_v3//:package/build/src/components/dragBoxLayer.d.ts",
+            "@com_palantir_plottable_v3//:package/build/src/components/dragLineLayer.d.ts",
+            "@com_palantir_plottable_v3//:package/build/src/components/gridlines.d.ts",
+            "@com_palantir_plottable_v3//:package/build/src/components/group.d.ts",
+            "@com_palantir_plottable_v3//:package/build/src/components/guideLineLayer.d.ts",
+            "@com_palantir_plottable_v3//:package/build/src/components/interpolatedColorLegend.d.ts",
+            "@com_palantir_plottable_v3//:package/build/src/components/label.d.ts",
+            "@com_palantir_plottable_v3//:package/build/src/components/legend.d.ts",
+            "@com_palantir_plottable_v3//:package/build/src/components/plotGroup.d.ts",
+            "@com_palantir_plottable_v3//:package/build/src/components/selectionBoxLayer.d.ts",
+            "@com_palantir_plottable_v3//:package/build/src/components/table.d.ts",
+            "@com_palantir_plottable_v3//:package/build/src/components/xDragBoxLayer.d.ts",
+            "@com_palantir_plottable_v3//:package/build/src/components/yDragBoxLayer.d.ts",
+        ],
+        "Plottable.Configs": [
+            "@com_palantir_plottable_v3//:package/build/src/core/config.d.ts",
+        ],
+        "Plottable.Formatters": [
+            "@com_palantir_plottable_v3//:package/build/src/core/formatters.d.ts",
+        ],
+        "Plottable.RenderController": [
+            "@com_palantir_plottable_v3//:package/build/src/core/renderController.d.ts",
+        ],
+        "Plottable.RenderPolicies": [
+            "@com_palantir_plottable_v3//:package/build/src/core/renderPolicy.d.ts",
+        ],
+        "Plottable.SymbolFactories": [
+            "@com_palantir_plottable_v3//:package/build/src/core/symbolFactories.d.ts",
+        ],
+        "Plottable.Dispatchers": [
+            "@com_palantir_plottable_v3//:package/build/src/dispatchers/dispatcher.d.ts",
+            "@com_palantir_plottable_v3//:package/build/src/dispatchers/keyDispatcher.d.ts",
+            "@com_palantir_plottable_v3//:package/build/src/dispatchers/mouseDispatcher.d.ts",
+            "@com_palantir_plottable_v3//:package/build/src/dispatchers/touchDispatcher.d.ts",
+        ],
+        "Plottable.Drawers": [
+            "@com_palantir_plottable_v3//:package/build/src/drawers/arcDrawer.d.ts",
+            "@com_palantir_plottable_v3//:package/build/src/drawers/arcOutlineDrawer.d.ts",
+            "@com_palantir_plottable_v3//:package/build/src/drawers/areaDrawer.d.ts",
+            "@com_palantir_plottable_v3//:package/build/src/drawers/canvasBuffer.d.ts",
+            "@com_palantir_plottable_v3//:package/build/src/drawers/canvasDrawer.d.ts",
+            "@com_palantir_plottable_v3//:package/build/src/drawers/drawStep.d.ts",
+            "@com_palantir_plottable_v3//:package/build/src/drawers/drawer.d.ts",
+            "@com_palantir_plottable_v3//:package/build/src/drawers/lineDrawer.d.ts",
+            "@com_palantir_plottable_v3//:package/build/src/drawers/rectangleDrawer.d.ts",
+            "@com_palantir_plottable_v3//:package/build/src/drawers/segmentDrawer.d.ts",
+            "@com_palantir_plottable_v3//:package/build/src/drawers/svgDrawer.d.ts",
+            "@com_palantir_plottable_v3//:package/build/src/drawers/symbolDrawer.d.ts",
+        ],
+        "Plottable.Interactions": [
+            "@com_palantir_plottable_v3//:package/build/src/interactions/clickInteraction.d.ts",
+            "@com_palantir_plottable_v3//:package/build/src/interactions/dragInteraction.d.ts",
+            "@com_palantir_plottable_v3//:package/build/src/interactions/interaction.d.ts",
+            "@com_palantir_plottable_v3//:package/build/src/interactions/keyInteraction.d.ts",
+            "@com_palantir_plottable_v3//:package/build/src/interactions/panZoomInteraction.d.ts",
+            "@com_palantir_plottable_v3//:package/build/src/interactions/pointerInteraction.d.ts",
+        ],
+        "Plottable.Plots": [
+            "@com_palantir_plottable_v3//:package/build/src/plots/areaPlot.d.ts",
+            "@com_palantir_plottable_v3//:package/build/src/plots/barPlot.d.ts",
+            "@com_palantir_plottable_v3//:package/build/src/plots/clusteredBarPlot.d.ts",
+            "@com_palantir_plottable_v3//:package/build/src/plots/commons.d.ts",
+            "@com_palantir_plottable_v3//:package/build/src/plots/linePlot.d.ts",
+            "@com_palantir_plottable_v3//:package/build/src/plots/piePlot.d.ts",
+            "@com_palantir_plottable_v3//:package/build/src/plots/plot.d.ts",
+            "@com_palantir_plottable_v3//:package/build/src/plots/rectanglePlot.d.ts",
+            "@com_palantir_plottable_v3//:package/build/src/plots/scatterPlot.d.ts",
+            "@com_palantir_plottable_v3//:package/build/src/plots/segmentPlot.d.ts",
+            "@com_palantir_plottable_v3//:package/build/src/plots/stackedAreaPlot.d.ts",
+            "@com_palantir_plottable_v3//:package/build/src/plots/stackedBarPlot.d.ts",
+            "@com_palantir_plottable_v3//:package/build/src/plots/waterfallPlot.d.ts",
+            "@com_palantir_plottable_v3//:package/build/src/plots/xyPlot.d.ts",
+        ],
+        "Plottable.Scales": [
+            "@com_palantir_plottable_v3//:package/build/src/scales/index.d.ts",
+            "@com_palantir_plottable_v3//:package/build/src/scales/categoryScale.d.ts",
+            "@com_palantir_plottable_v3//:package/build/src/scales/colorScale.d.ts",
+            "@com_palantir_plottable_v3//:package/build/src/scales/interpolatedColorScale.d.ts",
+            "@com_palantir_plottable_v3//:package/build/src/scales/linearScale.d.ts",
+            "@com_palantir_plottable_v3//:package/build/src/scales/modifiedLogScale.d.ts",
+            "@com_palantir_plottable_v3//:package/build/src/scales/quantitativeScale.d.ts",
+            "@com_palantir_plottable_v3//:package/build/src/scales/scale.d.ts",
+            "@com_palantir_plottable_v3//:package/build/src/scales/timeScale.d.ts",
+        ],
+        "Plottable.Scales.TickGenerators": [
+            "@com_palantir_plottable_v3//:package/build/src/scales/tickGenerators.d.ts",
+        ],
+        "Plottable.Utils": [
+            "@com_palantir_plottable_v3//:package/build/src/utils/addD3SelectionMulti.d.ts",
+            "@com_palantir_plottable_v3//:package/build/src/utils/bucket.d.ts",
+            "@com_palantir_plottable_v3//:package/build/src/utils/callbackSet.d.ts",
+            "@com_palantir_plottable_v3//:package/build/src/utils/coerceD3.d.ts",
+            "@com_palantir_plottable_v3//:package/build/src/utils/entityStore.d.ts",
+            "@com_palantir_plottable_v3//:package/build/src/utils/makeEnum.d.ts",
+            "@com_palantir_plottable_v3//:package/build/src/utils/map.d.ts",
+            "@com_palantir_plottable_v3//:package/build/src/utils/set.d.ts",
+            "@com_palantir_plottable_v3//:package/build/src/utils/transformAwareTranslator.d.ts",
+        ],
+        "Plottable.Utils.Array": [
+            "@com_palantir_plottable_v3//:package/build/src/utils/arrayUtils.d.ts",
+        ],
+        "Plottable.Utils.Color": [
+            "@com_palantir_plottable_v3//:package/build/src/utils/colorUtils.d.ts",
+        ],
+        "Plottable.Utils.DOM": [
+            "@com_palantir_plottable_v3//:package/build/src/utils/domUtils.d.ts",
+        ],
+        "Plottable.Utils.Math": [
+            "@com_palantir_plottable_v3//:package/build/src/utils/mathUtils.d.ts",
+        ],
+        "Plottable.Utils.Stacking": [
+            "@com_palantir_plottable_v3//:package/build/src/utils/stackingUtils.d.ts",
+        ],
+        "Plottable.Utils.Window": [
+            "@com_palantir_plottable_v3//:package/build/src/utils/windowUtils.d.ts",
+        ],
+    },
+    namespace_symbol_aliases = {
+        "Plottable.Animators": {
+            "AttributeToAppliedProjector": "Plottable.AttributeToAppliedProjector",
+            "SimpleSelection": "Plottable.SimpleSelection",
+        },
+        "Plottable.Axes": {
+            "Component": "Plottable.Components.Component",
+            "Formatter": "Plottable.Formatters.Formatter",
+            "Point": "Plottable.Point",
+            "QuantitativeScale": "Plottable.Scales.QuantitativeScale",
+            "Scale": "Plottable.Scales.Scale",
+            "Scales": "Plottable.Scales",
+            "SimpleSelection": "Plottable.SimpleSelection",
+            "SpaceRequest": "Plottable.SpaceRequest",
+        },
+        "Plottable.Components": {
+            "Bounds": "Plottable.Bounds",
+            "Formatter": "Plottable.Formatters.Formatter",
+            "IEntity": "Plottable.IEntity",
+            "Interactions": "Plottable.Interactions",
+            "Plots": "Plottable.Plots",
+            "Point": "Plottable.Point",
+            "QuantitativeScale": "Plottable.Scales.QuantitativeScale",
+            "Scales": "Plottable.Scales",
+            "SimpleSelection": "Plottable.SimpleSelection",
+            "SpaceRequest": "Plottable.SpaceRequest",
+            "SymbolFactory": "Plottable.SymbolFactories.SymbolFactory",
+        },
+        "Plottable.RenderController": {
+            "Component": "Plottable.Components.Component",
+            "RenderPolicies": "Plottable.RenderPolicies",
+        },
+        "Plottable.SymbolFactories": {
+            "d3Shape": "d3",
+        },
+        "Plottable.Dispatchers": {
+            "Component": "Plottable.Components.Component",
+            "Dispatchers": "Plottable.Dispatchers",
+            "Point": "Plottable.Point",
+        },
+        "Plottable.Drawers": {
+            "AttributeToAppliedProjector": "Plottable.AttributeToAppliedProjector",
+            "AttributeToProjector": "Plottable.AttributeToProjector",
+            "Dataset": "Plottable.Dataset",
+            "IAccessor": "Plottable.IAccessor",
+            "IAnimator": "Plottable.Animators.IAnimator",
+            "SimpleSelection": "Plottable.SimpleSelection",
+            "SymbolFactory": "Plottable.SymbolFactories.SymbolFactory",
+        },
+        "Plottable.Interactions": {
+            "Component": "Plottable.Components.Component",
+            "Point": "Plottable.Point",
+            "TransformableScale": "Plottable.Scales.TransformableScale",
+        },
+        "Plottable.Plots": {
+            "AppliedDrawStep": "Plottable.Drawers.AppliedDrawStep",
+            "AttributeToProjector": "Plottable.AttributeToProjector",
+            "Bounds": "Plottable.Bounds",
+            "Component": "Plottable.Components.Component",
+            "Dataset": "Plottable.Dataset",
+            "DrawStep": "Plottable.Drawers.DrawStep",
+            "Drawers": "Plottable.Drawers",
+            "Formatter": "Plottable.Formatters.Formatter",
+            "IAccessor": "Plottable.IAccessor",
+            "IAnimator": "Plottable.Animators.IAnimator",
+            "IDrawer": "Plottable.Drawers.IDrawer",
+            "IEntity": "Plottable.IEntity",
+            "IScaleCallback": "Plottable.Scales.IScaleCallback",
+            "Plots": "Plottable.Plots",
+            "Point": "Plottable.Point",
+            "Projector": "Plottable.Projector",
+            "ProxyDrawer": "Plottable.Drawers.ProxyDrawer",
+            "QuantitativeScale": "Plottable.Scales.QuantitativeScale",
+            "Range": "Plottable.Range",
+            "Scale": "Plottable.Scales.Scale",
+            "SimpleSelection": "Plottable.SimpleSelection",
+            "SymbolFactory": "Plottable.SymbolFactories.SymbolFactory",
+            "TransformableScale": "Plottable.Scales.TransformableScale",
+            "Utils": "Plottable.Utils",
+            "d3Shape": "d3",
+        },
+        "Plottable.Scales": {
+            "Dataset": "Plottable.Dataset",
+            "Scales": "Plottable.Scales",
+        },
+        "Plottable.Scales.TickGenerators": {
+            "QuantitativeScale": "Plottable.Scales.QuantitativeScale",
+        },
+        "Plottable.Utils": {
+            "Bounds": "Plottable.Bounds",
+            "Component": "Plottable.Components.Component",
+            "Dataset": "Plottable.Dataset",
+            "IAccessor": "Plottable.IAccessor",
+            "Point": "Plottable.Point",
+            "Range": "Plottable.Range",
+            "SimpleSelection": "Plottable.SimpleSelection",
+            "Utils": "Plottable.Utils",
+        },
+    },
+    namespace_symbol_aliases_public = {
+        "Plottable": {
+            "Axis": "Plottable.Axes.Axis",
+            "AxisOrientation": "Plottable.Axes.AxisOrientation",
+            "ClickCallback": "Plottable.Interactions.ClickCallback",
+            "Component": "Plottable.Components.Component",
+            "ComponentCallback": "Plottable.Components.ComponentCallback",
+            "ComponentContainer": "Plottable.Components.ComponentContainer",
+            "Dispatcher": "Plottable.Dispatchers.Dispatcher",
+            "DragBoxCallback": "Plottable.Components.DragBoxCallback",
+            "DragCallback": "Plottable.Interactions.DragCallback",
+            "EaseFn": "Plottable.Animators.EaseFn",
+            "EaseName": "Plottable.Animators.EaseName",
+            "Easing": "Plottable.Animators.Easing",
+            "Formatter": "Plottable.Formatters.Formatter",
+            "IAnimator": "Plottable.Animators.IAnimator",
+            "IDragLineCallback": "Plottable.Components.IDragLineCallback",
+            "IDrawer": "Plottable.Drawers.IDrawer",
+            "IResizeHandler": "Plottable.Components.IResizeHandler",
+            "IScaleCallback": "Plottable.Scales.IScaleCallback",
+            "Interaction": "Plottable.Interactions.Interaction",
+            "Key": "Plottable.Interactions.Key",
+            "KeyCallback": "Plottable.Interactions.KeyCallback",
+            "Null": "Plottable.Animators.Null",
+            "Plot": "Plottable.Plots.Plot",
+            "PointerCallback": "Plottable.Interactions.PointerCallback",
+            "ProxyDrawer": "Plottable.Drawers.ProxyDrawer",
+            "QuantitativeScale": "Plottable.Scales.QuantitativeScale",
+            "Renderer": "Plottable.Plots.Renderer",
+            "Scale": "Plottable.Scales.Scale",
+            "SymbolFactory": "Plottable.SymbolFactories.SymbolFactory",
+            "TimeInterval": "Plottable.Axes.TimeInterval",
+            "TransformableScale": "Plottable.Scales.TransformableScale",
+            "XAlignment": "Plottable.Components.XAlignment",
+            "XYPlot": "Plottable.Plots.XYPlot",
+            "YAlignment": "Plottable.Components.YAlignment",
+        },
+    },
+)
+
+# Removes the 'declare module' block inside this file, but keeps its content.
+genrule(
+    name = "kludge_d3_transition",
+    srcs = ["@org_definitelytyped_types_d3_transition//:index.d.ts"],
+    outs = ["d3-transition.d.ts"],
+    cmd = "sed '/^declare module/d' $< | awk '/^}$$/ && !p {p++;next}1' >$@",
+)
+
+filegroup(
+    name = "all_files",
+    srcs = glob(["**"]),
+    tags = ["notsan"],
+)
diff --git a/tensorflow/tensorboard/components/tf_imports_d3v4/d3.html b/tensorflow/tensorboard/components/tf_imports_d3v4/d3.html
index d2c6cd8b5f0efbcd4f1cd84c830e9e916d57a2b9..2772db39a85d0aacddb17a6642fe48de9bd60e18 100644
--- a/tensorflow/tensorboard/components/tf_imports_d3v4/d3.html
+++ b/tensorflow/tensorboard/components/tf_imports_d3v4/d3.html
@@ -15,4 +15,36 @@ See the License for the specific language governing permissions and
 limitations under the License.
 -->
 
+<!--
+@license
+d3
+Copyright 2010-2017 Mike Bostock
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without modification,
+are permitted provided that the following conditions are met:
+
+* Redistributions of source code must retain the above copyright notice, this
+  list of conditions and the following disclaimer.
+
+* Redistributions in binary form must reproduce the above copyright notice,
+  this list of conditions and the following disclaimer in the documentation
+  and/or other materials provided with the distribution.
+
+* Neither the name of the author nor the names of contributors may be used to
+  endorse or promote products derived from this software without specific prior
+  written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+-->
+
 <script src="d3.js"></script>
diff --git a/tensorflow/tensorboard/components/tf_imports_d3v4/dagre.html b/tensorflow/tensorboard/components/tf_imports_d3v4/dagre.html
index 48fe39da7936a77b6cb5801481c7a44109e44ba8..1e2f6ef9af63b513f3877ea6679a4a0b600924ca 100644
--- a/tensorflow/tensorboard/components/tf_imports_d3v4/dagre.html
+++ b/tensorflow/tensorboard/components/tf_imports_d3v4/dagre.html
@@ -16,9 +16,30 @@ limitations under the License.
 -->
 
 <!--
-HTML imports are non-blocking thus getting the dependency 'graphlib'
-and 'lodash' via script imports instead.
+@license
+Dagre
+Copyright (c) 2012-2014 Chris Pettitt
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
 -->
-<script src="lodash.js"></script>
-<script src="graphlib.core.js"></script>
+
+<link rel="import" href="lodash.html">
+<link rel="import" href="graphlib.html">
+
 <script src="dagre.core.js"></script>
diff --git a/tensorflow/tensorboard/components/tf_imports_d3v4/graphlib.html b/tensorflow/tensorboard/components/tf_imports_d3v4/graphlib.html
index 4e19f7b008fe876d89c8a88d1067c9b1fd5646e3..783e33be0a6ee7cb2d9f54de38bf434f938eed85 100644
--- a/tensorflow/tensorboard/components/tf_imports_d3v4/graphlib.html
+++ b/tensorflow/tensorboard/components/tf_imports_d3v4/graphlib.html
@@ -15,5 +15,6 @@ See the License for the specific language governing permissions and
 limitations under the License.
 -->
 
-<script src="lodash.js"></script>
+<link rel="import" href="lodash.html">
+
 <script src="graphlib.core.js"></script>
diff --git a/tensorflow/tensorboard/components/tf_imports_d3v4/lodash.html b/tensorflow/tensorboard/components/tf_imports_d3v4/lodash.html
index f92aa8087999567e2f6c038b76e83dedafe05512..cbe35f10505686cb8527a92edc6aa95c164a9ec2 100644
--- a/tensorflow/tensorboard/components/tf_imports_d3v4/lodash.html
+++ b/tensorflow/tensorboard/components/tf_imports_d3v4/lodash.html
@@ -15,4 +15,4 @@ See the License for the specific language governing permissions and
 limitations under the License.
 -->
 
-<script src="lodash.js"></script>
+<script src="../lodash/lodash.js"></script>
diff --git a/tensorflow/tensorboard/components/tf_imports_d3v4/numericjs.html b/tensorflow/tensorboard/components/tf_imports_d3v4/numericjs.html
new file mode 100644
index 0000000000000000000000000000000000000000..7559054aabaa008d8a97a41ede707a56703d4dbb
--- /dev/null
+++ b/tensorflow/tensorboard/components/tf_imports_d3v4/numericjs.html
@@ -0,0 +1,43 @@
+<!--
+@license
+Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+-->
+
+<!--
+@license
+Numeric Javascript
+Copyright (C) 2011 by Sébastien Loisel
+Copyright (c) 2011 Alberto Santini <albertosantini@gmail.com>
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+-->
+
+<script src="numeric.js"></script>
diff --git a/tensorflow/tensorboard/components/tf_imports_d3v4/plottable.html b/tensorflow/tensorboard/components/tf_imports_d3v4/plottable.html
index 57f9c1d6d3ad54b48f7c636b59f5776f74737a8b..2c3e10a7c443ed1377783e35b41c393ae3dfbeb1 100644
--- a/tensorflow/tensorboard/components/tf_imports_d3v4/plottable.html
+++ b/tensorflow/tensorboard/components/tf_imports_d3v4/plottable.html
@@ -15,6 +15,30 @@ See the License for the specific language governing permissions and
 limitations under the License.
 -->
 
+<!--
+@license
+Plottable.js
+Copyright (c) 2014-2017 Palantir Technologies, Inc.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+-->
+
 <link rel="import" href="d3.html">
 <script src="plottable.js"></script>
-<link rel="stylesheet" type="text/css" href="plottable.css">
+<link rel="stylesheet" href="plottable.css">
diff --git a/tensorflow/tensorboard/components/tf_imports_d3v4/threejs.html b/tensorflow/tensorboard/components/tf_imports_d3v4/threejs.html
new file mode 100644
index 0000000000000000000000000000000000000000..d6adad43b034acf640ddeef3420feb2d483d92af
--- /dev/null
+++ b/tensorflow/tensorboard/components/tf_imports_d3v4/threejs.html
@@ -0,0 +1,43 @@
+<!--
+@license
+Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+-->
+
+<!--
+@license
+three.js
+Copyright (c) 2010-2013 three.js authors
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+-->
+
+<script src="three.js"></script>
+<script src="OrbitControls.js"></script>
diff --git a/tensorflow/tensorboard/components/tf_imports_d3v4/weblas.html b/tensorflow/tensorboard/components/tf_imports_d3v4/weblas.html
new file mode 100644
index 0000000000000000000000000000000000000000..054d04ea85e16cc31e8cf248d3db86cd5262ab2d
--- /dev/null
+++ b/tensorflow/tensorboard/components/tf_imports_d3v4/weblas.html
@@ -0,0 +1,42 @@
+<!--
+@license
+Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+-->
+
+<!--
+@license
+weblas
+Copyright (c) 2015 Waylon Flinn
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+-->
+
+<script src="weblas.js"></script>
diff --git a/tensorflow/tensorboard/components/tf_option_selector_d3v4/BUILD b/tensorflow/tensorboard/components/tf_option_selector_d3v4/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..db5d07b0955b3f447be4fe2185509f148c0036d5
--- /dev/null
+++ b/tensorflow/tensorboard/components/tf_option_selector_d3v4/BUILD
@@ -0,0 +1,21 @@
+package(default_visibility = ["//tensorflow:internal"])
+
+load("@io_bazel_rules_closure//closure:defs.bzl", "web_library")
+
+licenses(["notice"])  # Apache 2.0
+
+web_library(
+    name = "tf_option_selector_d3v4",
+    srcs = ["tf-option-selector.html"],
+    path = "/tf-option-selector",
+    deps = [
+        "//tensorflow/tensorboard/components/tf_dashboard_common_d3v4",
+        "@org_polymer",
+    ],
+)
+
+filegroup(
+    name = "all_files",
+    srcs = glob(["**"]),
+    tags = ["notsan"],
+)
diff --git a/tensorflow/tensorboard/components/tf_option_selector_d3v4/tf-option-selector.html b/tensorflow/tensorboard/components/tf_option_selector_d3v4/tf-option-selector.html
new file mode 100644
index 0000000000000000000000000000000000000000..d6fc9d6861ffc6c12098da224c5fea16997e6ff3
--- /dev/null
+++ b/tensorflow/tensorboard/components/tf_option_selector_d3v4/tf-option-selector.html
@@ -0,0 +1,94 @@
+<!--
+@license
+Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+-->
+
+<link rel="import" href="../polymer/polymer.html">
+<link rel="import" href="../tf-dashboard-common/tensorboard-color.html">
+
+<!--
+tf-option-selector is a simple component that has buttons as content and
+provides a "selectedId" property that is one of the IDs of the buttons inside it.
+-->
+<dom-module id="tf-option-selector">
+  <template>
+    <div id="wrap">
+      <h3>[[name]]</h3>
+      <div class="content-wrapper"><content></content></div>
+    </div>
+    <style>
+      .content-wrapper ::content > * {
+        width: 30%;
+        font-size: 13px;
+        background: none;
+        margin-top: 10px;
+        color: var(--tb-ui-dark-accent);
+      }
+
+      .content-wrapper ::content :first-of-type {
+        margin-left: 0;
+      }
+
+      .content-wrapper ::content .selected {
+        background-color: var(--tb-ui-dark-accent);
+        color: white!important;
+      }
+
+      h3 {
+        color: var(--paper-grey-800);
+        margin: 0;
+        font-weight: normal;
+        font-size: 14px;
+        margin-bottom: 5px;
+        display: block;
+        pointer-events: none;
+      }
+    </style>
+  </template>
+  <script>
+    Polymer({
+      is: "tf-option-selector",
+      properties: {
+        name: String,
+        selectedId: {
+          type: String,
+          notify: true,
+          observer: '_selectedIdChanged'
+        }
+      },
+      attached: function() {
+        this.async(function() {
+          this.getEffectiveChildren().forEach(function(node) {
+            this.listen(node, 'tap', '_selectTarget');
+          }.bind(this));
+        });
+      },
+      _selectTarget: function(e) {
+        this.selectedId = e.currentTarget.id;
+      },
+      _selectedIdChanged: function() {
+        var selected = this.queryEffectiveChildren('#' + this.selectedId);
+        if (!selected) {
+          return;
+        }
+
+        this.getEffectiveChildren().forEach(function(node) {
+          node.classList.remove("selected");
+        });
+        selected.classList.add("selected");
+      }
+    });
+  </script>
+</dom-module>
diff --git a/tensorflow/tensorboard/components/tf_scalar_dashboard/BUILD b/tensorflow/tensorboard/components/tf_scalar_dashboard/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..c1fbf80c4b11e4dae94d03014b7042530d42de8c
--- /dev/null
+++ b/tensorflow/tensorboard/components/tf_scalar_dashboard/BUILD
@@ -0,0 +1,77 @@
+package(default_visibility = ["//tensorflow:internal"])
+
+load("@io_bazel_rules_closure//closure:defs.bzl", "web_library")
+load("//tensorflow/tensorboard:defs.bzl", "tensorboard_ts_library")
+load("//tensorflow/tensorboard:defs.bzl", "tensorboard_webcomponent_library")
+
+licenses(["notice"])  # Apache 2.0
+
+web_library(
+    name = "tf_scalar_dashboard",
+    srcs = [
+        "tf-scalar-dashboard.html",
+        "tf-smoothing-input.html",
+    ],
+    path = "/tf-scalar-dashboard",
+    deps = [
+        "//tensorflow/tensorboard/components/tf_backend",
+        "//tensorflow/tensorboard/components/tf_color_scale",
+        "//tensorflow/tensorboard/components/tf_dashboard_common",
+        "//tensorflow/tensorboard/components/tf_imports:lodash",
+        "//tensorflow/tensorboard/components/vz_line_chart",
+        "@org_polymer",
+        "@org_polymer_iron_collapse",
+        "@org_polymer_paper_checkbox",
+        "@org_polymer_paper_dropdown_menu",
+        "@org_polymer_paper_icon_button",
+        "@org_polymer_paper_input",
+        "@org_polymer_paper_item",
+        "@org_polymer_paper_menu",
+        "@org_polymer_paper_slider",
+        "@org_polymer_paper_styles",
+    ],
+)
+
+filegroup(
+    name = "all_files",
+    srcs = glob(["**"]),
+    tags = ["notsan"],
+)
+
+################################################################################
+# MARKED FOR DELETION
+
+tensorboard_webcomponent_library(
+    name = "legacy",
+    srcs = [
+        "tf-scalar-dashboard.html",
+        "tf-smoothing-input.html",
+    ],
+    destdir = "tf-scalar-dashboard",
+    deps = [
+        "//tensorflow/tensorboard/components:tf_imports",
+        "//tensorflow/tensorboard/components/tf_backend:legacy",
+        "//tensorflow/tensorboard/components/tf_color_scale:legacy",
+        "//tensorflow/tensorboard/components/tf_dashboard_common:legacy",
+        "//tensorflow/tensorboard/components/vz_line_chart:legacy",
+        "//third_party/javascript/polymer/v1/iron-collapse:lib",
+        "//third_party/javascript/polymer/v1/paper-checkbox:lib",
+        "//third_party/javascript/polymer/v1/paper-dropdown-menu:lib",
+        "//third_party/javascript/polymer/v1/paper-icon-button:lib",
+        "//third_party/javascript/polymer/v1/paper-input:lib",
+        "//third_party/javascript/polymer/v1/paper-item:lib",
+        "//third_party/javascript/polymer/v1/paper-menu:lib",
+        "//third_party/javascript/polymer/v1/paper-slider:lib",
+        "//third_party/javascript/polymer/v1/paper-styles:lib",
+        "//third_party/javascript/polymer/v1/polymer:lib",
+    ],
+)
+
+# This is needed: components/BUILD seeks a legacy_ts rule in this package.
+tensorboard_ts_library(
+    name = "legacy_ts",
+    srcs = [],
+    deps_mgmt = "off",
+    runtime = "nodejs",
+    deps = ["//tensorflow/tensorboard/components:common_deps"],
+)
diff --git a/tensorflow/tensorboard/components/tf_scalar_dashboard/demo/BUILD b/tensorflow/tensorboard/components/tf_scalar_dashboard/demo/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..6e85abf2d79270a4f76dd6bbfa4e451d1886e28a
--- /dev/null
+++ b/tensorflow/tensorboard/components/tf_scalar_dashboard/demo/BUILD
@@ -0,0 +1,26 @@
+package(default_visibility = ["//tensorflow:internal"])
+
+load("@io_bazel_rules_closure//closure:defs.bzl", "web_library")
+
+licenses(["notice"])  # Apache 2.0
+
+# bazel run //third_party/tensorflow/tensorboard/components/tf_scalar_dashboard/demo
+web_library(
+    name = "demo",
+    srcs = ["index.html"],
+    path = "/tf-scalar-dashboard/demo",
+    deps = [
+        "//tensorflow/tensorboard/components/tf_imports:d3",
+        "//tensorflow/tensorboard/components/tf_scalar_dashboard",
+        "//tensorflow/tensorboard/components/tf_scalar_dashboard/demo/data",
+        "@org_polymer_iron_demo_helpers",
+        "@org_polymer_paper_styles",
+        "@org_polymer_webcomponentsjs",
+    ],
+)
+
+filegroup(
+    name = "all_files",
+    srcs = glob(["**"]),
+    tags = ["notsan"],
+)
diff --git a/tensorflow/tensorboard/components/tf_scalar_dashboard/demo/data/BUILD b/tensorflow/tensorboard/components/tf_scalar_dashboard/demo/data/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..c8219c07549992e611fe2b3947aff03bfd0e3ae0
--- /dev/null
+++ b/tensorflow/tensorboard/components/tf_scalar_dashboard/demo/data/BUILD
@@ -0,0 +1,17 @@
+package(default_visibility = ["//tensorflow:internal"])
+
+load("@io_bazel_rules_closure//closure:defs.bzl", "web_library")
+
+licenses(["notice"])  # Apache 2.0
+
+web_library(
+    name = "data",
+    srcs = glob(["*"]),
+    path = "/tf-scalar-dashboard/demo/data",
+)
+
+filegroup(
+    name = "all_files",
+    srcs = glob(["**"]),
+    tags = ["notsan"],
+)
diff --git a/tensorflow/tensorboard/components/tf_scalar_dashboard_d3v4/BUILD b/tensorflow/tensorboard/components/tf_scalar_dashboard_d3v4/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..3b9f38feabd974dd4ece871857e4e6cbbf18ce06
--- /dev/null
+++ b/tensorflow/tensorboard/components/tf_scalar_dashboard_d3v4/BUILD
@@ -0,0 +1,37 @@
+package(default_visibility = ["//tensorflow:internal"])
+
+load("@io_bazel_rules_closure//closure:defs.bzl", "web_library")
+
+licenses(["notice"])  # Apache 2.0
+
+web_library(
+    name = "tf_scalar_dashboard_d3v4",
+    srcs = [
+        "tf-scalar-dashboard.html",
+        "tf-smoothing-input.html",
+    ],
+    path = "/tf-scalar-dashboard",
+    deps = [
+        "//tensorflow/tensorboard/components/tf_backend_d3v4",
+        "//tensorflow/tensorboard/components/tf_color_scale_d3v4",
+        "//tensorflow/tensorboard/components/tf_dashboard_common_d3v4",
+        "//tensorflow/tensorboard/components/tf_imports_d3v4:lodash",
+        "//tensorflow/tensorboard/components/vz_line_chart_d3v4",
+        "@org_polymer",
+        "@org_polymer_iron_collapse",
+        "@org_polymer_paper_checkbox",
+        "@org_polymer_paper_dropdown_menu",
+        "@org_polymer_paper_icon_button",
+        "@org_polymer_paper_input",
+        "@org_polymer_paper_item",
+        "@org_polymer_paper_menu",
+        "@org_polymer_paper_slider",
+        "@org_polymer_paper_styles",
+    ],
+)
+
+filegroup(
+    name = "all_files",
+    srcs = glob(["**"]),
+    tags = ["notsan"],
+)
diff --git a/tensorflow/tensorboard/components/tf_scalar_dashboard_d3v4/demo/BUILD b/tensorflow/tensorboard/components/tf_scalar_dashboard_d3v4/demo/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..e3977205cbd1c44a1dbdbfa6d33396bec96f7953
--- /dev/null
+++ b/tensorflow/tensorboard/components/tf_scalar_dashboard_d3v4/demo/BUILD
@@ -0,0 +1,27 @@
+package(default_visibility = ["//tensorflow:internal"])
+
+load("@io_bazel_rules_closure//closure:defs.bzl", "web_library")
+
+licenses(["notice"])  # Apache 2.0
+
+web_library(
+    name = "demo",
+    srcs = ["index.html"],
+    path = "/tf-scalar-dashboard/demo",
+    deps = [
+        "//tensorflow/tensorboard/components/tf_backend_d3v4",
+        "//tensorflow/tensorboard/components/tf_imports_d3v4:d3",
+        "//tensorflow/tensorboard/components/tf_scalar_dashboard_d3v4",
+        "//tensorflow/tensorboard/demo:demo_data",
+        "@org_polymer",
+        "@org_polymer_iron_demo_helpers",
+        "@org_polymer_paper_styles",
+        "@org_polymer_webcomponentsjs",
+    ],
+)
+
+filegroup(
+    name = "all_files",
+    srcs = glob(["**"]),
+    tags = ["notsan"],
+)
diff --git a/tensorflow/tensorboard/components/tf_scalar_dashboard_d3v4/demo/data/logdir b/tensorflow/tensorboard/components/tf_scalar_dashboard_d3v4/demo/data/logdir
new file mode 100644
index 0000000000000000000000000000000000000000..b6362b45d777266d6204b23884222a080f789f71
--- /dev/null
+++ b/tensorflow/tensorboard/components/tf_scalar_dashboard_d3v4/demo/data/logdir
@@ -0,0 +1 @@
+{"logdir": "/foo/some/fake/logdir"}
\ No newline at end of file
diff --git a/tensorflow/tensorboard/components/tf_scalar_dashboard_d3v4/demo/data/runs.json b/tensorflow/tensorboard/components/tf_scalar_dashboard_d3v4/demo/data/runs.json
new file mode 100644
index 0000000000000000000000000000000000000000..d45f530763cb786777c5650eecd0ebaf91b9863f
--- /dev/null
+++ b/tensorflow/tensorboard/components/tf_scalar_dashboard_d3v4/demo/data/runs.json
@@ -0,0 +1,4 @@
+{
+  "run1": {"scalars": ["foo/sin", "foo/cos", "foo/square", "bar/square"]},
+  "run2": {"scalars": ["foo/cos", "foo/square", "bar/square"]}
+}
\ No newline at end of file
diff --git a/tensorflow/tensorboard/components/tf_scalar_dashboard_d3v4/demo/data/scalars.json b/tensorflow/tensorboard/components/tf_scalar_dashboard_d3v4/demo/data/scalars.json
new file mode 100644
index 0000000000000000000000000000000000000000..bc269395b68a35f7d4481fca05063e46c79c2859
--- /dev/null
+++ b/tensorflow/tensorboard/components/tf_scalar_dashboard_d3v4/demo/data/scalars.json
@@ -0,0 +1 @@
+{"run2": {"foo/cos": [[0.0, 0, 2.0], [10.0, 1, 1.0806045532226562], [20.0, 2, -0.832293689250946], [30.0, 3, -1.979984998703003], [40.0, 4, -1.3072872161865234]], "bar/square": [[0.0, 0, 0.0], [10.0, 1, 1.0], [20.0, 2, 4.0], [30.0, 3, 9.0], [40.0, 4, 16.0]], "foo/square": [[0.0, 0, 0.0], [10.0, 1, 2.0], [20.0, 2, 8.0], [30.0, 3, 18.0], [40.0, 4, 32.0]]}, "run1": {"foo/sin": [[0.0, 0, 0.0], [10.0, 1, 0.8414709568023682], [20.0, 2, 0.9092974066734314], [30.0, 3, 0.14112000167369843], [40.0, 4, -0.756802499294281]], "foo/cos": [[0.0, 0, 1.0], [10.0, 1, 0.5403022766113281], [20.0, 2, -0.416146844625473], [30.0, 3, -0.9899924993515015], [40.0, 4, -0.6536436080932617]], "bar/square": [[0.0, 0, 0.0], [10.0, 1, 1.0], [20.0, 2, 4.0], [30.0, 3, 9.0], [40.0, 4, 16.0]], "foo/square": [[0.0, 0, 0.0], [10.0, 1, 1.0], [20.0, 2, 4.0], [30.0, 3, 9.0], [40.0, 4, 16.0]]}}
\ No newline at end of file
diff --git a/tensorflow/tensorboard/components/tf_scalar_dashboard_d3v4/demo/data/scalars_run_run1_tag_bar_2Fsquare.json b/tensorflow/tensorboard/components/tf_scalar_dashboard_d3v4/demo/data/scalars_run_run1_tag_bar_2Fsquare.json
new file mode 100644
index 0000000000000000000000000000000000000000..6d584fb4a9e1cd0a6a56d3d87b7183f55ac52ba6
--- /dev/null
+++ b/tensorflow/tensorboard/components/tf_scalar_dashboard_d3v4/demo/data/scalars_run_run1_tag_bar_2Fsquare.json
@@ -0,0 +1 @@
+[[0.0, 0, 0.0], [10.0, 1, 1.0], [20.0, 2, 4.0], [30.0, 3, 9.0], [40.0, 4, 16.0]]
\ No newline at end of file
diff --git a/tensorflow/tensorboard/components/tf_scalar_dashboard_d3v4/demo/data/scalars_run_run1_tag_foo_2Fcos.json b/tensorflow/tensorboard/components/tf_scalar_dashboard_d3v4/demo/data/scalars_run_run1_tag_foo_2Fcos.json
new file mode 100644
index 0000000000000000000000000000000000000000..025eaa16e93110da0c50ad03486786ee6e521700
--- /dev/null
+++ b/tensorflow/tensorboard/components/tf_scalar_dashboard_d3v4/demo/data/scalars_run_run1_tag_foo_2Fcos.json
@@ -0,0 +1 @@
+[[0.0, 0, 1.0], [10.0, 1, 0.5403022766113281], [20.0, 2, -0.416146844625473], [30.0, 3, -0.9899924993515015], [40.0, 4, -0.6536436080932617]]
\ No newline at end of file
diff --git a/tensorflow/tensorboard/components/tf_scalar_dashboard_d3v4/demo/data/scalars_run_run1_tag_foo_2Fsin.json b/tensorflow/tensorboard/components/tf_scalar_dashboard_d3v4/demo/data/scalars_run_run1_tag_foo_2Fsin.json
new file mode 100644
index 0000000000000000000000000000000000000000..eae69dd78f3b5aa75acec6b5daa08720fad9adba
--- /dev/null
+++ b/tensorflow/tensorboard/components/tf_scalar_dashboard_d3v4/demo/data/scalars_run_run1_tag_foo_2Fsin.json
@@ -0,0 +1 @@
+[[0.0, 0, 0.0], [10.0, 1, 0.8414709568023682], [20.0, 2, 0.9092974066734314], [30.0, 3, 0.14112000167369843], [40.0, 4, -0.756802499294281]]
\ No newline at end of file
diff --git a/tensorflow/tensorboard/components/tf_scalar_dashboard_d3v4/demo/data/scalars_run_run1_tag_foo_2Fsquare.json b/tensorflow/tensorboard/components/tf_scalar_dashboard_d3v4/demo/data/scalars_run_run1_tag_foo_2Fsquare.json
new file mode 100644
index 0000000000000000000000000000000000000000..6d584fb4a9e1cd0a6a56d3d87b7183f55ac52ba6
--- /dev/null
+++ b/tensorflow/tensorboard/components/tf_scalar_dashboard_d3v4/demo/data/scalars_run_run1_tag_foo_2Fsquare.json
@@ -0,0 +1 @@
+[[0.0, 0, 0.0], [10.0, 1, 1.0], [20.0, 2, 4.0], [30.0, 3, 9.0], [40.0, 4, 16.0]]
\ No newline at end of file
diff --git a/tensorflow/tensorboard/components/tf_scalar_dashboard_d3v4/demo/data/scalars_run_run2_tag_bar_2Fsquare.json b/tensorflow/tensorboard/components/tf_scalar_dashboard_d3v4/demo/data/scalars_run_run2_tag_bar_2Fsquare.json
new file mode 100644
index 0000000000000000000000000000000000000000..6d584fb4a9e1cd0a6a56d3d87b7183f55ac52ba6
--- /dev/null
+++ b/tensorflow/tensorboard/components/tf_scalar_dashboard_d3v4/demo/data/scalars_run_run2_tag_bar_2Fsquare.json
@@ -0,0 +1 @@
+[[0.0, 0, 0.0], [10.0, 1, 1.0], [20.0, 2, 4.0], [30.0, 3, 9.0], [40.0, 4, 16.0]]
\ No newline at end of file
diff --git a/tensorflow/tensorboard/components/tf_scalar_dashboard_d3v4/demo/data/scalars_run_run2_tag_foo_2Fcos.json b/tensorflow/tensorboard/components/tf_scalar_dashboard_d3v4/demo/data/scalars_run_run2_tag_foo_2Fcos.json
new file mode 100644
index 0000000000000000000000000000000000000000..dd3593f9d109e81bef5a10c732a9e08e60b3ef4f
--- /dev/null
+++ b/tensorflow/tensorboard/components/tf_scalar_dashboard_d3v4/demo/data/scalars_run_run2_tag_foo_2Fcos.json
@@ -0,0 +1 @@
+[[0.0, 0, 2.0], [10.0, 1, 1.0806045532226562], [20.0, 2, -0.832293689250946], [30.0, 3, -1.979984998703003], [40.0, 4, -1.3072872161865234]]
\ No newline at end of file
diff --git a/tensorflow/tensorboard/components/tf_scalar_dashboard_d3v4/demo/data/scalars_run_run2_tag_foo_2Fsquare.json b/tensorflow/tensorboard/components/tf_scalar_dashboard_d3v4/demo/data/scalars_run_run2_tag_foo_2Fsquare.json
new file mode 100644
index 0000000000000000000000000000000000000000..0ff9ef0551d0a3053ba16b502d0d6148057df660
--- /dev/null
+++ b/tensorflow/tensorboard/components/tf_scalar_dashboard_d3v4/demo/data/scalars_run_run2_tag_foo_2Fsquare.json
@@ -0,0 +1 @@
+[[0.0, 0, 0.0], [10.0, 1, 2.0], [20.0, 2, 8.0], [30.0, 3, 18.0], [40.0, 4, 32.0]]
\ No newline at end of file
diff --git a/tensorflow/tensorboard/components/tf_scalar_dashboard_d3v4/demo/index.html b/tensorflow/tensorboard/components/tf_scalar_dashboard_d3v4/demo/index.html
new file mode 100644
index 0000000000000000000000000000000000000000..7429c87b873ec1d8fe2827c0f2215aa205a8f5c7
--- /dev/null
+++ b/tensorflow/tensorboard/components/tf_scalar_dashboard_d3v4/demo/index.html
@@ -0,0 +1,64 @@
+<!doctype html>
+<!--
+@license
+Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+-->
+
+<script src="../../webcomponentsjs/webcomponents-lite.min.js"></script>
+<link rel="import" href="../../polymer/polymer.html">
+<link rel="import" href="../../iron-demo-helpers/demo-snippet.html">
+<link rel="import" href="../tf-scalar-dashboard.html">
+<link rel="import" href="../../paper-styles/typography.html">
+<link rel="import" href="../../tf-backend/tf-backend.html">
+
+<title>Scalar Dashboard Demo</title>
+<style>
+  #container {
+    height: 900px;
+    width: 100%;
+    display: block;
+  }
+
+  html, body {
+    margin: 0;
+    padding: 0;
+    font-family: "RobotoDraft","Roboto",sans-serif;
+  }
+
+</style>
+<demo-snippet>
+  <template>
+    <dom-module id="scalar-dash-demo">
+      <template>
+        <tf-scalar-dashboard id="demo" backend="[[backend]]"></tf-scalar-dashboard>
+      </template>
+      <script>
+        Polymer({
+          is: "scalar-dash-demo",
+          properties: {
+            backend: {
+              type: Object,
+              value: function() {
+                var router = new TF.Backend.router("/data", true);
+                return new TF.Backend.Backend(router);
+              },
+            },
+          },
+        });
+      </script>
+    </dom-module>
+    <scalar-dash-demo id="container"></scalar-dash-demo>
+  </template>
+</demo-snippet>
diff --git a/tensorflow/tensorboard/components/tf_scalar_dashboard_d3v4/tf-scalar-dashboard.html b/tensorflow/tensorboard/components/tf_scalar_dashboard_d3v4/tf-scalar-dashboard.html
new file mode 100644
index 0000000000000000000000000000000000000000..b91cd90c0371e90d5f5abc4cf07ce297ba56c386
--- /dev/null
+++ b/tensorflow/tensorboard/components/tf_scalar_dashboard_d3v4/tf-scalar-dashboard.html
@@ -0,0 +1,270 @@
+<!--
+@license
+Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+-->
+
+<link rel="import" href="../polymer/polymer.html">
+<link rel="import" href="tf-smoothing-input.html">
+<link rel="import" href="../tf-backend/tf-backend.html">
+<link rel="import" href="../tf-color-scale/tf-color-scale.html">
+<link rel="import" href="../tf-dashboard-common/tf-dashboard.html">
+<link rel="import" href="../tf-dashboard-common/tf-option-selector.html">
+<link rel="import" href="../tf-dashboard-common/tf-panes-helper.html">
+<link rel="import" href="../tf-dashboard-common/tf-sidebar-helper.html">
+<link rel="import" href="../tf-imports/lodash.html">
+<link rel="import" href="../vz-line-chart/vz-line-chart.html">
+<link rel="import" href="../iron-collapse/iron-collapse.html">
+<link rel="import" href="../paper-dropdown-menu/paper-dropdown-menu.html">
+<link rel="import" href="../paper-icon-button/paper-icon-button.html">
+<link rel="import" href="../paper-menu/paper-menu.html">
+<link rel="import" href="../paper-item/paper-item.html">
+
+<!--
+tf-scalar-dashboard is a complete frontend that loads runs from a backend,
+and creates chart panes that display data for those runs.
+
+It provides a categorizer, run selector, and x type selector, by which the user
+can customize how data is organized and displayed.
+
+Each chart has a button that can toggle whether it is "expanded"; expanded
+charts are larger.
+
+Organizationally, the #plumbing div contains components that have no concrete
+manifestation and just effect data bindings or data loading. The .sidebar div
+contains shared controls provided by tf-sidebar-helper. The .center div
+contains vz-line-charts embedded inside tf-panes-helper's.
+-->
+<dom-module id="tf-scalar-dashboard">
+  <template>
+    <div id="plumbing">
+      <tf-color-scale
+        id="colorScale"
+        runs="[[runs]]"
+        out-color-scale="{{_colorScale}}"
+      ></tf-color-scale>
+    </div>
+
+    <tf-dashboard-layout>
+      <div class="sidebar">
+        <tf-sidebar-helper
+          backend="[[backend]]"
+          categories="{{_categories}}"
+          color-scale="[[_colorScale]]"
+          run2tag="[[run2tag]]"
+          runs="[[runs]]"
+          selected-runs="{{_selectedRuns}}"
+          >
+          <div class="extend-first-section">
+            <div class="line-item">
+              <paper-checkbox
+              id="download-option"
+              checked="{{_showDownloadLinks}}"
+              >Show data download links</paper-checkbox>
+            </div>
+              <div class="line-item">
+                <paper-checkbox
+                id="outliersCheckbox"
+                checked="{{_ignoreYOutliers}}"
+                >Ignore outliers in chart scaling</paper-checkbox>
+              </div>
+            <div id="tooltip-sorting">
+              <div id="tooltip-sorting-label">Tooltip sorting method:</div>
+              <paper-dropdown-menu
+                no-label-float
+                selected-item-label="{{_tooltipSortingMethod}}"
+                >
+                <paper-menu class="dropdown-content" selected="0">
+                  <paper-item>default</paper-item>
+                  <paper-item>descending</paper-item>
+                  <paper-item>ascending</paper-item>
+                  <paper-item>nearest</paper-item>
+                </paper-menu>
+              </paper-dropdown-menu>
+            </div>
+          </div>
+          <div class="sidebar-section">
+            <tf-smoothing-input
+              weight="{{_smoothingWeight}}"
+              step="0.001"
+              min="0"
+              max="1"
+              ></tf-smoothing-input>
+          </div>
+          <div class="sidebar-section">
+            <tf-option-selector
+              id="xTypeSelector"
+              name="Horizontal Axis"
+              selected-id="{{_xType}}"
+              >
+              <paper-button id="step">step</paper-button>
+              <paper-button id="relative">relative</paper-button>
+              <paper-button id="wall_time">wall</paper-button>
+            </tf-option-selector>
+          </div>
+        </tf-sidebar-helper>
+      </div>
+      <div class="center">
+        <tf-panes-helper
+          categories="[[_categories]]"
+          color-scale="[[_colorScale]]"
+          data-type="[[dataType]]"
+          data-provider="[[dataProvider]]"
+          data-not-found="[[dataNotFound]]"
+          run2tag="[[run2tag]]"
+          selected-runs="[[_selectedRuns]]"
+          show-download-links="[[_showDownloadLinks]]"
+          download-link-url-function="[[scalarUrl]]"
+          >
+          <template>
+            <vz-line-chart
+              x-type="[[_xType]]"
+              color-scale="[[_colorScale]]"
+              smoothing-enabled="[[_smoothingEnabled]]"
+              smoothing-weight="[[_smoothingWeight]]"
+              tooltip-sorting-method="[[_tooltipSortingMethod]]"
+              ignore-y-outliers="[[_ignoreYOutliers]]"
+              ></vz-line-chart>
+            <paper-icon-button
+              class="log-button"
+              icon="line-weight"
+              on-tap="toggleLogScale"
+              title="Toggle y-axis log scale"
+              ></paper-icon-button>
+          </template>
+        </tf-panes-helper>
+      </div>
+    </tf-dashboard-layout>
+
+    <style include="dashboard-style"></style>
+    <style>
+      .log-button {
+        position: absolute;
+        left: 35px;
+        bottom: -35px;
+        color: #2196F3;
+        background: #fff;
+        width: 32px;
+        height: 32px;
+        padding: 4px;
+        border-radius: 100%;
+      }
+
+      .log-button-selected {
+        background: var(--tb-ui-light-accent);
+      }
+
+      #tooltip-sorting {
+        display: flex;
+        font-size: 14px;
+        margin-top: 5px;
+      }
+
+      #tooltip-sorting-label {
+        margin-top: 13px;
+        margin-left: 28px;
+      }
+
+      #tooltip-sorting paper-dropdown-menu {
+        margin-left: 10px;
+        --paper-input-container-focus-color: var(--tb-orange-strong);
+        width: 105px;
+      }
+      .line-item {
+        display: block;
+        padding-top: 5px;
+      }
+    </style>
+
+  </template>
+
+  <script>
+    TF.Dashboard.TfScalarDashboard = Polymer({
+      is: "tf-scalar-dashboard",
+      factoryImpl: function(backend, router) {
+        this.backend = backend;
+        this.router = router;
+      },
+      behaviors: [
+        TF.Dashboard.DashboardBehavior("scalars"),
+        TF.Dashboard.ReloadBehavior("tf-chart-scaffold"),
+        TF.Backend.BackendBehavior,
+      ],
+      properties: {
+        backend: Object,
+        dataType: {
+          type: String,
+          value: "scalar"
+        },
+        router: Object,
+        scalarUrl: {
+          type: Function,
+          computed: "_getScalarUrl(router)"
+        },
+        _showDownloadLinks: {
+          type: Boolean,
+          notify: true,
+          value: TF.URIStorage.getBooleanInitializer('_showDownloadLinks',
+              false, true),
+          observer: '_showDownloadLinksObserver'
+        },
+        _smoothingWeight: {
+          type: Number,
+          notify: true,
+          value: TF.URIStorage.getNumberInitializer('_smoothingWeight', 0.6),
+          observer: '_smoothingWeightObserver'
+        },
+        _smoothingEnabled: {
+          type: Boolean,
+          computed: '_computeSmoothingEnabled(_smoothingWeight)'
+        },
+        _ignoreYOutliers: {
+          type: Boolean,
+          value: TF.URIStorage.getBooleanInitializer('_ignoreYOutliers', true, true),
+          observer: '_ignoreYOutliersObserver',
+        },
+        _xType: {
+          type: String,
+          value: "step"
+        }
+      },
+      attached: function() {
+        this.async(function() {
+          this.fire("rendered");
+        });
+      },
+      _getScalarUrl: function() {
+        return this.router.scalars;
+      },
+      _showDownloadLinksObserver: TF.URIStorage.getBooleanObserver(
+          '_showDownloadLinks', /*default=*/ false, /*useLocalStorage=*/ true),
+      _smoothingWeightObserver: TF.URIStorage.getNumberObserver(
+          '_smoothingWeight', 0.6),
+      _ignoreYOutliersObserver: TF.URIStorage.getBooleanObserver(
+          '_ignoreYOutliers', /*default=*/ true, /*useLocalStorage=*/true),
+      _computeSmoothingEnabled: function(_smoothingWeight) {
+        return _smoothingWeight > 0;
+      },
+      toggleLogScale: function(e) {
+        var currentTarget = Polymer.dom(e.currentTarget);
+        var button = currentTarget.parentNode.querySelector('.log-button');
+        var chart = currentTarget.parentNode.querySelector('vz-line-chart');
+
+        button.classList.toggle("log-button-selected");
+        chart.yScaleType = chart.yScaleType === 'log' ? 'linear' : 'log';
+        chart.redraw();
+      },
+    });
+  </script>
+</dom-module>
diff --git a/tensorflow/tensorboard/components/tf_scalar_dashboard_d3v4/tf-smoothing-input.html b/tensorflow/tensorboard/components/tf_scalar_dashboard_d3v4/tf-smoothing-input.html
new file mode 100644
index 0000000000000000000000000000000000000000..a0760330001310e3afee0f060b563c11d063ab65
--- /dev/null
+++ b/tensorflow/tensorboard/components/tf_scalar_dashboard_d3v4/tf-smoothing-input.html
@@ -0,0 +1,138 @@
+<!--
+@license
+Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+-->
+
+<link rel="import" href="../polymer/polymer.html">
+<link rel="import" href="../paper-slider/paper-slider.html">
+<link rel="import" href="../paper-input/paper-input.html">
+<link rel="import" href="../paper-checkbox/paper-checkbox.html">
+<link rel="import" href="../tf-imports/lodash.html">
+
+<!--
+tf-smoothing-input creates an input component for exponential smoothing.
+-->
+<dom-module id="tf-smoothing-input">
+  <template>
+    <h3 class="title">Smoothing</h3>
+    <div class="smoothing-block">
+      <paper-slider
+        id="slider"
+        value="{{weight}}"
+        immediate-value="{{_immediateWeightNumberForPaperSlider}}"
+        type="number"
+        step="[[step]]"
+        min="[[min]]"
+        max="[[max]]"
+        ></paper-slider>
+      <paper-input
+        id="input"
+        label="weight"
+        no-label-float
+        value="{{_inputWeightStringForPaperInput}}"
+        type="number"
+        step="[[step]]"
+        min="[[min]]"
+        max="[[max]]"
+        ></paper-input>
+    </div>
+    <style>
+      .title {
+        color: var(--paper-grey-800);
+        margin: 0;
+        font-weight: normal;
+        font-size: 14px;
+        margin-bottom: 5px;
+      }
+
+      .smoothing-block {
+        display: flex;
+      }
+
+      paper-slider {
+        margin-left: 12px;
+        --paper-slider-knob-color: var(--tb-orange-strong);
+        --paper-slider-active-color: var(--tb-orange-strong);
+        flex-grow: 2;
+      }
+
+      paper-input {
+        --paper-input-container-focus-color: var(--tb-orange-strong);
+        --paper-input-container-input: {
+          font-size: 14px;
+        };
+        --paper-input-container-label: {
+          font-size: 14px;
+        };
+        width: 60px;
+      }
+    </style>
+  </template>
+  <script>
+    Polymer({
+      is: "tf-smoothing-input",
+
+      properties: {
+        step: Number,
+        max: Number,
+        min: Number,
+
+        weight: {
+          type: Number,
+          value: 0.6,
+          notify: true
+        },
+
+        _immediateWeightNumberForPaperSlider: {
+          type: Number,
+          notify: true,
+          observer: '_immediateWeightNumberForPaperSliderChanged'
+        },
+
+        // Paper input treats values as strings even if you specify them as
+        // numbers.
+        _inputWeightStringForPaperInput: {
+          type: String,
+          notify: true,
+          observer: '_inputWeightStringForPaperInputChanged'
+        }
+      },
+
+      _updateWeight: _.debounce(function(val) {
+        this.weight = val;
+      }, 250),
+
+      _immediateWeightNumberForPaperSliderChanged: function() {
+        this._inputWeightStringForPaperInput =
+            this._immediateWeightNumberForPaperSlider.toString();
+        this._updateWeight.call(this, this._immediateWeightNumberForPaperSlider);
+      },
+
+      _inputWeightStringForPaperInputChanged: function() {
+        if (+this._inputWeightStringForPaperInput < 0) {
+          this._inputWeightStringForPaperInput = '0';
+        }
+        else if (+this._inputWeightStringForPaperInput > 1) {
+          this._inputWeightStringForPaperInput = '1';
+        }
+
+        var d = +this._inputWeightStringForPaperInput;
+        if (!isNaN(d)) {
+          this._updateWeight.call(this, d);
+        }
+      }
+    });
+  </script>
+</dom-module>
diff --git a/tensorflow/tensorboard/components/tf_storage/BUILD b/tensorflow/tensorboard/components/tf_storage/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..8b2e006367fd6a813d2efdc05c1a3472f6e1516e
--- /dev/null
+++ b/tensorflow/tensorboard/components/tf_storage/BUILD
@@ -0,0 +1,70 @@
+package(default_visibility = ["//tensorflow:internal"])
+
+load("@io_bazel_rules_closure//closure:defs.bzl", "web_library")
+load("//tensorflow/tensorboard:defs.bzl", "tensorboard_ts_library")
+load("//tensorflow/tensorboard:defs.bzl", "tensorboard_typescript_genrule")
+load("//tensorflow/tensorboard:defs.bzl", "tensorboard_webcomponent_library")
+
+licenses(["notice"])  # Apache 2.0
+
+# TODO(dandelion): Add webfiles support for the test code.
+
+web_library(
+    name = "tf_storage",
+    srcs = [
+        "tf-storage.html",
+        ":ts",
+    ],
+    path = "/tf-storage",
+    deps = [
+        "//tensorflow/tensorboard/components/tf_globals",
+        "//tensorflow/tensorboard/components/tf_imports:lodash",
+        "@org_polymer",
+    ],
+)
+
+tensorboard_typescript_genrule(
+    name = "ts",
+    srcs = [
+        "storage.ts",
+    ],
+    typings = [
+        "@org_definitelytyped//:lodash.d.ts",
+        "//tensorflow/tensorboard/components/tf_globals:ts_typings",
+    ],
+)
+
+filegroup(
+    name = "all_files",
+    srcs = glob(["**"]),
+    tags = ["notsan"],
+)
+
+################################################################################
+# MARKED FOR DELETION
+
+tensorboard_webcomponent_library(
+    name = "legacy",
+    srcs = [
+        "tf-storage.html",
+        ":legacy_ts",
+    ],
+    visibility = ["//visibility:public"],
+    destdir = "tf-storage",
+    deps = [
+        "//tensorflow/tensorboard/components:tf_imports",
+        "//tensorflow/tensorboard/components/tf_globals:legacy",
+        "//third_party/javascript/polymer/v1/polymer:lib",
+    ],
+)
+
+tensorboard_ts_library(
+    name = "legacy_ts",
+    srcs = ["storage.ts"],
+    deps_mgmt = "off",
+    runtime = "nodejs",
+    deps = [
+        "//tensorflow/tensorboard/components:common_deps",
+        "//tensorflow/tensorboard/components/tf_globals:legacy_ts",
+    ],
+)
diff --git a/tensorflow/tensorboard/components/tf_storage_d3v4/BUILD b/tensorflow/tensorboard/components/tf_storage_d3v4/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..7a8cde6bde97f75ea6f05e4888cd783bbf979253
--- /dev/null
+++ b/tensorflow/tensorboard/components/tf_storage_d3v4/BUILD
@@ -0,0 +1,52 @@
+package(default_visibility = ["//tensorflow:internal"])
+
+load("@io_bazel_rules_closure//closure:defs.bzl", "web_library")
+load("//tensorflow/tensorboard:hacks.bzl", "tensorboard_typescript_bundle")
+load("//tensorflow/tensorboard:defs.bzl", "tensorboard_typescript_genrule")
+
+licenses(["notice"])  # Apache 2.0
+
+web_library(
+    name = "tf_storage_d3v4",
+    srcs = [
+        "bundle.js",
+        "tf-storage.html",
+    ],
+    path = "/tf-storage",
+    deps = [
+        "//tensorflow/tensorboard/components/tf_globals_d3v4",
+        "//tensorflow/tensorboard/components/tf_imports_d3v4:lodash",
+    ],
+)
+
+tensorboard_typescript_genrule(
+    name = "ts",
+    srcs = ["bundle.ts"],
+    typings = [
+        "@org_definitelytyped//:lodash.d.ts",
+        "@org_definitelytyped//:polymer.d.ts",
+        "@org_definitelytyped//:webcomponents.js.d.ts",
+        "//tensorflow/tensorboard/components/tf_globals_d3v4:bundle.d.ts",
+        "//tensorflow/tensorboard/components/tf_imports_d3v4:d3.d.ts",
+    ],
+)
+
+tensorboard_typescript_bundle(
+    name = "bundle",
+    out = "bundle.ts",
+    namespace_srcs = {"TF.URIStorage": [
+        "storage.ts",
+    ]},
+    namespace_symbol_aliases = {"TF.URIStorage": {
+        "TABS": "TF.Globals.TABS",
+        "USE_HASH": "TF.Globals.USE_HASH",
+        "getFakeHash": "TF.Globals.getFakeHash",
+        "setFakeHash": "TF.Globals.setFakeHash",
+    }},
+)
+
+filegroup(
+    name = "all_files",
+    srcs = glob(["**"]),
+    tags = ["notsan"],
+)
diff --git a/tensorflow/tensorboard/components/tf_storage_d3v4/test/BUILD b/tensorflow/tensorboard/components/tf_storage_d3v4/test/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..472976f0005f7cdc93250b6f9f7a8cf74190294f
--- /dev/null
+++ b/tensorflow/tensorboard/components/tf_storage_d3v4/test/BUILD
@@ -0,0 +1,50 @@
+package(
+    default_testonly = True,
+    default_visibility = ["//tensorflow:internal"],
+)
+
+load("@io_bazel_rules_closure//closure:defs.bzl", "web_library")
+load("//tensorflow/tensorboard:hacks.bzl", "tensorboard_typescript_bundle")
+load("//tensorflow/tensorboard:defs.bzl", "tensorboard_typescript_genrule")
+
+licenses(["notice"])  # Apache 2.0
+
+web_library(
+    name = "test",
+    srcs = [
+        "bundle.js",
+        "tests.html",
+    ],
+    path = "/tf-storage/test",
+    deps = [
+        "//tensorflow/tensorboard/components/tf_storage_d3v4",
+        "@org_npmjs_registry_web_component_tester",
+        "@org_polymer",
+        "@org_polymer_webcomponentsjs",
+    ],
+)
+
+tensorboard_typescript_genrule(
+    name = "ts",
+    srcs = ["bundle.ts"],
+    typings = [
+        "@org_definitelytyped//:chai.d.ts",
+        "@org_definitelytyped//:mocha.d.ts",
+        "//tensorflow/tensorboard/components/tf_globals_d3v4:bundle.d.ts",
+        "//tensorflow/tensorboard/components/tf_storage_d3v4:bundle.d.ts",
+    ],
+)
+
+tensorboard_typescript_bundle(
+    name = "bundle",
+    out = "bundle.ts",
+    namespace_srcs = {"TF.URIStorage": ["storageTests.ts"]},
+    namespace_symbol_aliases = {"TF.URIStorage": {"TABS": "TF.Globals.TABS"}},
+)
+
+filegroup(
+    name = "all_files",
+    testonly = 0,
+    srcs = glob(["**"]),
+    tags = ["notsan"],
+)
diff --git a/tensorflow/tensorboard/components/tf_storage_d3v4/storageTests.ts b/tensorflow/tensorboard/components/tf_storage_d3v4/test/storageTests.ts
similarity index 96%
rename from tensorflow/tensorboard/components/tf_storage_d3v4/storageTests.ts
rename to tensorflow/tensorboard/components/tf_storage_d3v4/test/storageTests.ts
index adc4dde716a3919a897b6420ce90266cbebdde04..82dc51f05dade857f1c9cbd09bb6b215e148977a 100644
--- a/tensorflow/tensorboard/components/tf_storage_d3v4/storageTests.ts
+++ b/tensorflow/tensorboard/components/tf_storage_d3v4/test/storageTests.ts
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-import {TAB, getString, getNumber, getObject, setString, setNumber, setObject} from './storage';
-import {TABS} from '../tf_globals_d3v4/globals';
+import {TAB, getString, getNumber, getObject, setString, setNumber, setObject} from '../storage';
+import {TABS} from '../../tf-globals/globals';
 
 /* tslint:disable:no-namespace */
 describe('URIStorage', () => {
diff --git a/tensorflow/tensorboard/components/tf_storage_d3v4/test/tests.html b/tensorflow/tensorboard/components/tf_storage_d3v4/test/tests.html
new file mode 100644
index 0000000000000000000000000000000000000000..e0553c7d3c46443b640610d320fefdd3bab704af
--- /dev/null
+++ b/tensorflow/tensorboard/components/tf_storage_d3v4/test/tests.html
@@ -0,0 +1,25 @@
+<!doctype html>
+<!--
+@license
+Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+-->
+
+<meta charset="utf-8">
+<script src="../../webcomponentsjs/webcomponents-lite.min.js"></script>
+<script src="../../web-component-tester/browser.js"></script>
+<link rel="import" href="../../polymer/polymer.html">
+<link rel="import" href="../tf-storage.html">
+<body>
+<script src="bundle.js"></script>
diff --git a/tensorflow/tensorboard/components/tf_storage_d3v4/tf-storage.html b/tensorflow/tensorboard/components/tf_storage_d3v4/tf-storage.html
new file mode 100644
index 0000000000000000000000000000000000000000..91b8976519d6fda482c96d7669dbbdbd0f2dba35
--- /dev/null
+++ b/tensorflow/tensorboard/components/tf_storage_d3v4/tf-storage.html
@@ -0,0 +1,21 @@
+<!--
+@license
+Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+-->
+
+<link rel="import" href="../tf-globals/tf-globals.html">
+<link rel="import" href="../tf-imports/lodash.html">
+
+<script src="bundle.js"></script>
diff --git a/tensorflow/tensorboard/components/tf_tensorboard/test/data/graph_run_run1.pbtxt b/tensorflow/tensorboard/components/tf_tensorboard/test/data/graph_run_run1.pbtxt
deleted file mode 100644
index 2a6af3284086b4d797ebf3598bffe286d74baddf..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/components/tf_tensorboard/test/data/graph_run_run1.pbtxt
+++ /dev/null
@@ -1,9 +0,0 @@
-node {
-  name: "a"
-  op: "matmul"
-}
-node {
-  name: "b"
-  op: "matmul"
-  input: "a:0"
-}
diff --git a/tensorflow/tensorboard/components/tf_tensorboard/test/data/graph_run_run2.pbtxt b/tensorflow/tensorboard/components/tf_tensorboard/test/data/graph_run_run2.pbtxt
deleted file mode 100644
index a5a4d65d5c61a7cf1c208b48f841a38a03847d60..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/components/tf_tensorboard/test/data/graph_run_run2.pbtxt
+++ /dev/null
@@ -1,15 +0,0 @@
-node {
-  name: "a"
-  op: "matmul"
-}
-node {
-  name: "b"
-  op: "matmul"
-  input: "a:0"
-}
-node {
-  name: "c"
-  op: "matmul"
-  input: "a:0"
-  input: "b:0"
-}
diff --git a/tensorflow/tensorboard/components/tf_tensorboard/test/data/runs.json b/tensorflow/tensorboard/components/tf_tensorboard/test/data/runs.json
deleted file mode 100644
index 10b2821b30b04b528b6476831a9ed59c3e3e094f..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/components/tf_tensorboard/test/data/runs.json
+++ /dev/null
@@ -1 +0,0 @@
-{"run2": {"graph": true, "histograms": [], "scalars": [], "compressedHistograms": [], "images": []}, "run1": {"graph": true, "histograms": [], "scalars": [], "compressedHistograms": [], "images": []}}
diff --git a/tensorflow/tensorboard/components/tf_tensorboard/tf-tensorboard.html b/tensorflow/tensorboard/components/tf_tensorboard/tf-tensorboard.html
index 371dbad1c7de7f66f21342811bdd14dc8bb7c316..7440263f888ff1c1566a1205561dc625455e87a4 100644
--- a/tensorflow/tensorboard/components/tf_tensorboard/tf-tensorboard.html
+++ b/tensorflow/tensorboard/components/tf_tensorboard/tf-tensorboard.html
@@ -57,9 +57,9 @@ allows the user to toggle between various dashboards.
         <div id="toolbar-content">
           <div class="toolbar-title">TensorBoard</div>
           <paper-tabs selected="{{modeIndex}}" noink class="tabs" id="tabs">
-            <template is="dom-repeat" items="[[_dashboards]]">
-              <template is="dom-if" if="[[_isTabEnabled(item.name)]]">
-                <paper-tab data-mode="[[item.name]]">[[item.name]]</paper-tab>
+            <template is="dom-repeat" items="[[tabs]]">
+              <template is="dom-if" if="[[_isTabEnabled(item)]]">
+                <paper-tab data-mode="[[item]]">[[item]]</paper-tab>
               </template>
             </template>
           </paper-tabs>
@@ -82,7 +82,67 @@ allows the user to toggle between various dashboards.
         </div>
       </paper-toolbar>
 
-      <div id="content" class="fit"></div>
+      <div id="content" class="fit">
+        <content id="injected-overview"></content>
+
+        <template is="dom-if" if="[[_modeIsScalars(mode)]]">
+          <tf-scalar-dashboard
+            id="scalars"
+            backend="[[_backend]]"
+            router="[[router]]"
+          ></tf-scalar-dashboard>
+        </template>
+
+        <template is="dom-if" if="[[_modeIsImages(mode)]]">
+          <tf-image-dashboard
+            id="images"
+            backend="[[_backend]]"
+          ></tf-image-dashboard>
+        </template>
+
+        <template is="dom-if" if="[[_modeIsAudio(mode)]]">
+          <tf-audio-dashboard
+            id="audio"
+            backend="[[_backend]]"
+          ></tf-audio-dashboard>
+        </template>
+
+        <template is="dom-if" if="[[_modeIsGraphs(mode)]]">
+          <tf-graph-dashboard
+            id="graphs"
+            backend="[[_backend]]"
+            debugger-data-enabled="[[_debuggerDataEnabled]]"
+          ></tf-graph-dashboard>
+        </template>
+
+        <template is="dom-if" if="[[_modeIsDistributions(mode)]]">
+          <tf-distribution-dashboard
+            id="distributions"
+            backend="[[_backend]]"
+          ></tf-distribution-dashboard>
+        </template>
+
+        <template is="dom-if" if="[[_modeIsHistograms(mode)]]">
+          <tf-histogram-dashboard
+            id="histograms"
+            backend="[[_backend]]"
+          ></tf-histogram-dashboard>
+        </template>
+
+        <template is="dom-if" if="[[_modeIsEmbeddings(mode)]]">
+          <vz-projector-dashboard
+            id="projector"
+            route-prefix="/data/plugin/projector">
+          </vz-projector-dashboard>
+        </template>
+
+        <template is="dom-if" if="[[_modeIsText(mode)]]">
+          <tf-text-dashboard
+            id="text"
+            backend="[[_backend]]">
+          </tf-text-dashboard>
+        </template>
+      </div>
     </paper-header-panel>
 
     <style>
@@ -150,6 +210,8 @@ allows the user to toggle between various dashboards.
     </style>
   </template>
   <script>
+    "use strict";
+
     Polymer({
       is: "tf-tensorboard",
       behaviors: [TF.TensorBoard.AutoReloadBehavior],
@@ -173,21 +235,16 @@ allows the user to toggle between various dashboards.
             return match && match.length == 1;
           },
         },
-        _dashboards: {
-          type: Array,
-          computed: "_makeDashboardList(_backend, router, _debuggerDataEnabled)",
-        },
-        // Maps dashboard name to dashboard object.
-        _dashboardMapping: {
-          type: Object,
-          computed: "_makeDashboardMapping(_dashboards)",
-        },
         // Which tab is selected (scalars, graph, images etc).
         mode: {
           type: String,
           computed: '_getModeFromIndex(modeIndex)',
           notify: true,
-          observer: '_modeChanged',
+        },
+        tabs: {
+          type: Array,
+          readOnly: true,
+          value: TF.Globals.TABS,
         },
         // If this is set to a string, TensorBoard will switch to "demo mode"
         // and attempt to load serialized json data from that directory. You can
@@ -212,7 +269,7 @@ allows the user to toggle between various dashboards.
         return true;
       },
       _getModeFromIndex: function(modeIndex) {
-        var mode = this._dashboards[modeIndex].name;
+        var mode = this.tabs[modeIndex];
         TF.URIStorage.setString(TF.URIStorage.TAB, mode);
         return mode;
       },
@@ -224,10 +281,34 @@ allows the user to toggle between various dashboards.
         return new TF.Backend.Backend(router);
       },
       _isReloadDisabled: function(mode) {
-        return !this._debuggerDataEnabled && mode == 'graphs';
+        return !this._debuggerDataEnabled && this._modeIsGraphs(mode);
+      },
+      _modeIsScalars: function(mode) {
+        return mode === "scalars";
+      },
+      _modeIsImages: function(mode) {
+        return mode === "images";
+      },
+      _modeIsAudio: function(mode) {
+        return mode === "audio";
+      },
+      _modeIsGraphs: function(mode) {
+        return mode === "graphs";
+      },
+      _modeIsEmbeddings: function(mode) {
+        return mode === "embeddings";
+      },
+      _modeIsDistributions: function(mode) {
+        return mode === "distributions";
+      },
+      _modeIsHistograms: function(mode) {
+        return mode === "histograms";
+      },
+      _modeIsText: function(mode) {
+        return mode === "text";
       },
       selectedDashboard: function() {
-        var dashboard = this._dashboardMapping[this.mode];
+        var dashboard = this.$$("#" + this.mode);
         if (dashboard == null) {
           throw new Error(`Unable to find dashboard for mode: ${this.mode}`);
         }
@@ -241,67 +322,24 @@ allows the user to toggle between various dashboards.
           this._getModeFromHash();
         }.bind(this));
       },
-      _makeDashboardList: function(backend, router, debuggerDataEnabled) {
-        if (!backend || !router) {
-          // The dashboards require these entities. We are not ready to construct dashboards.
-          return null;
-        }
-
-        return [
-          new TF.Dashboard.TfScalarDashboard(backend, router),
-          new TF.Dashboard.TfImageDashboard(backend),
-          new TF.Dashboard.TfAudioDashboard(backend),
-          new TF.Dashboard.TfGraphDashboard(backend, debuggerDataEnabled),
-          new TF.Dashboard.TfDistributionDashboard(backend),
-          new TF.Dashboard.TfHistogramDashboard(backend),
-          new TF.Dashboard.VzProjectorDashboard('data/plugin/projector'),
-          new TF.Dashboard.TfTextDashboard(backend),
-        ];
-      },
-      _makeDashboardMapping: function(dashboards) {
-        if (!dashboards) {
-          return null;
-        }
-
-        let mapping = {};
-        dashboards.forEach(function(dashboard) {
-          mapping[dashboard.name] = dashboard;
-        });
-        return mapping;
-      },
       _getModeFromHash: function() {
         var tabName = TF.URIStorage.getString(TF.URIStorage.TAB);
-        var modeIndex;
-        for (var i = 0; i < this._dashboards.length; i++) {
-          if (this._dashboards[i].name == tabName) {
-            modeIndex = i;
-            break;
-          }
-        }
-
-        if (modeIndex === undefined && this.modeIndex == null) {
+        var modeIndex = this.tabs.indexOf(tabName);
+        if (modeIndex == -1 && this.modeIndex == null) {
           // Select the first tab as default.
           this.set('modeIndex', 0);
         }
-        if (modeIndex !== undefined && modeIndex != this.modeIndex) {
+        if (modeIndex != -1 && modeIndex != this.modeIndex) {
           this.set('modeIndex', modeIndex);
         }
       },
-      _modeChanged: function(mode) {
-        let currentDashboard = this.$.content.firstChild;
-        if (currentDashboard) {
-          this.$.content.removeChild(currentDashboard);
+      reload: function() {
+        if (this._modeIsEmbeddings(this.mode)) {
+          return;
         }
-
-        if (!mode || !this._dashboardMapping) {
+        if (!this._debuggerDataEnabled && this._modeIsGraphs(this.mode)) {
           return;
         }
-
-        // Append the new dashboard.
-        const newDashboard = this.selectedDashboard();
-        this.$.content.appendChild(newDashboard);
-      },
-      reload: function() {
         this.selectedDashboard().reload();
       },
       openSettings: function() {
diff --git a/tensorflow/tensorboard/components/tf_tensorboard_d3v4/BUILD b/tensorflow/tensorboard/components/tf_tensorboard_d3v4/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..683302bb45c9afe042be7e28b7013ac399289ff6
--- /dev/null
+++ b/tensorflow/tensorboard/components/tf_tensorboard_d3v4/BUILD
@@ -0,0 +1,80 @@
+package(default_visibility = ["//tensorflow:internal"])
+
+load("@io_bazel_rules_closure//closure:defs.bzl", "web_library")
+load("//tensorflow/tensorboard:vulcanize.bzl", "tensorboard_html_binary")
+load("//tensorflow/tensorboard:defs.bzl", "tensorboard_typescript_genrule")
+
+licenses(["notice"])  # Apache 2.0
+
+web_library(
+    name = "tf_tensorboard_d3v4",
+    srcs = [
+        "tf-tensorboard.html",
+        ":ts",
+    ],
+    path = "/tf-tensorboard",
+    deps = [
+        "@org_polymer_iron_icons",
+        "@org_polymer_paper_button",
+        "@org_polymer_paper_checkbox",
+        "@org_polymer_paper_dialog",
+        "@org_polymer_paper_header_panel",
+        "@org_polymer_paper_icon_button",
+        "@org_polymer_paper_tabs",
+        "@org_polymer_paper_toolbar",
+        "@org_polymer",
+        "//tensorflow/tensorboard/components/tf_audio_dashboard_d3v4",
+        "//tensorflow/tensorboard/components/tf_backend_d3v4",
+        "//tensorflow/tensorboard/components/tf_dashboard_common_d3v4",
+        "//tensorflow/tensorboard/components/tf_distribution_dashboard_d3v4",
+        "//tensorflow/tensorboard/components/tf_globals_d3v4",
+        # "//tensorflow/tensorboard/components/tf_graph_dashboard_d3v4",
+        "//tensorflow/tensorboard/components/tf_histogram_dashboard_d3v4",
+        "//tensorflow/tensorboard/components/tf_image_dashboard_d3v4",
+        "//tensorflow/tensorboard/components/tf_scalar_dashboard_d3v4",
+        "//tensorflow/tensorboard/components/tf_storage_d3v4",
+        "//tensorflow/tensorboard/components/tf_text_dashboard_d3v4",
+        "//tensorflow/tensorboard/components/vz_projector_d3v4",
+    ],
+)
+
+web_library(
+    name = "demo",
+    srcs = ["demo.html"],
+    path = "/tf-tensorboard",
+    deps = [
+        ":tf_tensorboard_d3v4",
+        "//tensorflow/tensorboard/demo:demo_data",
+        "@org_polymer_webcomponentsjs",
+    ],
+)
+
+web_library(
+    name = "dist",
+    srcs = ["dist.html"],
+    path = "/tf-tensorboard",
+    deps = [
+        ":tf_tensorboard_d3v4",
+        "//tensorflow/tensorboard/demo:demo_data",
+        "@org_polymer_webcomponentsjs",
+    ],
+)
+
+tensorboard_html_binary(
+    name = "index",
+    # input_path = "/tf-dashboard-common/tf-chart-scaffold.html",
+    input_path = "/tf-tensorboard/dist.html",
+    output_path = "/index.html",
+    deps = [":dist"],
+)
+
+tensorboard_typescript_genrule(
+    name = "ts",
+    srcs = ["autoReloadBehavior.ts"],
+)
+
+filegroup(
+    name = "all_files",
+    srcs = glob(["**"]),
+    tags = ["notsan"],
+)
diff --git a/tensorflow/tensorboard/components/tf_tensorboard_d3v4/autoReloadBehavior.ts b/tensorflow/tensorboard/components/tf_tensorboard_d3v4/autoReloadBehavior.ts
new file mode 100644
index 0000000000000000000000000000000000000000..1f6b4cf6419e12aeff0c261a01622e23825b59f4
--- /dev/null
+++ b/tensorflow/tensorboard/components/tf_tensorboard_d3v4/autoReloadBehavior.ts
@@ -0,0 +1,60 @@
+/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+module TF.TensorBoard {
+  export var AUTORELOAD_LOCALSTORAGE_KEY = 'TF.TensorBoard.autoReloadEnabled';
+
+  var getAutoReloadFromLocalStorage: () => boolean = () => {
+    var val = window.localStorage.getItem(AUTORELOAD_LOCALSTORAGE_KEY);
+    return val === 'true' || val == null;  // defaults to true
+  };
+
+  export var AutoReloadBehavior = {
+    properties: {
+      autoReloadEnabled: {
+        type: Boolean,
+        observer: '_autoReloadObserver',
+        value: getAutoReloadFromLocalStorage,
+      },
+      _autoReloadId: {
+        type: Number,
+      },
+      autoReloadIntervalSecs: {
+        type: Number,
+        value: 30,
+      },
+    },
+    detached: function() {
+      window.clearTimeout(this._autoReloadId);
+    },
+    _autoReloadObserver: function(autoReload) {
+      window.localStorage.setItem(AUTORELOAD_LOCALSTORAGE_KEY, autoReload);
+      if (autoReload) {
+        var _this = this;
+        this._autoReloadId = window.setTimeout(
+            this._doAutoReload.bind(this), this.autoReloadIntervalSecs * 1000);
+      } else {
+        window.clearTimeout(this._autoReloadId);
+      }
+    },
+    _doAutoReload: function() {
+      if (this.reload == null) {
+        throw new Error('AutoReloadBehavior requires a reload method');
+      }
+      this.reload();
+      this._autoReloadId = window.setTimeout(
+          this._doAutoReload.bind(this), this.autoReloadIntervalSecs * 1000);
+    }
+  };
+}
diff --git a/tensorflow/tensorboard/components/tf_tensorboard_d3v4/demo.html b/tensorflow/tensorboard/components/tf_tensorboard_d3v4/demo.html
new file mode 100644
index 0000000000000000000000000000000000000000..95f5b718a566e46e1a3c6d2e9c0966004872fb19
--- /dev/null
+++ b/tensorflow/tensorboard/components/tf_tensorboard_d3v4/demo.html
@@ -0,0 +1,32 @@
+<!doctype html>
+<!--
+@license
+Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+-->
+
+<title>TensorBoard</title>
+<style>
+  html, body {
+    margin: 0;
+    padding: 0;
+    height: 100%;
+    font-family: "RobotoDraft", "Roboto", sans-serif;
+  }
+</style>
+<script src="../webcomponentsjs/webcomponents-lite.min.js"></script>
+<link rel="shortcut icon" href="data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAMQAAADECAMAAAD3eH5ZAAAABGdBTUEAALGPC/xhBQAAAAFzUkdCAK7OHOkAAAD/UExURfFlKfaELvFmKfNyK/67NvWALf68Nv69NvNxK/20NfyyNP22NfN0K/JrKvqhMv2zNf25Nf24Nf23NfeOL/yzNPyvNPJoKviWMPmeMfN1K/WBLfePL/FnKfeML/qlMvR7LPmcMfeLL/aJLvR5LPFoKfJuKvR3LP66NvywNPeNL/V/LfaILv21Nf26NfNzK/NvK/R6LPmaMfyxNPqfMvV+LfurM/iSMPmbMfJvKvmdMfumM/qiMvmZMfytNPJqKvysNPN2K/iYMPNwK/upM/JtKvJsKviVMPaHLvaGLvJpKvR8LPaKLvqkMvuqM/aFLvR4LPuoM/iTMPWDLfiRMPmYMXS0ngkAAALoSURBVHja7drnctpAFIbhFUISSKJ3MKYa0+y4xTW9937/15JkJhlTjhrSrHRmvuf/as6L0YLFCgEAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAMBJ6njenqspzgnPrsrGX9Zpi2tCrmnc6+dYNthVY5WpMmxQLWPdMsOuYVwzNj3ei2t3mQwaV43BJPDCS2NbJ5aEeuX/+9qcjQOtfFIkIkrvY2g4MVcmOBsFWbowKO/kNyj62gRpJcDaPBlxLr1B0zdG0C/8LzbJiJrshuvy1gzlA9+rD8mIkuyIJjFE3/dqnYwoSm7IUEPoD/wut8iIguSIDjlFxe/yfXL5vuSI21BTZLLhXoOILMO8Hxwa/L8bI0LfmUdhGowb2ZvT0e57pFNDgB06IlVyjmmIBl2T/nl9Rw6SD9GgSG/Q0uQkaW3XhmovKQ3eFQ4N2Uo9OQ1eFZsNerf7vP+rO4rhmY1Lg3vFVoP8+8BXg1sFnwbnCk4NThW8GuiKBDdkVVtTNFvNelVsNqTbyWnIOM2oeTRoyWvwmpJHg/ucXBrcJuXT4DwrpwZi2vy0VCx8YtXg/D2bU4OfiuQ3eFfE2KD4bfCqiLNB993gXsGlwa2CT4NzBacGIVQ6YsipQdh0xEdODUKjIxrSp88onZ8zbbFLg1DoiFO5BXvDGv2My9/JhUT8JUZTI0yDaNHLBzIbvqTDNYhUiVw/kdjQ1kM2CHFDPjKW+KzyRTF0g/ga9w9y+fANQpxvX8CU+Ny7FUWDeF3Y+g3lROIf4k0UDX9eCyvO531PyYhHga9zvPZJU5b73Y/eXj8Hv9D48n6HaF5LbcjRt8TZTtda5M1DfXnbkX1C0SHCFKzQB5Fe8op4GNGNHavvZESbVwT5r6W1xyuCPBY3Y9YgDqzknH/e3YfNzzuL30l0IebrZ5kKtuDIXt1n868ET6kf3/49tLvrCcZyF8Pu215dAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAcPIbNrBhOaBXucoAAAAASUVORK5CYII=">
+<link rel="import" href="tf-tensorboard.html">
+
+<tf-tensorboard demo-dir="/data" use-hash></tf-tensorboard>
diff --git a/tensorflow/tensorboard/components/tf_tensorboard_d3v4/dist.html b/tensorflow/tensorboard/components/tf_tensorboard_d3v4/dist.html
new file mode 100644
index 0000000000000000000000000000000000000000..89bcfb733ec03a54083eb33828721159ea18c6b1
--- /dev/null
+++ b/tensorflow/tensorboard/components/tf_tensorboard_d3v4/dist.html
@@ -0,0 +1,32 @@
+<!doctype html>
+<!--
+@license
+Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+-->
+
+<title>TensorBoard</title>
+<style>
+  html, body {
+    margin: 0;
+    padding: 0;
+    height: 100%;
+    font-family: "RobotoDraft", "Roboto", sans-serif;
+  }
+</style>
+<script src="../webcomponentsjs/webcomponents-lite.min.js"></script>
+<link rel="shortcut icon" href="data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAMQAAADECAMAAAD3eH5ZAAAABGdBTUEAALGPC/xhBQAAAAFzUkdCAK7OHOkAAAD/UExURfFlKfaELvFmKfNyK/67NvWALf68Nv69NvNxK/20NfyyNP22NfN0K/JrKvqhMv2zNf25Nf24Nf23NfeOL/yzNPyvNPJoKviWMPmeMfN1K/WBLfePL/FnKfeML/qlMvR7LPmcMfeLL/aJLvR5LPFoKfJuKvR3LP66NvywNPeNL/V/LfaILv21Nf26NfNzK/NvK/R6LPmaMfyxNPqfMvV+LfurM/iSMPmbMfJvKvmdMfumM/qiMvmZMfytNPJqKvysNPN2K/iYMPNwK/upM/JtKvJsKviVMPaHLvaGLvJpKvR8LPaKLvqkMvuqM/aFLvR4LPuoM/iTMPWDLfiRMPmYMXS0ngkAAALoSURBVHja7drnctpAFIbhFUISSKJ3MKYa0+y4xTW9937/15JkJhlTjhrSrHRmvuf/as6L0YLFCgEAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAMBJ6njenqspzgnPrsrGX9Zpi2tCrmnc6+dYNthVY5WpMmxQLWPdMsOuYVwzNj3ei2t3mQwaV43BJPDCS2NbJ5aEeuX/+9qcjQOtfFIkIkrvY2g4MVcmOBsFWbowKO/kNyj62gRpJcDaPBlxLr1B0zdG0C/8LzbJiJrshuvy1gzlA9+rD8mIkuyIJjFE3/dqnYwoSm7IUEPoD/wut8iIguSIDjlFxe/yfXL5vuSI21BTZLLhXoOILMO8Hxwa/L8bI0LfmUdhGowb2ZvT0e57pFNDgB06IlVyjmmIBl2T/nl9Rw6SD9GgSG/Q0uQkaW3XhmovKQ3eFQ4N2Uo9OQ1eFZsNerf7vP+rO4rhmY1Lg3vFVoP8+8BXg1sFnwbnCk4NThW8GuiKBDdkVVtTNFvNelVsNqTbyWnIOM2oeTRoyWvwmpJHg/ucXBrcJuXT4DwrpwZi2vy0VCx8YtXg/D2bU4OfiuQ3eFfE2KD4bfCqiLNB993gXsGlwa2CT4NzBacGIVQ6YsipQdh0xEdODUKjIxrSp88onZ8zbbFLg1DoiFO5BXvDGv2My9/JhUT8JUZTI0yDaNHLBzIbvqTDNYhUiVw/kdjQ1kM2CHFDPjKW+KzyRTF0g/ga9w9y+fANQpxvX8CU+Ny7FUWDeF3Y+g3lROIf4k0UDX9eCyvO531PyYhHga9zvPZJU5b73Y/eXj8Hv9D48n6HaF5LbcjRt8TZTtda5M1DfXnbkX1C0SHCFKzQB5Fe8op4GNGNHavvZESbVwT5r6W1xyuCPBY3Y9YgDqzknH/e3YfNzzuL30l0IebrZ5kKtuDIXt1n868ET6kf3/49tLvrCcZyF8Pu215dAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAcPIbNrBhOaBXucoAAAAASUVORK5CYII=">
+<link rel="import" href="tf-tensorboard.html">
+<body>
+<tf-tensorboard use-hash></tf-tensorboard>
diff --git a/tensorflow/tensorboard/components/tf_tensorboard_d3v4/test/autoReloadTests.ts b/tensorflow/tensorboard/components/tf_tensorboard_d3v4/test/autoReloadTests.ts
new file mode 100644
index 0000000000000000000000000000000000000000..0f049d40ab66a3f45912fcb92094c19436993ed9
--- /dev/null
+++ b/tensorflow/tensorboard/components/tf_tensorboard_d3v4/test/autoReloadTests.ts
@@ -0,0 +1,85 @@
+/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+declare function fixture(id: string): void;
+window.HTMLImports.whenReady(() => {
+  Polymer({
+    is: 'autoreload-test-element',
+    behaviors: [TF.TensorBoard.AutoReloadBehavior],
+  });
+
+  describe('autoReload-behavior', function() {
+    var testElement;
+    var ls = window.localStorage;
+    var key = TF.TensorBoard.AUTORELOAD_LOCALSTORAGE_KEY;
+    var clock;
+    var callCount: number;
+
+    beforeEach(function() {
+      ls.setItem(key, 'false');  // start it turned off so we can mutate fns
+      testElement = fixture('autoReloadFixture');
+      callCount = 0;
+      testElement.reload = function() { callCount++; };
+    });
+
+    before(function() { clock = sinon.useFakeTimers(); });
+
+    after(function() { clock.restore(); });
+
+    it('reads and writes autoReload state from localStorage', function() {
+      ls.removeItem(key);
+      testElement = fixture('autoReloadFixture');
+      chai.assert.isTrue(
+          testElement.autoReloadEnabled, 'autoReload defaults to true');
+      chai.assert.equal(ls.getItem(key), 'true', 'autoReload setting saved');
+      testElement = fixture('autoReloadFixture');
+      chai.assert.isTrue(
+          testElement.autoReloadEnabled, 'read true from localStorage');
+      testElement.autoReloadEnabled = false;
+      chai.assert.equal(ls.getItem(key), 'false', 'autoReload setting saved');
+      testElement = fixture('autoReloadFixture');
+      chai.assert.isFalse(
+          testElement.autoReloadEnabled, 'read false setting properly');
+      testElement.autoReloadEnabled = true;
+      chai.assert.equal(ls.getItem(key), 'true', 'saved true setting');
+    });
+
+    it('reloads every interval secs when autoReloading', function() {
+      testElement.autoReloadIntervalSecs = 1;
+      testElement.autoReloadEnabled = true;
+      clock.tick(1000);
+      chai.assert.equal(callCount, 1, 'ticking clock triggered call');
+      clock.tick(20 * 1000);
+      chai.assert.equal(callCount, 21, 'ticking clock 20s triggered 20 calls');
+    });
+
+    it('can cancel pending autoReload', function() {
+      testElement.autoReloadIntervalSecs = 10;
+      testElement.autoReloadEnabled = true;
+      clock.tick(5 * 1000);
+      testElement.autoReloadEnabled = false;
+      clock.tick(20 * 1000);
+      chai.assert.equal(callCount, 0, 'callCount is 0');
+    });
+
+    it('throws an error in absence of reload method', function() {
+      testElement.reload = undefined;
+      testElement.autoReloadIntervalSecs = 1;
+      testElement.autoReloadEnabled = true;
+      chai.assert.throws(function() {
+        clock.tick(5000);
+      });
+    });
+  });
+});
diff --git a/tensorflow/tensorboard/components/tf_color_scale_d3v4/tests.html b/tensorflow/tensorboard/components/tf_tensorboard_d3v4/test/e2eTests.html
similarity index 79%
rename from tensorflow/tensorboard/components/tf_color_scale_d3v4/tests.html
rename to tensorflow/tensorboard/components/tf_tensorboard_d3v4/test/e2eTests.html
index 9a2a174349c1160ecc19913b560cc7e2ba00a47b..5efc02ef98abbde1399db1f7d477b5b27593c7f3 100644
--- a/tensorflow/tensorboard/components/tf_color_scale_d3v4/tests.html
+++ b/tensorflow/tensorboard/components/tf_tensorboard_d3v4/test/e2eTests.html
@@ -18,14 +18,14 @@ limitations under the License.
 
 <html>
 <head>
-  <meta charset="utf-8">
-  <script src="../../web-component-tester/browser.js"></script>
   <script src="../../webcomponentsjs/webcomponents-lite.min.js"></script>
+  <script src="../../web-component-tester/browser.js"></script>
   <link rel="import" href="../../tf-imports/d3.html">
+  <link rel="import" href="../tf-tensorboard.html">
+  <link rel="stylesheet" type="text/css" href="../../../lib/css/global.css">
 </head>
 <body>
-    <script src="../colorScale.js"></script>
-    <script src="../palettes.js"></script>
-    <script src="colorScaleTests.js"></script>
+  <tf-tensorboard demo-dir="data/"></tf-tensorboard>
+  <script src="e2eTests.js"></script>
 </body>
 </html>
diff --git a/tensorflow/tensorboard/components/tf_tensorboard_d3v4/test/e2eTests.ts b/tensorflow/tensorboard/components/tf_tensorboard_d3v4/test/e2eTests.ts
new file mode 100644
index 0000000000000000000000000000000000000000..2308298ced9c49588012f5731e0ab8825b3d886a
--- /dev/null
+++ b/tensorflow/tensorboard/components/tf_tensorboard_d3v4/test/e2eTests.ts
@@ -0,0 +1,42 @@
+/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+describe('end-to-end test', () => {
+  window.HTMLImports.whenReady(() => {
+    let tb = d3.select('tf-tensorboard');
+    var tabs = (<any>tb.node()).$.tabs;
+
+    function testTab(tabIndex: number) {
+      it(`selecting ${TF.Globals.TABS[tabIndex]} tab`, done => {
+        // Every dashboard emits a rendered event when it is done rendering.
+        tb.on('rendered', () => done());
+        tabs.set('selected', tabIndex);
+      });
+    }
+    // Listen for when the default tab has rendered and test other tabs after.
+    tb.on('rendered', () => {
+      // The default tab already rendered. Test everything else.
+      // If a bug happened while rendering the default tab, the test would
+      // have failed. Re-selecting the default tab and listening for
+      // "rendered" event won't work since the content is not re-stamped.
+      let selected = +tabs.get('selected');
+      for (let i = 0; i < TF.Globals.TABS.length; i++) {
+        if (i !== selected) {
+          testTab(i);
+        }
+      }
+    });
+  });
+});
diff --git a/tensorflow/tensorboard/components/tf_tensorboard_d3v4/test/fastTabSwitch.html b/tensorflow/tensorboard/components/tf_tensorboard_d3v4/test/fastTabSwitch.html
new file mode 100644
index 0000000000000000000000000000000000000000..88bb6edc4828b4099e892ca323278580aaf6d15e
--- /dev/null
+++ b/tensorflow/tensorboard/components/tf_tensorboard_d3v4/test/fastTabSwitch.html
@@ -0,0 +1,31 @@
+<!doctype html>
+<!--
+@license
+Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+-->
+
+<html>
+<head>
+  <script src="../../webcomponentsjs/webcomponents-lite.min.js"></script>
+  <script src="../../web-component-tester/browser.js"></script>
+  <link rel="import" href="../../tf-imports/d3.html">
+  <link rel="import" href="../tf-tensorboard.html">
+  <link rel="stylesheet" type="text/css" href="../../../lib/css/global.css">
+</head>
+<body>
+  <tf-tensorboard demo-dir="data/"></tf-tensorboard>
+  <script src="fastTabSwitch.js"></script>
+</body>
+</html>
diff --git a/tensorflow/tensorboard/components/tf_tensorboard_d3v4/test/fastTabSwitch.ts b/tensorflow/tensorboard/components/tf_tensorboard_d3v4/test/fastTabSwitch.ts
new file mode 100644
index 0000000000000000000000000000000000000000..4dd62a0c3822947741bea108b229c6bd4e3f2783
--- /dev/null
+++ b/tensorflow/tensorboard/components/tf_tensorboard_d3v4/test/fastTabSwitch.ts
@@ -0,0 +1,44 @@
+/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+describe('fast tab switch', () => {
+  window.HTMLImports.whenReady(() => {
+    let tb = d3.select('tf-tensorboard');
+    var tabs = (<any>tb.node()).$.tabs;
+
+    // This test will select the events tab. Once the events tab
+    // renders, will select the graph tab, and immediately select
+    // the images tab wihout waiting for the graph tab to finish
+    // rendering. Finally, it finishes when the images tab
+    // has rendered and no errors were thrown.
+    let eventsTabIndex = TF.Globals.TABS.indexOf('events');
+    let imagesTabIndex = TF.Globals.TABS.indexOf('images');
+    let graphTabIndex = TF.Globals.TABS.indexOf('graphs');
+
+    // Listen for when the events tab rendered.
+    tb.on('rendered', () => {
+      it('switching to graph tab and immediately to images', done => {
+        // Select the graph tab.
+        tabs.set('selected', graphTabIndex);
+        // Interrupt graph rendering by immediately selecting the images tab
+        // and finish when the images tab has rendered.
+        tb.on('rendered', () => done());
+        tabs.set('selected', imagesTabIndex);
+      });
+    });
+    // Select the events tab.
+    tabs.set('selected', eventsTabIndex);
+  });
+});
diff --git a/tensorflow/tensorboard/components/tf_tensorboard_d3v4/test/index.html b/tensorflow/tensorboard/components/tf_tensorboard_d3v4/test/index.html
new file mode 100644
index 0000000000000000000000000000000000000000..8806f36fad91d26b39fe299246578aaf5a776c61
--- /dev/null
+++ b/tensorflow/tensorboard/components/tf_tensorboard_d3v4/test/index.html
@@ -0,0 +1,35 @@
+<!doctype html>
+<!--
+@license
+Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+-->
+
+<html>
+<head>
+  <meta charset="utf-8">
+  <script src="../../web-component-tester/browser.js"></script>
+</head>
+<body>
+<script>
+// Run the tests for each main component in tensorboard.
+WCT.loadSuites([
+  'tensorboardTests.html',
+  // TODO: re-enable or remove. b/30163860
+  // 'e2eTests.html',
+  'fastTabSwitch.html'
+]);
+</script>
+</body>
+</html>
diff --git a/tensorflow/tensorboard/components/tf_tensorboard_d3v4/test/tensorboardTests.html b/tensorflow/tensorboard/components/tf_tensorboard_d3v4/test/tensorboardTests.html
new file mode 100644
index 0000000000000000000000000000000000000000..2122cb79b16bc91a03a5765cfe9867c608752d0c
--- /dev/null
+++ b/tensorflow/tensorboard/components/tf_tensorboard_d3v4/test/tensorboardTests.html
@@ -0,0 +1,44 @@
+<!doctype html>
+<!--
+@license
+Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+-->
+
+<html>
+<head>
+  <link rel="import" href="../../polymer/polymer.html">
+  <script src="../../webcomponentsjs/webcomponents-lite.min.js"></script>
+  <script src="../../web-component-tester/browser.js"></script>
+  <link rel="import" href="../tf-tensorboard.html">
+  <link rel="stylesheet" type="text/css" href="../../../lib/css/global.css">
+</head>
+<body>
+  <test-fixture id="tensorboardFixture">
+    <template>
+      <tf-tensorboard>
+        <span id="inject-me">Injected content should be rendered by the element.</span>
+      </tf-tensorboard>
+    </template>
+  </test-fixture>
+
+  <test-fixture id="autoReloadFixture">
+    <template>
+      <autoreload-test-element></autoreload-test-element>
+    </template>
+  </test-fixture>
+  <script src="tensorboardTests.js"></script>
+  <script src="autoReloadTests.js"></script>
+</body>
+</html>
diff --git a/tensorflow/tensorboard/components/tf_tensorboard_d3v4/test/tensorboardTests.ts b/tensorflow/tensorboard/components/tf_tensorboard_d3v4/test/tensorboardTests.ts
new file mode 100644
index 0000000000000000000000000000000000000000..3c7fe2c9e7255494d936aa8d18bf56d0bcfc4d80
--- /dev/null
+++ b/tensorflow/tensorboard/components/tf_tensorboard_d3v4/test/tensorboardTests.ts
@@ -0,0 +1,136 @@
+/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the 'License');
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an 'AS IS' BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+describe('tf-tensorboard tests', () => {
+  window.HTMLImports.whenReady(() => {
+    let tensorboard: any;
+    beforeEach(function() {
+      tensorboard = fixture('tensorboardFixture');
+      tensorboard.demoDir = 'data';
+      tensorboard.autoReloadEnabled = false;
+    });
+
+    it('specified tabs are correct', function(done) {
+      setTimeout(function() {
+        let tabs = tensorboard.$.tabs.getElementsByTagName('paper-tab');
+        let tabMode = Array.prototype.map.call(tabs, (x) => x.dataMode);
+        chai.assert.deepEqual(tabMode, TF.Globals.TABS, 'mode is correct');
+        let tabText =
+            Array.prototype.map.call(tabs, (x) => x.innerText.toLowerCase());
+        chai.assert.deepEqual(tabText, TF.Globals.TABS, 'text is correct');
+        done();
+      });
+    });
+
+    it('respects router manually provided', function() {
+      let router = TF.Backend.router('data', true);
+      tensorboard.router = router;
+      tensorboard.demoDir = null;
+      chai.assert.equal(tensorboard._backend.router, router);
+    });
+
+    it('renders injected content', function() {
+      let injected = tensorboard.querySelector('#inject-me');
+      chai.assert.isNotNull(injected);
+    });
+
+    describe('reloading the selected dashboard', function() {
+      TF.Globals.TABS.forEach((name, tabIndex) => {
+        // These tabs do not support reload mode.
+        if (name === 'graphs' || name === 'projections') {
+          return;
+        }
+        it(`${name}: calling reload reloads dashboard`, function(done) {
+          tensorboard.$.tabs.set('selected', tabIndex);
+          setTimeout(function() {
+            let called = false;
+            tensorboard.selectedDashboard().reload = function() {
+              called = true;
+            };
+            tensorboard.reload();
+            chai.assert.isFalse(
+                tensorboard.$$('#reload-button').disabled,
+                'reload button not disabled');
+            chai.assert.isTrue(called, `reload was called`);
+            done();
+          });
+        });
+      });
+    });
+
+    it('reload is disabled for graph dashboard', function(done) {
+      let idx = TF.Globals.TABS.indexOf('graphs');
+      chai.assert.notEqual(idx, -1, 'graphs was found');
+      tensorboard.$.tabs.set('selected', idx);
+      setTimeout(
+          function() {  // async so that the queued tab change will happen
+            let called = false;
+            tensorboard.selectedDashboard().reload = function() {
+              called = true;
+            };
+            tensorboard.reload();
+            chai.assert.isTrue(
+                tensorboard.$$('#reload-button').disabled,
+                'reload button disabled');
+            chai.assert.isFalse(called, `reload was not called`);
+            done();
+          });
+    });
+
+    describe('top right global icons', function() {
+      it('Clicking the reload button will call reload', function() {
+        let called = false;
+        tensorboard.reload = function() { called = true; };
+        tensorboard.$$('#reload-button').click();
+        chai.assert.isTrue(called);
+      });
+
+      it('settings pane is hidden', function() {
+        chai.assert.equal(tensorboard.$.settings.style['display'], 'none');
+      });
+
+      it('settings icon button opens the settings pane', function(done) {
+        tensorboard.$$('#settings-button').click();
+        // This test is a little hacky since we depend on polymer's
+        // async behavior, which is difficult to predict.
+
+        // keep checking until the panel is visible. error with a timeout if it
+        // is broken.
+        function verify() {
+          if (tensorboard.$.settings.style['display'] !== 'none') {
+            done();
+          } else {
+            setTimeout(verify, 3);  // wait and see if it becomes true
+          }
+        }
+        verify();
+      });
+
+      it('Autoreload checkbox toggle works', function() {
+        let checkbox = tensorboard.$$('#auto-reload-checkbox');
+        chai.assert.equal(checkbox.checked, tensorboard.autoReloadEnabled);
+        let oldValue = checkbox.checked;
+        checkbox.click();
+        chai.assert.notEqual(oldValue, checkbox.checked);
+        chai.assert.equal(checkbox.checked, tensorboard.autoReloadEnabled);
+      });
+
+      it('Autoreload checkbox contains correct interval info', function() {
+        let checkbox = tensorboard.$$('#auto-reload-checkbox');
+        let timeInSeconds = tensorboard.autoReloadIntervalSecs + 's';
+        chai.assert.include(checkbox.innerText, timeInSeconds);
+      });
+    });
+  });
+});
diff --git a/tensorflow/tensorboard/components/tf_tensorboard_d3v4/tf-tensorboard.html b/tensorflow/tensorboard/components/tf_tensorboard_d3v4/tf-tensorboard.html
new file mode 100644
index 0000000000000000000000000000000000000000..f3f5d590f993b69acf2625e96c47f1e1c8ece384
--- /dev/null
+++ b/tensorflow/tensorboard/components/tf_tensorboard_d3v4/tf-tensorboard.html
@@ -0,0 +1,351 @@
+<!--
+@license
+Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+-->
+
+<link rel="import" href="../polymer/polymer.html">
+<link rel="import" href="../iron-icons/iron-icons.html">
+<link rel="import" href="../paper-tabs/paper-tabs.html">
+<link rel="import" href="../paper-dialog/paper-dialog.html">
+<link rel="import" href="../paper-checkbox/paper-checkbox.html">
+<link rel="import" href="../paper-toolbar/paper-toolbar.html">
+<link rel="import" href="../paper-button/paper-button.html">
+<link rel="import" href="../paper-icon-button/paper-icon-button.html">
+<link rel="import" href="../paper-header-panel/paper-header-panel.html">
+<link rel="import" href="../tf-globals/tf-globals.html">
+<link rel="import" href="../tf-scalar-dashboard/tf-scalar-dashboard.html">
+<link rel="import" href="../tf-distribution-dashboard/tf-distribution-dashboard.html">
+<link rel="import" href="../tf-histogram-dashboard/tf-histogram-dashboard.html">
+<link rel="import" href="../tf-image-dashboard/tf-image-dashboard.html">
+<link rel="import" href="../tf-audio-dashboard/tf-audio-dashboard.html">
+<!-- <link rel="import" href="../tf-graph-dashboard/tf-graph-dashboard.html"> -->
+<link rel="import" href="../tf-text-dashboard/tf-text-dashboard.html">
+<link rel="import" href="../tf-dashboard-common/tensorboard-color.html">
+<link rel="import" href="../tf-backend/tf-backend.html">
+<link rel="import" href="../tf-storage/tf-storage.html">
+<link rel="import" href="../vz-projector/vz-projector-dashboard.html">
+
+<!--
+tf-tensorboard is the frontend entry point for TensorBoard.
+
+It implements a toolbar (via paper-header-panel and paper-toolbar) that
+allows the user to toggle between various dashboards.
+-->
+<script src="autoReloadBehavior.js"></script>
+<dom-module id="tf-tensorboard">
+  <template>
+    <paper-dialog with-backdrop id="settings">
+      <h2>Settings</h2>
+      <paper-checkbox id="auto-reload-checkbox" checked="{{autoReloadEnabled}}">
+        Reload data every <span>[[autoReloadIntervalSecs]]</span>s.
+      </paper-checkbox>
+    </paper-dialog>
+    <paper-header-panel>
+      <paper-toolbar id="toolbar">
+        <div id="toolbar-content">
+          <div class="toolbar-title">TensorBoard</div>
+          <paper-tabs selected="{{modeIndex}}" noink class="tabs" id="tabs">
+            <template is="dom-repeat" items="[[tabs]]">
+              <template is="dom-if" if="[[_isTabEnabled(item)]]">
+                <paper-tab data-mode="[[item]]">[[item]]</paper-tab>
+              </template>
+            </template>
+          </paper-tabs>
+          <div class="global-actions">
+            <paper-icon-button
+              icon="refresh"
+              on-tap="reload"
+              disabled$="[[_isReloadDisabled(mode)]]"
+              id="reload-button"
+            ></paper-icon-button>
+            <paper-icon-button
+              icon="settings"
+              on-tap="openSettings"
+              id="settings-button"
+            ></paper-icon-button>
+            <a href="https://github.com/tensorflow/tensorflow/blob/master/tensorflow/tensorboard/README.md" tabindex="-1">
+              <paper-icon-button icon="help-outline"></paper-icon-button>
+            </a>
+          </div>
+        </div>
+      </paper-toolbar>
+
+      <div id="content" class="fit">
+        <content id="injected-overview"></content>
+
+        <template is="dom-if" if="[[_modeIsScalars(mode)]]">
+          <tf-scalar-dashboard
+            id="scalars"
+            backend="[[_backend]]"
+            router="[[router]]"
+          ></tf-scalar-dashboard>
+        </template>
+
+        <template is="dom-if" if="[[_modeIsImages(mode)]]">
+          <tf-image-dashboard
+            id="images"
+            backend="[[_backend]]"
+          ></tf-image-dashboard>
+        </template>
+
+        <template is="dom-if" if="[[_modeIsAudio(mode)]]">
+          <tf-audio-dashboard
+            id="audio"
+            backend="[[_backend]]"
+          ></tf-audio-dashboard>
+        </template>
+
+        <template is="dom-if" if="[[_modeIsGraphs(mode)]]">
+          <tf-graph-dashboard
+            id="graphs"
+            backend="[[_backend]]"
+            debugger-data-enabled="[[_debuggerDataEnabled]]"
+          ></tf-graph-dashboard>
+        </template>
+
+        <template is="dom-if" if="[[_modeIsDistributions(mode)]]">
+          <tf-distribution-dashboard
+            id="distributions"
+            backend="[[_backend]]"
+          ></tf-distribution-dashboard>
+        </template>
+
+        <template is="dom-if" if="[[_modeIsHistograms(mode)]]">
+          <tf-histogram-dashboard
+            id="histograms"
+            backend="[[_backend]]"
+          ></tf-histogram-dashboard>
+        </template>
+
+        <template is="dom-if" if="[[_modeIsEmbeddings(mode)]]">
+          <vz-projector-dashboard
+            id="projector"
+            route-prefix="/data/plugin/projector">
+          </vz-projector-dashboard>
+        </template>
+
+        <template is="dom-if" if="[[_modeIsText(mode)]]">
+          <tf-text-dashboard
+            id="text"
+            backend="[[_backend]]">
+          </tf-text-dashboard>
+        </template>
+      </div>
+    </paper-header-panel>
+
+    <style>
+      :host {
+        height: 100%;
+        display: block;
+        background-color: var(--paper-grey-100);
+      }
+
+      #toolbar {
+        background-color: var(--tb-toolbar-background-color, --tb-orange-strong);
+        -webkit-font-smoothing: antialiased;
+      }
+
+      .toolbar-title {
+        font-size: 20px;
+        margin-left: 10px;
+        text-rendering: optimizeLegibility;
+        letter-spacing: -0.025em;
+        font-weight: 500;
+        flex-grow: 2;
+        display: var(--tb-toolbar-title-display, block);
+      }
+
+      .tabs {
+        flex-grow: 1;
+        text-transform: uppercase;
+        height: 100%;
+      }
+
+      paper-tabs {
+        --paper-tabs-selection-bar-color: white;
+      }
+
+      .global-actions {
+        flex-grow: 2;
+        display: inline-flex; /* Ensure that icons stay aligned */
+        justify-content: flex-end;
+        text-align: right;
+        color: white;
+      }
+
+      .global-actions a {
+        color: white;
+      }
+
+      #toolbar-content {
+        width: 100%;
+        height: 100%;
+        display: flex;
+        flex-direction: row;
+        justify-content: space-between;
+        align-items: center;
+      }
+
+      #content {
+        height: 100%;
+      }
+
+      [disabled] {
+        opacity: 0.2;
+        color: white;
+      }
+
+    </style>
+  </template>
+  <script>
+    "use strict";
+
+    Polymer({
+      is: "tf-tensorboard",
+      behaviors: [TF.TensorBoard.AutoReloadBehavior],
+      properties: {
+        router: {
+          type: Object,
+          value: function() {
+            return TF.Backend.router();
+          },
+        },
+        _backend: {
+          type: Object,
+          computed: "_makeBackend(router, demoDir)",
+        },
+        _debuggerDataEnabled: {
+          type: Boolean,
+          value: function() {
+            // For now, Tensorboard only shows debugger data if the debugger_data GET param is set
+            // to enabled.
+            let match = window.location.href.match(/[&\?]debugger_data=enabled/);
+            return match && match.length == 1;
+          },
+        },
+        // Which tab is selected (scalars, graph, images etc).
+        mode: {
+          type: String,
+          computed: '_getModeFromIndex(modeIndex)',
+          notify: true,
+        },
+        tabs: {
+          type: Array,
+          readOnly: true,
+          value: TF.Globals.TABS,
+        },
+        // If this is set to a string, TensorBoard will switch to "demo mode"
+        // and attempt to load serialized json data from that directory. You can
+        // generate conformant json using
+        // tensorboard/scripts/serialize_tensorboard.py
+        demoDir: {
+          type: String,
+          value: null,
+        },
+        // Set this to true to store state in URI hash. Should be true for all non-test purposes.
+        useHash: {
+          type: Boolean,
+          value: false,
+        },
+        disabledTabs: String,
+      },
+      _isTabEnabled: function(tab) {
+        if (this.disabledTabs != null &&
+            this.disabledTabs.split(',').indexOf(tab) >= 0) {
+          return false;
+        }
+        return true;
+      },
+      _getModeFromIndex: function(modeIndex) {
+        var mode = this.tabs[modeIndex];
+        TF.URIStorage.setString(TF.URIStorage.TAB, mode);
+        return mode;
+      },
+      _makeBackend: function(router, demoDir) {
+        // use the demoDir if it is set, otherwise use the provided router
+        if (demoDir != null) {
+          router = TF.Backend.router(demoDir, true);
+        }
+        return new TF.Backend.Backend(router);
+      },
+      _isReloadDisabled: function(mode) {
+        return !this._debuggerDataEnabled && this._modeIsGraphs(mode);
+      },
+      _modeIsScalars: function(mode) {
+        return mode === "scalars";
+      },
+      _modeIsImages: function(mode) {
+        return mode === "images";
+      },
+      _modeIsAudio: function(mode) {
+        return mode === "audio";
+      },
+      _modeIsGraphs: function(mode) {
+        return mode === "graphs";
+      },
+      _modeIsEmbeddings: function(mode) {
+        return mode === "embeddings";
+      },
+      _modeIsDistributions: function(mode) {
+        return mode === "distributions";
+      },
+      _modeIsHistograms: function(mode) {
+        return mode === "histograms";
+      },
+      _modeIsText: function(mode) {
+        return mode === "text";
+      },
+      selectedDashboard: function() {
+        var dashboard = this.$$("#" + this.mode);
+        if (dashboard == null) {
+          throw new Error(`Unable to find dashboard for mode: ${this.mode}`);
+        }
+        return dashboard;
+      },
+      ready: function() {
+        TF.Globals.USE_HASH = this.useHash;
+
+        this._getModeFromHash();
+        window.addEventListener('hashchange', function() {
+          this._getModeFromHash();
+        }.bind(this));
+      },
+      _getModeFromHash: function() {
+        var tabName = TF.URIStorage.getString(TF.URIStorage.TAB);
+        var modeIndex = this.tabs.indexOf(tabName);
+        if (modeIndex == -1 && this.modeIndex == null) {
+          // Select the first tab as default.
+          this.set('modeIndex', 0);
+        }
+        if (modeIndex != -1 && modeIndex != this.modeIndex) {
+          this.set('modeIndex', modeIndex);
+        }
+      },
+      reload: function() {
+        if (this._modeIsEmbeddings(this.mode)) {
+          return;
+        }
+        if (!this._debuggerDataEnabled && this._modeIsGraphs(this.mode)) {
+          return;
+        }
+        this.selectedDashboard().reload();
+      },
+      openSettings: function() {
+        this.$.settings.open();
+      },
+    });
+  </script>
+  <script src="autoReloadBehavior.js"></script>
+</dom-module>
diff --git a/tensorflow/tensorboard/components/tf_text_dashboard/BUILD b/tensorflow/tensorboard/components/tf_text_dashboard/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..7f4c27c6aa82a16a4a7cc3acb4e9ce7bf2f1c2dc
--- /dev/null
+++ b/tensorflow/tensorboard/components/tf_text_dashboard/BUILD
@@ -0,0 +1,60 @@
+package(default_visibility = ["//tensorflow:internal"])
+
+load("@io_bazel_rules_closure//closure:defs.bzl", "web_library")
+load("//tensorflow/tensorboard:defs.bzl", "tensorboard_ts_library")
+load("//tensorflow/tensorboard:defs.bzl", "tensorboard_webcomponent_library")
+
+licenses(["notice"])  # Apache 2.0
+
+web_library(
+    name = "tf_text_dashboard",
+    srcs = [
+        "tf-text-dashboard.html",
+        "tf-text-loader.html",
+    ],
+    path = "/tf-text-dashboard",
+    deps = [
+        "//tensorflow/tensorboard/components/tf_backend",
+        "//tensorflow/tensorboard/components/tf_color_scale",
+        "//tensorflow/tensorboard/components/tf_dashboard_common",
+        "//tensorflow/tensorboard/components/tf_imports:d3",
+        "//tensorflow/tensorboard/components/tf_imports:lodash",
+        "@org_polymer",
+        "@org_polymer_paper_dialog",
+        "@org_polymer_paper_icon_button",
+        "@org_polymer_paper_material",
+        "@org_polymer_paper_slider",
+        "@org_polymer_paper_spinner",
+    ],
+)
+
+filegroup(
+    name = "all_files",
+    srcs = glob(["**"]),
+    tags = ["notsan"],
+)
+
+################################################################################
+# MARKED FOR DELETION
+
+tensorboard_webcomponent_library(
+    name = "legacy",
+    srcs = [
+        "tf-text-dashboard.html",
+        "tf-text-loader.html",
+    ],
+    destdir = "tf-text-dashboard",
+    deps = [
+        "//tensorflow/tensorboard/components/tf_backend:legacy",
+        "//tensorflow/tensorboard/components/tf_dashboard_common:legacy",
+        "//third_party/javascript/polymer/v1/paper-material:lib",
+    ],
+)
+
+# This is needed: components/BUILD seeks a legacy_ts rule in this package.
+tensorboard_ts_library(
+    name = "legacy_ts",
+    srcs = [],
+    deps_mgmt = "off",
+    runtime = "nodejs",
+)
diff --git a/tensorflow/tensorboard/components/tf_text_dashboard/demo/BUILD b/tensorflow/tensorboard/components/tf_text_dashboard/demo/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..af13d276cd3b6b37031fa11081d6afe0a5fd2b45
--- /dev/null
+++ b/tensorflow/tensorboard/components/tf_text_dashboard/demo/BUILD
@@ -0,0 +1,25 @@
+package(default_visibility = ["//tensorflow:internal"])
+
+load("@io_bazel_rules_closure//closure:defs.bzl", "web_library")
+
+licenses(["notice"])  # Apache 2.0
+
+# bazel run //third_party/tensorflow/tensorboard/components/tf_text_dashboard/demo
+web_library(
+    name = "demo",
+    srcs = ["index.html"],
+    path = "/tf-text-dashboard/demo",
+    deps = [
+        "//tensorflow/tensorboard/components/tf_text_dashboard",
+        "//tensorflow/tensorboard/components/tf_text_dashboard/demo/data",
+        "@org_polymer_iron_demo_helpers",
+        "@org_polymer_paper_styles",
+        "@org_polymer_webcomponentsjs",
+    ],
+)
+
+filegroup(
+    name = "all_files",
+    srcs = glob(["**"]),
+    tags = ["notsan"],
+)
diff --git a/tensorflow/tensorboard/components/tf_text_dashboard/demo/data/BUILD b/tensorflow/tensorboard/components/tf_text_dashboard/demo/data/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..6da08e5c30e5e7e233c852712b61905a7defab56
--- /dev/null
+++ b/tensorflow/tensorboard/components/tf_text_dashboard/demo/data/BUILD
@@ -0,0 +1,17 @@
+package(default_visibility = ["//tensorflow:internal"])
+
+load("@io_bazel_rules_closure//closure:defs.bzl", "web_library")
+
+licenses(["notice"])  # Apache 2.0
+
+web_library(
+    name = "data",
+    srcs = glob(["*"]),
+    path = "/tf-text-dashboard/demo/data/plugin/text",
+)
+
+filegroup(
+    name = "all_files",
+    srcs = glob(["**"]),
+    tags = ["notsan"],
+)
diff --git a/tensorflow/tensorboard/components/tf_text_dashboard_d3v4/BUILD b/tensorflow/tensorboard/components/tf_text_dashboard_d3v4/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..c15cf2fdc4a7b8c78a9c4ac80599402df458b7a1
--- /dev/null
+++ b/tensorflow/tensorboard/components/tf_text_dashboard_d3v4/BUILD
@@ -0,0 +1,45 @@
+package(default_visibility = ["//tensorflow:internal"])
+
+load("@io_bazel_rules_closure//closure:defs.bzl", "web_library")
+
+licenses(["notice"])  # Apache 2.0
+
+web_library(
+    name = "tf_text_dashboard_d3v4",
+    srcs = [
+        "tf-text-dashboard.html",
+        "tf-text-loader.html",
+    ],
+    path = "/tf-text-dashboard",
+    deps = [
+        "//tensorflow/tensorboard/components/tf_backend_d3v4",
+        "//tensorflow/tensorboard/components/tf_color_scale_d3v4",
+        "//tensorflow/tensorboard/components/tf_dashboard_common_d3v4",
+        "//tensorflow/tensorboard/components/tf_imports_d3v4:d3",
+        "//tensorflow/tensorboard/components/tf_imports_d3v4:lodash",
+        "@org_polymer",
+        "@org_polymer_paper_dialog",
+        "@org_polymer_paper_icon_button",
+        "@org_polymer_paper_material",
+        "@org_polymer_paper_slider",
+        "@org_polymer_paper_spinner",
+    ],
+)
+
+web_library(
+    name = "demo",
+    srcs = ["index.html"] + glob(["data/**"]),
+    path = "/tf-text-dashboard",
+    deps = [
+        ":tf_text_dashboard_d3v4",
+        "@org_polymer_iron_demo_helpers",
+        "@org_polymer_paper_styles",
+        "@org_polymer_webcomponentsjs",
+    ],
+)
+
+filegroup(
+    name = "all_files",
+    srcs = glob(["**"]),
+    tags = ["notsan"],
+)
diff --git a/tensorflow/tensorboard/components/tf_text_dashboard_d3v4/data/logdir b/tensorflow/tensorboard/components/tf_text_dashboard_d3v4/data/logdir
new file mode 100644
index 0000000000000000000000000000000000000000..c7d82022cc061502c5991a22e72c214918a9f87b
--- /dev/null
+++ b/tensorflow/tensorboard/components/tf_text_dashboard_d3v4/data/logdir
@@ -0,0 +1 @@
+{"logdir": "/some/fake/logdir"}
\ No newline at end of file
diff --git a/tensorflow/tensorboard/components/tf_text_dashboard_d3v4/data/runs.json b/tensorflow/tensorboard/components/tf_text_dashboard_d3v4/data/runs.json
new file mode 100644
index 0000000000000000000000000000000000000000..aea7de5f91725ab58e9770b9b6fb60ad672fada0
--- /dev/null
+++ b/tensorflow/tensorboard/components/tf_text_dashboard_d3v4/data/runs.json
@@ -0,0 +1 @@
+{"fry": ["message", "markdown"], "leela": ["message"]}
\ No newline at end of file
diff --git a/tensorflow/tensorboard/components/tf_text_dashboard_d3v4/data/text_run_fry_tag_markdown.json b/tensorflow/tensorboard/components/tf_text_dashboard_d3v4/data/text_run_fry_tag_markdown.json
new file mode 100644
index 0000000000000000000000000000000000000000..94183ae13d1be1f25abf89572841be0db5d1dfe1
--- /dev/null
+++ b/tensorflow/tensorboard/components/tf_text_dashboard_d3v4/data/text_run_fry_tag_markdown.json
@@ -0,0 +1,32 @@
+[
+  {
+    "wall_time": 1489715207.593146,
+    "step": 0,
+    "text": "<p><em>Italics1</em> <em>Italics2</em> <strong>bold1</strong> <strong>bold2</strong></p>"
+  },
+  {
+    "wall_time": 1489715207.593801,
+    "step": 1,
+    "text": "<ol>\n<li>List item one.</li>\n<li>List item two.</li>\n<li>Sublist</li>\n<li>Sublist2</li>\n<li>List continues.</li>\n</ol>"
+  },
+  {
+    "wall_time": 1489715207.594842,
+    "step": 2,
+    "text": "<table>\n<thead>\n<tr>\n<th>An</th>\n<th>Example</th>\n<th>Table</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>A</td>\n<td>B</td>\n<td>C</td>\n</tr>\n<tr>\n<td>1</td>\n<td>2</td>\n<td>3</td>\n</tr>\n</tbody>\n</table>"
+  },
+  {
+    "wall_time": 1489715207.595761,
+    "step": 3,
+    "text": "<p>hello <a><em>you</em></a></p>"
+  },
+  {
+    "wall_time": 1489715207.595761,
+    "step": 4,
+    "text": "<p><a href=\"http://tensorflow.org\">TensorFlow</a></p>"
+  },
+  {
+    "wall_time": 1489715207.595761,
+    "step": 530234352,
+    "text": "&lt;script&gt;alert('xss')&lt;/script&gt;"
+  }
+]
diff --git a/tensorflow/tensorboard/components/tf_text_dashboard_d3v4/data/text_run_fry_tag_message.json b/tensorflow/tensorboard/components/tf_text_dashboard_d3v4/data/text_run_fry_tag_message.json
new file mode 100644
index 0000000000000000000000000000000000000000..e8cc006c0d0223795d646bf5245ae56e54329fa0
--- /dev/null
+++ b/tensorflow/tensorboard/components/tf_text_dashboard_d3v4/data/text_run_fry_tag_message.json
@@ -0,0 +1,22 @@
+[
+  {
+    "wall_time": 1489715207.593146,
+    "step": 0,
+    "text": "fry loves garnet"
+  },
+  {
+    "wall_time": 1489715207.593801,
+    "step": 1,
+    "text": "fry loves amethyst"
+  },
+  {
+    "wall_time": 1489715207.594842,
+    "step": 2,
+    "text": "fry loves pearl"
+  },
+  {
+    "wall_time": 1489715207.595761,
+    "step": 3,
+    "text": "fry loves steven"
+  }
+]
\ No newline at end of file
diff --git a/tensorflow/tensorboard/components/tf_text_dashboard_d3v4/data/text_run_leela_tag_message.json b/tensorflow/tensorboard/components/tf_text_dashboard_d3v4/data/text_run_leela_tag_message.json
new file mode 100644
index 0000000000000000000000000000000000000000..5a6d2598937b4e16b5420cac9423cfdd8b16ff48
--- /dev/null
+++ b/tensorflow/tensorboard/components/tf_text_dashboard_d3v4/data/text_run_leela_tag_message.json
@@ -0,0 +1,22 @@
+[
+  {
+    "step": 0,
+    "wall_time": 1489715207.607792,
+    "text": "leela loves garnet and feels strongly about various issues of the day including the two-cent titanium tax and whether nixon's head contributes to greenhouse gas emissions"
+  },
+  {
+    "step": 1,
+    "wall_time": 1489715207.609011,
+    "text": "leela loves amethyst"
+  },
+  {
+    "step": 2,
+    "wall_time": 1489715207.610028,
+    "text": "leela loves pearl"
+  },
+  {
+    "step": 3,
+    "wall_time": 1489715207.611142,
+    "text": "leela loves someverylongwordwithoutanybreaksorspacessowecanseehowthatishandledbythefrontend"
+  }
+]
\ No newline at end of file
diff --git a/tensorflow/tensorboard/components/tf_text_dashboard_d3v4/index.html b/tensorflow/tensorboard/components/tf_text_dashboard_d3v4/index.html
new file mode 100644
index 0000000000000000000000000000000000000000..77d19b948c9d1844f6e6c2990075f14e1a6e6347
--- /dev/null
+++ b/tensorflow/tensorboard/components/tf_text_dashboard_d3v4/index.html
@@ -0,0 +1,68 @@
+<!doctype html>
+<!--
+@license
+Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+-->
+
+<html>
+  <head>
+    <script src="../webcomponentsjs/webcomponents-lite.min.js"></script>
+    <link rel="import" href="../iron-demo-helpers/demo-snippet.html">
+    <link rel="import" href="tf-text-dashboard.html">
+    <title>text Dashboard Demo</title>
+    <style>
+      #container{
+        height: 800px;
+        border: 2px solid grey;
+      }
+      html, body {
+        margin: 0;
+        padding: 0;
+        height: 100%;
+        font-family: "RobotoDraft","Roboto",sans-serif;
+      }
+    </style>
+  </head>
+  <body>
+    <demo-snippet>
+      <template>
+        <dom-module id="text-dash-demo">
+          <template>
+            <tf-text-dashboard id="demo" backend="[[backend]]">
+            </tf-text-dashboard>
+          </template>
+          <script>
+            Polymer({
+              is: "text-dash-demo",
+              properties: {
+                backend: {
+                  type: Object,
+                  value: function() {
+                    var path = "data";
+                    var router = new TF.Backend.router(path, true);
+                    return new TF.Backend.Backend(router);
+                  },
+                },
+              },
+            });
+          </script>
+        </dom-module>
+        <div id="container">
+          <text-dash-demo></text-dash-demo>
+        </div>
+      </template>
+    </demo-snippet>
+  </body>
+</html>
diff --git a/tensorflow/tensorboard/components/tf_text_dashboard_d3v4/tf-text-dashboard.html b/tensorflow/tensorboard/components/tf_text_dashboard_d3v4/tf-text-dashboard.html
new file mode 100644
index 0000000000000000000000000000000000000000..4c0b34055d09f1b87e0eaf2faefc6d349c8c94c9
--- /dev/null
+++ b/tensorflow/tensorboard/components/tf_text_dashboard_d3v4/tf-text-dashboard.html
@@ -0,0 +1,109 @@
+<!--
+@license
+Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+-->
+
+<link rel="import" href="../polymer/polymer.html">
+<link rel="import" href="../paper-dialog/paper-dialog.html">
+<link rel="import" href="../paper-icon-button/paper-icon-button.html">
+<link rel="import" href="../tf-backend/tf-backend.html">
+<link rel="import" href="../tf-color-scale/tf-color-scale.html">
+<link rel="import" href="../tf-dashboard-common/tf-dashboard.html">
+<link rel="import" href="../tf-dashboard-common/tf-panes-helper.html">
+<link rel="import" href="../tf-dashboard-common/tf-sidebar-helper.html">
+<link rel="import" href="tf-text-loader.html">
+
+<!--
+tf-text-dashboard displays a dashboard that loads texts from a TensorFlow run.
+-->
+<dom-module id="tf-text-dashboard">
+  <template>
+    <paper-dialog with-backdrop id="actual-text-size-dialog"></paper-dialog>
+    <div id="plumbing">
+      <tf-color-scale
+        id="colorScale"
+        runs="[[runs]]"
+        out-color-scale="{{_colorScale}}"
+        ></tf-color-scale>
+    </div>
+
+    <tf-dashboard-layout>
+      <div class="sidebar">
+        <tf-sidebar-helper
+          backend="[[backend]]"
+          categories="{{_categories}}"
+          color-scale="[[_colorScale]]"
+          run2tag="[[run2tag]]"
+          runs="[[runs]]"
+          selected-runs="{{_selectedRuns}}"
+          >
+        </tf-sidebar-helper>
+      </div>
+      <div class="center">
+        <tf-panes-helper
+          categories="[[_categories]]"
+          color-scale="[[_colorScale]]"
+          data-type="[[dataType]]"
+          data-provider="[[dataProvider]]"
+          data-not-found="[[dataNotFound]]"
+          run2tag="[[run2tag]]"
+          selected-runs="[[_selectedRuns]]"
+          repeat-for-runs
+          >
+          <template>
+            <tf-text-loader color-scale="[[_colorScale]]"></tf-text-loader>
+          </template>
+        </tf-panes-helper>
+      </div>
+    </tf-dashboard-layout>
+    <style include="dashboard-style"></style>
+    <style>
+      tf-panes-helper {
+        --card-width: 100%;
+        --card-height: auto;
+        --card-expanded-width: 100%;
+        --card-expanded-height: 1000px;
+        --card-padding: 0 5px 5px 5px;
+        --show-expand-button: none;
+      }
+
+    </style>
+  </template>
+  <script>
+    TF.Dashboard.TfTextDashboard = Polymer({
+      is: "tf-text-dashboard",
+      factoryImpl: function(backend) {
+        this.backend = backend;
+      },
+      properties: {
+        backend: Object,
+        dataType: {
+          type: String,
+          value: "text"
+        },
+      },
+      behaviors: [
+        TF.Dashboard.DashboardBehavior("text"),
+        TF.Dashboard.ReloadBehavior("tf-chart-scaffold"),
+        TF.Backend.BackendBehavior,
+      ],
+      attached: function() {
+        this.async(function() {
+          this.fire("rendered");
+        });
+      },
+    });
+  </script>
+</dom-module>
diff --git a/tensorflow/tensorboard/components/tf_text_dashboard_d3v4/tf-text-loader.html b/tensorflow/tensorboard/components/tf_text_dashboard_d3v4/tf-text-loader.html
new file mode 100644
index 0000000000000000000000000000000000000000..374e0478dd19d6cd667e293bb5acca487de2cad8
--- /dev/null
+++ b/tensorflow/tensorboard/components/tf_text_dashboard_d3v4/tf-text-loader.html
@@ -0,0 +1,143 @@
+<!--
+@license
+Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+-->
+
+<link rel="import" href="../polymer/polymer.html">
+<link rel="import" href="../paper-material/paper-material.html">
+<link rel="import" href="../tf-dashboard-common/scrollbar-style.html">
+<link rel="import" href="../tf-imports/d3.html">
+
+<!--
+tf-text-loader displays markdown text data from the Text plugin.
+-->
+
+<style>
+  tf-text-loader p {
+    margin: 0.3em 0;
+  }
+
+  tf-text-loader table {
+    border-collapse: collapse;
+  }
+
+  tf-text-loader table th {
+    font-weight: 600;
+  }
+
+  tf-text-loader table th,
+  tf-text-loader table td {
+    padding: 6px 13px;
+    border: 1px solid #dfe2e5;
+  }
+
+  tf-text-loader table tr {
+    background-color: #fff;
+    border-top: 1px solid #c6cbd1;
+  }
+
+</style>
+<dom-module id="tf-text-loader">
+
+  <!-- Set the innerHTML with the textual content, so we can render the
+   html generated by our markdown parser. Note this content is always
+   sanitized by the backend, so xss attacks are not possible.
+  -->
+  <template>
+    <style include="scrollbar-style"></style>
+    <paper-material elevation="1" id="outer" class="container scrollbar">
+      <template id="repeater" is="dom-repeat" items="[[_texts]]">
+      <paper-material elevation="1" class="step-container">
+        step <span class="step-value">[[_numfmt(item.step)]]</span>
+      </paper-material>
+      <paper-material elevation="1" inner-h-t-m-l="[[item.text]]" class="text">
+        </paper-material>
+      </template>
+    </paper-material>
+
+
+    <style>
+      #outer {
+        display: block;
+        overflow: auto;
+        max-height: 500px;
+        position: relative;
+        border-radius: 3px;
+        border: 2px solid black;
+      }
+      .text {
+        margin: 0 10px 10px 10px;
+        border-radius: 0 3px 3px 3px;
+        background-color: white;
+        padding: 5px;
+        word-break: break-word;
+      }
+      .step-container {
+        border-left: 1px solid #ccc;
+        border-right: 1px solid #ccc;
+        border-top: 1px solid #ccc;
+        border-radius: 3px 3px 0 0;
+        font-style: italic;
+        margin-top: 10px;
+        background-color: var(--tb-ui-light-accent);
+        display: inline-block;
+        margin-left: 9px;
+        padding: 3px;
+        font-size: 12px;
+      }
+
+    </style>
+
+  </template>
+  <script>
+    Polymer({
+      is: "tf-text-loader",
+      properties: {
+        colorScale: Object,
+        run: String,
+        // This is an array of Tensorboard Text&Datum objects (See backend.ts for details). The
+        // properties of objects in this array are
+        // {
+        //   wall_time: Date,
+        //   step: number,
+        //   text: string,
+        // }
+        // they are ordered from most recent to oldest
+        _texts: {
+          type: Array,
+          value: [],
+        },
+
+      },
+      redraw: function() {
+        // Other dashboards logic requires a redraw method to be defined.
+      },
+      setVisibleSeries: function(runs) {
+        // Do nothing.
+      },
+      setSeriesData: function(run, texts) {
+        this.set("run", run);
+        this.set("_texts", texts.reverse());
+
+        // Update the border color based on the run.
+        var color = this.colorScale.scale(run);
+        this.$$("#outer").style.borderColor = color;
+      },
+      _numfmt: function(n) {
+        return d3.format(",")(n);
+      }
+    });
+  </script>
+</dom-module>
diff --git a/tensorflow/tensorboard/components/vz_data_summary/BUILD b/tensorflow/tensorboard/components/vz_data_summary/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..a4ba0c089c94425d6613b334c05495d38772c601
--- /dev/null
+++ b/tensorflow/tensorboard/components/vz_data_summary/BUILD
@@ -0,0 +1,91 @@
+package(default_visibility = ["//visibility:public"])
+
+load(
+    "//tensorflow/tensorboard:defs.bzl",
+    "tensorboard_ts_config",
+    "tensorboard_ts_declaration",
+    "tensorboard_ts_development_sources",
+    "tensorboard_ts_devserver",
+    "tensorboard_ts_library",
+    "tensorboard_webcomponent_library",
+)
+
+licenses(["notice"])  # Apache 2.0
+
+tensorboard_webcomponent_library(
+    name = "lib",
+    srcs = ["vz-data-summary.html"],
+    ts_lib_deps = [":ts_lib"],
+    destdir = "vz-data-summary",
+    deps = [
+        "//learning/vis/vz_elements:common",
+        "//third_party/javascript/d3/v3:lib",
+        "//third_party/javascript/polymer/v1/iron-demo-helpers:lib",
+        "//third_party/javascript/polymer/v1/iron-resizable-behavior:lib",
+        "//third_party/javascript/polymer/v1/polymer:lib",
+    ],
+)
+
+tensorboard_ts_library(
+    name = "ts_lib",
+    srcs = ["vz-data-summary.ts"],
+    externs_list = [":externs"],
+    deps = [
+        ":typings",
+        "//third_party/javascript/typings/polymer:polymer_without_externs",
+    ],
+)
+
+tensorboard_ts_declaration(
+    name = "typings",
+    srcs = ["typings.d.ts"],
+)
+
+# This build rule is used to run the demo.
+tensorboard_ts_devserver(
+    name = "dev_server",
+    manifest = ":dev_sources",
+    serving_path = "/demo_lib_out/vz-data-summary/vz-data-summary.js",
+    static_files = [":demo_lib"],
+    deps = [":tsconfig"],
+)
+
+tensorboard_webcomponent_library(
+    name = "demo_lib",
+    srcs = ["demo.html"],
+    destdir = "vz-data-summary",
+    deps = [
+        ":lib",
+        "//third_party/javascript/d3/v3:lib",
+        "//third_party/javascript/polymer/v1/iron-demo-helpers:lib",
+        "//third_party/javascript/polymer/v1/iron-resizable-behavior:lib",
+        "//third_party/javascript/polymer/v1/polymer:lib",
+    ],
+)
+
+tensorboard_ts_library(
+    name = "demo_ts_lib",
+    srcs = ["demo.ts"],
+    externs_list = [":externs"],
+    deps = [
+        ":ts_lib",
+        "//third_party/javascript/typings/d3",
+    ],
+)
+
+tensorboard_ts_development_sources(
+    name = "dev_sources",
+    deps = [":demo_ts_lib"],
+)
+
+tensorboard_ts_config(
+    name = "tsconfig",
+    deps = [":ts_lib"],
+)
+
+filegroup(
+    name = "all_files",
+    srcs = glob(["**"]),
+    tags = ["notsan"],
+    visibility = ["//tensorflow:__subpackages__"],
+)
diff --git a/tensorflow/tensorboard/components/vz_data_summary/BUILD.OPENSOURCE b/tensorflow/tensorboard/components/vz_data_summary/BUILD.OPENSOURCE
deleted file mode 100644
index 9743d70d947c13edf455b9306e60757f8b104d68..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/components/vz_data_summary/BUILD.OPENSOURCE
+++ /dev/null
@@ -1,34 +0,0 @@
-# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the 'License');
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an 'AS IS' BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# =============================================================================
-
-# Description:
-# Package for the data-summary vz-element.
-package(default_visibility = ["//tensorflow:internal"])
-
-licenses(["notice"])  # Apache 2.0
-
-exports_files(["LICENSE"])
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/tensorboard/components/vz_distribution_chart/BUILD b/tensorflow/tensorboard/components/vz_distribution_chart/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..3346805fb067f03f70b816cf42c4dce8a9958639
--- /dev/null
+++ b/tensorflow/tensorboard/components/vz_distribution_chart/BUILD
@@ -0,0 +1,69 @@
+package(default_visibility = ["//tensorflow:internal"])
+
+load("@io_bazel_rules_closure//closure:defs.bzl", "web_library")
+load("//tensorflow/tensorboard:defs.bzl", "tensorboard_ts_library")
+load("//tensorflow/tensorboard:defs.bzl", "tensorboard_typescript_genrule")
+load("//tensorflow/tensorboard:defs.bzl", "tensorboard_webcomponent_library")
+
+licenses(["notice"])  # Apache 2.0
+
+web_library(
+    name = "vz_distribution_chart",
+    srcs = [
+        "vz-distribution-chart.html",
+        ":ts",
+    ],
+    path = "/vz-distribution-chart",
+    deps = [
+        "//tensorflow/tensorboard/components/tf_imports:lodash",
+        "//tensorflow/tensorboard/components/tf_imports:plottable",
+        "//tensorflow/tensorboard/components/vz_line_chart",
+        "@org_polymer",
+    ],
+)
+
+tensorboard_typescript_genrule(
+    name = "ts",
+    srcs = ["vz-distribution-chart.ts"],
+    typings = [
+        "@org_definitelytyped//:d3.d.ts",
+        "@com_palantir_plottable//:plottable.d.ts",
+        "@org_definitelytyped//:lodash.d.ts",
+        "//tensorflow/tensorboard/components/vz_line_chart:ts_typings",
+    ],
+)
+
+filegroup(
+    name = "all_files",
+    srcs = glob(["**"]),
+    tags = ["notsan"],
+)
+
+################################################################################
+# MARKED FOR DELETION
+
+tensorboard_webcomponent_library(
+    name = "legacy",
+    srcs = [
+        "vz-distribution-chart.html",
+        ":legacy_ts",
+    ],
+    visibility = ["//visibility:public"],
+    destdir = "vz-distribution-chart",
+    deps = [
+        "//tensorflow/tensorboard/components:tf_imports",
+        "//tensorflow/tensorboard/components/vz_sorting:legacy",
+        "//third_party/javascript/polymer/v1/polymer:lib",
+    ],
+)
+
+tensorboard_ts_library(
+    name = "legacy_ts",
+    srcs = ["vz-distribution-chart.ts"],
+    deps_mgmt = "off",
+    runtime = "nodejs",
+    deps = [
+        "//tensorflow/tensorboard/components:common_deps",
+        "//tensorflow/tensorboard/components/vz_line_chart:legacy_ts",
+    ],
+)
diff --git a/tensorflow/tensorboard/components/vz_distribution_chart/demo/BUILD b/tensorflow/tensorboard/components/vz_distribution_chart/demo/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..2c9af4b3dc8d851ff76d432f79df45f6d8128400
--- /dev/null
+++ b/tensorflow/tensorboard/components/vz_distribution_chart/demo/BUILD
@@ -0,0 +1,24 @@
+package(default_visibility = ["//tensorflow:internal"])
+
+load("@io_bazel_rules_closure//closure:defs.bzl", "web_library")
+
+licenses(["notice"])  # Apache 2.0
+
+# bazel run //third_party/tensorflow/tensorboard/components/vz_distribution_chart/demo
+web_library(
+    name = "demo",
+    srcs = ["index.html"],
+    path = "/vz-distribution-chart/demo",
+    deps = [
+        "//tensorflow/tensorboard/components/vz_distribution_chart",
+        "@org_polymer_iron_demo_helpers",
+        "@org_polymer_paper_styles",
+        "@org_polymer_webcomponentsjs",
+    ],
+)
+
+filegroup(
+    name = "all_files",
+    srcs = glob(["**"]),
+    tags = ["notsan"],
+)
diff --git a/tensorflow/tensorboard/components/vz_distribution_chart_d3v4/BUILD b/tensorflow/tensorboard/components/vz_distribution_chart_d3v4/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..3ff60d5143a2495cff7259e8d55459c1f58305de
--- /dev/null
+++ b/tensorflow/tensorboard/components/vz_distribution_chart_d3v4/BUILD
@@ -0,0 +1,66 @@
+package(default_visibility = ["//tensorflow:internal"])
+
+load("@io_bazel_rules_closure//closure:defs.bzl", "web_library")
+load("//tensorflow/tensorboard:hacks.bzl", "tensorboard_typescript_bundle")
+load("//tensorflow/tensorboard:defs.bzl", "tensorboard_typescript_genrule")
+
+licenses(["notice"])  # Apache 2.0
+
+web_library(
+    name = "vz_distribution_chart_d3v4",
+    srcs = [
+        "bundle.js",
+        "vz-distribution-chart.html",
+    ],
+    path = "/vz-distribution-chart",
+    visibility = ["//visibility:public"],
+    deps = [
+        "//tensorflow/tensorboard/components/tf_imports_d3v4:lodash",
+        "//tensorflow/tensorboard/components/tf_imports_d3v4:plottable",
+        "//tensorflow/tensorboard/components/vz_line_chart_d3v4",
+        "@org_polymer",
+    ],
+)
+
+web_library(
+    name = "demo",
+    srcs = ["index.html"],
+    path = "/vz-distribution-chart",
+    deps = [
+        ":vz_distribution_chart_d3v4",
+        "@org_polymer_iron_demo_helpers",
+        "@org_polymer_paper_styles",
+        "@org_polymer_webcomponentsjs",
+    ],
+)
+
+tensorboard_typescript_genrule(
+    name = "ts",
+    srcs = ["bundle.ts"],
+    typings = [
+        "//tensorflow/tensorboard/components/tf_imports_d3v4:d3.d.ts",
+        "//tensorflow/tensorboard/components/tf_imports_d3v4:plottable.d.ts",
+        "@org_definitelytyped//:lodash.d.ts",
+        "@org_definitelytyped//:polymer.d.ts",
+        "@org_definitelytyped//:webcomponents.js.d.ts",
+        "//tensorflow/tensorboard/components/vz_line_chart_d3v4:bundle.d.ts",
+    ],
+)
+
+tensorboard_typescript_bundle(
+    name = "bundle",
+    out = "bundle.ts",
+    namespace_srcs = {"VZ": [
+        "vz-distribution-chart.ts",
+    ]},
+    namespace_symbol_aliases = {"VZ": {
+        "Dataset": "Plottable.Dataset",
+        "ChartHelpers": "VZ.ChartHelpers",
+    }},
+)
+
+filegroup(
+    name = "all_files",
+    srcs = glob(["**"]),
+    tags = ["notsan"],
+)
diff --git a/tensorflow/tensorboard/components/vz_distribution_chart_d3v4/demo.html b/tensorflow/tensorboard/components/vz_distribution_chart_d3v4/index.html
similarity index 96%
rename from tensorflow/tensorboard/components/vz_distribution_chart_d3v4/demo.html
rename to tensorflow/tensorboard/components/vz_distribution_chart_d3v4/index.html
index 0a1a344056cef09b14d710644dbd35a531cbfd68..39db09354bd527fa90bb05f0d7656991b1d2383a 100644
--- a/tensorflow/tensorboard/components/vz_distribution_chart_d3v4/demo.html
+++ b/tensorflow/tensorboard/components/vz_distribution_chart_d3v4/index.html
@@ -21,11 +21,10 @@ limitations under the License.
     <meta charset="utf-8">
     <meta name="viewport" content="width=device-width, initial-scale=1.0">
     <title>vz-distribution chart demo</title>
+    <script src="../webcomponentsjs/webcomponents-lite.min.js"></script>
     <link rel="import" href="vz-distribution-chart.html">
-    <link rel="import" href="iron-demo-helpers/demo-snippet.html">
-    <link rel="import" href="paper-styles/typography.html">
-    <script src="bundle.js"></script>
-
+    <link rel="import" href="../iron-demo-helpers/demo-snippet.html">
+    <link rel="import" href="../paper-styles/typography.html">
     <style type="text/css">
       body {
         font-family: "Roboto";
diff --git a/tensorflow/tensorboard/components/vz_distribution_chart_d3v4/vz-distribution-chart.html b/tensorflow/tensorboard/components/vz_distribution_chart_d3v4/vz-distribution-chart.html
index f274a4c24c0be714337fb441f12ea4c6bdbc3e0c..3c517bd164e8981251ec490410fb6d357221bbac 100644
--- a/tensorflow/tensorboard/components/vz_distribution_chart_d3v4/vz-distribution-chart.html
+++ b/tensorflow/tensorboard/components/vz_distribution_chart_d3v4/vz-distribution-chart.html
@@ -15,8 +15,9 @@ See the License for the specific language governing permissions and
 limitations under the License.
 -->
 
-<link rel="import" href="polymer/polymer.html">
-<link rel="import" href="plottable-library/plottable_css.html">
+<link rel="import" href="../polymer/polymer.html">
+<link rel="import" href="../tf-imports/plottable.html">
+<link rel="import" href="../vz-line-chart/vz-line-chart.html">
 
 <dom-module id="vz-distribution-chart">
   <template>
@@ -40,4 +41,5 @@ limitations under the License.
 
     </style>
   </template>
+  <script src="bundle.js"></script>
 </dom-module>
diff --git a/tensorflow/tensorboard/components/vz_histogram_timeseries/BUILD b/tensorflow/tensorboard/components/vz_histogram_timeseries/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..d36cf2b33106fadba2aa0045bacb44a497820411
--- /dev/null
+++ b/tensorflow/tensorboard/components/vz_histogram_timeseries/BUILD
@@ -0,0 +1,50 @@
+package(default_visibility = ["//tensorflow:internal"])
+
+load("@io_bazel_rules_closure//closure:defs.bzl", "web_library")
+load("//tensorflow/tensorboard:defs.bzl", "tensorboard_ts_library")
+load("//tensorflow/tensorboard:defs.bzl", "tensorboard_webcomponent_library")
+
+licenses(["notice"])  # Apache 2.0
+
+web_library(
+    name = "vz_histogram_timeseries",
+    srcs = [
+        "vz-histogram-timeseries.html",
+    ],
+    path = "/vz-histogram-timeseries",
+    deps = [
+        "//tensorflow/tensorboard/components/tf_imports:d3",
+        "@org_polymer",
+    ],
+)
+
+filegroup(
+    name = "all_files",
+    srcs = glob(["**"]),
+    tags = ["notsan"],
+)
+
+################################################################################
+# MARKED FOR DELETION
+
+tensorboard_webcomponent_library(
+    name = "legacy",
+    srcs = [
+        "index.html",
+        "vz-histogram-timeseries.html",
+        ":legacy_ts",
+    ],
+    visibility = ["//visibility:public"],
+    destdir = "vz-histogram-timeseries",
+    deps = [
+        "//tensorflow/tensorboard/components:tf_imports",
+        "//third_party/javascript/polymer/v1/polymer:lib",
+    ],
+)
+
+# This is needed: components/BUILD seeks a legacy_ts rule in this package.
+tensorboard_ts_library(
+    name = "legacy_ts",
+    deps_mgmt = "off",
+    runtime = "nodejs",
+)
diff --git a/tensorflow/tensorboard/components/vz_histogram_timeseries/demo/BUILD b/tensorflow/tensorboard/components/vz_histogram_timeseries/demo/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..8feb1c99260f81d6e358901e7b63b63c04e3c394
--- /dev/null
+++ b/tensorflow/tensorboard/components/vz_histogram_timeseries/demo/BUILD
@@ -0,0 +1,25 @@
+package(default_visibility = ["//tensorflow:internal"])
+
+load("@io_bazel_rules_closure//closure:defs.bzl", "web_library")
+
+licenses(["notice"])  # Apache 2.0
+
+# bazel run //third_party/tensorflow/tensorboard/components/vz_histogram_timeseries/demo
+web_library(
+    name = "demo",
+    srcs = ["index.html"],
+    path = "/vz-histogram-timeseries/demo",
+    deps = [
+        "//tensorflow/tensorboard/components/vz_histogram_timeseries",
+        "@org_polymer_iron_demo_helpers",
+        "@org_polymer_paper_button",
+        "@org_polymer_paper_styles",
+        "@org_polymer_webcomponentsjs",
+    ],
+)
+
+filegroup(
+    name = "all_files",
+    srcs = glob(["**"]),
+    tags = ["notsan"],
+)
diff --git a/tensorflow/tensorboard/components/vz_histogram_timeseries_d3v4/BUILD b/tensorflow/tensorboard/components/vz_histogram_timeseries_d3v4/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..30adc0ad3a19f737f0d3bc716c60671b95ad6831
--- /dev/null
+++ b/tensorflow/tensorboard/components/vz_histogram_timeseries_d3v4/BUILD
@@ -0,0 +1,34 @@
+package(default_visibility = ["//tensorflow:internal"])
+
+load("@io_bazel_rules_closure//closure:defs.bzl", "web_library")
+
+licenses(["notice"])  # Apache 2.0
+
+web_library(
+    name = "vz_histogram_timeseries_d3v4",
+    srcs = ["vz-histogram-timeseries.html"],
+    path = "/vz-histogram-timeseries",
+    deps = [
+        "//tensorflow/tensorboard/components/tf_imports_d3v4:d3",
+        "@org_polymer",
+    ],
+)
+
+web_library(
+    name = "demo",
+    srcs = ["index.html"],
+    path = "/vz-histogram-timeseries",
+    deps = [
+        ":vz_histogram_timeseries_d3v4",
+        "@org_polymer_iron_demo_helpers",
+        "@org_polymer_paper_button",
+        "@org_polymer_paper_styles",
+        "@org_polymer_webcomponentsjs",
+    ],
+)
+
+filegroup(
+    name = "all_files",
+    srcs = glob(["**"]),
+    tags = ["notsan"],
+)
diff --git a/tensorflow/tensorboard/components/vz_histogram_timeseries_d3v4/demo.html b/tensorflow/tensorboard/components/vz_histogram_timeseries_d3v4/index.html
similarity index 100%
rename from tensorflow/tensorboard/components/vz_histogram_timeseries_d3v4/demo.html
rename to tensorflow/tensorboard/components/vz_histogram_timeseries_d3v4/index.html
diff --git a/tensorflow/tensorboard/components/vz_histogram_timeseries_d3v4/vz-histogram-timeseries.html b/tensorflow/tensorboard/components/vz_histogram_timeseries_d3v4/vz-histogram-timeseries.html
index 88def1710cdd8b4fdd85b4056c043c5871f2460a..bdba230077d48d3602be2213ea0aa18f7d60a5b4 100644
--- a/tensorflow/tensorboard/components/vz_histogram_timeseries_d3v4/vz-histogram-timeseries.html
+++ b/tensorflow/tensorboard/components/vz_histogram_timeseries_d3v4/vz-histogram-timeseries.html
@@ -16,7 +16,7 @@ limitations under the License.
 -->
 
 <link rel="import" href="../polymer/polymer.html">
-<link rel="import" href="../tf-imports-d3v4/d3.html">
+<link rel="import" href="../tf-imports/d3.html">
 
 <!--
 vz-histogram-timeseries creates an element that draws beautiful histograms for
diff --git a/tensorflow/tensorboard/components/vz_line_chart/BUILD b/tensorflow/tensorboard/components/vz_line_chart/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..130d58dec79a0c70dfcaef2a76affe31e9a51cf6
--- /dev/null
+++ b/tensorflow/tensorboard/components/vz_line_chart/BUILD
@@ -0,0 +1,73 @@
+package(default_visibility = ["//tensorflow:internal"])
+
+load("@io_bazel_rules_closure//closure:defs.bzl", "web_library")
+load("//tensorflow/tensorboard:defs.bzl", "tensorboard_ts_library")
+load("//tensorflow/tensorboard:defs.bzl", "tensorboard_typescript_genrule")
+load("//tensorflow/tensorboard:defs.bzl", "tensorboard_webcomponent_library")
+
+licenses(["notice"])  # Apache 2.0
+
+web_library(
+    name = "vz_line_chart",
+    srcs = [
+        "vz-line-chart.html",
+        ":ts",
+    ],
+    path = "/vz-line-chart",
+    deps = [
+        "//tensorflow/tensorboard/components/tf_imports:lodash",
+        "//tensorflow/tensorboard/components/tf_imports:plottable",
+        "@org_polymer",
+    ],
+)
+
+tensorboard_typescript_genrule(
+    name = "ts",
+    srcs = [
+        "dragZoomInteraction.ts",
+        "vz-chart-helpers.ts",
+        "vz-line-chart.ts",
+    ],
+    typings = [
+        "@org_definitelytyped//:d3.d.ts",
+        "@com_palantir_plottable//:plottable.d.ts",
+        "@org_definitelytyped//:lodash.d.ts",
+    ],
+)
+
+filegroup(
+    name = "all_files",
+    srcs = glob(["**"]),
+    tags = ["notsan"],
+)
+
+################################################################################
+# MARKED FOR DELETION
+
+tensorboard_webcomponent_library(
+    name = "legacy",
+    srcs = [
+        "index.html",
+        "vz-line-chart.html",
+        ":legacy_ts",
+    ],
+    visibility = ["//visibility:public"],
+    destdir = "vz-line-chart",
+    deps = [
+        "//tensorflow/tensorboard/components:tf_imports",
+        "//tensorflow/tensorboard/components/vz_sorting:legacy",
+        "//third_party/javascript/polymer/v1/polymer:lib",
+    ],
+)
+
+tensorboard_ts_library(
+    name = "legacy_ts",
+    srcs = [
+        "dragZoomInteraction.ts",
+        "vz-chart-helpers.ts",
+        "vz-line-chart.ts",
+    ],
+    deps_mgmt = "off",
+    runtime = "nodejs",
+    deps = ["//tensorflow/tensorboard/components:common_deps"],
+)
diff --git a/tensorflow/tensorboard/components/vz_line_chart/demo/BUILD b/tensorflow/tensorboard/components/vz_line_chart/demo/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..29e12160eac1b1fe09701c1e4be976cdfc181918
--- /dev/null
+++ b/tensorflow/tensorboard/components/vz_line_chart/demo/BUILD
@@ -0,0 +1,24 @@
+package(default_visibility = ["//tensorflow:internal"])
+
+load("@io_bazel_rules_closure//closure:defs.bzl", "web_library")
+
+licenses(["notice"])  # Apache 2.0
+
+# bazel run //third_party/tensorflow/tensorboard/components/vz_line_chart/demo
+web_library(
+    name = "demo",
+    srcs = ["index.html"],
+    path = "/vz-line-chart/demo",
+    deps = [
+        "//tensorflow/tensorboard/components/vz_line_chart",
+        "@org_polymer_iron_demo_helpers",
+        "@org_polymer_paper_styles",
+        "@org_polymer_webcomponentsjs",
+    ],
+)
+
+filegroup(
+    name = "all_files",
+    srcs = glob(["**"]),
+    tags = ["notsan"],
+)
diff --git a/tensorflow/tensorboard/components/vz_line_chart_d3v4/BUILD b/tensorflow/tensorboard/components/vz_line_chart_d3v4/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..f6690c3d7de37d0decd06a6dbd98a216f87b2a9b
--- /dev/null
+++ b/tensorflow/tensorboard/components/vz_line_chart_d3v4/BUILD
@@ -0,0 +1,72 @@
+package(default_visibility = ["//tensorflow:internal"])
+
+load("@io_bazel_rules_closure//closure:defs.bzl", "web_library")
+load("//tensorflow/tensorboard:hacks.bzl", "tensorboard_typescript_bundle")
+load("//tensorflow/tensorboard:defs.bzl", "tensorboard_typescript_genrule")
+
+licenses(["notice"])  # Apache 2.0
+
+web_library(
+    name = "vz_line_chart_d3v4",
+    srcs = [
+        "bundle.js",
+        "vz-line-chart.html",
+    ],
+    path = "/vz-line-chart",
+    visibility = ["//visibility:public"],
+    deps = [
+        "//tensorflow/tensorboard/components/tf_imports_d3v4:d3",
+        "//tensorflow/tensorboard/components/tf_imports_d3v4:lodash",
+        "//tensorflow/tensorboard/components/tf_imports_d3v4:plottable",
+        "@org_polymer",
+    ],
+)
+
+web_library(
+    name = "demo",
+    srcs = ["index.html"],
+    path = "/vz-line-chart",
+    deps = [
+        ":vz_line_chart_d3v4",
+        "@org_polymer_iron_demo_helpers",
+        "@org_polymer_paper_styles",
+        "@org_polymer_webcomponentsjs",
+    ],
+)
+
+tensorboard_typescript_genrule(
+    name = "ts",
+    srcs = ["bundle.ts"],
+    typings = [
+        "@org_definitelytyped//:lodash.d.ts",
+        "@org_definitelytyped//:polymer.d.ts",
+        "@org_definitelytyped//:webcomponents.js.d.ts",
+        "//tensorflow/tensorboard/components/tf_imports_d3v4:d3.d.ts",
+        "//tensorflow/tensorboard/components/tf_imports_d3v4:plottable.d.ts",
+    ],
+)
+
+tensorboard_typescript_bundle(
+    name = "bundle",
+    out = "bundle.ts",
+    namespace_srcs = {
+        "VZ.ChartHelpers": [
+            "vz-chart-helpers.ts",
+        ],
+        "VZ": [
+            "vz-line-chart.ts",
+            "dragZoomInteraction.ts",
+        ],
+    },
+    namespace_symbol_aliases = {
+        "VZ.ChartHelpers": {
+            "Dataset": "Plottable.Dataset",
+        },
+    },
+)
+
+filegroup(
+    name = "all_files",
+    srcs = glob(["**"]),
+    tags = ["notsan"],
+)
diff --git a/tensorflow/tensorboard/components/vz_line_chart_d3v4/demo.html b/tensorflow/tensorboard/components/vz_line_chart_d3v4/index.html
similarity index 99%
rename from tensorflow/tensorboard/components/vz_line_chart_d3v4/demo.html
rename to tensorflow/tensorboard/components/vz_line_chart_d3v4/index.html
index 4293e9791e01b58275fb0c7debaade7ccf41f865..fb571a518370c343156ea158657ec0b68bfe1da2 100644
--- a/tensorflow/tensorboard/components/vz_line_chart_d3v4/demo.html
+++ b/tensorflow/tensorboard/components/vz_line_chart_d3v4/index.html
@@ -21,10 +21,10 @@ limitations under the License.
     <meta charset="utf-8">
     <meta name="viewport" content="width=device-width, initial-scale=1.0">
     <title>vz-line-chart demo</title>
+    <script src="../webcomponentsjs/webcomponents-lite.min.js"></script>
     <link rel="import" href="vz-line-chart.html">
-    <link rel="import" href="iron-demo-helpers/demo-snippet.html">
-    <link rel="import" href="paper-styles/typography.html">
-    <script src="bundle.js"></script>
+    <link rel="import" href="../iron-demo-helpers/demo-snippet.html">
+    <link rel="import" href="../paper-styles/typography.html">
     <style type="text/css">
       body {
         font-family: "Roboto";
diff --git a/tensorflow/tensorboard/components/vz_line_chart_d3v4/vz-line-chart.html b/tensorflow/tensorboard/components/vz_line_chart_d3v4/vz-line-chart.html
index 13b46d8bbf60e1941fb6e95db1620e7eeca58cdf..85e24ae4be0320330ec1567d8522c0d825bddcc7 100644
--- a/tensorflow/tensorboard/components/vz_line_chart_d3v4/vz-line-chart.html
+++ b/tensorflow/tensorboard/components/vz_line_chart_d3v4/vz-line-chart.html
@@ -15,8 +15,11 @@ See the License for the specific language governing permissions and
 limitations under the License.
 -->
 
-<link rel="import" href="polymer/polymer.html">
-<link rel="import" href="plottable-library/plottable_css.html">
+<link rel="import" href="../polymer/polymer.html">
+<link rel="import" href="../tf-imports/d3.html">
+<link rel="import" href="../tf-imports/lodash.html">
+<link rel="import" href="../tf-imports/plottable.html">
+
 <!--
 vz-line-chart creates an element that draws a line chart for
 displaying event values.
@@ -122,4 +125,5 @@ such as different X scales (linear and temporal), tooltips and smoothing.
 
     </style>
   </template>
+  <script src="bundle.js"></script>
 </dom-module>
diff --git a/tensorflow/tensorboard/components/vz_projector/BUILD b/tensorflow/tensorboard/components/vz_projector/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..0ea94b9a22aa82458ba619609436d807f883bf26
--- /dev/null
+++ b/tensorflow/tensorboard/components/vz_projector/BUILD
@@ -0,0 +1,180 @@
+package(default_visibility = [":projector_group"])
+
+load(
+    "//tensorflow/tensorboard:defs.bzl",
+    "tensorboard_karma_web_test_suite",
+    "tensorboard_ts_config",
+    "tensorboard_ts_declaration",
+    "tensorboard_ts_development_sources",
+    "tensorboard_ts_library",
+    "tensorboard_webcomponent_library",
+)
+
+licenses(["notice"])  # Apache 2.0
+
+filegroup(
+    name = "all_files",
+    srcs = glob(
+        ["**/*"],
+        exclude = [
+            "**/METADATA",
+            "**/OWNERS",
+        ],
+    ),
+    visibility = ["//tensorflow:__subpackages__"],
+)
+
+# Visibility group for all clients of projector.
+package_group(
+    name = "projector_group",
+    packages = [
+        "//apps/labs/towerbridge/...",
+        "//experimental/bigpicture/projector/...",
+        "//java/com/google/apps/labs/towerbridge/...",
+        "//learning/vis/projector/...",
+        "//tensorflow/tensorboard/...",
+    ],
+)
+
+tensorboard_ts_declaration(
+    name = "external",
+    srcs = ["external.d.ts"],
+)
+
+tensorboard_ts_library(
+    name = "ts_lib",
+    srcs = glob(
+        ["*.ts"],
+        exclude = [
+            "*.d.ts",
+            "*_test.ts",
+            "bh_tsne.ts",
+            "sptree.ts",
+        ],
+    ),
+    runtime_deps = [
+        "//third_party/javascript/d3/v3:d3",
+        "//third_party/javascript/numericjs",
+        "//third_party/javascript/threejs/r77:threejs",
+        "//third_party/javascript/threejs/r77/examples/js/controls:orbitcontrols",
+        "//third_party/javascript/weblas",
+    ],
+    deps = [
+        ":external",
+        ":tsne_ts_lib",
+        "//third_party/javascript/node_modules/typescript:es2015.promise",
+        "//third_party/javascript/typings/d3",
+        "//third_party/javascript/typings/polymer:polymer_without_externs",
+        "//third_party/javascript/typings/threejs:three",
+        "//third_party/javascript/typings/webcomponents_js",
+    ],
+)
+
+tensorboard_ts_library(
+    name = "tsne_ts_lib",
+    srcs = [
+        "bh_tsne.ts",
+        "sptree.ts",
+    ],
+)
+
+_PROJECTOR_LIB_DEPS = [
+    "//third_party/javascript/polymer/v1/iron-collapse:lib",
+    "//third_party/javascript/polymer/v1/iron-icons:lib",
+    "//third_party/javascript/polymer/v1/paper-button:lib",
+    "//third_party/javascript/polymer/v1/paper-checkbox:lib",
+    "//third_party/javascript/polymer/v1/paper-dialog:lib",
+    "//third_party/javascript/polymer/v1/paper-dialog-scrollable:lib",
+    "//third_party/javascript/polymer/v1/paper-dropdown-menu:lib",
+    "//third_party/javascript/polymer/v1/paper-icon-button:lib",
+    "//third_party/javascript/polymer/v1/paper-input:lib",
+    "//third_party/javascript/polymer/v1/paper-item:lib",
+    "//third_party/javascript/polymer/v1/paper-listbox:lib",
+    "//third_party/javascript/polymer/v1/paper-slider:lib",
+    "//third_party/javascript/polymer/v1/paper-spinner:lib",
+    "//third_party/javascript/polymer/v1/paper-toast:lib",
+    "//third_party/javascript/polymer/v1/paper-toggle-button:lib",
+    "//third_party/javascript/polymer/v1/paper-tooltip:lib",
+    "//third_party/javascript/polymer/v1/polymer:lib",
+]
+
+_PROJECTOR_DESTDIR = "vz-projector"
+
+_PROJECTOR_LIB_TS_LIB_DEPS = [
+    ":ts_lib",
+    ":tsne_ts_lib",
+]
+
+# Standalone embedding projector demos should depend on this target. We
+# exclude the HTML file for the dashboard itself. Demos do not need that
+# HTML file. This was introduced because standalone demos as of today
+# have an additional Closure pass that uses a compilation configuration
+# stricter than that of TensorBoard.
+tensorboard_webcomponent_library(
+    name = "lib",
+    srcs = glob(
+        ["*.html"],
+        exclude = ["vz-projector-dashboard.html"],
+    ),
+    ts_lib_deps = _PROJECTOR_LIB_TS_LIB_DEPS,
+    destdir = _PROJECTOR_DESTDIR,
+    deps = _PROJECTOR_LIB_DEPS,
+)
+
+# TensorBoard, however, should depend on this target, which includes
+# the HTML file for the dashboard.
+tensorboard_webcomponent_library(
+    name = "lib_for_tensorboard",
+    srcs = glob(["*.html"]),
+    ts_lib_deps = _PROJECTOR_LIB_TS_LIB_DEPS,
+    destdir = _PROJECTOR_DESTDIR,
+    deps = _PROJECTOR_LIB_DEPS,
+)
+
+### Tests ###
+
+tensorboard_ts_library(
+    name = "ts_test",
+    testonly = 1,
+    srcs = glob(["*_test.ts"]),
+    runtime_deps = [
+        "//third_party/javascript/polymer/v1/polymer:lib_all_js",
+    ],
+    deps = [
+        ":ts_lib",
+        ":tsne_ts_lib",
+        "//third_party/javascript/typings/chai",
+        "//third_party/javascript/typings/jasmine:jasmine_without_externs",
+        "//third_party/javascript/typings/mocha",
+    ],
+)
+
+tensorboard_ts_development_sources(
+    name = "dev_sources_for_test",
+    testonly = 1,
+    runtime_deps = [
+        "//third_party/javascript/chai",
+        "//third_party/javascript/mocha",
+    ],
+    deps = [
+        ":ts_test",
+    ],
+)
+
+# To run locally, run :all_tests_local
+tensorboard_karma_web_test_suite(
+    name = "all_tests",
+    size = "medium",
+    browsers = ["//testing/web/browsers:chrome-linux"],
+    manifest = ":dev_sources_for_test",
+)
+
+# Generate a TypeScript IDE project by running this target.
+tensorboard_ts_config(
+    name = "tsconfig",
+    deps = [
+        ":ts_lib",
+        ":ts_test",
+        ":tsne_ts_lib",
+    ],
+)
diff --git a/tensorflow/tensorboard/components/vz_projector/BUILD.OPENSOURCE b/tensorflow/tensorboard/components/vz_projector/BUILD.OPENSOURCE
deleted file mode 100644
index 8c222be10e919a047216dc906aeec59a0ef2973a..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/components/vz_projector/BUILD.OPENSOURCE
+++ /dev/null
@@ -1,19 +0,0 @@
-# Description:
-# Package for the Embedding Projector component.
-package(default_visibility = ["//tensorflow:internal"])
-
-licenses(["notice"])  # Apache 2.0
-
-exports_files(["LICENSE"])
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/tensorboard/components/vz_projector/vz-projector-dashboard.html b/tensorflow/tensorboard/components/vz_projector/vz-projector-dashboard.html
index 3857113ac0471b63cdaf31662a56228515e7501a..55c15da5ed73360b486cd65be3a05cdde68e91c5 100644
--- a/tensorflow/tensorboard/components/vz_projector/vz-projector-dashboard.html
+++ b/tensorflow/tensorboard/components/vz_projector/vz-projector-dashboard.html
@@ -37,6 +37,8 @@ limitations under the License.
   </template>
 </template>
 <script>
+"use strict";
+
 (function() {
 TF.Dashboard.VzProjectorDashboard = Polymer({
   is: 'vz-projector-dashboard',
diff --git a/tensorflow/tensorboard/components/vz_projector_d3v4/BUILD b/tensorflow/tensorboard/components/vz_projector_d3v4/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..b388999a531b8678e926e464d558c078226efa37
--- /dev/null
+++ b/tensorflow/tensorboard/components/vz_projector_d3v4/BUILD
@@ -0,0 +1,354 @@
+package(default_visibility = ["//tensorflow:internal"])
+
+load("@io_bazel_rules_closure//closure:defs.bzl", "web_library")
+load("//tensorflow/tensorboard:hacks.bzl", "tensorboard_typescript_bundle")
+load("//tensorflow/tensorboard:defs.bzl", "tensorboard_typescript_genrule")
+
+licenses(["notice"])  # Apache 2.0
+
+web_library(
+    name = "vz_projector_d3v4",
+    srcs = [
+        "bundle.html",
+        "bundle.js",
+        "styles.html",
+        "vz-projector.html",
+        "vz-projector-app.html",
+        "vz-projector-bookmark-panel.html",
+        "vz-projector-colab.html",
+        "vz-projector-dashboard.html",
+        "vz-projector-data-panel.html",
+        "vz-projector-input.html",
+        "vz-projector-inspector-panel.html",
+        "vz-projector-legend.html",
+        "vz-projector-metadata-card.html",
+        "vz-projector-projections-panel.html",
+    ],
+    path = "/vz-projector",
+    visibility = ["//visibility:public"],
+    deps = [
+        "//tensorflow/tensorboard/components/tf_dashboard_common_d3v4",
+        "//tensorflow/tensorboard/components/tf_imports_d3v4:d3",
+        "//tensorflow/tensorboard/components/tf_imports_d3v4:numericjs",
+        "//tensorflow/tensorboard/components/tf_imports_d3v4:threejs",
+        "//tensorflow/tensorboard/components/tf_imports_d3v4:weblas",
+        "@org_polymer",
+        "@org_polymer_iron_collapse",
+        "@org_polymer_iron_icons",
+        "@org_polymer_paper_button",
+        "@org_polymer_paper_checkbox",
+        "@org_polymer_paper_dialog",
+        "@org_polymer_paper_dialog_scrollable",
+        "@org_polymer_paper_dropdown_menu",
+        "@org_polymer_paper_icon_button",
+        "@org_polymer_paper_input",
+        "@org_polymer_paper_item",
+        "@org_polymer_paper_listbox",
+        "@org_polymer_paper_slider",
+        "@org_polymer_paper_spinner",
+        "@org_polymer_paper_styles",
+        "@org_polymer_paper_toast",
+        "@org_polymer_paper_toggle_button",
+        "@org_polymer_paper_tooltip",
+    ],
+)
+
+tensorboard_typescript_genrule(
+    name = "ts",
+    srcs = ["bundle.ts"],
+    typings = [
+        "external.d.ts",
+        "@org_definitelytyped//:polymer.d.ts",
+        "@org_definitelytyped//:three.d.ts",
+        "@org_definitelytyped//:webcomponents.js.d.ts",
+        "//tensorflow/tensorboard/components/tf_imports_d3v4:d3.d.ts",
+    ],
+)
+
+tensorboard_typescript_bundle(
+    name = "bundle",
+    out = "bundle.ts",
+    namespace_srcs = {
+        "VZ.Projector.Heap": ["heap.ts"],
+        "VZ.Projector.Label": ["label.ts"],
+        "VZ.Projector.SPTree": ["sptree.ts"],
+        "VZ.Projector.BhTsne": ["bh_tsne.ts"],
+        "VZ.Projector.Logging": ["logging.ts"],
+        "VZ.Projector.RenderContext": ["renderContext.ts"],
+        "VZ.Projector.ScatterPlotRectangleSelector": ["scatterPlotRectangleSelector.ts"],
+        "VZ.Projector.AnalyticsLogger": ["analyticsLogger.ts"],
+        "VZ.Projector.Util": ["util.ts"],
+        "VZ.Projector.Data": ["data.ts"],
+        "VZ.Projector.DataProvider": ["data-provider.ts"],
+        "VZ.Projector.DataProviderDemo": ["data-provider-demo.ts"],
+        "VZ.Projector.DataProviderProto": ["data-provider-proto.ts"],
+        "VZ.Projector.DataProviderServer": ["data-provider-server.ts"],
+        "VZ.Projector.Knn": ["knn.ts"],
+        "VZ.Projector.ProjectorEventContext": ["projectorEventContext.ts"],
+        "VZ.Projector.ScatterPlot": ["scatterPlot.ts"],
+        "VZ.Projector.ScatterPlotVisualizer3DLabels": ["scatterPlotVisualizer3DLabels.ts"],
+        "VZ.Projector.ScatterPlotVisualizerCanvasLabels": ["scatterPlotVisualizerCanvasLabels.ts"],
+        "VZ.Projector.ScatterPlotVisualizerPolylines": ["scatterPlotVisualizerPolylines.ts"],
+        "VZ.Projector.ScatterPlotVisualizerSprites": ["scatterPlotVisualizerSprites.ts"],
+        "VZ.Projector.ScatterPlotVisualizer": ["scatterPlotVisualizer.ts"],
+        "VZ.Projector.ProjectorScatterPlotAdapter": ["projectorScatterPlotAdapter.ts"],
+        "VZ.Projector.Vector": ["vector.ts"],
+        "VZ.Projector.ProjectorUtil": ["vz-projector-util.ts"],
+        "VZ.Projector.ProjectorBookmarkPanel": ["vz-projector-bookmark-panel.ts"],
+        "VZ.Projector.ProjectorDataPanel": ["vz-projector-data-panel.ts"],
+        "VZ.Projector.ProjectorInput": ["vz-projector-input.ts"],
+        "VZ.Projector.ProjectorInspectorPanel": ["vz-projector-inspector-panel.ts"],
+        "VZ.Projector.ProjectorLegend": ["vz-projector-legend.ts"],
+        "VZ.Projector.ProjectorMetadataCard": ["vz-projector-metadata-card.ts"],
+        "VZ.Projector.ProjectorProjectionsPanel": ["vz-projector-projections-panel.ts"],
+        "VZ.Projector": ["vz-projector.ts"],
+    },
+    namespace_symbol_aliases = {
+        "VZ.Projector.AnalyticsLogger": {
+            "ProjectionType": "VZ.Projector.Data.ProjectionType",
+        },
+        "VZ.Projector.BhTsne": {
+            "SPNode": "VZ.Projector.SPTree.SPNode",
+            "SPTree": "VZ.Projector.SPTree.SPTree",
+        },
+        "VZ.Projector.DataProviderDemo": {
+            "DataProvider": "VZ.Projector.DataProvider.DataProvider",
+            "DataSet": "VZ.Projector.Data.DataSet",
+            "EmbeddingInfo": "VZ.Projector.DataProvider.EmbeddingInfo",
+            "ProjectorConfig": "VZ.Projector.DataProvider.ProjectorConfig",
+            "SpriteAndMetadataInfo": "VZ.Projector.Data.SpriteAndMetadataInfo",
+            "State": "VZ.Projector.Data.State",
+            "TENSORS_MSG_ID": "VZ.Projector.DataProvider.TENSORS_MSG_ID",
+            "dataProvider": "VZ.Projector.DataProvider",
+            "logging": "VZ.Projector.Logging",
+        },
+        "VZ.Projector.DataProviderProto": {
+            "DataPoint": "VZ.Projector.Data.DataPoint",
+            "DataProto": "VZ.Projector.Data.DataProto",
+            "DataProvider": "VZ.Projector.DataProvider.DataProvider",
+            "DataSet": "VZ.Projector.Data.DataSet",
+            "PointMetadata": "VZ.Projector.Data.PointMetadata",
+            "ProjectorConfig": "VZ.Projector.DataProvider.ProjectorConfig",
+            "SpriteAndMetadataInfo": "VZ.Projector.Data.SpriteAndMetadataInfo",
+            "State": "VZ.Projector.Data.State",
+            "analyzeMetadata": "VZ.Projector.DataProvider.analyzeMetadata",
+        },
+        "VZ.Projector.DataProviderServer": {
+            "DataProvider": "VZ.Projector.DataProvider.DataProvider",
+            "DataSet": "VZ.Projector.Data.DataSet",
+            "EmbeddingInfo": "VZ.Projector.DataProvider.EmbeddingInfo",
+            "ProjectorConfig": "VZ.Projector.DataProvider.ProjectorConfig",
+            "SpriteAndMetadataInfo": "VZ.Projector.Data.SpriteAndMetadataInfo",
+            "State": "VZ.Projector.Data.State",
+            "dataProvider": "VZ.Projector.DataProvider",
+            "logging": "VZ.Projector.Logging",
+        },
+        "VZ.Projector.DataProvider": {
+            "ColumnStats": "VZ.Projector.Data.ColumnStats",
+            "DataPoint": "VZ.Projector.Data.DataPoint",
+            "DataSet": "VZ.Projector.Data.DataSet",
+            "PointMetadata": "VZ.Projector.Data.PointMetadata",
+            "SpriteAndMetadataInfo": "VZ.Projector.Data.SpriteAndMetadataInfo",
+            "State": "VZ.Projector.Data.State",
+            "logging": "VZ.Projector.Logging",
+            "runAsyncTask": "VZ.Projector.Util.runAsyncTask",
+        },
+        "VZ.Projector.Data": {
+            "SpriteMetadata": "VZ.Projector.DataProvider.SpriteMetadata",
+            "TSNE": "VZ.Projector.BhTsne.TSNE",
+            "knn": "VZ.Projector.Knn",
+            "logging": "VZ.Projector.Logging",
+            "scatterPlot": "VZ.Projector.ScatterPlot",
+            "util": "VZ.Projector.Util",
+            "vector": "VZ.Projector.Vector",
+        },
+        "VZ.Projector.Knn": {
+            "KMin": "VZ.Projector.Heap.KMin",
+            "Vector": "VZ.Projector.Vector.Vector",
+            "logging": "VZ.Projector.Logging",
+            "runAsyncTask": "VZ.Projector.Util.runAsyncTask",
+            "vector": "VZ.Projector.Vector",
+        },
+        "VZ.Projector.ProjectorEventContext": {
+            "DistanceFunction": "VZ.Projector.Data.DistanceFunction",
+            "NearestEntry": "VZ.Projector.Knn.NearestEntry",
+            "Projection": "VZ.Projector.Data.Projection",
+        },
+        "VZ.Projector.ProjectorScatterPlotAdapter": {
+            "DataSet": "VZ.Projector.Data.DataSet",
+            "DistanceFunction": "VZ.Projector.Data.DistanceFunction",
+            "LabelRenderParams": "VZ.Projector.RenderContext.LabelRenderParams",
+            "NearestEntry": "VZ.Projector.Knn.NearestEntry",
+            "Projection": "VZ.Projector.Data.Projection",
+            "ProjectionComponents3D": "VZ.Projector.Data.ProjectionComponents3D",
+            "ProjectorEventContext": "VZ.Projector.ProjectorEventContext.ProjectorEventContext",
+            "ScatterPlot": "VZ.Projector.ScatterPlot.ScatterPlot",
+            "ScatterPlotVisualizer3DLabels": "VZ.Projector.ScatterPlotVisualizer3DLabels.ScatterPlotVisualizer3DLabels",
+            "ScatterPlotVisualizerCanvasLabels": "VZ.Projector.ScatterPlotVisualizerCanvasLabels.ScatterPlotVisualizerCanvasLabels",
+            "ScatterPlotVisualizerPolylines": "VZ.Projector.ScatterPlotVisualizerPolylines.ScatterPlotVisualizerPolylines",
+            "ScatterPlotVisualizerSprites": "VZ.Projector.ScatterPlotVisualizerSprites.ScatterPlotVisualizerSprites",
+            "State": "VZ.Projector.Data.State",
+            "vector": "VZ.Projector.Vector",
+        },
+        "VZ.Projector.ScatterPlot": {
+            "BoundingBox": "VZ.Projector.ScatterPlotRectangleSelector.BoundingBox",
+            "CameraType": "VZ.Projector.RenderContext.CameraType",
+            "LabelRenderParams": "VZ.Projector.RenderContext.LabelRenderParams",
+            "Point2D": "VZ.Projector.Vector.Point2D",
+            "Point3D": "VZ.Projector.Vector.Point3D",
+            "ProjectorEventContext": "VZ.Projector.ProjectorEventContext.ProjectorEventContext",
+            "RenderContext": "VZ.Projector.RenderContext.RenderContext",
+            "ScatterPlotRectangleSelector": "VZ.Projector.ScatterPlotRectangleSelector.ScatterPlotRectangleSelector",
+            "ScatterPlotVisualizer": "VZ.Projector.ScatterPlotVisualizer.ScatterPlotVisualizer",
+            "util": "VZ.Projector.Util",
+        },
+        "VZ.Projector.ScatterPlotVisualizer3DLabels": {
+            "RenderContext": "VZ.Projector.RenderContext.RenderContext",
+            "ScatterPlotVisualizer": "VZ.Projector.ScatterPlotVisualizer.ScatterPlotVisualizer",
+            "util": "VZ.Projector.Util",
+        },
+        "VZ.Projector.ScatterPlotVisualizerCanvasLabels": {
+            "BoundingBox": "VZ.Projector.Label.BoundingBox",
+            "CameraType": "VZ.Projector.RenderContext.CameraType",
+            "CollisionGrid": "VZ.Projector.Label.CollisionGrid",
+            "RenderContext": "VZ.Projector.RenderContext.RenderContext",
+            "ScatterPlotVisualizer": "VZ.Projector.ScatterPlotVisualizer.ScatterPlotVisualizer",
+            "util": "VZ.Projector.Util",
+        },
+        "VZ.Projector.ScatterPlotVisualizerPolylines": {
+            "DataSet": "VZ.Projector.Data.DataSet",
+            "RenderContext": "VZ.Projector.RenderContext.RenderContext",
+            "ScatterPlotVisualizer": "VZ.Projector.ScatterPlotVisualizer.ScatterPlotVisualizer",
+            "util": "VZ.Projector.Util",
+        },
+        "VZ.Projector.ScatterPlotVisualizerSprites": {
+            "CameraType": "VZ.Projector.RenderContext.CameraType",
+            "RenderContext": "VZ.Projector.RenderContext.RenderContext",
+            "ScatterPlotVisualizer": "VZ.Projector.ScatterPlotVisualizer.ScatterPlotVisualizer",
+            "util": "VZ.Projector.Util",
+        },
+        "VZ.Projector.ScatterPlotVisualizer": {
+            "RenderContext": "VZ.Projector.RenderContext.RenderContext",
+        },
+        "VZ.Projector.Util": {
+            "DataPoint": "VZ.Projector.Data.DataPoint",
+            "Point2D": "VZ.Projector.Vector.Point2D",
+            "logging": "VZ.Projector.Logging",
+        },
+        "VZ.Projector.Vector": {
+            "assert": "VZ.Projector.Util.assert",
+        },
+        "VZ.Projector.ProjectorBookmarkPanel": {
+            "DataProvider": "VZ.Projector.DataProvider.DataProvider",
+            "EmbeddingInfo": "VZ.Projector.DataProvider.EmbeddingInfo",
+            "PolymerElement": "VZ.Projector.ProjectorUtil.PolymerElement",
+            "PolymerHTMLElement": "VZ.Projector.ProjectorUtil.PolymerHTMLElement",
+            "Projector": "VZ.Projector.Projector",
+            "ProjectorEventContext": "VZ.Projector.ProjectorEventContext.ProjectorEventContext",
+            "State": "VZ.Projector.Data.State",
+            "logging": "VZ.Projector.Logging",
+        },
+        "VZ.Projector.ProjectorDataPanel": {
+            "ColorLegendRenderInfo": "VZ.Projector.ProjectorLegend.ColorLegendRenderInfo",
+            "ColorLegendThreshold": "VZ.Projector.ProjectorLegend.ColorLegendThreshold",
+            "ColorOption": "VZ.Projector.Data.ColorOption",
+            "ColumnStats": "VZ.Projector.Data.ColumnStats",
+            "DataProvider": "VZ.Projector.DataProvider.DataProvider",
+            "EmbeddingInfo": "VZ.Projector.DataProvider.EmbeddingInfo",
+            "PolymerElement": "VZ.Projector.ProjectorUtil.PolymerElement",
+            "PolymerHTMLElement": "VZ.Projector.ProjectorUtil.PolymerHTMLElement",
+            "Projector": "VZ.Projector.Projector",
+            "ProjectorConfig": "VZ.Projector.DataProvider.ProjectorConfig",
+            "SpriteAndMetadataInfo": "VZ.Projector.Data.SpriteAndMetadataInfo",
+            "parseRawMetadata": "VZ.Projector.DataProvider.parseRawMetadata",
+            "parseRawTensors": "VZ.Projector.DataProvider.parseRawTensors",
+            "util": "VZ.Projector.Util",
+        },
+        "VZ.Projector.ProjectorInput": {
+            "PolymerElement": "VZ.Projector.ProjectorUtil.PolymerElement",
+            "PolymerHTMLElement": "VZ.Projector.ProjectorUtil.PolymerHTMLElement",
+        },
+        "VZ.Projector.ProjectorInspectorPanel": {
+            "DistanceFunction": "VZ.Projector.Data.DistanceFunction",
+            "PolymerElement": "VZ.Projector.ProjectorUtil.PolymerElement",
+            "PolymerHTMLElement": "VZ.Projector.ProjectorUtil.PolymerHTMLElement",
+            "Projector": "VZ.Projector.Projector",
+            "ProjectorEventContext": "VZ.Projector.ProjectorEventContext.ProjectorEventContext",
+            "ProjectorInput": "VZ.Projector.ProjectorInput.ProjectorInput",
+            "SpriteAndMetadataInfo": "VZ.Projector.Data.SpriteAndMetadataInfo",
+            "State": "VZ.Projector.Data.State",
+            "adapter": "VZ.Projector.ProjectorScatterPlotAdapter",
+            "knn": "VZ.Projector.Knn",
+            "util": "VZ.Projector.Util",
+            "vector": "VZ.Projector.Vector",
+        },
+        "VZ.Projector.ProjectorLegend": {
+            "PolymerElement": "VZ.Projector.ProjectorUtil.PolymerElement",
+            "PolymerHTMLElement": "VZ.Projector.ProjectorUtil.PolymerHTMLElement",
+        },
+        "VZ.Projector.ProjectorMetadataCard": {
+            "PointMetadata": "VZ.Projector.Data.PointMetadata",
+            "PolymerElement": "VZ.Projector.ProjectorUtil.PolymerElement",
+            "PolymerHTMLElement": "VZ.Projector.ProjectorUtil.PolymerHTMLElement",
+        },
+        "VZ.Projector.ProjectorProjectionsPanel": {
+            "DataSet": "VZ.Projector.Data.DataSet",
+            "PolymerElement": "VZ.Projector.ProjectorUtil.PolymerElement",
+            "PolymerHTMLElement": "VZ.Projector.ProjectorUtil.PolymerHTMLElement",
+            "Projection": "VZ.Projector.Data.Projection",
+            "ProjectionType": "VZ.Projector.Data.ProjectionType",
+            "Projector": "VZ.Projector.Projector",
+            "ProjectorInput": "VZ.Projector.ProjectorInput.ProjectorInput",
+            "SpriteAndMetadataInfo": "VZ.Projector.Data.SpriteAndMetadataInfo",
+            "State": "VZ.Projector.Data.State",
+            "Vector": "VZ.Projector.Vector.Vector",
+            "data": "VZ.Projector.Data",
+            "util": "VZ.Projector.Util",
+            "vector": "VZ.Projector.Vector",
+        },
+        "VZ.Projector": {
+            "AnalyticsLogger": "VZ.Projector.AnalyticsLogger.AnalyticsLogger",
+            "BookmarkPanel": "VZ.Projector.ProjectorBookmarkPanel.BookmarkPanel",
+            "ColorOption": "VZ.Projector.Data.ColorOption",
+            "ColumnStats": "VZ.Projector.Data.ColumnStats",
+            "DataPanel": "VZ.Projector.ProjectorDataPanel.DataPanel",
+            "DataPoint": "VZ.Projector.Data.DataPoint",
+            "DataProto": "VZ.Projector.Data.DataProto",
+            "DataProvider": "VZ.Projector.DataProvider.DataProvider",
+            "DataSet": "VZ.Projector.Data.DataSet",
+            "DemoDataProvider": "VZ.Projector.DataProviderDemo.DemoDataProvider",
+            "DistanceFunction": "VZ.Projector.Data.DistanceFunction",
+            "DistanceMetricChangedListener": "VZ.Projector.ProjectorEventContext.DistanceMetricChangedListener",
+            "EmbeddingInfo": "VZ.Projector.DataProvider.EmbeddingInfo",
+            "HoverListener": "VZ.Projector.ProjectorEventContext.HoverListener",
+            "InspectorPanel": "VZ.Projector.ProjectorInspectorPanel.InspectorPanel",
+            "MetadataCard": "VZ.Projector.ProjectorMetadataCard.MetadataCard",
+            "MouseMode": "VZ.Projector.ScatterPlot.MouseMode",
+            "PointMetadata": "VZ.Projector.Data.PointMetadata",
+            "PolymerElement": "VZ.Projector.ProjectorUtil.PolymerElement",
+            "PolymerHTMLElement": "VZ.Projector.ProjectorUtil.PolymerHTMLElement",
+            "Projection": "VZ.Projector.Data.Projection",
+            "ProjectionChangedListener": "VZ.Projector.ProjectorEventContext.ProjectionChangedListener",
+            "ProjectionsPanel": "VZ.Projector.ProjectorProjectionsPanel.ProjectionsPanel",
+            "ProjectorEventContext": "VZ.Projector.ProjectorEventContext.ProjectorEventContext",
+            "ProjectorScatterPlotAdapter": "VZ.Projector.ProjectorScatterPlotAdapter.ProjectorScatterPlotAdapter",
+            "ProtoDataProvider": "VZ.Projector.DataProviderProto.ProtoDataProvider",
+            "SelectionChangedListener": "VZ.Projector.ProjectorEventContext.SelectionChangedListener",
+            "ServerDataProvider": "VZ.Projector.DataProviderServer.ServerDataProvider",
+            "ServingMode": "VZ.Projector.DataProvider.ServingMode",
+            "SpriteAndMetadataInfo": "VZ.Projector.Data.SpriteAndMetadataInfo",
+            "State": "VZ.Projector.Data.State",
+            "data": "VZ.Projector.Data",
+            "knn": "VZ.Projector.Knn",
+            "logging": "VZ.Projector.Logging",
+            "stateGetAccessorDimensions": "VZ.Projector.Data.stateGetAccessorDimensions",
+            "util": "VZ.Projector.Util",
+        },
+    },
+)
+
+filegroup(
+    name = "all_files",
+    srcs = glob(["**"]),
+    tags = ["notsan"],
+)
diff --git a/tensorflow/tensorboard/components/vz_projector_d3v4/bh_tsne.ts b/tensorflow/tensorboard/components/vz_projector_d3v4/bh_tsne.ts
index 9d2df65f56088b0f6a9a08d36f37f51cd96ac99b..063d57ec401d196827ce978dc64d4121a9c5edb3 100644
--- a/tensorflow/tensorboard/components/vz_projector_d3v4/bh_tsne.ts
+++ b/tensorflow/tensorboard/components/vz_projector_d3v4/bh_tsne.ts
@@ -22,6 +22,7 @@ limitations under the License.
  */
 
 /**
+ * @license
  * The MIT License (MIT)
  * Copyright (c) 2015 Andrej Karpathy
  * Permission is hereby granted, free of charge, to any person obtaining a copy
diff --git a/tensorflow/tensorboard/components/vz_projector_d3v4/bundle.html b/tensorflow/tensorboard/components/vz_projector_d3v4/bundle.html
new file mode 100644
index 0000000000000000000000000000000000000000..2837fed870832c20310945914f41f6c5047f5f5f
--- /dev/null
+++ b/tensorflow/tensorboard/components/vz_projector_d3v4/bundle.html
@@ -0,0 +1,24 @@
+<!--
+@license
+Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+-->
+
+<link rel="import" href="../polymer/polymer.html">
+<link rel="import" href="../tf-imports/d3.html">
+<link rel="import" href="../tf-imports/numericjs.html">
+<link rel="import" href="../tf-imports/threejs.html">
+<link rel="import" href="../tf-imports/weblas.html">
+
+<script src="bundle.js"></script>
diff --git a/tensorflow/tensorboard/components/vz_projector_d3v4/projectorScatterPlotAdapter.ts b/tensorflow/tensorboard/components/vz_projector_d3v4/projectorScatterPlotAdapter.ts
index bb09e2b153aa7069987f0f48df3d7e7cb70327c1..9d6df953d65c57b60c88c3b744342123b61fe5bf 100644
--- a/tensorflow/tensorboard/components/vz_projector_d3v4/projectorScatterPlotAdapter.ts
+++ b/tensorflow/tensorboard/components/vz_projector_d3v4/projectorScatterPlotAdapter.ts
@@ -378,8 +378,8 @@ export class ProjectorScatterPlotAdapter {
     }
 
     return new LabelRenderParams(
-        visibleLabels, labelStrings, scale, opacityFlags, LABEL_FONT_SIZE,
-        fillColors, strokeColors);
+        new Float32Array(visibleLabels), labelStrings, scale, opacityFlags,
+        LABEL_FONT_SIZE, fillColors, strokeColors);
   }
 
   generatePointScaleFactorArray(
diff --git a/tensorflow/tensorboard/components/vz_projector_d3v4/scatterPlotVisualizerSprites.ts b/tensorflow/tensorboard/components/vz_projector_d3v4/scatterPlotVisualizerSprites.ts
index 8adc9a9bd234cd905df8f9b26b3ea9a419b72097..be9c1703c727f11381d7836de86dad1c1c294cc0 100644
--- a/tensorflow/tensorboard/components/vz_projector_d3v4/scatterPlotVisualizerSprites.ts
+++ b/tensorflow/tensorboard/components/vz_projector_d3v4/scatterPlotVisualizerSprites.ts
@@ -330,7 +330,7 @@ export class ScatterPlotVisualizerSprites implements ScatterPlotVisualizer {
 
   setSpriteAtlas(
       spriteImage: HTMLImageElement, spriteDimensions: [number, number],
-      spriteIndices: Uint8Array) {
+      spriteIndices: Float32Array) {
     this.disposeTextureAtlas();
     this.createTextureFromSpriteAtlas(
         spriteImage, spriteDimensions, spriteIndices);
diff --git a/tensorflow/tensorboard/components/vz_projector_d3v4/test/BUILD b/tensorflow/tensorboard/components/vz_projector_d3v4/test/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..e76c84e8f3372ae3d06ecec7413f64e8f3732cac
--- /dev/null
+++ b/tensorflow/tensorboard/components/vz_projector_d3v4/test/BUILD
@@ -0,0 +1,81 @@
+package(
+    default_testonly = True,
+    default_visibility = ["//tensorflow:internal"],
+)
+
+load("@io_bazel_rules_closure//closure:defs.bzl", "web_library")
+load("//tensorflow/tensorboard:hacks.bzl", "tensorboard_typescript_bundle")
+load("//tensorflow/tensorboard:defs.bzl", "tensorboard_typescript_genrule")
+
+licenses(["notice"])  # Apache 2.0
+
+web_library(
+    name = "test",
+    srcs = [
+        "bundle.js",
+        "tests.html",
+    ],
+    path = "/vz-projector/test",
+    deps = [
+        "//tensorflow/tensorboard/components/vz_projector_d3v4",
+        "@org_npmjs_registry_web_component_tester",
+        "@org_polymer",
+        "@org_polymer_webcomponentsjs",
+    ],
+)
+
+tensorboard_typescript_genrule(
+    name = "ts",
+    srcs = ["bundle.ts"],
+    typings = [
+        "@org_definitelytyped//:chai.d.ts",
+        "@org_definitelytyped//:mocha.d.ts",
+        "@org_definitelytyped//:polymer.d.ts",
+        "@org_definitelytyped//:three.d.ts",
+        "@org_definitelytyped//:webcomponents.js.d.ts",
+        "//tensorflow/tensorboard/components/tf_imports_d3v4:d3.d.ts",
+        "//tensorflow/tensorboard/components/tf_imports_d3v4:plottable.d.ts",
+        "//tensorflow/tensorboard/components/vz_projector_d3v4:bundle.d.ts",
+    ],
+)
+
+tensorboard_typescript_bundle(
+    name = "bundle",
+    out = "bundle.ts",
+    namespace_srcs = {
+        "VZ.Projector.Test": [
+            "assert.ts",
+            "sptree_test.ts",
+            "data_test.ts",
+            "data-provider_test.ts",
+            "util_test.ts",
+
+            # TODO(smilkov): Migrate these away from jasmine.
+            # "scatterPlotRectangleSelector_test.ts",
+            # "vz-projector-projections-panel_test.ts",
+        ],
+    },
+    namespace_symbol_aliases = {
+        "VZ.Projector.Test": {
+            "BoundingBox": "VZ.Projector.ScatterPlotRectangleSelector.BoundingBox",
+            "DataPoint": "VZ.Projector.Data.DataPoint",
+            "DataSet": "VZ.Projector.Data.DataSet",
+            "ProjectionsPanel": "VZ.Projector.ProjectorProjectionsPanel.ProjectionsPanel",
+            "SPTree": "VZ.Projector.SPTree.SPTree",
+            "ScatterPlotRectangleSelector": "VZ.Projector.ScatterPlotRectangleSelector.ScatterPlotRectangleSelector",
+            "SpriteAndMetadataInfo": "VZ.Projector.Data.SpriteAndMetadataInfo",
+            "State": "VZ.Projector.Data.State",
+            "State": "VZ.Projector.Data.State",
+            "data_provider": "VZ.Projector.DataProvider",
+            "stateGetAccessorDimensions": "VZ.Projector.Data.stateGetAccessorDimensions",
+            "util": "VZ.Projector.Util",
+        },
+    },
+)
+
+filegroup(
+    name = "all_files",
+    testonly = 0,
+    srcs = glob(["**"]),
+    tags = ["notsan"],
+)
diff --git a/tensorflow/tensorboard/components/vz_projector_d3v4/test/assert.ts b/tensorflow/tensorboard/components/vz_projector_d3v4/test/assert.ts
new file mode 100644
index 0000000000000000000000000000000000000000..f489517a7f23f36ecb91875638e464e3c7312926
--- /dev/null
+++ b/tensorflow/tensorboard/components/vz_projector_d3v4/test/assert.ts
@@ -0,0 +1,16 @@
+/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+const assert = chai.assert;
diff --git a/tensorflow/tensorboard/components/vz_projector_d3v4/data-provider_test.ts b/tensorflow/tensorboard/components/vz_projector_d3v4/test/data-provider_test.ts
similarity index 53%
rename from tensorflow/tensorboard/components/vz_projector_d3v4/data-provider_test.ts
rename to tensorflow/tensorboard/components/vz_projector_d3v4/test/data-provider_test.ts
index 01b89ca700169c763845a8c1bf41706c3d08c6bb..59a42ffbfd84d7a6731af504081b7f0c64d17592 100644
--- a/tensorflow/tensorboard/components/vz_projector_d3v4/data-provider_test.ts
+++ b/tensorflow/tensorboard/components/vz_projector_d3v4/test/data-provider_test.ts
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-import {DataPoint, SpriteAndMetadataInfo} from './data';
-import * as data_provider from './data-provider';
+import {DataPoint, SpriteAndMetadataInfo} from '../data';
+import * as data_provider from '../data-provider';
 
 /**
  * Converts a string to an ArrayBuffer.
@@ -48,15 +48,15 @@ describe('parse tensors', () => {
         .then((tensorsArrayBuffer: ArrayBuffer) => {
           data_provider.parseTensors(tensorsArrayBuffer)
               .then((data: DataPoint[]) => {
-                expect(data.length).toBe(2);
+                assert.equal(2, data.length);
 
-                expect(data[0].vector).toEqual(new Float32Array(tensors[0]));
-                expect(data[0].index).toEqual(0);
-                expect(data[0].projections).toBeNull();
+                assert.deepEqual(new Float32Array(tensors[0]), data[0].vector);
+                assert.equal(0, data[0].index);
+                assert.isNull(data[0].projections);
 
-                expect(data[1].vector).toEqual(new Float32Array(tensors[1]));
-                expect(data[1].index).toEqual(1);
-                expect(data[1].projections).toBeNull();
+                assert.deepEqual(new Float32Array(tensors[1]), data[1].vector);
+                assert.equal(1, data[1].index);
+                assert.isNull(data[1].projections);
                 doneFn();
               });
         });
@@ -68,27 +68,27 @@ describe('parse tensors', () => {
         .then((metadataArrayBuffer: ArrayBuffer) => {
           data_provider.parseMetadata(metadataArrayBuffer)
               .then((spriteAndMetadataInfo: SpriteAndMetadataInfo) => {
-                expect(spriteAndMetadataInfo.stats.length).toBe(2);
-                expect(spriteAndMetadataInfo.stats[0].name)
-                    .toBe(metadata[0][0]);
-                expect(spriteAndMetadataInfo.stats[0].isNumeric).toBe(false);
-                expect(spriteAndMetadataInfo.stats[0].tooManyUniqueValues)
-                    .toBe(false);
-                expect(spriteAndMetadataInfo.stats[1].name)
-                    .toBe(metadata[0][1]);
-                expect(spriteAndMetadataInfo.stats[1].isNumeric).toBe(true);
-                expect(spriteAndMetadataInfo.stats[1].tooManyUniqueValues)
-                    .toBe(false);
+                assert.equal(2, spriteAndMetadataInfo.stats.length);
+                assert.equal(metadata[0][0],
+                             spriteAndMetadataInfo.stats[0].name);
+                assert.isFalse(spriteAndMetadataInfo.stats[0].isNumeric);
+                assert.isFalse(
+                    spriteAndMetadataInfo.stats[0].tooManyUniqueValues);
+                assert.equal(metadata[0][1],
+                             spriteAndMetadataInfo.stats[1].name);
+                assert.isTrue(spriteAndMetadataInfo.stats[1].isNumeric);
+                assert.isFalse(
+                    spriteAndMetadataInfo.stats[1].tooManyUniqueValues);
 
-                expect(spriteAndMetadataInfo.pointsInfo.length).toBe(2);
-                expect(spriteAndMetadataInfo.pointsInfo[0]['label'])
-                    .toBe(metadata[1][0]);
-                expect(spriteAndMetadataInfo.pointsInfo[0]['fakecol'])
-                    .toBe(+metadata[1][1]);
-                expect(spriteAndMetadataInfo.pointsInfo[1]['label'])
-                    .toBe(metadata[2][0]);
-                expect(spriteAndMetadataInfo.pointsInfo[1]['fakecol'])
-                    .toBe(+metadata[2][1]);
+                assert.equal(2, spriteAndMetadataInfo.pointsInfo.length);
+                assert.equal(metadata[1][0],
+                             spriteAndMetadataInfo.pointsInfo[0]['label']);
+                assert.equal(+metadata[1][1],
+                             spriteAndMetadataInfo.pointsInfo[0]['fakecol']);
+                assert.equal(metadata[2][0],
+                             spriteAndMetadataInfo.pointsInfo[1]['label']);
+                assert.equal(+metadata[2][1],
+                             spriteAndMetadataInfo.pointsInfo[1]['fakecol']);
                 doneFn();
               });
         });
diff --git a/tensorflow/tensorboard/components/vz_projector_d3v4/data_test.ts b/tensorflow/tensorboard/components/vz_projector_d3v4/test/data_test.ts
similarity index 80%
rename from tensorflow/tensorboard/components/vz_projector_d3v4/data_test.ts
rename to tensorflow/tensorboard/components/vz_projector_d3v4/test/data_test.ts
index 924ae3a929f568efd69feb3af6bf104660a24969..5e47c091c5b5565ed084612b178201ee5ba19386 100644
--- a/tensorflow/tensorboard/components/vz_projector_d3v4/data_test.ts
+++ b/tensorflow/tensorboard/components/vz_projector_d3v4/test/data_test.ts
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-import {DataPoint, DataSet, State, stateGetAccessorDimensions} from './data';
+import {DataPoint, DataSet, State, stateGetAccessorDimensions} from '../data';
 
 /**
  * Helper method that makes a list of points given an array of
@@ -44,8 +44,8 @@ describe('constructor_with_sequences', () => {
     // one sequence 0->2->3.
     const points = makePointsWithSequences([2, -1, 3, -1]);
     let dataset = new DataSet(points);
-    expect(dataset.sequences.length).toEqual(1);
-    expect(dataset.sequences[0].pointIndices).toEqual([0, 2, 3]);
+    assert.equal(1, dataset.sequences.length);
+    assert.deepEqual([0, 2, 3], dataset.sequences[0].pointIndices);
   });
 
   it('Simple forward pointing sequences, __next__ metadata format', () => {
@@ -53,14 +53,14 @@ describe('constructor_with_sequences', () => {
     // one sequence 0->2->3.
     const points = makePointsWithSequences([2, -1, 3, -1], '__next__');
     let dataset = new DataSet(points);
-    expect(dataset.sequences.length).toEqual(1);
-    expect(dataset.sequences[0].pointIndices).toEqual([0, 2, 3]);
+    assert.equal(1, dataset.sequences.length);
+    assert.deepEqual([0, 2, 3], dataset.sequences[0].pointIndices);
   });
 
   it('No sequences', () => {
     let points = makePointsWithSequences([-1, -1, -1, -1]);
     let dataset = new DataSet(points);
-    expect(dataset.sequences.length).toEqual(0);
+    assert.equal(0, dataset.sequences.length);
   });
 
   it('A sequence that goes backwards and forward in the array', () => {
@@ -68,8 +68,8 @@ describe('constructor_with_sequences', () => {
     // one sequence 3->1->0->2.
     let points = makePointsWithSequences([2, 0, -1, 1]);
     let dataset = new DataSet(points);
-    expect(dataset.sequences.length).toEqual(1);
-    expect(dataset.sequences[0].pointIndices).toEqual([3, 1, 0, 2]);
+    assert.equal(1, dataset.sequences.length);
+    assert.deepEqual([3, 1, 0, 2], dataset.sequences[0].pointIndices);
   });
 });
 
@@ -78,27 +78,27 @@ describe('stateGetAccessorDimensions', () => {
     const state = new State();
     state.selectedProjection = 'tsne';
     state.tSNEis3d = false;
-    expect(stateGetAccessorDimensions(state)).toEqual([0, 1]);
+    assert.deepEqual([0, 1], stateGetAccessorDimensions(state));
   });
 
   it('returns [0, 1, 2] for 3d t-SNE', () => {
     const state = new State();
     state.selectedProjection = 'tsne';
     state.tSNEis3d = true;
-    expect(stateGetAccessorDimensions(state)).toEqual([0, 1, 2]);
+    assert.deepEqual([0, 1, 2], stateGetAccessorDimensions(state));
   });
 
   it('returns pca component dimensions array for pca', () => {
     const state = new State();
     state.selectedProjection = 'pca';
     state.pcaComponentDimensions = [13, 12, 11, 10];
-    expect(stateGetAccessorDimensions(state))
-        .toEqual(state.pcaComponentDimensions);
+    assert.deepEqual(state.pcaComponentDimensions,
+                     stateGetAccessorDimensions(state));
   });
 
   it('returns ["x", "y"] for custom projections', () => {
     const state = new State();
     state.selectedProjection = 'custom';
-    expect(stateGetAccessorDimensions(state)).toEqual(['x', 'y']);
+    assert.deepEqual(['x', 'y'], stateGetAccessorDimensions(state));
   });
 });
diff --git a/tensorflow/tensorboard/components/vz_projector_d3v4/scatterPlotRectangleSelector_test.ts b/tensorflow/tensorboard/components/vz_projector_d3v4/test/scatterPlotRectangleSelector_test.ts
similarity index 96%
rename from tensorflow/tensorboard/components/vz_projector_d3v4/scatterPlotRectangleSelector_test.ts
rename to tensorflow/tensorboard/components/vz_projector_d3v4/test/scatterPlotRectangleSelector_test.ts
index 91cb10a97ebfdfcad3ec30b0869acc7b77ab70d2..0ee6cf620df8bb082adf424a66548b832346597d 100644
--- a/tensorflow/tensorboard/components/vz_projector_d3v4/scatterPlotRectangleSelector_test.ts
+++ b/tensorflow/tensorboard/components/vz_projector_d3v4/test/scatterPlotRectangleSelector_test.ts
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-import {BoundingBox, ScatterPlotRectangleSelector} from './scatterPlotRectangleSelector';
+import {BoundingBox, ScatterPlotRectangleSelector} from '../scatterPlotRectangleSelector';
 
 describe('selector callbacks make bounding box start bottom left', () => {
   let containerElement: HTMLElement;
diff --git a/tensorflow/tensorboard/components/vz_projector_d3v4/sptree_test.ts b/tensorflow/tensorboard/components/vz_projector_d3v4/test/sptree_test.ts
similarity index 97%
rename from tensorflow/tensorboard/components/vz_projector_d3v4/sptree_test.ts
rename to tensorflow/tensorboard/components/vz_projector_d3v4/test/sptree_test.ts
index 440680bdf1eb4a0d2a478e3480772686ab875af0..7e340ea62f5d1146e11b8321f4668dc97d14e0c8 100644
--- a/tensorflow/tensorboard/components/vz_projector_d3v4/sptree_test.ts
+++ b/tensorflow/tensorboard/components/vz_projector_d3v4/test/sptree_test.ts
@@ -13,9 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-import {SPTree} from './sptree';
-
-const assert = chai.assert;
+import {SPTree} from '../sptree';
 
 it('simple 2D data', () => {
   let data = [
diff --git a/tensorflow/tensorboard/components/vz_projector_d3v4/test/tests.html b/tensorflow/tensorboard/components/vz_projector_d3v4/test/tests.html
new file mode 100644
index 0000000000000000000000000000000000000000..dd43079bde1f827a42893f2166d8c95645d93f99
--- /dev/null
+++ b/tensorflow/tensorboard/components/vz_projector_d3v4/test/tests.html
@@ -0,0 +1,24 @@
+<!doctype html>
+<!--
+@license
+Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+-->
+
+<meta charset="utf-8">
+<script src="../../web-component-tester/browser.js"></script>
+<script src="../../webcomponentsjs/webcomponents-lite.min.js"></script>
+<link rel="import" href="../bundle.html">
+<body>
+<script src="bundle.js"></script>
diff --git a/tensorflow/tensorboard/components/vz_projector_d3v4/util_test.ts b/tensorflow/tensorboard/components/vz_projector_d3v4/test/util_test.ts
similarity index 79%
rename from tensorflow/tensorboard/components/vz_projector_d3v4/util_test.ts
rename to tensorflow/tensorboard/components/vz_projector_d3v4/test/util_test.ts
index f7c0027c81bf307bfa4f08148e02e30fe4734046..c18db95eed706a3eacd09486fca2b67b5e01f595 100644
--- a/tensorflow/tensorboard/components/vz_projector_d3v4/util_test.ts
+++ b/tensorflow/tensorboard/components/vz_projector_d3v4/test/util_test.ts
@@ -12,31 +12,31 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-import * as util from './util';
+import * as util from '../util';
 
 describe('getURLParams', () => {
   it('search query with valid param returns correct object', () => {
     let urlParams = util.getURLParams('?config=http://google.com/');
-    expect(urlParams).toEqual({'config': 'http://google.com/'});
+    assert.deepEqual({'config': 'http://google.com/'}, urlParams);
   });
 
   it('search query with multiple valid params returns correct object', () => {
     let urlParams = util.getURLParams('?config=http://google.com/&foo=bar');
-    expect(urlParams).toEqual({'config': 'http://google.com/', 'foo': 'bar'});
+    assert.deepEqual({'config': 'http://google.com/', 'foo': 'bar'}, urlParams);
   });
 
   it('search query with valid param with URL encoded characters', () => {
     let urlParams = util.getURLParams('?config=http://google.com/%20search');
-    expect(urlParams).toEqual({'config': 'http://google.com/ search'});
+    assert.deepEqual({'config': 'http://google.com/ search'}, urlParams);
   });
 
   it('search query with pound sign', () => {
     let urlParams = util.getURLParams('?config=http://google.com/#foo');
-    expect(urlParams).toEqual({'config': 'http://google.com/'});
+    assert.deepEqual({'config': 'http://google.com/'}, urlParams);
   });
 
   it('no search query returns empty object', () => {
     let urlParams = util.getURLParams('');
-    expect(urlParams).toEqual({});
+    assert.deepEqual({}, urlParams);
   });
 });
diff --git a/tensorflow/tensorboard/components/vz_projector_d3v4/vz-projector-projections-panel_test.ts b/tensorflow/tensorboard/components/vz_projector_d3v4/test/vz-projector-projections-panel_test.ts
similarity index 96%
rename from tensorflow/tensorboard/components/vz_projector_d3v4/vz-projector-projections-panel_test.ts
rename to tensorflow/tensorboard/components/vz_projector_d3v4/test/vz-projector-projections-panel_test.ts
index fd1acf6f085e1c5bd3597933589338ce80f1e4e9..2bf0c6eb48f019e2467d7c9451748696bb6ed54d 100644
--- a/tensorflow/tensorboard/components/vz_projector_d3v4/vz-projector-projections-panel_test.ts
+++ b/tensorflow/tensorboard/components/vz_projector_d3v4/test/vz-projector-projections-panel_test.ts
@@ -12,10 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-import {State} from './data';
-import {ProjectionsPanel} from './vz-projector-projections-panel';
-
-const assert = chai.assert;
+import {State} from '../data';
+import {ProjectionsPanel} from '../vz-projector-projections-panel';
 
 describe('restoreUIFromBookmark', () => {
   let projectionsPanel: ProjectionsPanel;
diff --git a/tensorflow/tensorboard/components/vz_projector_d3v4/vz-projector-dashboard.html b/tensorflow/tensorboard/components/vz_projector_d3v4/vz-projector-dashboard.html
index 3857113ac0471b63cdaf31662a56228515e7501a..55c15da5ed73360b486cd65be3a05cdde68e91c5 100644
--- a/tensorflow/tensorboard/components/vz_projector_d3v4/vz-projector-dashboard.html
+++ b/tensorflow/tensorboard/components/vz_projector_d3v4/vz-projector-dashboard.html
@@ -37,6 +37,8 @@ limitations under the License.
   </template>
 </template>
 <script>
+"use strict";
+
 (function() {
 TF.Dashboard.VzProjectorDashboard = Polymer({
   is: 'vz-projector-dashboard',
diff --git a/tensorflow/tensorboard/components/vz_sorting/BUILD b/tensorflow/tensorboard/components/vz_sorting/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..ab5d7c398956700a9d1bab76a4d76da56dea3d1b
--- /dev/null
+++ b/tensorflow/tensorboard/components/vz_sorting/BUILD
@@ -0,0 +1,50 @@
+package(default_visibility = ["//tensorflow:internal"])
+
+load("@io_bazel_rules_closure//closure:defs.bzl", "web_library")
+load("//tensorflow/tensorboard:defs.bzl", "tensorboard_ts_library")
+load("//tensorflow/tensorboard:defs.bzl", "tensorboard_typescript_genrule")
+load("//tensorflow/tensorboard:defs.bzl", "tensorboard_webcomponent_library")
+
+licenses(["notice"])  # Apache 2.0
+
+web_library(
+    name = "vz_sorting",
+    srcs = [
+        "vz-sorting.html",
+        ":ts",
+    ],
+    path = "/vz-sorting",
+    visibility = ["//visibility:public"],
+)
+
+tensorboard_typescript_genrule(
+    name = "ts",
+    srcs = ["sorting.ts"],
+)
+
+filegroup(
+    name = "all_files",
+    srcs = glob(["**"]),
+    tags = ["notsan"],
+)
+
+################################################################################
+# MARKED FOR DELETION
+
+tensorboard_webcomponent_library(
+    name = "legacy",
+    srcs = [
+        "vz-sorting.html",
+        ":legacy_ts",
+    ],
+    visibility = ["//visibility:public"],
+    destdir = "vz-sorting",
+)
+
+tensorboard_ts_library(
+    name = "legacy_ts",
+    srcs = ["sorting.ts"],
+    deps_mgmt = "off",
+    runtime = "nodejs",
+    deps = ["//tensorflow/tensorboard/components:common_deps"],
+)
diff --git a/tensorflow/tensorboard/components/vz_sorting/test/BUILD b/tensorflow/tensorboard/components/vz_sorting/test/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..d1cf5a596ad2ef6846393a6bf223a897320af39a
--- /dev/null
+++ b/tensorflow/tensorboard/components/vz_sorting/test/BUILD
@@ -0,0 +1,40 @@
+package(default_visibility = ["//tensorflow:internal"])
+
+load("//tensorflow/tensorboard:defs.bzl", "tensorboard_ts_library")
+load("//tensorflow/tensorboard:defs.bzl", "tensorboard_wct_test_suite")
+
+licenses(["notice"])  # Apache 2.0
+
+filegroup(
+    name = "all_files",
+    srcs = glob(["**"]),
+    tags = ["notsan"],
+)
+
+################################################################################
+# MARKED FOR DELETION
+
+tensorboard_wct_test_suite(
+    name = "legacy_test",
+    size = "medium",
+    srcs = [
+        "index.html",
+        ":legacy_ts",
+    ],
+    deps = [
+        "//tensorflow/tensorboard/components/vz_sorting:legacy",
+        "//third_party/javascript/polymer/v1/webcomponentsjs:lib",
+    ],
+)
+
+tensorboard_ts_library(
+    name = "legacy_ts",
+    testonly = 1,
+    srcs = ["sortingTests.ts"],
+    deps_mgmt = "off",
+    runtime = "nodejs",
+    deps = [
+        "//tensorflow/tensorboard/components:common_deps",
+        "//tensorflow/tensorboard/components/vz_sorting:legacy_ts",
+    ],
+)
diff --git a/tensorflow/tensorboard/components/vz_sorting_d3v4/BUILD b/tensorflow/tensorboard/components/vz_sorting_d3v4/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..189a6a69ac7b12fdccdc6bb47d9ab322054d25ce
--- /dev/null
+++ b/tensorflow/tensorboard/components/vz_sorting_d3v4/BUILD
@@ -0,0 +1,34 @@
+package(default_visibility = ["//tensorflow:internal"])
+
+load("@io_bazel_rules_closure//closure:defs.bzl", "web_library")
+load("//tensorflow/tensorboard:hacks.bzl", "tensorboard_typescript_bundle")
+load("//tensorflow/tensorboard:defs.bzl", "tensorboard_typescript_genrule")
+
+licenses(["notice"])  # Apache 2.0
+
+web_library(
+    name = "vz_sorting_d3v4",
+    srcs = [
+        "bundle.js",
+        "vz-sorting.html",
+    ],
+    path = "/vz-sorting",
+    visibility = ["//visibility:public"],
+)
+
+tensorboard_typescript_genrule(
+    name = "ts",
+    srcs = ["bundle.ts"],
+)
+
+tensorboard_typescript_bundle(
+    name = "bundle",
+    out = "bundle.ts",
+    namespace_srcs = {"VZ.Sorting": ["sorting.ts"]},
+)
+
+filegroup(
+    name = "all_files",
+    srcs = glob(["**"]),
+    tags = ["notsan"],
+)
diff --git a/tensorflow/tensorboard/components/vz_sorting_d3v4/test/BUILD b/tensorflow/tensorboard/components/vz_sorting_d3v4/test/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..4993cf841dcc9a510e6b16f4942d31da3f5903e1
--- /dev/null
+++ b/tensorflow/tensorboard/components/vz_sorting_d3v4/test/BUILD
@@ -0,0 +1,46 @@
+package(
+    default_testonly = True,
+    default_visibility = ["//tensorflow:internal"],
+)
+
+load("@io_bazel_rules_closure//closure:defs.bzl", "web_library")
+load("//tensorflow/tensorboard:hacks.bzl", "tensorboard_typescript_bundle")
+load("//tensorflow/tensorboard:defs.bzl", "tensorboard_typescript_genrule")
+
+licenses(["notice"])  # Apache 2.0
+
+web_library(
+    name = "test",
+    srcs = [
+        "bundle.js",
+        "tests.html",
+    ],
+    path = "/vz-sorting/test",
+    deps = [
+        "//tensorflow/tensorboard/components/vz_sorting_d3v4",
+        "@org_npmjs_registry_web_component_tester",
+    ],
+)
+
+tensorboard_typescript_genrule(
+    name = "ts",
+    srcs = ["bundle.ts"],
+    typings = [
+        "@org_definitelytyped//:mocha.d.ts",
+        "@org_definitelytyped//:chai.d.ts",
+        "//tensorflow/tensorboard/components/vz_sorting_d3v4:bundle.d.ts",
+    ],
+)
+
+tensorboard_typescript_bundle(
+    name = "bundle",
+    out = "bundle.ts",
+    namespace_srcs = {"VZ.Sorting": ["sortingTests.ts"]},
+)
+
+filegroup(
+    name = "all_files",
+    testonly = 0,
+    srcs = glob(["**"]),
+    tags = ["notsan"],
+)
diff --git a/tensorflow/tensorboard/components/vz_sorting_d3v4/sortingTests.ts b/tensorflow/tensorboard/components/vz_sorting_d3v4/test/sortingTests.ts
similarity index 97%
rename from tensorflow/tensorboard/components/vz_sorting_d3v4/sortingTests.ts
rename to tensorflow/tensorboard/components/vz_sorting_d3v4/test/sortingTests.ts
index 4cfbfeb4ea10a92112b912baa42a3760582414a5..510685cb4b5e42ca19e56acef6b1f87347811c99 100644
--- a/tensorflow/tensorboard/components/vz_sorting_d3v4/sortingTests.ts
+++ b/tensorflow/tensorboard/components/vz_sorting_d3v4/test/sortingTests.ts
@@ -13,12 +13,11 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-import {compareTagNames} from './sorting';
-
-const assert = chai.assert;
+import {compareTagNames} from '../sorting';
 
 describe('compareTagNames', () => {
 
+  const assert = chai.assert;
   const sortTagNames = (a) => a.sort(compareTagNames);
 
   it('is asciibetical', () => {
diff --git a/tensorflow/tensorboard/components/vz_sorting_d3v4/tests.html b/tensorflow/tensorboard/components/vz_sorting_d3v4/test/tests.html
similarity index 90%
rename from tensorflow/tensorboard/components/vz_sorting_d3v4/tests.html
rename to tensorflow/tensorboard/components/vz_sorting_d3v4/test/tests.html
index 7148bfb4181ac40ef261397c8484286e35325ac2..d1b4a1db31ccaa1dfbc0838cbe79709b5f1cbedd 100644
--- a/tensorflow/tensorboard/components/vz_sorting_d3v4/tests.html
+++ b/tensorflow/tensorboard/components/vz_sorting_d3v4/test/tests.html
@@ -19,5 +19,5 @@ limitations under the License.
 <meta charset="utf-8">
 <script src="../../web-component-tester/browser.js"></script>
 <body>
-<script src="../sorting.js"></script>
-<script src="sortingTests.js"></script>
+<script src="../bundle.js"></script>
+<script src="bundle.js"></script>
diff --git a/tensorflow/tensorboard/components/vz_sorting_d3v4/vz-sorting.html b/tensorflow/tensorboard/components/vz_sorting_d3v4/vz-sorting.html
new file mode 100644
index 0000000000000000000000000000000000000000..9f925951cb2db13638dd8a9df8c4e9adb8fda5f2
--- /dev/null
+++ b/tensorflow/tensorboard/components/vz_sorting_d3v4/vz-sorting.html
@@ -0,0 +1,18 @@
+<!--
+@license
+Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+-->
+
+<script src="bundle.js"></script>
diff --git a/tensorflow/tensorboard/defs.bzl b/tensorflow/tensorboard/defs.bzl
index 7bb5f961c97818d686b32f83382bfa1b6c36f96a..5d88baa5be31d7d2b87c688756d1921c158db1f2 100644
--- a/tensorflow/tensorboard/defs.bzl
+++ b/tensorflow/tensorboard/defs.bzl
@@ -60,6 +60,26 @@ def tensorboard_typescript_genrule(name, srcs, typings=[], **kwargs):
       **kwargs
   )
 
+def tensorboard_karma_web_test_suite(**kwargs):
+  """Rules referencing this will be deleted from the codebase soon."""
+  pass
+
+def tensorboard_ts_config(**kwargs):
+  """Rules referencing this will be deleted from the codebase soon."""
+  pass
+
+def tensorboard_ts_declaration(**kwargs):
+  """Rules referencing this will be deleted from the codebase soon."""
+  pass
+
+def tensorboard_ts_development_sources(**kwargs):
+  """Rules referencing this will be deleted from the codebase soon."""
+  pass
+
+def tensorboard_ts_devserver(**kwargs):
+  """Rules referencing this will be deleted from the codebase soon."""
+  pass
+
 def tensorboard_ts_library(**kwargs):
   """Rules referencing this will be deleted from the codebase soon."""
   pass
diff --git a/tensorflow/tensorboard/demo/BUILD b/tensorflow/tensorboard/demo/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..66065af0441299dae274996deab9e004c104df26
--- /dev/null
+++ b/tensorflow/tensorboard/demo/BUILD
@@ -0,0 +1,20 @@
+package(default_visibility = ["//tensorflow:internal"])
+
+load("@io_bazel_rules_closure//closure:defs.bzl", "web_library")
+
+licenses(["notice"])  # Apache 2.0
+
+# THIS PACKAGE HAS MOVED
+# See tensorflow/tensorboard/components/tf_tensorboard_d3v4:demo
+
+web_library(
+    name = "demo_data",
+    srcs = glob(["data/**"]),
+    path = "/",
+)
+
+filegroup(
+    name = "all_files",
+    srcs = glob(["**"]),
+    tags = ["notsan"],
+)
diff --git a/tensorflow/tensorboard/demo/data/logdir b/tensorflow/tensorboard/demo/data/logdir
new file mode 100644
index 0000000000000000000000000000000000000000..b6362b45d777266d6204b23884222a080f789f71
--- /dev/null
+++ b/tensorflow/tensorboard/demo/data/logdir
@@ -0,0 +1 @@
+{"logdir": "/foo/some/fake/logdir"}
\ No newline at end of file
diff --git a/tensorflow/tensorboard/demo/index.html b/tensorflow/tensorboard/demo/index.html
deleted file mode 100644
index 581f8a27235ba8b67bf95f0e9afac9d3abe4b20e..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/demo/index.html
+++ /dev/null
@@ -1,31 +0,0 @@
-<!doctype html>
-<!--
-@license
-Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
--->
-
-<!-- This demo index file serves statically serialized TensorBoard json.
-It is essentially a mocked version of the TensorBoard backend. -->
-<html>
-<head>
-  <script src="../components/webcomponentsjs/webcomponents-lite.min.js"></script>
-    <link rel="import" href="../components/tf_tensorboard/tf-tensorboard.html">
-    <link rel="stylesheet" type="text/css" href="../lib/css/global.css">
-    <link rel="shortcut icon" href="data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAMQAAADECAMAAAD3eH5ZAAAABGdBTUEAALGPC/xhBQAAAAFzUkdCAK7OHOkAAAD/UExURfFlKfaELvFmKfNyK/67NvWALf68Nv69NvNxK/20NfyyNP22NfN0K/JrKvqhMv2zNf25Nf24Nf23NfeOL/yzNPyvNPJoKviWMPmeMfN1K/WBLfePL/FnKfeML/qlMvR7LPmcMfeLL/aJLvR5LPFoKfJuKvR3LP66NvywNPeNL/V/LfaILv21Nf26NfNzK/NvK/R6LPmaMfyxNPqfMvV+LfurM/iSMPmbMfJvKvmdMfumM/qiMvmZMfytNPJqKvysNPN2K/iYMPNwK/upM/JtKvJsKviVMPaHLvaGLvJpKvR8LPaKLvqkMvuqM/aFLvR4LPuoM/iTMPWDLfiRMPmYMXS0ngkAAALoSURBVHja7drnctpAFIbhFUISSKJ3MKYa0+y4xTW9937/15JkJhlTjhrSrHRmvuf/as6L0YLFCgEAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAMBJ6njenqspzgnPrsrGX9Zpi2tCrmnc6+dYNthVY5WpMmxQLWPdMsOuYVwzNj3ei2t3mQwaV43BJPDCS2NbJ5aEeuX/+9qcjQOtfFIkIkrvY2g4MVcmOBsFWbowKO/kNyj62gRpJcDaPBlxLr1B0zdG0C/8LzbJiJrshuvy1gzlA9+rD8mIkuyIJjFE3/dqnYwoSm7IUEPoD/wut8iIguSIDjlFxe/yfXL5vuSI21BTZLLhXoOILMO8Hxwa/L8bI0LfmUdhGowb2ZvT0e57pFNDgB06IlVyjmmIBl2T/nl9Rw6SD9GgSG/Q0uQkaW3XhmovKQ3eFQ4N2Uo9OQ1eFZsNerf7vP+rO4rhmY1Lg3vFVoP8+8BXg1sFnwbnCk4NThW8GuiKBDdkVVtTNFvNelVsNqTbyWnIOM2oeTRoyWvwmpJHg/ucXBrcJuXT4DwrpwZi2vy0VCx8YtXg/D2bU4OfiuQ3eFfE2KD4bfCqiLNB993gXsGlwa2CT4NzBacGIVQ6YsipQdh0xEdODUKjIxrSp88onZ8zbbFLg1DoiFO5BXvDGv2My9/JhUT8JUZTI0yDaNHLBzIbvqTDNYhUiVw/kdjQ1kM2CHFDPjKW+KzyRTF0g/ga9w9y+fANQpxvX8CU+Ny7FUWDeF3Y+g3lROIf4k0UDX9eCyvO531PyYhHga9zvPZJU5b73Y/eXj8Hv9D48n6HaF5LbcjRt8TZTtda5M1DfXnbkX1C0SHCFKzQB5Fe8op4GNGNHavvZESbVwT5r6W1xyuCPBY3Y9YgDqzknH/e3YfNzzuL30l0IebrZ5kKtuDIXt1n868ET6kf3/49tLvrCcZyF8Pu215dAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAcPIbNrBhOaBXucoAAAAASUVORK5CYII=">
-</head>
-<body>
-  <tf-tensorboard demo-dir="data/" use-hash></tf-tensorboard>
-</body>
-</html>
diff --git a/tensorflow/tensorboard/hacks.bzl b/tensorflow/tensorboard/hacks.bzl
new file mode 100644
index 0000000000000000000000000000000000000000..f1d4be790612ac912dc1b1a2298f8bc8dd99dee6
--- /dev/null
+++ b/tensorflow/tensorboard/hacks.bzl
@@ -0,0 +1,80 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# TODO(jart): Merge this file into defs.bzl once that file is sync unified.
+
+def tensorboard_typescript_bundle(
+    name,
+    out,
+    namespace_srcs,
+    namespace_symbol_aliases={},
+    namespace_symbol_aliases_public={},
+    **kwargs):
+  """Rolls TypeScript ES6 modules into one vanilla source file without imports.
+
+  This is a genrule wrapper that concatenates TypeScripts sources inside
+  namespace blocks while removing ^import lines. Because the sources themselves
+  are not parsed, the structure of the modules must be passed to this macro as
+  a Skylark data structure.
+
+  Args:
+    name: Name of this build rule target.
+    out: Path of outputted TypeScript source file.
+    namespace_srcs: Multimap of namespace strings to build file targets. The
+        ordering of the dictionary and nested lists does not matter when
+        generating a typings file, but *does* matter when generating a source
+        file.
+    namespace_symbol_aliases: Map of namespace strings where each value is a
+        map of symbol names to fully qualified symbol names.
+    namespace_symbol_aliases_public: Same as namespace_symbol_aliases but the
+        symbol will be visible to other namespaces.
+  """
+  cmd = ["(", "echo // GENERATED BY TENSORBOARD_TYPESCRIPT_BUNDLE"]
+  inputs = set()
+  for namespace, srcs in namespace_srcs.items():
+    cmd.append("echo")
+    if out[-5:] == ".d.ts":
+      cmd.append("echo 'declare namespace %s {'" % namespace)
+    elif out[-3:] == ".ts":
+      cmd.append("echo 'module %s {'" % namespace)
+    else:
+      fail("'out' must end with .ts or .d.ts: " + out)
+    for symbol, canon in namespace_symbol_aliases.get(namespace, {}).items():
+      cmd.append("echo 'import %s = %s;'" % (symbol, canon))
+    for symbol, canon in namespace_symbol_aliases_public.get(namespace,
+                                                             {}).items():
+      cmd.append("echo 'export import %s = %s;'" % (symbol, canon))
+    inputs += srcs
+    for src in srcs:
+      cmd.append("for f in $(locations %s); do" % src)
+      cmd.append("  echo")
+      cmd.append("  echo /////////////////////////////////////////////////////")
+      cmd.append("  echo // " + namespace)
+      cmd.append("  echo // $$f")
+      cmd.append("  echo /////////////////////////////////////////////////////")
+      cmd.append("  echo")
+      cmd.append("  sed 's!^import !// import !' $$f \\")
+      cmd.append("    | sed 's!^export declare !export !' \\")
+      cmd.append("    | sed '/^export .* from /d' \\")
+      cmd.append("    | sed '/^export {.*};$$/d'")
+      cmd.append("done")
+    cmd.append("echo '}'")
+  cmd.append(") >$@")
+  native.genrule(
+      name = name,
+      srcs = list(inputs),
+      outs = [out],
+      cmd = "\n".join(cmd),
+      **kwargs
+  )
diff --git a/tensorflow/tensorboard/java/org/tensorflow/tensorboard/vulcanize/BUILD b/tensorflow/tensorboard/java/org/tensorflow/tensorboard/vulcanize/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..07fc3a70a704eef0c0a0d6ee24445c5dbb23786b
--- /dev/null
+++ b/tensorflow/tensorboard/java/org/tensorflow/tensorboard/vulcanize/BUILD
@@ -0,0 +1,23 @@
+package(default_visibility = ["//tensorflow:internal"])
+
+licenses(["notice"])  # Apache 2.0
+
+java_binary(
+    name = "Vulcanize",
+    srcs = ["Vulcanize.java"],
+    deps = [
+        "@com_google_guava",
+        "@com_google_protobuf_java",
+        "@io_bazel_rules_closure//closure/compiler",
+        "@io_bazel_rules_closure//java/io/bazel/rules/closure:webpath",
+        "@io_bazel_rules_closure//java/io/bazel/rules/closure/webfiles:build_info_java_proto",
+        "@io_bazel_rules_closure//java/org/jsoup/nodes",
+        "@org_jsoup",
+    ],
+)
+
+filegroup(
+    name = "all_files",
+    srcs = glob(["**"]),
+    tags = ["notsan"],
+)
diff --git a/tensorflow/tensorboard/java/org/tensorflow/tensorboard/vulcanize/Vulcanize.java b/tensorflow/tensorboard/java/org/tensorflow/tensorboard/vulcanize/Vulcanize.java
new file mode 100644
index 0000000000000000000000000000000000000000..e572415856cd7151d04aa2cbd1b8c49678782acd
--- /dev/null
+++ b/tensorflow/tensorboard/java/org/tensorflow/tensorboard/vulcanize/Vulcanize.java
@@ -0,0 +1,317 @@
+// Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package org.tensorflow.tensorboard.vulcanize;
+
+import static com.google.common.base.Preconditions.checkNotNull;
+import static com.google.common.base.Verify.verifyNotNull;
+import static java.nio.charset.StandardCharsets.UTF_8;
+
+import com.google.common.base.CharMatcher;
+import com.google.common.base.Joiner;
+import com.google.common.collect.ImmutableList;
+import com.google.common.collect.Iterables;
+import com.google.javascript.jscomp.BasicErrorManager;
+import com.google.javascript.jscomp.CheckLevel;
+import com.google.javascript.jscomp.Compiler;
+import com.google.javascript.jscomp.CompilerOptions;
+import com.google.javascript.jscomp.CompilerOptions.LanguageMode;
+import com.google.javascript.jscomp.CompilerOptions.Reach;
+import com.google.javascript.jscomp.JSError;
+import com.google.javascript.jscomp.PropertyRenamingPolicy;
+import com.google.javascript.jscomp.SourceFile;
+import com.google.javascript.jscomp.VariableRenamingPolicy;
+import com.google.protobuf.TextFormat;
+import io.bazel.rules.closure.Webpath;
+import io.bazel.rules.closure.webfiles.BuildInfo.Webfiles;
+import io.bazel.rules.closure.webfiles.BuildInfo.WebfilesSource;
+import java.io.ByteArrayInputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.nio.file.Paths;
+import java.nio.file.StandardOpenOption;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+import org.jsoup.Jsoup;
+import org.jsoup.nodes.Comment;
+import org.jsoup.nodes.DataNode;
+import org.jsoup.nodes.Document;
+import org.jsoup.nodes.Element;
+import org.jsoup.nodes.Html5Printer;
+import org.jsoup.nodes.Node;
+import org.jsoup.nodes.TextNode;
+import org.jsoup.parser.Parser;
+import org.jsoup.parser.Tag;
+
+/** Simple one-off solution for TensorBoard vulcanization. */
+public final class Vulcanize {
+
+  private static final Parser parser = Parser.htmlParser();
+  private static final Map<Webpath, Path> webfiles = new HashMap<>();
+  private static final Set<Webpath> alreadyInlined = new HashSet<>();
+  private static final Set<String> legalese = new HashSet<>();
+  private static final List<String> licenses = new ArrayList<>();
+  private static final List<Webpath> stack = new ArrayList<>();
+  private static Webpath outputPath;
+  private static Node licenseComment;
+  private static boolean nominify;
+
+  public static void main(String[] args) throws IOException {
+    Webpath inputPath = Webpath.get(args[0]);
+    outputPath = Webpath.get(args[1]);
+    Path output = Paths.get(args[2]);
+    for (int i = 3; i < args.length; i++) {
+      Webfiles manifest = loadWebfilesPbtxt(Paths.get(args[i]));
+      for (WebfilesSource src : manifest.getSrcList()) {
+        webfiles.put(Webpath.get(src.getWebpath()), Paths.get(src.getPath()));
+      }
+    }
+    stack.add(inputPath);
+    Document document = parse(Files.readAllBytes(webfiles.get(inputPath)));
+    transform(document);
+    if (licenseComment != null) {
+      licenseComment.attr("comment", String.format("\n%s\n", Joiner.on("\n\n").join(licenses)));
+    }
+    Files.write(
+        output,
+        Html5Printer.stringify(document).getBytes(UTF_8),
+        StandardOpenOption.WRITE,
+        StandardOpenOption.CREATE,
+        StandardOpenOption.TRUNCATE_EXISTING);
+  }
+
+  private static void transform(Node root) throws IOException {
+    Node node = checkNotNull(root);
+    Node newNode;
+    while (true) {
+      newNode = enterNode(node);
+      if (node.equals(root)) {
+        root = newNode;
+      }
+      node = newNode;
+      if (node.childNodeSize() > 0) {
+        node = node.childNode(0);
+      } else {
+        while (true) {
+          newNode = leaveNode(node);
+          if (node.equals(root)) {
+            root = newNode;
+          }
+          node = newNode;
+          if (node.equals(root)) {
+            return;
+          }
+          Node next = node.nextSibling();
+          if (next == null) {
+            if (node.parentNode() == null) {
+              return;
+            }
+            node = verifyNotNull(node.parentNode(), "unexpected root: %s", node);
+          } else {
+            node = next;
+            break;
+          }
+        }
+      }
+    }
+  }
+
+  private static Node enterNode(Node node) throws IOException {
+    Node newNode = node;
+    if (node instanceof Element) {
+      if (node.nodeName().equals("link") && node.attr("rel").equals("import")) {
+        // Inline HTML.
+        Webpath href = me().lookup(Webpath.get(node.attr("href")));
+        if (alreadyInlined.add(href)) {
+          newNode =
+              parse(Files.readAllBytes(checkNotNull(webfiles.get(href), "%s in %s", href, me())));
+          stack.add(href);
+          node.replaceWith(newNode);
+        } else {
+          newNode = new TextNode("", node.baseUri());
+          node.replaceWith(newNode);
+        }
+      } else if (node.nodeName().equals("script")) {
+        nominify = node.hasAttr("nominify");
+        node.removeAttr("nominify");
+        Webpath src;
+        String script;
+        if (node.attr("src").isEmpty()) {
+          // Minify JavaScript.
+          StringBuilder sb = new StringBuilder();
+          for (Node child : node.childNodes()) {
+            if (child instanceof DataNode) {
+              sb.append(((DataNode) child).getWholeData());
+            }
+          }
+          src = me();
+          script = sb.toString();
+        } else {
+          // Inline JavaScript.
+          src = me().lookup(Webpath.get(node.attr("src")));
+          Path other = webfiles.get(src);
+          if (other != null) {
+            script = new String(Files.readAllBytes(other), UTF_8);
+            node.removeAttr("src");
+          } else {
+            src = me();
+            script = "";
+          }
+        }
+        script = minify(src, script);
+        newNode =
+            new Element(Tag.valueOf("script"), node.baseUri(), node.attributes())
+                .appendChild(new DataNode(script, node.baseUri()));
+        node.replaceWith(newNode);
+      } else if (node.nodeName().equals("link")
+          && node.attr("rel").equals("stylesheet")
+          && !node.attr("href").isEmpty()) {
+        // Inline CSS.
+        Webpath href = me().lookup(Webpath.get(node.attr("href")));
+        Path other = webfiles.get(href);
+        if (other != null) {
+          newNode =
+              new Element(Tag.valueOf("style"), node.baseUri(), node.attributes())
+                  .appendChild(
+                      new DataNode(new String(Files.readAllBytes(other), UTF_8), node.baseUri()));
+          newNode.removeAttr("rel");
+          newNode.removeAttr("href");
+          node.replaceWith(newNode);
+        }
+      }
+      rootifyAttribute(newNode, "href");
+      rootifyAttribute(newNode, "src");
+      rootifyAttribute(newNode, "action");
+      rootifyAttribute(newNode, "assetpath");
+    } else if (node instanceof Comment) {
+      String text = ((Comment) node).getData();
+      if (text.contains("@license")) {
+        handleLicense(text);
+        if (licenseComment == null) {
+          licenseComment = node;
+        } else {
+          newNode = new TextNode("", node.baseUri());
+          node.replaceWith(newNode);
+        }
+      } else {
+        newNode = new TextNode("", node.baseUri());
+        node.replaceWith(newNode);
+      }
+    }
+    return newNode;
+  }
+
+  private static String minify(Webpath src, String script) {
+    if (nominify) {
+      return script;
+    }
+    Compiler compiler = new Compiler(new JsPrintlessErrorManager());
+    CompilerOptions options = new CompilerOptions();
+    options.skipAllCompilerPasses(); // too lazy to get externs
+    options.setLanguageIn(LanguageMode.ECMASCRIPT_2016);
+    options.setLanguageOut(LanguageMode.ECMASCRIPT5);
+    options.setContinueAfterErrors(true);
+    options.setManageClosureDependencies(false);
+    options.setRenamingPolicy(VariableRenamingPolicy.LOCAL, PropertyRenamingPolicy.OFF);
+    options.setShadowVariables(true);
+    options.setInlineVariables(Reach.LOCAL_ONLY);
+    options.setFlowSensitiveInlineVariables(true);
+    options.setInlineFunctions(Reach.LOCAL_ONLY);
+    options.setAssumeClosuresOnlyCaptureReferences(false);
+    options.setCheckGlobalThisLevel(CheckLevel.OFF);
+    options.setFoldConstants(true);
+    options.setCoalesceVariableNames(true);
+    options.setDeadAssignmentElimination(true);
+    options.setCollapseVariableDeclarations(true);
+    options.setConvertToDottedProperties(true);
+    options.setLabelRenaming(true);
+    options.setRemoveDeadCode(true);
+    options.setOptimizeArgumentsArray(true);
+    options.setRemoveUnusedVariables(Reach.LOCAL_ONLY);
+    options.setCollapseObjectLiterals(true);
+    options.setProtectHiddenSideEffects(true);
+    //options.setPrettyPrint(true);
+    compiler.disableThreads();
+    compiler.compile(
+        ImmutableList.<SourceFile>of(),
+        ImmutableList.of(SourceFile.fromCode(src.toString(), script)),
+        options);
+    return compiler.toSource();
+  }
+
+  private static void handleLicense(String text) {
+    if (legalese.add(CharMatcher.whitespace().removeFrom(text))) {
+      licenses.add(CharMatcher.anyOf("\r\n").trimFrom(text));
+    }
+  }
+
+  private static Node leaveNode(Node node) {
+    if (node instanceof Document) {
+      stack.remove(stack.size() - 1);
+    }
+    return node;
+  }
+
+  private static Webpath me() {
+    return Iterables.getLast(stack);
+  }
+
+  private static void rootifyAttribute(Node node, String attribute) {
+    String value = node.attr(attribute);
+    if (value.isEmpty()) {
+      return;
+    }
+    Webpath uri = Webpath.get(value);
+    if (webfiles.containsKey(uri)) {
+      node.attr(attribute, outputPath.getParent().relativize(uri).toString());
+    }
+  }
+
+  private static Document parse(byte[] bytes) {
+    return parse(new ByteArrayInputStream(bytes));
+  }
+
+  private static Document parse(InputStream input) {
+    Document document;
+    try {
+      document = Jsoup.parse(input, null, "", parser);
+    } catch (IOException e) {
+      throw new AssertionError("I/O error when parsing byte array D:", e);
+    }
+    document.outputSettings().indentAmount(0);
+    document.outputSettings().prettyPrint(false);
+    return document;
+  }
+
+  private static Webfiles loadWebfilesPbtxt(Path path) throws IOException {
+    Webfiles.Builder build = Webfiles.newBuilder();
+    TextFormat.getParser().merge(new String(Files.readAllBytes(path), UTF_8), build);
+    return build.build();
+  }
+
+  private static final class JsPrintlessErrorManager extends BasicErrorManager {
+
+    @Override
+    public void println(CheckLevel level, JSError error) {}
+
+    @Override
+    public void printSummary() {}
+  }
+}
diff --git a/tensorflow/tensorboard/package.json b/tensorflow/tensorboard/package.json
index 69f08495a305cca8bf3e1dc56d34e3c16b72f3fb..d424f103dd76be4b74dc035316f0637a8fdfbaab 100644
--- a/tensorflow/tensorboard/package.json
+++ b/tensorflow/tensorboard/package.json
@@ -30,7 +30,7 @@
     "merge2": "~0.3.6",
     "minimist": "~1.2.0",
     "tsify": "^0.14.8",
-    "typescript": "2.2.2",
+    "typescript": "2.3.1",
     "typings": "1.4.0",
     "vinyl-source-stream": "^1.1.0",
     "vulcanize": "^1.14.0",
diff --git a/tensorflow/tensorboard/plugins/text/BUILD b/tensorflow/tensorboard/plugins/text/BUILD
index 3e1455ea5a4848e932a4f8f3e6d25695955c7d14..f6ed41375f78c925c52667546e67d56c2ee0e28a 100644
--- a/tensorflow/tensorboard/plugins/text/BUILD
+++ b/tensorflow/tensorboard/plugins/text/BUILD
@@ -24,6 +24,7 @@ py_library(
         "@org_mozilla_bleach",
         "@org_pocoo_werkzeug//:werkzeug",
         "@org_pythonhosted_markdown",
+        "@six_archive//:six",
     ],
 )
 
diff --git a/tensorflow/tensorboard/plugins/text/text_plugin.py b/tensorflow/tensorboard/plugins/text/text_plugin.py
index 427a761d1eff9b7e4bed952db6d65fd4498e0348..280f77a2ae566fc7f5f9b0804af5c610b8440305 100644
--- a/tensorflow/tensorboard/plugins/text/text_plugin.py
+++ b/tensorflow/tensorboard/plugins/text/text_plugin.py
@@ -30,6 +30,7 @@ import bleach
 # pylint: disable=g-bad-import-order
 # Google-only: import markdown_freewisdom
 import markdown
+import six
 # pylint: enable=g-bad-import-order
 from werkzeug import wrappers
 
@@ -95,8 +96,8 @@ def markdown_and_sanitize(markdown_string):
   Returns:
     a string containing sanitized html for input markdown
   """
-  # Convert to utf-8 because we get a bytearray in python3
-  if not isinstance(markdown_string, str):
+  # Convert to utf-8 whenever we have a binary input.
+  if isinstance(markdown_string, six.binary_type):
     markdown_string = markdown_string.decode('utf-8')
 
   string_html = markdown.markdown(
diff --git a/tensorflow/tensorboard/plugins/text/text_plugin_test.py b/tensorflow/tensorboard/plugins/text/text_plugin_test.py
index 91dca289ce1a14ed2cc08e2eb6c759739a662bc0..a7f0235889953e595a7ff44ff99fb2e62fe16c93 100644
--- a/tensorflow/tensorboard/plugins/text/text_plugin_test.py
+++ b/tensorflow/tensorboard/plugins/text/text_plugin_test.py
@@ -1,3 +1,4 @@
+# -*- coding: utf-8 -*-
 # Copyright 2017 The TensorFlow Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
@@ -404,6 +405,10 @@ class TextPluginTest(test.TestCase):
     # The plugin is active because text summaries are available.
     self.assertTrue(self.plugin.is_active())
 
+  def testUnicode(self):
+    self.assertConverted(u'<p>Iñtërnâtiônàlizætiøn⚡💩</p>',
+                         'Iñtërnâtiônàlizætiøn⚡💩')
+
 
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/tensorboard/vulcanize.bzl b/tensorflow/tensorboard/vulcanize.bzl
new file mode 100644
index 0000000000000000000000000000000000000000..f7d88047afca8556642fa24bed710c79a1285fd3
--- /dev/null
+++ b/tensorflow/tensorboard/vulcanize.bzl
@@ -0,0 +1,100 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+load("@io_bazel_rules_closure//closure/private:defs.bzl", "unfurl", "long_path")
+
+def _tensorboard_html_binary(ctx):
+  deps = unfurl(ctx.attr.deps, provider="webfiles")
+  manifests = set(order="link")
+  files = set()
+  for dep in deps:
+    manifests += dep.webfiles.manifests
+    files += dep.data_runfiles.files
+
+  # vulcanize
+  ctx.action(
+      inputs=list(manifests + files),
+      outputs=[ctx.outputs.html],
+      executable=ctx.executable._Vulcanize,
+      arguments=([ctx.attr.input_path,
+                  ctx.attr.output_path,
+                  ctx.outputs.html.path] +
+                 [m.path for m in manifests]),
+      progress_message="Vulcanizing %s" % ctx.attr.input_path)
+
+  # webfiles manifest
+  manifest_srcs = [struct(path=ctx.outputs.html.path,
+                          longpath=long_path(ctx, ctx.outputs.html),
+                          webpath=ctx.attr.output_path)]
+  manifest = ctx.new_file(ctx.configuration.bin_dir,
+                          "%s.pbtxt" % ctx.label.name)
+  ctx.file_action(
+      output=manifest,
+      content=struct(
+          label=str(ctx.label),
+          src=manifest_srcs).to_proto())
+  manifests += [manifest]
+
+  # webfiles server
+  params = struct(
+      label=str(ctx.label),
+      bind="[::]:6006",
+      manifest=[long_path(ctx, man) for man in manifests],
+      external_asset=[struct(webpath=k, path=v)
+                      for k, v in ctx.attr.external_assets.items()])
+  params_file = ctx.new_file(ctx.configuration.bin_dir,
+                             "%s_server_params.pbtxt" % ctx.label.name)
+  ctx.file_action(output=params_file, content=params.to_proto())
+  ctx.file_action(
+      executable=True,
+      output=ctx.outputs.executable,
+      content="#!/bin/sh\nexec %s %s" % (
+          ctx.executable._WebfilesServer.short_path,
+          long_path(ctx, params_file)))
+
+  transitive_runfiles = set()
+  transitive_runfiles += ctx.attr._WebfilesServer.data_runfiles.files
+  for dep in deps:
+    transitive_runfiles += dep.data_runfiles.files
+  return struct(
+      files=set([ctx.outputs.html]),
+      runfiles=ctx.runfiles(
+          files=ctx.files.data + [manifest,
+                                  params_file,
+                                  ctx.outputs.html,
+                                  ctx.outputs.executable],
+          transitive_files=transitive_runfiles))
+
+tensorboard_html_binary = rule(
+    implementation=_tensorboard_html_binary,
+    executable=True,
+    attrs={
+        "input_path": attr.string(mandatory=True),
+        "output_path": attr.string(mandatory=True),
+        "data": attr.label_list(cfg="data", allow_files=True),
+        "deps": attr.label_list(providers=["webfiles"], mandatory=True),
+        "external_assets": attr.string_dict(default={"/_/runfiles": "."}),
+        "_Vulcanize": attr.label(
+            default=Label("//tensorflow/tensorboard/java/org/tensorflow/tensorboard/vulcanize:Vulcanize"),
+            executable=True,
+            cfg="host"),
+        "_WebfilesServer": attr.label(
+            default=Label(
+                "@io_bazel_rules_closure//java/io/bazel/rules/closure/webfiles/server:WebfilesServer"),
+            executable=True,
+            cfg="host"),
+    },
+    outputs={
+        "html": "%{name}.html",
+    })
diff --git a/tensorflow/tensorflow.bzl b/tensorflow/tensorflow.bzl
index ddffabd8cb371fbb802dd2485983ca9cffb9f7f6..348745f8d2bb0c40f9c1e9c3d7630b463257d66c 100644
--- a/tensorflow/tensorflow.bzl
+++ b/tensorflow/tensorflow.bzl
@@ -241,7 +241,7 @@ def tf_gen_op_wrapper_cc(name,
 #            hdrs = [ "ops/array_ops_internal.h",
 #                     "ops/math_ops_internal.h" ],
 #            deps = [ ... ])
-# TODO(josh11b): Cleaner approach for hidden ops.
+# TODO(joshl): Cleaner approach for hidden ops.
 def tf_gen_op_wrappers_cc(name,
                           op_lib_names=[],
                           other_srcs=[],
diff --git a/tensorflow/tools/api/golden/tensorflow.-config-proto.pbtxt b/tensorflow/tools/api/golden/tensorflow.-config-proto.pbtxt
index 805a9bdd4f121bb1174882d863992e7e97095b62..da6af3919e96bd6145c33a84aca89c44473ce66c 100644
--- a/tensorflow/tools/api/golden/tensorflow.-config-proto.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.-config-proto.pbtxt
@@ -6,6 +6,10 @@ tf_class {
     name: "ALLOW_SOFT_PLACEMENT_FIELD_NUMBER"
     mtype: "<type \'int\'>"
   }
+  member {
+    name: "CLUSTER_DEF_FIELD_NUMBER"
+    mtype: "<type \'int\'>"
+  }
   member {
     name: "DESCRIPTOR"
     mtype: "<type \'google.protobuf.pyext._message.MessageDescriptor\'>"
diff --git a/tensorflow/tools/api/golden/tensorflow.-operation.pbtxt b/tensorflow/tools/api/golden/tensorflow.-operation.pbtxt
index 0f43a49ee9675616b79f8da6d1c03735ece15e25..64240f706983bb2ced63e49937800d2db4e627f2 100644
--- a/tensorflow/tools/api/golden/tensorflow.-operation.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.-operation.pbtxt
@@ -38,6 +38,10 @@ tf_class {
     name: "traceback"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "traceback_with_start_lines"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "type"
     mtype: "<type \'property\'>"
diff --git a/tensorflow/tools/api/golden/tensorflow.estimator.-run-config.pbtxt b/tensorflow/tools/api/golden/tensorflow.estimator.-run-config.pbtxt
index 32082fc10bb1b7835314043024ea2659c43c28ad..d69c475a313075a5b165dba9a80e30cf8212657d 100644
--- a/tensorflow/tools/api/golden/tensorflow.estimator.-run-config.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.estimator.-run-config.pbtxt
@@ -68,5 +68,10 @@ tf_class {
   }
   member_method {
     name: "__init__"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "replace"
+    argspec: "args=[\'self\'], varargs=None, keywords=kwargs, defaults=None"
   }
 }
diff --git a/tensorflow/tools/api/golden/tensorflow.feature_column.pbtxt b/tensorflow/tools/api/golden/tensorflow.feature_column.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..4c633a850f8e069135f122292bac019e2646aa61
--- /dev/null
+++ b/tensorflow/tools/api/golden/tensorflow.feature_column.pbtxt
@@ -0,0 +1,55 @@
+path: "tensorflow.feature_column"
+tf_module {
+  member_method {
+    name: "bucketized_column"
+    argspec: "args=[\'source_column\', \'boundaries\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "categorical_column_with_hash_bucket"
+    argspec: "args=[\'key\', \'hash_bucket_size\', \'dtype\'], varargs=None, keywords=None, defaults=[\"<dtype: \'string\'>\"], "
+  }
+  member_method {
+    name: "categorical_column_with_identity"
+    argspec: "args=[\'key\', \'num_buckets\', \'default_value\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "categorical_column_with_vocabulary_file"
+    argspec: "args=[\'key\', \'vocabulary_file\', \'vocabulary_size\', \'num_oov_buckets\', \'default_value\', \'dtype\'], varargs=None, keywords=None, defaults=[\'0\', \'None\', \"<dtype: \'string\'>\"], "
+  }
+  member_method {
+    name: "categorical_column_with_vocabulary_list"
+    argspec: "args=[\'key\', \'vocabulary_list\', \'dtype\', \'default_value\'], varargs=None, keywords=None, defaults=[\'None\', \'-1\'], "
+  }
+  member_method {
+    name: "crossed_column"
+    argspec: "args=[\'keys\', \'hash_bucket_size\', \'hash_key\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "embedding_column"
+    argspec: "args=[\'categorical_column\', \'dimension\', \'combiner\', \'initializer\', \'ckpt_to_load_from\', \'tensor_name_in_ckpt\', \'max_norm\', \'trainable\'], varargs=None, keywords=None, defaults=[\'mean\', \'None\', \'None\', \'None\', \'None\', \'True\'], "
+  }
+  member_method {
+    name: "indicator_column"
+    argspec: "args=[\'categorical_column\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "input_layer"
+    argspec: "args=[\'features\', \'feature_columns\', \'weight_collections\', \'trainable\'], varargs=None, keywords=None, defaults=[\'None\', \'True\'], "
+  }
+  member_method {
+    name: "linear_model"
+    argspec: "args=[\'features\', \'feature_columns\', \'units\', \'sparse_combiner\', \'weight_collections\', \'trainable\'], varargs=None, keywords=None, defaults=[\'1\', \'sum\', \'None\', \'True\'], "
+  }
+  member_method {
+    name: "make_parse_example_spec"
+    argspec: "args=[\'feature_columns\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "numeric_column"
+    argspec: "args=[\'key\', \'shape\', \'default_value\', \'dtype\', \'normalizer_fn\'], varargs=None, keywords=None, defaults=[\'(1,)\', \'None\', \"<dtype: \'float32\'>\", \'None\'], "
+  }
+  member_method {
+    name: "weighted_categorical_column"
+    argspec: "args=[\'categorical_column\', \'weight_feature_key\', \'dtype\'], varargs=None, keywords=None, defaults=[\"<dtype: \'float32\'>\"], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/tensorflow.layers.pbtxt b/tensorflow/tools/api/golden/tensorflow.layers.pbtxt
index 6ca38e259bf0d8cbe0a644d891f84e0134fde1f2..78b10c44a23c8a1093b1727eb7fb7efae87a33cd 100644
--- a/tensorflow/tools/api/golden/tensorflow.layers.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.layers.pbtxt
@@ -32,6 +32,10 @@ tf_module {
     name: "conv3d"
     argspec: "args=[\'inputs\', \'filters\', \'kernel_size\', \'strides\', \'padding\', \'data_format\', \'dilation_rate\', \'activation\', \'use_bias\', \'kernel_initializer\', \'bias_initializer\', \'kernel_regularizer\', \'bias_regularizer\', \'activity_regularizer\', \'trainable\', \'name\', \'reuse\'], varargs=None, keywords=None, defaults=[\'(1, 1, 1)\', \'valid\', \'channels_last\', \'(1, 1, 1)\', \'None\', \'True\', \'None\', \'<tensorflow.python.ops.init_ops.Zeros object instance>\', \'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
   }
+  member_method {
+    name: "conv3d_transpose"
+    argspec: "args=[\'inputs\', \'filters\', \'kernel_size\', \'strides\', \'padding\', \'data_format\', \'activation\', \'use_bias\', \'kernel_initializer\', \'bias_initializer\', \'kernel_regularizer\', \'bias_regularizer\', \'activity_regularizer\', \'trainable\', \'name\', \'reuse\'], varargs=None, keywords=None, defaults=[\'(1, 1, 1)\', \'valid\', \'channels_last\', \'None\', \'True\', \'None\', \'<tensorflow.python.ops.init_ops.Zeros object instance>\', \'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+  }
   member_method {
     name: "dense"
     argspec: "args=[\'inputs\', \'units\', \'activation\', \'use_bias\', \'kernel_initializer\', \'bias_initializer\', \'kernel_regularizer\', \'bias_regularizer\', \'activity_regularizer\', \'trainable\', \'name\', \'reuse\'], varargs=None, keywords=None, defaults=[\'None\', \'True\', \'None\', \'<tensorflow.python.ops.init_ops.Zeros object instance>\', \'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
diff --git a/tensorflow/tools/api/golden/tensorflow.nn.pbtxt b/tensorflow/tools/api/golden/tensorflow.nn.pbtxt
index 192ceac2ddfaaf44e9c9db0e0d78c8fdc496584e..b1b60fbdcbb50bca8c0481d86ba620b982532ca6 100644
--- a/tensorflow/tools/api/golden/tensorflow.nn.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.nn.pbtxt
@@ -70,7 +70,7 @@ tf_module {
   }
   member_method {
     name: "conv3d_transpose"
-    argspec: "args=[\'value\', \'filter\', \'output_shape\', \'strides\', \'padding\', \'data_format\', \'name\'], varargs=None, keywords=None, defaults=[\'SAME\', \'None\', \'None\'], "
+    argspec: "args=[\'value\', \'filter\', \'output_shape\', \'strides\', \'padding\', \'data_format\', \'name\'], varargs=None, keywords=None, defaults=[\'SAME\', \'NDHWC\', \'None\'], "
   }
   member_method {
     name: "convolution"
@@ -90,7 +90,7 @@ tf_module {
   }
   member_method {
     name: "ctc_loss"
-    argspec: "args=[\'labels\', \'inputs\', \'sequence_length\', \'preprocess_collapse_repeated\', \'ctc_merge_repeated\', \'time_major\'], varargs=None, keywords=None, defaults=[\'False\', \'True\', \'True\'], "
+    argspec: "args=[\'labels\', \'inputs\', \'sequence_length\', \'preprocess_collapse_repeated\', \'ctc_merge_repeated\', \'ignore_longer_outputs_than_inputs\', \'time_major\'], varargs=None, keywords=None, defaults=[\'False\', \'True\', \'False\', \'True\'], "
   }
   member_method {
     name: "depthwise_conv2d"
diff --git a/tensorflow/tools/api/golden/tensorflow.pbtxt b/tensorflow/tools/api/golden/tensorflow.pbtxt
index b51b0bf4afb92eccec967a53197c4cdc979dc1ae..f0fdd693903d89fb381300af58db21b400cb3c12 100644
--- a/tensorflow/tools/api/golden/tensorflow.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.pbtxt
@@ -292,6 +292,10 @@ tf_module {
     name: "estimator"
     mtype: "<type \'module\'>"
   }
+  member {
+    name: "feature_column"
+    mtype: "<type \'module\'>"
+  }
   member {
     name: "flags"
     mtype: "<type \'module\'>"
@@ -638,7 +642,7 @@ tf_module {
   }
   member_method {
     name: "atan2"
-    argspec: "args=[\'x1\', \'x2\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+    argspec: "args=[\'y\', \'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
   member_method {
     name: "batch_to_space"
@@ -722,7 +726,7 @@ tf_module {
   }
   member_method {
     name: "cond"
-    argspec: "args=[\'pred\', \'fn1\', \'fn2\', \'strict\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], "
+    argspec: "args=[\'pred\', \'true_fn\', \'false_fn\', \'strict\', \'name\', \'fn1\', \'fn2\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'False\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "confusion_matrix"
@@ -894,27 +898,27 @@ tf_module {
   }
   member_method {
     name: "fake_quant_with_min_max_args"
-    argspec: "args=[\'inputs\', \'min\', \'max\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
+    argspec: "args=[\'inputs\', \'min\', \'max\', \'num_bits\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "fake_quant_with_min_max_args_gradient"
-    argspec: "args=[\'gradients\', \'inputs\', \'min\', \'max\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
+    argspec: "args=[\'gradients\', \'inputs\', \'min\', \'max\', \'num_bits\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "fake_quant_with_min_max_vars"
-    argspec: "args=[\'inputs\', \'min\', \'max\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+    argspec: "args=[\'inputs\', \'min\', \'max\', \'num_bits\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
   }
   member_method {
     name: "fake_quant_with_min_max_vars_gradient"
-    argspec: "args=[\'gradients\', \'inputs\', \'min\', \'max\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+    argspec: "args=[\'gradients\', \'inputs\', \'min\', \'max\', \'num_bits\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
   }
   member_method {
     name: "fake_quant_with_min_max_vars_per_channel"
-    argspec: "args=[\'inputs\', \'min\', \'max\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+    argspec: "args=[\'inputs\', \'min\', \'max\', \'num_bits\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
   }
   member_method {
     name: "fake_quant_with_min_max_vars_per_channel_gradient"
-    argspec: "args=[\'gradients\', \'inputs\', \'min\', \'max\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+    argspec: "args=[\'gradients\', \'inputs\', \'min\', \'max\', \'num_bits\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
   }
   member_method {
     name: "fft"
diff --git a/tensorflow/tools/api/golden/tensorflow.train.-cluster-def.pbtxt b/tensorflow/tools/api/golden/tensorflow.train.-cluster-def.pbtxt
index feb73bd7d4f4b066e6112d3f722d4707cf9e53ce..93ff856b09de15f12954bb11802a935b82c1d278 100644
--- a/tensorflow/tools/api/golden/tensorflow.train.-cluster-def.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.train.-cluster-def.pbtxt
@@ -1,6 +1,6 @@
 path: "tensorflow.train.ClusterDef"
 tf_class {
-  is_instance: "<class \'tensorflow.core.protobuf.tensorflow_server_pb2.ClusterDef\'>"
+  is_instance: "<class \'tensorflow.core.protobuf.cluster_pb2.ClusterDef\'>"
   is_instance: "<type \'google.protobuf.pyext._message.CMessage\'>"
   member {
     name: "DESCRIPTOR"
diff --git a/tensorflow/tools/api/golden/tensorflow.train.-job-def.-tasks-entry.pbtxt b/tensorflow/tools/api/golden/tensorflow.train.-job-def.-tasks-entry.pbtxt
index 2d7fcbe5456ffd87fb15b896e07fa223786ede24..ac6d81541a43e934ebd131afe07be0bd6e427a7b 100644
--- a/tensorflow/tools/api/golden/tensorflow.train.-job-def.-tasks-entry.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.train.-job-def.-tasks-entry.pbtxt
@@ -1,6 +1,6 @@
 path: "tensorflow.train.JobDef.TasksEntry"
 tf_class {
-  is_instance: "<class \'tensorflow.core.protobuf.tensorflow_server_pb2.TasksEntry\'>"
+  is_instance: "<class \'tensorflow.core.protobuf.cluster_pb2.TasksEntry\'>"
   is_instance: "<type \'google.protobuf.pyext._message.CMessage\'>"
   member {
     name: "DESCRIPTOR"
diff --git a/tensorflow/tools/api/golden/tensorflow.train.-job-def.pbtxt b/tensorflow/tools/api/golden/tensorflow.train.-job-def.pbtxt
index fc5b76341d23d3c75a3f6c4cd49ec6eb7e48dde0..ce34537fa13b92f7900128d769ac3161d2b4d287 100644
--- a/tensorflow/tools/api/golden/tensorflow.train.-job-def.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.train.-job-def.pbtxt
@@ -1,6 +1,6 @@
 path: "tensorflow.train.JobDef"
 tf_class {
-  is_instance: "<class \'tensorflow.core.protobuf.tensorflow_server_pb2.JobDef\'>"
+  is_instance: "<class \'tensorflow.core.protobuf.cluster_pb2.JobDef\'>"
   is_instance: "<type \'google.protobuf.pyext._message.CMessage\'>"
   member {
     name: "DESCRIPTOR"
diff --git a/tensorflow/tools/api/golden/tensorflow.train.-scaffold.pbtxt b/tensorflow/tools/api/golden/tensorflow.train.-scaffold.pbtxt
index 21234fe73913a79e040c22365707cbdfb15556f3..62b956c5ef7dc54e92431f25ec948e341c0e1f24 100644
--- a/tensorflow/tools/api/golden/tensorflow.train.-scaffold.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.train.-scaffold.pbtxt
@@ -36,7 +36,7 @@ tf_class {
   }
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'init_op\', \'init_feed_dict\', \'init_fn\', \'ready_op\', \'ready_for_local_init_op\', \'local_init_op\', \'summary_op\', \'saver\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\'], "
+    argspec: "args=[\'self\', \'init_op\', \'init_feed_dict\', \'init_fn\', \'ready_op\', \'ready_for_local_init_op\', \'local_init_op\', \'summary_op\', \'saver\', \'copy_from_scaffold\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "finalize"
diff --git a/tensorflow/tools/benchmark/benchmark_model.cc b/tensorflow/tools/benchmark/benchmark_model.cc
index c2e41e49187fbbb118dc05f85d3aae7e0d76efdb..8c480f8d9dbecb04a24bdf4fa763c5df5a39ef15 100644
--- a/tensorflow/tools/benchmark/benchmark_model.cc
+++ b/tensorflow/tools/benchmark/benchmark_model.cc
@@ -334,8 +334,8 @@ int Main(int argc, char** argv) {
       Flag("show_memory", &show_memory, "whether to list stats by memory used"),
       Flag("memory_limit", &memory_limit,
            "how many items to show by memory used"),
-      Flag("show_type", &show_time, "whether to list stats by op type"),
-      Flag("show_summary", &show_time,
+      Flag("show_type", &show_type, "whether to list stats by op type"),
+      Flag("show_summary", &show_summary,
            "whether to show a summary of the stats"),
       Flag("show_flops", &show_flops, "whether to estimate the model's FLOPs"),
       Flag("warmup_runs", &warmup_runs, "how many runs to initialize model"),
diff --git a/tensorflow/tools/ci_build/README.md b/tensorflow/tools/ci_build/README.md
index 1fa618e698fa2205dfdbe00168b4a5f3c0fe0a08..ad83669950f7b284860f84ce87855fe3e3b3e0a9 100644
--- a/tensorflow/tools/ci_build/README.md
+++ b/tensorflow/tools/ci_build/README.md
@@ -34,7 +34,11 @@ run continuous integration [ci.tensorflow.org](https://ci.tensorflow.org).
    ```bash
    tensorflow/tools/ci_build/ci_build.sh CPU bazel test //tensorflow/...
    ```
-
+   If you are using the Docker image on Windows or OS X, the Docker VM's default
+   memory limit may be too low to build TensorFlow. This can result in
+   strange-looking errors, e.g. the compilation may fail with `gcc: internal
+   compiler error: Killed (program cc1plus)`. Try increasing the memory limit in
+   the Docker preferences.
 
 
 ## Jobs
diff --git a/tensorflow/tools/ci_build/ci_parameterized_build.sh b/tensorflow/tools/ci_build/ci_parameterized_build.sh
index fd3eff4ee6c9481974ee0bab9d544db40cec7579..dfaf50eb4f9d1136c0939525925ac33e401d5b8c 100755
--- a/tensorflow/tools/ci_build/ci_parameterized_build.sh
+++ b/tensorflow/tools/ci_build/ci_parameterized_build.sh
@@ -333,21 +333,35 @@ fi
 OPT_FLAG=$(str_strip "${OPT_FLAG}")
 
 
-# Filter out benchmark tests if this is not a benchmarks job
+# 1) Filter out benchmark tests if this is not a benchmarks job;
+# 2) Filter out tests with the "nomac" tag if the build is on Mac OS X.
 EXTRA_ARGS=""
+IS_MAC=0
+if [[ "$(uname)" == "Darwin" ]]; then
+  IS_MAC=1
+fi
 if [[ "${TF_BUILD_APPEND_ARGUMENTS}" == *"--test_tag_filters="* ]]; then
   ITEMS=(${TF_BUILD_APPEND_ARGUMENTS})
 
   for ITEM in "${ITEMS[@]}"; do
-    if [[ ${ITEM} == *"--test_tag_filters="* ]] &&
-      [[ ${ITEM} != *"benchmark-test"* ]]; then
-      EXTRA_ARGS="${EXTRA_ARGS} ${ITEM},-benchmark-test"
+    if [[ ${ITEM} == *"--test_tag_filters="* ]]; then
+      NEW_ITEM="${ITEM}"
+      if [[ ${NEW_ITEM} != *"benchmark-test"* ]]; then
+        NEW_ITEM="${NEW_ITEM},-benchmark-test"
+      fi
+      if [[ ${IS_MAC} == "1" ]] && [[ ${NEW_ITEM} != *"nomac"* ]]; then
+        NEW_ITEM="${NEW_ITEM},-nomac"
+      fi
+      EXTRA_ARGS="${EXTRA_ARGS} ${NEW_ITEM}"
     else
       EXTRA_ARGS="${EXTRA_ARGS} ${ITEM}"
     fi
   done
 else
   EXTRA_ARGS="${TF_BUILD_APPEND_ARGUMENTS} --test_tag_filters=-benchmark-test"
+  if [[ ${IS_MAC} == "1" ]]; then
+    EXTRA_ARGS="${EXTRA_ARGS},-nomac"
+  fi
 fi
 
 # For any "tool" dependencies in genrules, Bazel will build them for host
diff --git a/tensorflow/tools/ci_build/ci_sanity.sh b/tensorflow/tools/ci_build/ci_sanity.sh
index 9ecf16c46f12f0193e4b4c4577e1743d724551e1..fd2874df91e03853648ccca6a0d7b3520da74d55 100755
--- a/tensorflow/tools/ci_build/ci_sanity.sh
+++ b/tensorflow/tools/ci_build/ci_sanity.sh
@@ -92,6 +92,8 @@ do_pylint() {
   ERROR_WHITELIST="^tensorflow/python/framework/function_test\.py.*\[E1123.*noinline "\
 "^tensorflow/python/platform/default/_gfile\.py.*\[E0301.*non-iterator "\
 "^tensorflow/python/platform/default/_googletest\.py.*\[E0102.*function\salready\sdefined "\
+"^tensorflow/python/feature_column/feature_column_test\.py.*\[E0110.*abstract-class-instantiated "\
+"^tensorflow/contrib/layers/python/layers/feature_column\.py.*\[E0110.*abstract-class-instantiated "\
 "^tensorflow/python/platform/gfile\.py.*\[E0301.*non-iterator"
 
   echo "ERROR_WHITELIST=\"${ERROR_WHITELIST}\""
diff --git a/tensorflow/tools/ci_build/install/install_auditwheel.sh b/tensorflow/tools/ci_build/install/install_auditwheel.sh
index 2538a393d3f6ef27e96e9ac863d2e1dba1c8f930..e6f6124d56774a43e521d6529695f5abc161dabb 100755
--- a/tensorflow/tools/ci_build/install/install_auditwheel.sh
+++ b/tensorflow/tools/ci_build/install/install_auditwheel.sh
@@ -16,7 +16,7 @@
 
 set -e
 
-sudo pip3 install auditwheel
+sudo pip3 install auditwheel==1.5.0
 
 set +e
 patchelf_location=$(which patchelf)
diff --git a/tensorflow/tools/ci_build/install/install_deb_packages.sh b/tensorflow/tools/ci_build/install/install_deb_packages.sh
index 6b160bbe032fd7d503e50c6c7d61fbd0c198ac8b..da1f2199d0daf5cfe3e9d94165e3af6704c58050 100755
--- a/tensorflow/tools/ci_build/install/install_deb_packages.sh
+++ b/tensorflow/tools/ci_build/install/install_deb_packages.sh
@@ -46,6 +46,7 @@ apt-get install -y --no-install-recommends \
     git \
     libcurl4-openssl-dev \
     libtool \
+    mlocate \
     openjdk-8-jdk \
     openjdk-8-jre-headless \
     pkg-config \
@@ -63,6 +64,9 @@ apt-get install -y --no-install-recommends \
     zip \
     zlib1g-dev
 
+# populate the database
+updatedb
+
 if [[ "$1" != "--without_cmake" ]]; then
   apt-get install -y --no-install-recommends \
     cmake
diff --git a/tensorflow/tools/ci_build/linux/cmake/run.sh b/tensorflow/tools/ci_build/linux/cmake/run.sh
old mode 100644
new mode 100755
diff --git a/tensorflow/tools/ci_build/osx/cpu/run_py2_cc_core.sh b/tensorflow/tools/ci_build/osx/cpu/run_py2_cc_core.sh
new file mode 100755
index 0000000000000000000000000000000000000000..e5f4a22f7ade7eb5c260a7a486cd5d3fa75d5859
--- /dev/null
+++ b/tensorflow/tools/ci_build/osx/cpu/run_py2_cc_core.sh
@@ -0,0 +1,40 @@
+#!/usr/bin/env bash
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# ==============================================================================
+
+set -e
+set -x
+
+N_JOBS=$(sysctl -n hw.ncpu)
+N_JOBS=$((N_JOBS+1))
+
+echo ""
+echo "Bazel will use ${N_JOBS} concurrent job(s)."
+echo ""
+
+# Run configure.
+export TF_NEED_GCP=0
+export TF_NEED_HDFS=0
+export TF_NEED_CUDA=0
+export PYTHON_BIN_PATH=$(which python2)
+yes "" | ./configure
+which bazel
+bazel test --test_tag_filters=-gpu,-benchmark-test,-nomac \
+    --test_timeout 300,450,1200,3600 \
+    --test_size_filters=small,medium \
+    --jobs=${N_JOBS} --build_tests_only --test_output=errors -k -- \
+    //tensorflow/... -//tensorflow/compiler/... -//tensorflow/contrib/... \
+    -//tensorflow/tensorboard/...
diff --git a/tensorflow/tools/ci_build/osx/libtensorflow_cpu.sh b/tensorflow/tools/ci_build/osx/libtensorflow_cpu.sh
index 762c53172588246acf0350125e06799bd960269c..d90a1b905d91415dda576c5dc71df2f41502fa9d 100755
--- a/tensorflow/tools/ci_build/osx/libtensorflow_cpu.sh
+++ b/tensorflow/tools/ci_build/osx/libtensorflow_cpu.sh
@@ -28,6 +28,7 @@ export TF_NEED_GCP=0
 export TF_NEED_HDFS=0
 export TF_NEED_CUDA=0
 export TF_NEED_OPENCL=0
+export TF_NEED_MKL=0
 export COMPUTECPP_PATH="/usr/local"
 
 export PATH="/usr/local/bin:/usr/bin:/bin:/usr/sbin:/sbin"
diff --git a/tensorflow/tools/ci_build/osx/libtensorflow_gpu.sh b/tensorflow/tools/ci_build/osx/libtensorflow_gpu.sh
index 1da5e8c2bf34cb8ae713b4bcde22f629ba47f877..79973647c11fffb1907b7f39fe5f43a3fb450b5b 100755
--- a/tensorflow/tools/ci_build/osx/libtensorflow_gpu.sh
+++ b/tensorflow/tools/ci_build/osx/libtensorflow_gpu.sh
@@ -29,6 +29,7 @@ export PYTHON_BIN_PATH="/usr/bin/python"
 export TF_NEED_GCP=0
 export TF_NEED_HDFS=0
 export TF_NEED_OPENCL=0
+export TF_NEED_MKL=0
 export COMPUTECPP_PATH="/usr/local"
 
 export PATH="/usr/local/cuda/bin:/usr/local/bin:/usr/bin:/bin:/usr/sbin:/sbin"
diff --git a/tensorflow/tools/ci_build/windows/bazel/bazel_test_lib.sh b/tensorflow/tools/ci_build/windows/bazel/bazel_test_lib.sh
index 1488e8d78c85053e49accac4cde3f72691f361f6..f76c1add242539bd02287c26c2867ffb2fc85123 100644
--- a/tensorflow/tools/ci_build/windows/bazel/bazel_test_lib.sh
+++ b/tensorflow/tools/ci_build/windows/bazel/bazel_test_lib.sh
@@ -175,6 +175,9 @@ function run_configure_for_cpu_build {
   if [ -z "$CC_OPT_FLAGS" ]; then
     export CC_OPT_FLAGS="-march=native"
   fi
+  if [ -z "$TF_NEED_MKL" ]; then
+    export TF_NEED_MKL=0
+  fi
   echo "" | ./configure
 }
 
diff --git a/tensorflow/tools/ci_build/windows/bazel/common_env.sh b/tensorflow/tools/ci_build/windows/bazel/common_env.sh
index b9937475219f4dd334121fde13b3be21b50bac3e..e4e386171028fe8d529fc6416de2df76ad093d84 100644
--- a/tensorflow/tools/ci_build/windows/bazel/common_env.sh
+++ b/tensorflow/tools/ci_build/windows/bazel/common_env.sh
@@ -34,6 +34,7 @@ export BAZEL_SH="C:/tools/msys64/usr/bin/bash"
 
 # Set Python path for ./configure
 export PYTHON_BIN_PATH="C:/Program Files/Anaconda3/python"
+export PYTHON_LIB_PATH="C:/Program Files/Anaconda3/lib/site-packages"
 
 # Set Python path for cc_configure.bzl
 export BAZEL_PYTHON="C:/Program Files/Anaconda3/python"
diff --git a/tensorflow/tools/common/public_api.py b/tensorflow/tools/common/public_api.py
index 837f11f690e3f9288d7e4d61e0faf51a11e794ea..cab3b2ff6a0d39938d915c53aff657e43c065c99 100644
--- a/tensorflow/tools/common/public_api.py
+++ b/tensorflow/tools/common/public_api.py
@@ -38,7 +38,7 @@ class PublicAPIVisitor(object):
     self._visitor = visitor
 
     # Modules/classes we do not want to descend into if we hit them. Usually,
-    # sytem modules exposed through platforms for compatibility reasons.
+    # system modules exposed through platforms for compatibility reasons.
     # Each entry maps a module path to a name to ignore in traversal.
     self._do_not_descend_map = {
         '': [
diff --git a/tensorflow/tools/dist_test/server/BUILD b/tensorflow/tools/dist_test/server/BUILD
index 9d008ec9ce5969b2cb0f61c958a606c04dda576d..865af8dd7b2af686dad852f35187f2d226533596 100644
--- a/tensorflow/tools/dist_test/server/BUILD
+++ b/tensorflow/tools/dist_test/server/BUILD
@@ -9,7 +9,7 @@ exports_files(["LICENSE"])
 
 load("//tensorflow:tensorflow.bzl", "py_test")
 
-py_library(
+py_binary(
     name = "grpc_tensorflow_server",
     srcs = [
         "grpc_tensorflow_server.py",
diff --git a/tensorflow/tools/dist_test/server/grpc_tensorflow_server.py b/tensorflow/tools/dist_test/server/grpc_tensorflow_server.py
old mode 100755
new mode 100644
index 2d774577b6d93ef7712d3595ab6592a5a701b14d..bd6700a0b1f43208b317e14953c1110cbe39248b
--- a/tensorflow/tools/dist_test/server/grpc_tensorflow_server.py
+++ b/tensorflow/tools/dist_test/server/grpc_tensorflow_server.py
@@ -36,6 +36,7 @@ from __future__ import print_function
 import argparse
 import sys
 
+from tensorflow.core.protobuf import config_pb2
 from tensorflow.core.protobuf import tensorflow_server_pb2
 from tensorflow.python.platform import app
 from tensorflow.python.training import server_lib
@@ -103,8 +104,11 @@ def main(unused_args):
     raise ValueError("Invalid task_id: %d" % FLAGS.task_id)
   server_def.task_index = FLAGS.task_id
 
+  config = config_pb2.ConfigProto(gpu_options=config_pb2.GPUOptions(
+      per_process_gpu_memory_fraction=FLAGS.gpu_memory_fraction))
+
   # Create GRPC Server instance
-  server = server_lib.Server(server_def)
+  server = server_lib.Server(server_def, config=config)
 
   # join() is blocking, unlike start()
   server.join()
@@ -137,6 +141,11 @@ if __name__ == "__main__":
       default=0,
       help="Task index, e.g., 0"
   )
+  parser.add_argument(
+      "--gpu_memory_fraction",
+      type=float,
+      default=1.0,
+      help="Fraction of GPU memory allocated",)
   parser.add_argument(
       "--verbose",
       type="bool",
@@ -145,5 +154,6 @@ if __name__ == "__main__":
       default=False,
       help="Verbose mode"
   )
+
   FLAGS, unparsed = parser.parse_known_args()
   app.run(main=main, argv=[sys.argv[0]] + unparsed)
diff --git a/tensorflow/tools/docker/Dockerfile.devel b/tensorflow/tools/docker/Dockerfile.devel
index bfac54c601949e906211cb6802b3d59d256f6453..c801ceff9387b1a896a979dd71292816392d0534 100644
--- a/tensorflow/tools/docker/Dockerfile.devel
+++ b/tensorflow/tools/docker/Dockerfile.devel
@@ -17,6 +17,8 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
         unzip \
         zip \
         zlib1g-dev \
+        openjdk-8-jdk \
+        openjdk-8-jre-headless \
         && \
     apt-get clean && \
     rm -rf /var/lib/apt/lists/*
@@ -46,18 +48,6 @@ COPY run_jupyter.sh /
 
 # Set up Bazel.
 
-# We need to add a custom PPA to pick up JDK8, since trusty doesn't
-# have an openjdk8 backport.  openjdk-r is maintained by a reliable contributor:
-# Matthias Klose (https://launchpad.net/~doko).  It will do until
-# we either update the base image beyond 14.04 or openjdk-8 is
-# finally backported to trusty; see e.g.
-#   https://bugs.launchpad.net/trusty-backports/+bug/1368094
-RUN add-apt-repository -y ppa:openjdk-r/ppa && \
-    apt-get update && \
-    apt-get install -y --no-install-recommends openjdk-8-jdk openjdk-8-jre-headless && \
-    apt-get clean && \
-    rm -rf /var/lib/apt/lists/*
-
 # Running bazel inside a `docker build` command causes trouble, cf:
 #   https://github.com/bazelbuild/bazel/issues/134
 # The easiest solution is to set up a bazelrc file forcing --batch.
@@ -92,7 +82,8 @@ WORKDIR /tensorflow
 ENV CI_BUILD_PYTHON python
 
 RUN tensorflow/tools/ci_build/builds/configured CPU \
-    bazel build -c opt tensorflow/tools/pip_package:build_pip_package && \
+    bazel build -c opt --cxxopt="-D_GLIBCXX_USE_CXX11_ABI=0" \
+        tensorflow/tools/pip_package:build_pip_package && \
     bazel-bin/tensorflow/tools/pip_package/build_pip_package /tmp/pip && \
     pip --no-cache-dir install --upgrade /tmp/pip/tensorflow-*.whl && \
     rm -rf /tmp/pip && \
diff --git a/tensorflow/tools/docker/Dockerfile.devel-gpu b/tensorflow/tools/docker/Dockerfile.devel-gpu
index 7726cbdfbf84acf57fb850f1eb44c5593dcc869b..24350c507e7f9fb00954293989e70557ea02a192 100644
--- a/tensorflow/tools/docker/Dockerfile.devel-gpu
+++ b/tensorflow/tools/docker/Dockerfile.devel-gpu
@@ -17,6 +17,8 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
         unzip \
         zip \
         zlib1g-dev \
+        openjdk-8-jdk \
+        openjdk-8-jre-headless \
         && \
     apt-get clean && \
     rm -rf /var/lib/apt/lists/*
@@ -46,18 +48,6 @@ COPY run_jupyter.sh /
 
 # Set up Bazel.
 
-# We need to add a custom PPA to pick up JDK8, since trusty doesn't
-# have an openjdk8 backport.  openjdk-r is maintained by a reliable contributor:
-# Matthias Klose (https://launchpad.net/~doko).  It will do until
-# we either update the base image beyond 14.04 or openjdk-8 is
-# finally backported to trusty; see e.g.
-#   https://bugs.launchpad.net/trusty-backports/+bug/1368094
-RUN add-apt-repository -y ppa:openjdk-r/ppa && \
-    apt-get update && \
-    apt-get install -y --no-install-recommends openjdk-8-jdk openjdk-8-jre-headless && \
-    apt-get clean && \
-    rm -rf /var/lib/apt/lists/*
-
 # Running bazel inside a `docker build` command causes trouble, cf:
 #   https://github.com/bazelbuild/bazel/issues/134
 # The easiest solution is to set up a bazelrc file forcing --batch.
@@ -92,7 +82,8 @@ ENV TF_NEED_CUDA 1
 ENV TF_CUDA_COMPUTE_CAPABILITIES=3.0,3.5,5.2,6.0,6.1
 
 RUN tensorflow/tools/ci_build/builds/configured GPU \
-    bazel build -c opt --config=cuda tensorflow/tools/pip_package:build_pip_package && \
+    bazel build -c opt --config=cuda --cxxopt="-D_GLIBCXX_USE_CXX11_ABI=0" \
+        tensorflow/tools/pip_package:build_pip_package && \
     bazel-bin/tensorflow/tools/pip_package/build_pip_package /tmp/pip && \
     pip --no-cache-dir install --upgrade /tmp/pip/tensorflow-*.whl && \
     rm -rf /tmp/pip && \
diff --git a/tensorflow/tools/docs/doc_generator_visitor.py b/tensorflow/tools/docs/doc_generator_visitor.py
index 119305bece345ad5f0c745ec582a6ae236f6e118..8f7b91fa752f9e594176a6fcb02da1fc8f9bc103 100644
--- a/tensorflow/tools/docs/doc_generator_visitor.py
+++ b/tensorflow/tools/docs/doc_generator_visitor.py
@@ -170,7 +170,7 @@ class DocGeneratorVisitor(object):
     master names to a lexicographically sorted list of all aliases for that name
     (incl. the master name).
 
-    All these are computed and set as fields if they haven't aready.
+    All these are computed and set as fields if they haven't already.
     """
     if self._reverse_index is not None:
       return
diff --git a/tensorflow/tools/docs/generate_lib.py b/tensorflow/tools/docs/generate_lib.py
index 8b53185943707e86b5d8345536be00691a8b4091..3494c7f8c5a9d71967fea0137a2f81a38cfdbd38 100644
--- a/tensorflow/tools/docs/generate_lib.py
+++ b/tensorflow/tools/docs/generate_lib.py
@@ -64,8 +64,16 @@ def write_docs(output_dir, parser_config, yaml_toc):
     parser_config: A `parser.ParserConfig` object, containing all the necessary
       indices.
     yaml_toc: Set to `True` to generate a "_toc.yaml" file.
+
+  Raises:
+    ValueError: if `output_dir` is not an absolute path
   """
   # Make output_dir.
+  if not os.path.isabs(output_dir):
+    raise ValueError(
+        "'output_dir' must be an absolute path.\n"
+        "    output_dir='%s'" % output_dir)
+
   try:
     if not os.path.exists(output_dir):
       os.makedirs(output_dir)
@@ -148,7 +156,8 @@ def write_docs(output_dir, parser_config, yaml_toc):
                 + '\n')
 
         symbols_in_module = module_children.get(module, [])
-        symbols_in_module.sort(key=lambda a: a.upper())
+        # Sort case-insensitive, if equal sort case sensitive (upper first)
+        symbols_in_module.sort(key=lambda a: (a.upper(), a))
 
         for full_name in symbols_in_module:
           f.write('    - title: ' + full_name[len(module) + 1:] + '\n'
@@ -189,7 +198,6 @@ def _get_default_do_not_descend_map():
           'tensor_forest',
           'tensorboard',
           'testing',
-          'training',
           'tfprof',
       ],
       'contrib.bayesflow': [
@@ -257,6 +265,14 @@ class _DocInfo(object):
 def build_doc_index(src_dir):
   """Build an index from a keyword designating a doc to _DocInfo objects."""
   doc_index = {}
+  if not os.path.isabs(src_dir):
+    raise ValueError("'src_dir' must be an absolute path.\n"
+                     "    src_dir='%s'" % src_dir)
+
+  if not os.path.exists(src_dir):
+    raise ValueError("'src_dir' path must exist.\n"
+                     "    src_dir='%s'" % src_dir)
+
   for dirpath, _, filenames in os.walk(src_dir):
     suffix = os.path.relpath(path=dirpath, start=src_dir)
     for base_name in filenames:
diff --git a/tensorflow/tools/docs/pretty_docs.py b/tensorflow/tools/docs/pretty_docs.py
index 824f46170ed0e41103dfa5e689bb0236a69bdb6e..918d475e0d7cc186bf03585455e5f0115de61b82 100644
--- a/tensorflow/tools/docs/pretty_docs.py
+++ b/tensorflow/tools/docs/pretty_docs.py
@@ -213,6 +213,14 @@ def _build_module_page(page_info):
 
 def _build_signature(obj_info):
   """Returns a md code block showing the function signature."""
+  # Special case tf.range, since it has an optional first argument
+  if obj_info.full_name == 'tf.range':
+    return (
+        '``` python\n'
+        "range(limit, delta=1, dtype=None, name='range')\n"
+        "range(start, limit, delta=1, dtype=None, name='range')\n"
+        '```\n\n')
+
   signature_template = '\n'.join([
       '``` python',
       '{name}({sig})',
@@ -230,7 +238,7 @@ def _build_signature(obj_info):
 
 
 def _build_compatibility(compatibility):
-  """Return the compatability section as an md string."""
+  """Return the compatibility section as an md string."""
   parts = []
   sorted_keys = sorted(compatibility.keys())
   for key in sorted_keys:
diff --git a/tensorflow/tools/docs/py_guide_parser.py b/tensorflow/tools/docs/py_guide_parser.py
index 3ca6d11b8478c8fcfc00e5f4dc3a93ee0f9d6a15..245643cb32e9a73a72838bdc3f2a4b8ab7c839d8 100644
--- a/tensorflow/tools/docs/py_guide_parser.py
+++ b/tensorflow/tools/docs/py_guide_parser.py
@@ -34,7 +34,7 @@ def md_files_in_dir(py_guide_src_dir):
 class PyGuideParser(object):
   """Simple parsing of a guide .md file.
 
-  Decendents can override the process_*() functions (called by process())
+  Descendants can override the process_*() functions (called by process())
   to either record infromation from the guide, or call replace_line()
   to affect the return value of process().
   """
diff --git a/tensorflow/tools/graph_transforms/README.md b/tensorflow/tools/graph_transforms/README.md
index 1fbd636db3326df76148205aee71a695ce8eae10..7e8c51efe6ac62fe698e44dbe9fd15fc7cb8dfe3 100644
--- a/tensorflow/tools/graph_transforms/README.md
+++ b/tensorflow/tools/graph_transforms/README.md
@@ -81,10 +81,10 @@ bazel-bin/tensorflow/tools/graph_transforms/transform_graph \
 --out_graph=optimized_inception_graph.pb \
 --inputs='Mul:0' \
 --outputs='softmax:0' \
---transforms='\
-strip_unused_nodes(type=float, shape="1,299,299,3") \
-remove_nodes(op=Identity, op=CheckNumerics) \
-fold_old_batch_norms \
+--transforms='
+strip_unused_nodes(type=float, shape="1,299,299,3")
+remove_nodes(op=Identity, op=CheckNumerics)
+fold_old_batch_norms
 '
 ```
 
@@ -94,7 +94,10 @@ transforms to modify the graph with. The transforms are given as a list of
 names, and can each have arguments themselves. These transforms define the
 pipeline of modifications that are applied in order to produce the output.
 Sometimes you need some transforms to happen before others, and the ordering
-within the list lets you specify which happen first.
+within the list lets you specify which happen first. 
+Note that the optimization 
+`remove_nodes(op=Identity, op=CheckNumerics)` will break the model with control 
+flow operations, such as `tf.cond`, `tf.map_fn`, and `tf.while`.
 
 ## Inspecting Graphs
 
@@ -212,7 +215,7 @@ bazel-bin/tensorflow/tools/graph_transforms/transform_graph \
 --out_graph=optimized_inception_graph.pb \
 --inputs='Mul' \
 --outputs='softmax' \
---transforms='\
+--transforms='
   strip_unused_nodes(type=float, shape="1,299,299,3")
   fold_constants(ignore_errors=true)
   fold_batch_norms
@@ -428,12 +431,11 @@ graph:
 ```bash
 bazel build tensorflow/tools/graph_transforms:transform_graph
 bazel-bin/tensorflow/tools/graph_transforms/transform_graph \
---logtostderr \
 --in_graph=/tmp/quantized_inception.pb \
 --out_graph=/tmp/logged_quantized_inception.pb \
 --inputs=Mul \
 --outputs=softmax \
---transforms='\
+--transforms='
 insert_logging(op=RequantizationRange, show_name=true, message="__requant_min_max:")\
 '
 ```
@@ -447,12 +449,10 @@ log:
 bazel build tensorflow/examples/label_image:label_image
 bazel-bin/tensorflow/examples/label_image/label_image \
 --image=${HOME}/Downloads/grace_hopper.jpg \
---logtostderr \
 --input_layer=Mul \
 --output_layer=softmax \
 --graph=/tmp/logged_quantized_inception.pb \
 --labels=${HOME}/Downloads/imagenet_comp_graph_label_strings.txt \
---logtostderr \
 2>/tmp/min_max_log_small.txt
 ```
 
diff --git a/tensorflow/tools/graph_transforms/fold_constants_lib.cc b/tensorflow/tools/graph_transforms/fold_constants_lib.cc
index 466e61b42dc0e86aa1223d746222852ea32b7c76..6f44da7ee0fa25ec0ff1676def507db94e092b88 100644
--- a/tensorflow/tools/graph_transforms/fold_constants_lib.cc
+++ b/tensorflow/tools/graph_transforms/fold_constants_lib.cc
@@ -152,9 +152,9 @@ Status FoldConstants(const GraphDef& input_graph_def,
       &input_graph, context.input_names, context.output_names, {},
       device_attributes, false /* use_function_convention */, &metadata));
   bool was_mutated;
-  TF_RETURN_IF_ERROR(DoConstantFoldingWithStatus(
-      ConstantFoldingOptions(), nullptr, Env::Default(), nullptr, &input_graph,
-      &was_mutated));
+  TF_RETURN_IF_ERROR(ConstantFold(ConstantFoldingOptions(), nullptr,
+                                  Env::Default(), nullptr, &input_graph,
+                                  &was_mutated));
   GraphDef folded_graph_def;
   input_graph.ToGraphDef(&folded_graph_def);
   GraphDef send_recvs_replaced;
diff --git a/tensorflow/tools/graph_transforms/fold_constants_test.cc b/tensorflow/tools/graph_transforms/fold_constants_test.cc
index dac13f5c3214044d8829e19f9735ff7b600d2f22..902f92952a6405ad6eed3f61364f6e127bfda8cb 100644
--- a/tensorflow/tools/graph_transforms/fold_constants_test.cc
+++ b/tensorflow/tools/graph_transforms/fold_constants_test.cc
@@ -72,9 +72,9 @@ class ConstantFoldingTest : public ::testing::Test {
                         {"output_expect_remains"});
   }
 
-  void TestConstantFolding(const GraphDef graph_def,
+  void TestConstantFolding(const GraphDef& graph_def,
                            std::vector<std::pair<string, Tensor> > inputs,
-                           std::vector<string> outputs) {
+                           const std::vector<string>& outputs) {
     std::unique_ptr<tensorflow::Session> unfolded_session(
         tensorflow::NewSession(tensorflow::SessionOptions()));
     TF_ASSERT_OK(unfolded_session->Create(graph_def));
diff --git a/tensorflow/tools/graph_transforms/summarize_graph_main.cc b/tensorflow/tools/graph_transforms/summarize_graph_main.cc
index f45dfbba0ced3546c0eae498f08b1dd25d90c80e..e49257804575be11a6e9a7ddb223cece2ced9a18 100644
--- a/tensorflow/tools/graph_transforms/summarize_graph_main.cc
+++ b/tensorflow/tools/graph_transforms/summarize_graph_main.cc
@@ -50,8 +50,8 @@ void PrintNodeInfo(const NodeDef* node) {
   std::cout << ", shape=" << shape.DebugString() << ") ";
 }
 
-void PrintBenchmarkUsage(const std::vector<const NodeDef*> placeholders,
-                         const std::vector<const NodeDef*> variables,
+void PrintBenchmarkUsage(const std::vector<const NodeDef*>& placeholders,
+                         const std::vector<const NodeDef*>& variables,
                          const std::vector<const NodeDef*> outputs,
                          const string& graph_path) {
   std::vector<const NodeDef*> all_inputs(placeholders);
@@ -94,7 +94,6 @@ void PrintBenchmarkUsage(const std::vector<const NodeDef*> placeholders,
   std::cout << "bazel run tensorflow/tools/benchmark:benchmark_model --";
   std::cout << " --graph=" << graph_path;
   std::cout << " --show_flops";
-  std::cout << " --logtostderr";
   std::cout << " --input_layer=" << input_layer_value;
   std::cout << " --input_layer_type=" << input_layer_type_value;
   std::cout << " --input_layer_shape=" << input_layer_shape_value;
@@ -102,7 +101,18 @@ void PrintBenchmarkUsage(const std::vector<const NodeDef*> placeholders,
   std::cout << std::endl;
 }
 
-Status SummarizeGraph(const GraphDef& graph, const string& graph_path) {
+Status PrintStructure(const GraphDef& graph) {
+  GraphDef sorted_graph;
+  TF_RETURN_IF_ERROR(SortByExecutionOrder(graph, &sorted_graph));
+  for (const NodeDef& node : sorted_graph.node()) {
+    std::cout << node.name() << " (" << node.op() << "): ["
+              << str_util::Join(node.input(), ", ") << "]" << std::endl;
+  }
+  return Status::OK();
+}
+
+Status SummarizeGraph(const GraphDef& graph, const string& graph_path,
+                      bool print_structure) {
   std::vector<const NodeDef*> placeholders;
   std::vector<const NodeDef*> variables;
   for (const NodeDef& node : graph.node()) {
@@ -233,13 +243,20 @@ Status SummarizeGraph(const GraphDef& graph, const string& graph_path) {
 
   PrintBenchmarkUsage(placeholders, variables, outputs, graph_path);
 
+  if (print_structure) {
+    TF_RETURN_IF_ERROR(PrintStructure(graph));
+  }
+
   return Status::OK();
 }
 
 int ParseFlagsAndSummarizeGraph(int argc, char* argv[]) {
   string in_graph = "";
+  bool print_structure = false;
   std::vector<Flag> flag_list = {
       Flag("in_graph", &in_graph, "input graph file name"),
+      Flag("print_structure", &print_structure,
+           "whether to print the network connections of the graph"),
   };
   string usage = Flags::Usage(argv[0], flag_list);
 
@@ -269,7 +286,8 @@ int ParseFlagsAndSummarizeGraph(int argc, char* argv[]) {
     return -1;
   }
 
-  Status summarize_result = SummarizeGraph(graph_def, in_graph);
+  Status summarize_result =
+      SummarizeGraph(graph_def, in_graph, print_structure);
   if (!summarize_result.ok()) {
     LOG(ERROR) << summarize_result.error_message() << "\n" << usage;
     return -1;
diff --git a/tensorflow/tools/graph_transforms/transform_graph.cc b/tensorflow/tools/graph_transforms/transform_graph.cc
index a1c4073fb99e264b4c026a55db8c6510ef072996..b8bf2dc0901425bbae3a3e9245e77e0e8dc6d2b4 100644
--- a/tensorflow/tools/graph_transforms/transform_graph.cc
+++ b/tensorflow/tools/graph_transforms/transform_graph.cc
@@ -48,6 +48,11 @@ Status ParseTransformParameters(const string& transforms_string,
       func_parameters.clear();
       // Eat up any leading spaces.
       Scanner(remaining).AnySpace().GetResult(&remaining, &match);
+      if (remaining.empty()) {
+        // Nothing remains after consuming trailing spaces.
+        // Consumed all transform parameter string without errors.
+        return Status::OK();
+      }
       // See if we have a valid transform name.
       const bool found_transform_name =
           Scanner(remaining)
diff --git a/tensorflow/tools/graph_transforms/transform_graph_test.cc b/tensorflow/tools/graph_transforms/transform_graph_test.cc
index dd60b998543f983df411ca7235ea058cd48a370a..bc2412fcbdba90731318eea1a2239aa914b35ffc 100644
--- a/tensorflow/tools/graph_transforms/transform_graph_test.cc
+++ b/tensorflow/tools/graph_transforms/transform_graph_test.cc
@@ -205,6 +205,19 @@ class TransformGraphTest : public ::testing::Test {
     EXPECT_EQ(0, params_list.size());
   }
 
+  void TestParseExtraSpaces() {
+    TransformParameters params_list;
+    ParseTransformParameters(" ", &params_list).IgnoreError();
+    EXPECT_EQ(0, params_list.size());
+
+    TF_EXPECT_OK(ParseTransformParameters("  foo bar \\\n", &params_list));
+    EXPECT_EQ(2, params_list.size());
+    EXPECT_EQ("foo", params_list[0].first);
+    EXPECT_TRUE(params_list[0].second.empty());
+    EXPECT_EQ("bar", params_list[1].first);
+    EXPECT_TRUE(params_list[1].second.empty());
+  }
+
   void TestShouldIgnoreErrors() {
     bool ignore_errors;
     TF_EXPECT_OK(
diff --git a/tensorflow/tools/pip_package/BUILD b/tensorflow/tools/pip_package/BUILD
index c17a7f7fb19650c3b89f8e6608e7de34fb2a9fe2..83be430e7d6ebe381a892a8e7284685125948268 100644
--- a/tensorflow/tools/pip_package/BUILD
+++ b/tensorflow/tools/pip_package/BUILD
@@ -68,6 +68,7 @@ py_binary(
         ":included_headers",
         "//tensorflow/contrib/nn:nn_py",
         "//tensorflow/contrib/session_bundle:session_bundle_pip",
+        "//tensorflow/contrib/signal:signal_py",
         "//tensorflow/contrib/slim/python/slim/data:data_pip",
         "//tensorflow/python:util_example_parser_configuration",
         "//tensorflow/python/debug:debug_pip",
@@ -141,6 +142,7 @@ sh_binary(
             "//tensorflow/contrib/ndlstm:ndlstm",
             "//tensorflow/contrib/nn:nn_py",
             "//tensorflow/contrib/session_bundle:session_bundle_pip",
+            "//tensorflow/contrib/signal:signal_py",
             "//tensorflow/contrib/slim:slim",
             "//tensorflow/contrib/slim/python/slim/data:data_pip",
             "//tensorflow/contrib/slim/python/slim/nets:nets_pip",
diff --git a/tensorflow/tools/pip_package/pip_smoke_test.py b/tensorflow/tools/pip_package/pip_smoke_test.py
index 61c3fe554057c63b4be86746d122cda871c15c21..459d6ee3284f6f984b83601ceda44a68cbaaba8b 100644
--- a/tensorflow/tools/pip_package/pip_smoke_test.py
+++ b/tensorflow/tools/pip_package/pip_smoke_test.py
@@ -28,11 +28,13 @@ import subprocess
 PIP_PACKAGE_QUERY = """bazel query \
   'deps(//tensorflow/tools/pip_package:build_pip_package)'"""
 
-PY_TEST_QUERY = """bazel query 'filter("^((?!(benchmark|manual|no_pip)).)*$", \
-  deps(kind(py_test,\
-  //tensorflow/python/... + \
-  //tensorflow/tensorboard/... + \
-  //tensorflow/contrib/...), 1))'"""
+PY_TEST_QUERY = """bazel query 'deps(\
+  filter("^((?!benchmark).)*$",\
+  kind(py_test,\
+  //tensorflow/python/... \
+  + //tensorflow/tensorboard/... \
+  + //tensorflow/contrib/... \
+  - attr(tags, "manual|no_pip", //tensorflow/...))), 1)'"""
 
 # Hard-coded blacklist of files if not included in pip package
 # TODO(amitpatankar): Clean up blacklist.
@@ -45,6 +47,7 @@ BLACKLIST = [
     "//tensorflow/python:compare_test_proto_py",
     "//tensorflow/core:image_testdata",
     "//tensorflow/core/kernels/cloud:bigquery_reader_ops",
+    "//tensorflow/python/feature_column:vocabulary_testdata",
     "//tensorflow/python:framework/test_file_system.so",
     # contrib
     "//tensorflow/contrib/session_bundle:session_bundle_half_plus_two",
@@ -54,7 +57,7 @@ BLACKLIST = [
     "//tensorflow/contrib/factorization/examples:mnist.py",
     "//tensorflow/contrib/factorization:factorization_py_CYCLIC_DEPENDENCIES_THAT_NEED_TO_GO",  # pylint:disable=line-too-long
     "//tensorflow/contrib/bayesflow:reinforce_simple_example",
-    "//tensorflow/contrib/bayesflow:examples/reinforce_simple/reinforce_simple_example.py"  # pylint:disable=line-too-long
+    "//tensorflow/contrib/bayesflow:examples/reinforce_simple/reinforce_simple_example.py",  # pylint:disable=line-too-long
 ]
 
 
@@ -121,7 +124,10 @@ def main():
       affected_tests_list = affected_tests.split("\n")[:-2]
       print("\n".join(affected_tests_list))
 
-    raise RuntimeError("One or more dependencies are not in the pip package.")
+    raise RuntimeError("""One or more dependencies are not in the pip package.
+Please either blacklist the dependencies in
+tensorflow/tensorflow/tensorflow/tools/pip_package/pip_smoke_test.py
+or add them to tensorflow/tensorflow/tensorflow/tools/pip_package/BUILD.""")
 
   else:
     print("TEST PASSED")
diff --git a/tensorflow/tools/quantization/quantize_graph.py b/tensorflow/tools/quantization/quantize_graph.py
index 90f1ab4d66e233fcd19cec3bc040124f402810d0..a0cfc352d4f65a32dde13893dc937a72d7434e28 100644
--- a/tensorflow/tools/quantization/quantize_graph.py
+++ b/tensorflow/tools/quantization/quantize_graph.py
@@ -453,7 +453,8 @@ class GraphRewriter(object):
 
   def round_nodes_recursively(self, current_node):
     """The entry point for simple rounding quantization."""
-    if (current_node.name in self.already_visited) and self.already_visited[current_node.name]:
+    if (current_node.name in self.already_visited
+       ) and self.already_visited[current_node.name]:
       return
     self.already_visited[current_node.name] = True
     for input_node_name in current_node.input:
diff --git a/tensorflow/tools/test/run_and_gather_logs_lib.py b/tensorflow/tools/test/run_and_gather_logs_lib.py
index ebb527bc09526b12b813332fc45bf28a0207fac6..e803d5cdacb5efcaf35577ec4b09e0340b315906 100644
--- a/tensorflow/tools/test/run_and_gather_logs_lib.py
+++ b/tensorflow/tools/test/run_and_gather_logs_lib.py
@@ -19,6 +19,7 @@ from __future__ import division
 from __future__ import print_function
 
 import os
+import re
 import shlex
 import subprocess
 import tempfile
@@ -26,6 +27,7 @@ import time
 
 from tensorflow.core.util import test_log_pb2
 from tensorflow.python.platform import gfile
+from tensorflow.tools.test import gpu_info_lib
 from tensorflow.tools.test import system_info_lib
 
 
@@ -93,7 +95,8 @@ def process_benchmarks(log_files):
   return benchmarks
 
 
-def run_and_gather_logs(name, test_name, test_args, benchmark_type):
+def run_and_gather_logs(name, test_name, test_args,
+                        benchmark_type):
   """Run the bazel test given by test_name.  Gather and return the logs.
 
   Args:
@@ -148,8 +151,17 @@ def run_and_gather_logs(name, test_name, test_args, benchmark_type):
     if not log_files:
       raise MissingLogsError("No log files found at %s." % test_file_prefix)
 
+    test_adjusted_name = name
+    gpu_config = gpu_info_lib.gather_gpu_devices()
+    if gpu_config:
+      gpu_name = gpu_config[0].model
+      gpu_short_name_match = re.search(r"Tesla [KP][4,8]0", gpu_name)
+      if gpu_short_name_match:
+        gpu_short_name = gpu_short_name_match.group(0)
+        test_adjusted_name = name + "|" + gpu_short_name.replace(" ", "_")
+
     return (process_test_logs(
-        name,
+        test_adjusted_name,
         test_name=test_name,
         test_args=test_args,
         benchmark_type=benchmark_type,
diff --git a/tensorflow/tools/tfprof/README.md b/tensorflow/tools/tfprof/README.md
index 540e179aaee5ed05faf5800a352a56b930ed619b..69f09411a9c419f167d1e76cd0491396ec8aedb5 100644
--- a/tensorflow/tools/tfprof/README.md
+++ b/tensorflow/tools/tfprof/README.md
@@ -1,6 +1,6 @@
 # tfprof: A Profiling Tool for TensorFlow Models
 
-Author: Xin Pan (xpan@google.com, github: panyx0718)
+Author: Xin Pan (xpan@google.com, github: panyx0718), Jon Shlens, Yao Zhang
 
 Consultants: Jon Shlens, Pete Warden
 
@@ -8,14 +8,26 @@ Consultants: Jon Shlens, Pete Warden
 ###Major Features
 
 1.  Measure model parameters, float operations, tensor shapes.
-2.  Measure op execution times, requested memory size and device placement.
+2.  Profile op execution times, requested memory size and device placement.
 3.  Inspect checkpoint tensors' shapes and their values.
-4.  Explore model based on name scope or graph structure.
-5.  Selectively grouping/filtering/accounting/ordering ops.
+4.  Selectively group, filter, account and order ops.
+
+####tfprof supports 3 views to organize TensorFlow model profiles
+
+    *  code view: Stats are associated your Python codes and organized as call stacks.
+    *  scope view: Stats are organized as name scope hierarchies.
+    *  graph view: Stats are organized as Tensorflow Op graph.
+
+####For each view, there are 3 ways to display outputs:
+
+    *  stdout: Results are written to stdout.
+    *  timeline: Visualized in chrome browser as time series.
+    *  file: Results are dumped to file.
+
 
 [Python API Tutorials](#python-api-tutorials): It can be called directly from
 Python codes. Results are either printed
-to stdout or dumped to file. tensorflow.tfprof.TFProfNode proto is returned from
+to stdout or dumped to file. tensorflow.tfprof.TFGraphNodeProto proto is returned from
 the API to allow users to perform further analysis.
 
 [CLI Tutorials](#cli-tutorials):
@@ -33,13 +45,23 @@ tfprof is part of TensorFlow core. Simply ```import tensorflow as tf```.
 ### Examine the shapes and sizes of all trainable Variables.
 ```python
 # Print trainable variable parameter statistics to stdout.
+# By default, statistics are associated with each graph node.
 param_stats = tf.contrib.tfprof.model_analyzer.print_model_analysis(
     tf.get_default_graph(),
     tfprof_options=tf.contrib.tfprof.model_analyzer.
         TRAINABLE_VARS_PARAMS_STAT_OPTIONS)
 
-# param_stats is tensorflow.tfprof.TFProfNode proto. It organize the statistics
-# of each graph node in tree scructure. Let's print the root below.
+
+# Set tfprof_cmd='code' to associate statistics with Python codes.
+opts = tf.contrib.tfprof.model_analyzer.TRAINABLE_VARS_PARAMS_STAT_OPTIONS
+opts['show_name_regexes'] = ['.*my_code1.py.*', '.*my_code2.py.*']
+param_stats = tf.contrib.tfprof.model_analyzer.print_model_analysis(
+    tf.get_default_graph(),
+    tfprof_cmd='code'
+    tfprof_options=opts)
+
+# param_stats is tensorflow.tfprof.TFGraphNodeProto proto.
+# Let's print the root below.
 sys.stdout.write('total_params: %d\n' % param_stats.total_parameters)
 ```
 
@@ -68,13 +90,11 @@ compute the memory and timing statistics.
 #
 # Note: When run on GPU, a kernel is first scheduled (enqueued) and then
 #       executed asynchronously. tfprof only tracks the execution time.
-#       Which is from proto CostGraphDef::Node::compute_cost.
 #       In addition, a substantial of time might be spent between Python and
 #       TensorFlow runtime, which is also not tracked by tfprof.
 #
-config = tf.ConfigProto(graph_options=tf.GraphOptions(build_cost_model=1))
 run_metadata = tf.RunMetadata()
-with tf.Session(config=config) as sess:
+with tf.Session() as sess:
   _ = sess.run(train_op,
                options=tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE),
                run_metadata=run_metadata)
@@ -84,8 +104,20 @@ Finally, you may run `print_model_analysis` to explore the timing and memory
 demands of the model.
 
 ``` python
+# See model_analyzer_test.py for more examples.
+#
 # Print to stdout an analysis of the memory usage and the timing information
-# from running the graph broken down by operations.
+# broken down by python codes.
+opts = tf.contrib.tfprof.model_analyzer.PRINT_ALL_TIMING_MEMORY.copy()
+opts['show_name_regexes'] = ['.*my_code.py.*']
+tf.contrib.tfprof.model_analyzer.print_model_analysis(
+    tf.get_default_graph(),
+    run_meta=run_metadata,
+    tfprof_cmd='code',
+    tfprof_options=opts)
+
+# Print to stdout an analysis of the memory usage and the timing information
+# broken down by operations.
 tf.contrib.tfprof.model_analyzer.print_model_analysis(
     tf.get_default_graph(),
     run_meta=run_metadata,
@@ -94,6 +126,18 @@ tf.contrib.tfprof.model_analyzer.print_model_analysis(
 
 Users can change ```tfprof_options``` to fully leverage tfprof's power.
 
+```
+For example set opts['output'] = 'timeline:outfile=<filename>' to
+generate a timeline json file. Open a Chrome Browser, open URL
+chrome://tracing, and load the json file. Below are 2 examples of graph
+view and scope view. See code view example in later examples.
+```
+
+<left>
+![CodeTimeline](g3doc/graph_timeline.png)
+![CodeTimeline](g3doc/scope_timeline.png)
+</left>
+
 
 ## CLI Tutorials
 
@@ -138,9 +182,9 @@ bazel-bin/tensorflow/tools/tfprof/tfprof \
     --run_meta_path=run_meta \
     --checkpoint_path=model.ckpt
 #
-# tfprof_log is used to define customized op types and float ops.
+# tfprof_log is used to define customized op types, float ops and code traces.
 # Use tfprof_logger.write_op_log() to create tfprof_log.
-# See 11) in Examples section on generating tfprof_log file.
+# See 12) in Examples section on generating tfprof_log file.
 bazel-bin/tensorflow/tools/tfprof/tfprof \
     --graph_path=graph.pbtxt \
     --run_meta_path=run_meta \
@@ -170,11 +214,44 @@ tfprof>
 # supported select fileds. Availability depends on --[run_meta|checkpoint|op_log]_path.
 # [bytes|micros|params|float_ops|num_hidden_ops|tensor_value|device|op_types]
 -select                     params
--viz                        false
--dump_to_file
+# format: output_type:key=value,key=value...
+# output_types: stdout (default), timeline, file.
+# key=value pairs:
+#   1. timeline: outfile=<filename>
+#   2. file: outfile=<filename>
+#   3. stdout: None.
+# E.g. timeline:outfile=/tmp/timeline.json
+-output
 ```
 
-3) I want to see the `BatchNorm`'s gamma value in checkpoint.
+3) I want to see which line of my python codes costs most time!
+
+```shell
+# Requires --graph_path --op_log_path
+tfprof> code -max_depth 1000 -show_name_regexes .*model_analyzer.*py.* -select micros -account_type_regexes .* -order_by micros
+_TFProfRoot (0us/22.44ms)
+  model_analyzer_test.py:149:run_filename_as_m...:none (0us/22.44ms)
+    model_analyzer_test.py:33:_run_code_in_main:none (0us/22.44ms)
+      model_analyzer_test.py:208:<module>:test.main() (0us/22.44ms)
+        model_analyzer_test.py:132:testComplexCodeView:x = lib.BuildFull... (0us/22.44ms)
+          model_analyzer_testlib.py:63:BuildFullModel:return sgd_op.min... (0us/21.83ms)
+          model_analyzer_testlib.py:58:BuildFullModel:cell, array_ops.c... (0us/333us)
+          model_analyzer_testlib.py:54:BuildFullModel:seq.append(array_... (0us/254us)
+            model_analyzer_testlib.py:42:BuildSmallModel:x = nn_ops.conv2d... (0us/134us)
+            model_analyzer_testlib.py:46:BuildSmallModel:initializer=init_... (0us/40us)
+            ...
+          model_analyzer_testlib.py:61:BuildFullModel:loss = nn_ops.l2_... (0us/28us)
+          model_analyzer_testlib.py:60:BuildFullModel:target = array_op... (0us/0us)
+        model_analyzer_test.py:134:testComplexCodeView:sess.run(variable... (0us/0us)
+```
+
+Set ```-output timeline:outfile=<filename>``` to generate timeline instead of stdout.
+<left>
+![CodeTimeline](g3doc/code_timeline.png)
+</left>
+
+
+4) I want to see the `BatchNorm`'s gamma value in checkpoint.
 
 ```shell
 # Requires --graph_path, --checkpoint_path.
@@ -186,7 +263,7 @@ _TFProfRoot ()
 [1.57 1.83 1.30 1.25 1.59 1.14 1.26 0.82 1.19 1.10 1.48 1.01 0.82 1.23 1.21 1.14 ],
 ```
 
-4) I want to see my checkpoint tensors shape and number of parameters.
+5) I want to see my checkpoint tensors shape and number of parameters.
 
 ```shell
 # Requires --graph_path, --checkpoint_path.
@@ -205,7 +282,7 @@ _TFProfRoot (--/930.58k params)
   unit_last/final_bn/moving_variance (64, 64/64 params)
 ```
 
-5) I defined an op named ‘cost’ to calculate the loss. I want to know what ops
+6) I defined an op named ‘cost’ to calculate the loss. I want to know what ops
 it depends on take a long time to run. Hint: Use the ‘graph’ command to explore
 graph dependencies.
 
@@ -221,7 +298,7 @@ _TFProfRoot (0us/3.61sec)
   unit_3_3/sub2/conv2/Conv2D (10.26ms/3.60sec)
 ```
 
-6) I want to know the expensive operations during the back propagation.
+7) I want to know the expensive operations during the back propagation.
 Hint: tensorflow prepend ‘gradient’ to your defined name scopes. Use the ‘scope’
 command to explore based on name scope hierarchies.
 
@@ -238,7 +315,7 @@ _TFProfRoot (0us/2.29sec)
   ...
 ```
 
-7) Show the number of float operations in the model.
+8) Show the number of float operations in the model.
 Note: float operations calculation depends on
 1) op.RegisterStatistics. If an op doesn’t
 have RegisterStatistics defined, its float operations cannot be counted.
@@ -263,7 +340,7 @@ _TFProfRoot (0/17.63b flops)
   ...
 ```
 
-8) Show the number of parameters of all `tf.trainable_variables()` in the model.
+9) Show the number of parameters of all `tf.trainable_variables()` in the model.
 
 ```shell
 # Requires --graph_path --op_log_path.
@@ -283,7 +360,7 @@ generated by write_op_log() Python API. write_op_log() help users create some
 common op types implicitly. Users can define their own op types and log it
 through the write_op_log() API.
 
-9) What if I’m lazy and don’t want to define op type? I have given my ops
+109) What if I’m lazy and don’t want to define op type? I have given my ops
 well-defined names in my model’s code. And want to use names to select a group
 of ops. Let’s try it!
 
@@ -301,7 +378,7 @@ in terminal. Otherwise, tfprof accounts all ops matched by
 `-account_type_regexes` recursively even if they are hidden due to some
 options such as -max_depth.
 
-10) TensorFlow has built-in op types. For example, built-in op type `Variable`
+11) TensorFlow has built-in op types. For example, built-in op type `Variable`
 seems to include `Variable's` created by your model. However, be careful when
 depending on it because TensorFlow creates extra `Variable` ops implicitly and
 the implicitly created ops can have the same prefix as the `Variable's` you
@@ -327,7 +404,7 @@ _TFProfRoot (--/930.58k params)
 ```
 
 
-11) A example of defining extra op type for ops using `OpLog`
+12) A example of defining extra op type for ops using `OpLog`
 
 First, in Python code, create an `OpLog` proto and add op type
 information to it:
@@ -375,10 +452,10 @@ the tool adds all `Variables` inside `tf.trainable_variables()` to
 12) Run tfprof in one-shot mode and dump result to file.
 
 ```shell
-# Printed to stdout if --dump_to_file is not set.
+# By default output to stdout. Use -output option to change output types.
 tfprof scope --graph_path=graph.pbtxt  \
              --max_depth=3 \
-             --dump_to_file="/tmp/dump"
+             --output="file:outfile=/tmp/dump"
 Reading Files...
 Parsing GraphDef...
 Preparing Views...
@@ -490,4 +567,9 @@ as long as they match the `-account_xxx` options.
 
 `-select`: Comma-separated list of metrics to show: [bytes|micros|params|float_ops|num_hidden_ops|tensor_value|device|op_types].
 
-`-dump_to_file`: Dump the output to a file, instead of terminal.
+`-output`: Output results as stdout, file or timeline.
+The format is ```output_type:key=value,key=value```.
+For example: ```timeline:outfile=<filename>```.
+timeline: key=outfile, value=<filename>.
+stdout: none.
+file: key=outfile, value=<filename>.
diff --git a/tensorflow/tools/tfprof/g3doc/code_timeline.png b/tensorflow/tools/tfprof/g3doc/code_timeline.png
new file mode 100644
index 0000000000000000000000000000000000000000..c5ab246f7da14c0384a5704aa8053a97540a9dab
Binary files /dev/null and b/tensorflow/tools/tfprof/g3doc/code_timeline.png differ
diff --git a/tensorflow/tools/tfprof/g3doc/graph_timeline.png b/tensorflow/tools/tfprof/g3doc/graph_timeline.png
new file mode 100644
index 0000000000000000000000000000000000000000..255a91fd5ff6005086f7d4a1dfdd43cc8d115ee1
Binary files /dev/null and b/tensorflow/tools/tfprof/g3doc/graph_timeline.png differ
diff --git a/tensorflow/tools/tfprof/g3doc/scope_timeline.png b/tensorflow/tools/tfprof/g3doc/scope_timeline.png
new file mode 100644
index 0000000000000000000000000000000000000000..c6d95af84aac473e68b30fc6fbefa99d4431948f
Binary files /dev/null and b/tensorflow/tools/tfprof/g3doc/scope_timeline.png differ
diff --git a/tensorflow/tools/tfprof/internal/BUILD b/tensorflow/tools/tfprof/internal/BUILD
index c5482a977694d4bc1116590ab3c73c4d2562c085..e90f0ec40a07548b4879205972cc679c60c372a9 100644
--- a/tensorflow/tools/tfprof/internal/BUILD
+++ b/tensorflow/tools/tfprof/internal/BUILD
@@ -15,11 +15,13 @@ cc_library(
     srcs = ["tfprof_stats.cc"],
     hdrs = ["tfprof_stats.h"],
     deps = [
+        ":tfprof_code",
         ":tfprof_graph",
         ":tfprof_node",
         ":tfprof_options",
         ":tfprof_scope",
         ":tfprof_show",
+        ":tfprof_timeline",
         ":tfprof_utils",
         "//tensorflow/c:checkpoint_reader",
         "//tensorflow/core:lib",
@@ -29,6 +31,20 @@ cc_library(
     ],
 )
 
+cc_library(
+    name = "tfprof_timeline",
+    srcs = ["tfprof_timeline.cc"],
+    hdrs = ["tfprof_timeline.h"],
+    deps = [
+        ":tfprof_node_show",
+        ":tfprof_utils",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:protos_all_cc",
+        "//tensorflow/tools/tfprof:protos_all_cc",
+        "@jsoncpp_git//:jsoncpp",
+    ],
+)
+
 cc_library(
     name = "tfprof_node",
     srcs = ["tfprof_node.cc"],
@@ -61,6 +77,27 @@ cc_library(
     ],
 )
 
+cc_library(
+    name = "tfprof_code",
+    srcs = ["tfprof_code.cc"],
+    hdrs = ["tfprof_code.h"],
+    deps = [
+        ":tfprof_constants",
+        ":tfprof_node",
+        ":tfprof_options",
+        ":tfprof_show_code",
+        ":tfprof_timeline",
+        ":tfprof_utils",
+        "//tensorflow/c:c_api",
+        "//tensorflow/c:checkpoint_reader",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:protos_all_cc",
+        "//tensorflow/core:regexp_internal",
+        "//tensorflow/tools/tfprof:protos_all_cc",
+    ],
+)
+
 cc_library(
     name = "tfprof_graph",
     srcs = ["tfprof_graph.cc"],
@@ -71,6 +108,7 @@ cc_library(
         ":tfprof_options",
         ":tfprof_show",
         ":tfprof_tensor",
+        ":tfprof_timeline",
         ":tfprof_utils",
         "//tensorflow/c:checkpoint_reader",
         "//tensorflow/core:lib",
@@ -80,6 +118,21 @@ cc_library(
     ],
 )
 
+cc_library(
+    name = "tfprof_node_show",
+    srcs = ["tfprof_node_show.cc"],
+    hdrs = ["tfprof_node_show.h"],
+    deps = [
+        ":tfprof_constants",
+        ":tfprof_node",
+        ":tfprof_options",
+        ":tfprof_utils",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:protos_all_cc",
+        "//tensorflow/tools/tfprof:protos_all_cc",
+    ],
+)
+
 cc_library(
     name = "tfprof_show",
     srcs = ["tfprof_show.cc"],
@@ -87,8 +140,32 @@ cc_library(
     deps = [
         ":tfprof_constants",
         ":tfprof_node",
+        ":tfprof_node_show",
         ":tfprof_options",
         ":tfprof_tensor",
+        ":tfprof_timeline",
+        ":tfprof_utils",
+        "//tensorflow/c:checkpoint_reader",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:protos_all_cc",
+        "//tensorflow/core:regexp_internal",
+        "//tensorflow/tools/tfprof:protos_all_cc",
+    ],
+)
+
+cc_library(
+    name = "tfprof_show_code",
+    srcs = ["tfprof_show_code.cc"],
+    hdrs = ["tfprof_show_code.h"],
+    deps = [
+        ":tfprof_constants",
+        ":tfprof_node",
+        ":tfprof_node_show",
+        ":tfprof_options",
+        ":tfprof_scope",
+        ":tfprof_show",
+        ":tfprof_tensor",
+        ":tfprof_timeline",
         ":tfprof_utils",
         "//tensorflow/c:checkpoint_reader",
         "//tensorflow/core:lib",
@@ -124,6 +201,27 @@ tf_cc_test(
     ],
 )
 
+tf_cc_test(
+    name = "tfprof_timeline_test",
+    srcs = ["tfprof_timeline_test.cc"],
+    data = [
+        "testdata/graph.pbtxt",
+        "testdata/run_meta",
+    ],
+    deps = [
+        ":tfprof_constants",
+        ":tfprof_options",
+        ":tfprof_stats",
+        ":tfprof_utils",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:protos_all_cc",
+        "//tensorflow/core:test",
+        "//tensorflow/core:test_main",
+        "//tensorflow/core:testlib",
+        "//tensorflow/tools/tfprof:protos_all_cc",
+    ],
+)
+
 cc_library(
     name = "tfprof_utils",
     srcs = ["tfprof_utils.cc"],
diff --git a/tensorflow/tools/tfprof/internal/print_model_analysis.cc b/tensorflow/tools/tfprof/internal/print_model_analysis.cc
index dfe4019fbb403aaf8c66e56475cffe4ff3ab2b98..f73675e8a7322a3ef3971a1785bfea980d15a54a 100644
--- a/tensorflow/tools/tfprof/internal/print_model_analysis.cc
+++ b/tensorflow/tools/tfprof/internal/print_model_analysis.cc
@@ -40,13 +40,13 @@ string PrintModelAnalysis(const string* graph, const string* run_meta,
   graph_ptr->ParseFromString(*graph);
 
   std::unique_ptr<RunMetadata> run_meta_ptr;
-  if (run_meta) {
+  if (run_meta && !run_meta->empty()) {
     run_meta_ptr.reset(new RunMetadata());
     run_meta_ptr->ParseFromString(*run_meta);
   }
 
   std::unique_ptr<OpLog> op_log_ptr;
-  if (op_log) {
+  if (op_log && !op_log->empty()) {
     op_log_ptr.reset(new OpLog());
     op_log_ptr->ParseFromString(*op_log);
   }
@@ -56,18 +56,32 @@ string PrintModelAnalysis(const string* graph, const string* run_meta,
   TFStats tf_stats(std::move(graph_ptr), std::move(run_meta_ptr),
                    std::move(op_log_ptr), std::move(ckpt_reader));
 
-  Options opts = Options::FromProtoStr(*options);
+  Options opts;
+  tensorflow::Status s = Options::FromProtoStr(*options, &opts);
+  if (!s.ok()) {
+    fprintf(stderr, "%s\n", s.ToString().c_str());
+    return "";
+  }
 
-  if (opts.dump_to_file.empty()) {
+  if (opts.output_type == kOutput[1]) {
     printf("\n=========================Options=============================\n");
     printf("%s", opts.ToString().c_str());
     printf("\n==================Model Analysis Report======================\n");
-    TFProfNode root(tf_stats.PrintGraph(*command, opts));
+    string ret = "";
+    if (*command == kCmds[2]) {
+      ret = tf_stats.PrintCode(opts).SerializeAsString();
+    } else {
+      ret = tf_stats.PrintGraph(*command, opts).SerializeAsString();
+    }
     printf("\n======================End of Report==========================\n");
     fflush(stdout);
-    return root.SerializeAsString();
+    return ret;
+  }
+  if (*command == kCmds[2]) {
+    return tf_stats.PrintCode(opts).SerializeAsString();
+  } else {
+    return tf_stats.PrintGraph(*command, opts).SerializeAsString();
   }
-  return tf_stats.PrintGraph(*command, opts).SerializeAsString();
 }
 }  // namespace tfprof
 }  // namespace tensorflow
diff --git a/tensorflow/tools/tfprof/internal/tfprof_code.cc b/tensorflow/tools/tfprof/internal/tfprof_code.cc
new file mode 100644
index 0000000000000000000000000000000000000000..9739db1e0b5ef4c7f13cc8fb15b635c9f81b3a70
--- /dev/null
+++ b/tensorflow/tools/tfprof/internal/tfprof_code.cc
@@ -0,0 +1,224 @@
+/* Copyright 2016 The TensorFlow Authors All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/tools/tfprof/internal/tfprof_code.h"
+
+#include <stdio.h>
+#include <utility>
+
+#include "tensorflow/c/c_api.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/lib/strings/str_util.h"
+#include "tensorflow/core/lib/strings/strcat.h"
+#include "tensorflow/core/lib/strings/stringprintf.h"
+#include "tensorflow/core/platform/regexp.h"
+#include "tensorflow/tools/tfprof/internal/tfprof_constants.h"
+
+namespace tensorflow {
+namespace tfprof {
+namespace {
+// Convert to Trace proto into a short readable string.
+string GetTraceString(const CodeDef::Trace& trace) {
+  string ntrace = "";
+  if (trace.file().find_last_of('/') != trace.file().npos) {
+    ntrace += trace.file().substr(trace.file().find_last_of('/') + 1);
+  } else {
+    ntrace += trace.file();
+  }
+  ntrace += strings::StrCat(":", trace.lineno());
+  if (trace.function().length() < 20) {
+    ntrace += ":" + trace.function();
+  } else {
+    ntrace += ":" + trace.function().substr(0, 17) + "...";
+  }
+  if (trace.line().length() < 20) {
+    ntrace += ":" + trace.line();
+  } else {
+    ntrace += ":" + trace.line().substr(0, 17) + "...";
+  }
+  return ntrace;
+}
+}  // namespace
+
+void TFCode::AddNode(TFGraphNode* node) {
+  if (!node->code()) {
+    return;
+  }
+  TFCodeNode* pre_trace_node = nullptr;
+  for (int i = 0; i < node->code()->traces_size(); ++i) {
+    // Unlike op name, which is globally unique, trace name is only unique
+    // w.r.t. it's parent.
+    const string& trace = GetTraceString(node->code()->traces(i));
+    if (i == 0) {
+      if (!trace_root_) {
+        trace_root_.reset(new TFCodeNode(trace));
+      }
+      CHECK(trace_root_->name() == trace) << "Different trace root";
+      pre_trace_node = trace_root_.get();
+      continue;
+    }
+    pre_trace_node->AddChildren(trace);
+    TFCodeNode* trace_node = pre_trace_node->children()[trace].get();
+
+    if (i == node->code()->traces_size() - 1) {
+      trace_node->AddGraphNode(node);
+    }
+    pre_trace_node = trace_node;
+  }
+}
+
+void TFCode::Build() {
+  if (!trace_root_) {
+    return;
+  }
+  code_root_ = BuildCodeNodes(trace_root_.get());
+}
+
+CodeNode* TFCode::BuildCodeNodes(TFCodeNode* root) {
+  auto code_root = std::unique_ptr<CodeNode>(new CodeNode(root));
+  CodeNode* code_root_ptr = code_root.get();
+  code_nodes_.insert(std::move(code_root));
+
+  for (auto it = root->children().cbegin(); it != root->children().cend();
+       ++it) {
+    code_root_ptr->children.push_back(BuildCodeNodes(it->second.get()));
+  }
+  return code_root_ptr;
+}
+
+const ShowCodeNode* TFCode::ShowInternal(const Options& opts,
+                                         Timeline* timeline) {
+  // Search from roots recursively to find start node, if start_name_regexes
+  // is specified.
+  tfprof_trace_root_.reset(new TFCodeNode(kTFProfRoot));
+  tfprof_code_root_.reset(new CodeNode(tfprof_trace_root_.get()));
+  if (!code_root_) {
+    return tfprof_code_root_.get();
+  }
+
+  std::vector<CodeNode*> roots = {code_root_};
+  if (opts.start_name_regexes.size() != 1 ||
+      opts.start_name_regexes[0] != ".*") {
+    roots = SearchRoot(roots, opts.start_name_regexes);
+  }
+
+  tfprof_code_root_->children.assign(roots.begin(), roots.end());
+  Account({tfprof_code_root_.get()}, opts);
+
+  CodeNode* root = PrintScope({tfprof_code_root_.get()}, opts, 1, 0)[0];
+  if (timeline) {
+    timeline->GenerateCodeTimeline(root);
+  }
+  return root;
+}
+
+std::vector<CodeNode*> TFCode::SearchRoot(std::vector<CodeNode*> roots,
+                                          const std::vector<string>& regexes) {
+  std::vector<CodeNode*> res;
+  if (roots.empty()) {
+    return res;
+  }
+  for (CodeNode* root : roots) {
+    bool match_start_node = false;
+    for (const string& regex : regexes) {
+      if (RE2::FullMatch(root->name(), regex)) {
+        res.push_back(root);
+        match_start_node = true;
+        break;
+      }
+    }
+    if (match_start_node) {
+      // Found a start node at this branch, no need to continue.
+      continue;
+    }
+    std::vector<CodeNode*> nroots = SearchRoot(root->children, regexes);
+    res.insert(res.end(), nroots.begin(), nroots.end());
+  }
+  return res;
+}
+
+std::vector<CodeNode*> TFCode::PrintScope(const std::vector<CodeNode*> roots,
+                                          const Options& opts, int depth,
+                                          int last_ident) {
+  std::vector<CodeNode*> show_nodes;
+
+  for (CodeNode* node : roots) {
+    int nlast_ident = last_ident;
+    bool show = ShouldShow(node, opts, depth);
+    if (show) {
+      node->formatted_str.clear();
+      if (opts.account_displayed_op_only) {
+        node->ResetTotalStats();
+        node->AddSelfToTotalStats();
+      }
+      nlast_ident += 2;
+    }
+
+    std::vector<CodeNode*> show_cnodes;
+    if (!ShouldTrim(node, opts.trim_name_regexes)) {
+      show_cnodes = PrintScope(node->children, opts, depth + 1, nlast_ident);
+    }
+    if (show) {
+      show_cnodes = SortNodes(show_cnodes, opts);
+      string children_str;
+      for (CodeNode* sc : show_cnodes) {
+        if (opts.output_type == kOutput[1] || opts.output_type == kOutput[2]) {
+          children_str += sc->formatted_str;
+          sc->formatted_str.clear();
+        }
+        node->mutable_proto()->add_children()->MergeFrom(sc->proto());
+        sc->mutable_proto()->mutable_children()->Clear();
+        node->show_children.push_back(sc);
+        if (opts.account_displayed_op_only) {
+          node->AggregateTotalStats(sc);
+        }
+      }
+
+      node->formatted_str =
+          strings::Printf("%s%s\n", string(last_ident, ' ').c_str(),
+                          node->Format(opts).c_str());
+
+      if (opts.select.find(kShown[5]) != opts.select.end()) {
+        fprintf(stderr, "code view has no tensor value to show\n");
+      }
+
+      node->formatted_str += children_str;
+      show_nodes.push_back(node);
+    } else {
+      show_nodes.insert(show_nodes.end(), show_cnodes.begin(),
+                        show_cnodes.end());
+    }
+  }
+  return show_nodes;
+}
+
+void TFCode::Account(const std::vector<CodeNode*>& roots, const Options& opts) {
+  if (roots.empty()) return;
+
+  for (CodeNode* node : roots) {
+    node->ResetTotalStats();
+    Account(node->children, opts);
+
+    node->account = ShouldAccount(node, opts);
+    if (node->account) {
+      node->AddSelfToTotalStats();
+    }
+    for (CodeNode* c : node->children) {
+      node->AggregateTotalStats(c);
+    }
+  }
+}
+}  // namespace tfprof
+}  // namespace tensorflow
diff --git a/tensorflow/tools/tfprof/internal/tfprof_code.h b/tensorflow/tools/tfprof/internal/tfprof_code.h
new file mode 100644
index 0000000000000000000000000000000000000000..d7a28624f1e1f7139d68f5c153f946c405b5ed9a
--- /dev/null
+++ b/tensorflow/tools/tfprof/internal/tfprof_code.h
@@ -0,0 +1,74 @@
+/* Copyright 2016 The TensorFlow Authors All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// Build a tree structure based on the TensorFlow model's python code stacks.
+// Stats are aggregated from descendants from ancestors.
+
+#ifndef THIRD_PARTY_TENSORFLOW_TOOLS_TFPROF_INTERNAL_TFPROF_CODE_H_
+#define THIRD_PARTY_TENSORFLOW_TOOLS_TFPROF_INTERNAL_TFPROF_CODE_H_
+
+#include <map>
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "tensorflow/c/checkpoint_reader.h"
+#include "tensorflow/core/framework/graph.pb.h"
+#include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/tools/tfprof/internal/tfprof_node.h"
+#include "tensorflow/tools/tfprof/internal/tfprof_options.h"
+#include "tensorflow/tools/tfprof/internal/tfprof_show_code.h"
+#include "tensorflow/tools/tfprof/internal/tfprof_timeline.h"
+#include "tensorflow/tools/tfprof/internal/tfprof_utils.h"
+#include "tensorflow/tools/tfprof/tfprof_log.pb.h"
+#include "tensorflow/tools/tfprof/tfprof_output.pb.h"
+
+namespace tensorflow {
+namespace tfprof {
+
+class TFCode : public TFShowCode {
+ public:
+  explicit TFCode() : code_root_(nullptr), trace_root_(nullptr) {}
+  ~TFCode() override {}
+
+  void AddNode(TFGraphNode* node) override;
+
+  void Build() override;
+
+ private:
+  CodeNode* BuildCodeNodes(TFCodeNode* root);
+
+  const ShowCodeNode* ShowInternal(const Options& opts,
+                                   Timeline* timeline) override;
+
+  std::vector<CodeNode*> SearchRoot(std::vector<CodeNode*> roots,
+                                    const std::vector<string>& regexes);
+
+  std::vector<CodeNode*> PrintScope(const std::vector<CodeNode*> roots,
+                                    const Options& opts, int depth,
+                                    int last_ident);
+
+  void Account(const std::vector<CodeNode*>& roots, const Options& opts);
+
+  CodeNode* code_root_;
+  std::unique_ptr<TFCodeNode> trace_root_;
+  std::unique_ptr<TFCodeNode> tfprof_trace_root_;
+  std::unique_ptr<CodeNode> tfprof_code_root_;
+  std::set<std::unique_ptr<CodeNode>> code_nodes_;
+};
+}  // namespace tfprof
+}  // namespace tensorflow
+
+#endif  // THIRD_PARTY_TENSORFLOW_TOOLS_TFPROF_INTERNAL_TFPROF_CODE_H_
diff --git a/tensorflow/tools/tfprof/internal/tfprof_graph.cc b/tensorflow/tools/tfprof/internal/tfprof_graph.cc
index 469b258f98b50cfdddba17232b69c2e09c27c443..23084146c2c465dfb6fcd4d8b7ac51e68d472fb5 100644
--- a/tensorflow/tools/tfprof/internal/tfprof_graph.cc
+++ b/tensorflow/tools/tfprof/internal/tfprof_graph.cc
@@ -31,14 +31,14 @@ GraphNode* TFGraph::CreateParentNode(const string& name) {
   node_defs_.back()->set_name(name);
   node_defs_.back()->set_op(kTFGraphParent);
   parent_nodes_[name] =
-      std::unique_ptr<TFNode>(new TFNode(node_defs_.back().get()));
+      std::unique_ptr<TFGraphNode>(new TFGraphNode(node_defs_.back().get()));
   nodes_map_[name] =
       std::unique_ptr<GraphNode>(new GraphNode(parent_nodes_[name].get()));
   return nodes_map_[name].get();
 }
 
-void TFGraph::AddNode(TFNode* node) {
-  string name = node->node_def()->name();
+void TFGraph::AddNode(TFGraphNode* node) {
+  string name = node->name();
   nodes_map_[name] = std::unique_ptr<GraphNode>(new GraphNode(node));
 }
 
@@ -49,7 +49,7 @@ void TFGraph::Build() {
   // Filter out the root nodes (node not input of any other node).
   for (auto it = nodes_map_.begin(); it != nodes_map_.end(); it++) {
     GraphNode* node = it->second.get();
-    const std::map<string, TFNode*>& inputs = node->node->inputs();
+    const std::map<string, TFGraphNode*>& inputs = node->node->inputs();
     for (auto inputs_it = inputs.cbegin(); inputs_it != inputs.cend();
          inputs_it++) {
       nonroots.insert(inputs_it->first);
@@ -66,7 +66,7 @@ void TFGraph::Build() {
   }
 }
 
-const ShowNode* TFGraph::ShowInternal(const Options& opts) {
+const ShowNode* TFGraph::ShowInternal(const Options& opts, Timeline* timeline) {
   // Search the nodes to start from.
   std::vector<GraphNode*> roots = roots_;
   if (opts.start_name_regexes.size() != 1 ||
@@ -81,11 +81,13 @@ const ShowNode* TFGraph::ShowInternal(const Options& opts) {
   std::map<string, int64> account_visits;
   Account({root}, opts, &account_visits);
 
-  if (opts.viz) {
-    printf("Visualizing feature disabled...\n");
-  }
   std::set<string> visits;
-  return PrintGraph({root}, opts, 1, 0, 0, &visits)[0];
+  root = PrintGraph({root}, opts, 1, 0, 0, &visits)[0];
+
+  if (timeline) {
+    timeline->GenerateGraphTimeline(root);
+  }
+  return root;
 }
 
 std::vector<GraphNode*> TFGraph::SearchRoot(
@@ -155,8 +157,14 @@ std::vector<GraphNode*> TFGraph::PrintGraph(const std::vector<GraphNode*> roots,
       show_cnodes = SortNodes(show_cnodes, opts);
       string children_str;
       for (GraphNode* sc : show_cnodes) {
-        children_str += sc->formatted_str;
-        node->mutable_proto()->add_children()->MergeFrom(sc->proto());
+        if (opts.output_type == kOutput[1] || opts.output_type == kOutput[2]) {
+          children_str += sc->formatted_str;
+          sc->formatted_str.clear();
+        }
+        // This swap and reinit pattern is critical for performance.
+        node->mutable_proto()->add_children()->Swap(sc->mutable_proto());
+        sc->ReInit();
+        node->show_children.push_back(sc);
         if (opts.account_displayed_op_only) {
           node->AggregateTotalStats(sc);
         }
@@ -181,7 +189,6 @@ std::vector<GraphNode*> TFGraph::PrintGraph(const std::vector<GraphNode*> roots,
           node->formatted_str += value_str;
         }
       }
-
       node->formatted_str += children_str;
       show_nodes.push_back(node);
     } else {
diff --git a/tensorflow/tools/tfprof/internal/tfprof_graph.h b/tensorflow/tools/tfprof/internal/tfprof_graph.h
index b16f80b33db44d124591898d1983ed3fb5a48e56..4d4aa8b2b1d9445b9502b1103c5c257052a48e7f 100644
--- a/tensorflow/tools/tfprof/internal/tfprof_graph.h
+++ b/tensorflow/tools/tfprof/internal/tfprof_graph.h
@@ -37,32 +37,6 @@ limitations under the License.
 
 namespace tensorflow {
 namespace tfprof {
-class GraphNode : public ShowNode {
- public:
-  explicit GraphNode(TFNode* node) : ShowNode(node) {
-    mutable_proto()->set_inputs(node->inputs().size());
-    mutable_proto()->set_total_inputs(0);
-  }
-
-  void AggregateTotalStats(GraphNode* node) {
-    ShowNode::AggregateTotalStats(node);
-    mutable_proto()->set_total_inputs(proto().total_inputs() +
-                                      node->proto().total_inputs() + 1);
-  }
-
-  void AddSelfToTotalStats() {
-    ShowNode::AddSelfToTotalStats();
-    mutable_proto()->set_total_inputs(proto().total_inputs() +
-                                      proto().inputs());
-  }
-
-  void ResetTotalStats() {
-    ShowNode::ResetTotalStats();
-    mutable_proto()->set_total_inputs(0);
-  }
-
-  std::vector<GraphNode*> children;
-};
 
 // Organize tensorflow ops in a graph structure, pointing from output ops
 // to input ops.
@@ -72,12 +46,13 @@ class TFGraph : public TFShow {
       : TFShow(ckpt_reader) {}
   ~TFGraph() override {}
 
-  void AddNode(TFNode* node) override;
+  void AddNode(TFGraphNode* node) override;
 
   void Build() override;
 
  private:
-  const ShowNode* ShowInternal(const Options& opts) override;
+  const ShowNode* ShowInternal(const Options& opts,
+                               Timeline* timeline) override;
 
   bool ShouldShowIfExtra(ShowNode* node, const Options& opts,
                          int depth) override {
@@ -99,14 +74,14 @@ class TFGraph : public TFShow {
   std::vector<GraphNode*> GenerateGraphDot(
       GraphNode* root, GraphNode* last_shown, const Options& opts, int depth,
       int hidden, std::set<string>* declared_nodes,
-      std::set<string>* declared_edges, TFProfNode* parent);
+      std::set<string>* declared_edges, TFGraphNodeProto* parent);
 
   void Account(const std::vector<GraphNode*>& roots, const Options& opts,
                std::map<string, int64>* visits);
 
   std::vector<GraphNode*> roots_;
   std::vector<std::unique_ptr<NodeDef>> node_defs_;
-  std::map<string, std::unique_ptr<TFNode>> parent_nodes_;
+  std::map<string, std::unique_ptr<TFGraphNode>> parent_nodes_;
   std::map<string, std::unique_ptr<GraphNode>> nodes_map_;
 };
 
diff --git a/tensorflow/tools/tfprof/internal/tfprof_node.cc b/tensorflow/tools/tfprof/internal/tfprof_node.cc
index 08bd91d99c66ce2c5e17024edf225fafc3f9204d..74c8fcbe4816561805cd085fc190a0d709a6a7fc 100644
--- a/tensorflow/tools/tfprof/internal/tfprof_node.cc
+++ b/tensorflow/tools/tfprof/internal/tfprof_node.cc
@@ -20,19 +20,22 @@ limitations under the License.
 
 namespace tensorflow {
 namespace tfprof {
-void TFNode::AddStepStat(const string& device, const NodeExecStats* step_stat) {
-  if (!device.empty()) {
-    // This might override device from GraphDef.
-    device_ = device;
-  }
+// Notes about start and end time from the NodeExecStats proto.
+// For GPU, there is no difference between op_end_rel_micros and
+// all_end_rel_micros. All are kernel times.
+// For CPU, op_end_rel is the kernel time, while all_end_rel_micros includes
+// some post-processing.
+// Here, we only consider kernel time for simplicity.
+void TFGraphNode::AddStepStat(const string& device,
+                              const NodeExecStats* step_stat) {
   step_stat_ = step_stat;
+  CHECK(step_stat_);
 
-  op_start_micros_ = step_stat_->all_start_micros();
-  if (step_stat_->op_end_rel_micros() && step_stat_->op_start_rel_micros()) {
-    op_schedule_micros_ =
-        step_stat_->op_end_rel_micros() - step_stat_->op_start_rel_micros();
-  }
-  all_spent_micros_ = step_stat_->all_end_rel_micros();
+  string dev = str_util::Lowercase(device);
+
+  devices_.insert(dev);
+  op_kernel_execs_[dev].push_back(std::make_pair(
+      step_stat_->all_start_micros(), step_stat_->op_end_rel_micros()));
 
   for (const auto& output : step_stat_->output()) {
     if (output.has_tensor_description() &&
@@ -43,9 +46,5 @@ void TFNode::AddStepStat(const string& device, const NodeExecStats* step_stat) {
     }
   }
 }
-
-void TFNode::AddNodeStat(const CostGraphDef::Node* cost_node) {
-  kernel_compute_micros_ = cost_node->compute_cost();
-}
 }  // namespace tfprof
 }  // namespace tensorflow
diff --git a/tensorflow/tools/tfprof/internal/tfprof_node.h b/tensorflow/tools/tfprof/internal/tfprof_node.h
index 677c8d3c870005a95b56701ec8366d1e53cbdbb5..8e57db7ba2cc103ba0c769d122499cbbf98a4c18 100644
--- a/tensorflow/tools/tfprof/internal/tfprof_node.h
+++ b/tensorflow/tools/tfprof/internal/tfprof_node.h
@@ -23,26 +23,24 @@ limitations under the License.
 
 #include "tensorflow/core/framework/allocation_description.pb.h"
 #include "tensorflow/core/framework/attr_value.pb.h"
-#include "tensorflow/core/framework/cost_graph.pb.h"
 #include "tensorflow/core/framework/node_def.pb.h"
 #include "tensorflow/core/framework/step_stats.pb.h"
 #include "tensorflow/core/framework/tensor_description.pb.h"
 #include "tensorflow/core/framework/tensor_shape.pb.h"
 #include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/tools/tfprof/internal/tfprof_options.h"
+#include "tensorflow/tools/tfprof/tfprof_log.pb.h"
 
 namespace tensorflow {
 namespace tfprof {
 
-class TFNode {
+class TFGraphNode {
  public:
-  TFNode(const NodeDef* node)
+  TFGraphNode(const NodeDef* node)
       : node_(node),
+        code_(nullptr),
         step_stat_(nullptr),
-        op_start_micros_(0),
-        op_schedule_micros_(0),
-        kernel_compute_micros_(0),
-        all_spent_micros_(0),
         requested_bytes_(0),
         float_ops_(0) {
     if (!node) return;
@@ -67,56 +65,140 @@ class TFNode {
       update_shape(shape_vec);
     }
     op_types_.insert(node->op());
-    device_ = node->device();
   }
 
-  TFNode() : TFNode(nullptr) {}
+  TFGraphNode() : TFGraphNode(nullptr) {}
 
-  void AddInput(TFNode* input) { inputs_[input->node_def()->name()] = input; }
+  void AddInput(TFGraphNode* input, int64 output_idx) {
+    inputs_[input->name()] = input;
+    output_idx_[input->name()] = output_idx;
+  }
 
   void AddOpType(const string& op_type) { op_types_.insert(op_type); }
 
   void AddStepStat(const string& device, const NodeExecStats* step_stat);
 
-  // Add CostGraphDef::Node.
-  void AddNodeStat(const CostGraphDef::Node* cost_node);
-
   void AddFloatOps(int64 float_ops) { float_ops_ = float_ops; }
 
+  void AddCode(const CodeDef* code) { code_ = code; }
+
+  const string& name() const { return node_->name(); }
   const NodeDef* node_def() { return node_; }
-  const std::map<string, TFNode*>& inputs() { return inputs_; }
-  int64 op_start_micros() { return op_start_micros_; }
-  // This is time spent in Op::Compute(), which is GPU kernel schedule time.
-  // Currently not used.
-  int64 op_schedule_micros() { return op_schedule_micros_; }
+
+  const NodeExecStats* step_stats() const { return step_stat_; }
+
+  const std::map<string, TFGraphNode*>& inputs() const { return inputs_; }
+  const std::map<string, int64>& output_idx() { return output_idx_; }
+
   // This is time spent in kernel execution.
-  int64 kernel_compute_micros() { return kernel_compute_micros_; }
-  int64 all_spent_micros() { return all_spent_micros_; }
-  int64 requested_byptes() { return requested_bytes_; }
-  int64 float_ops() { return float_ops_; }
-  string device() { return device_; }
-  const std::set<string>& op_types() { return op_types_; }
+  int64 kernel_exec_micros() const {
+    if (!step_stat_) return 0;
+    int64 total = 0;
+    for (const auto& execs : op_kernel_execs_) {
+      for (const auto& exec : execs.second) {
+        total += exec.second;
+      }
+    }
+    return total;
+  }
+  const std::map<string, std::vector<std::pair<int64, int64>>>&
+  op_kernel_execs() const {
+    return op_kernel_execs_;
+  }
 
-  const std::vector<int64>& shape() { return shape_; }
+  int64 requested_bytes() const { return requested_bytes_; }
+  int64 float_ops() const { return float_ops_; }
+  const CodeDef* code() { return code_; }
+  std::set<string> devices() const { return devices_; }
+  const std::set<string>& op_types() const { return op_types_; }
+
+  const std::vector<int64>& shape() const { return shape_; }
 
  private:
   void update_shape(const std::vector<int64>& shape) { shape_ = shape; }
 
-  std::map<string, TFNode*> inputs_;
+  std::map<string, TFGraphNode*> inputs_;
+  std::map<string, int64> output_idx_;
+
   const NodeDef* node_;
+  const CodeDef* code_;
   const NodeExecStats* step_stat_;
 
   std::vector<int64> shape_;
   std::set<string> op_types_;
-  string device_;
-  int64 op_start_micros_;
-  int64 op_schedule_micros_;
-  int64 kernel_compute_micros_;
-  int64 all_spent_micros_;
+
+  // device -> vector of {op_start_micros, op_kernel_exec_micros} pairs.
+  std::map<string, std::vector<std::pair<int64, int64>>> op_kernel_execs_;
+
+  std::set<string> devices_;
   int64 requested_bytes_;
   int64 float_ops_;
 };
 
+class TFCodeNode {
+ public:
+  TFCodeNode(const string& trace)
+      : trace_(trace),
+        kernel_exec_micros_(0),
+        requested_bytes_(0),
+        float_ops_(0) {}
+
+  void AddGraphNode(const TFGraphNode* node) {
+    if (nodes_.find(node->name()) != nodes_.end()) {
+      return;
+    }
+    nodes_[node->name()] = node;
+
+    kernel_exec_micros_ += node->kernel_exec_micros();
+    requested_bytes_ += node->requested_bytes();
+    float_ops_ += node->float_ops();
+    op_types_.insert(node->op_types().begin(), node->op_types().end());
+    if (node->shape().size() > 0) {
+      shapes_.push_back(node->shape());
+    }
+    std::set<string> devices = node->devices();
+    devices_.insert(devices.begin(), devices.end());
+  }
+  const std::map<string, const TFGraphNode*>& graph_nodes() const {
+    return nodes_;
+  }
+
+  void AddChildren(const string& trace) {
+    if (children_.find(trace) != children_.end()) {
+      return;
+    }
+    children_[trace].reset(new TFCodeNode(trace));
+  }
+  std::map<string, std::unique_ptr<TFCodeNode>>& children() {
+    return children_;
+  }
+
+  const string& name() const { return trace_; }
+
+  int64 kernel_exec_micros() const { return kernel_exec_micros_; }
+
+  int64 requested_bytes() const { return requested_bytes_; }
+
+  int64 float_ops() const { return float_ops_; }
+
+  const std::set<string>& devices() const { return devices_; }
+
+  const std::set<string>& op_types() const { return op_types_; }
+
+  const std::vector<std::vector<int64>>& shapes() const { return shapes_; }
+
+ private:
+  const string trace_;
+  std::set<string> op_types_;
+  int64 kernel_exec_micros_;
+  int64 requested_bytes_;
+  int64 float_ops_;
+
+  std::set<string> devices_;
+  std::vector<std::vector<int64>> shapes_;
+  std::map<string, const TFGraphNode*> nodes_;
+  std::map<string, std::unique_ptr<TFCodeNode>> children_;
+};
 }  // namespace tfprof
 }  // namespace tensorflow
 
diff --git a/tensorflow/tools/tfprof/internal/tfprof_node_show.cc b/tensorflow/tools/tfprof/internal/tfprof_node_show.cc
new file mode 100644
index 0000000000000000000000000000000000000000..2b5390676dd0d3ee06c0ff7e1dfa01079a118235
--- /dev/null
+++ b/tensorflow/tools/tfprof/internal/tfprof_node_show.cc
@@ -0,0 +1,296 @@
+/* Copyright 2016 The TensorFlow Authors All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/tools/tfprof/internal/tfprof_node_show.h"
+
+#include "tensorflow/core/lib/strings/str_util.h"
+#include "tensorflow/core/lib/strings/stringprintf.h"
+
+namespace tensorflow {
+namespace tfprof {
+ShowNode::ShowNode(const TFGraphNode* node) : node(node), account(true) {
+  ReInit();
+}
+
+void ShowNode::ReInit() {
+  mutable_proto()->set_name(name());
+  for (const string& device : node->devices()) {
+    *mutable_proto()->mutable_devices()->Add() = device;
+  }
+  mutable_proto()->set_exec_micros(node->kernel_exec_micros());
+  mutable_proto()->set_requested_bytes(node->requested_bytes());
+  mutable_proto()->set_float_ops(node->float_ops());
+
+  if (!node->shape().empty()) {
+    int64 params = 1;
+    bool complete_shape = true;
+    for (int64 d : node->shape()) {
+      // Sometimes parameters could be <0 when a dim is unknown.
+      if (d < 0) {
+        complete_shape = false;
+        break;
+      }
+      params *= d;
+    }
+    if (complete_shape) {
+      mutable_proto()->set_parameters(proto_.parameters() + params);
+    } else {
+      fprintf(stderr, "Incomplete shape.");
+    }
+  }
+}
+
+string ShowNode::Format(const Options& opts) {
+  if (opts.select.empty()) {
+    return name();
+  }
+  return strings::Printf("%s (%s)", name().c_str(), FormatMeta(opts).c_str());
+}
+
+string ShowNode::FormatMeta(const Options& opts) {
+  std::vector<string> info;
+  if (opts.select.find(kShown[2]) != opts.select.end()) {
+    const string shape = FormatShapes(node->shape());
+    if (!shape.empty()) {
+      info.push_back(shape);
+    }
+    string params = FormatNumber(proto().total_parameters()) + " params";
+    if (account) {
+      params = FormatNumber(proto().parameters()) + "/" + params;
+    } else {
+      params = "--/" + params;
+    }
+    info.push_back(params);
+  }
+  if (opts.select.find(kShown[3]) != opts.select.end()) {
+    string fops = FormatNumber(proto().total_float_ops()) + " flops";
+    if (account) {
+      fops = FormatNumber(proto().float_ops()) + "/" + fops;
+    } else {
+      fops = "--/" + fops;
+    }
+    info.push_back(fops);
+  }
+  if (opts.select.find(kShown[0]) != opts.select.end()) {
+    string memory = FormatMemory(proto().total_requested_bytes());
+    if (account) {
+      memory = FormatMemory(proto().requested_bytes()) + "/" + memory;
+
+    } else {
+      memory = "--/" + memory;
+    }
+    info.push_back(memory);
+  }
+  if (opts.select.find(kShown[1]) != opts.select.end()) {
+    string time = FormatTime(proto().total_exec_micros());
+    if (account) {
+      time = FormatTime(proto().exec_micros()) + "/" + time;
+    } else {
+      time = "--/" + time;
+    }
+    info.push_back(time);
+  }
+  if (opts.select.find(kShown[6]) != opts.select.end()) {
+    if (proto().devices_size() > 0) {
+      info.push_back(str_util::Join(proto().devices(), "|"));
+    }
+  }
+  if (opts.select.find(kShown[7]) != opts.select.end()) {
+    std::set<string> op_types = node->op_types();
+    // Device is considered a type.
+    if (proto().devices_size() > 0) {
+      op_types.insert(str_util::Join(proto().devices(), "|"));
+    }
+    info.push_back(str_util::Join(op_types, "|"));
+  }
+  return str_util::Join(info, ", ");
+}
+
+TFGraphNodeProto* ShowNode::mutable_proto() { return &proto_; }
+
+const TFGraphNodeProto& ShowNode::proto() const { return proto_; }
+
+void ShowNode::AggregateTotalStats(ShowNode* node) {
+  TFGraphNodeProto* node_pb = node->mutable_proto();
+  mutable_proto()->set_total_exec_micros(proto().total_exec_micros() +
+                                         node_pb->total_exec_micros());
+  mutable_proto()->set_total_requested_bytes(proto().total_requested_bytes() +
+                                             node_pb->total_requested_bytes());
+  mutable_proto()->set_total_parameters(proto().total_parameters() +
+                                        node_pb->total_parameters());
+  mutable_proto()->set_total_float_ops(proto().total_float_ops() +
+                                       node_pb->total_float_ops());
+}
+
+void ShowNode::AddSelfToTotalStats() {
+  mutable_proto()->set_total_exec_micros(proto().total_exec_micros() +
+                                         proto().exec_micros());
+  mutable_proto()->set_total_requested_bytes(proto().total_requested_bytes() +
+                                             proto().requested_bytes());
+  mutable_proto()->set_total_parameters(proto().total_parameters() +
+                                        proto().parameters());
+  mutable_proto()->set_total_float_ops(proto().total_float_ops() +
+                                       proto().float_ops());
+}
+
+void ShowNode::ResetTotalStats() {
+  mutable_proto()->set_total_exec_micros(0);
+  mutable_proto()->set_total_requested_bytes(0);
+  mutable_proto()->set_total_parameters(0);
+  mutable_proto()->set_total_float_ops(0);
+  mutable_proto()->mutable_children()->Clear();
+}
+
+ShowCodeNode::ShowCodeNode(const TFCodeNode* node) : node(node), account(true) {
+  std::vector<ScopeNode> snodes;
+  for (auto it : node->graph_nodes()) {
+    ScopeNode snode(it.second);
+    snodes.push_back(snode);
+    snodes[snodes.size() - 1].AddSelfToTotalStats();
+    *mutable_proto()->mutable_graph_nodes()->Add() =
+        snodes[snodes.size() - 1].proto();
+  }
+
+  mutable_proto()->set_name(name());
+  mutable_proto()->set_exec_micros(node->kernel_exec_micros());
+  mutable_proto()->set_requested_bytes(node->requested_bytes());
+  mutable_proto()->set_float_ops(node->float_ops());
+
+  if (!node->shapes().empty()) {
+    for (const std::vector<int64>& shape : node->shapes()) {
+      int64 params = 1;
+      bool complete_shape = true;
+      for (int64 d : shape) {
+        // Sometimes parameters could be <0 when a dim is unknown.
+        if (d < 0) {
+          complete_shape = false;
+          break;
+        }
+        params *= d;
+      }
+      if (complete_shape) {
+        mutable_proto()->set_parameters(proto().parameters() + params);
+      } else {
+        fprintf(stderr, "Incomplete shape.");
+      }
+    }
+  }
+}
+
+string ShowCodeNode::Format(const Options& opts) {
+  if (opts.select.empty()) {
+    return name();
+  }
+  return strings::Printf("%s (%s)", name().c_str(), FormatMeta(opts).c_str());
+}
+
+string ShowCodeNode::FormatMeta(const Options& opts) {
+  std::vector<string> info;
+  std::vector<string> shapes;
+  if (opts.select.find(kShown[2]) != opts.select.end()) {
+    for (const std::vector<int64>& shape : node->shapes()) {
+      if (!shape.empty()) {
+        shapes.push_back(FormatShapes(shape));
+      }
+    }
+    if (!shapes.empty()) {
+      info.push_back(str_util::Join(shapes, "|"));
+    }
+    string params = FormatNumber(proto().total_parameters()) + " params";
+    if (account) {
+      params = FormatNumber(proto().parameters()) + "/" + params;
+    } else {
+      params = "--/" + params;
+    }
+    info.push_back(params);
+  }
+  if (opts.select.find(kShown[3]) != opts.select.end()) {
+    string fops = FormatNumber(proto().total_float_ops()) + " flops";
+    if (account) {
+      fops = FormatNumber(proto().float_ops()) + "/" + fops;
+    } else {
+      fops = "--/" + fops;
+    }
+    info.push_back(fops);
+  }
+  if (opts.select.find(kShown[0]) != opts.select.end()) {
+    string memory = FormatMemory(proto().total_requested_bytes());
+    if (account) {
+      memory = FormatMemory(proto().requested_bytes()) + "/" + memory;
+
+    } else {
+      memory = "--/" + memory;
+    }
+    info.push_back(memory);
+  }
+  if (opts.select.find(kShown[1]) != opts.select.end()) {
+    string time = FormatTime(proto().total_exec_micros());
+    if (account) {
+      time = FormatTime(proto().exec_micros()) + "/" + time;
+    } else {
+      time = "--/" + time;
+    }
+    info.push_back(time);
+  }
+  if (opts.select.find(kShown[6]) != opts.select.end()) {
+    if (!node->devices().empty()) {
+      info.push_back(str_util::Join(node->devices(), "|"));
+    }
+  }
+  if (opts.select.find(kShown[7]) != opts.select.end()) {
+    std::set<string> op_types = node->op_types();
+    // Device is considered a type.
+    op_types.insert(node->devices().cbegin(), node->devices().cend());
+    info.push_back(str_util::Join(op_types, "|"));
+  }
+  return str_util::Join(info, ", ");
+}
+
+TFCodeNodeProto* ShowCodeNode::mutable_proto() { return &proto_; }
+
+const TFCodeNodeProto& ShowCodeNode::proto() const { return proto_; }
+
+void ShowCodeNode::AggregateTotalStats(ShowCodeNode* node) {
+  TFCodeNodeProto* node_pb = node->mutable_proto();
+  mutable_proto()->set_total_exec_micros(proto().total_exec_micros() +
+                                         node_pb->total_exec_micros());
+  mutable_proto()->set_total_requested_bytes(proto().total_requested_bytes() +
+                                             node_pb->total_requested_bytes());
+  mutable_proto()->set_total_parameters(proto().total_parameters() +
+                                        node_pb->total_parameters());
+  mutable_proto()->set_total_float_ops(proto().total_float_ops() +
+                                       node_pb->total_float_ops());
+}
+
+void ShowCodeNode::AddSelfToTotalStats() {
+  mutable_proto()->set_total_exec_micros(proto().total_exec_micros() +
+                                         proto().exec_micros());
+  mutable_proto()->set_total_requested_bytes(proto().total_requested_bytes() +
+                                             proto().requested_bytes());
+  mutable_proto()->set_total_parameters(proto().total_parameters() +
+                                        proto().parameters());
+  mutable_proto()->set_total_float_ops(proto().total_float_ops() +
+                                       proto().float_ops());
+}
+
+void ShowCodeNode::ResetTotalStats() {
+  mutable_proto()->set_total_exec_micros(0);
+  mutable_proto()->set_total_requested_bytes(0);
+  mutable_proto()->set_total_parameters(0);
+  mutable_proto()->set_total_float_ops(0);
+  mutable_proto()->mutable_children()->Clear();
+}
+
+}  // namespace tfprof
+}  // namespace tensorflow
diff --git a/tensorflow/tools/tfprof/internal/tfprof_node_show.h b/tensorflow/tools/tfprof/internal/tfprof_node_show.h
new file mode 100644
index 0000000000000000000000000000000000000000..4ce0f63f9b572d3f35dbf022a688bbf3189487cb
--- /dev/null
+++ b/tensorflow/tools/tfprof/internal/tfprof_node_show.h
@@ -0,0 +1,173 @@
+/* Copyright 2016 The TensorFlow Authors All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// Nodes used for different views.
+// ScopeNode is for scope view. GraphNode is for graph view and CodeNode
+// is for code view.
+
+#ifndef THIRD_PARTY_TENSORFLOW_TOOLS_TFPROF_INTERNAL_TFPROF_NODE_SHOW_H_
+#define THIRD_PARTY_TENSORFLOW_TOOLS_TFPROF_INTERNAL_TFPROF_NODE_SHOW_H_
+
+#include <algorithm>
+#include <string>
+#include <vector>
+
+#include "tensorflow/core/framework/graph.pb.h"
+#include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/tools/tfprof/internal/tfprof_constants.h"
+#include "tensorflow/tools/tfprof/internal/tfprof_node.h"
+#include "tensorflow/tools/tfprof/internal/tfprof_options.h"
+#include "tensorflow/tools/tfprof/internal/tfprof_utils.h"
+#include "tensorflow/tools/tfprof/tfprof_output.pb.h"
+
+namespace tensorflow {
+namespace tfprof {
+
+class ShowNode {
+ public:
+  explicit ShowNode(const TFGraphNode* node);
+  virtual ~ShowNode() {}
+
+  const string& name() const { return node->name(); }
+  TFGraphNodeProto* mutable_proto();
+  const TFGraphNodeProto& proto() const;
+
+  void ReInit();
+
+  string Format(const Options& opts);
+
+  string FormatMeta(const Options& opts);
+
+  const TFGraphNode* node;
+  bool account;
+  string formatted_str;
+
+ protected:
+  void AggregateTotalStats(ShowNode* node);
+
+  void AddSelfToTotalStats();
+
+  void ResetTotalStats();
+
+  TFGraphNodeProto proto_;
+};
+
+class GraphNode : public ShowNode {
+ public:
+  explicit GraphNode(TFGraphNode* node) : ShowNode(node) {
+    mutable_proto()->set_inputs(node->inputs().size());
+    mutable_proto()->set_total_inputs(0);
+  }
+
+  void ReInit() {
+    ShowNode::ReInit();
+    mutable_proto()->set_inputs(node->inputs().size());
+    mutable_proto()->set_total_inputs(0);
+  }
+
+  void AggregateTotalStats(GraphNode* node) {
+    ShowNode::AggregateTotalStats(node);
+    mutable_proto()->set_total_inputs(proto().total_inputs() +
+                                      node->proto().total_inputs() + 1);
+  }
+
+  void AddSelfToTotalStats() {
+    ShowNode::AddSelfToTotalStats();
+    mutable_proto()->set_total_inputs(proto().total_inputs() +
+                                      proto().inputs());
+  }
+
+  void ResetTotalStats() {
+    ShowNode::ResetTotalStats();
+    mutable_proto()->set_total_inputs(0);
+    show_children.clear();
+  }
+
+  std::vector<GraphNode*> children;
+  std::vector<GraphNode*> show_children;
+};
+
+class ScopeNode : public ShowNode {
+ public:
+  explicit ScopeNode(const TFGraphNode* node) : ShowNode(node) {}
+  ~ScopeNode() override {}
+
+  void ReInit() { ShowNode::ReInit(); }
+
+  void AggregateTotalStats(ScopeNode* node) {
+    ShowNode::AggregateTotalStats(node);
+  }
+
+  void AddSelfToTotalStats() { ShowNode::AddSelfToTotalStats(); }
+
+  void ResetTotalStats() {
+    ShowNode::ResetTotalStats();
+    show_children.clear();
+  }
+
+  std::vector<ScopeNode*> children;
+  std::vector<ScopeNode*> show_children;
+};
+
+class ShowCodeNode {
+ public:
+  explicit ShowCodeNode(const TFCodeNode* node);
+  virtual ~ShowCodeNode() {}
+
+  const string& name() const { return node->name(); }
+  TFCodeNodeProto* mutable_proto();
+  const TFCodeNodeProto& proto() const;
+
+  string Format(const Options& opts);
+
+  string FormatMeta(const Options& opts);
+
+  const TFCodeNode* node;
+  bool account;
+  string formatted_str;
+
+ protected:
+  void AggregateTotalStats(ShowCodeNode* node);
+
+  void AddSelfToTotalStats();
+
+  void ResetTotalStats();
+
+  TFCodeNodeProto proto_;
+};
+
+class CodeNode : public ShowCodeNode {
+ public:
+  explicit CodeNode(const TFCodeNode* node) : ShowCodeNode(node) {}
+  ~CodeNode() override {}
+
+  void AggregateTotalStats(CodeNode* node) {
+    ShowCodeNode::AggregateTotalStats(node);
+  }
+
+  void AddSelfToTotalStats() { ShowCodeNode::AddSelfToTotalStats(); }
+
+  void ResetTotalStats() {
+    ShowCodeNode::ResetTotalStats();
+    show_children.clear();
+  }
+
+  std::vector<CodeNode*> children;
+  std::vector<CodeNode*> show_children;
+};
+}  // namespace tfprof
+}  // namespace tensorflow
+
+#endif  // THIRD_PARTY_TENSORFLOW_TOOLS_TFPROF_INTERNAL_TFPROF_NODE_SHOW_H_
diff --git a/tensorflow/tools/tfprof/internal/tfprof_options.cc b/tensorflow/tools/tfprof/internal/tfprof_options.cc
index 03282533ffd4518e4c44bddfd31bbbdb18e0f9ab..f592a4cf8cf7435b63b27f46e4961741d47aa1c9 100644
--- a/tensorflow/tools/tfprof/internal/tfprof_options.cc
+++ b/tensorflow/tools/tfprof/internal/tfprof_options.cc
@@ -17,16 +17,133 @@ limitations under the License.
 
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/strings/str_util.h"
+#include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/lib/strings/stringprintf.h"
 #include "tensorflow/tools/tfprof/tfprof_options.pb.h"
 
 namespace tensorflow {
 namespace tfprof {
+namespace {
+string KeyValueToStr(const std::map<string, string>& kv_map) {
+  std::vector<string> kv_vec;
+  kv_vec.reserve(kv_map.size());
+  for (const auto& pair : kv_map) {
+    kv_vec.push_back(strings::StrCat(pair.first, "=", pair.second));
+  }
+  return str_util::Join(kv_vec, ",");
+}
+}  // namespace
+
+tensorflow::Status ParseOutput(const string& output_opt, string* output_type,
+                               std::map<string, string>* output_options) {
+  // The default is to use stdout.
+  if (output_opt.empty()) {
+    *output_type = kOutput[1];
+    return tensorflow::Status::OK();
+  }
+
+  std::set<string> output_types(kOutput,
+                                kOutput + sizeof(kOutput) / sizeof(*kOutput));
+  auto opt_split = output_opt.find(":");
+  std::vector<string> kv_split;
+  if (opt_split == output_opt.npos) {
+    if (output_types.find(output_opt) == output_types.end()) {
+      return tensorflow::Status(
+          tensorflow::error::INVALID_ARGUMENT,
+          strings::Printf("E.g. Unknown output type: %s, Valid types: %s\n",
+                          output_opt.c_str(),
+                          str_util::Join(output_types, ",").c_str()));
+    }
+    *output_type = output_opt;
+  } else {
+    *output_type = output_opt.substr(0, opt_split);
+    if (output_types.find(*output_type) == output_types.end()) {
+      return tensorflow::Status(
+          tensorflow::error::INVALID_ARGUMENT,
+          strings::Printf("E.g. Unknown output type: %s, Valid types: %s\n",
+                          output_type->c_str(),
+                          str_util::Join(output_types, ",").c_str()));
+    }
+    kv_split = str_util::Split(output_opt.substr(opt_split + 1), ",",
+                               str_util::SkipEmpty());
+  }
 
-Options Options::FromProtoStr(const string& opts_proto_str) {
+  std::set<string> valid_options;
+  std::set<string> required_options;
+  if (*output_type == kOutput[0]) {
+    valid_options.insert(
+        kTimelineOpts,
+        kTimelineOpts + sizeof(kTimelineOpts) / sizeof(*kTimelineOpts));
+    required_options.insert(
+        kTimelineRequiredOpts,
+        kTimelineRequiredOpts +
+            sizeof(kTimelineRequiredOpts) / sizeof(*kTimelineRequiredOpts));
+  } else if (*output_type == kOutput[2]) {
+    valid_options.insert(kFileOpts,
+                         kFileOpts + sizeof(kFileOpts) / sizeof(*kFileOpts));
+    required_options.insert(kFileRequiredOpts,
+                            kFileRequiredOpts + sizeof(kFileRequiredOpts) /
+                                                    sizeof(*kFileRequiredOpts));
+  }
+
+  for (const string& kv_str : kv_split) {
+    const std::vector<string> kv =
+        str_util::Split(kv_str, "=", str_util::SkipEmpty());
+    if (kv.size() != 2) {
+      return tensorflow::Status(
+          tensorflow::error::INVALID_ARGUMENT,
+          "Visualize format: -output timeline:key=value,key=value,...");
+    }
+    if (valid_options.find(kv[0]) == valid_options.end()) {
+      return tensorflow::Status(
+          tensorflow::error::INVALID_ARGUMENT,
+          strings::Printf("Unrecognized options %s for output_type: %s\n",
+                          kv[0].c_str(), output_type->c_str()));
+    }
+    (*output_options)[kv[0]] = kv[1];
+  }
+
+  for (const string& opt : required_options) {
+    if (output_options->find(opt) == output_options->end()) {
+      return tensorflow::Status(
+          tensorflow::error::INVALID_ARGUMENT,
+          strings::Printf("Missing required output_options for %s\n"
+                          "E.g. -output %s:%s=...\n",
+                          output_type->c_str(), output_type->c_str(),
+                          opt.c_str()));
+    }
+  }
+  return tensorflow::Status::OK();
+}
+
+tensorflow::Status Options::FromProtoStr(const string& opts_proto_str,
+                                         Options* opts) {
   OptionsProto opts_pb;
-  CHECK(opts_pb.ParseFromString(opts_proto_str));
-  Options opts(
+  if (!opts_pb.ParseFromString(opts_proto_str)) {
+    return tensorflow::Status(
+        tensorflow::error::INTERNAL,
+        strings::StrCat("Failed to parse option string from Python API: ",
+                        opts_proto_str));
+  }
+
+  string output_type;
+  std::map<string, string> output_options;
+  tensorflow::Status s =
+      ParseOutput(opts_pb.output(), &output_type, &output_options);
+  if (!s.ok()) return s;
+
+  if (!opts_pb.dump_to_file().empty()) {
+    fprintf(stderr,
+            "-dump_to_file option is deprecated. "
+            "Please use -output file:outfile=<filename>\n");
+    fprintf(stderr, "-output %s is overwritten with -output file:outfile=%s\n",
+            opts_pb.output().c_str(), opts_pb.dump_to_file().c_str());
+    output_type = kOutput[2];
+    output_options.clear();
+    output_options[kFileOpts[0]] = opts_pb.dump_to_file();
+  }
+
+  *opts = Options(
       opts_pb.max_depth(), opts_pb.min_bytes(), opts_pb.min_micros(),
       opts_pb.min_params(), opts_pb.min_float_ops(),
       std::vector<string>(opts_pb.device_regexes().begin(),
@@ -44,8 +161,8 @@ Options Options::FromProtoStr(const string& opts_proto_str) {
                           opts_pb.hide_name_regexes().end()),
       opts_pb.account_displayed_op_only(),
       std::vector<string>(opts_pb.select().begin(), opts_pb.select().end()),
-      opts_pb.viz(), opts_pb.dump_to_file());
-  return opts;
+      output_type, output_options);
+  return tensorflow::Status::OK();
 }
 
 string Options::ToString() const {
@@ -64,8 +181,7 @@ string Options::ToString() const {
       "%-28s%s\n"
       "%-28s%s\n"
       "%-28s%s\n"
-      "%-28s%s\n"
-      "%-28s%s\n",
+      "%-28s%s:%s\n",
       kOptions[0], max_depth, kOptions[1], min_bytes, kOptions[2], min_micros,
       kOptions[3], min_params, kOptions[4], min_float_ops, kOptions[5],
       str_util::Join(device_regexes, ",").c_str(), kOptions[6],
@@ -76,8 +192,8 @@ string Options::ToString() const {
       str_util::Join(show_name_regexes, ",").c_str(), kOptions[11],
       str_util::Join(hide_name_regexes, ",").c_str(), kOptions[12],
       (account_displayed_op_only ? "true" : "false"), kOptions[13],
-      str_util::Join(select, ",").c_str(), kOptions[14],
-      (viz ? "true" : "false"), kOptions[15], dump_to_file.c_str());
+      str_util::Join(select, ",").c_str(), kOptions[14], output_type.c_str(),
+      KeyValueToStr(output_options).c_str());
   return s;
 }
 
diff --git a/tensorflow/tools/tfprof/internal/tfprof_options.h b/tensorflow/tools/tfprof/internal/tfprof_options.h
index a5b55e77fac0818bae927ce0e42110c0eca1c206..cf48b4de8162732c9b3e77f89d1029d4aa62ae0e 100644
--- a/tensorflow/tools/tfprof/internal/tfprof_options.h
+++ b/tensorflow/tools/tfprof/internal/tfprof_options.h
@@ -22,6 +22,7 @@ limitations under the License.
 #include <vector>
 
 #include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/lib/core/status.h"
 
 namespace tensorflow {
 namespace tfprof {
@@ -40,8 +41,7 @@ static const char* const kOptions[] = {
     "-hide_name_regexes",
     "-account_displayed_op_only",
     "-select",
-    "-viz",
-    "-dump_to_file",
+    "-output",
 };
 
 static const char* const kOrderBy[] = {
@@ -55,14 +55,33 @@ static const char* const kShown[] = {
 };
 
 static const char* const kCmds[] = {
-    "scope", "graph", "set", "help",
+    "scope", "graph", "code", "set", "help",
+};
+
+static const char* const kOutput[] = {"timeline", "stdout", "file"};
+
+static const char* const kTimelineOpts[] = {
+    "outfile",
+};
+
+static const char* const kTimelineRequiredOpts[] = {"outfile"};
+
+static const char* const kFileOpts[] = {
+    "outfile",
+};
+
+static const char* const kFileRequiredOpts[] = {
+    "outfile",
 };
 
 struct Options {
  public:
-  static Options FromProtoStr(const string& opts_proto_str);
+  static tensorflow::Status FromProtoStr(const string& opts_proto_str,
+                                         Options* opts);
 
   virtual ~Options() {}
+  Options()
+      : Options(0, 0, 0, 0, 0, {}, "", {}, {}, {}, {}, {}, false, {}, "", {}) {}
   Options(int max_depth, tensorflow::int64 min_bytes,
           tensorflow::int64 min_micros, tensorflow::int64 min_params,
           tensorflow::int64 min_float_ops,
@@ -73,7 +92,8 @@ struct Options {
           const std::vector<string>& show_name_regexes,
           const std::vector<string>& hide_name_regexes,
           bool account_displayed_op_only, const std::vector<string>& select,
-          bool viz, const string& dump_to_file = "")
+          const string& output_type,
+          const std::map<string, string>& output_options)
       : max_depth(max_depth),
         min_bytes(min_bytes),
         min_micros(min_micros),
@@ -88,8 +108,8 @@ struct Options {
         hide_name_regexes(hide_name_regexes),
         account_displayed_op_only(account_displayed_op_only),
         select(select.begin(), select.end()),
-        viz(viz),
-        dump_to_file(dump_to_file) {}
+        output_type(output_type),
+        output_options(output_options) {}
 
   string ToString() const;
 
@@ -109,10 +129,17 @@ struct Options {
   bool account_displayed_op_only;
 
   std::set<string> select;
-  bool viz;
-  string dump_to_file;
+
+  string output_type;
+  std::map<string, string> output_options;
 };
 
+// Parse the -output option.
+// 'output_opt': User input string with format: output_type:key=value,key=value.
+// 'output_type' and 'output_options' are extracted from 'output_opt'.
+tensorflow::Status ParseOutput(const string& output_opt, string* output_type,
+                               std::map<string, string>* output_options);
+
 }  // namespace tfprof
 }  // namespace tensorflow
 
diff --git a/tensorflow/tools/tfprof/internal/tfprof_scope.cc b/tensorflow/tools/tfprof/internal/tfprof_scope.cc
index 949d2d54e42b8683e8bb365e0b23c49feeb686e3..fe525c4bd840d287538096fd6d8fe2347f813991 100644
--- a/tensorflow/tools/tfprof/internal/tfprof_scope.cc
+++ b/tensorflow/tools/tfprof/internal/tfprof_scope.cc
@@ -35,15 +35,15 @@ ScopeNode* TFScope::CreateParentNode(const string& name) {
   node_defs_.back()->set_name(name);
   node_defs_.back()->set_op(kTFScopeParent);
   parent_nodes_[name] =
-      std::unique_ptr<TFNode>(new TFNode(node_defs_.back().get()));
+      std::unique_ptr<TFGraphNode>(new TFGraphNode(node_defs_.back().get()));
   nodes_map_[name] =
       std::unique_ptr<ScopeNode>(new ScopeNode(parent_nodes_[name].get()));
   return nodes_map_[name].get();
 }
 
-void TFScope::AddNode(TFNode* node) {
-  string name = node->node_def()->name();
-  if (nodes_map_.find(node->node_def()->name()) == nodes_map_.end()) {
+void TFScope::AddNode(TFGraphNode* node) {
+  string name = node->name();
+  if (nodes_map_.find(node->name()) == nodes_map_.end()) {
     nodes_map_[name] = std::unique_ptr<ScopeNode>(new ScopeNode(node));
   }
 
@@ -72,7 +72,7 @@ void TFScope::Build() {
   }
 }
 
-const ShowNode* TFScope::ShowInternal(const Options& opts) {
+const ShowNode* TFScope::ShowInternal(const Options& opts, Timeline* timeline) {
   // Search from roots recursively to find start node, if start_name_regexes
   // is specified.
   std::vector<ScopeNode*> roots = roots_;
@@ -86,6 +86,9 @@ const ShowNode* TFScope::ShowInternal(const Options& opts) {
   Account({root}, opts);
 
   root = PrintScope({root}, opts, 1, 0)[0];
+  if (timeline) {
+    timeline->GenerateScopeTimeline(root);
+  }
   return root;
 }
 
@@ -139,8 +142,13 @@ std::vector<ScopeNode*> TFScope::PrintScope(const std::vector<ScopeNode*> roots,
       show_cnodes = SortNodes(show_cnodes, opts);
       string children_str;
       for (ScopeNode* sc : show_cnodes) {
-        children_str += sc->formatted_str;
+        if (opts.output_type == kOutput[1] || opts.output_type == kOutput[2]) {
+          children_str += sc->formatted_str;
+          sc->formatted_str.clear();
+        }
         node->mutable_proto()->add_children()->MergeFrom(sc->proto());
+        sc->mutable_proto()->mutable_children()->Clear();
+        node->show_children.push_back(sc);
         if (opts.account_displayed_op_only) {
           node->AggregateTotalStats(sc);
         }
diff --git a/tensorflow/tools/tfprof/internal/tfprof_scope.h b/tensorflow/tools/tfprof/internal/tfprof_scope.h
index a7c58920a2497377d65d70104d0d5e6c71d1b793..7bdcc794cd054c88859f4cde182b286f5f26db7e 100644
--- a/tensorflow/tools/tfprof/internal/tfprof_scope.h
+++ b/tensorflow/tools/tfprof/internal/tfprof_scope.h
@@ -37,34 +37,19 @@ limitations under the License.
 namespace tensorflow {
 namespace tfprof {
 
-class ScopeNode : public ShowNode {
- public:
-  explicit ScopeNode(TFNode* node) : ShowNode(node) {}
-  ~ScopeNode() override {}
-
-  void AggregateTotalStats(ScopeNode* node) {
-    ShowNode::AggregateTotalStats(node);
-  }
-
-  void AddSelfToTotalStats() { ShowNode::AddSelfToTotalStats(); }
-
-  void ResetTotalStats() { ShowNode::ResetTotalStats(); }
-
-  std::vector<ScopeNode*> children;
-};
-
 class TFScope : public TFShow {
  public:
   explicit TFScope(checkpoint::CheckpointReader* ckpt_reader)
       : TFShow(ckpt_reader) {}
   ~TFScope() override {}
 
-  void AddNode(TFNode* node) override;
+  void AddNode(TFGraphNode* node) override;
 
   void Build() override;
 
  private:
-  const ShowNode* ShowInternal(const Options& opts) override;
+  const ShowNode* ShowInternal(const Options& opts,
+                               Timeline* timeline) override;
 
   ScopeNode* CreateParentNode(const string& name);
 
@@ -79,7 +64,7 @@ class TFScope : public TFShow {
 
   std::vector<ScopeNode*> roots_;
   std::vector<std::unique_ptr<NodeDef>> node_defs_;
-  std::map<string, std::unique_ptr<TFNode>> parent_nodes_;
+  std::map<string, std::unique_ptr<TFGraphNode>> parent_nodes_;
   std::map<string, std::unique_ptr<ScopeNode>> nodes_map_;
 };
 }  // namespace tfprof
diff --git a/tensorflow/tools/tfprof/internal/tfprof_show.cc b/tensorflow/tools/tfprof/internal/tfprof_show.cc
index 08ae82fea43cfb0e94e089e8e2945c969501f17d..b96db5468e71ad6c3ea9169ee2f77b0d00db1c3f 100644
--- a/tensorflow/tools/tfprof/internal/tfprof_show.cc
+++ b/tensorflow/tools/tfprof/internal/tfprof_show.cc
@@ -18,154 +18,32 @@ limitations under the License.
 #include <memory>
 #include <set>
 
-#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/lib/strings/stringprintf.h"
 #include "tensorflow/core/platform/env.h"
 #include "tensorflow/core/platform/regexp.h"
 
 namespace tensorflow {
 namespace tfprof {
-ShowNode::ShowNode(TFNode* node) : node(node), account(true) {
-  mutable_proto()->set_name(name());
-  if (!node->device().empty()) {
-    mutable_proto()->set_device(node->device());
-  }
-  mutable_proto()->set_exec_micros(node->kernel_compute_micros());
-  mutable_proto()->set_requested_bytes(node->requested_byptes());
-  mutable_proto()->set_float_ops(node->float_ops());
-
-  if (!node->shape().empty()) {
-    int64 params = 1;
-    bool complete_shape = true;
-    for (int64 d : node->shape()) {
-      // Sometimes parameters could be <0 when a dim is unknown.
-      if (d < 0) {
-        complete_shape = false;
-        break;
-      }
-      params *= d;
-    }
-    if (complete_shape) {
-      mutable_proto()->set_parameters(proto_.parameters() + params);
-    } else {
-      fprintf(stderr, "Incomplete shape.");
-    }
-  }
-}
-
-string ShowNode::Format(const Options& opts) {
-  if (opts.select.empty()) {
-    return name();
-  }
-  return strings::Printf("%s (%s)", name().c_str(), FormatMeta(opts).c_str());
-}
-
-string ShowNode::FormatMeta(const Options& opts) {
-  std::vector<string> info;
-  if (opts.select.find(kShown[2]) != opts.select.end()) {
-    const string shape = FormatShapes(node->shape());
-    if (!shape.empty()) {
-      info.push_back(shape);
-    }
-    string params = FormatNumber(proto().total_parameters()) + " params";
-    if (account) {
-      params = FormatNumber(proto().parameters()) + "/" + params;
-    } else {
-      params = "--/" + params;
-    }
-    info.push_back(params);
-  }
-  if (opts.select.find(kShown[3]) != opts.select.end()) {
-    string fops = FormatNumber(proto().total_float_ops()) + " flops";
-    if (account) {
-      fops = FormatNumber(proto().float_ops()) + "/" + fops;
-    } else {
-      fops = "--/" + fops;
-    }
-    info.push_back(fops);
-  }
-  if (opts.select.find(kShown[0]) != opts.select.end()) {
-    string memory = FormatMemory(proto().total_requested_bytes());
-    if (account) {
-      memory = FormatMemory(proto().requested_bytes()) + "/" + memory;
-
-    } else {
-      memory = "--/" + memory;
-    }
-    info.push_back(memory);
-  }
-  if (opts.select.find(kShown[1]) != opts.select.end()) {
-    string time = FormatTime(proto().total_exec_micros());
-    if (account) {
-      time = FormatTime(proto().exec_micros()) + "/" + time;
-    } else {
-      time = "--/" + time;
-    }
-    info.push_back(time);
-  }
-  if (opts.select.find(kShown[6]) != opts.select.end()) {
-    if (!proto().device().empty()) {
-      info.push_back(proto().device());
-    }
-  }
-  if (opts.select.find(kShown[7]) != opts.select.end()) {
-    std::set<string> op_types = node->op_types();
-    // Device is considered a type.
-    if (!proto().device().empty()) {
-      op_types.insert(proto().device());
-    }
-    info.push_back(str_util::Join(op_types, "|"));
-  }
-  return str_util::Join(info, ", ");
-}
-
-TFProfNode* ShowNode::mutable_proto() { return &proto_; }
-
-const TFProfNode& ShowNode::proto() const { return proto_; }
-
-void ShowNode::AggregateTotalStats(ShowNode* node) {
-  TFProfNode* node_pb = node->mutable_proto();
-  mutable_proto()->set_total_exec_micros(proto().total_exec_micros() +
-                                         node_pb->total_exec_micros());
-  mutable_proto()->set_total_requested_bytes(proto().total_requested_bytes() +
-                                             node_pb->total_requested_bytes());
-  mutable_proto()->set_total_parameters(proto().total_parameters() +
-                                        node_pb->total_parameters());
-  mutable_proto()->set_total_float_ops(proto().total_float_ops() +
-                                       node_pb->total_float_ops());
-}
-
-void ShowNode::AddSelfToTotalStats() {
-  mutable_proto()->set_total_exec_micros(proto().total_exec_micros() +
-                                         proto().exec_micros());
-  mutable_proto()->set_total_requested_bytes(proto().total_requested_bytes() +
-                                             proto().requested_bytes());
-  mutable_proto()->set_total_parameters(proto().total_parameters() +
-                                        proto().parameters());
-  mutable_proto()->set_total_float_ops(proto().total_float_ops() +
-                                       proto().float_ops());
-}
 
-void ShowNode::ResetTotalStats() {
-  mutable_proto()->set_total_exec_micros(0);
-  mutable_proto()->set_total_requested_bytes(0);
-  mutable_proto()->set_total_parameters(0);
-  mutable_proto()->set_total_float_ops(0);
-}
-
-const TFProfNode& TFShow::Show(const Options& opts) {
-  const ShowNode* root = ShowInternal(opts);
-  if (opts.dump_to_file.empty()) {
-    printf("%s", root->formatted_str.c_str());
-    fflush(stdout);
-  } else {
-    Status s = WriteStringToFile(Env::Default(), opts.dump_to_file,
-                                 root->formatted_str);
+const TFGraphNodeProto& TFShow::Show(const Options& opts) {
+  if (opts.output_type == kOutput[0]) {
+    Timeline timeline(opts.output_options.at(kTimelineOpts[0]));
+    return ShowInternal(opts, &timeline)->proto();
+  } else if (opts.output_type == kOutput[2]) {
+    const ShowNode* root = ShowInternal(opts, nullptr);
+    Status s =
+        WriteStringToFile(Env::Default(), opts.output_options.at(kFileOpts[0]),
+                          root->formatted_str);
     if (!s.ok()) {
       fprintf(stderr, "%s\n", s.ToString().c_str());
     }
+    return root->proto();
+  } else {
+    const ShowNode* root = ShowInternal(opts, nullptr);
+    printf("%s", root->formatted_str.c_str());
+    fflush(stdout);
+    return root->proto();
   }
-  return root->proto();
 }
 
 bool TFShow::LookUpCheckPoint(const string& name,
@@ -205,10 +83,13 @@ bool TFShow::ShouldShow(ShowNode* node, const Options& opts, int depth) {
     show = true;
   } else {
     for (const string& regex : opts.device_regexes) {
-      if (RE2::FullMatch(node->proto().device(), regex)) {
-        show = true;
-        break;
+      for (const string& device : node->proto().devices()) {
+        if (RE2::FullMatch(device, regex)) {
+          show = true;
+          break;
+        }
       }
+      if (show) break;
     }
   }
   // Don't show if device_regexes don't cover it.
@@ -254,11 +135,11 @@ bool TFShow::ShouldAccount(ShowNode* node, const Options& opts) {
         return true;
       }
     }
-    if (RE2::FullMatch(node->proto().device(), regex)) {
-      return true;
-    }
+    for (const string& device : node->proto().devices())
+      if (RE2::FullMatch(device, regex)) {
+        return true;
+      }
   }
-
   return false;
 }
 
diff --git a/tensorflow/tools/tfprof/internal/tfprof_show.h b/tensorflow/tools/tfprof/internal/tfprof_show.h
index a17358bb6b4b95ef1f28678529a37c9517c28c4a..803b3010442f0c3f607cd3fd30b000a7c0838f8f 100644
--- a/tensorflow/tools/tfprof/internal/tfprof_show.h
+++ b/tensorflow/tools/tfprof/internal/tfprof_show.h
@@ -28,51 +28,27 @@ limitations under the License.
 #include "tensorflow/core/lib/strings/stringprintf.h"
 #include "tensorflow/tools/tfprof/internal/tfprof_constants.h"
 #include "tensorflow/tools/tfprof/internal/tfprof_node.h"
+#include "tensorflow/tools/tfprof/internal/tfprof_node_show.h"
 #include "tensorflow/tools/tfprof/internal/tfprof_options.h"
 #include "tensorflow/tools/tfprof/internal/tfprof_tensor.h"
+#include "tensorflow/tools/tfprof/internal/tfprof_timeline.h"
 #include "tensorflow/tools/tfprof/internal/tfprof_utils.h"
 #include "tensorflow/tools/tfprof/tfprof_output.pb.h"
 
 namespace tensorflow {
 namespace tfprof {
-class ShowNode {
- public:
-  explicit ShowNode(TFNode* node);
-  virtual ~ShowNode() {}
-
-  const string& name() const { return node->node_def()->name(); }
-  TFProfNode* mutable_proto();
-  const TFProfNode& proto() const;
-
-  string Format(const Options& opts);
-
-  string FormatMeta(const Options& opts);
-
-  TFNode* node;
-  bool account;
-  string formatted_str;
-
- protected:
-  void AggregateTotalStats(ShowNode* node);
-
-  void AddSelfToTotalStats();
-
-  void ResetTotalStats();
-
-  TFProfNode proto_;
-};
-
 class TFShow {
  public:
   explicit TFShow(checkpoint::CheckpointReader* ckpt_reader)
       : ckpt_reader_(ckpt_reader) {}
   virtual ~TFShow() {}
-  virtual void AddNode(TFNode* node) = 0;
+  virtual void AddNode(TFGraphNode* node) = 0;
   virtual void Build() = 0;
-  const TFProfNode& Show(const Options& opts);
+  const TFGraphNodeProto& Show(const Options& opts);
 
  protected:
-  virtual const ShowNode* ShowInternal(const Options& opts) = 0;
+  virtual const ShowNode* ShowInternal(const Options& opts,
+                                       Timeline* timeline) = 0;
 
   bool LookUpCheckPoint(const string& name,
                         std::unique_ptr<TFProfTensor>* tensor);
diff --git a/tensorflow/tools/tfprof/internal/tfprof_show_code.cc b/tensorflow/tools/tfprof/internal/tfprof_show_code.cc
new file mode 100644
index 0000000000000000000000000000000000000000..cfec09ad193a46c35cf452d1dc50f5a38235ae72
--- /dev/null
+++ b/tensorflow/tools/tfprof/internal/tfprof_show_code.cc
@@ -0,0 +1,140 @@
+/* Copyright 2016 The TensorFlow Authors All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/tools/tfprof/internal/tfprof_show_code.h"
+
+#include <memory>
+#include <set>
+
+#include "tensorflow/core/lib/strings/str_util.h"
+#include "tensorflow/core/lib/strings/stringprintf.h"
+#include "tensorflow/core/platform/env.h"
+#include "tensorflow/core/platform/regexp.h"
+#include "tensorflow/tools/tfprof/internal/tfprof_scope.h"
+
+namespace tensorflow {
+namespace tfprof {
+
+const TFCodeNodeProto& TFShowCode::Show(const Options& opts) {
+  if (opts.output_type == kOutput[0]) {
+    Timeline timeline(opts.output_options.at(kTimelineOpts[0]));
+    return ShowInternal(opts, &timeline)->proto();
+  } else if (opts.output_type == kOutput[2]) {
+    const ShowCodeNode* root = ShowInternal(opts, nullptr);
+    Status s =
+        WriteStringToFile(Env::Default(), opts.output_options.at(kFileOpts[0]),
+                          root->formatted_str);
+    if (!s.ok()) {
+      fprintf(stderr, "%s\n", s.ToString().c_str());
+    }
+    return root->proto();
+  } else {
+    const ShowCodeNode* root = ShowInternal(opts, nullptr);
+    printf("%s", root->formatted_str.c_str());
+    fflush(stdout);
+    return root->proto();
+  }
+}
+
+bool TFShowCode::ShouldShow(ShowCodeNode* node, const Options& opts,
+                            int depth) {
+  // Always show kTFProfRoot.
+  if (node->name() == kTFProfRoot) return true;
+
+  if (!node->account) return false;
+  // TODO(xpan): Think more carefully about node filtering in code view.
+  // Unlike graph/scope view, which users want to see the exact leaf op.
+  // In code view, users want to see the middle code traces they wrote.
+  //
+  // This is a subtle difference from scope/graph view. Usually mostly
+  // want to see the middle code traces (i.e. their own codes.), instead
+  // of the TensorFlow internal codes traces.
+  if (node->proto().total_requested_bytes() < opts.min_bytes ||
+      node->proto().total_exec_micros() < opts.min_micros ||
+      node->proto().total_parameters() < opts.min_params ||
+      node->proto().total_float_ops() < opts.min_float_ops ||
+      depth > opts.max_depth || !ShouldShowIfExtra(node, opts, depth)) {
+    return false;
+  }
+
+  bool show = false;
+  if (opts.device_regexes.size() == 1 && opts.device_regexes[0] == ".*") {
+    show = true;
+  } else {
+    for (const string& regex : opts.device_regexes) {
+      for (const string& device : node->node->devices()) {
+        if (RE2::FullMatch(device, regex)) {
+          show = true;
+          break;
+        }
+      }
+      if (show) break;
+    }
+  }
+  // Don't show if device_regexes don't cover it.
+  if (!show) return false;
+
+  show = false;
+  if (opts.show_name_regexes.size() == 1 && opts.show_name_regexes[0] == ".*") {
+    show = true;
+  } else {
+    for (const string& regex : opts.show_name_regexes) {
+      if (RE2::FullMatch(node->name(), regex)) {
+        show = true;
+        break;
+      }
+    }
+  }
+  // Don't show if show_name_regexes don't cover it.
+  if (!show) return false;
+  // Don't show if hide_name_regexes cover it.
+  for (const string& regex : opts.hide_name_regexes) {
+    if (RE2::FullMatch(node->name(), regex)) return false;
+  }
+  return true;
+}
+
+bool TFShowCode::ShouldTrim(ShowCodeNode* node,
+                            const std::vector<string>& regexes) {
+  for (const string& regex : regexes) {
+    if (RE2::FullMatch(node->name(), regex)) {
+      return true;
+    }
+  }
+  return false;
+}
+
+bool TFShowCode::ShouldAccount(ShowCodeNode* node, const Options& opts) {
+  if (opts.account_type_regexes.size() == 1 &&
+      opts.account_type_regexes[0] == ".*") {
+    return true;
+  }
+  for (const string& regex : opts.account_type_regexes) {
+    for (const string& type : node->node->op_types()) {
+      if (RE2::FullMatch(type, regex)) {
+        return true;
+      }
+    }
+    for (const string& device : node->node->devices()) {
+      if (RE2::FullMatch(device, regex)) {
+        return true;
+      }
+    }
+  }
+  return false;
+}
+
+}  // namespace tfprof
+}  // namespace tensorflow
diff --git a/tensorflow/tools/tfprof/internal/tfprof_show_code.h b/tensorflow/tools/tfprof/internal/tfprof_show_code.h
new file mode 100644
index 0000000000000000000000000000000000000000..cbfd38945fc669c3ab3dafbec584cb723f7f24cc
--- /dev/null
+++ b/tensorflow/tools/tfprof/internal/tfprof_show_code.h
@@ -0,0 +1,103 @@
+/* Copyright 2016 The TensorFlow Authors All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// Parent class and utilities for tfprof_code.
+
+#ifndef THIRD_PARTY_TENSORFLOW_TOOLS_TFPROF_INTERNAL_TFPROF_SHOW_CODE_H_
+#define THIRD_PARTY_TENSORFLOW_TOOLS_TFPROF_INTERNAL_TFPROF_SHOW_CODE_H_
+
+#include <algorithm>
+#include <string>
+#include <vector>
+
+#include "tensorflow/c/checkpoint_reader.h"
+#include "tensorflow/core/framework/graph.pb.h"
+#include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/lib/strings/stringprintf.h"
+#include "tensorflow/tools/tfprof/internal/tfprof_constants.h"
+#include "tensorflow/tools/tfprof/internal/tfprof_node.h"
+#include "tensorflow/tools/tfprof/internal/tfprof_node_show.h"
+#include "tensorflow/tools/tfprof/internal/tfprof_options.h"
+#include "tensorflow/tools/tfprof/internal/tfprof_tensor.h"
+#include "tensorflow/tools/tfprof/internal/tfprof_timeline.h"
+#include "tensorflow/tools/tfprof/internal/tfprof_utils.h"
+#include "tensorflow/tools/tfprof/tfprof_output.pb.h"
+
+namespace tensorflow {
+namespace tfprof {
+
+class TFShowCode {
+ public:
+  explicit TFShowCode() {}
+  virtual ~TFShowCode() {}
+  virtual void AddNode(TFGraphNode* node) = 0;
+  virtual void Build() = 0;
+  const TFCodeNodeProto& Show(const Options& opts);
+
+ protected:
+  virtual const ShowCodeNode* ShowInternal(const Options& opts,
+                                           Timeline* timeline) = 0;
+
+  bool LookUpCheckPoint(const string& name,
+                        std::unique_ptr<TFProfTensor>* tensor);
+
+  // Overridden by subclass if extra requirements need to be met.
+  virtual bool ShouldShowIfExtra(ShowCodeNode* node, const Options& opts,
+                                 int depth) {
+    return true;
+  }
+
+  bool ShouldShow(ShowCodeNode* node, const Options& opts, int depth);
+
+  bool ShouldTrim(ShowCodeNode* node, const std::vector<string>& regexes);
+
+  bool ShouldAccount(ShowCodeNode* node, const Options& opts);
+
+  template <typename T>
+  std::vector<T*> SortNodes(const std::vector<T*>& nodes, const Options& opts) {
+    if (opts.order_by.empty() || nodes.empty()) {
+      return nodes;
+    }
+    std::vector<T*> sorted_nodes = nodes;
+    std::sort(sorted_nodes.begin(), sorted_nodes.end(),
+              [&opts](const T* n1, const T* n2) {
+                if (n1->name() == kTFProfRoot) return true;
+                if (n2->name() == kTFProfRoot) return false;
+                bool name_cmp = n1->name() < n2->name();
+                if (opts.order_by == kOrderBy[0]) {
+                  return name_cmp;
+                } else if (opts.order_by == kOrderBy[1]) {
+                  return n1->proto().total_requested_bytes() >
+                         n2->proto().total_requested_bytes();
+                } else if (opts.order_by == kOrderBy[2]) {
+                  return n1->proto().total_exec_micros() >
+                         n2->proto().total_exec_micros();
+                } else if (opts.order_by == kOrderBy[3]) {
+                  return n1->proto().total_parameters() >
+                         n2->proto().total_parameters();
+                } else if (opts.order_by == kOrderBy[4]) {
+                  return n1->proto().total_float_ops() >
+                         n2->proto().total_float_ops();
+                }
+                return name_cmp;
+              });
+    return sorted_nodes;
+  }
+};
+
+}  // namespace tfprof
+}  // namespace tensorflow
+
+#endif  // THIRD_PARTY_TENSORFLOW_TOOLS_TFPROF_INTERNAL_TFPROF_SHOW_CODE_H_
diff --git a/tensorflow/tools/tfprof/internal/tfprof_show_test.cc b/tensorflow/tools/tfprof/internal/tfprof_show_test.cc
index ffaa576639e28435bb0d97537bb262ce5865b6db..f0621c9af0f24c960ff47ab754812a5a10919f7b 100644
--- a/tensorflow/tools/tfprof/internal/tfprof_show_test.cc
+++ b/tensorflow/tools/tfprof/internal/tfprof_show_test.cc
@@ -75,7 +75,7 @@ TEST_F(TFProfShowTest, DumpScopeMode) {
                {"VariableV2"},  // accout_type_regexes
                {".*"}, {""}, {".*"}, {""}, false,
                {"params", "bytes", "micros", "float_ops", "num_hidden_ops"},
-               false, dump_file);
+               "file", {{"outfile", dump_file}});
   tf_stats_->PrintGraph("scope", opts);
 
   string dump_str;
diff --git a/tensorflow/tools/tfprof/internal/tfprof_stats.cc b/tensorflow/tools/tfprof/internal/tfprof_stats.cc
index edc0689d69968f7ebf36c88c36d76ed329b88eeb..566b4cee440e6ced508e19eff2e79ca9df1b8131 100644
--- a/tensorflow/tools/tfprof/internal/tfprof_stats.cc
+++ b/tensorflow/tools/tfprof/internal/tfprof_stats.cc
@@ -19,6 +19,8 @@ limitations under the License.
 #include <utility>
 
 #include "tensorflow/core/framework/step_stats.pb.h"
+#include "tensorflow/core/lib/strings/str_util.h"
+#include "tensorflow/tools/tfprof/internal/tfprof_timeline.h"
 
 namespace tensorflow {
 namespace tfprof {
@@ -56,37 +58,52 @@ TFStats::TFStats(std::unique_ptr<GraphDef> graph,
   printf("Preparing Views...\n");
   scope_view_ = std::unique_ptr<TFScope>(new TFScope(ckpt_reader_.get()));
   graph_view_ = std::unique_ptr<TFGraph>(new TFGraph(ckpt_reader_.get()));
+  code_view_ = std::unique_ptr<TFCode>(new TFCode());
+
   for (auto it = nodes_map_.begin(); it != nodes_map_.end(); it++) {
     scope_view_->AddNode(&it->second);
     graph_view_->AddNode(&it->second);
+    code_view_->AddNode(&it->second);
   }
   scope_view_->Build();
   graph_view_->Build();
+  code_view_->Build();
 }
 
-const TFProfNode& TFStats::PrintGraph(const string& cmd, const Options& opts) {
+const TFGraphNodeProto& TFStats::PrintGraph(const string& cmd,
+                                            const Options& opts) {
   if (cmd == kCmds[0]) {
     return scope_view_->Show(opts);
   } else if (cmd == kCmds[1]) {
     return graph_view_->Show(opts);
   } else {
     fprintf(stderr, "Unknown command: %s\n", cmd.c_str());
-    return empty_node_;
+    return empty_graph_node_;
   }
 }
 
+const TFCodeNodeProto& TFStats::PrintCode(const Options& opts) {
+  return code_view_->Show(opts);
+}
+
 void TFStats::ParseGraph() {
   for (const NodeDef& node : graph_->node()) {
     CHECK(nodes_map_.find(node.name()) == nodes_map_.end());
-    nodes_map_[node.name()] = TFNode(&node);
+    nodes_map_[node.name()] = TFGraphNode(&node);
   }
   for (auto it = nodes_map_.begin(); it != nodes_map_.end(); it++) {
     const NodeDef* node_def = it->second.node_def();
     for (string node_input : node_def->input()) {
+      int output_idx = 0;
       // input name format can be: "^node:src_output"
       auto prefix_pos = node_input.find(":");
       if (prefix_pos != node_input.npos) {
-        node_input.substr(0, prefix_pos);
+        std::vector<string> input_parts = str_util::Split(node_input, ":");
+        CHECK(input_parts.size() == 2)
+            << "Unknown NodeDef.input format: " << node_input;
+        node_input = input_parts[0];
+        CHECK(strings::safe_strto32(input_parts[1], &output_idx))
+            << "Failed to parse integer: " << output_idx;
       }
       if (node_input.substr(0, 1) == "^") {
         node_input = node_input.substr(1);
@@ -95,7 +112,7 @@ void TFStats::ParseGraph() {
       if (input_node == nodes_map_.end()) {
         continue;
       }
-      it->second.AddInput(&input_node->second);
+      it->second.AddInput(&input_node->second, output_idx);
     }
   }
 }
@@ -110,6 +127,9 @@ void TFStats::ParseOpLog() {
     if (entry.float_ops()) {
       node->second.AddFloatOps(entry.float_ops());
     }
+    if (entry.has_code_def()) {
+      node->second.AddCode(&entry.code_def());
+    }
   }
 }
 
@@ -125,20 +145,6 @@ void TFStats::ParseRunMeta() {
       node->second.AddStepStat(dev_stat.device(), &node_stat);
     }
   }
-
-  if (!run_meta_->has_cost_graph()) {
-    fprintf(stderr,
-            "Missing CostGraphDef in RunMetadata.\nMaybe you forget to"
-            "set tf.ConfigProto(graph_options=tf.GraphOptions("
-            "build_cost_model=1)) to Session()\n");
-  }
-  for (const auto& node_pb : run_meta_->cost_graph().node()) {
-    auto node = nodes_map_.find(node_pb.name());
-    if (node == nodes_map_.end()) {
-      continue;
-    }
-    node->second.AddNodeStat(&node_pb);
-  }
 }
 }  // namespace tfprof
 }  // namespace tensorflow
diff --git a/tensorflow/tools/tfprof/internal/tfprof_stats.h b/tensorflow/tools/tfprof/internal/tfprof_stats.h
index 3a8b46ae315a4f2b1211a20a712ce5f20ee33632..585dca6771a1a506464ecd8f7bb5be09b2d56a91 100644
--- a/tensorflow/tools/tfprof/internal/tfprof_stats.h
+++ b/tensorflow/tools/tfprof/internal/tfprof_stats.h
@@ -35,6 +35,7 @@ limitations under the License.
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/strings/stringprintf.h"
 #include "tensorflow/core/protobuf/config.pb.h"
+#include "tensorflow/tools/tfprof/internal/tfprof_code.h"
 #include "tensorflow/tools/tfprof/internal/tfprof_graph.h"
 #include "tensorflow/tools/tfprof/internal/tfprof_node.h"
 #include "tensorflow/tools/tfprof/internal/tfprof_options.h"
@@ -56,7 +57,8 @@ class TFStats {
 
   // Prints the results to stdout. Also returns the printed output in
   // a proto.
-  const TFProfNode& PrintGraph(const string& cmd, const Options& opts);
+  const TFGraphNodeProto& PrintGraph(const string& cmd, const Options& opts);
+  const TFCodeNodeProto& PrintCode(const Options& opts);
 
  private:
   void ParseGraph();
@@ -67,13 +69,16 @@ class TFStats {
 
   std::unique_ptr<TFScope> scope_view_;
   std::unique_ptr<TFGraph> graph_view_;
+  std::unique_ptr<TFCode> code_view_;
   std::unique_ptr<GraphDef> graph_;
   std::unique_ptr<RunMetadata> run_meta_;
   std::unique_ptr<OpLog> op_log_;
   std::unique_ptr<checkpoint::CheckpointReader> ckpt_reader_;
-  // Store TFNode instead of TFNode* to avoid large number of dynamic alloc.
-  std::map<string, TFNode> nodes_map_;
-  TFProfNode empty_node_;
+  // Store TFGraphNode instead of TFGraphNode* to avoid large number of
+  // dynamic alloc.
+  std::map<string, TFGraphNode> nodes_map_;
+  TFGraphNodeProto empty_graph_node_;
+  TFCodeNodeProto empty_code_node_;
 };
 
 }  // namespace tfprof
diff --git a/tensorflow/tools/tfprof/internal/tfprof_stats_test.cc b/tensorflow/tools/tfprof/internal/tfprof_stats_test.cc
index 3c97f0eb65adc171abcbf7ad39d22f7739bdd9f9..eb01425e044d46a5dd62d0a1f2fc3b87f45c0e6f 100644
--- a/tensorflow/tools/tfprof/internal/tfprof_stats_test.cc
+++ b/tensorflow/tools/tfprof/internal/tfprof_stats_test.cc
@@ -74,30 +74,30 @@ TEST_F(TFProfStatsTest, CustomOpType) {
   Options opts(3, 0, 0, 0, 0, {".*"}, "name",
                {kTrainableVarType},  // accout_type_regexes
                {".*"}, {""}, {".*"}, {""}, false,
-               {"params", "bytes", "micros", "float_ops", "num_hidden_ops"},
-               false);
-  const TFProfNode& root = tf_stats_->PrintGraph("scope", opts);
+               {"params", "bytes", "micros", "float_ops", "num_hidden_ops"}, "",
+               {});
+  const TFGraphNodeProto& root = tf_stats_->PrintGraph("scope", opts);
 
-  TFProfNode expected;
+  TFGraphNodeProto expected;
   CHECK(protobuf::TextFormat::ParseFromString(
       "name: \"_TFProfRoot\"\nexec_micros: 0\nrequested_bytes: "
       "0\ntotal_exec_micros: 5\ntotal_requested_bytes: 1480\ntotal_parameters: "
       "370\nchildren {\n  name: \"conv2d/bias\"\n  exec_micros: 1\n  "
       "requested_bytes: 20\n  parameters: 5\n  total_exec_micros: 1\n  "
-      "total_requested_bytes: 20\n  total_parameters: 5\n  device: "
+      "total_requested_bytes: 20\n  total_parameters: 5\n  devices: "
       "\"/job:localhost/replica:0/task:0/cpu:0\"\n  float_ops: 0\n  "
       "total_float_ops: 0\n}\nchildren {\n  name: \"conv2d/kernel\"\n  "
       "exec_micros: 1\n  requested_bytes: 540\n  parameters: 135\n  "
       "total_exec_micros: 1\n  total_requested_bytes: 540\n  total_parameters: "
-      "135\n  device: \"/job:localhost/replica:0/task:0/cpu:0\"\n  float_ops: "
+      "135\n  devices: \"/job:localhost/replica:0/task:0/cpu:0\"\n  float_ops: "
       "0\n  total_float_ops: 0\n}\nchildren {\n  name: \"conv2d_1/bias\"\n  "
       "exec_micros: 1\n  requested_bytes: 20\n  parameters: 5\n  "
       "total_exec_micros: 1\n  total_requested_bytes: 20\n  total_parameters: "
-      "5\n  device: \"/job:localhost/replica:0/task:0/cpu:0\"\n  float_ops: "
+      "5\n  devices: \"/job:localhost/replica:0/task:0/cpu:0\"\n  float_ops: "
       "0\n  total_float_ops: 0\n}\nchildren {\n  name: \"conv2d_1/kernel\"\n  "
       "exec_micros: 2\n  requested_bytes: 900\n  parameters: 225\n  "
       "total_exec_micros: 2\n  total_requested_bytes: 900\n  total_parameters: "
-      "225\n  device: \"/job:localhost/replica:0/task:0/cpu:0\"\n  float_ops: "
+      "225\n  devices: \"/job:localhost/replica:0/task:0/cpu:0\"\n  float_ops: "
       "0\n  total_float_ops: 0\n}\nfloat_ops: 0\ntotal_float_ops: 0\n",
       &expected));
   EXPECT_EQ(expected.DebugString(), root.DebugString());
@@ -107,29 +107,29 @@ TEST_F(TFProfStatsTest, CheckPointOpType) {
   Options opts(
       3, 0, 0, 0, 0, {".*"}, "name", {kCkptVarType},  // accout_type_regexes
       {".*"}, {""}, {".*"}, {""}, false,
-      {"params", "bytes", "micros", "float_ops", "num_hidden_ops"}, false);
-  const TFProfNode& root = tf_stats_->PrintGraph("scope", opts);
+      {"params", "bytes", "micros", "float_ops", "num_hidden_ops"}, "", {});
+  const TFGraphNodeProto& root = tf_stats_->PrintGraph("scope", opts);
 
-  TFProfNode expected;
+  TFGraphNodeProto expected;
   CHECK(protobuf::TextFormat::ParseFromString(
       "name: \"_TFProfRoot\"\nexec_micros: 0\nrequested_bytes: "
       "0\ntotal_exec_micros: 5\ntotal_requested_bytes: 1480\ntotal_parameters: "
       "370\nchildren {\n  name: \"conv2d/bias\"\n  exec_micros: 1\n  "
       "requested_bytes: 20\n  parameters: 5\n  total_exec_micros: 1\n  "
-      "total_requested_bytes: 20\n  total_parameters: 5\n  device: "
+      "total_requested_bytes: 20\n  total_parameters: 5\n  devices: "
       "\"/job:localhost/replica:0/task:0/cpu:0\"\n  float_ops: 0\n  "
       "total_float_ops: 0\n}\nchildren {\n  name: \"conv2d/kernel\"\n  "
       "exec_micros: 1\n  requested_bytes: 540\n  parameters: 135\n  "
       "total_exec_micros: 1\n  total_requested_bytes: 540\n  total_parameters: "
-      "135\n  device: \"/job:localhost/replica:0/task:0/cpu:0\"\n  float_ops: "
+      "135\n  devices: \"/job:localhost/replica:0/task:0/cpu:0\"\n  float_ops: "
       "0\n  total_float_ops: 0\n}\nchildren {\n  name: \"conv2d_1/bias\"\n  "
       "exec_micros: 1\n  requested_bytes: 20\n  parameters: 5\n  "
       "total_exec_micros: 1\n  total_requested_bytes: 20\n  total_parameters: "
-      "5\n  device: \"/job:localhost/replica:0/task:0/cpu:0\"\n  float_ops: "
+      "5\n  devices: \"/job:localhost/replica:0/task:0/cpu:0\"\n  float_ops: "
       "0\n  total_float_ops: 0\n}\nchildren {\n  name: \"conv2d_1/kernel\"\n  "
       "exec_micros: 2\n  requested_bytes: 900\n  parameters: 225\n  "
       "total_exec_micros: 2\n  total_requested_bytes: 900\n  total_parameters: "
-      "225\n  device: \"/job:localhost/replica:0/task:0/cpu:0\"\n  float_ops: "
+      "225\n  devices: \"/job:localhost/replica:0/task:0/cpu:0\"\n  float_ops: "
       "0\n  total_float_ops: 0\n}\nfloat_ops: 0\ntotal_float_ops: 0\n",
       &expected));
   EXPECT_EQ(expected.DebugString(), root.DebugString());
@@ -139,11 +139,11 @@ TEST_F(TFProfStatsTest, TestGraph) {
   Options opts(100, 0, 10000, 0, 0, {".*"}, "name", {".*"},
                {"cost.*"},  // start_name_regexes
                {""}, {".*"}, {""}, false,
-               {"params", "bytes", "micros", "float_ops", "num_hidden_ops"},
-               false);
-  const TFProfNode& root = tf_stats_->PrintGraph("graph", opts);
+               {"params", "bytes", "micros", "float_ops", "num_hidden_ops"}, "",
+               {});
+  const TFGraphNodeProto& root = tf_stats_->PrintGraph("graph", opts);
 
-  TFProfNode expected;
+  TFGraphNodeProto expected;
   CHECK(protobuf::TextFormat::ParseFromString(
       "name: \"_TFProfRoot\"\nexec_micros: 0\nrequested_bytes: 0\ninputs: "
       "0\ntotal_exec_micros: 0\ntotal_requested_bytes: 0\ntotal_parameters: "
@@ -154,28 +154,28 @@ TEST_F(TFProfStatsTest, TestGraph) {
 
 TEST_F(TFProfStatsTest, TestFloatOps) {
   Options opts(10, 0, 0, 0, 1, {".*"}, "name", {".*"}, {".*"}, {""}, {".*"},
-               {""}, false, {"float_ops"}, false);
-  const TFProfNode& root = tf_stats_->PrintGraph("scope", opts);
+               {""}, false, {"float_ops"}, "", {});
+  const TFGraphNodeProto& root = tf_stats_->PrintGraph("scope", opts);
 
-  TFProfNode expected;
+  TFGraphNodeProto expected;
   CHECK(protobuf::TextFormat::ParseFromString(
       "name: \"_TFProfRoot\"\nexec_micros: 0\nrequested_bytes: "
       "0\ntotal_exec_micros: 96\ntotal_requested_bytes: "
       "8656\ntotal_parameters: 370\nchildren {\n  name: \"conv2d/BiasAdd\"\n  "
       "exec_micros: 12\n  requested_bytes: 1440\n  total_exec_micros: 12\n  "
-      "total_requested_bytes: 1440\n  total_parameters: 0\n  device: "
+      "total_requested_bytes: 1440\n  total_parameters: 0\n  devices: "
       "\"/job:localhost/replica:0/task:0/cpu:0\"\n  float_ops: 360\n  "
       "total_float_ops: 360\n}\nchildren {\n  name: \"conv2d/convolution\"\n  "
       "exec_micros: 60\n  requested_bytes: 1440\n  total_exec_micros: 60\n  "
-      "total_requested_bytes: 1440\n  total_parameters: 0\n  device: "
+      "total_requested_bytes: 1440\n  total_parameters: 0\n  devices: "
       "\"/job:localhost/replica:0/task:0/cpu:0\"\n  float_ops: 19440\n  "
       "total_float_ops: 19440\n}\nchildren {\n  name: \"conv2d_2/BiasAdd\"\n  "
       "exec_micros: 2\n  requested_bytes: 640\n  total_exec_micros: 2\n  "
-      "total_requested_bytes: 640\n  total_parameters: 0\n  device: "
+      "total_requested_bytes: 640\n  total_parameters: 0\n  devices: "
       "\"/job:localhost/replica:0/task:0/cpu:0\"\n  float_ops: 160\n  "
       "total_float_ops: 160\n}\nchildren {\n  name: \"conv2d_2/convolution\"\n "
       " exec_micros: 13\n  requested_bytes: 640\n  total_exec_micros: 13\n  "
-      "total_requested_bytes: 640\n  total_parameters: 0\n  device: "
+      "total_requested_bytes: 640\n  total_parameters: 0\n  devices: "
       "\"/job:localhost/replica:0/task:0/cpu:0\"\n  float_ops: 14400\n  "
       "total_float_ops: 14400\n}\nfloat_ops: 0\ntotal_float_ops: 34360\n",
       &expected));
@@ -186,10 +186,10 @@ TEST_F(TFProfStatsTest, TestAccountShownNameOnly) {
   Options opts(100, 0, 0, 0, 0, {".*"}, "name", {".*"}, {".*"}, {""},
                {"unit_2_1.*DW"},  // show_name_regexes.
                {""}, true,        // account_displayed_op_only.
-               {"params"}, false);
-  const TFProfNode& root = tf_stats_->PrintGraph("scope", opts);
+               {"params"}, "", {});
+  const TFGraphNodeProto& root = tf_stats_->PrintGraph("scope", opts);
 
-  TFProfNode expected;
+  TFGraphNodeProto expected;
   CHECK(protobuf::TextFormat::ParseFromString(
       "name: \"_TFProfRoot\"\nexec_micros: 0\nrequested_bytes: "
       "0\ntotal_exec_micros: 0\ntotal_requested_bytes: 0\ntotal_parameters: "
@@ -202,9 +202,9 @@ TEST_F(TFProfStatsTest, TestShowTensorValue) {
   Options opts(10, 0, 0, 0, 0, {".*"}, "name", {".*"}, {".*"}, {""},
                {"unit_1_0.*gamma"}, {""}, false,
                {"tensor_value"},  // Show tensor value from checkpoint.
-               false);
-  const TFProfNode& root = tf_stats_->PrintGraph("scope", opts);
-  TFProfNode expected;
+               "", {});
+  const TFGraphNodeProto& root = tf_stats_->PrintGraph("scope", opts);
+  TFGraphNodeProto expected;
   CHECK(protobuf::TextFormat::ParseFromString(
       "name: \"_TFProfRoot\"\nexec_micros: 0\nrequested_bytes: "
       "0\ntotal_exec_micros: 96\ntotal_requested_bytes: "
diff --git a/tensorflow/tools/tfprof/internal/tfprof_tensor_test.cc b/tensorflow/tools/tfprof/internal/tfprof_tensor_test.cc
index 8c19910355baa077bd38412ae56af7617edc2cdd..79a781210dbba00e3522710a516d64933af207d6 100644
--- a/tensorflow/tools/tfprof/internal/tfprof_tensor_test.cc
+++ b/tensorflow/tools/tfprof/internal/tfprof_tensor_test.cc
@@ -57,10 +57,10 @@ class TFProfTensorTest : public ::testing::Test {
 TEST_F(TFProfTensorTest, Basics) {
   Options opts(3, 0, 0, 0, 0, {".*"}, "name", {"VariableV2"}, {".*"}, {""},
                {".*"}, {""}, false, {"tensor_value"},  // show the tensor value.
-               false);
-  const TFProfNode& root = tf_stats_->PrintGraph("scope", opts);
+               "", {});
+  const TFGraphNodeProto& root = tf_stats_->PrintGraph("scope", opts);
 
-  TFProfNode expected;
+  TFGraphNodeProto expected;
   CHECK(protobuf::TextFormat::ParseFromString(
       "name: \"_TFProfRoot\"\nexec_micros: 0\nrequested_bytes: "
       "0\ntotal_exec_micros: 0\ntotal_requested_bytes: 0\ntotal_parameters: "
diff --git a/tensorflow/tools/tfprof/internal/tfprof_timeline.cc b/tensorflow/tools/tfprof/internal/tfprof_timeline.cc
new file mode 100644
index 0000000000000000000000000000000000000000..a5640c0e569d3ea809ff04673146ac377ca844ef
--- /dev/null
+++ b/tensorflow/tools/tfprof/internal/tfprof_timeline.cc
@@ -0,0 +1,245 @@
+/* Copyright 2016 The TensorFlow Authors All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/tools/tfprof/internal/tfprof_timeline.h"
+
+#include <utility>
+
+#include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/lib/strings/str_util.h"
+#include "tensorflow/core/lib/strings/strcat.h"
+#include "tensorflow/tools/tfprof/internal/tfprof_utils.h"
+
+namespace tensorflow {
+namespace tfprof {
+
+Json::Value ChromeTraceFormatter::CreateEvent(const string& ph,
+                                              const string& category,
+                                              const string& name, int64 pid,
+                                              int64 tid, int64 ts) {
+  Json::Value event(Json::objectValue);
+  event["ph"] = Json::Value(ph);
+  event["cat"] = Json::Value(category);
+  event["name"] = Json::Value(name);
+  event["pid"] = Json::Value(pid);
+  event["tid"] = Json::Value(tid);
+  event["ts"] = Json::Value(ts);
+  return event;
+}
+
+void ChromeTraceFormatter::EmitPID(const string& name, int64 pid) {
+  Json::Value event(Json::objectValue);
+  event["name"] = Json::Value("process_name");
+  event["ph"] = Json::Value("M");
+  event["pid"] = Json::Value(pid);
+  Json::Value args(Json::objectValue);
+  args["name"] = Json::Value(name);
+  event["args"] = args;
+  metadata_.push_back(event);
+}
+
+void ChromeTraceFormatter::EmitRegion(int64 ts, int64 duration, int64 pid,
+                                      int64 tid, const string& category,
+                                      const string& name, Json::Value args) {
+  Json::Value event = CreateEvent("X", category, name, pid, tid, ts);
+  event["dur"] = Json::Value(duration);
+  event["args"] = std::move(args);
+  metadata_.push_back(event);
+}
+
+void ChromeTraceFormatter::EmitFlowStart(const string& name, int64 ts,
+                                         int64 pid, int64 tid, int64 flow_id) {
+  Json::Value event = CreateEvent("s", "DataFlow", name, pid, tid, ts);
+  event["id"] = flow_id;
+  events_.push_back(event);
+}
+
+void ChromeTraceFormatter::EmitFlowEnd(const string& name, int64 ts, int64 pid,
+                                       int64 tid, int64 flow_id) {
+  Json::Value event = CreateEvent("t", "DataFlow", name, pid, tid, ts);
+  event["id"] = flow_id;
+  events_.push_back(event);
+}
+
+string ChromeTraceFormatter::Format() {
+  Json::Value trace;
+  trace["traceEvents"] = Json::Value(Json::arrayValue);
+  for (const Json::Value& v : metadata_) {
+    trace["traceEvents"].append(v);
+  }
+  for (const Json::Value& v : events_) {
+    trace["traceEvents"].append(v);
+  }
+  return trace.toStyledString();
+}
+
+void Timeline::GenerateGraphTimeline(const GraphNode* gnode) {
+  fprintf(stdout, "adding graph nodes.\n");
+  AddGraphNode(gnode);
+  AllocateLanes();
+  fprintf(stdout, "generating trace file.\n");
+  int64 flow_id = 1;
+  for (const auto& process : alloc_nodes_) {
+    for (const auto& lane : process.second) {
+      for (const auto& node : lane.second) {
+        TimeNode* tnode = node.second;
+
+        Json::Value args(Json::objectValue);
+        args["name"] = Json::Value(tnode->name);
+        args["op"] = Json::Value(tnode->name);
+        chrome_formatter_.EmitRegion(node.first, tnode->exec_micros,
+                                     process.first, lane.first, "Op",
+                                     tnode->name, args);
+
+        for (TimeNode* next_tnode : node.second->next_tnodes) {
+          chrome_formatter_.EmitFlowStart(
+              tnode->name + "_flow", tnode->start_micros + tnode->exec_micros,
+              process.first, lane.first, flow_id);
+          chrome_formatter_.EmitFlowEnd(
+              tnode->name + "_flow", next_tnode->start_micros,
+              next_tnode->process->pid, next_tnode->tid, flow_id);
+          flow_id += 1;
+        }
+      }
+    }
+  }
+  OutputTimeline();
+}
+
+void Timeline::GenerateScopeTimeline(const ScopeNode* node) {
+  std::set<int64> visited_depth;
+  EmitTreeNode(node, 0, node->proto().total_exec_micros(), 0, &visited_depth);
+  OutputTimeline();
+}
+
+void Timeline::GenerateCodeTimeline(const CodeNode* node) {
+  std::set<int64> visited_depth;
+  EmitTreeNode(node, 0, node->proto().total_exec_micros(), 0, &visited_depth);
+  OutputTimeline();
+}
+
+void Timeline::OutputTimeline() {
+  Status s =
+      WriteStringToFile(Env::Default(), outfile_, chrome_formatter_.Format());
+  if (!s.ok()) {
+    fprintf(stderr, "Failed to write timeline file: %s\nError: %s\n",
+            outfile_.c_str(), s.ToString().c_str());
+    return;
+  }
+  fprintf(stdout, "\n******************************************************\n");
+  fprintf(stdout,
+          "Timeline file is written to %s.\n"
+          "Open a Chrome browser, enter URL chrome://tracing and "
+          "load the timeline file.",
+          outfile_.c_str());
+  fprintf(stdout, "\n******************************************************\n");
+  fflush(stdout);
+}
+
+std::vector<TimeNode*> Timeline::AddGraphNode(const GraphNode* gnode) {
+  std::vector<TimeNode*> tnodes;
+  if (!gnode) return tnodes;
+
+  std::vector<TimeNode*> shown_cinputs;
+  for (GraphNode* schild : gnode->show_children) {
+    std::vector<TimeNode*> inputs = AddGraphNode(schild);
+    shown_cinputs.insert(shown_cinputs.end(), inputs.begin(), inputs.end());
+  }
+  if (!gnode->node->step_stats()) {
+    return shown_cinputs;
+  }
+
+  const TFGraphNode* node = gnode->node;
+  for (const auto& kernel_execs : node->op_kernel_execs()) {
+    const string& device = kernel_execs.first;
+    const std::vector<std::pair<int64, int64>>& execs = kernel_execs.second;
+
+    if (process_.find(device) == process_.end()) {
+      int64 pid = AllocatePID();
+      process_[device].reset(new Process(pid));
+      chrome_formatter_.EmitPID(device, pid);
+    }
+    Process* p = process_[device].get();
+
+    for (const auto& exec : execs) {
+      int64 start_micros = exec.first;
+      int64 exec_micros = exec.second;
+      // TODO(xpan): There might be start time duplication here.
+      if (tnodes_[device].find(start_micros) == tnodes_[device].end()) {
+        // TODO(xpan): Give each kernel call a unique_name.
+        tnodes_[device][start_micros].reset(
+            new TimeNode(p, node->name(), start_micros, exec_micros));
+      }
+      TimeNode* tnode_ptr = tnodes_[device][start_micros].get();
+
+      for (int i = 0; i < shown_cinputs.size(); i++) {
+        shown_cinputs[i]->next_tnodes.push_back(tnode_ptr);
+      }
+      tnodes.push_back(tnode_ptr);
+    }
+  }
+  return tnodes;
+}
+
+void Timeline::AllocateLanes() {
+  for (auto& process : tnodes_) {
+    Process* p = process_[process.first].get();
+    for (auto& tnode : process.second) {
+      int64 start_time = tnode.second->start_micros;
+      int64 end_time = tnode.second->exec_micros - 1;
+
+      int64 l = -1;
+      for (int i = 0; i < p->lanes.size(); ++i) {
+        const auto& lane = p->lanes[i];
+        auto cur_it = lane.lower_bound(start_time);
+        if (cur_it == lane.end()) {
+          --cur_it;
+        }
+        l = i;
+        for (; cur_it != lane.begin(); --cur_it) {
+          if (cur_it->second < start_time) {
+            break;
+          }
+          if (cur_it->first <= end_time) {
+            l = -1;
+            break;
+          }
+        }
+        if (l >= 0) {
+          break;
+        }
+      }
+      if (l < 0) {
+        l = p->lanes.size();
+        std::map<int64, int64> nlane;
+        nlane[start_time] = end_time;
+        p->lanes.push_back(nlane);
+      } else {
+        p->lanes[l][start_time] = end_time;
+      }
+      tnode.second->tid = l;
+      alloc_nodes_[p->pid][l][start_time] = tnode.second.get();
+    }
+  }
+}
+
+int64 Timeline::AllocatePID() {
+  int64 cur_pid = next_pid_;
+  next_pid_ += 1;
+  return cur_pid;
+}
+
+}  // namespace tfprof
+}  // namespace tensorflow
diff --git a/tensorflow/tools/tfprof/internal/tfprof_timeline.h b/tensorflow/tools/tfprof/internal/tfprof_timeline.h
new file mode 100644
index 0000000000000000000000000000000000000000..3d26874abd2836b8725d9630e7f4b9ca61df1aa9
--- /dev/null
+++ b/tensorflow/tools/tfprof/internal/tfprof_timeline.h
@@ -0,0 +1,147 @@
+/* Copyright 2016 The TensorFlow Authors All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef THIRD_PARTY_TENSORFLOW_TOOLS_TFPROF_INTERNAL_TFPROF_TIMELINE_H_
+#define THIRD_PARTY_TENSORFLOW_TOOLS_TFPROF_INTERNAL_TFPROF_TIMELINE_H_
+
+#include "include/json/json.h"
+#include "tensorflow/core/framework/graph.pb.h"
+#include "tensorflow/core/framework/step_stats.pb.h"
+#include "tensorflow/core/protobuf/config.pb.h"
+#include "tensorflow/tools/tfprof/internal/tfprof_node_show.h"
+
+namespace tensorflow {
+namespace tfprof {
+
+typedef std::map<string, string> Event;
+
+class ChromeTraceFormatter {
+ public:
+  ChromeTraceFormatter() {}
+
+  Json::Value CreateEvent(const string& ph, const string& category,
+                          const string& name, int64 pid, int64 tid, int64 ts);
+
+  void EmitPID(const string& name, int64 pid);
+
+  void EmitRegion(int64 ts, int64 duration, int64 pid, int64 tid,
+                  const string& category, const string& name, Json::Value args);
+
+  void EmitFlowStart(const string& name, int64 ts, int64 pid, int64 tid,
+                     int64 flow_id);
+
+  void EmitFlowEnd(const string& name, int64 ts, int64 pid, int64 tid,
+                   int64 flow_id);
+
+  string Format();
+
+ private:
+  std::vector<Json::Value> events_;
+  std::vector<Json::Value> metadata_;
+};
+
+class Process {
+ public:
+  Process(int64 pid) : pid(pid) {}
+
+  // Each lane is a map from start_time to end_time.
+  std::vector<std::map<int64, int64>> lanes;
+  int64 pid;
+};
+
+class TimeNode {
+ public:
+  TimeNode(Process* process, const string& name, int64 start_micros,
+           int64 exec_micros)
+      : process(process),
+        name(name),
+        start_micros(start_micros),
+        exec_micros(exec_micros),
+        tid(-1) {}
+  virtual ~TimeNode() {}
+
+  Process* process;
+  string name;
+  int64 start_micros;
+  int64 exec_micros;
+  int64 tid;
+  std::vector<TimeNode*> next_tnodes;
+};
+
+class Timeline {
+ public:
+  Timeline(const string& outfile) : outfile_(outfile) {}
+  ~Timeline() {}
+
+  void GenerateGraphTimeline(const GraphNode* gnode);
+
+  void GenerateScopeTimeline(const ScopeNode* node);
+
+  void GenerateCodeTimeline(const CodeNode* node);
+
+ private:
+  void OutputTimeline();
+
+  template <typename Node>
+  void EmitTreeNode(const Node* node, int64 start_time, int64 duration,
+                    int64 depth, std::set<int64>* visited_depth) {
+    if (visited_depth->find(depth) == visited_depth->end()) {
+      chrome_formatter_.EmitPID(strings::StrCat("Scope:", depth), depth);
+      visited_depth->insert(depth);
+    }
+
+    Json::Value args(Json::objectValue);
+    args["name"] = Json::Value(node->name());
+    args["op"] = Json::Value(node->name());
+    chrome_formatter_.EmitRegion(start_time, duration, depth, 0, "Op",
+                                 node->name(), args);
+
+    int64 total_micros = 0;
+    int64 c_start_time = start_time;
+    for (const Node* child : node->show_children) {
+      int64 total_exec_micros = child->proto().total_exec_micros();
+      if (total_exec_micros <= 0) {
+        continue;
+      }
+      EmitTreeNode(child, c_start_time, total_exec_micros, depth + 1,
+                   visited_depth);
+      c_start_time += total_exec_micros;
+      total_micros += total_exec_micros;
+    }
+    CHECK(total_micros <= duration) << node->name() << " parent:" << duration
+                                    << " children:" << total_micros;
+  }
+
+  std::vector<TimeNode*> AddGraphNode(const GraphNode* gnode);
+
+  void AllocateLanes();
+
+  int64 AllocatePID();
+
+  const string outfile_;
+  int64 next_pid_ = 0;
+  int64 allocator_pid_ = -1;
+  ChromeTraceFormatter chrome_formatter_;
+  std::map<string, int64> device_pids_;
+
+  std::map<string, std::unique_ptr<Process>> process_;
+  std::map<int64, std::map<int64, std::map<int64, TimeNode*>>> alloc_nodes_;
+  std::map<string, std::map<int64, std::unique_ptr<TimeNode>>> tnodes_;
+};
+
+}  // namespace tfprof
+}  // namespace tensorflow
+
+#endif  // THIRD_PARTY_TENSORFLOW_TOOLS_TFPROF_INTERNAL_TFPROF_TIMELINE_H_
diff --git a/tensorflow/tools/tfprof/internal/tfprof_timeline_test.cc b/tensorflow/tools/tfprof/internal/tfprof_timeline_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..2dfe6ab335e11eab5464dec4390868e3f6518fa8
--- /dev/null
+++ b/tensorflow/tools/tfprof/internal/tfprof_timeline_test.cc
@@ -0,0 +1,92 @@
+/* Copyright 2016 The TensorFlow Authors All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/tools/tfprof/internal/tfprof_stats.h"
+
+#include <utility>
+
+#include "tensorflow/c/checkpoint_reader.h"
+#include "tensorflow/core/framework/graph.pb.h"
+#include "tensorflow/core/lib/hash/hash.h"
+#include "tensorflow/core/lib/io/path.h"
+#include "tensorflow/core/platform/env.h"
+#include "tensorflow/core/platform/test.h"
+#include "tensorflow/core/protobuf/config.pb.h"
+#include "tensorflow/tools/tfprof/internal/tfprof_constants.h"
+#include "tensorflow/tools/tfprof/internal/tfprof_options.h"
+#include "tensorflow/tools/tfprof/internal/tfprof_utils.h"
+#include "tensorflow/tools/tfprof/tfprof_log.pb.h"
+#include "tensorflow/tools/tfprof/tfprof_output.pb.h"
+
+namespace tensorflow {
+namespace tfprof {
+class TFProfTimelineTest : public ::testing::Test {
+ protected:
+  TFProfTimelineTest() {
+    string graph_path =
+        io::JoinPath(testing::TensorFlowSrcRoot(),
+                     "tools/tfprof/internal/testdata/graph.pbtxt");
+    std::unique_ptr<tensorflow::GraphDef> graph_pb(new tensorflow::GraphDef());
+    TF_CHECK_OK(ReadGraphDef(Env::Default(), graph_path, graph_pb.get()));
+
+    std::unique_ptr<tensorflow::RunMetadata> run_meta_pb(
+        new tensorflow::RunMetadata());
+    string run_meta_path =
+        io::JoinPath(testing::TensorFlowSrcRoot(),
+                     "tools/tfprof/internal/testdata/run_meta");
+    TF_CHECK_OK(
+        ReadBinaryProto(Env::Default(), run_meta_path, run_meta_pb.get()));
+
+    tf_stats_.reset(new TFStats(std::move(graph_pb), std::move(run_meta_pb),
+                                nullptr, nullptr));
+  }
+
+  std::unique_ptr<TFStats> tf_stats_;
+};
+
+// Before adding test, first dump the json file and
+// manually check it's correct
+TEST_F(TFProfTimelineTest, GraphView) {
+  string dump_file = io::JoinPath(testing::TmpDir(), "dump");
+  Options opts(10000, 0, 0, 0, 0, {".*"}, "name",
+               {".*"},  // accout_type_regexes
+               {".*"}, {""}, {".*"}, {""}, false,
+               {"params", "bytes", "micros", "float_ops", "num_hidden_ops"},
+               "timeline", {{"outfile", dump_file}});
+  tf_stats_->PrintGraph("graph", opts);
+
+  string dump_str;
+  TF_CHECK_OK(ReadFileToString(Env::Default(), dump_file, &dump_str));
+  EXPECT_EQ(14171250174278825648ull, Hash64(dump_str));
+}
+
+TEST_F(TFProfTimelineTest, ScopeView) {
+  string dump_file = io::JoinPath(testing::TmpDir(), "dump");
+  Options opts(5, 0, 0, 0, 0, {".*"}, "name", {".*"},  // accout_type_regexes
+               {".*"}, {""}, {".*"}, {""}, false,
+               {"params", "bytes", "micros", "float_ops", "num_hidden_ops"},
+               "timeline", {{"outfile", dump_file}});
+  tf_stats_->PrintGraph("scope", opts);
+
+  string dump_str;
+  TF_CHECK_OK(ReadFileToString(Env::Default(), dump_file, &dump_str));
+  EXPECT_EQ(2355241164346147404ull, Hash64(dump_str));
+}
+
+// TODO(xpan): tfprof_log is too large to include in testdata when adding
+// code traces.
+
+}  // namespace tfprof
+}  // namespace tensorflow
diff --git a/tensorflow/tools/tfprof/internal/tfprof_utils.cc b/tensorflow/tools/tfprof/internal/tfprof_utils.cc
index 6d557e91933648f5dd82dc4c1daee6717ed296d5..8e55e009d3346cf932c094d304d456528c00cdce 100644
--- a/tensorflow/tools/tfprof/internal/tfprof_utils.cc
+++ b/tensorflow/tools/tfprof/internal/tfprof_utils.cc
@@ -94,7 +94,7 @@ string StripQuote(const string& s) {
   return s.substr(start, end - start + 1);
 }
 
-tensorflow::Status ReturnError(const std::vector<string> pieces, int idx) {
+tensorflow::Status ReturnError(const std::vector<string>& pieces, int idx) {
   string val;
   if (pieces.size() > idx + 1) {
     val = pieces[idx + 1];
@@ -251,19 +251,13 @@ tensorflow::Status ParseCmdLine(const string& line, string* cmd,
       opts->select = requested_set;
       ++i;
     } else if (pieces[i] == tensorflow::tfprof::kOptions[14]) {
-      if ((pieces.size() > i + 1 && pieces[i + 1].find("-") == 0) ||
-          pieces.size() == i + 1) {
-        opts->viz = true;
-      } else if (!StringToBool(pieces[i + 1], &opts->viz)) {
-        return ReturnError(pieces, i);
-      } else {
-        ++i;
-      }
-    } else if (pieces[i] == tensorflow::tfprof::kOptions[15]) {
       if (pieces.size() <= i + 1) {
         return ReturnError(pieces, i);
       }
-      opts->dump_to_file = StripQuote(pieces[i + 1]);
+
+      tensorflow::Status s =
+          ParseOutput(pieces[i + 1], &opts->output_type, &opts->output_options);
+      if (!s.ok()) return s;
       ++i;
     } else {
       return ReturnError(pieces, i);
diff --git a/tensorflow/tools/tfprof/tfprof_log.proto b/tensorflow/tools/tfprof/tfprof_log.proto
index cae6e1e3a8c08f64e28460c2850d5f6beeb69e61..5c47142e0ab6e3f647d869016a8ab4f9f9eb9e99 100644
--- a/tensorflow/tools/tfprof/tfprof_log.proto
+++ b/tensorflow/tools/tfprof/tfprof_log.proto
@@ -2,6 +2,17 @@ syntax = "proto2";
 
 package tensorflow.tfprof;
 
+// It specifies the Python callstack that creates an op.
+message CodeDef {
+  repeated Trace traces = 1;
+  message Trace {
+    optional string file = 1;
+    optional int32 lineno = 2;
+    optional string function = 3;
+    optional string line = 4;
+  }
+}
+
 message OpLogEntry {
   // op name.
   optional string name = 1;
@@ -12,6 +23,8 @@ message OpLogEntry {
   // User can define extra op type information for an op. This allows the user
   // to select a group of ops precisely using op_type as a key.
   repeated string types = 3;
+  // Used to support tfprof "code" view.
+  optional CodeDef code_def = 4;
 }
 
 message OpLog {
diff --git a/tensorflow/tools/tfprof/tfprof_main.cc b/tensorflow/tools/tfprof/tfprof_main.cc
index a8ed6e38132df19391b4a8bdfa69a9a3254439f8..cfe239da229246c2d22706aa6d4c73cfac6a2e73 100644
--- a/tensorflow/tools/tfprof/tfprof_main.cc
+++ b/tensorflow/tools/tfprof/tfprof_main.cc
@@ -30,6 +30,7 @@ limitations under the License.
 #include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/platform/env.h"
 #include "tensorflow/core/platform/init_main.h"
+#include "tensorflow/core/platform/protobuf.h"
 #include "tensorflow/core/protobuf/config.pb.h"
 #include "tensorflow/core/util/command_line_flags.h"
 #include "tensorflow/tools/tfprof/internal/tfprof_options.h"
@@ -82,8 +83,7 @@ int main(int argc, char** argv) {
   tensorflow::string FLAGS_hide_name_regexes;
   bool FLAGS_account_displayed_op_only = false;
   tensorflow::string FLAGS_select = "params";
-  bool FLAGS_viz = false;
-  tensorflow::string FLAGS_dump_to_file = "";
+  tensorflow::string FLAGS_output = "";
   for (int i = 0; i < argc; i++) {
     fprintf(stderr, "%s\n", argv[i]);
   }
@@ -117,7 +117,7 @@ int main(int argc, char** argv) {
                        &FLAGS_account_displayed_op_only,
                        "account displayed op only"),
       tensorflow::Flag("select", &FLAGS_select, "select"),
-      tensorflow::Flag("dump_to_file", &FLAGS_dump_to_file, "dump to file"),
+      tensorflow::Flag("output", &FLAGS_output, "output"),
   };
   tensorflow::string usage = tensorflow::Flags::Usage(argv[0], flag_list);
   bool parse_ok = tensorflow::Flags::Parse(&argc, argv, flag_list);
@@ -144,6 +144,12 @@ int main(int argc, char** argv) {
   std::vector<tensorflow::string> select =
       Split(FLAGS_select, ',', tensorflow::str_util::SkipEmpty());
 
+  tensorflow::string output_type;
+  std::map<tensorflow::string, tensorflow::string> output_options;
+  tensorflow::Status s = tensorflow::tfprof::ParseOutput(
+      FLAGS_output, &output_type, &output_options);
+  CHECK(s.ok()) << s.ToString();
+
   tensorflow::string cmd = "";
   if (argc == 1 && FLAGS_graph_path.empty()) {
     printf("1) go/tfprof: Tutorial.\n");
@@ -160,12 +166,13 @@ int main(int argc, char** argv) {
         "Profiling everything!\n");
     return 0;
   } else if (argc > 1) {
-    if (tensorflow::string(argv[1]) == tensorflow::tfprof::kCmds[3]) {
+    if (tensorflow::string(argv[1]) == tensorflow::tfprof::kCmds[4]) {
       tensorflow::tfprof::PrintHelp();
       return 0;
     }
     if (tensorflow::string(argv[1]) == tensorflow::tfprof::kCmds[0] ||
-        tensorflow::string(argv[1]) == tensorflow::tfprof::kCmds[1]) {
+        tensorflow::string(argv[1]) == tensorflow::tfprof::kCmds[1] ||
+        tensorflow::string(argv[1]) == tensorflow::tfprof::kCmds[2]) {
       cmd = argv[1];
     }
   }
@@ -185,10 +192,18 @@ int main(int argc, char** argv) {
 
   std::unique_ptr<tensorflow::tfprof::OpLog> op_log(
       new tensorflow::tfprof::OpLog());
-  if (!ReadBinaryProto(tensorflow::Env::Default(), FLAGS_op_log_path,
-                       op_log.get())
-           .ok()) {
-    op_log.release();
+  if (!FLAGS_op_log_path.empty()) {
+    tensorflow::string op_log_str;
+    s = tensorflow::ReadFileToString(tensorflow::Env::Default(),
+                                     FLAGS_op_log_path, &op_log_str);
+    if (!s.ok()) {
+      fprintf(stderr, "Failed to read op_log_path: %s\n", s.ToString().c_str());
+      return 1;
+    }
+    if (!tensorflow::ParseProtoUnlimited(op_log.get(), op_log_str)) {
+      fprintf(stderr, "Failed to parse op_log_path\n");
+      return 1;
+    }
   }
 
   std::unique_ptr<tensorflow::checkpoint::CheckpointReader> ckpt_reader;
@@ -211,10 +226,13 @@ int main(int argc, char** argv) {
       FLAGS_max_depth, FLAGS_min_bytes, FLAGS_min_micros, FLAGS_min_params,
       FLAGS_min_float_ops, device_regexes, FLAGS_order_by, account_type_regexes,
       start_name_regexes, trim_name_regexes, show_name_regexes,
-      hide_name_regexes, FLAGS_account_displayed_op_only, select, FLAGS_viz,
-      FLAGS_dump_to_file);
+      hide_name_regexes, FLAGS_account_displayed_op_only, select, output_type,
+      output_options);
 
-  if (!cmd.empty()) {
+  if (cmd == tensorflow::tfprof::kCmds[2]) {
+    tf_stat.PrintCode(opts);
+    return 0;
+  } else if (!cmd.empty()) {
     tf_stat.PrintGraph(cmd, opts);
     return 0;
   }
@@ -240,10 +258,12 @@ int main(int argc, char** argv) {
       fprintf(stderr, "E: %s\n", s.ToString().c_str());
       continue;
     }
-    if (cmd == tensorflow::tfprof::kCmds[2]) {
+    if (cmd == tensorflow::tfprof::kCmds[3]) {
       opts = new_opts;
-    } else if (cmd == tensorflow::tfprof::kCmds[3]) {
+    } else if (cmd == tensorflow::tfprof::kCmds[4]) {
       tensorflow::tfprof::PrintHelp();
+    } else if (cmd == tensorflow::tfprof::kCmds[2]) {
+      tf_stat.PrintCode(new_opts);
     } else {
       tf_stat.PrintGraph(cmd, new_opts);
     }
diff --git a/tensorflow/tools/tfprof/tfprof_options.proto b/tensorflow/tools/tfprof/tfprof_options.proto
index 0d8e6880390328586068fe57daff1f4a66fb0bc8..84a2e14005374a7fdefbb411e05ea5b7f07808f1 100644
--- a/tensorflow/tools/tfprof/tfprof_options.proto
+++ b/tensorflow/tools/tfprof/tfprof_options.proto
@@ -19,6 +19,6 @@ message OptionsProto {
   repeated string hide_name_regexes = 12;
   optional bool account_displayed_op_only = 13;
   repeated string select = 14;
-  optional bool viz = 15;
+  optional string output = 15;
   optional string dump_to_file = 16;
-}
\ No newline at end of file
+}
diff --git a/tensorflow/tools/tfprof/tfprof_output.proto b/tensorflow/tools/tfprof/tfprof_output.proto
index 9afd41046e4eb00150e0bb514d59e363d80c9d59..93e6c1233c30d008fed73f0e863c627e54d05c37 100644
--- a/tensorflow/tools/tfprof/tfprof_output.proto
+++ b/tensorflow/tools/tfprof/tfprof_output.proto
@@ -14,7 +14,8 @@ message TFProfTensorProto {
   repeated string value_str = 4;
 }
 
-message TFProfNode {
+// A node in TensorFlow graph. Used by scope/graph view.
+message TFGraphNodeProto {
   // op name.
   optional string name = 1;
   // tensor value restored from checkpoint.
@@ -30,7 +31,8 @@ message TFProfNode {
   // Number of inputs to the op.
   optional int64 inputs = 5;
   // Device the op is assigned to.
-  optional string device = 10;
+  // Since an op can fire multiple kernel calls, there can be multiple devices.
+  repeated string devices = 10;
 
   // The following are the aggregated stats from all accounted descendants and
   // the op itself. The actual descendants depend on the data structure used
@@ -45,5 +47,34 @@ message TFProfNode {
   repeated TensorShapeProto shapes = 11;
   // Descendants of the graph. The actual descendants depend on the data
   // structure used (scope, graph).
-  repeated TFProfNode children = 12;
+  repeated TFGraphNodeProto children = 12;
+}
+
+// A node in TensorFlow Python call trace stack. Used by code view.
+message TFCodeNodeProto {
+  // A trace in the trace stack.
+  optional string name = 1;
+
+  // code execution time.
+  optional int64 exec_micros = 2;
+  // Total requested bytes by the code.
+  optional int64 requested_bytes = 3;
+  // Number of parameters if available.
+  optional int64 parameters = 4;
+  // Number of float operations.
+  optional int64 float_ops = 5;
+
+  // The following are the aggregated stats from called descendents and the
+  // trace itself. The actual descendants depend on the data structure used.
+  optional int64 total_exec_micros = 6;
+  optional int64 total_requested_bytes = 7;
+  optional int64 total_parameters = 8;
+  optional int64 total_float_ops = 9;
+
+  // A set of graph nodes created by the leaf of the call stack.
+  // 'children' field should be empty if graph_nodes is non-empty.
+  repeated TFGraphNodeProto graph_nodes = 10;
+  // Descendants of the graph. The actual descendants depend on the data
+  // structure used (scope, graph).
+  repeated TFCodeNodeProto children = 11;
 }
\ No newline at end of file
diff --git a/tensorflow/workspace.bzl b/tensorflow/workspace.bzl
index 0f69d53ba4e6c1b95d3904441c248a961b07ace0..2a206b0acd08c3a475cbd7f62bc6dc5eadef6217 100644
--- a/tensorflow/workspace.bzl
+++ b/tensorflow/workspace.bzl
@@ -4,10 +4,23 @@ load("//third_party/gpus:cuda_configure.bzl", "cuda_configure")
 load("//third_party/sycl:sycl_configure.bzl", "sycl_configure")
 load("@io_bazel_rules_closure//closure/private:java_import_external.bzl", "java_import_external")
 load("@io_bazel_rules_closure//closure:defs.bzl", "filegroup_external")
-load("@io_bazel_rules_closure//closure:defs.bzl", "webfiles_external")
+load("@io_bazel_rules_closure//closure:defs.bzl", "web_library_external")
 load("//third_party/py:python_configure.bzl", "python_configure")
 
 
+def _is_windows(repository_ctx):
+  """Returns true if the host operating system is windows."""
+  return repository_ctx.os.name.lower().find("windows") != -1
+
+
+def _get_env_var(repository_ctx, name):
+  """Find an environment variable."""
+  if name in repository_ctx.os.environ:
+    return repository_ctx.os.environ[name]
+  else:
+    return None
+
+
 # Parse the bazel version string from `native.bazel_version`.
 def _parse_bazel_version(bazel_version):
   # Remove commit from version.
@@ -74,7 +87,7 @@ temp_workaround_http_archive = repository_rule(
 # Executes specified command with arguments and calls 'fail' if it exited with
 # non-zero code
 def _execute_and_check_ret_code(repo_ctx, cmd_and_args):
-  result = repo_ctx.execute(cmd_and_args)
+  result = repo_ctx.execute(cmd_and_args, timeout=10)
   if result.return_code != 0:
     fail(("Non-zero return code({1}) when executing '{0}':\n" + "Stdout: {2}\n"
           + "Stderr: {3}").format(" ".join(cmd_and_args), result.return_code,
@@ -84,9 +97,15 @@ def _execute_and_check_ret_code(repo_ctx, cmd_and_args):
 # Apply a patch_file to the repository root directory
 # Runs 'patch -p1'
 def _apply_patch(repo_ctx, patch_file):
-  _execute_and_check_ret_code(repo_ctx, [
+  cmd = [
       "patch", "-p1", "-d", repo_ctx.path("."), "-i", repo_ctx.path(patch_file)
-  ])
+  ]
+  if _is_windows(repo_ctx):
+    bazel_sh = _get_env_var(repo_ctx, "BAZEL_SH")
+    if not bazel_sh:
+      fail("BAZEL_SH environment variable is not set")
+    cmd = [bazel_sh, "-c", " ".join(cmd)]
+  _execute_and_check_ret_code(repo_ctx, cmd)
 
 
 # Download the repository and apply a patch to its root
@@ -131,7 +150,7 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
   native.new_http_archive(
       name = "eigen_archive",
       urls = [
-          "http://bazel-mirror.storage.googleapis.com/bitbucket.org/eigen/eigen/get/f3a22f35b044.tar.gz",
+          "http://mirror.bazel.build/bitbucket.org/eigen/eigen/get/f3a22f35b044.tar.gz",
           "https://bitbucket.org/eigen/eigen/get/f3a22f35b044.tar.gz",
       ],
       sha256 = "ca7beac153d4059c02c8fc59816c82d54ea47fe58365e8aded4082ded0b820c4",
@@ -142,7 +161,7 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
   native.new_http_archive(
       name = "libxsmm_archive",
       urls = [
-          "http://bazel-mirror.storage.googleapis.com/github.com/hfp/libxsmm/archive/1.8.tar.gz",
+          "http://mirror.bazel.build/github.com/hfp/libxsmm/archive/1.8.tar.gz",
           "https://github.com/hfp/libxsmm/archive/1.8.tar.gz",
       ],
       sha256 = "0330201afb5525d0950ec861fec9dd75eb40a03845ebe03d2c635cf8bfc14fea",
@@ -158,7 +177,7 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
   native.new_http_archive(
       name = "ortools_archive",
       urls = [
-          "http://bazel-mirror.storage.googleapis.com/github.com/google/or-tools/archive/253f7955c6a1fd805408fba2e42ac6d45b312d15.tar.gz",
+          "http://mirror.bazel.build/github.com/google/or-tools/archive/253f7955c6a1fd805408fba2e42ac6d45b312d15.tar.gz",
           "https://github.com/google/or-tools/archive/253f7955c6a1fd805408fba2e42ac6d45b312d15.tar.gz",
       ],
       sha256 = "932075525642b04ac6f1b50589f1df5cd72ec2f448b721fd32234cf183f0e755",
@@ -169,7 +188,7 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
   native.http_archive(
       name = "com_googlesource_code_re2",
       urls = [
-          "http://bazel-mirror.storage.googleapis.com/github.com/google/re2/archive/b94b7cd42e9f02673cd748c1ac1d16db4052514c.tar.gz",
+          "http://mirror.bazel.build/github.com/google/re2/archive/b94b7cd42e9f02673cd748c1ac1d16db4052514c.tar.gz",
           "https://github.com/google/re2/archive/b94b7cd42e9f02673cd748c1ac1d16db4052514c.tar.gz",
       ],
       sha256 = "bd63550101e056427c9e7ff12a408c1c8b74e9803f393ca916b2926fc2c4906f",
@@ -179,7 +198,7 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
   native.http_archive(
       name = "gemmlowp",
       urls = [
-          "http://bazel-mirror.storage.googleapis.com/github.com/google/gemmlowp/archive/a6f29d8ac48d63293f845f2253eccbf86bc28321.tar.gz",
+          "http://mirror.bazel.build/github.com/google/gemmlowp/archive/a6f29d8ac48d63293f845f2253eccbf86bc28321.tar.gz",
           "https://github.com/google/gemmlowp/archive/a6f29d8ac48d63293f845f2253eccbf86bc28321.tar.gz",
       ],
       sha256 = "75d40ea8e68b0d1644f052fffe8f14a410b2a73d40ccb859a95c0578d194ec26",
@@ -189,7 +208,7 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
   native.new_http_archive(
       name = "farmhash_archive",
       urls = [
-          "http://bazel-mirror.storage.googleapis.com/github.com/google/farmhash/archive/92e897b282426729f4724d91a637596c7e2fe28f.zip",
+          "http://mirror.bazel.build/github.com/google/farmhash/archive/92e897b282426729f4724d91a637596c7e2fe28f.zip",
           "https://github.com/google/farmhash/archive/92e897b282426729f4724d91a637596c7e2fe28f.zip",
       ],
       sha256 = "4c626d1f306bda2c6804ab955892f803f5245f4dcaecb4979dc08b091256da54",
@@ -205,7 +224,7 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
   native.new_http_archive(
       name = "highwayhash",
       urls = [
-          "http://bazel-mirror.storage.googleapis.com/github.com/google/highwayhash/archive/dfcb97ca4fe9277bf9dc1802dd979b071896453b.tar.gz",
+          "http://mirror.bazel.build/github.com/google/highwayhash/archive/dfcb97ca4fe9277bf9dc1802dd979b071896453b.tar.gz",
           "https://github.com/google/highwayhash/archive/dfcb97ca4fe9277bf9dc1802dd979b071896453b.tar.gz",
       ],
       sha256 = "0f30a15b1566d93f146c8d149878a06e91d9bb7ec2cfd76906df62a82be4aac9",
@@ -216,7 +235,7 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
   native.new_http_archive(
       name = "nasm",
       urls = [
-          "http://bazel-mirror.storage.googleapis.com/www.nasm.us/pub/nasm/releasebuilds/2.12.02/nasm-2.12.02.tar.bz2",
+          "http://mirror.bazel.build/www.nasm.us/pub/nasm/releasebuilds/2.12.02/nasm-2.12.02.tar.bz2",
           "http://pkgs.fedoraproject.org/repo/pkgs/nasm/nasm-2.12.02.tar.bz2/d15843c3fb7db39af80571ee27ec6fad/nasm-2.12.02.tar.bz2",
       ],
       sha256 = "00b0891c678c065446ca59bcee64719d0096d54d6886e6e472aeee2e170ae324",
@@ -227,7 +246,7 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
   temp_workaround_http_archive(
       name = "jpeg",
       urls = [
-          "http://bazel-mirror.storage.googleapis.com/github.com/libjpeg-turbo/libjpeg-turbo/archive/1.5.1.tar.gz",
+          "http://mirror.bazel.build/github.com/libjpeg-turbo/libjpeg-turbo/archive/1.5.1.tar.gz",
           "https://github.com/libjpeg-turbo/libjpeg-turbo/archive/1.5.1.tar.gz",
       ],
       sha256 = "c15a9607892113946379ccea3ca8b85018301b200754f209453ab21674268e77",
@@ -239,7 +258,7 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
   native.new_http_archive(
       name = "png_archive",
       urls = [
-          "http://bazel-mirror.storage.googleapis.com/github.com/glennrp/libpng/archive/v1.2.53.zip",
+          "http://mirror.bazel.build/github.com/glennrp/libpng/archive/v1.2.53.zip",
           "https://github.com/glennrp/libpng/archive/v1.2.53.zip",
       ],
       sha256 = "c35bcc6387495ee6e757507a68ba036d38ad05b415c2553b3debe2a57647a692",
@@ -250,7 +269,7 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
   native.new_http_archive(
       name = "gif_archive",
       urls = [
-          "http://bazel-mirror.storage.googleapis.com/ufpr.dl.sourceforge.net/project/giflib/giflib-5.1.4.tar.gz",
+          "http://mirror.bazel.build/ufpr.dl.sourceforge.net/project/giflib/giflib-5.1.4.tar.gz",
           "http://ufpr.dl.sourceforge.net/project/giflib/giflib-5.1.4.tar.gz",
           "http://pilotfiber.dl.sourceforge.net/project/giflib/giflib-5.1.4.tar.gz",
       ],
@@ -262,7 +281,7 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
   native.new_http_archive(
       name = "six_archive",
       urls = [
-          "http://bazel-mirror.storage.googleapis.com/pypi.python.org/packages/source/s/six/six-1.10.0.tar.gz",
+          "http://mirror.bazel.build/pypi.python.org/packages/source/s/six/six-1.10.0.tar.gz",
           "http://pypi.python.org/packages/source/s/six/six-1.10.0.tar.gz",
       ],
       sha256 = "105f8d68616f8248e24bf0e9372ef04d3cc10104f1980f54d57b2ce73a5ad56a",
@@ -273,7 +292,7 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
   native.new_http_archive(
       name = "org_pythonhosted_markdown",
       urls = [
-          "http://bazel-mirror.storage.googleapis.com/pypi.python.org/packages/1d/25/3f6d2cb31ec42ca5bd3bfbea99b63892b735d76e26f20dd2dcc34ffe4f0d/Markdown-2.6.8.tar.gz",
+          "http://mirror.bazel.build/pypi.python.org/packages/1d/25/3f6d2cb31ec42ca5bd3bfbea99b63892b735d76e26f20dd2dcc34ffe4f0d/Markdown-2.6.8.tar.gz",
           "https://pypi.python.org/packages/1d/25/3f6d2cb31ec42ca5bd3bfbea99b63892b735d76e26f20dd2dcc34ffe4f0d/Markdown-2.6.8.tar.gz",
       ],
       strip_prefix = "Markdown-2.6.8",
@@ -284,7 +303,7 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
   native.new_http_archive(
       name = "org_html5lib",
       urls = [
-          "http://bazel-mirror.storage.googleapis.com/github.com/html5lib/html5lib-python/archive/0.9999999.tar.gz",
+          "http://mirror.bazel.build/github.com/html5lib/html5lib-python/archive/0.9999999.tar.gz",
           "https://github.com/html5lib/html5lib-python/archive/0.9999999.tar.gz",  # identical to 1.0b8
       ],
       sha256 = "184257f98539159a433e2a2197309657ae1283b4c44dbd9c87b2f02ff36adce8",
@@ -295,7 +314,7 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
   native.new_http_archive(
       name = "org_mozilla_bleach",
       urls = [
-          "http://bazel-mirror.storage.googleapis.com/github.com/mozilla/bleach/archive/v1.5.tar.gz",
+          "http://mirror.bazel.build/github.com/mozilla/bleach/archive/v1.5.tar.gz",
           "https://github.com/mozilla/bleach/archive/v1.5.tar.gz",
       ],
       strip_prefix = "bleach-1.5",
@@ -306,7 +325,7 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
   native.new_http_archive(
       name = "org_pocoo_werkzeug",
       urls = [
-          "http://bazel-mirror.storage.googleapis.com/pypi.python.org/packages/b7/7f/44d3cfe5a12ba002b253f6985a4477edfa66da53787a2a838a40f6415263/Werkzeug-0.11.10.tar.gz",
+          "http://mirror.bazel.build/pypi.python.org/packages/b7/7f/44d3cfe5a12ba002b253f6985a4477edfa66da53787a2a838a40f6415263/Werkzeug-0.11.10.tar.gz",
           "https://pypi.python.org/packages/b7/7f/44d3cfe5a12ba002b253f6985a4477edfa66da53787a2a838a40f6415263/Werkzeug-0.11.10.tar.gz",
       ],
       strip_prefix = "Werkzeug-0.11.10",
@@ -322,7 +341,7 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
   patched_http_archive(
       name = "protobuf",
       urls = [
-          "http://bazel-mirror.storage.googleapis.com/github.com/google/protobuf/archive/2b7430d96aeff2bb624c8d52182ff5e4b9f7f18a.tar.gz",
+          "http://mirror.bazel.build/github.com/google/protobuf/archive/2b7430d96aeff2bb624c8d52182ff5e4b9f7f18a.tar.gz",
           "https://github.com/google/protobuf/archive/2b7430d96aeff2bb624c8d52182ff5e4b9f7f18a.tar.gz",
       ],
       sha256 = "e5d3d4e227a0f7afb8745df049bbd4d55474b158ca5aaa2a0e31099af24be1d0",
@@ -340,7 +359,7 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
   native.http_archive(
       name = "com_google_protobuf",
       urls = [
-          "http://bazel-mirror.storage.googleapis.com/github.com/google/protobuf/archive/2b7430d96aeff2bb624c8d52182ff5e4b9f7f18a.tar.gz",
+          "http://mirror.bazel.build/github.com/google/protobuf/archive/2b7430d96aeff2bb624c8d52182ff5e4b9f7f18a.tar.gz",
           "https://github.com/google/protobuf/archive/2b7430d96aeff2bb624c8d52182ff5e4b9f7f18a.tar.gz",
       ],
       sha256 = "e5d3d4e227a0f7afb8745df049bbd4d55474b158ca5aaa2a0e31099af24be1d0",
@@ -350,7 +369,7 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
   native.http_archive(
       name = "com_google_protobuf_cc",
       urls = [
-          "http://bazel-mirror.storage.googleapis.com/github.com/google/protobuf/archive/2b7430d96aeff2bb624c8d52182ff5e4b9f7f18a.tar.gz",
+          "http://mirror.bazel.build/github.com/google/protobuf/archive/2b7430d96aeff2bb624c8d52182ff5e4b9f7f18a.tar.gz",
           "https://github.com/google/protobuf/archive/2b7430d96aeff2bb624c8d52182ff5e4b9f7f18a.tar.gz",
       ],
       sha256 = "e5d3d4e227a0f7afb8745df049bbd4d55474b158ca5aaa2a0e31099af24be1d0",
@@ -360,7 +379,7 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
   native.new_http_archive(
       name = "gmock_archive",
       urls = [
-          "http://bazel-mirror.storage.googleapis.com/github.com/google/googletest/archive/release-1.8.0.zip",
+          "http://mirror.bazel.build/github.com/google/googletest/archive/release-1.8.0.zip",
           "https://github.com/google/googletest/archive/release-1.8.0.zip",
       ],
       sha256 = "f3ed3b58511efd272eb074a3a6d6fb79d7c2e6a0e374323d1e6bcbcc1ef141bf",
@@ -381,7 +400,7 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
   native.http_archive(
       name = "com_github_gflags_gflags",
       urls = [
-          "http://bazel-mirror.storage.googleapis.com/github.com/gflags/gflags/archive/f8a0efe03aa69b3336d8e228b37d4ccb17324b88.tar.gz",
+          "http://mirror.bazel.build/github.com/gflags/gflags/archive/f8a0efe03aa69b3336d8e228b37d4ccb17324b88.tar.gz",
           "https://github.com/gflags/gflags/archive/f8a0efe03aa69b3336d8e228b37d4ccb17324b88.tar.gz",
       ],
       sha256 = "4d222fab8f1ede4709cdff417d15a1336f862d7334a81abf76d09c15ecf9acd1",
@@ -397,7 +416,7 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
       name = "pcre",
       sha256 = "ccdf7e788769838f8285b3ee672ed573358202305ee361cfec7a4a4fb005bbc7",
       urls = [
-          "http://bazel-mirror.storage.googleapis.com/ftp.exim.org/pub/pcre/pcre-8.39.tar.gz",
+          "http://mirror.bazel.build/ftp.exim.org/pub/pcre/pcre-8.39.tar.gz",
           "http://ftp.exim.org/pub/pcre/pcre-8.39.tar.gz",
       ],
       strip_prefix = "pcre-8.39",
@@ -408,7 +427,7 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
       name = "swig",
       sha256 = "58a475dbbd4a4d7075e5fe86d4e54c9edde39847cdb96a3053d87cb64a23a453",
       urls = [
-          "http://bazel-mirror.storage.googleapis.com/ufpr.dl.sourceforge.net/project/swig/swig/swig-3.0.8/swig-3.0.8.tar.gz",
+          "http://mirror.bazel.build/ufpr.dl.sourceforge.net/project/swig/swig/swig-3.0.8/swig-3.0.8.tar.gz",
           "http://ufpr.dl.sourceforge.net/project/swig/swig/swig-3.0.8/swig-3.0.8.tar.gz",
           "http://pilotfiber.dl.sourceforge.net/project/swig/swig/swig-3.0.8/swig-3.0.8.tar.gz",
       ],
@@ -420,7 +439,7 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
       name = "curl",
       sha256 = "ff3e80c1ca6a068428726cd7dd19037a47cc538ce58ef61c59587191039b2ca6",
       urls = [
-          "http://bazel-mirror.storage.googleapis.com/curl.haxx.se/download/curl-7.49.1.tar.gz",
+          "http://mirror.bazel.build/curl.haxx.se/download/curl-7.49.1.tar.gz",
           "https://curl.haxx.se/download/curl-7.49.1.tar.gz",
       ],
       strip_prefix = "curl-7.49.1",
@@ -443,7 +462,7 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
   native.new_http_archive(
       name = "grpc",
       urls = [
-          "http://bazel-mirror.storage.googleapis.com/github.com/grpc/grpc/archive/d7ff4ff40071d2b486a052183e3e9f9382afb745.tar.gz",
+          "http://mirror.bazel.build/github.com/grpc/grpc/archive/d7ff4ff40071d2b486a052183e3e9f9382afb745.tar.gz",
           "https://github.com/grpc/grpc/archive/d7ff4ff40071d2b486a052183e3e9f9382afb745.tar.gz",
       ],
       sha256 = "a15f352436ab92c521b1ac11e729e155ace38d0856380cf25048c5d1d9ba8e31",
@@ -467,7 +486,7 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
       name = "linenoise",
       sha256 = "7f51f45887a3d31b4ce4fa5965210a5e64637ceac12720cfce7954d6a2e812f7",
       urls = [
-          "http://bazel-mirror.storage.googleapis.com/github.com/antirez/linenoise/archive/c894b9e59f02203dbe4e2be657572cf88c4230c3.tar.gz",
+          "http://mirror.bazel.build/github.com/antirez/linenoise/archive/c894b9e59f02203dbe4e2be657572cf88c4230c3.tar.gz",
           "https://github.com/antirez/linenoise/archive/c894b9e59f02203dbe4e2be657572cf88c4230c3.tar.gz",
       ],
       strip_prefix = "linenoise-c894b9e59f02203dbe4e2be657572cf88c4230c3",
@@ -479,11 +498,11 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
   temp_workaround_http_archive(
       name = "llvm",
       urls = [
-          "http://bazel-mirror.storage.googleapis.com/github.com/llvm-mirror/llvm/archive/8a1f075c93565dd665a10ac38490f644b2c02037.tar.gz",
-          "https://github.com/llvm-mirror/llvm/archive/8a1f075c93565dd665a10ac38490f644b2c02037.tar.gz",
+          "http://mirror.bazel.build/github.com/llvm-mirror/llvm/archive/c978c0ff91f7c4ea58cfbd8f378e51c6af2c2b4b.tar.gz",
+          "https://github.com/llvm-mirror/llvm/archive/c978c0ff91f7c4ea58cfbd8f378e51c6af2c2b4b.tar.gz",
       ],
-      sha256 = "d9ebd0b49544f3b20ee2a412aac18ed8899b8eef376343a6ba8e179563cbfd86",
-      strip_prefix = "llvm-8a1f075c93565dd665a10ac38490f644b2c02037",
+      sha256 = "42c57d798a037d9dea692ce1da8ff4d24966ab5a40494015b374341e43411a37",
+      strip_prefix = "llvm-c978c0ff91f7c4ea58cfbd8f378e51c6af2c2b4b",
       build_file = str(Label("//third_party/llvm:llvm.BUILD")),
       repository = tf_repo_name,
   )
@@ -491,7 +510,7 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
   native.new_http_archive(
       name = "jsoncpp_git",
       urls = [
-          "http://bazel-mirror.storage.googleapis.com/github.com/open-source-parsers/jsoncpp/archive/11086dd6a7eba04289944367ca82cea71299ed70.tar.gz",
+          "http://mirror.bazel.build/github.com/open-source-parsers/jsoncpp/archive/11086dd6a7eba04289944367ca82cea71299ed70.tar.gz",
           "https://github.com/open-source-parsers/jsoncpp/archive/11086dd6a7eba04289944367ca82cea71299ed70.tar.gz",
       ],
       sha256 = "07d34db40593d257324ec5fb9debc4dc33f29f8fb44e33a2eeb35503e61d0fe2",
@@ -507,7 +526,7 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
   native.http_archive(
       name = "boringssl",
       urls = [
-          "http://bazel-mirror.storage.googleapis.com/github.com/google/boringssl/archive/bbcaa15b0647816b9a1a9b9e0d209cd6712f0105.tar.gz",
+          "http://mirror.bazel.build/github.com/google/boringssl/archive/bbcaa15b0647816b9a1a9b9e0d209cd6712f0105.tar.gz",
           "https://github.com/google/boringssl/archive/bbcaa15b0647816b9a1a9b9e0d209cd6712f0105.tar.gz",  # 2016-07-11
       ],
       sha256 = "025264d6e9a7ad371f2f66d17a28b6627de0c9592dc2eb54afd062f68f1f9aa3",
@@ -517,7 +536,7 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
   native.new_http_archive(
       name = "nanopb_git",
       urls = [
-          "http://bazel-mirror.storage.googleapis.com/github.com/nanopb/nanopb/archive/1251fa1065afc0d62f635e0f63fec8276e14e13c.tar.gz",
+          "http://mirror.bazel.build/github.com/nanopb/nanopb/archive/1251fa1065afc0d62f635e0f63fec8276e14e13c.tar.gz",
           "https://github.com/nanopb/nanopb/archive/1251fa1065afc0d62f635e0f63fec8276e14e13c.tar.gz",
       ],
       sha256 = "ab1455c8edff855f4f55b68480991559e51c11e7dab060bbab7cffb12dd3af33",
@@ -533,7 +552,7 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
   native.new_http_archive(
       name = "zlib_archive",
       urls = [
-          "http://bazel-mirror.storage.googleapis.com/zlib.net/zlib-1.2.8.tar.gz",
+          "http://mirror.bazel.build/zlib.net/zlib-1.2.8.tar.gz",
           "http://zlib.net/fossils/zlib-1.2.8.tar.gz",
       ],
       sha256 = "36658cb768a54c1d4dec43c3116c27ed893e88b02ecfcb44f2166f9c0b7f2a0d",
@@ -549,7 +568,7 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
   native.new_http_archive(
       name = "fft2d",
       urls = [
-          "http://bazel-mirror.storage.googleapis.com/www.kurims.kyoto-u.ac.jp/~ooura/fft.tgz",
+          "http://mirror.bazel.build/www.kurims.kyoto-u.ac.jp/~ooura/fft.tgz",
           "http://www.kurims.kyoto-u.ac.jp/~ooura/fft.tgz",
       ],
       sha256 = "52bb637c70b971958ec79c9c8752b1df5ff0218a4db4510e60826e0cb79b5296",
@@ -559,7 +578,7 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
   temp_workaround_http_archive(
       name = "snappy",
       urls = [
-          "http://bazel-mirror.storage.googleapis.com/github.com/google/snappy/archive/1.1.4.zip",
+          "http://mirror.bazel.build/github.com/google/snappy/archive/1.1.4.zip",
           "https://github.com/google/snappy/archive/1.1.4.zip",
       ],
       sha256 = "6c74d2b663170d68184da353cdd71b5b7d57bc8888ef1e99b4929b5d680dba54",
@@ -571,7 +590,7 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
   temp_workaround_http_archive(
       name = "nccl_archive",
       urls = [
-          "http://bazel-mirror.storage.googleapis.com/github.com/nvidia/nccl/archive/ccfc4567dc3e2a37fb42cfbc64d10eb526e7da7b.tar.gz",
+          "http://mirror.bazel.build/github.com/nvidia/nccl/archive/ccfc4567dc3e2a37fb42cfbc64d10eb526e7da7b.tar.gz",
           "https://github.com/nvidia/nccl/archive/ccfc4567dc3e2a37fb42cfbc64d10eb526e7da7b.tar.gz",
       ],
       sha256 = "6c34a0862d9f8ed4ad5984c6a8206b351957bb14cf6ad7822720f285f4aada04",
@@ -584,7 +603,7 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
       name = "junit",
       jar_sha256 = "59721f0805e223d84b90677887d9ff567dc534d7c502ca903c0c2b17f05c116a",
       jar_urls = [
-          "http://bazel-mirror.storage.googleapis.com/repo1.maven.org/maven2/junit/junit/4.12/junit-4.12.jar",
+          "http://mirror.bazel.build/repo1.maven.org/maven2/junit/junit/4.12/junit-4.12.jar",
           "http://repo1.maven.org/maven2/junit/junit/4.12/junit-4.12.jar",
           "http://maven.ibiblio.org/maven2/junit/junit/4.12/junit-4.12.jar",
       ],
@@ -597,7 +616,7 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
       name = "org_hamcrest_core",
       jar_sha256 = "66fdef91e9739348df7a096aa384a5685f4e875584cce89386a7a47251c4d8e9",
       jar_urls = [
-          "http://bazel-mirror.storage.googleapis.com/repo1.maven.org/maven2/org/hamcrest/hamcrest-core/1.3/hamcrest-core-1.3.jar",
+          "http://mirror.bazel.build/repo1.maven.org/maven2/org/hamcrest/hamcrest-core/1.3/hamcrest-core-1.3.jar",
           "http://repo1.maven.org/maven2/org/hamcrest/hamcrest-core/1.3/hamcrest-core-1.3.jar",
           "http://maven.ibiblio.org/maven2/org/hamcrest/hamcrest-core/1.3/hamcrest-core-1.3.jar",
       ],
@@ -608,7 +627,7 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
   temp_workaround_http_archive(
       name = "jemalloc",
       urls = [
-          "http://bazel-mirror.storage.googleapis.com/github.com/jemalloc/jemalloc/archive/4.4.0.tar.gz",
+          "http://mirror.bazel.build/github.com/jemalloc/jemalloc/archive/4.4.0.tar.gz",
           "https://github.com/jemalloc/jemalloc/archive/4.4.0.tar.gz",
       ],
       sha256 = "3c8f25c02e806c3ce0ab5fb7da1817f89fc9732709024e2a81b6b82f7cc792a8",
@@ -617,6 +636,17 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
       repository = tf_repo_name,
   )
 
+  native.new_http_archive(
+      name = "com_google_pprof",
+      urls = [
+          "http://mirror.bazel.build/github.com/google/pprof/archive/c0fb62ec88c411cc91194465e54db2632845b650.tar.gz",
+          "https://github.com/google/pprof/archive/c0fb62ec88c411cc91194465e54db2632845b650.tar.gz",
+      ],
+      sha256 = "e0928ca4aa10ea1e0551e2d7ce4d1d7ea2d84b2abbdef082b0da84268791d0c4",
+      strip_prefix = "pprof-c0fb62ec88c411cc91194465e54db2632845b650",
+      build_file = str(Label("//third_party:pprof.BUILD")),
+  )
+
   ##############################################################################
   # TensorBoard Build Tools
 
@@ -634,23 +664,23 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
       licenses = ["notice"],
       sha256_urls_extract_macos = {
           "47109a00cac344d80296c195451bb5eee7c21727fcef1594384ddfe1f852957a": [
-              "http://bazel-mirror.storage.googleapis.com/nodejs.org/dist/v4.3.2/node-v4.3.2-darwin-x64.tar.xz",
+              "http://mirror.bazel.build/nodejs.org/dist/v4.3.2/node-v4.3.2-darwin-x64.tar.xz",
               "http://nodejs.org/dist/v4.3.2/node-v4.3.2-darwin-x64.tar.xz",
           ],
       },
       sha256_urls_windows = {
           "606c44c42d17866c017c50c0afadad411d9492ac4281d2431b937f881911614e": [
-              "http://bazel-mirror.storage.googleapis.com/nodejs.org/dist/v4.3.2/win-x64/node.exe",
+              "http://mirror.bazel.build/nodejs.org/dist/v4.3.2/win-x64/node.exe",
               "http://nodejs.org/dist/v4.3.2/win-x64/node.exe",
           ],
           "451a40570099a95488d6438f175813629e0430f87f23c8659bc18dc42494820a": [
-              "http://bazel-mirror.storage.googleapis.com/nodejs.org/dist/v4.3.2/win-x64/node.lib",
+              "http://mirror.bazel.build/nodejs.org/dist/v4.3.2/win-x64/node.lib",
               "http://nodejs.org/dist/v4.3.2/win-x64/node.lib",
           ],
       },
       sha256_urls_extract = {
           "4350d0431b49697517c6cca5d66adf5f74eb9101c52f52ae959fa94225822d44": [
-              "http://bazel-mirror.storage.googleapis.com/nodejs.org/dist/v4.3.2/node-v4.3.2-linux-x64.tar.xz",
+              "http://mirror.bazel.build/nodejs.org/dist/v4.3.2/node-v4.3.2-linux-x64.tar.xz",
               "http://nodejs.org/dist/v4.3.2/node-v4.3.2-linux-x64.tar.xz",
           ],
       },
@@ -668,13 +698,13 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
       name = "com_microsoft_typescript",
       licenses = ["notice"],  # Apache 2.0
       sha256_urls = {
-          "43a7c763fe024d5add8d5365e5a7981f4a359ba5bf86481f545a0db8f60d48cc": [
-              "http://bazel-mirror.storage.googleapis.com/raw.githubusercontent.com/Microsoft/TypeScript/v2.2.2/lib/tsc.js",
-              "https://raw.githubusercontent.com/Microsoft/TypeScript/v2.2.2/lib/tsc.js",
+          "8465342c318f9c4cf0a29b109fa63ee3742dd4dc7080d05d9fd8f604814d04cf": [
+              "http://mirror.bazel.build/raw.githubusercontent.com/Microsoft/TypeScript/v2.3.1/lib/tsc.js",
+              "https://raw.githubusercontent.com/Microsoft/TypeScript/v2.3.1/lib/tsc.js",
           ],
-          "aecec1e47a3b3d872e214cb9adb82b30d6bd0471ea0aad7311ad81428566627c": [
-              "http://bazel-mirror.storage.googleapis.com/raw.githubusercontent.com/Microsoft/TypeScript/v2.2.2/lib/lib.es6.d.ts",
-              "https://raw.githubusercontent.com/Microsoft/TypeScript/v2.2.2/lib/lib.es6.d.ts",
+          "a67e36da3029d232e4e938e61a0a3302f516d71e7100d54dbf5362ad8618e994": [
+              "http://mirror.bazel.build/raw.githubusercontent.com/Microsoft/TypeScript/v2.3.1/lib/lib.es6.d.ts",
+              "https://raw.githubusercontent.com/Microsoft/TypeScript/v2.3.1/lib/lib.es6.d.ts",
           ],
       },
       extra_build_file_content = "\n".join([
@@ -706,15 +736,17 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
   ##############################################################################
   # TensorBoard JavaScript Production Dependencies
 
-  filegroup_external(
+  web_library_external(
       name = "com_lodash",
       licenses = ["notice"],  # MIT
-      sha256_urls = {
-          "7c7b391810bc08cf815683431857c51b5ee190062ae4f557e1e4689d6dd910ea": [
-              "http://bazel-mirror.storage.googleapis.com/raw.githubusercontent.com/lodash/lodash/3.8.0/lodash.js",
-              "https://raw.githubusercontent.com/lodash/lodash/3.8.0/lodash.js",
-          ],
-      },
+      sha256 = "0e88207e5f90af4ce8790d6e1e7d09d2702d81bce0bafdc253d18c0a5bf7661e",
+      urls = [
+          "http://mirror.bazel.build/github.com/lodash/lodash/archive/3.10.1.tar.gz",
+          "https://github.com/lodash/lodash/archive/3.10.1.tar.gz",
+      ],
+      strip_prefix = "lodash-3.10.1",
+      path = "/lodash",
+      srcs = ["lodash.js"],
   )
 
   filegroup_external(
@@ -723,7 +755,7 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
       licenses = ["notice"],  # MIT
       sha256_urls = {
           "dfaca3b8485bee735788cc6eebca82ea25719adc1fb8911c7799c6bd5a95df3b": [
-              "http://bazel-mirror.storage.googleapis.com/raw.githubusercontent.com/sloisel/numeric/v1.2.6/src/numeric.js",
+              "http://mirror.bazel.build/raw.githubusercontent.com/sloisel/numeric/v1.2.6/src/numeric.js",
               "https://raw.githubusercontent.com/sloisel/numeric/v1.2.6/src/numeric.js",
           ],
       },
@@ -735,27 +767,43 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
       licenses = ["notice"],  # MIT
       sha256_urls = {
           "77510d7538dbd3b59f1c8a06f68131b38562e3be546364747618d5112723e818": [
-              "http://bazel-mirror.storage.googleapis.com/raw.githubusercontent.com/palantir/plottable/v1.16.1/plottable.css",
+              "http://mirror.bazel.build/raw.githubusercontent.com/palantir/plottable/v1.16.1/plottable.css",
               "https://raw.githubusercontent.com/palantir/plottable/v1.16.1/plottable.css",
           ],
           "cd46dc709b01cd361e8399f797760871a6a207bc832e08fcff385ced02ef2b43": [
-              "http://bazel-mirror.storage.googleapis.com/raw.githubusercontent.com/palantir/plottable/v1.16.1/plottable.d.ts",
+              "http://mirror.bazel.build/raw.githubusercontent.com/palantir/plottable/v1.16.1/plottable.d.ts",
               "https://raw.githubusercontent.com/palantir/plottable/v1.16.1/plottable.d.ts",
           ],
           "32647b0fb4175fa875a71e6d56c761b88d975186ed6a8820e2c7854165a8988d": [
-              "http://bazel-mirror.storage.googleapis.com/raw.githubusercontent.com/palantir/plottable/v1.16.1/plottable.js",
+              "http://mirror.bazel.build/raw.githubusercontent.com/palantir/plottable/v1.16.1/plottable.js",
               "https://raw.githubusercontent.com/palantir/plottable/v1.16.1/plottable.js",
           ],
       },
   )
 
+  # TODO: Delete previous rule and rename this one org_palantir_plottable
+  filegroup_external(
+      name = "com_palantir_plottable_v3",
+      # no @license header
+      licenses = ["notice"],  # MIT
+      sha256_urls_extract = {
+          # Plottable doesn't have a release tarball on GitHub. Using the
+          # sources directly from git also requires running Node tooling
+          # beforehand to generate files. NPM is the only place to get it.
+          "e3159beb279391c47433789f22b32bac88488cfcad6c0b6ec8605ce6b0081b0d": [
+              "http://mirror.bazel.build/registry.npmjs.org/plottable/-/plottable-3.1.0.tgz",
+              "https://registry.npmjs.org/plottable/-/plottable-3.1.0.tgz",
+          ],
+      },
+  )
+
   filegroup_external(
       name = "io_github_cpettitt_dagre",
       # no @license header
       licenses = ["notice"],  # MIT
       sha256_urls = {
           "7323829ddd77924a69e2b1235ded3eac30acd990da0f037e0fbd3c8e9035b50d": [
-              "http://bazel-mirror.storage.googleapis.com/raw.githubusercontent.com/cpettitt/dagre/v0.7.4/dist/dagre.core.js",
+              "http://mirror.bazel.build/raw.githubusercontent.com/cpettitt/dagre/v0.7.4/dist/dagre.core.js",
               "https://raw.githubusercontent.com/cpettitt/dagre/v0.7.4/dist/dagre.core.js",
           ],
       },
@@ -763,11 +811,10 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
 
   filegroup_external(
       name = "io_github_cpettitt_graphlib",
-      # no @license header
       licenses = ["notice"],  # MIT
       sha256_urls = {
           "772045d412b1513b549be991c2e1846c38019429d43974efcae943fbe83489bf": [
-              "http://bazel-mirror.storage.googleapis.com/raw.githubusercontent.com/cpettitt/graphlib/v1.0.7/dist/graphlib.core.js",
+              "http://mirror.bazel.build/raw.githubusercontent.com/cpettitt/graphlib/v1.0.7/dist/graphlib.core.js",
               "https://raw.githubusercontent.com/cpettitt/graphlib/v1.0.7/dist/graphlib.core.js",
           ],
       },
@@ -779,7 +826,7 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
       licenses = ["notice"],  # MIT
       sha256_urls = {
           "f138fce57f673ca8a633f4aee5ae5b6fcb6ad0de59069a42a74e996fd04d8fcc": [
-              "http://bazel-mirror.storage.googleapis.com/raw.githubusercontent.com/waylonflinn/weblas/v0.9.0/dist/weblas.js",
+              "http://mirror.bazel.build/raw.githubusercontent.com/waylonflinn/weblas/v0.9.0/dist/weblas.js",
               "https://raw.githubusercontent.com/waylonflinn/weblas/v0.9.0/dist/weblas.js",
           ],
       },
@@ -791,82 +838,392 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
       licenses = ["notice"],  # BSD-3-Clause
       sha256_urls = {
           "bc1e38838f5c5c8e040132d41efee6bfddbef728210bd566479dc1694af1d3f5": [
-              "http://bazel-mirror.storage.googleapis.com/raw.githubusercontent.com/d3/d3/v3.5.15/d3.js",
+              "http://mirror.bazel.build/raw.githubusercontent.com/d3/d3/v3.5.15/d3.js",
               "https://raw.githubusercontent.com/d3/d3/v3.5.15/d3.js",
           ],
       },
   )
 
+  # TODO: Delete previous rule and rename this one org_d3js
+  filegroup_external(
+      name = "org_d3js_v4",
+      # no @license header
+      licenses = ["notice"],  # BSD-3-Clause
+      sha256_urls_extract = {
+          "b5fac5b296bc196e6aa7b59f9e33986fc44d23d59a0e211705187be9e35b943d": [
+              "http://mirror.bazel.build/github.com/d3/d3/releases/download/v4.8.0/d3.zip",
+              "https://github.com/d3/d3/releases/download/v4.8.0/d3.zip",
+          ],
+      },
+      # TODO(jart): Use srcs=["d3.js"] instead of this once supported.
+      generated_rule_name = "all_files",
+      extra_build_file_content = "\n".join([
+          "filegroup(",
+          "    name = \"org_d3js_v4\",",
+          "    srcs = [\"d3.js\"],",
+          ")",
+      ]),
+  )
+
   filegroup_external(
       name = "org_definitelytyped",
       licenses = ["notice"],  # MIT
       sha256_urls = {
           "b7da645f6e5555feb7aeede73775da0023ce2257df9c8e76c9159266035a9c0d": [
-              "http://bazel-mirror.storage.googleapis.com/raw.githubusercontent.com/DefinitelyTyped/DefinitelyTyped/ebc69904eb78f94030d5d517b42db20867f679c0/chai/chai.d.ts",
+              "http://mirror.bazel.build/raw.githubusercontent.com/DefinitelyTyped/DefinitelyTyped/ebc69904eb78f94030d5d517b42db20867f679c0/chai/chai.d.ts",
               "https://raw.githubusercontent.com/DefinitelyTyped/DefinitelyTyped/ebc69904eb78f94030d5d517b42db20867f679c0/chai/chai.d.ts",
           ],
           "177293828c7a206bf2a7f725753d51396d38668311aa37c96445f91bbf8128a7": [
-              "http://bazel-mirror.storage.googleapis.com/raw.githubusercontent.com/DefinitelyTyped/DefinitelyTyped/6e2f2280ef16ef277049d0ce8583af167d586c59/d3/d3.d.ts",
-              "https://raw.githubusercontent.com/DefinitelyTyped/DefinitelyTyped/6e2f2280ef16ef277049d0ce8583af167d586c59/d3/d3.d.ts",
+              "http://mirror.bazel.build/raw.githubusercontent.com/DefinitelyTyped/DefinitelyTyped/6e2f2280ef16ef277049d0ce8583af167d586c59/d3/d3.d.ts",  # v3
+              "https://raw.githubusercontent.com/DefinitelyTyped/DefinitelyTyped/6e2f2280ef16ef277049d0ce8583af167d586c59/d3/d3.d.ts",  # v3
           ],
           "e4cd3d5de0eb3bc7b1063b50d336764a0ac82a658b39b5cf90511f489ffdee60": [
-              "http://bazel-mirror.storage.googleapis.com/raw.githubusercontent.com/DefinitelyTyped/DefinitelyTyped/efd40e67ff323f7147651bdbef03c03ead7b1675/lodash/lodash.d.ts",
+              "http://mirror.bazel.build/raw.githubusercontent.com/DefinitelyTyped/DefinitelyTyped/efd40e67ff323f7147651bdbef03c03ead7b1675/lodash/lodash.d.ts",
               "https://raw.githubusercontent.com/DefinitelyTyped/DefinitelyTyped/efd40e67ff323f7147651bdbef03c03ead7b1675/lodash/lodash.d.ts",
           ],
           "695a03dd2ccb238161d97160b239ab841562710e5c4e42886aefd4ace2ce152e": [
-              "http://bazel-mirror.storage.googleapis.com/raw.githubusercontent.com/DefinitelyTyped/DefinitelyTyped/ebc69904eb78f94030d5d517b42db20867f679c0/mocha/mocha.d.ts",
+              "http://mirror.bazel.build/raw.githubusercontent.com/DefinitelyTyped/DefinitelyTyped/ebc69904eb78f94030d5d517b42db20867f679c0/mocha/mocha.d.ts",
               "https://raw.githubusercontent.com/DefinitelyTyped/DefinitelyTyped/ebc69904eb78f94030d5d517b42db20867f679c0/mocha/mocha.d.ts",
           ],
+          "513ccd9ee1c708881120eeacd56788fc3b3da8e5c6172b20324cebbe858803fe": [
+              "http://mirror.bazel.build/raw.githubusercontent.com/DefinitelyTyped/DefinitelyTyped/708609e0764daeb5eb64104af7aca50c520c4e6e/sinon/sinon.d.ts",
+              "https://raw.githubusercontent.com/DefinitelyTyped/DefinitelyTyped/708609e0764daeb5eb64104af7aca50c520c4e6e/sinon/sinon.d.ts",
+          ],
           "44eba36339bd1c0792072b7b204ee926fe5ffe1e9e2da916e67ac55548e3668a": [
-              "http://bazel-mirror.storage.googleapis.com/raw.githubusercontent.com/DefinitelyTyped/DefinitelyTyped/a872802c0c84ba98ff207d5e673a1fa867c67fd6/polymer/polymer.d.ts",
+              "http://mirror.bazel.build/raw.githubusercontent.com/DefinitelyTyped/DefinitelyTyped/a872802c0c84ba98ff207d5e673a1fa867c67fd6/polymer/polymer.d.ts",
               "https://raw.githubusercontent.com/DefinitelyTyped/DefinitelyTyped/a872802c0c84ba98ff207d5e673a1fa867c67fd6/polymer/polymer.d.ts",
           ],
+          "9453c3e6bae824e90758c3b38975c1ed77e6abd79bf513bcb08368fcdb14898e": [
+              "http://mirror.bazel.build/raw.githubusercontent.com/DefinitelyTyped/DefinitelyTyped/f5407eba29c04fb8387c86df27512bd055b195d2/threejs/three.d.ts",
+              "https://raw.githubusercontent.com/DefinitelyTyped/DefinitelyTyped/f5407eba29c04fb8387c86df27512bd055b195d2/threejs/three.d.ts",
+          ],
           "691756a6eb455f340c9e834de0d49fff269e7b8c1799c2454465dcd6a4435b80": [
-              "http://bazel-mirror.storage.googleapis.com/raw.githubusercontent.com/DefinitelyTyped/DefinitelyTyped/46719185c564694c5583c4b7ad94dbb786ecad46/webcomponents.js/webcomponents.js.d.ts",
+              "http://mirror.bazel.build/raw.githubusercontent.com/DefinitelyTyped/DefinitelyTyped/46719185c564694c5583c4b7ad94dbb786ecad46/webcomponents.js/webcomponents.js.d.ts",
               "https://raw.githubusercontent.com/DefinitelyTyped/DefinitelyTyped/46719185c564694c5583c4b7ad94dbb786ecad46/webcomponents.js/webcomponents.js.d.ts",
           ],
       },
   )
 
   filegroup_external(
-      name = "org_threejs",
-      # no @license header
+      name = "org_definitelytyped_types_d3_array",
       licenses = ["notice"],  # MIT
       sha256_urls = {
-          "7aff264bd84c90bed3c72a4dc31db8c19151853c6df6980f52b01d3e9872c82d": [
-              "http://bazel-mirror.storage.googleapis.com/raw.githubusercontent.com/mrdoob/three.js/ad419d40bdaab80abbb34b8f359b4ee840033a02/build/three.js",
-              "https://raw.githubusercontent.com/mrdoob/three.js/ad419d40bdaab80abbb34b8f359b4ee840033a02/build/three.js",
+          "61e7abb7b1f01fbcb0cab8cf39003392f422566209edd681fbd070eaa84ca000": [
+              "http://mirror.bazel.build/raw.githubusercontent.com/DefinitelyTyped/DefinitelyTyped/1550dfd1b8e38d9bf104b3fd16ea9bf98a2b358e/types/d3-array/index.d.ts",
+              "https://raw.githubusercontent.com/DefinitelyTyped/DefinitelyTyped/1550dfd1b8e38d9bf104b3fd16ea9bf98a2b358e/types/d3-array/index.d.ts",
           ],
-          "0e98ded15bb7fe398a655667e76b39909d36c0973a8950d01c62f65f93161c27": [
-              "http://bazel-mirror.storage.googleapis.com/raw.githubusercontent.com/mrdoob/three.js/ad419d40bdaab80abbb34b8f359b4ee840033a02/examples/js/controls/OrbitControls.js",
-              "https://raw.githubusercontent.com/mrdoob/three.js/ad419d40bdaab80abbb34b8f359b4ee840033a02/examples/js/controls/OrbitControls.js",
+      },
+  )
+
+  filegroup_external(
+      name = "org_definitelytyped_types_d3_axis",
+      licenses = ["notice"],  # MIT
+      sha256_urls = {
+          "95f75c8dcc89850b2e72581d96a7b5f46ea4ac852f828893f141f14a597421f9": [
+              "http://mirror.bazel.build/raw.githubusercontent.com/DefinitelyTyped/DefinitelyTyped/1550dfd1b8e38d9bf104b3fd16ea9bf98a2b358e/types/d3-axis/index.d.ts",
+              "https://raw.githubusercontent.com/DefinitelyTyped/DefinitelyTyped/1550dfd1b8e38d9bf104b3fd16ea9bf98a2b358e/types/d3-axis/index.d.ts",
           ],
       },
   )
 
-  ##############################################################################
-  # TensorBoard JavaScript Testing Dependencies
+  filegroup_external(
+      name = "org_definitelytyped_types_d3_brush",
+      licenses = ["notice"],  # MIT
+      sha256_urls = {
+          "a2738e693ce8a8640c2d29001e77582c9c361fd23bda44db471629866b60ada7": [
+              "http://mirror.bazel.build/raw.githubusercontent.com/DefinitelyTyped/DefinitelyTyped/1550dfd1b8e38d9bf104b3fd16ea9bf98a2b358e/types/d3-brush/index.d.ts",
+              "https://raw.githubusercontent.com/DefinitelyTyped/DefinitelyTyped/1550dfd1b8e38d9bf104b3fd16ea9bf98a2b358e/types/d3-brush/index.d.ts",
+          ],
+      },
+  )
 
   filegroup_external(
-      name = "com_chaijs",
-      # no @license header
+      name = "org_definitelytyped_types_d3_chord",
+      licenses = ["notice"],  # MIT
+      sha256_urls = {
+          "c54d24756eb6d744b31e538ad9bab3a75f6d54e2288b29cc72338d4a057d3e83": [
+              "http://mirror.bazel.build/raw.githubusercontent.com/DefinitelyTyped/DefinitelyTyped/1550dfd1b8e38d9bf104b3fd16ea9bf98a2b358e/types/d3-chord/index.d.ts",
+              "https://raw.githubusercontent.com/DefinitelyTyped/DefinitelyTyped/1550dfd1b8e38d9bf104b3fd16ea9bf98a2b358e/types/d3-chord/index.d.ts",
+          ],
+      },
+  )
+
+  filegroup_external(
+      name = "org_definitelytyped_types_d3_collection",
+      licenses = ["notice"],  # MIT
+      sha256_urls = {
+          "f987667167b1d2970911247e325eb1c37ca0823646f81ccec837ae59039822f7": [
+              "http://mirror.bazel.build/raw.githubusercontent.com/DefinitelyTyped/DefinitelyTyped/1550dfd1b8e38d9bf104b3fd16ea9bf98a2b358e/types/d3-collection/index.d.ts",
+              "https://raw.githubusercontent.com/DefinitelyTyped/DefinitelyTyped/1550dfd1b8e38d9bf104b3fd16ea9bf98a2b358e/types/d3-collection/index.d.ts",
+          ],
+      },
+  )
+
+  filegroup_external(
+      name = "org_definitelytyped_types_d3_color",
+      licenses = ["notice"],  # MIT
+      sha256_urls = {
+          "9580c81f38ddcce7be0ac9bd3d0d083adebc34e17441709f90b9e4dcd1c19a56": [
+              "http://mirror.bazel.build/raw.githubusercontent.com/DefinitelyTyped/DefinitelyTyped/1550dfd1b8e38d9bf104b3fd16ea9bf98a2b358e/types/d3-color/index.d.ts",
+              "https://raw.githubusercontent.com/DefinitelyTyped/DefinitelyTyped/1550dfd1b8e38d9bf104b3fd16ea9bf98a2b358e/types/d3-color/index.d.ts",
+          ],
+      },
+  )
+
+  filegroup_external(
+      name = "org_definitelytyped_types_d3_dispatch",
+      licenses = ["notice"],  # MIT
+      sha256_urls = {
+          "169f80b4cceca8e2e9ed384d81a5db0624cc01a26451dfb5a7e0cec6ea9cfb06": [
+              "http://mirror.bazel.build/raw.githubusercontent.com/DefinitelyTyped/DefinitelyTyped/1550dfd1b8e38d9bf104b3fd16ea9bf98a2b358e/types/d3-dispatch/index.d.ts",
+              "https://raw.githubusercontent.com/DefinitelyTyped/DefinitelyTyped/1550dfd1b8e38d9bf104b3fd16ea9bf98a2b358e/types/d3-dispatch/index.d.ts",
+          ],
+      },
+  )
+
+  filegroup_external(
+      name = "org_definitelytyped_types_d3_drag",
+      licenses = ["notice"],  # MIT
+      sha256_urls = {
+          "08d35d139dde58c2722be98d718d01204fd6167d310f09b379e832f3c741489d": [
+              "http://mirror.bazel.build/raw.githubusercontent.com/DefinitelyTyped/DefinitelyTyped/1550dfd1b8e38d9bf104b3fd16ea9bf98a2b358e/types/d3-drag/index.d.ts",
+              "https://raw.githubusercontent.com/DefinitelyTyped/DefinitelyTyped/1550dfd1b8e38d9bf104b3fd16ea9bf98a2b358e/types/d3-drag/index.d.ts",
+          ],
+      },
+  )
+
+  filegroup_external(
+      name = "org_definitelytyped_types_d3_dsv",
+      licenses = ["notice"],  # MIT
+      sha256_urls = {
+          "62594d00cf9e4bb895339c8e56f64330e202a5eb2a0fa580a1f6e6336f2c93ce": [
+              "http://mirror.bazel.build/raw.githubusercontent.com/DefinitelyTyped/DefinitelyTyped/1550dfd1b8e38d9bf104b3fd16ea9bf98a2b358e/types/d3-dsv/index.d.ts",
+              "https://raw.githubusercontent.com/DefinitelyTyped/DefinitelyTyped/1550dfd1b8e38d9bf104b3fd16ea9bf98a2b358e/types/d3-dsv/index.d.ts",
+          ],
+      },
+  )
+
+  filegroup_external(
+      name = "org_definitelytyped_types_d3_ease",
+      licenses = ["notice"],  # MIT
+      sha256_urls = {
+          "d1cf8f99b7bf758c2ba3c0a4ce553e151d4d9b4cf45a6e8bd0edec7ce90f725b": [
+              "http://mirror.bazel.build/raw.githubusercontent.com/DefinitelyTyped/DefinitelyTyped/1550dfd1b8e38d9bf104b3fd16ea9bf98a2b358e/types/d3-ease/index.d.ts",
+              "https://raw.githubusercontent.com/DefinitelyTyped/DefinitelyTyped/1550dfd1b8e38d9bf104b3fd16ea9bf98a2b358e/types/d3-ease/index.d.ts",
+          ],
+      },
+  )
+
+  filegroup_external(
+      name = "org_definitelytyped_types_d3_force",
+      licenses = ["notice"],  # MIT
+      sha256_urls = {
+          "288421e2008668d2076a4684657dd3d29b992832ef02c552981eb94a91042553": [
+              "http://mirror.bazel.build/raw.githubusercontent.com/DefinitelyTyped/DefinitelyTyped/1550dfd1b8e38d9bf104b3fd16ea9bf98a2b358e/types/d3-force/index.d.ts",
+              "https://raw.githubusercontent.com/DefinitelyTyped/DefinitelyTyped/1550dfd1b8e38d9bf104b3fd16ea9bf98a2b358e/types/d3-force/index.d.ts",
+          ],
+      },
+  )
+
+  filegroup_external(
+      name = "org_definitelytyped_types_d3_format",
+      licenses = ["notice"],  # MIT
+      sha256_urls = {
+          "b42cb17e580c1fd0b64d478f7bd80ca806efaefda24426a833cf1f30a7275bca": [
+              "http://mirror.bazel.build/raw.githubusercontent.com/DefinitelyTyped/DefinitelyTyped/1550dfd1b8e38d9bf104b3fd16ea9bf98a2b358e/types/d3-format/index.d.ts",
+              "https://raw.githubusercontent.com/DefinitelyTyped/DefinitelyTyped/1550dfd1b8e38d9bf104b3fd16ea9bf98a2b358e/types/d3-format/index.d.ts",
+          ],
+      },
+  )
+
+  filegroup_external(
+      name = "org_definitelytyped_types_d3_hierarchy",
+      licenses = ["notice"],  # MIT
+      sha256_urls = {
+          "a5683f5835d8716c6b89c075235078438cfab5897023ed720bfa492e244e969e": [
+              "http://mirror.bazel.build/raw.githubusercontent.com/DefinitelyTyped/DefinitelyTyped/1550dfd1b8e38d9bf104b3fd16ea9bf98a2b358e/types/d3-hierarchy/index.d.ts",
+              "https://raw.githubusercontent.com/DefinitelyTyped/DefinitelyTyped/1550dfd1b8e38d9bf104b3fd16ea9bf98a2b358e/types/d3-hierarchy/index.d.ts",
+          ],
+      },
+  )
+
+  filegroup_external(
+      name = "org_definitelytyped_types_d3_interpolate",
+      licenses = ["notice"],  # MIT
+      sha256_urls = {
+          "590a71b741323ac3139b333ec8b743e24717fdd5b32bcff48ee521162a9dfe1c": [
+              "http://mirror.bazel.build/raw.githubusercontent.com/DefinitelyTyped/DefinitelyTyped/1550dfd1b8e38d9bf104b3fd16ea9bf98a2b358e/types/d3-interpolate/index.d.ts",
+              "https://raw.githubusercontent.com/DefinitelyTyped/DefinitelyTyped/1550dfd1b8e38d9bf104b3fd16ea9bf98a2b358e/types/d3-interpolate/index.d.ts",
+          ],
+      },
+  )
+
+  filegroup_external(
+      name = "org_definitelytyped_types_d3_path",
+      licenses = ["notice"],  # MIT
+      sha256_urls = {
+          "96f35ba041bcaa265e2b373ee675177410d44d31c980e4f7fbeefd4bcba15b00": [
+              "http://mirror.bazel.build/raw.githubusercontent.com/DefinitelyTyped/DefinitelyTyped/1550dfd1b8e38d9bf104b3fd16ea9bf98a2b358e/types/d3-path/index.d.ts",
+              "https://raw.githubusercontent.com/DefinitelyTyped/DefinitelyTyped/1550dfd1b8e38d9bf104b3fd16ea9bf98a2b358e/types/d3-path/index.d.ts",
+          ],
+      },
+  )
+
+  filegroup_external(
+      name = "org_definitelytyped_types_d3_polygon",
+      licenses = ["notice"],  # MIT
+      sha256_urls = {
+          "ce453451e8105cac6a4f4a4263ca2142ebb4bf442e342f470a81da691f220fcb": [
+              "http://mirror.bazel.build/raw.githubusercontent.com/DefinitelyTyped/DefinitelyTyped/1550dfd1b8e38d9bf104b3fd16ea9bf98a2b358e/types/d3-polygon/index.d.ts",
+              "https://raw.githubusercontent.com/DefinitelyTyped/DefinitelyTyped/1550dfd1b8e38d9bf104b3fd16ea9bf98a2b358e/types/d3-polygon/index.d.ts",
+          ],
+      },
+  )
+
+  filegroup_external(
+      name = "org_definitelytyped_types_d3_quadtree",
+      licenses = ["notice"],  # MIT
+      sha256_urls = {
+          "238e278f1be5d6985a19800800cffee80f81199f71d848e3bbc288d1791a6f90": [
+              "http://mirror.bazel.build/raw.githubusercontent.com/DefinitelyTyped/DefinitelyTyped/1550dfd1b8e38d9bf104b3fd16ea9bf98a2b358e/types/d3-quadtree/index.d.ts",
+              "https://raw.githubusercontent.com/DefinitelyTyped/DefinitelyTyped/1550dfd1b8e38d9bf104b3fd16ea9bf98a2b358e/types/d3-quadtree/index.d.ts",
+          ],
+      },
+  )
+
+  filegroup_external(
+      name = "org_definitelytyped_types_d3_queue",
+      licenses = ["notice"],  # MIT
+      sha256_urls = {
+          "e6ae19aad83495475653578de64fb9d6bf9764eda6c84d70f7935ec84bcc482e": [
+              "http://mirror.bazel.build/raw.githubusercontent.com/DefinitelyTyped/DefinitelyTyped/1550dfd1b8e38d9bf104b3fd16ea9bf98a2b358e/types/d3-queue/index.d.ts",
+              "https://raw.githubusercontent.com/DefinitelyTyped/DefinitelyTyped/1550dfd1b8e38d9bf104b3fd16ea9bf98a2b358e/types/d3-queue/index.d.ts",
+          ],
+      },
+  )
+
+  filegroup_external(
+      name = "org_definitelytyped_types_d3_random",
+      licenses = ["notice"],  # MIT
+      sha256_urls = {
+          "d31b92ed86c23ec0a4776f99fa81ff033c95b96c8304d8aa9baf3b94af779aa8": [
+              "http://mirror.bazel.build/raw.githubusercontent.com/DefinitelyTyped/DefinitelyTyped/1550dfd1b8e38d9bf104b3fd16ea9bf98a2b358e/types/d3-random/index.d.ts",
+              "https://raw.githubusercontent.com/DefinitelyTyped/DefinitelyTyped/1550dfd1b8e38d9bf104b3fd16ea9bf98a2b358e/types/d3-random/index.d.ts",
+          ],
+      },
+  )
+
+  filegroup_external(
+      name = "org_definitelytyped_types_d3_request",
+      licenses = ["notice"],  # MIT
+      sha256_urls = {
+          "44bb7b07d977028e6567540a3303b06fc9b33fb0960bc75c520e0733c840d89f": [
+              "http://mirror.bazel.build/raw.githubusercontent.com/DefinitelyTyped/DefinitelyTyped/1550dfd1b8e38d9bf104b3fd16ea9bf98a2b358e/types/d3-request/index.d.ts",
+              "https://raw.githubusercontent.com/DefinitelyTyped/DefinitelyTyped/1550dfd1b8e38d9bf104b3fd16ea9bf98a2b358e/types/d3-request/index.d.ts",
+          ],
+      },
+  )
+
+  filegroup_external(
+      name = "org_definitelytyped_types_d3_scale",
       licenses = ["notice"],  # MIT
       sha256_urls = {
-          "b926b325ad9843bf0b7a6d580ef78bb560e47c484b98680098d4fd9b31b77cd9": [
-              "http://bazel-mirror.storage.googleapis.com/raw.githubusercontent.com/chaijs/chai/2.3.0/chai.js",
-              "https://raw.githubusercontent.com/chaijs/chai/2.3.0/chai.js",
+          "02ce7c644ba34bd1abb84da2e832f248b048b6a23812be4365bd837f186c9f1f": [
+              "http://mirror.bazel.build/raw.githubusercontent.com/DefinitelyTyped/DefinitelyTyped/1550dfd1b8e38d9bf104b3fd16ea9bf98a2b358e/types/d3-scale/index.d.ts",
+              "https://raw.githubusercontent.com/DefinitelyTyped/DefinitelyTyped/1550dfd1b8e38d9bf104b3fd16ea9bf98a2b358e/types/d3-scale/index.d.ts",
           ],
       },
   )
 
   filegroup_external(
-      name = "org_mochajs",
+      name = "org_definitelytyped_types_d3_selection",
+      licenses = ["notice"],  # MIT
+      sha256_urls = {
+          "699043ddb28dfa5e46d87bc6a24cfc6d604237f298259d3fb3c7066e05e8c86e": [
+              "http://mirror.bazel.build/raw.githubusercontent.com/DefinitelyTyped/DefinitelyTyped/1550dfd1b8e38d9bf104b3fd16ea9bf98a2b358e/types/d3-selection/index.d.ts",
+              "https://raw.githubusercontent.com/DefinitelyTyped/DefinitelyTyped/1550dfd1b8e38d9bf104b3fd16ea9bf98a2b358e/types/d3-selection/index.d.ts",
+          ],
+      },
+  )
+
+  filegroup_external(
+      name = "org_definitelytyped_types_d3_shape",
+      licenses = ["notice"],  # MIT
+      sha256_urls = {
+          "62668a7aaaf6232762b544f9f89c0f557ca7cfb0cd343a358dda7ecbe26f5739": [
+              "http://mirror.bazel.build/raw.githubusercontent.com/DefinitelyTyped/DefinitelyTyped/1550dfd1b8e38d9bf104b3fd16ea9bf98a2b358e/types/d3-shape/index.d.ts",
+              "https://raw.githubusercontent.com/DefinitelyTyped/DefinitelyTyped/1550dfd1b8e38d9bf104b3fd16ea9bf98a2b358e/types/d3-shape/index.d.ts",
+          ],
+      },
+  )
+
+  filegroup_external(
+      name = "org_definitelytyped_types_d3_time",
+      licenses = ["notice"],  # MIT
+      sha256_urls = {
+          "0502490ce682fd9265fb1d5d693ce6cd82e3b05e5f5ee3433731266ecb03d5fc": [
+              "http://mirror.bazel.build/raw.githubusercontent.com/DefinitelyTyped/DefinitelyTyped/1550dfd1b8e38d9bf104b3fd16ea9bf98a2b358e/types/d3-time/index.d.ts",
+              "https://raw.githubusercontent.com/DefinitelyTyped/DefinitelyTyped/1550dfd1b8e38d9bf104b3fd16ea9bf98a2b358e/types/d3-time/index.d.ts",
+          ],
+      },
+  )
+
+  filegroup_external(
+      name = "org_definitelytyped_types_d3_timer",
+      licenses = ["notice"],  # MIT
+      sha256_urls = {
+          "6f191f9aea704aa64b1defa40dfdff1447a6e6bb815feff1660f894500a9c94d": [
+              "http://mirror.bazel.build/raw.githubusercontent.com/DefinitelyTyped/DefinitelyTyped/1550dfd1b8e38d9bf104b3fd16ea9bf98a2b358e/types/d3-timer/index.d.ts",
+              "https://raw.githubusercontent.com/DefinitelyTyped/DefinitelyTyped/1550dfd1b8e38d9bf104b3fd16ea9bf98a2b358e/types/d3-timer/index.d.ts",
+          ],
+      },
+  )
+
+  filegroup_external(
+      name = "org_definitelytyped_types_d3_transition",
+      licenses = ["notice"],  # MIT
+      sha256_urls = {
+          "a0a7c0c9bfb5c7d6d9d22a8d16b4484b66d13f2ed226954037546cb3da4098ba": [
+              "http://mirror.bazel.build/raw.githubusercontent.com/DefinitelyTyped/DefinitelyTyped/1550dfd1b8e38d9bf104b3fd16ea9bf98a2b358e/types/d3-transition/index.d.ts",
+              "https://raw.githubusercontent.com/DefinitelyTyped/DefinitelyTyped/1550dfd1b8e38d9bf104b3fd16ea9bf98a2b358e/types/d3-transition/index.d.ts",
+          ],
+      },
+  )
+
+  filegroup_external(
+      name = "org_definitelytyped_types_d3_voronoi",
+      licenses = ["notice"],  # MIT
+      sha256_urls = {
+          "c6bd5f229f915151d0ef678fe50b1aa6a62334ea0a8c6fc0effbac9f7032efc7": [
+              "http://mirror.bazel.build/raw.githubusercontent.com/DefinitelyTyped/DefinitelyTyped/1550dfd1b8e38d9bf104b3fd16ea9bf98a2b358e/types/d3-voronoi/index.d.ts",
+              "https://raw.githubusercontent.com/DefinitelyTyped/DefinitelyTyped/1550dfd1b8e38d9bf104b3fd16ea9bf98a2b358e/types/d3-voronoi/index.d.ts",
+          ],
+      },
+  )
+
+  filegroup_external(
+      name = "org_definitelytyped_types_d3_zoom",
+      licenses = ["notice"],  # MIT
+      sha256_urls = {
+          "a25dc17fbd304cf7a0e5e7bbb8339c930d464eb40c4d6e5f839ce9c0191f4110": [
+              "http://mirror.bazel.build/raw.githubusercontent.com/DefinitelyTyped/DefinitelyTyped/1550dfd1b8e38d9bf104b3fd16ea9bf98a2b358e/types/d3-zoom/index.d.ts",
+              "https://raw.githubusercontent.com/DefinitelyTyped/DefinitelyTyped/1550dfd1b8e38d9bf104b3fd16ea9bf98a2b358e/types/d3-zoom/index.d.ts",
+          ],
+      },
+  )
+
+  filegroup_external(
+      name = "org_threejs",
       # no @license header
       licenses = ["notice"],  # MIT
       sha256_urls = {
-          "e36d865a17ffdf5868e55e736526ae30f3d4bc667c85a2a28cd5c850a82361e2": [
-              "http://bazel-mirror.storage.googleapis.com/raw.githubusercontent.com/mochajs/mocha/2.3.4/mocha.js",
-              "https://raw.githubusercontent.com/mochajs/mocha/2.3.4/mocha.js",
+          "7aff264bd84c90bed3c72a4dc31db8c19151853c6df6980f52b01d3e9872c82d": [
+              "http://mirror.bazel.build/raw.githubusercontent.com/mrdoob/three.js/ad419d40bdaab80abbb34b8f359b4ee840033a02/build/three.js",
+              "https://raw.githubusercontent.com/mrdoob/three.js/ad419d40bdaab80abbb34b8f359b4ee840033a02/build/three.js",
+          ],
+          "0e98ded15bb7fe398a655667e76b39909d36c0973a8950d01c62f65f93161c27": [
+              "http://mirror.bazel.build/raw.githubusercontent.com/mrdoob/three.js/ad419d40bdaab80abbb34b8f359b4ee840033a02/examples/js/controls/OrbitControls.js",
+              "https://raw.githubusercontent.com/mrdoob/three.js/ad419d40bdaab80abbb34b8f359b4ee840033a02/examples/js/controls/OrbitControls.js",
           ],
       },
   )
@@ -874,12 +1231,12 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
   ##############################################################################
   # TensorBoard Polymer Dependencies
 
-  webfiles_external(
+  web_library_external(
       name = "org_polymer_font_roboto",
       licenses = ["notice"],  # BSD-3-Clause
       sha256 = "fae51429b56a4a4c15f1f0c23b733c7095940cc9c04c275fa7adb3bf055b23b3",
       urls = [
-          "http://bazel-mirror.storage.googleapis.com/github.com/PolymerElements/font-roboto/archive/v1.0.1.tar.gz",
+          "http://mirror.bazel.build/github.com/PolymerElements/font-roboto/archive/v1.0.1.tar.gz",
           "https://github.com/PolymerElements/font-roboto/archive/v1.0.1.tar.gz",
       ],
       strip_prefix = "font-roboto-1.0.1",
@@ -887,12 +1244,30 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
       srcs = ["roboto.html"],
   )
 
-  webfiles_external(
+  web_library_external(
+      name = "org_polymer_hydrolysis",
+      licenses = ["notice"],  # BSD-3-Clause
+      sha256 = "703b50f6b00f9e0546b5a3451da57bb20f77a166e27e4967923b9e835bab9b80",
+      urls = [
+          "http://mirror.bazel.build/github.com/Polymer/polymer-analyzer/archive/v1.19.3.tar.gz",
+          "https://github.com/Polymer/polymer-analyzer/archive/v1.19.3.tar.gz",
+      ],
+      strip_prefix = "polymer-analyzer-1.19.3",
+      path = "/hydrolysis",
+      srcs = [
+          "hydrolysis-analyzer.html",
+          "hydrolysis.html",
+          "hydrolysis.js",
+      ],
+      deps = ["@org_polymer"],
+  )
+
+  web_library_external(
       name = "org_polymer_iron_a11y_announcer",
       licenses = ["notice"],  # BSD-3-Clause
       sha256 = "6bce143db7a374a68535ec8b861a5f30e81f2f1e4ee36a55bda2a891f6fd2818",
       urls = [
-          "http://bazel-mirror.storage.googleapis.com/github.com/PolymerElements/iron-a11y-announcer/archive/v1.0.5.tar.gz",
+          "http://mirror.bazel.build/github.com/PolymerElements/iron-a11y-announcer/archive/v1.0.5.tar.gz",
           "https://github.com/PolymerElements/iron-a11y-announcer/archive/v1.0.5.tar.gz",
       ],
       strip_prefix = "iron-a11y-announcer-1.0.5",
@@ -901,12 +1276,12 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
       deps = ["@org_polymer"],
   )
 
-  webfiles_external(
+  web_library_external(
       name = "org_polymer_iron_a11y_keys_behavior",
       licenses = ["notice"],  # BSD-3-Clause
       sha256 = "6823efc47a83208fd51d39c5a1d3eb0c0bebc705df1ce01310509da22a13ebd2",
       urls = [
-          "http://bazel-mirror.storage.googleapis.com/github.com/PolymerElements/iron-a11y-keys-behavior/archive/v1.1.8.tar.gz",
+          "http://mirror.bazel.build/github.com/PolymerElements/iron-a11y-keys-behavior/archive/v1.1.8.tar.gz",
           "https://github.com/PolymerElements/iron-a11y-keys-behavior/archive/v1.1.8.tar.gz",
       ],
       strip_prefix = "iron-a11y-keys-behavior-1.1.8",
@@ -915,12 +1290,12 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
       deps = ["@org_polymer"],
   )
 
-  webfiles_external(
+  web_library_external(
       name = "org_polymer_iron_ajax",
       licenses = ["notice"],  # BSD-3-Clause
       sha256 = "9162d8af4611e911ac3ebbfc08bb7038ac04f6e79a9287b1476fe36ad6770bc5",
       urls = [
-          "http://bazel-mirror.storage.googleapis.com/github.com/PolymerElements/iron-ajax/archive/v1.2.0.tar.gz",
+          "http://mirror.bazel.build/github.com/PolymerElements/iron-ajax/archive/v1.2.0.tar.gz",
           "https://github.com/PolymerElements/iron-ajax/archive/v1.2.0.tar.gz",
       ],
       strip_prefix = "iron-ajax-1.2.0",
@@ -935,12 +1310,12 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
       ],
   )
 
-  webfiles_external(
+  web_library_external(
       name = "org_polymer_iron_autogrow_textarea",
       licenses = ["notice"],  # BSD-3-Clause
       sha256 = "50bbb901d2c8f87462e3552e3d671a552faa12c37c485e548d7a234ebffbc427",
       urls = [
-          "http://bazel-mirror.storage.googleapis.com/github.com/PolymerElements/iron-autogrow-textarea/archive/v1.0.12.tar.gz",
+          "http://mirror.bazel.build/github.com/PolymerElements/iron-autogrow-textarea/archive/v1.0.12.tar.gz",
           "https://github.com/PolymerElements/iron-autogrow-textarea/archive/v1.0.12.tar.gz",
       ],
       strip_prefix = "iron-autogrow-textarea-1.0.12",
@@ -955,12 +1330,12 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
       ],
   )
 
-  webfiles_external(
+  web_library_external(
       name = "org_polymer_iron_behaviors",
       licenses = ["notice"],  # BSD-3-Clause
       sha256 = "a1e8d4b7a13f3d36beba9c2a6b186ed33a53e6af2e79f98c1fcc7e85e7b53f89",
       urls = [
-          "http://bazel-mirror.storage.googleapis.com/github.com/PolymerElements/iron-behaviors/archive/v1.0.17.tar.gz",
+          "http://mirror.bazel.build/github.com/PolymerElements/iron-behaviors/archive/v1.0.17.tar.gz",
           "https://github.com/PolymerElements/iron-behaviors/archive/v1.0.17.tar.gz",
       ],
       strip_prefix = "iron-behaviors-1.0.17",
@@ -975,12 +1350,12 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
       ],
   )
 
-  webfiles_external(
+  web_library_external(
       name = "org_polymer_iron_checked_element_behavior",
       licenses = ["notice"],  # BSD-3-Clause
       sha256 = "539a0e1c4df0bc702d3bd342388e4e56c77ec4c2066cce69e41426a69f92e8bd",
       urls = [
-          "http://bazel-mirror.storage.googleapis.com/github.com/PolymerElements/iron-checked-element-behavior/archive/v1.0.4.tar.gz",
+          "http://mirror.bazel.build/github.com/PolymerElements/iron-checked-element-behavior/archive/v1.0.4.tar.gz",
           "https://github.com/PolymerElements/iron-checked-element-behavior/archive/v1.0.4.tar.gz",
       ],
       strip_prefix = "iron-checked-element-behavior-1.0.4",
@@ -993,12 +1368,37 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
       ],
   )
 
-  webfiles_external(
+  web_library_external(
+      name = "org_polymer_iron_component_page",
+      licenses = ["notice"],  # BSD-3-Clause
+      sha256 = "3636e8b9a1f229fc33b5aad3933bd02a9825f66e679a0be31855d7c8245c4b4b",
+      urls = [
+          "http://mirror.bazel.build/github.com/PolymerElements/iron-component-page/archive/v1.1.4.tar.gz",
+          "https://github.com/PolymerElements/iron-component-page/archive/v1.1.4.tar.gz",
+      ],
+      strip_prefix = "iron-component-page-1.1.4",
+      path = "/iron-component-page",
+      srcs = ["iron-component-page.html"],
+      deps = [
+          "@org_polymer",
+          "@org_polymer_hydrolysis",
+          "@org_polymer_iron_ajax",
+          "@org_polymer_iron_doc_viewer",
+          "@org_polymer_iron_flex_layout",
+          "@org_polymer_iron_icons",
+          "@org_polymer_iron_selector",
+          "@org_polymer_paper_header_panel",
+          "@org_polymer_paper_styles",
+          "@org_polymer_paper_toolbar",
+      ],
+  )
+
+  web_library_external(
       name = "org_polymer_iron_collapse",
       licenses = ["notice"],  # BSD-3-Clause
       sha256 = "275808994a609a2f9923e2dd2db1957945ab141ba840eadc33f19e1f406d600e",
       urls = [
-          "http://bazel-mirror.storage.googleapis.com/github.com/PolymerElements/iron-collapse/archive/v1.0.8.tar.gz",
+          "http://mirror.bazel.build/github.com/PolymerElements/iron-collapse/archive/v1.0.8.tar.gz",
           "https://github.com/PolymerElements/iron-collapse/archive/v1.0.8.tar.gz",
       ],
       strip_prefix = "iron-collapse-1.0.8",
@@ -1010,12 +1410,12 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
       ],
   )
 
-  webfiles_external(
+  web_library_external(
       name = "org_polymer_iron_demo_helpers",
       licenses = ["notice"],  # BSD-3-Clause
       sha256 = "aa7458492a6ac3d1f6344640a4c2ab07bce64e7ad0422b83b5d665707598cce6",
       urls = [
-          "http://bazel-mirror.storage.googleapis.com/github.com/PolymerElements/iron-demo-helpers/archive/v1.1.0.tar.gz",
+          "http://mirror.bazel.build/github.com/PolymerElements/iron-demo-helpers/archive/v1.1.0.tar.gz",
           "https://github.com/PolymerElements/iron-demo-helpers/archive/v1.1.0.tar.gz",
       ],
       strip_prefix = "iron-demo-helpers-1.1.0",
@@ -1035,12 +1435,37 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
       ],
   )
 
-  webfiles_external(
+  web_library_external(
+      name = "org_polymer_iron_doc_viewer",
+      licenses = ["notice"],  # BSD-3-Clause
+      sha256 = "f0e9dfbbcd94d7e88ce82cb61e615406ace63c185fee9396f7f182206ca5cc9a",
+      urls = [
+          "http://mirror.bazel.build/github.com/PolymerElements/iron-doc-viewer/archive/v1.0.12.tar.gz",
+          "https://github.com/PolymerElements/iron-doc-viewer/archive/v1.0.12.tar.gz",
+      ],
+      strip_prefix = "iron-doc-viewer-1.0.12",
+      path = "/iron-doc-viewer",
+      srcs = [
+          "iron-doc-property-styles.html",
+          "iron-doc-property.html",
+          "iron-doc-viewer-styles.html",
+          "iron-doc-viewer.html",
+      ],
+      deps = [
+          "@org_polymer",
+          "@org_polymer_marked_element",
+          "@org_polymer_paper_button",
+          "@org_polymer_paper_styles",
+          "@org_polymer_prism_element",
+      ],
+  )
+
+  web_library_external(
       name = "org_polymer_iron_dropdown",
       licenses = ["notice"],  # BSD-3-Clause
       sha256 = "f7e4a31d096d10d8af1920397695cb17f3eb1cbe5e5ff91a861dabfcc085f376",
       urls = [
-          "http://bazel-mirror.storage.googleapis.com/github.com/PolymerElements/iron-dropdown/archive/v1.4.0.tar.gz",
+          "http://mirror.bazel.build/github.com/PolymerElements/iron-dropdown/archive/v1.4.0.tar.gz",
           "https://github.com/PolymerElements/iron-dropdown/archive/v1.4.0.tar.gz",
       ],
       strip_prefix = "iron-dropdown-1.4.0",
@@ -1059,12 +1484,12 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
       ],
   )
 
-  webfiles_external(
+  web_library_external(
       name = "org_polymer_iron_fit_behavior",
       licenses = ["notice"],  # BSD-3-Clause
       sha256 = "10132a2ea309a37c4c07b8fead71f64abc588ee6107931e34680f5f36dd8291e",
       urls = [
-          "http://bazel-mirror.storage.googleapis.com/github.com/PolymerElements/iron-fit-behavior/archive/v1.2.5.tar.gz",
+          "http://mirror.bazel.build/github.com/PolymerElements/iron-fit-behavior/archive/v1.2.5.tar.gz",
           "https://github.com/PolymerElements/iron-fit-behavior/archive/v1.2.5.tar.gz",
       ],
       strip_prefix = "iron-fit-behavior-1.2.5",
@@ -1073,12 +1498,12 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
       deps = ["@org_polymer"],
   )
 
-  webfiles_external(
+  web_library_external(
       name = "org_polymer_iron_flex_layout",
       licenses = ["notice"],  # BSD-3-Clause
       sha256 = "79287f6ca1c2d4e003f68b88fe19d03a1b6a0011e2b4cae579fe4d1474163a2e",
       urls = [
-          "http://bazel-mirror.storage.googleapis.com/github.com/PolymerElements/iron-flex-layout/archive/v1.3.0.tar.gz",
+          "http://mirror.bazel.build/github.com/PolymerElements/iron-flex-layout/archive/v1.3.0.tar.gz",
           "https://github.com/PolymerElements/iron-flex-layout/archive/v1.3.0.tar.gz",
       ],
       strip_prefix = "iron-flex-layout-1.3.0",
@@ -1092,12 +1517,12 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
       deps = ["@org_polymer"],
   )
 
-  webfiles_external(
+  web_library_external(
       name = "org_polymer_iron_form_element_behavior",
       licenses = ["notice"],  # BSD-3-Clause
       sha256 = "1dd9371c638e5bc2ecba8a64074aa680dfb8712198e9612f9ed24d387efc8f26",
       urls = [
-          "http://bazel-mirror.storage.googleapis.com/github.com/PolymerElements/iron-form-element-behavior/archive/v1.0.6.tar.gz",
+          "http://mirror.bazel.build/github.com/PolymerElements/iron-form-element-behavior/archive/v1.0.6.tar.gz",
           "https://github.com/PolymerElements/iron-form-element-behavior/archive/v1.0.6.tar.gz",
       ],
       strip_prefix = "iron-form-element-behavior-1.0.6",
@@ -1106,12 +1531,12 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
       deps = ["@org_polymer"],
   )
 
-  webfiles_external(
+  web_library_external(
       name = "org_polymer_iron_icon",
       licenses = ["notice"],  # BSD-3-Clause
       sha256 = "9ed58a69159a02c07a6050d242e6d4e585a29f3245b8c8c390cfd52ddb786dc4",
       urls = [
-          "http://bazel-mirror.storage.googleapis.com/github.com/PolymerElements/iron-icon/archive/v1.0.11.tar.gz",
+          "http://mirror.bazel.build/github.com/PolymerElements/iron-icon/archive/v1.0.11.tar.gz",
           "https://github.com/PolymerElements/iron-icon/archive/v1.0.11.tar.gz",
       ],
       strip_prefix = "iron-icon-1.0.11",
@@ -1124,12 +1549,12 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
       ],
   )
 
-  webfiles_external(
+  web_library_external(
       name = "org_polymer_iron_icons",
       licenses = ["notice"],  # BSD-3-Clause
       sha256 = "3b18542c147c7923dc3a36b1a51984a73255d610f297d43c9aaccc52859bd0d0",
       urls = [
-          "http://bazel-mirror.storage.googleapis.com/github.com/PolymerElements/iron-icons/archive/v1.1.3.tar.gz",
+          "http://mirror.bazel.build/github.com/PolymerElements/iron-icons/archive/v1.1.3.tar.gz",
           "https://github.com/PolymerElements/iron-icons/archive/v1.1.3.tar.gz",
       ],
       strip_prefix = "iron-icons-1.1.3",
@@ -1153,12 +1578,12 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
       ],
   )
 
-  webfiles_external(
+  web_library_external(
       name = "org_polymer_iron_iconset_svg",
       licenses = ["notice"],  # BSD-3-Clause
       sha256 = "7e3925b7e63a7d22524c4b43ce16ab80d06a576649644783643c11a003284368",
       urls = [
-          "http://bazel-mirror.storage.googleapis.com/github.com/PolymerElements/iron-iconset-svg/archive/v1.1.0.tar.gz",
+          "http://mirror.bazel.build/github.com/PolymerElements/iron-iconset-svg/archive/v1.1.0.tar.gz",
           "https://github.com/PolymerElements/iron-iconset-svg/archive/v1.1.0.tar.gz",
       ],
       strip_prefix = "iron-iconset-svg-1.1.0",
@@ -1170,12 +1595,12 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
       ],
   )
 
-  webfiles_external(
+  web_library_external(
       name = "org_polymer_iron_input",
       licenses = ["notice"],  # BSD-3-Clause
       sha256 = "c505101ead08ab25526b1f49baecc8c28b4221b92a65e7334c783bdc81553c36",
       urls = [
-          "http://bazel-mirror.storage.googleapis.com/github.com/PolymerElements/iron-input/archive/1.0.10.tar.gz",
+          "http://mirror.bazel.build/github.com/PolymerElements/iron-input/archive/1.0.10.tar.gz",
           "https://github.com/PolymerElements/iron-input/archive/1.0.10.tar.gz",
       ],
       strip_prefix = "iron-input-1.0.10",
@@ -1188,12 +1613,12 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
       ],
   )
 
-  webfiles_external(
+  web_library_external(
       name = "org_polymer_iron_list",
       licenses = ["notice"],  # BSD-3-Clause
       sha256 = "72a6530b9f0ad5557f5d287845792a0ada74d8b159198e27f940e226313dc116",
       urls = [
-          "http://bazel-mirror.storage.googleapis.com/github.com/PolymerElements/iron-list/archive/v1.3.9.tar.gz",
+          "http://mirror.bazel.build/github.com/PolymerElements/iron-list/archive/v1.3.9.tar.gz",
           "https://github.com/PolymerElements/iron-list/archive/v1.3.9.tar.gz",
       ],
       strip_prefix = "iron-list-1.3.9",
@@ -1207,12 +1632,12 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
       ],
   )
 
-  webfiles_external(
+  web_library_external(
       name = "org_polymer_iron_menu_behavior",
       licenses = ["notice"],  # BSD-3-Clause
       sha256 = "ad27889343bc9a709258b073f69abc028bb1ffd3fdb975cd2d3939f7f5d7bb6c",
       urls = [
-          "http://bazel-mirror.storage.googleapis.com/github.com/PolymerElements/iron-menu-behavior/archive/v1.1.10.tar.gz",
+          "http://mirror.bazel.build/github.com/PolymerElements/iron-menu-behavior/archive/v1.1.10.tar.gz",
           "https://github.com/PolymerElements/iron-menu-behavior/archive/v1.1.10.tar.gz",
       ],
       strip_prefix = "iron-menu-behavior-1.1.10",
@@ -1228,12 +1653,12 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
       ],
   )
 
-  webfiles_external(
+  web_library_external(
       name = "org_polymer_iron_meta",
       licenses = ["notice"],  # BSD-3-Clause
       sha256 = "fb05e6031bae6b4effe5f15d44b3f548d5807f9e3b3aa2442ba17cf4b8b84361",
       urls = [
-          "http://bazel-mirror.storage.googleapis.com/github.com/PolymerElements/iron-meta/archive/v1.1.1.tar.gz",
+          "http://mirror.bazel.build/github.com/PolymerElements/iron-meta/archive/v1.1.1.tar.gz",
           "https://github.com/PolymerElements/iron-meta/archive/v1.1.1.tar.gz",
       ],
       strip_prefix = "iron-meta-1.1.1",
@@ -1242,12 +1667,12 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
       deps = ["@org_polymer"],
   )
 
-  webfiles_external(
+  web_library_external(
       name = "org_polymer_iron_overlay_behavior",
       licenses = ["notice"],  # BSD-3-Clause
       sha256 = "3df5b54ff2e0510c87a2aff8c9d730d3fe83d3d11277cc1a49fa29b549acb46c",
       urls = [
-          "http://bazel-mirror.storage.googleapis.com/github.com/PolymerElements/iron-overlay-behavior/archive/v1.10.1.tar.gz",
+          "http://mirror.bazel.build/github.com/PolymerElements/iron-overlay-behavior/archive/v1.10.1.tar.gz",
           "https://github.com/PolymerElements/iron-overlay-behavior/archive/v1.10.1.tar.gz",
       ],
       strip_prefix = "iron-overlay-behavior-1.10.1",
@@ -1266,12 +1691,12 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
       ],
   )
 
-  webfiles_external(
+  web_library_external(
       name = "org_polymer_iron_range_behavior",
       licenses = ["notice"],  # BSD-3-Clause
       sha256 = "b2f2b6d52284542330bd30b586e217926eb0adec5e13934a3cef557717c22dc2",
       urls = [
-          "http://bazel-mirror.storage.googleapis.com/github.com/PolymerElements/iron-range-behavior/archive/v1.0.4.tar.gz",
+          "http://mirror.bazel.build/github.com/PolymerElements/iron-range-behavior/archive/v1.0.4.tar.gz",
           "https://github.com/PolymerElements/iron-range-behavior/archive/v1.0.4.tar.gz",
       ],
       strip_prefix = "iron-range-behavior-1.0.4",
@@ -1280,12 +1705,12 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
       deps = ["@org_polymer"],
   )
 
-  webfiles_external(
+  web_library_external(
       name = "org_polymer_iron_resizable_behavior",
       licenses = ["notice"],  # BSD-3-Clause
       sha256 = "a87a78ee9223c2f6afae7fc94a3ff91cbce6f7e2a7ed3f2979af7945c9281616",
       urls = [
-          "http://bazel-mirror.storage.googleapis.com/github.com/PolymerElements/iron-resizable-behavior/archive/v1.0.3.tar.gz",
+          "http://mirror.bazel.build/github.com/PolymerElements/iron-resizable-behavior/archive/v1.0.3.tar.gz",
           "https://github.com/PolymerElements/iron-resizable-behavior/archive/v1.0.3.tar.gz",
       ],
       strip_prefix = "iron-resizable-behavior-1.0.3",
@@ -1294,12 +1719,12 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
       deps = ["@org_polymer"],
   )
 
-  webfiles_external(
+  web_library_external(
       name = "org_polymer_iron_scroll_target_behavior",
       licenses = ["notice"],  # BSD-3-Clause
       sha256 = "d0de0c804b1ec91d814754144afd9da1cdb082690de88bd5e47fd5f41990746f",
       urls = [
-          "http://bazel-mirror.storage.googleapis.com/github.com/PolymerElements/iron-scroll-target-behavior/archive/v1.0.3.tar.gz",
+          "http://mirror.bazel.build/github.com/PolymerElements/iron-scroll-target-behavior/archive/v1.0.3.tar.gz",
           "https://github.com/PolymerElements/iron-scroll-target-behavior/archive/v1.0.3.tar.gz",
       ],
       strip_prefix = "iron-scroll-target-behavior-1.0.3",
@@ -1308,12 +1733,12 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
       deps = ["@org_polymer"],
   )
 
-  webfiles_external(
+  web_library_external(
       name = "org_polymer_iron_selector",
       licenses = ["notice"],  # BSD-3-Clause
       sha256 = "ba28a47443bad3b744611c9d7a79fb21dbdf2e35edc5ef8f812e2dcd72b16747",
       urls = [
-          "http://bazel-mirror.storage.googleapis.com/github.com/PolymerElements/iron-selector/archive/v1.5.2.tar.gz",
+          "http://mirror.bazel.build/github.com/PolymerElements/iron-selector/archive/v1.5.2.tar.gz",
           "https://github.com/PolymerElements/iron-selector/archive/v1.5.2.tar.gz",
       ],
       strip_prefix = "iron-selector-1.5.2",
@@ -1327,12 +1752,12 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
       deps = ["@org_polymer"],
   )
 
-  webfiles_external(
+  web_library_external(
       name = "org_polymer_iron_validatable_behavior",
       licenses = ["notice"],  # BSD-3-Clause
       sha256 = "aef4901e68043824f36104799269573dd345ffaac494186e466fdc79c06fdb63",
       urls = [
-          "http://bazel-mirror.storage.googleapis.com/github.com/PolymerElements/iron-validatable-behavior/archive/v1.1.1.tar.gz",
+          "http://mirror.bazel.build/github.com/PolymerElements/iron-validatable-behavior/archive/v1.1.1.tar.gz",
           "https://github.com/PolymerElements/iron-validatable-behavior/archive/v1.1.1.tar.gz",
       ],
       strip_prefix = "iron-validatable-behavior-1.1.1",
@@ -1344,12 +1769,12 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
       ],
   )
 
-  webfiles_external(
+  web_library_external(
       name = "org_polymer_marked",
       licenses = ["notice"],  # MIT
       sha256 = "93d30bd593736ca440938d77808b7ef5972da0f3fcfe4ae63ae7b4ce117da2cb",
       urls = [
-          "http://bazel-mirror.storage.googleapis.com/github.com/chjj/marked/archive/v0.3.2.zip",
+          "http://mirror.bazel.build/github.com/chjj/marked/archive/v0.3.2.zip",
           "https://github.com/chjj/marked/archive/v0.3.2.zip",
       ],
       strip_prefix = "marked-0.3.2",
@@ -1357,12 +1782,12 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
       srcs = ["lib/marked.js"],
   )
 
-  webfiles_external(
+  web_library_external(
       name = "org_polymer_marked_element",
       licenses = ["notice"],  # BSD-3-Clause
       sha256 = "7547616df95f8b903757e6afbabfcdba5322c2bcec3f17c726b8bba5adf4bc5f",
       urls = [
-          "http://bazel-mirror.storage.googleapis.com/github.com/PolymerElements/marked-element/archive/v1.1.3.tar.gz",
+          "http://mirror.bazel.build/github.com/PolymerElements/marked-element/archive/v1.1.3.tar.gz",
           "https://github.com/PolymerElements/marked-element/archive/v1.1.3.tar.gz",
       ],
       strip_prefix = "marked-element-1.1.3",
@@ -1377,12 +1802,12 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
       ],
   )
 
-  webfiles_external(
+  web_library_external(
       name = "org_polymer_neon_animation",
       licenses = ["notice"],  # BSD-3-Clause
       sha256 = "8800c314a76b2da190a2b203259c1091f6d38e0057ed37c2a3d0b734980fa9a5",
       urls = [
-          "http://bazel-mirror.storage.googleapis.com/github.com/PolymerElements/neon-animation/archive/v1.2.2.tar.gz",
+          "http://mirror.bazel.build/github.com/PolymerElements/neon-animation/archive/v1.2.2.tar.gz",
           "https://github.com/PolymerElements/neon-animation/archive/v1.2.2.tar.gz",
       ],
       strip_prefix = "neon-animation-1.2.2",
@@ -1426,12 +1851,12 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
       ],
   )
 
-  webfiles_external(
+  web_library_external(
       name = "org_polymer_paper_behaviors",
       licenses = ["notice"],  # BSD-3-Clause
       sha256 = "7cfcb9082ef9909da262df6b5c120bc62dbeaff278cb563e8fc60465ddd387e5",
       urls = [
-          "http://bazel-mirror.storage.googleapis.com/github.com/PolymerElements/paper-behaviors/archive/v1.0.12.tar.gz",
+          "http://mirror.bazel.build/github.com/PolymerElements/paper-behaviors/archive/v1.0.12.tar.gz",
           "https://github.com/PolymerElements/paper-behaviors/archive/v1.0.12.tar.gz",
       ],
       strip_prefix = "paper-behaviors-1.0.12",
@@ -1450,12 +1875,12 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
       ],
   )
 
-  webfiles_external(
+  web_library_external(
       name = "org_polymer_paper_button",
       licenses = ["notice"],  # BSD-3-Clause
       sha256 = "896c0a7e34bfcce63fc23c63e105ed9c4d62fa3a6385b7161e1e5cd4058820a6",
       urls = [
-          "http://bazel-mirror.storage.googleapis.com/github.com/PolymerElements/paper-button/archive/v1.0.11.tar.gz",
+          "http://mirror.bazel.build/github.com/PolymerElements/paper-button/archive/v1.0.11.tar.gz",
           "https://github.com/PolymerElements/paper-button/archive/v1.0.11.tar.gz",
       ],
       strip_prefix = "paper-button-1.0.11",
@@ -1470,12 +1895,12 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
       ],
   )
 
-  webfiles_external(
+  web_library_external(
       name = "org_polymer_paper_checkbox",
       licenses = ["notice"],  # BSD-3-Clause
       sha256 = "6828a6954a048b1230fbd2606faffbae950ba1d042175b96ec50ae355786a166",
       urls = [
-          "http://bazel-mirror.storage.googleapis.com/github.com/PolymerElements/paper-checkbox/archive/v1.4.0.tar.gz",
+          "http://mirror.bazel.build/github.com/PolymerElements/paper-checkbox/archive/v1.4.0.tar.gz",
           "https://github.com/PolymerElements/paper-checkbox/archive/v1.4.0.tar.gz",
       ],
       strip_prefix = "paper-checkbox-1.4.0",
@@ -1488,12 +1913,12 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
       ],
   )
 
-  webfiles_external(
+  web_library_external(
       name = "org_polymer_paper_dialog",
       licenses = ["notice"],  # BSD-3-Clause
       sha256 = "c6a9709e7f528d03dcd574503c18b72d4751ca30017346d16e6a791d37ed9259",
       urls = [
-          "http://bazel-mirror.storage.googleapis.com/github.com/PolymerElements/paper-dialog/archive/v1.0.4.tar.gz",
+          "http://mirror.bazel.build/github.com/PolymerElements/paper-dialog/archive/v1.0.4.tar.gz",
           "https://github.com/PolymerElements/paper-dialog/archive/v1.0.4.tar.gz",
       ],
       strip_prefix = "paper-dialog-1.0.4",
@@ -1506,12 +1931,12 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
       ],
   )
 
-  webfiles_external(
+  web_library_external(
       name = "org_polymer_paper_dialog_behavior",
       licenses = ["notice"],  # BSD-3-Clause
       sha256 = "a7e0e27ce63554bc14f384cf94bcfa24da8dc5f5120dfd565f45e166261aee40",
       urls = [
-          "http://bazel-mirror.storage.googleapis.com/github.com/PolymerElements/paper-dialog-behavior/archive/v1.2.5.tar.gz",
+          "http://mirror.bazel.build/github.com/PolymerElements/paper-dialog-behavior/archive/v1.2.5.tar.gz",
           "https://github.com/PolymerElements/paper-dialog-behavior/archive/v1.2.5.tar.gz",
       ],
       strip_prefix = "paper-dialog-behavior-1.2.5",
@@ -1521,7 +1946,6 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
           "paper-dialog-common.css",
           "paper-dialog-shared-styles.html",
       ],
-      suppress = ["cssSyntax"],
       deps = [
           "@org_polymer",
           "@org_polymer_iron_flex_layout",
@@ -1530,12 +1954,12 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
       ],
   )
 
-  webfiles_external(
+  web_library_external(
       name = "org_polymer_paper_dialog_scrollable",
       licenses = ["notice"],  # BSD-3-Clause
       sha256 = "a2e69283e7674f782c44d811387a0f8da2d01fac0172743d1add65e253e6b5ff",
       urls = [
-          "http://bazel-mirror.storage.googleapis.com/github.com/PolymerElements/paper-dialog-scrollable/archive/1.1.5.tar.gz",
+          "http://mirror.bazel.build/github.com/PolymerElements/paper-dialog-scrollable/archive/1.1.5.tar.gz",
           "https://github.com/PolymerElements/paper-dialog-scrollable/archive/1.1.5.tar.gz",
       ],
       strip_prefix = "paper-dialog-scrollable-1.1.5",
@@ -1549,12 +1973,12 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
       ],
   )
 
-  webfiles_external(
+  web_library_external(
       name = "org_polymer_paper_dropdown_menu",
       licenses = ["notice"],  # BSD-3-Clause
       sha256 = "9d88f654ec03ee9be211df9e69bede9e8a22b51bf1dbcc63b79762e4256d81ad",
       urls = [
-          "http://bazel-mirror.storage.googleapis.com/github.com/PolymerElements/paper-dropdown-menu/archive/v1.4.0.tar.gz",
+          "http://mirror.bazel.build/github.com/PolymerElements/paper-dropdown-menu/archive/v1.4.0.tar.gz",
           "https://github.com/PolymerElements/paper-dropdown-menu/archive/v1.4.0.tar.gz",
       ],
       strip_prefix = "paper-dropdown-menu-1.4.0",
@@ -1581,12 +2005,12 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
       ],
   )
 
-  webfiles_external(
+  web_library_external(
       name = "org_polymer_paper_header_panel",
       licenses = ["notice"],  # BSD-3-Clause
       sha256 = "0db4bd8a4bf6f20dcd0dffb4f907b31c93a8647c9c021344239cf30b40b87075",
       urls = [
-          "http://bazel-mirror.storage.googleapis.com/github.com/PolymerElements/paper-header-panel/archive/v1.1.4.tar.gz",
+          "http://mirror.bazel.build/github.com/PolymerElements/paper-header-panel/archive/v1.1.4.tar.gz",
           "https://github.com/PolymerElements/paper-header-panel/archive/v1.1.4.tar.gz",
       ],
       strip_prefix = "paper-header-panel-1.1.4",
@@ -1598,12 +2022,12 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
       ],
   )
 
-  webfiles_external(
+  web_library_external(
       name = "org_polymer_paper_icon_button",
       licenses = ["notice"],  # BSD-3-Clause
       sha256 = "9cba5bcfd6aeb4c41581c1392c678cf2278d360e9d122f4d9db54a9ebb404496",
       urls = [
-          "http://bazel-mirror.storage.googleapis.com/github.com/PolymerElements/paper-icon-button/archive/v1.1.3.tar.gz",
+          "http://mirror.bazel.build/github.com/PolymerElements/paper-icon-button/archive/v1.1.3.tar.gz",
           "https://github.com/PolymerElements/paper-icon-button/archive/v1.1.3.tar.gz",
       ],
       strip_prefix = "paper-icon-button-1.1.3",
@@ -1620,12 +2044,12 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
       ],
   )
 
-  webfiles_external(
+  web_library_external(
       name = "org_polymer_paper_input",
       licenses = ["notice"],  # BSD-3-Clause
       sha256 = "17c3dea9bb1c2026cc61324696c6c774214a0dc37686b91ca214a6af550994db",
       urls = [
-          "http://bazel-mirror.storage.googleapis.com/github.com/PolymerElements/paper-input/archive/v1.1.18.tar.gz",
+          "http://mirror.bazel.build/github.com/PolymerElements/paper-input/archive/v1.1.18.tar.gz",
           "https://github.com/PolymerElements/paper-input/archive/v1.1.18.tar.gz",
       ],
       strip_prefix = "paper-input-1.1.18",
@@ -1651,12 +2075,12 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
       ],
   )
 
-  webfiles_external(
+  web_library_external(
       name = "org_polymer_paper_item",
       licenses = ["notice"],  # BSD-3-Clause
       sha256 = "12ee0dcb61b0d5721c5988571f6974d7b2211e97724f4195893fbcc9058cdac8",
       urls = [
-          "http://bazel-mirror.storage.googleapis.com/github.com/PolymerElements/paper-item/archive/v1.1.4.tar.gz",
+          "http://mirror.bazel.build/github.com/PolymerElements/paper-item/archive/v1.1.4.tar.gz",
           "https://github.com/PolymerElements/paper-item/archive/v1.1.4.tar.gz",
       ],
       strip_prefix = "paper-item-1.1.4",
@@ -1676,12 +2100,12 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
       ],
   )
 
-  webfiles_external(
+  web_library_external(
       name = "org_polymer_paper_listbox",
       licenses = ["notice"],  # BSD-3-Clause
       sha256 = "3cb35f4fe9a3f15185a9e91711dba8f27e9291c8cd371ebf1be21b8f1d5f65fb",
       urls = [
-          "http://bazel-mirror.storage.googleapis.com/github.com/PolymerElements/paper-listbox/archive/v1.1.2.tar.gz",
+          "http://mirror.bazel.build/github.com/PolymerElements/paper-listbox/archive/v1.1.2.tar.gz",
           "https://github.com/PolymerElements/paper-listbox/archive/v1.1.2.tar.gz",
       ],
       strip_prefix = "paper-listbox-1.1.2",
@@ -1694,12 +2118,12 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
       ],
   )
 
-  webfiles_external(
+  web_library_external(
       name = "org_polymer_paper_material",
       licenses = ["notice"],  # BSD-3-Clause
       sha256 = "09f6c8bd6ddbea2be541dc86306efe41cdfb31bec0b69d35a5dc29772bbc8506",
       urls = [
-          "http://bazel-mirror.storage.googleapis.com/github.com/PolymerElements/paper-material/archive/v1.0.6.tar.gz",
+          "http://mirror.bazel.build/github.com/PolymerElements/paper-material/archive/v1.0.6.tar.gz",
           "https://github.com/PolymerElements/paper-material/archive/v1.0.6.tar.gz",
       ],
       strip_prefix = "paper-material-1.0.6",
@@ -1714,12 +2138,12 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
       ],
   )
 
-  webfiles_external(
+  web_library_external(
       name = "org_polymer_paper_menu",
       licenses = ["notice"],  # BSD-3-Clause
       sha256 = "a3cee220926e315f7412236b3628288774694447c0da4428345f36d0f127ba3b",
       urls = [
-          "http://bazel-mirror.storage.googleapis.com/github.com/PolymerElements/paper-menu/archive/v1.2.2.tar.gz",
+          "http://mirror.bazel.build/github.com/PolymerElements/paper-menu/archive/v1.2.2.tar.gz",
           "https://github.com/PolymerElements/paper-menu/archive/v1.2.2.tar.gz",
       ],
       strip_prefix = "paper-menu-1.2.2",
@@ -1739,12 +2163,12 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
       ],
   )
 
-  webfiles_external(
+  web_library_external(
       name = "org_polymer_paper_menu_button",
       licenses = ["notice"],  # BSD-3-Clause
       sha256 = "be3290c288a2bd4f9887213db22c75add99cc29ff4d088100c0bc4eb0e57997b",
       urls = [
-          "http://bazel-mirror.storage.googleapis.com/github.com/PolymerElements/paper-menu-button/archive/v1.5.1.tar.gz",
+          "http://mirror.bazel.build/github.com/PolymerElements/paper-menu-button/archive/v1.5.1.tar.gz",
           "https://github.com/PolymerElements/paper-menu-button/archive/v1.5.1.tar.gz",
       ],
       strip_prefix = "paper-menu-button-1.5.1",
@@ -1763,12 +2187,12 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
       ],
   )
 
-  webfiles_external(
+  web_library_external(
       name = "org_polymer_paper_progress",
       licenses = ["notice"],  # BSD-3-Clause
       sha256 = "2b6776b2f023c1f344feea17ba29b58d879e46f8ed43b7256495054b5183fff6",
       urls = [
-          "http://bazel-mirror.storage.googleapis.com/github.com/PolymerElements/paper-progress/archive/v1.0.9.tar.gz",
+          "http://mirror.bazel.build/github.com/PolymerElements/paper-progress/archive/v1.0.9.tar.gz",
           "https://github.com/PolymerElements/paper-progress/archive/v1.0.9.tar.gz",
       ],
       strip_prefix = "paper-progress-1.0.9",
@@ -1782,12 +2206,12 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
       ],
   )
 
-  webfiles_external(
+  web_library_external(
       name = "org_polymer_paper_radio_button",
       licenses = ["notice"],  # BSD-3-Clause
       sha256 = "6e911d0c308aa388136b3af79d1bdcbe5a1f4159cbc79d71efb4ff3b6c0b4e91",
       urls = [
-          "http://bazel-mirror.storage.googleapis.com/github.com/PolymerElements/paper-radio-button/archive/v1.1.2.tar.gz",
+          "http://mirror.bazel.build/github.com/PolymerElements/paper-radio-button/archive/v1.1.2.tar.gz",
           "https://github.com/PolymerElements/paper-radio-button/archive/v1.1.2.tar.gz",
       ],
       strip_prefix = "paper-radio-button-1.1.2",
@@ -1800,12 +2224,12 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
       ],
   )
 
-  webfiles_external(
+  web_library_external(
       name = "org_polymer_paper_radio_group",
       licenses = ["notice"],  # BSD-3-Clause
       sha256 = "7885ad1f81e9dcc03dcea4139b54a201ff55c18543770cd44f94530046c9e163",
       urls = [
-          "http://bazel-mirror.storage.googleapis.com/github.com/PolymerElements/paper-radio-group/archive/v1.0.9.tar.gz",
+          "http://mirror.bazel.build/github.com/PolymerElements/paper-radio-group/archive/v1.0.9.tar.gz",
           "https://github.com/PolymerElements/paper-radio-group/archive/v1.0.9.tar.gz",
       ],
       strip_prefix = "paper-radio-group-1.0.9",
@@ -1819,12 +2243,12 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
       ],
   )
 
-  webfiles_external(
+  web_library_external(
       name = "org_polymer_paper_ripple",
       licenses = ["notice"],  # BSD-3-Clause
       sha256 = "ba76bfb1c737260a8a103d3ca97faa1f7c3288c7db9b2519f401b7a782147c09",
       urls = [
-          "http://bazel-mirror.storage.googleapis.com/github.com/PolymerElements/paper-ripple/archive/v1.0.5.tar.gz",
+          "http://mirror.bazel.build/github.com/PolymerElements/paper-ripple/archive/v1.0.5.tar.gz",
           "https://github.com/PolymerElements/paper-ripple/archive/v1.0.5.tar.gz",
       ],
       strip_prefix = "paper-ripple-1.0.5",
@@ -1836,12 +2260,12 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
       ],
   )
 
-  webfiles_external(
+  web_library_external(
       name = "org_polymer_paper_slider",
       licenses = ["notice"],  # BSD-3-Clause
       sha256 = "08e7c541dbf5d2e959208810bfc03188e82ced87e4d30d325172967f67962c3c",
       urls = [
-          "http://bazel-mirror.storage.googleapis.com/github.com/PolymerElements/paper-slider/archive/v1.0.10.tar.gz",
+          "http://mirror.bazel.build/github.com/PolymerElements/paper-slider/archive/v1.0.10.tar.gz",
           "https://github.com/PolymerElements/paper-slider/archive/v1.0.10.tar.gz",
       ],
       strip_prefix = "paper-slider-1.0.10",
@@ -1860,12 +2284,12 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
       ],
   )
 
-  webfiles_external(
+  web_library_external(
       name = "org_polymer_paper_spinner",
       licenses = ["notice"],  # BSD-3-Clause
       sha256 = "6a752907fab7899cbeed15b478e7b9299047c15fbf9d1561d6eb4d204bdbd178",
       urls = [
-          "http://bazel-mirror.storage.googleapis.com/github.com/PolymerElements/paper-spinner/archive/v1.1.1.tar.gz",
+          "http://mirror.bazel.build/github.com/PolymerElements/paper-spinner/archive/v1.1.1.tar.gz",
           "https://github.com/PolymerElements/paper-spinner/archive/v1.1.1.tar.gz",
       ],
       strip_prefix = "paper-spinner-1.1.1",
@@ -1881,12 +2305,12 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
       ],
   )
 
-  webfiles_external(
+  web_library_external(
       name = "org_polymer_paper_styles",
       licenses = ["notice"],  # BSD-3-Clause
       sha256 = "6d26b0a4c286402098853dc7388f6b22f30dfb7a74e47b34992ac03380144bb2",
       urls = [
-          "http://bazel-mirror.storage.googleapis.com/github.com/PolymerElements/paper-styles/archive/v1.1.4.tar.gz",
+          "http://mirror.bazel.build/github.com/PolymerElements/paper-styles/archive/v1.1.4.tar.gz",
           "https://github.com/PolymerElements/paper-styles/archive/v1.1.4.tar.gz",
       ],
       strip_prefix = "paper-styles-1.1.4",
@@ -1912,12 +2336,12 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
       ],
   )
 
-  webfiles_external(
+  web_library_external(
       name = "org_polymer_paper_tabs",
       licenses = ["notice"],  # BSD-3-Clause
       sha256 = "c23b6a5221db35e5b1ed3eb8e8696b952572563e285adaec96aba1e3134db825",
       urls = [
-          "http://bazel-mirror.storage.googleapis.com/github.com/PolymerElements/paper-tabs/archive/v1.7.0.tar.gz",
+          "http://mirror.bazel.build/github.com/PolymerElements/paper-tabs/archive/v1.7.0.tar.gz",
           "https://github.com/PolymerElements/paper-tabs/archive/v1.7.0.tar.gz",
       ],
       strip_prefix = "paper-tabs-1.7.0",
@@ -1941,12 +2365,12 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
       ],
   )
 
-  webfiles_external(
+  web_library_external(
       name = "org_polymer_paper_toast",
       licenses = ["notice"],  # BSD-3-Clause
       sha256 = "55f623712ed1f2bae6d6fadc522a2458e083ccd44cc0a907672547e7b10758a9",
       urls = [
-          "http://bazel-mirror.storage.googleapis.com/github.com/PolymerElements/paper-toast/archive/v1.3.0.tar.gz",
+          "http://mirror.bazel.build/github.com/PolymerElements/paper-toast/archive/v1.3.0.tar.gz",
           "https://github.com/PolymerElements/paper-toast/archive/v1.3.0.tar.gz",
       ],
       strip_prefix = "paper-toast-1.3.0",
@@ -1959,12 +2383,12 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
       ],
   )
 
-  webfiles_external(
+  web_library_external(
       name = "org_polymer_paper_toggle_button",
       licenses = ["notice"],  # BSD-3-Clause
       sha256 = "4aa7cf0396fa2994a8bc2ac6e8428f48b07b945bb7c41bd52041ef5827b45de3",
       urls = [
-          "http://bazel-mirror.storage.googleapis.com/github.com/PolymerElements/paper-toggle-button/archive/v1.2.0.tar.gz",
+          "http://mirror.bazel.build/github.com/PolymerElements/paper-toggle-button/archive/v1.2.0.tar.gz",
           "https://github.com/PolymerElements/paper-toggle-button/archive/v1.2.0.tar.gz",
       ],
       strip_prefix = "paper-toggle-button-1.2.0",
@@ -1978,12 +2402,12 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
       ],
   )
 
-  webfiles_external(
+  web_library_external(
       name = "org_polymer_paper_toolbar",
       licenses = ["notice"],  # BSD-3-Clause
       sha256 = "dbddffc0654d9fb5fb48843087eebe16bf7a134902495a664c96c11bf8a2c63d",
       urls = [
-          "http://bazel-mirror.storage.googleapis.com/github.com/PolymerElements/paper-toolbar/archive/v1.1.4.tar.gz",
+          "http://mirror.bazel.build/github.com/PolymerElements/paper-toolbar/archive/v1.1.4.tar.gz",
           "https://github.com/PolymerElements/paper-toolbar/archive/v1.1.4.tar.gz",
       ],
       strip_prefix = "paper-toolbar-1.1.4",
@@ -1996,12 +2420,12 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
       ],
   )
 
-  webfiles_external(
+  web_library_external(
       name = "org_polymer_paper_tooltip",
       licenses = ["notice"],  # BSD-3-Clause
       sha256 = "4c6667acf01f73da14c3cbc0aa574bf14280304567987ee0314534328377d2ad",
       urls = [
-          "http://bazel-mirror.storage.googleapis.com/github.com/PolymerElements/paper-tooltip/archive/v1.1.2.tar.gz",
+          "http://mirror.bazel.build/github.com/PolymerElements/paper-tooltip/archive/v1.1.2.tar.gz",
           "https://github.com/PolymerElements/paper-tooltip/archive/v1.1.2.tar.gz",
       ],
       strip_prefix = "paper-tooltip-1.1.2",
@@ -2013,13 +2437,13 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
       ],
   )
 
-  webfiles_external(
+  web_library_external(
       name = "org_polymer",
       licenses = ["notice"],  # BSD-3-Clause
       sha256 = "07a9e62ffb52193da3af09adda2fbac5cc690439978520e2d03e783863f65f91",
       strip_prefix = "polymer-1.7.0",
       urls = [
-          "http://bazel-mirror.storage.googleapis.com/github.com/polymer/polymer/archive/v1.7.0.tar.gz",
+          "http://mirror.bazel.build/github.com/polymer/polymer/archive/v1.7.0.tar.gz",
           "https://github.com/polymer/polymer/archive/v1.7.0.tar.gz",
       ],
       path = "/polymer",
@@ -2030,12 +2454,12 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
       ],
   )
 
-  webfiles_external(
+  web_library_external(
       name = "org_polymer_prism",
       licenses = ["notice"],  # MIT
       sha256 = "e06eb54f2a80e6b3cd0bd4d59f900423bcaee53fc03998a056df63740c684683",
       urls = [
-          "http://bazel-mirror.storage.googleapis.com/github.com/PrismJS/prism/archive/abee2b7587f1925e57777044270e2a1860810994.tar.gz",
+          "http://mirror.bazel.build/github.com/PrismJS/prism/archive/abee2b7587f1925e57777044270e2a1860810994.tar.gz",
           "https://github.com/PrismJS/prism/archive/abee2b7587f1925e57777044270e2a1860810994.tar.gz",
       ],
       strip_prefix = "prism-abee2b7587f1925e57777044270e2a1860810994",
@@ -2046,12 +2470,12 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
       ],
   )
 
-  webfiles_external(
+  web_library_external(
       name = "org_polymer_prism_element",
       licenses = ["notice"],  # BSD-3-Clause
       sha256 = "ad70bf9cd5bbdf525d465e1b0658867ab4022193eb9c74087a839044b46312b4",
       urls = [
-          "http://bazel-mirror.storage.googleapis.com/github.com/PolymerElements/prism-element/archive/1.0.4.tar.gz",
+          "http://mirror.bazel.build/github.com/PolymerElements/prism-element/archive/1.0.4.tar.gz",
           "https://github.com/PolymerElements/prism-element/archive/1.0.4.tar.gz",
       ],
       strip_prefix = "prism-element-1.0.4",
@@ -2066,29 +2490,31 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
       ],
   )
 
-  webfiles_external(
+  web_library_external(
       name = "org_polymer_promise_polyfill",
       licenses = ["notice"],  # BSD-3-Clause
       sha256 = "4495450e5d884c3e16b537b43afead7f84d17c7dc061bcfcbf440eac083e4ef5",
       strip_prefix = "promise-polyfill-1.0.0",
       urls = [
-          "http://bazel-mirror.storage.googleapis.com/github.com/PolymerLabs/promise-polyfill/archive/v1.0.0.tar.gz",
+          "http://mirror.bazel.build/github.com/PolymerLabs/promise-polyfill/archive/v1.0.0.tar.gz",
           "https://github.com/PolymerLabs/promise-polyfill/archive/v1.0.0.tar.gz",
       ],
       path = "/promise-polyfill",
       srcs = [
-          "Promise.js", "Promise-Statics.js", "promise-polyfill.html",
+          "Promise.js",
+          "Promise-Statics.js",
+          "promise-polyfill.html",
           "promise-polyfill-lite.html"
       ],
       deps = ["@org_polymer"],
   )
 
-  webfiles_external(
+  web_library_external(
       name = "org_polymer_web_animations_js",
       licenses = ["notice"],  # BSD-3-Clause
       sha256 = "f8bd760cbdeba131f6790bd5abe170bcbf7b1755ff58ed16d0b82fa8a7f34a7f",
       urls = [
-          "http://bazel-mirror.storage.googleapis.com/github.com/web-animations/web-animations-js/archive/2.2.1.tar.gz",
+          "http://mirror.bazel.build/github.com/web-animations/web-animations-js/archive/2.2.1.tar.gz",
           "https://github.com/web-animations/web-animations-js/archive/2.2.1.tar.gz",
       ],
       strip_prefix = "web-animations-js-2.2.1",
@@ -2096,12 +2522,12 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
       srcs = ["web-animations-next-lite.min.js"],
   )
 
-  webfiles_external(
+  web_library_external(
       name = "org_polymer_webcomponentsjs",
       licenses = ["notice"],  # BSD-3-Clause
       sha256 = "138c43306ee0a6d699ddca9b3c6b0f4982974ea8b7bdad291ea7276c72301df9",
       urls = [
-          "http://bazel-mirror.storage.googleapis.com/github.com/webcomponents/webcomponentsjs/archive/v0.7.22.tar.gz",
+          "http://mirror.bazel.build/github.com/webcomponents/webcomponentsjs/archive/v0.7.22.tar.gz",
           "https://github.com/webcomponents/webcomponentsjs/archive/v0.7.22.tar.gz",
       ],
       strip_prefix = "webcomponentsjs-0.7.22",
@@ -2121,3 +2547,132 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
           "webcomponents-lite.min.js",
       ],
   )
+
+  ##############################################################################
+  # TensorBoard Testing Dependencies
+
+  web_library_external(
+      name = "org_npmjs_registry_accessibility_developer_tools",
+      licenses = ["notice"],  # Apache License 2.0
+      sha256 = "1d6a72f401c9d53f68238c617dd43a05cd85ca5aa2e676a5b3c352711448e093",
+      urls = [
+          "http://mirror.bazel.build/registry.npmjs.org/accessibility-developer-tools/-/accessibility-developer-tools-2.10.0.tgz",
+          "https://registry.npmjs.org/accessibility-developer-tools/-/accessibility-developer-tools-2.10.0.tgz",
+      ],
+      strip_prefix = "package",
+      path = "/accessibility-developer-tools",
+      suppress = ["strictDependencies"],
+  )
+
+  web_library_external(
+      name = "org_npmjs_registry_async",
+      licenses = ["notice"],  # MIT
+      sha256 = "08655255ae810bf4d1cb1642df57658fcce823776d3ba8f4b46f4bbff6c87ece",
+      urls = [
+          "http://mirror.bazel.build/registry.npmjs.org/async/-/async-1.5.0.tgz",
+          "https://registry.npmjs.org/async/-/async-1.5.0.tgz",
+      ],
+      strip_prefix = "package",
+      path = "/async",
+  )
+
+  web_library_external(
+      name = "org_npmjs_registry_chai",
+      licenses = ["notice"],  # MIT
+      sha256 = "aca8137bed5bb295bd7173325b7ad604cd2aeb341d739232b4f9f0b26745be90",
+      urls = [
+          "http://mirror.bazel.build/registry.npmjs.org/chai/-/chai-3.5.0.tgz",
+          "https://registry.npmjs.org/chai/-/chai-3.5.0.tgz",
+      ],
+      strip_prefix = "package",
+      path = "/chai",
+  )
+
+  web_library_external(
+      name = "org_npmjs_registry_mocha",
+      licenses = ["notice"],  # MIT
+      sha256 = "13ef37a071196a2fba680799b906555d3f0ab61e80a7e8f73f93e77914590dd4",
+      urls = [
+          "http://mirror.bazel.build/registry.npmjs.org/mocha/-/mocha-2.5.3.tgz",
+          "https://registry.npmjs.org/mocha/-/mocha-2.5.3.tgz",
+      ],
+      suppress = ["strictDependencies"],
+      strip_prefix = "package",
+      path = "/mocha",
+  )
+
+  web_library_external(
+      name = "org_npmjs_registry_sinon",
+      licenses = ["notice"],  # BSD-3-Clause
+      sha256 = "49edb057695fc9019aae992bf7e677a07de7c6ce2bf9f9facde4a245045d1532",
+      urls = [
+          "http://mirror.bazel.build/registry.npmjs.org/sinon/-/sinon-1.17.4.tgz",
+          "https://registry.npmjs.org/sinon/-/sinon-1.17.4.tgz",
+      ],
+      strip_prefix = "package/lib",
+      path = "/sinonjs",
+  )
+
+  web_library_external(
+      name = "org_npmjs_registry_sinon_chai",
+      licenses = ["notice"],  # BSD-3-Clause
+      sha256 = "b85fc56f713832960b56fe9269ee4bb2cd41edd2ceb130b0936e5bdbed5dea63",
+      urls = [
+          "http://mirror.bazel.build/registry.npmjs.org/sinon-chai/-/sinon-chai-2.8.0.tgz",
+          "https://registry.npmjs.org/sinon-chai/-/sinon-chai-2.8.0.tgz",
+      ],
+      strip_prefix = "package",
+      path = "/sinon-chai",
+  )
+
+  web_library_external(
+      name = "org_npmjs_registry_stacky",
+      licenses = ["notice"],  # BSD-3-Clause
+      sha256 = "c659e60f7957d9d80c23a7aacc4d71b19c6421a08f91174c0062de369595acae",
+      urls = [
+          "http://mirror.bazel.build/registry.npmjs.org/stacky/-/stacky-1.3.1.tgz",
+          "https://registry.npmjs.org/stacky/-/stacky-1.3.1.tgz",
+      ],
+      strip_prefix = "package",
+      path = "/stacky",
+  )
+
+  web_library_external(
+      name = "org_npmjs_registry_web_component_tester",
+      licenses = ["notice"],  # BSD-3-Clause
+      sha256 = "9d4ebd4945df8a936916d4d32b7f280f2a3afa35f79e7ca8ad3ed0a42770c537",
+      urls = [
+          "http://mirror.bazel.build/registry.npmjs.org/web-component-tester/-/web-component-tester-4.3.6.tgz",
+          "https://registry.npmjs.org/web-component-tester/-/web-component-tester-4.3.6.tgz",
+      ],
+      strip_prefix = "package",
+      path = "/web-component-tester",
+      suppress = [
+          "absolutePaths",
+          "strictDependencies",
+      ],
+      deps = [
+          "@com_lodash",
+          "@org_npmjs_registry_accessibility_developer_tools",
+          "@org_npmjs_registry_async",
+          "@org_npmjs_registry_chai",
+          "@org_npmjs_registry_mocha",
+          "@org_npmjs_registry_sinon",
+          "@org_npmjs_registry_sinon_chai",
+          "@org_npmjs_registry_stacky",
+          "@org_polymer_test_fixture",
+      ],
+  )
+
+  web_library_external(
+      name = "org_polymer_test_fixture",
+      licenses = ["notice"],  # BSD-3-Clause
+      sha256 = "59d6cfb1187733b71275becfea181fe0aa1f734df5ff77f5850c806bbbf9a0d9",
+      strip_prefix = "test-fixture-2.0.1",
+      urls = [
+          "http://mirror.bazel.build/github.com/PolymerElements/test-fixture/archive/v2.0.1.tar.gz",
+          "https://github.com/PolymerElements/test-fixture/archive/v2.0.1.tar.gz",
+      ],
+      path = "/test-fixture",
+      exclude = ["test/**"],
+  )
diff --git a/third_party/eigen3/unsupported/Eigen/CXX11/Tensor b/third_party/eigen3/unsupported/Eigen/CXX11/Tensor
index 00d2e7c0c7813d084ca616bb201645161e271b8c..861a87b68bfd058fdc184335fd19957f624f9fcc 100644
--- a/third_party/eigen3/unsupported/Eigen/CXX11/Tensor
+++ b/third_party/eigen3/unsupported/Eigen/CXX11/Tensor
@@ -1,9 +1,11 @@
-#ifdef _WIN32
-#define sleep(seconds) Sleep(1000*seconds)
-#endif  // _WIN32
 #include "unsupported/Eigen/CXX11/Tensor"
 
 #ifdef _WIN32
+#ifndef SLEEP_FUNC_HEADER_GUARD
+#define SLEEP_FUNC_HEADER_GUARD
+inline void sleep(unsigned int seconds) { Sleep(1000*seconds); }
+#endif
+
 // On Windows, Eigen will include Windows.h, which defines various
 // macros that conflict with TensorFlow symbols. Undefine them here to
 // prevent clashes.
diff --git a/tensorflow/opensource_only/eigen.threadpool b/third_party/eigen3/unsupported/Eigen/CXX11/eigen.threadpool
similarity index 100%
rename from tensorflow/opensource_only/eigen.threadpool
rename to third_party/eigen3/unsupported/Eigen/CXX11/eigen.threadpool
diff --git a/third_party/gpus/crosstool/CROSSTOOL.tpl b/third_party/gpus/crosstool/CROSSTOOL.tpl
deleted file mode 100644
index b77a45c3257c4f9e3865dd9ff58db7cb0285eed7..0000000000000000000000000000000000000000
--- a/third_party/gpus/crosstool/CROSSTOOL.tpl
+++ /dev/null
@@ -1,249 +0,0 @@
-major_version: "local"
-minor_version: ""
-default_target_cpu: "same_as_host"
-
-default_toolchain {
-  cpu: "k8"
-  toolchain_identifier: "local_linux"
-}
-default_toolchain {
-  cpu: "piii"
-  toolchain_identifier: "local_linux"
-}
-default_toolchain {
-  cpu: "arm"
-  toolchain_identifier: "local_linux"
-}
-default_toolchain {
-  cpu: "darwin"
-  toolchain_identifier: "local_darwin"
-}
-default_toolchain {
-  cpu: "ppc"
-  toolchain_identifier: "local_linux"
-}
-
-toolchain {
-  abi_version: "local"
-  abi_libc_version: "local"
-  builtin_sysroot: ""
-  compiler: "compiler"
-  host_system_name: "local"
-  needsPic: true
-  supports_gold_linker: false
-  supports_incremental_linker: false
-  supports_fission: false
-  supports_interface_shared_objects: false
-  supports_normalizing_ar: false
-  supports_start_end_lib: false
-  supports_thin_archives: false
-  target_libc: "local"
-  target_cpu: "local"
-  target_system_name: "local"
-  toolchain_identifier: "local_linux"
-
-  tool_path { name: "ar" path: "/usr/bin/ar" }
-  tool_path { name: "compat-ld" path: "/usr/bin/ld" }
-  tool_path { name: "cpp" path: "/usr/bin/cpp" }
-  tool_path { name: "dwp" path: "/usr/bin/dwp" }
-  # As part of the TensorFlow release, we place some cuda-related compilation
-  # files in @local_config_cuda//crosstool/clang/bin, and this relative
-  # path, combined with the rest of our Bazel configuration causes our
-  # compilation to use those files.
-  tool_path { name: "gcc" path: "clang/bin/crosstool_wrapper_driver_is_not_gcc" }
-  # Use "-std=c++11" for nvcc. For consistency, force both the host compiler
-  # and the device compiler to use "-std=c++11".
-  cxx_flag: "-std=c++11"
-  linker_flag: "-Wl,-no-as-needed"
-  linker_flag: "-lstdc++"
-  linker_flag: "-B/usr/bin/"
-
-%{gcc_host_compiler_includes}
-  tool_path { name: "gcov" path: "/usr/bin/gcov" }
-
-  # C(++) compiles invoke the compiler (as that is the one knowing where
-  # to find libraries), but we provide LD so other rules can invoke the linker.
-  tool_path { name: "ld" path: "/usr/bin/ld" }
-
-  tool_path { name: "nm" path: "/usr/bin/nm" }
-  tool_path { name: "objcopy" path: "/usr/bin/objcopy" }
-  objcopy_embed_flag: "-I"
-  objcopy_embed_flag: "binary"
-  tool_path { name: "objdump" path: "/usr/bin/objdump" }
-  tool_path { name: "strip" path: "/usr/bin/strip" }
-
-  # Anticipated future default.
-  unfiltered_cxx_flag: "-no-canonical-prefixes"
-
-  # Make C++ compilation deterministic. Use linkstamping instead of these
-  # compiler symbols.
-  unfiltered_cxx_flag: "-Wno-builtin-macro-redefined"
-  unfiltered_cxx_flag: "-D__DATE__=\"redacted\""
-  unfiltered_cxx_flag: "-D__TIMESTAMP__=\"redacted\""
-  unfiltered_cxx_flag: "-D__TIME__=\"redacted\""
-
-  # Security hardening on by default.
-  # Conservative choice; -D_FORTIFY_SOURCE=2 may be unsafe in some cases.
-  # We need to undef it before redefining it as some distributions now have
-  # it enabled by default.
-  compiler_flag: "-U_FORTIFY_SOURCE"
-  compiler_flag: "-D_FORTIFY_SOURCE=1"
-  compiler_flag: "-fstack-protector"
-  compiler_flag: "-fPIE"
-  linker_flag: "-pie"
-  linker_flag: "-Wl,-z,relro,-z,now"
-
-  # Enable coloring even if there's no attached terminal. Bazel removes the
-  # escape sequences if --nocolor is specified. This isn't supported by gcc
-  # on Ubuntu 14.04.
-  # compiler_flag: "-fcolor-diagnostics"
-
-  # All warnings are enabled. Maybe enable -Werror as well?
-  compiler_flag: "-Wall"
-  # Enable a few more warnings that aren't part of -Wall.
-  compiler_flag: "-Wunused-but-set-parameter"
-  # But disable some that are problematic.
-  compiler_flag: "-Wno-free-nonheap-object" # has false positives
-
-  # Keep stack frames for debugging, even in opt mode.
-  compiler_flag: "-fno-omit-frame-pointer"
-
-  # Anticipated future default.
-  linker_flag: "-no-canonical-prefixes"
-  unfiltered_cxx_flag: "-fno-canonical-system-headers"
-  # Have gcc return the exit code from ld.
-  linker_flag: "-pass-exit-codes"
-  # Stamp the binary with a unique identifier.
-  linker_flag: "-Wl,--build-id=md5"
-  linker_flag: "-Wl,--hash-style=gnu"
-  # Gold linker only? Can we enable this by default?
-  # linker_flag: "-Wl,--warn-execstack"
-  # linker_flag: "-Wl,--detect-odr-violations"
-
-  # Include directory for cuda headers.
-  cxx_builtin_include_directory: "%{cuda_include_path}"
-
-  compilation_mode_flags {
-    mode: DBG
-    # Enable debug symbols.
-    compiler_flag: "-g"
-  }
-  compilation_mode_flags {
-    mode: OPT
-
-    # No debug symbols.
-    # Maybe we should enable https://gcc.gnu.org/wiki/DebugFission for opt or
-    # even generally? However, that can't happen here, as it requires special
-    # handling in Bazel.
-    compiler_flag: "-g0"
-
-    # Conservative choice for -O
-    # -O3 can increase binary size and even slow down the resulting binaries.
-    # Profile first and / or use FDO if you need better performance than this.
-    compiler_flag: "-O2"
-
-    # Disable assertions
-    compiler_flag: "-DNDEBUG"
-
-    # Removal of unused code and data at link time (can this increase binary size in some cases?).
-    compiler_flag: "-ffunction-sections"
-    compiler_flag: "-fdata-sections"
-    linker_flag: "-Wl,--gc-sections"
-  }
-  linking_mode_flags { mode: DYNAMIC }
-}
-
-toolchain {
-  abi_version: "local"
-  abi_libc_version: "local"
-  builtin_sysroot: ""
-  compiler: "compiler"
-  host_system_name: "local"
-  needsPic: true
-  target_libc: "macosx"
-  target_cpu: "darwin"
-  target_system_name: "local"
-  toolchain_identifier: "local_darwin"
-
-  tool_path { name: "ar" path: "/usr/bin/libtool" }
-  tool_path { name: "compat-ld" path: "/usr/bin/ld" }
-  tool_path { name: "cpp" path: "/usr/bin/cpp" }
-  tool_path { name: "dwp" path: "/usr/bin/dwp" }
-  tool_path { name: "gcc" path: "clang/bin/crosstool_wrapper_driver_is_not_gcc" }
-  cxx_flag: "-std=c++11"
-  ar_flag: "-static"
-  ar_flag: "-s"
-  ar_flag: "-o"
-  linker_flag: "-lc++"
-  linker_flag: "-undefined"
-  linker_flag: "dynamic_lookup"
-  # TODO(ulfjack): This is wrong on so many levels. Figure out a way to auto-detect the proper
-  # setting from the local compiler, and also how to make incremental builds correct.
-  cxx_builtin_include_directory: "/"
-  tool_path { name: "gcov" path: "/usr/bin/gcov" }
-  tool_path { name: "ld" path: "/usr/bin/ld" }
-  tool_path { name: "nm" path: "/usr/bin/nm" }
-  tool_path { name: "objcopy" path: "/usr/bin/objcopy" }
-  objcopy_embed_flag: "-I"
-  objcopy_embed_flag: "binary"
-  tool_path { name: "objdump" path: "/usr/bin/objdump" }
-  tool_path { name: "strip" path: "/usr/bin/strip" }
-
-  # Anticipated future default.
-  unfiltered_cxx_flag: "-no-canonical-prefixes"
-  # Make C++ compilation deterministic. Use linkstamping instead of these
-  # compiler symbols.
-  unfiltered_cxx_flag: "-Wno-builtin-macro-redefined"
-  unfiltered_cxx_flag: "-D__DATE__=\"redacted\""
-  unfiltered_cxx_flag: "-D__TIMESTAMP__=\"redacted\""
-  unfiltered_cxx_flag: "-D__TIME__=\"redacted\""
-
-  # Security hardening on by default.
-  # Conservative choice; -D_FORTIFY_SOURCE=2 may be unsafe in some cases.
-  compiler_flag: "-D_FORTIFY_SOURCE=1"
-  compiler_flag: "-fstack-protector"
-
-  # Enable coloring even if there's no attached terminal. Bazel removes the
-  # escape sequences if --nocolor is specified.
-  compiler_flag: "-fcolor-diagnostics"
-
-  # All warnings are enabled. Maybe enable -Werror as well?
-  compiler_flag: "-Wall"
-  # Enable a few more warnings that aren't part of -Wall.
-  compiler_flag: "-Wthread-safety"
-  compiler_flag: "-Wself-assign"
-
-  # Keep stack frames for debugging, even in opt mode.
-  compiler_flag: "-fno-omit-frame-pointer"
-
-  # Anticipated future default.
-  linker_flag: "-no-canonical-prefixes"
-
-  # Include directory for cuda headers.
-  cxx_builtin_include_directory: "%{cuda_include_path}"
-
-  compilation_mode_flags {
-    mode: DBG
-    # Enable debug symbols.
-    compiler_flag: "-g"
-  }
-  compilation_mode_flags {
-    mode: OPT
-    # No debug symbols.
-    # Maybe we should enable https://gcc.gnu.org/wiki/DebugFission for opt or even generally?
-    # However, that can't happen here, as it requires special handling in Bazel.
-    compiler_flag: "-g0"
-
-    # Conservative choice for -O
-    # -O3 can increase binary size and even slow down the resulting binaries.
-    # Profile first and / or use FDO if you need better performance than this.
-    compiler_flag: "-O2"
-
-    # Disable assertions
-    compiler_flag: "-DNDEBUG"
-
-    # Removal of unused code and data at link time (can this increase binary size in some cases?).
-    compiler_flag: "-ffunction-sections"
-    compiler_flag: "-fdata-sections"
-  }
-}
diff --git a/third_party/gpus/crosstool/CROSSTOOL_nvcc.tpl b/third_party/gpus/crosstool/CROSSTOOL_nvcc.tpl
index 116f67cbae4a0a4f9b12fba14e81b8d743b1e7fd..05290d647ea1b25f073f6e0c2a8de07c0fe65d58 100644
--- a/third_party/gpus/crosstool/CROSSTOOL_nvcc.tpl
+++ b/third_party/gpus/crosstool/CROSSTOOL_nvcc.tpl
@@ -121,7 +121,7 @@ toolchain {
   # linker_flag: "-Wl,--detect-odr-violations"
 
   # Include directory for cuda headers.
-  cxx_builtin_include_directory: "%{cuda_include_path}"
+%{cuda_include_path}
 
   compilation_mode_flags {
     mode: DBG
@@ -220,7 +220,7 @@ toolchain {
   linker_flag: "-no-canonical-prefixes"
 
   # Include directory for cuda headers.
-  cxx_builtin_include_directory: "%{cuda_include_path}"
+%{cuda_include_path}
 
   compilation_mode_flags {
     mode: DBG
diff --git a/third_party/gpus/crosstool/clang/bin/crosstool_wrapper_driver_is_not_gcc.tpl b/third_party/gpus/crosstool/clang/bin/crosstool_wrapper_driver_is_not_gcc.tpl
index ceebb1e0e639bb98b433be1e9706ac85e9a4b7f3..242439daf456d6fd31a140e5d2c56d3e89900652 100755
--- a/third_party/gpus/crosstool/clang/bin/crosstool_wrapper_driver_is_not_gcc.tpl
+++ b/third_party/gpus/crosstool/clang/bin/crosstool_wrapper_driver_is_not_gcc.tpl
@@ -49,8 +49,7 @@ import pipes
 CPU_COMPILER = ('%{cpu_compiler}')
 GCC_HOST_COMPILER_PATH = ('%{gcc_host_compiler_path}')
 
-CURRENT_DIR = os.path.dirname(sys.argv[0])
-NVCC_PATH = CURRENT_DIR + '/../../../cuda/bin/nvcc'
+NVCC_PATH = '%{nvcc_path}'
 PREFIX_DIR = os.path.dirname(GCC_HOST_COMPILER_PATH)
 NVCC_VERSION = '%{cuda_version}'
 
@@ -228,7 +227,7 @@ def InvokeNvcc(argv, log=False):
 
   # TODO(zhengxq): for some reason, 'gcc' needs this help to find 'as'.
   # Need to investigate and fix.
-  cmd = 'PATH=' + PREFIX_DIR + ' ' + cmd
+  cmd = 'PATH=' + PREFIX_DIR + ':$PATH ' + cmd
   if log: Log(cmd)
   return os.system(cmd)
 
diff --git a/third_party/gpus/cuda/BUILD.tpl b/third_party/gpus/cuda/BUILD.tpl
index 4b996db7a6704e039ea9b1202ab4c7d1d1eb50c0..f7610dd7a99e3c65ac494d23f0a408d4391680c0 100644
--- a/third_party/gpus/cuda/BUILD.tpl
+++ b/third_party/gpus/cuda/BUILD.tpl
@@ -1,7 +1,5 @@
 licenses(["restricted"])  # MPL2, portions GPL v3, LGPL v3, BSD-like
 
-load("@local_config_cuda//cuda:platform.bzl", "readlink_command")
-
 package(default_visibility = ["//visibility:public"])
 
 config_setting(
@@ -41,10 +39,10 @@ config_setting(
 
 cc_library(
     name = "cuda_headers",
-    hdrs = glob([
-        "**/*.h",
-        "**/*.hpp",
-    ]),
+    hdrs = [
+        "cuda_config.h",
+        %{cuda_headers}
+    ],
     includes = [
         ".",
         "include",
@@ -55,7 +53,7 @@ cc_library(
 cc_library(
     name = "cudart_static",
     srcs = ["lib/%{cudart_static_lib}"],
-    includes = ["include/"],
+    includes = ["include"],
     linkopts = select({
         ":freebsd": [],
         "//conditions:default": ["-ldl"],
@@ -69,7 +67,7 @@ cc_library(
 cc_library(
     name = "cuda_driver",
     srcs = ["lib/%{cuda_driver_lib}"],
-    includes = ["include/"],
+    includes = ["include"],
     visibility = ["//visibility:public"],
 )
 
@@ -77,7 +75,7 @@ cc_library(
     name = "cudart",
     srcs = ["lib/%{cudart_lib}"],
     data = ["lib/%{cudart_lib}"],
-    includes = ["include/"],
+    includes = ["include"],
     linkstatic = 1,
     visibility = ["//visibility:public"],
 )
@@ -86,7 +84,7 @@ cc_library(
     name = "cublas",
     srcs = ["lib/%{cublas_lib}"],
     data = ["lib/%{cublas_lib}"],
-    includes = ["include/"],
+    includes = ["include"],
     linkstatic = 1,
     visibility = ["//visibility:public"],
 )
@@ -95,7 +93,7 @@ cc_library(
     name = "cusolver",
     srcs = ["lib/%{cusolver_lib}"],
     data = ["lib/%{cusolver_lib}"],
-    includes = ["include/"],
+    includes = ["include"],
     linkstatic = 1,
     linkopts = ["-lgomp"],
     visibility = ["//visibility:public"],
@@ -105,7 +103,7 @@ cc_library(
     name = "cudnn",
     srcs = ["lib/%{cudnn_lib}"],
     data = ["lib/%{cudnn_lib}"],
-    includes = ["include/"],
+    includes = ["include"],
     linkstatic = 1,
     visibility = ["//visibility:public"],
 )
@@ -114,7 +112,7 @@ cc_library(
     name = "cufft",
     srcs = ["lib/%{cufft_lib}"],
     data = ["lib/%{cufft_lib}"],
-    includes = ["include/"],
+    includes = ["include"],
     linkstatic = 1,
     visibility = ["//visibility:public"],
 )
@@ -123,7 +121,7 @@ cc_library(
     name = "curand",
     srcs = ["lib/%{curand_lib}"],
     data = ["lib/%{curand_lib}"],
-    includes = ["include/"],
+    includes = ["include"],
     linkstatic = 1,
     visibility = ["//visibility:public"],
 )
@@ -143,9 +141,10 @@ cc_library(
 
 cc_library(
     name = "cupti_headers",
-    hdrs = glob([
-        "**/*.h",
-    ]),
+    hdrs = [
+        "cuda_config.h",
+        ":cuda-extras",
+    ],
     includes = [
         ".",
         "extras/CUPTI/include/",
@@ -161,6 +160,8 @@ cc_library(
 
 cc_library(
     name = "libdevice_root",
-    data = glob(["nvvm/libdevice/*.bc"]),
+    data = [":cuda-nvvm"],
     visibility = ["//visibility:public"],
 )
+
+%{cuda_include_genrules}
diff --git a/third_party/gpus/cuda/platform.bzl.tpl b/third_party/gpus/cuda/platform.bzl.tpl
deleted file mode 100644
index 01ef24b94edf840126822f55da93fd9a84b4fc73..0000000000000000000000000000000000000000
--- a/third_party/gpus/cuda/platform.bzl.tpl
+++ /dev/null
@@ -1,15 +0,0 @@
-CUDA_VERSION = "%{cuda_version}"
-CUDNN_VERSION = "%{cudnn_version}"
-PLATFORM = "%{platform}"
-
-def cuda_sdk_version():
-  return CUDA_VERSION
-
-def cudnn_sdk_version():
-  return CUDNN_VERSION
-
-def readlink_command():
-  if PLATFORM == "Darwin":
-    return "greadlink"
-  else:
-    return "readlink"
diff --git a/third_party/gpus/cuda_configure.bzl b/third_party/gpus/cuda_configure.bzl
index 60118a95f27522ca2fb567ccce1a83b9373a4251..6994db0a044bf6e324c0aaabd004e9286d8ff7ba 100644
--- a/third_party/gpus/cuda_configure.bzl
+++ b/third_party/gpus/cuda_configure.bzl
@@ -147,6 +147,36 @@ def _host_compiler_includes(repository_ctx, cc):
     inc_entries.append("  cxx_builtin_include_directory: \"%s\"" % inc_dir)
   return "\n".join(inc_entries)
 
+def _cuda_include_path(repository_ctx, cuda_config):
+  """Generates the cxx_builtin_include_directory entries for cuda inc dirs.
+
+  Args:
+    repository_ctx: The repository context.
+    cc: The path to the gcc host compiler.
+
+  Returns:
+    A string containing the cxx_builtin_include_directory for each of the gcc
+    host compiler include directories, which can be added to the CROSSTOOL
+    file.
+  """
+  nvcc_path = repository_ctx.path("%s/bin/nvcc%s" %
+                                  (cuda_config.cuda_toolkit_path,
+                                   ".exe" if cuda_config.cpu_value == "Windows" else ""))
+  result = repository_ctx.execute([nvcc_path, '-v',
+                                  '/dev/null', '-o', '/dev/null'])
+  target_dir = ""
+  for one_line in result.stderr.splitlines():
+    if one_line.startswith('#$ _TARGET_DIR_='):
+      target_dir = (cuda_config.cuda_toolkit_path + '/' +
+                    one_line.replace('#$ _TARGET_DIR_=', '') + "/include")
+  inc_entries = []
+  if target_dir != "":
+    inc_entries.append("  cxx_builtin_include_directory: \"%s\"" % target_dir)
+  default_include = cuda_config.cuda_toolkit_path + '/include'
+  inc_entries.append("  cxx_builtin_include_directory: \"%s\"" %
+                     default_include)
+  return "\n".join(inc_entries)
+
 
 def _enable_cuda(repository_ctx):
   if "TF_NEED_CUDA" in repository_ctx.os.environ:
@@ -699,12 +729,8 @@ def _create_dummy_repository(repository_ctx):
            "%{cufft_lib}": _lib_name("cufft", cpu_value),
            "%{curand_lib}": _lib_name("curand", cpu_value),
            "%{cupti_lib}": _lib_name("cupti", cpu_value),
-       })
-  _tpl(repository_ctx, "cuda:platform.bzl",
-       {
-           "%{cuda_version}": _DEFAULT_CUDA_VERSION,
-           "%{cudnn_version}": _DEFAULT_CUDNN_VERSION,
-           "%{platform}": cpu_value,
+           "%{cuda_include_genrules}": '',
+           "%{cuda_headers}": '',
        })
 
   # Create dummy files for the CUDA toolkit since they are still required by
@@ -742,17 +768,64 @@ def _create_dummy_repository(repository_ctx):
                       _DUMMY_CROSSTOOL_BZL_FILE)
   repository_ctx.file("crosstool/BUILD", _DUMMY_CROSSTOOL_BUILD_FILE)
 
-def _symlink_dir(repository_ctx, src_dir, dest_dir):
-  """Symlinks all the files in a directory.
 
-  Args:
-    repository_ctx: The repository context.
-    src_dir: The source directory.
-    dest_dir: The destination directory to create the symlinks in.
+def _symlink_genrule_for_dir(repository_ctx, src_dir, dest_dir, genrule_name,
+    src_files = [], dest_files = []):
+  """Returns a genrule to symlink a set of files.
+
+  If src_dir is passed, files will be read from the given directory; otherwise
+  we assume files are in src_files and dest_files
+  """
+  if src_dir != None:
+    files = _read_dir(repository_ctx, src_dir)
+    # Create a list with the src_dir stripped to use for outputs.
+    dest_files = files.replace(src_dir, '').splitlines()
+    src_files = files.splitlines()
+  command = []
+  outs = []
+  for i in range(len(dest_files)):
+    if dest_files[i] != "":
+      # If we have only one file to link we do not want to use the dest_dir, as
+      # $(@D) will include the full path to the file.
+      dest = ' $(@D)/' + dest_dir + dest_files[i] if len(dest_files) != 1 else ' $(@D)' + dest_files[i]
+      command.append('ln -s ' + src_files[i] + dest)
+      outs.append('      "' + dest_dir + dest_files[i] + '",')
+  genrule = _genrule(src_dir, genrule_name, " && ".join(command),
+                     "\n".join(outs))
+  return genrule
+
+
+def _genrule(src_dir, genrule_name, command, outs):
+  """Returns a string with a genrule.
+
+  Genrule executes the given command and produces the given outputs.
+  """
+  return (
+      'genrule(\n' +
+      '    name = "' +
+      genrule_name + '",\n' +
+      '    outs = [\n' +
+      outs +
+      '    ],\n' +
+      '    cmd = """\n' +
+      command +
+      '    """,\n' +
+      ')\n\n'
+  )
+
+
+def _read_dir(repository_ctx, src_dir):
+  """Returns a string with all files in a directory.
+
+  Finds all files inside a directory, traversing subfolders and following
+  symlinks. The returned string contains the full path of all files
+  separated by line breaks.
   """
-  files = repository_ctx.path(src_dir).readdir()
-  for src_file in files:
-    repository_ctx.symlink(src_file, dest_dir + "/" + src_file.basename)
+  find_result = repository_ctx.execute([
+      "find", src_dir, "-follow", "-type", "f"
+  ])
+  return find_result.stdout
+
 
 def _use_cuda_clang(repository_ctx):
   if "TF_CUDA_CLANG" in repository_ctx.os.environ:
@@ -775,25 +848,42 @@ def _create_cuda_repository(repository_ctx):
   cudnn_header_dir = _find_cudnn_header_dir(repository_ctx,
                                             cuda_config.cudnn_install_basedir)
 
-  # Set up symbolic links for the cuda toolkit. We link at the individual file
-  # level not at the directory level. This is because the external library may
-  # have a different file layout from our desired structure.
+  # Set up symbolic links for the cuda toolkit by creating genrules to do
+  # symlinking. We create one genrule for each directory we want to track under
+  # cuda_toolkit_path
   cuda_toolkit_path = cuda_config.cuda_toolkit_path
-  _symlink_dir(repository_ctx, cuda_toolkit_path + "/include", "cuda/include")
-  _symlink_dir(repository_ctx, cuda_toolkit_path + "/bin", "cuda/bin")
-  _symlink_dir(repository_ctx, cuda_toolkit_path + "/nvvm", "cuda/nvvm")
-  _symlink_dir(repository_ctx, cuda_toolkit_path + "/extras/CUPTI/include",
-               "cuda/extras/CUPTI/include")
+  cuda_include_path = cuda_toolkit_path + "/include"
+  genrules = [_symlink_genrule_for_dir(repository_ctx,
+      cuda_include_path, "include", "cuda-include")]
+  genrules.append(_symlink_genrule_for_dir(repository_ctx,
+      cuda_toolkit_path + "/nvvm", "nvvm", "cuda-nvvm"))
+  genrules.append(_symlink_genrule_for_dir(repository_ctx,
+      cuda_toolkit_path + "/extras/CUPTI/include",
+      "extras/CUPTI/include", "cuda-extras"))
 
   cuda_libs = _find_libs(repository_ctx, cuda_config)
+  cuda_lib_src = []
+  cuda_lib_dest = []
   for lib in cuda_libs.values():
-    repository_ctx.symlink(lib.path, "cuda/lib/" + lib.file_name)
+    cuda_lib_src.append(lib.path)
+    cuda_lib_dest.append("lib/" + lib.file_name)
+  genrules.append(_symlink_genrule_for_dir(repository_ctx, None, "", "cuda-lib",
+                                       cuda_lib_src, cuda_lib_dest))
 
   # Set up the symbolic links for cudnn if cudnn was was not installed to
   # CUDA_TOOLKIT_PATH.
-  if not repository_ctx.path("cuda/include/cudnn.h").exists:
-    repository_ctx.symlink(cudnn_header_dir + "/cudnn.h",
-                           "cuda/include/cudnn.h")
+  included_files = _read_dir(repository_ctx, cuda_include_path).replace(
+      cuda_include_path, '').splitlines()
+  if '/cudnn.h' not in included_files:
+    genrules.append(_symlink_genrule_for_dir(repository_ctx, None, "",
+        "cudnn-include", [cudnn_header_dir + "/cudnn.h"], ["include/cudnn.h"]))
+  else:
+    genrules.append(
+            'filegroup(\n' +
+            '    name = "cudnn-include",\n' +
+            '    srcs = [],\n' +
+            ')\n'
+        )
 
   # Set up BUILD file for cuda/
   _tpl(repository_ctx, "cuda:build_defs.bzl",
@@ -815,33 +905,33 @@ def _create_cuda_repository(repository_ctx):
            "%{cufft_lib}": cuda_libs["cufft"].file_name,
            "%{curand_lib}": cuda_libs["curand"].file_name,
            "%{cupti_lib}": cuda_libs["cupti"].file_name,
+           "%{cuda_include_genrules}": "\n".join(genrules),
+           "%{cuda_headers}": ('":cuda-include",\n' +
+                               '        ":cudnn-include",')
        })
-
-  _tpl(repository_ctx, "cuda:platform.bzl",
-       {
-           "%{cuda_version}": cuda_config.cuda_version,
-           "%{cudnn_version}": cuda_config.cudnn_version,
-           "%{platform}": cuda_config.cpu_value,
-       })
-
   # Set up crosstool/
   _file(repository_ctx, "crosstool:BUILD")
   cc = find_cc(repository_ctx)
   host_compiler_includes = _host_compiler_includes(repository_ctx, cc)
   cuda_defines = {
-           "%{cuda_include_path}": cuda_config.cuda_toolkit_path + '/include',
+           "%{cuda_include_path}": _cuda_include_path(repository_ctx,
+                                                      cuda_config),
            "%{host_compiler_includes}": host_compiler_includes,
        }
   if _use_cuda_clang(repository_ctx):
     cuda_defines["%{clang_path}"] = cc
     _tpl(repository_ctx, "crosstool:CROSSTOOL_clang", cuda_defines, out="crosstool/CROSSTOOL")
   else:
+    nvcc_path = str(repository_ctx.path("%s/bin/nvcc%s" %
+        (cuda_config.cuda_toolkit_path,
+        ".exe" if cuda_config.cpu_value == "Windows" else "")))
     _tpl(repository_ctx, "crosstool:CROSSTOOL_nvcc", cuda_defines, out="crosstool/CROSSTOOL")
     _tpl(repository_ctx,
          "crosstool:clang/bin/crosstool_wrapper_driver_is_not_gcc",
          {
              "%{cpu_compiler}": str(cc),
              "%{cuda_version}": cuda_config.cuda_version,
+             "%{nvcc_path}": nvcc_path,
              "%{gcc_host_compiler_path}": str(cc),
              "%{cuda_compute_capabilities}": ", ".join(
                  ["\"%s\"" % c for c in cuda_config.compute_capabilities]),
diff --git a/third_party/grpc.BUILD b/third_party/grpc.BUILD
index 1d1e2222dea8044b046a7cddb22a009542553e8c..b79259618f2f06c941b5a8e3427dd0d5a0fe1e40 100644
--- a/third_party/grpc.BUILD
+++ b/third_party/grpc.BUILD
@@ -176,8 +176,7 @@ cc_library(
         ".",
         "include",
     ],
-    deps = [
-    ],
+    linkopts = ["-lpthread"],
 )
 
 cc_library(
@@ -1782,6 +1781,7 @@ cc_library(
         ".",
         "include",
     ],
+    linkopts = ["-lpthread"],
     deps = [
         ":gpr",
         ":grpc_unsecure",
diff --git a/third_party/jemalloc.BUILD b/third_party/jemalloc.BUILD
index 8ed13c51a5d4cd4bed78d9de3f54b017d4d1f42c..3a9a9a80f2e5aa433c27ecb65fa279f146f43da6 100644
--- a/third_party/jemalloc.BUILD
+++ b/third_party/jemalloc.BUILD
@@ -94,6 +94,9 @@ cc_library(
         "@%ws%//tensorflow:linux_ppc64le": [
             "-lpthread",
         ],
+        "@%ws%//tensorflow:linux_x86_64": [
+            "-lpthread",
+        ],
         "//conditions:default": [
         ],
     }),
diff --git a/third_party/llvm/llvm.BUILD b/third_party/llvm/llvm.BUILD
index d5ab32628355390dca0d41dc8def3c5876cfddfd..15aa53962d1ac0c90da57fd246af9b57216b2ef6 100644
--- a/third_party/llvm/llvm.BUILD
+++ b/third_party/llvm/llvm.BUILD
@@ -366,6 +366,7 @@ llvm_target_list = [
             ("-gen-asm-matcher", "lib/Target/ARM/ARMGenAsmMatcher.inc"),
             ("-gen-dag-isel", "lib/Target/ARM/ARMGenDAGISel.inc"),
             ("-gen-fast-isel", "lib/Target/ARM/ARMGenFastISel.inc"),
+            ("-gen-global-isel", "lib/Target/ARM/ARMGenGlobalISel.inc"),
             ("-gen-callingconv", "lib/Target/ARM/ARMGenCallingConv.inc"),
             ("-gen-subtarget", "lib/Target/ARM/ARMGenSubtargetInfo.inc"),
             ("-gen-disassembler", "lib/Target/ARM/ARMGenDisassemblerTables.inc"),
@@ -440,6 +441,16 @@ llvm_target_list = [
     for target in llvm_target_list
 ]
 
+# This target is used to provide *.def files to x86_code_gen.
+# Files with '.def' extension are not allowed in 'srcs' of 'cc_library' rule.
+cc_library(
+    name = "x86_defs",
+    hdrs = glob([
+        "lib/Target/X86/*.def",
+    ]),
+    visibility = ["//visibility:private"],
+)
+
 cc_library(
     name = "aarch64_asm_parser",
     srcs = glob([
@@ -627,6 +638,7 @@ cc_library(
         "lib/Analysis/*.cpp",
         "lib/Analysis/*.inc",
         "include/llvm/Transforms/Utils/Local.h",
+        "include/llvm/Transforms/Scalar.h",
         "lib/Analysis/*.h",
     ]),
     hdrs = glob([
@@ -729,6 +741,7 @@ cc_library(
         "lib/Target/ARM/MCTargetDesc/*.cpp",
         "lib/Target/ARM/MCTargetDesc/*.inc",
         "lib/Target/ARM/*.h",
+        "include/llvm/CodeGen/GlobalISel/GISelAccessor.h",
     ]),
     hdrs = glob([
         "include/llvm/Target/ARM/MCTargetDesc/*.h",
@@ -909,6 +922,7 @@ cc_library(
         ":bit_writer",
         ":config",
         ":core",
+        ":instrumentation",
         ":mc",
         ":profile_data",
         ":scalar",
@@ -1107,6 +1121,9 @@ cc_library(
         "lib/Transforms/IPO/*.c",
         "lib/Transforms/IPO/*.cpp",
         "lib/Transforms/IPO/*.inc",
+        "include/llvm/Transforms/SampleProfile.h",
+        "include/llvm-c/Transforms/IPO.h",
+        "include/llvm-c/Transforms/PassManagerBuilder.h",
         "lib/Transforms/IPO/*.h",
     ]),
     hdrs = glob([
@@ -1116,6 +1133,7 @@ cc_library(
     ]),
     deps = [
         ":analysis",
+        ":bit_reader",
         ":bit_writer",
         ":config",
         ":core",
@@ -1372,6 +1390,7 @@ cc_library(
         "lib/Transforms/ObjCARC/*.c",
         "lib/Transforms/ObjCARC/*.cpp",
         "lib/Transforms/ObjCARC/*.inc",
+        "include/llvm/Transforms/ObjCARC.h",
         "lib/Transforms/ObjCARC/*.h",
     ]),
     hdrs = glob([
@@ -1681,6 +1700,7 @@ cc_library(
         "lib/Support/Unix/*.inc",
         "lib/Support/Unix/*.h",
         "include/llvm-c/*.h",
+        "include/llvm/CodeGen/MachineValueType.h",
         "lib/Support/*.h",
     ]),
     hdrs = glob([
@@ -1689,7 +1709,11 @@ cc_library(
         "include/llvm/Support/*.inc",
         "include/llvm/ADT/*.h",
         "include/llvm/Support/ELFRelocs/*.def",
-    ]) + ["include/llvm/Support/DataTypes.h"],
+        "include/llvm/Support/WasmRelocs/*.def",
+    ]) + [
+        "include/llvm/Support/DataTypes.h",
+        "include/llvm/ExecutionEngine/ObjectMemoryBuffer.h",
+    ],
     deps = [
         ":config",
         ":demangle",
@@ -1786,6 +1810,7 @@ cc_library(
         ":analysis",
         ":config",
         ":core",
+        ":scalar",
         ":support",
         ":transform_utils",
     ],
@@ -1865,6 +1890,7 @@ cc_library(
         ":support",
         ":target",
         ":x86_asm_printer",
+        ":x86_defs",
         ":x86_desc",
         ":x86_info",
         ":x86_utils",
diff --git a/third_party/mkl/BUILD b/third_party/mkl/BUILD
index 7e95ebd35514649df3d9ac172aca9763b88c4896..8c86766effa97a08f6089194a5d9202da0e003b3 100644
--- a/third_party/mkl/BUILD
+++ b/third_party/mkl/BUILD
@@ -16,6 +16,7 @@ load(
 cc_library(
     name = "intel_binary_blob",
     srcs = if_mkl([
+        "libdl.so.2",
         "libmklml_intel.so",
         "libiomp5.so",
     ]),
diff --git a/third_party/nccl/BUILD b/third_party/nccl/BUILD
deleted file mode 100644
index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000
diff --git a/third_party/nccl/fix_clang_compilation.patch b/third_party/nccl/fix_clang_compilation.patch
deleted file mode 100644
index e8d2a7dc9f30d9a0ee1149864f37a209fa955660..0000000000000000000000000000000000000000
--- a/third_party/nccl/fix_clang_compilation.patch
+++ /dev/null
@@ -1,85 +0,0 @@
-From 8241cd7b6ed1425eeb88fd380090575978e358f4 Mon Sep 17 00:00:00 2001
-From: Ilya Biryukov <ibiryukov@google.com>
-Date: Thu, 16 Mar 2017 12:01:11 +0100
-Subject: [PATCH 1/1] Fix compilation error when compiling with 'clang -x
- cuda'.
-
-Functions vFetch and vStore are not found by ADL with clang,
-so they need to be declared before usage in ReduceCopy.
----
- src/common_kernel.h | 52 ++++++++++++++++++++++++++--------------------------
- 1 file changed, 26 insertions(+), 26 deletions(-)
-
-diff --git a/src/common_kernel.h b/src/common_kernel.h
-index 28fbc85..cc71f8a 100644
---- a/src/common_kernel.h
-+++ b/src/common_kernel.h
-@@ -30,6 +30,32 @@
- #define BAR(type, barid, nthreads) \
-     BAR_EXPAND(type, barid, ROUNDUP(nthreads, WARP_SIZE))
- 
-+template<typename T> inline __device__
-+T vFetch(const volatile T* ptr) {
-+  return *ptr;
-+}
-+
-+#ifdef CUDA_HAS_HALF
-+template<> inline __device__
-+half vFetch<half>(const volatile half* ptr) {
-+  half r;
-+  r.x = ptr->x;
-+  return r;
-+}
-+#endif
-+
-+template<typename T> inline __device__
-+void vStore(volatile T* ptr, const T val) {
-+  *ptr = val;
-+}
-+
-+#ifdef CUDA_HAS_HALF
-+template<> inline __device__
-+void vStore<half>(volatile half* ptr, const half val) {
-+  ptr->x = val.x;
-+}
-+#endif
-+
- __device__ unsigned int spinct;
- 
- // Spin wait until func evaluates to true
-@@ -225,32 +251,6 @@ __device__ inline volatile T* AlignUp(volatile T * ptr, size_t align) {
-   return reinterpret_cast<volatile T*>(ALIGNUP(ptrval, align));
- }
- 
--template<typename T> inline __device__
--T vFetch(const volatile T* ptr) {
--  return *ptr;
--}
--
--#ifdef CUDA_HAS_HALF
--template<> inline __device__
--half vFetch<half>(const volatile half* ptr) {
--  half r;
--  r.x = ptr->x;
--  return r;
--}
--#endif
--
--template<typename T> inline __device__
--void vStore(volatile T* ptr, const T val) {
--  *ptr = val;
--}
--
--#ifdef CUDA_HAS_HALF
--template<> inline __device__
--void vStore<half>(volatile half* ptr, const half val) {
--  ptr->x = val.x;
--}
--#endif
--
- // Assumptions:
- // - there is exactly 1 block
- // - THREADS is the number of producer threads
--- 
-2.12.0.367.g23dc2f6d3c-goog
-
diff --git a/third_party/nccl/nccl.BUILD b/third_party/nccl/nccl.BUILD
deleted file mode 100644
index 06b9b8ff68a5e8aa877d605daf02bec1ea4d6bfa..0000000000000000000000000000000000000000
--- a/third_party/nccl/nccl.BUILD
+++ /dev/null
@@ -1,66 +0,0 @@
-# NVIDIA nccl
-# A package of optimized primitives for collective multi-GPU communication.
-
-licenses(["notice"])  # BSD
-
-exports_files(["LICENSE.txt"])
-
-load("@local_config_cuda//cuda:build_defs.bzl", "cuda_default_copts", "if_cuda")
-
-SRCS = [
-    "src/all_gather.cu",
-    "src/all_reduce.cu",
-    "src/broadcast.cu",
-    "src/core.cu",
-    "src/libwrap.cu",
-    "src/reduce.cu",
-    "src/reduce_scatter.cu",
-]
-
-# Copy .cu to .cu.cc so they can be in srcs of cc_library.
-[
-    genrule(
-        name = "gen_" + src,
-        srcs = [src],
-        outs = [src + ".cc"],
-        cmd = "cp $(location " + src + ") $(location " + src + ".cc)",
-    )
-    for src in SRCS
-]
-
-SRCS_CU_CC = [src + ".cc" for src in SRCS]
-
-cc_library(
-    name = "nccl",
-    srcs = if_cuda(SRCS_CU_CC + glob(["src/*.h"])),
-    hdrs = if_cuda(["src/nccl.h"]),
-    copts = [
-        "-DCUDA_MAJOR=0",
-        "-DCUDA_MINOR=0",
-        "-DNCCL_MAJOR=0",
-        "-DNCCL_MINOR=0",
-        "-DNCCL_PATCH=0",
-        "-Iexternal/nccl_archive/src",
-        "-O3",
-    ] + cuda_default_copts(),
-    linkopts = select({
-        "@%ws%//tensorflow:android": [
-            "-pie",
-        ],
-        "@%ws%//tensorflow:darwin": [
-            "-Wl,-framework",
-            "-Wl,CoreFoundation",
-            "-Wl,-framework",
-            "-Wl,Security",
-        ],
-        "@%ws%//tensorflow:ios": [],
-        "@%ws%//tensorflow:windows": [
-            "ws2_32.lib",
-        ],
-        "//conditions:default": [
-            "-lrt",
-        ],
-    }),
-    visibility = ["//visibility:public"],
-    deps = ["@local_config_cuda//cuda:cuda_headers"],
-)
diff --git a/third_party/ortools.BUILD b/third_party/ortools.BUILD
index 1ebc8aa0be894590268632a84b08452afafb050f..61191e3d2711c955725078715c1b8238edfe069e 100644
--- a/third_party/ortools.BUILD
+++ b/third_party/ortools.BUILD
@@ -7,10 +7,7 @@ exports_files(["LICENSE-2.0.txt"])
 native.cc_library(
     name = "linear_solver_glop",
     deps = [
-    "@ortools_archive//linear_solver:linear_solver_glop",
-	 ],
+        "@ortools_archive//linear_solver:linear_solver_glop",
+    ],
     visibility = ["//visibility:public"],
 )
-
-
-
diff --git a/third_party/pprof.BUILD b/third_party/pprof.BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..edd52095949cfdeff5cde3a1c696fe419b01a016
--- /dev/null
+++ b/third_party/pprof.BUILD
@@ -0,0 +1,18 @@
+package(
+    default_visibility = ["//visibility:public"],
+)
+
+licenses(["notice"])  # MIT
+
+load("@protobuf//:protobuf.bzl", "py_proto_library")
+
+exports_files(["pprof/LICENSE"])
+
+py_proto_library(
+    name = "pprof_proto_py",
+    srcs = ["proto/profile.proto"],
+    default_runtime = "@protobuf//:protobuf_python",
+    protoc = "@protobuf//:protoc",
+    srcs_version = "PY2AND3",
+    deps = ["@protobuf//:protobuf_python"],
+)
diff --git a/third_party/py/python_configure.bzl b/third_party/py/python_configure.bzl
index d49d4c178153a42cf1300a95f2de6d0c30ff518a..b2d0e250e78f0931d14cecabf339512dba85a26a 100644
--- a/third_party/py/python_configure.bzl
+++ b/third_party/py/python_configure.bzl
@@ -6,11 +6,13 @@
   * `NUMPY_INCLUDE_PATH`: Location of Numpy libraries.
   * `PYTHON_BIN_PATH`: location of python binary.
   * `PYTHON_INCLUDE_PATH`: Location of python binaries.
+  * `PYTHON_LIB_PATH`: Location of python libraries.
 """
 
 _NUMPY_INCLUDE_PATH = "NUMPY_INCLUDE_PATH"
 _PYTHON_BIN_PATH = "PYTHON_BIN_PATH"
 _PYTHON_INCLUDE_PATH = "PYTHON_INCLUDE_PATH"
+_PYTHON_LIB_PATH = "PYTHON_LIB_PATH"
 
 
 def _tpl(repository_ctx, tpl, substitutions={}, out=None):
@@ -56,21 +58,53 @@ def _is_windows(repository_ctx):
   return False
 
 
+def _execute(repository_ctx, cmdline, error_msg=None, error_details=None,
+             empty_stdout_fine=False):
+  """Executes an arbitrary shell command.
+
+  Args:
+    repository_ctx: the repository_ctx object
+    cmdline: list of strings, the command to execute
+    error_msg: string, a summary of the error if the command fails
+    error_details: string, details about the error or steps to fix it
+    empty_stdout_fine: bool, if True, an empty stdout result is fine, otherwise
+      it's an error
+  Return:
+    the result of repository_ctx.execute(cmdline)
+  """
+  result = repository_ctx.execute(cmdline)
+  if result.stderr or not (empty_stdout_fine or result.stdout):
+    _python_configure_fail(
+        "\n".join([
+            error_msg.strip() if error_msg else "Repository command failed",
+            result.stderr.strip(),
+            error_details if error_details else ""]))
+  return result
+
+
 def _symlink_genrule_for_dir(repository_ctx, src_dir, dest_dir, genrule_name):
   """returns a genrule to symlink all files in a directory."""
   # Get the list of files under this directory
   find_result = None
   if _is_windows(repository_ctx):
-    find_result = repository_ctx.execute([
-        "dir", src_dir, "/b", "/s", "/a-d",
-    ])
+    find_result = _execute(
+        repository_ctx,
+        ["cmd.exe", "/c", "dir", src_dir.replace("/", "\\"), "/b", "/s",
+         "/a-d"],
+        empty_stdout_fine=True)
+    # src_files will be used to compute BUILD rules, where path must use
+    # forward slashes.
+    src_files = find_result.stdout.replace("\\", "/").splitlines()
+    # Create a list with the src_dir stripped to use for outputs.
+    fwdslashes_src_dir = src_dir.replace("\\", "/")
+    dest_files = [e.replace(fwdslashes_src_dir, "") for e in src_files]
   else:
-    find_result = repository_ctx.execute([
-        "find", src_dir, "-follow", "-type", "f",
-    ])
-  # Create a list with the src_dir stripped to use for outputs.
-  dest_files = find_result.stdout.replace(src_dir, '').splitlines()
-  src_files = find_result.stdout.splitlines()
+    find_result = _execute(
+        repository_ctx, ["find", src_dir, "-follow", "-type", "f"],
+        empty_stdout_fine=True)
+    # Create a list with the src_dir stripped to use for outputs.
+    dest_files = find_result.stdout.replace(src_dir, '').splitlines()
+    src_files = find_result.stdout.splitlines()
   command = []
   command_windows = []
   outs = []
@@ -110,10 +144,52 @@ def _genrule(src_dir, genrule_name, command, outs):
       '    cmd = """\n' +
       command +
       '    """,\n' +
+      '    visibility = ["//visibility:private"],' +
       ')\n'
   )
 
 
+def _get_python_lib(repository_ctx, python_bin):
+  """Gets the python lib path."""
+  print_lib = ("<<END\n" +
+      "from __future__ import print_function\n" +
+      "import site\n" +
+      "import os\n" +
+      "\n" +
+      "try:\n" +
+      "  input = raw_input\n" +
+      "except NameError:\n" +
+      "  pass\n" +
+      "\n" +
+      "python_paths = []\n" +
+      "if os.getenv('PYTHONPATH') is not None:\n" +
+      "  python_paths = os.getenv('PYTHONPATH').split(':')\n" +
+      "try:\n" +
+      "  library_paths = site.getsitepackages()\n" +
+      "except AttributeError:\n" +
+      " from distutils.sysconfig import get_python_lib\n" +
+      " library_paths = [get_python_lib()]\n" +
+      "all_paths = set(python_paths + library_paths)\n" +
+      "paths = []\n" +
+      "for path in all_paths:\n" +
+      "  if os.path.isdir(path):\n" +
+      "    paths.append(path)\n" +
+      "if len(paths) >=1:\n" +
+      "  print(paths[0])\n" +
+      "END")
+  cmd = '%s - %s' % (python_bin, print_lib)
+  result = repository_ctx.execute(["bash", "-c", cmd])
+  return result.stdout.strip('\n')
+
+
+def _check_python_lib(repository_ctx, python_lib):
+  """Checks the python lib path."""
+  cmd = 'test -d "%s" -a -x "%s"' % (python_lib, python_lib)
+  result = repository_ctx.execute(["bash", "-c", cmd])
+  if result.return_code == 1:
+    _python_configure_fail("Invalid python library path:  %s" % python_lib)
+
+
 def _check_python_bin(repository_ctx, python_bin):
   """Checks the python bin path."""
   cmd =  '[[ -x "%s" ]] && [[ ! -d "%s" ]]' % (python_bin, python_bin)
@@ -125,37 +201,41 @@ def _check_python_bin(repository_ctx, python_bin):
 
 def _get_python_include(repository_ctx, python_bin):
   """Gets the python include path."""
-  result = repository_ctx.execute([python_bin, "-c",
-                                   'from __future__ import print_function;' +
-                                   'from distutils import sysconfig;' +
-                                   'print(sysconfig.get_python_inc())'])
-  if result == "":
-    _python_configure_fail(
-        "Problem getting python include path.  Is distutils installed?")
+  result = _execute(repository_ctx,
+                    [python_bin, "-c",
+                     'from __future__ import print_function;' +
+                     'from distutils import sysconfig;' +
+                     'print(sysconfig.get_python_inc())'],
+                    error_msg="Problem getting python include path.",
+                    error_details=("Is the Python binary path set up right? " +
+                                   "(See ./configure or BAZEL_BIN_PATH.) " +
+                                   "Is distutils installed?"))
   return result.stdout.splitlines()[0]
 
 
 def _get_numpy_include(repository_ctx, python_bin):
   """Gets the numpy include path."""
-  result = repository_ctx.execute([python_bin, "-c",
-                                   'from __future__ import print_function;' +
-                                   'import numpy;' +
-                                   ' print(numpy.get_include());'])
-  if result == "":
-    _python_configure_fail(
-        "Problem getting numpy include path.  Is numpy installed?")
-  return result.stdout.splitlines()[0]
+  return _execute(repository_ctx,
+                  [python_bin, "-c",
+                   'from __future__ import print_function;' +
+                   'import numpy;' +
+                   ' print(numpy.get_include());'],
+                  error_msg="Problem getting numpy include path.",
+                  error_details="Is numpy installed?").stdout.splitlines()[0]
 
 
-def _create_python_repository(repository_ctx):
+def _create_local_python_repository(repository_ctx):
   """Creates the repository containing files set up to build with Python."""
   python_include = None
   numpy_include = None
+  empty_config = False
   # If local checks were requested, the python and numpy include will be auto
   # detected on the host config (using _PYTHON_BIN_PATH).
   if repository_ctx.attr.local_checks:
     python_bin = _get_env_var(repository_ctx, _PYTHON_BIN_PATH)
     _check_python_bin(repository_ctx, python_bin)
+    python_lib = _get_env_var(repository_ctx, _PYTHON_LIB_PATH, _get_python_lib(repository_ctx, python_bin))
+    _check_python_lib(repository_ctx, python_lib)
     python_include = _get_python_include(repository_ctx, python_bin)
     numpy_include = _get_numpy_include(repository_ctx, python_bin) + '/numpy'
   else:
@@ -164,20 +244,42 @@ def _create_python_repository(repository_ctx):
                                   repository_ctx.attr.python_include)
     numpy_include = _get_env_var(repository_ctx, _NUMPY_INCLUDE_PATH,
                                  repository_ctx.attr.numpy_include) + '/numpy'
-
-  python_include_rule = _symlink_genrule_for_dir(
-      repository_ctx, python_include, 'python_include', 'python_include')
-  numpy_include_rule = _symlink_genrule_for_dir(
-      repository_ctx, numpy_include, 'numpy_include/numpy', 'numpy_include')
-  _tpl(repository_ctx, "BUILD", {
-      "%{PYTHON_INCLUDE_GENRULE}": python_include_rule,
-      "%{NUMPY_INCLUDE_GENRULE}": numpy_include_rule,
-  })
+  if empty_config:
+    _tpl(repository_ctx, "BUILD", {
+        "%{PYTHON_INCLUDE_GENRULE}": ('filegroup(\n' +
+                                      '    name = "python_include",\n' +
+                                      '    srcs = [],\n' +
+                                      ')\n'),
+        "%{NUMPY_INCLUDE_GENRULE}": ('filegroup(\n' +
+                                      '    name = "numpy_include",\n' +
+                                      '    srcs = [],\n' +
+                                      ')\n'),
+    })
+  else:
+    python_include_rule = _symlink_genrule_for_dir(
+        repository_ctx, python_include, 'python_include', 'python_include')
+    numpy_include_rule = _symlink_genrule_for_dir(
+        repository_ctx, numpy_include, 'numpy_include/numpy', 'numpy_include')
+    _tpl(repository_ctx, "BUILD", {
+        "%{PYTHON_INCLUDE_GENRULE}": python_include_rule,
+        "%{NUMPY_INCLUDE_GENRULE}": numpy_include_rule,
+    })
+
+
+def _create_remote_python_repository(repository_ctx):
+  """Creates pointers to a remotely configured repo set up to build with Python.
+  """
+  _tpl(repository_ctx, "remote.BUILD", {
+      "%{REMOTE_PYTHON_REPO}": repository_ctx.attr.remote_config_repo,
+  }, "BUILD")
 
 
 def _python_autoconf_impl(repository_ctx):
   """Implementation of the python_autoconf repository rule."""
-  _create_python_repository(repository_ctx)
+  if repository_ctx.attr.remote_config_repo != "":
+    _create_remote_python_repository(repository_ctx)
+  else:
+    _create_local_python_repository(repository_ctx)
 
 
 python_configure = repository_rule(
@@ -186,10 +288,12 @@ python_configure = repository_rule(
         "local_checks": attr.bool(mandatory = False, default = True),
         "python_include": attr.string(mandatory = False),
         "numpy_include": attr.string(mandatory = False),
+        "remote_config_repo": attr.string(mandatory = False, default =""),
     },
     environ = [
         _PYTHON_BIN_PATH,
         _PYTHON_INCLUDE_PATH,
+        _PYTHON_LIB_PATH,
         _NUMPY_INCLUDE_PATH,
     ],
 )
diff --git a/third_party/py/remote.BUILD.tpl b/third_party/py/remote.BUILD.tpl
new file mode 100644
index 0000000000000000000000000000000000000000..332508ec84c1d1f79f28382deb98a2344e4d95d4
--- /dev/null
+++ b/third_party/py/remote.BUILD.tpl
@@ -0,0 +1,13 @@
+licenses(["restricted"])
+
+package(default_visibility = ["//visibility:public"])
+
+alias(
+    name = "python_headers",
+    actual = "@%{REMOTE_PYTHON_REPO}//:python_headers",
+)
+
+alias(
+    name = "numpy_headers",
+    actual = "@%{REMOTE_PYTHON_REPO}//:numpy_headers",
+)
diff --git a/tools/bazel.rc.template b/tools/bazel.rc
similarity index 73%
rename from tools/bazel.rc.template
rename to tools/bazel.rc
index 097ff7b9d07aba67d5d35979e09eb08ad9afdbda..e67a290cf40ca7f688dfdb03210786c8c85abe48 100644
--- a/tools/bazel.rc.template
+++ b/tools/bazel.rc
@@ -14,18 +14,9 @@ build:sycl --define=using_sycl=true
 build:sycl_asan --crosstool_top=@local_config_sycl//crosstool:toolchain
 build:sycl_asan --define=using_sycl=true --copt -fno-omit-frame-pointer --copt -fsanitize-coverage=3 --copt -DGPR_NO_DIRECT_SYSCALLS --linkopt -fPIC --linkopt -fsanitize=address
 
-build --force_python=py$PYTHON_MAJOR_VERSION
-build --host_force_python=py$PYTHON_MAJOR_VERSION
-build --python$PYTHON_MAJOR_VERSION_path=$PYTHON_BINARY
 build --define=use_fast_cpp_protos=true
 build --define=allow_oversize_protos=true
 
-build --define PYTHON_BIN_PATH=$PYTHON_BINARY
-test --define PYTHON_BIN_PATH=$PYTHON_BINARY
-test --force_python=py$PYTHON_MAJOR_VERSION
-test --host_force_python=py$PYTHON_MAJOR_VERSION
-run --define PYTHON_BIN_PATH=$PYTHON_BINARY
-
 build --spawn_strategy=standalone
 test --spawn_strategy=standalone
 run --spawn_strategy=standalone
diff --git a/tools/tf_env_collect.sh b/tools/tf_env_collect.sh
index 2a92e9a078a01e7f30a394b10b10520b7cb8d097..abeebeadea49bbbf808813c2d3609558e75785d5 100755
--- a/tools/tf_env_collect.sh
+++ b/tools/tf_env_collect.sh
@@ -1,4 +1,18 @@
 #!/usr/bin/env bash
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
 
 set -u  # Check for undefined variables
 
@@ -28,7 +42,7 @@ fi
 
 echo >> $OUTPUT_FILE
 echo '== compiler =====================================================' >> $OUTPUT_FILE
-c++ --version &>> $OUTPUT_FILE
+c++ --version 2>&1 >> $OUTPUT_FILE
 
 echo >> $OUTPUT_FILE
 echo '== uname -a =====================================================' >> $OUTPUT_FILE
@@ -36,7 +50,7 @@ uname -a >> $OUTPUT_FILE
 
 echo >> $OUTPUT_FILE
 echo '== check pips ===================================================' >> $OUTPUT_FILE
-pip list 2>&1 | grep "proto\|numpy\|tensorflow" &>> $OUTPUT_FILE
+pip list 2>&1 | grep "proto\|numpy\|tensorflow" >> $OUTPUT_FILE
 
 
 echo >> $OUTPUT_FILE
@@ -53,7 +67,7 @@ print("tf.COMPILER_VERSION = %s" % tf.GIT_VERSION)
 with tf.Session() as sess:
   print("Sanity check: %r" % sess.run(tf.constant([1,2,3])[:1]))
 EOF
-python /tmp/check_tf.py &>> ${OUTPUT_FILE}
+python /tmp/check_tf.py 2>&1  >> ${OUTPUT_FILE}
 
 DEBUG_LD=libs python -c "import tensorflow"  2>>${OUTPUT_FILE} > /tmp/loadedlibs
 grep libcudnn.so /tmp/loadedlibs >> $OUTPUT_FILE
@@ -74,7 +88,7 @@ fi
 
 echo >> $OUTPUT_FILE >> $OUTPUT_FILE
 echo '== nvidia-smi ===================================================' >> $OUTPUT_FILE
-nvidia-smi &>> $OUTPUT_FILE
+nvidia-smi 2>&1 >> $OUTPUT_FILE
 
 echo >> $OUTPUT_FILE
 
diff --git a/util/python/python_config.sh b/util/python/python_config.sh
deleted file mode 100755
index d5762ad4561bd1bbc3f57f11a2ad53fb114766c6..0000000000000000000000000000000000000000
--- a/util/python/python_config.sh
+++ /dev/null
@@ -1,159 +0,0 @@
-#!/usr/bin/env bash
-# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-
-set -e -o errexit
-
-if [ -d "../org_tensorflow" ]; then
-  script_path="../org_tensorflow"
-else
-  # Prefix expected paths with ./ locally and external/reponame/ for remote repos.
-  # TODO(kchodorow): remove once runfiles paths are fixed, see
-  # https://github.com/bazelbuild/bazel/issues/848.
-  script_path=$(dirname $(dirname $(dirname "$0")))
-  script_path=${script_path:-.}
-fi
-
-function main {
-  setup_python "$1"
-  exit 0
-}
-
-function python_path {
-  "$PYTHON_BIN_PATH" - <<END
-from __future__ import print_function
-import site
-import os
-
-try:
-  input = raw_input
-except NameError:
-  pass
-
-python_paths = []
-if os.getenv('PYTHONPATH') is not None:
-  python_paths = os.getenv('PYTHONPATH').split(':')
-try:
-  library_paths = site.getsitepackages()
-except AttributeError:
- from distutils.sysconfig import get_python_lib
- library_paths = [get_python_lib()]
-all_paths = set(python_paths + library_paths)
-
-paths = []
-for path in all_paths:
-  if os.path.isdir(path):
-    paths.append(path)
-
-if len(paths) == 1:
-  print(paths[0])
-else:
-  ret_paths = ",".join(paths)
-  print(ret_paths)
-END
-}
-
-function default_python_path {
-  PYTHON_ARG="$1" "$PYTHON_BIN_PATH" - <<END
-from __future__ import print_function
-import os
-
-default = os.getenv('PYTHON_ARG')
-default = str(default)
-print(default)
-END
-}
-
-function setup_python {
-  PYTHON_BIN_PATH="$1";
-
-  # TODO(ngiraldo): move most of these checks to root configure
-  if [ -z "$PYTHON_BIN_PATH" ]; then
-    echo "PYTHON_BIN_PATH was not provided.  Did you run configure?"
-    exit 1
-  fi
-  if [ ! -x "$PYTHON_BIN_PATH" ]  || [ -d "$PYTHON_BIN_PATH" ]; then
-    echo "PYTHON_BIN_PATH is not executable.  Is it the python binary?"
-    exit 1
-  fi
-
-  local python_major_version=$("${PYTHON_BIN_PATH}" -c 'from __future__ import print_function; import sys; print(sys.version_info[0]);')
-  if [ "$python_major_version" == "" ]; then
-    echo -e "\n\nERROR: Problem getting python version.  Is $PYTHON_BIN_PATH the correct python binary?"
-    exit 1
-  fi
-
-  # TODO(ngiraldo): confirm if these checks are really necessary, remove if not
-  if [ -z "$PYTHON_LIB_PATH" ]; then
-    local python_lib_path
-    # Split python_path into an array of paths, this allows path containing spaces
-    IFS=','
-    python_lib_path=($(python_path))
-    unset IFS
-
-    if [ 1 = "$USE_DEFAULT_PYTHON_LIB_PATH" ]; then
-      PYTHON_LIB_PATH="$(default_python_path "${python_lib_path[0]}")"
-      echo "Using python library path: $PYTHON_LIB_PATH"
-
-    else
-      echo "Found possible Python library paths:"
-      for x in "${python_lib_path[@]}"; do
-        echo "  $x"
-      done
-      set -- "${python_lib_path[@]}"
-      echo "Please input the desired Python library path to use.  Default is ["$1"]"
-      read b || true
-      if [ "$b" == "" ]; then
-        PYTHON_LIB_PATH="$(default_python_path "${python_lib_path[0]}")"
-        echo "Using python library path: $PYTHON_LIB_PATH"
-      else
-        PYTHON_LIB_PATH="$b"
-      fi
-    fi
-  fi
-
-  if test -d "$PYTHON_LIB_PATH" -a -x "$PYTHON_LIB_PATH"; then
-    python_lib="$PYTHON_LIB_PATH"
-  else
-    echo -e "\n\nERROR: Invalid python library path: ${PYTHON_LIB_PATH}."
-    exit 1
-  fi
-
-  # Convert python path to Windows style before writing into bazel.rc
-  if is_windows; then
-    PYTHON_BIN_PATH="$(cygpath -m "$PYTHON_BIN_PATH")"
-  fi
-
-  # TODO(ngiraldo): move all below to root configure
-  # Write tools/bazel.rc
-  echo "# Autogenerated by configure: DO NOT EDIT" > tools/bazel.rc
-  sed -e "s/\$PYTHON_MAJOR_VERSION/$python_major_version/g" \
-      -e "s|\$PYTHON_BINARY|\"$PYTHON_BIN_PATH\"|g" \
-      tools/bazel.rc.template >> tools/bazel.rc
-  # Write tools/python_bin_path.sh
-  echo "export PYTHON_BIN_PATH=\"$PYTHON_BIN_PATH\"" > tools/python_bin_path.sh
-}
-
-PLATFORM="$(uname -s | tr 'A-Z' 'a-z')"
-function is_windows() {
-  # On windows, the shell script is actually running in msys
-  if [[ "${PLATFORM}" =~ msys_nt* ]]; then
-    true
-  else
-    false
-  fi
-}
-
-main "$@"